Redaktor:Atomobot/atomobot cosmetics.py
Vzhled
#!/usr/bin/python2.4
# -*- coding: utf-8 -*-
"""Cosmetic clean-up of wiki text.

The module decodes HTML character references (``&#233;``, ``&nbsp;`` ...)
into the characters they stand for and then applies a fixed list of
regular-expression substitutions (heading/list spacing, localisation of
image and category keywords, ``<br>`` normalisation).

The code is written so that it runs unchanged on Python 2.4+ and Python 3.
"""

import re

try:
    unichr
except NameError:
    # Python 3 has no unichr(); chr() covers the whole Unicode range there.
    unichr = chr

# Decimal numeric character reference, e.g. "&#225;".
numeric_elem_skelet = r'&#(\d+);'
# Named character reference, e.g. "&nbsp;".
named_elem_skelet = r'&([a-zA-Z]+);'

# Named references that get decoded; any other name is left untouched.
named_elements = {
    u'aacute': u'á',
    u'alpha': u'α',
    u'apos': u"'",
    u'bull': u'•',
    u'copy': u'©',
    u'Dagger': u'‡',
    u'dagger': u'†',
    u'deg': u'°',
    u'euro': u'€',
    u'gt': u'>',
    u'iacute': u'í',
    u'lt': u'<',
    u'mdash': u'\u2014',
    u'micro': u'µ',
    u'middot': u'·',
    u'nbsp': u'\xa0',
    u'ndash': u'\u2013',
    u'permil': u'‰',
    u'plusmn': u'±',
    u'quot': u'"',
    u'reg': u'®',
    u'Scaron': u'Š',
    u'scaron': u'š',
    u'sect': u'§',
    u'shy': u'\xad',
    u'sup2': u'²',
    u'sup3': u'³',
    u'times': u'×',
    u'trade': u'™',
    u'yacute': u'ý',
}

# (pattern, replacement) pairs, applied in this order with re.MULTILINE.
# Order matters: the "Obrázok" rules rely on Image having been renamed first.
replacements = [
    (r'^(=+)([^ =].*?)[ \t]*(=+)$', r'\1 \2 \3'),  # expand headings
    (r'^(\:*)[ \t]*?(\*+)([^ *].*?)$', r'\1\2 \3'),  # expand lists
    (r'^(\:+)([^ :*].*?)$', r'\1 \2'),  # expand indents
    (r'^(=+)[ \t][ \t]+(.*?)[ \t]*(=+)$', r'\1 \2 \3'),  # shrink headings
    (r'^(\:*)[ \t]*?(\*+)[ \t][ \t]+(.*?)$', r'\1\2 \3'),  # shrink lists
    (r'^(\:+)[ \t][ \t]+(.*?)$', r'\1 \2'),  # shrink indents
    (r'(?i)\[\[Image:', u'[[Obrázok:'),  # Image -> Obrázok
    (r'(?i)\[\[Category:', u'[[Kategória:'),  # Category -> Kategória
    (u'(?i)\\[\\[Obrázok:(.*?)\\|thumb(.*?)\\]\\]',
     u'[[Obrázok:\\1|náhľad\\2]]'),  # thumb -> náhľad
    (u'(?i)\\[\\[Obrázok:(.*?)\\|left(.*?)\\]\\]',
     u'[[Obrázok:\\1|vľavo\\2]]'),  # left -> vľavo
    (u'(?i)\\[\\[Obrázok:(.*?)\\|center(.*?)\\]\\]',
     u'[[Obrázok:\\1|stred\\2]]'),  # center -> stred
    (u'(?i)\\[\\[Obrázok:(.*?)\\|right(.*?)\\]\\]',
     u'[[Obrázok:\\1|vpravo\\2]]'),  # right -> vpravo
    (r'\<br\>', r'<br />'),  # <br> -> <br />
]


class AtomobotCosmetics(object):
    """Apply the cosmetic rules above to a page text and keep counters.

    The ``stat_*`` counters accumulate over every call made on the same
    instance; they are never reset.
    """

    def __init__(self):
        self.compiled = []
        self.prepare()
        self.stat_numeric_elements = 0
        self.stat_named_elements = 0
        self.stat_replacements = 0

    def prepare(self):
        """Compile the reference patterns and the replacement rules.

        The rule list is rebuilt from scratch, so calling this method more
        than once does not duplicate the rules.
        """
        self.re_numeric_elem = re.compile(numeric_elem_skelet)
        self.re_named_elem = re.compile(named_elem_skelet)
        self.compiled = [
            (re.compile(pat, re.MULTILINE), repl)
            for pat, repl in replacements
        ]

    def numerical_deelementize(self, data):
        """Return ``data`` with decimal ``&#NNN;`` references decoded.

        A reference whose number cannot be turned into a character (out of
        the range accepted by ``unichr``/``chr``) is left in the text as it
        was instead of aborting the whole run.  Each decoded reference
        increments ``stat_numeric_elements``.
        """
        def decode(mat):
            try:
                char = unichr(int(mat.group(1)))
            except (ValueError, OverflowError):
                # Not a representable code point: keep the reference as is.
                return mat.group(0)
            self.stat_numeric_elements += 1
            return char

        return self.re_numeric_elem.sub(decode, data)

    def alphabetical_deelementize(self, data):
        """Return ``data`` with the known ``&name;`` references decoded.

        Only names listed in ``named_elements`` are replaced; unknown names
        are kept verbatim.  Each decoded reference increments
        ``stat_named_elements``.
        """
        def decode(mat):
            char = named_elements.get(mat.group(1))
            if not char:
                return mat.group(0)
            self.stat_named_elements += 1
            return char

        return self.re_named_elem.sub(decode, data)

    def deelementize(self, data):
        """Decode numeric references first, then named ones."""
        data = self.numerical_deelementize(data)
        return self.alphabetical_deelementize(data)

    def do_replacements(self, data):
        """Run every compiled rule over ``data`` in order and return it.

        The number of substitutions made is added to ``stat_replacements``.
        """
        for pat, repl in self.compiled:
            data, numb = pat.subn(repl, data)
            self.stat_replacements += numb
        return data

    def print_stats(self):
        """Print each non-zero counter on its own line."""
        # A single parenthesised argument prints identically under the
        # Python 2 print statement and the Python 3 print function.
        if self.stat_numeric_elements:
            print('%d numeric elems' % self.stat_numeric_elements)
        if self.stat_named_elements:
            print('%d named elems' % self.stat_named_elements)
        if self.stat_replacements:
            print('%d replacements' % self.stat_replacements)

    def cosmetize(self, data):
        """Decode references, apply the rules, print stats, return the text."""
        data = self.deelementize(data)
        data = self.do_replacements(data)
        self.print_stats()
        return data