Redaktor:Atomobot/atomobot cosmetics.py

z Wikipédie, slobodnej encyklopédie
#!/usr/bin/python2.4
# -*- coding: utf-8  -*-

import re


# Regex skeletons for HTML character references.  In both patterns group 1
# captures the payload between the leading '&' / '&#' and the closing ';'.

# Decimal numeric reference, e.g. "&#8212;" (group 1 = decimal code point).
numeric_elem_skelet = r'&#(\d+);'

# Named reference, e.g. "&nbsp;" or "&sup2;" (group 1 = entity name).
# Digits are allowed after the first letter: a letters-only pattern can
# never match names such as 'sup2' / 'sup3', which would leave those
# named_elements entries unreachable.
named_elem_skelet = r'&([a-zA-Z][a-zA-Z0-9]*);'

# Lookup table: HTML entity name (without the surrounding '&' and ';')
# -> the single character it stands for.  Names are case-sensitive
# ('Dagger' and 'dagger' are different entities).
#
# Invisible characters are spelled as escapes so the code point is readable
# in the source and no call to the Python 2-only unichr() builtin is needed
# at import time.
#
# NOTE(review): there is no 'amp' entry.  That looks deliberate (a bare '&'
# could start a new entity), but confirm before adding one.
named_elements = {
    u'aacute': u'á',
    u'alpha': u'α',
    u'apos': u"'",
    u'bull': u'•',
    u'copy': u'©',
    u'Dagger': u'‡',
    u'dagger': u'†',
    u'deg': u'°',
    u'euro': u'€',
    u'gt': u'>',
    u'iacute': u'í',
    u'lt': u'<',
    u'mdash': u'\u2014',        # EM DASH
    u'micro': u'µ',
    u'middot': u'·',
    u'nbsp': u'\xa0',           # NO-BREAK SPACE
    u'ndash': u'\u2013',        # EN DASH
    u'permil': u'‰',
    u'plusmn': u'±',
    u'quot': u'"',
    u'reg': u'®',
    u'Scaron': u'Š',
    u'scaron': u'š',
    u'sect': u'§',
    u'shy': u'\xad',            # SOFT HYPHEN
    u'sup2': u'²',
    u'sup3': u'³',
    u'times': u'×',
    u'trade': u'™',
    u'yacute': u'ý',
}

# Ordered list of ( regex pattern, replacement template ) pairs.  Order
# matters: the image-option rules below only look inside '[[Obrázok:' links,
# so they must come after the 'Image' -> 'Obrázok' rename.

# Localized name of the image namespace, as produced by the rename rule.
_image_namespace = u'Obrázok'

# English image option keyword -> Slovak keyword.
_image_options = [
    ( u'thumb', u'náhľad' ),
    ( u'left', u'vľavo' ),
    ( u'center', u'stred' ),
    ( u'right', u'vpravo' ),
]

# Template for one image-option rule; filled with ( namespace, option ).
#  * Group 1 is everything from '[[Obrázok:' up to the option.  It may
#    contain plain text and complete nested '[[...]]' links, but it can
#    neither leave the image link through its closing ']]' nor stop inside
#    a nested link, so '|left' in some other link is never touched.
#  * The lookahead requires the option to be a whole parameter (followed by
#    '|' or ']]'), so 'thumbnail' or a caption starting with 'left...' is
#    not rewritten.
_image_option_skelet = (
    r'(?i)(\[\[%s:(?:[^\[\]\n]|\[\[[^\[\]\n]*\]\])*?)\|%s(?=\||\]\])'
)

replacements = [
    # '==Foo==' -> '== Foo =='
    ( r'^(=+)([^ =].*?)[ \t]*(=+)$', r'\1 \2 \3' ),             # expand headings
    # '*foo' -> '* foo'.  '#', ':' and ';' are excluded after the stars
    # because they continue a mixed list marker ('*#', '*:'); inserting a
    # space there would break the nesting.
    ( r'^(\:*)[ \t]*?(\*+)([^ *#:;].*?)$', r'\1\2 \3' ),        # expand lists
    # ':foo' -> ': foo' (same exclusion for ':#' and ':;').
    ( r'^(\:+)([^ :*#;].*?)$', r'\1 \2' ),                      # expand indents
    ( r'^(=+)[ \t][ \t]+(.*?)[ \t]*(=+)$', r'\1 \2 \3' ),       # shrink headings
    ( r'^(\:*)[ \t]*?(\*+)[ \t][ \t]+(.*?)$', r'\1\2 \3' ),     # shrink lists
    ( r'^(\:+)[ \t][ \t]+(.*?)$', r'\1 \2' ),                   # shrink indents
    ( r'(?i)\[\[Image:', u'[[Obrázok:' ),                       # Image -> Obrázok
    ( r'(?i)\[\[Category:', u'[[Kategória:' ),                  # Category -> Kategória
]

# thumb -> náhľad, left -> vľavo, center -> stred, right -> vpravo
replacements.extend(
    ( _image_option_skelet % ( _image_namespace, english ), r'\1|%s' % slovak )
    for english, slovak in _image_options
)

replacements.append( ( r'<br>', r'<br />' ) )                   # <br> -> <br />


# unichr() exists only on Python 2; on Python 3 chr() already returns text.
# Resolve the name once so the entity decoder runs on either interpreter.
try:
    _unichr = unichr
except NameError:
    _unichr = chr


class AtomobotCosmetics( object ):
    """Apply cosmetic clean-ups to a piece of wiki text.

    Two kinds of change are made:

    * character references ("&#8212;", "&nbsp;") are replaced by the
      character they denote, and
    * the module-level ``replacements`` regex rules are applied in order.

    The ``stat_*`` counters accumulate over the lifetime of the instance;
    they are never reset between calls.
    """

    def __init__( self ):
        self.compiled = []
        self.stat_numeric_elements = 0
        self.stat_named_elements = 0
        self.stat_replacements = 0
        self.prepare()


    def prepare( self ):
        """Compile the entity patterns and every rule in ``replacements``.

        ``self.compiled`` is rebuilt from scratch, so calling this method
        again does not duplicate the rules.
        """
        self.re_numeric_elem = re.compile( numeric_elem_skelet )
        self.re_named_elem = re.compile( named_elem_skelet )
        # MULTILINE: the heading/list rules anchor on '^' and '$' per line.
        self.compiled = [
            ( re.compile( pat, re.MULTILINE ), repl )
            for pat, repl in replacements
        ]


    def numerical_deelementize( self, data ):
        """Return ``data`` with decimal references ("&#65;") decoded.

        A reference is left exactly as written when
          * its number is not a code point the interpreter can represent
            (too large, or beyond the BMP on a narrow Python 2 build),
          * it denotes a lone surrogate (U+D800..U+DFFF), which is not a
            character and cannot be encoded afterwards, or
          * it denotes '&' (38): a bare ampersand could join the following
            text into a new entity ("&#38;lt;" -> "&lt;") that the named
            pass would then decode a second time.

        ``stat_numeric_elements`` is incremented once per decoded reference.
        """
        def decode( mat ):
            try:
                code = int( mat.group( 1 ) )
                if code == 0x26 or 0xD800 <= code <= 0xDFFF:
                    return mat.group( 0 )
                char = _unichr( code )
            except ( ValueError, OverflowError ):
                return mat.group( 0 )
            self.stat_numeric_elements += 1
            return char

        # sub() with a callable inserts the return value literally and does
        # the offset bookkeeping itself, whatever the replacement length.
        return self.re_numeric_elem.sub( decode, data )


    def alphabetical_deelementize( self, data ):
        """Return ``data`` with named references ("&nbsp;") decoded.

        Only names present in the module-level ``named_elements`` table are
        decoded; unknown names are left untouched.
        ``stat_named_elements`` is incremented once per decoded reference.
        """
        def decode( mat ):
            char = named_elements.get( mat.group( 1 ) )
            if not char:
                return mat.group( 0 )
            self.stat_named_elements += 1
            return char

        return self.re_named_elem.sub( decode, data )


    def deelementize( self, data ):
        """Decode numeric references first, then named ones."""
        data = self.numerical_deelementize( data )
        data = self.alphabetical_deelementize( data )
        return data


    def do_replacements( self, data ):
        """Apply every compiled rule in order and return the result.

        ``stat_replacements`` grows by the number of substitutions made.
        """
        for pat, repl in self.compiled:
            data, numb = pat.subn( repl, data )
            self.stat_replacements += numb
        return data


    def print_stats( self ):
        """Print each non-zero counter on its own line."""
        # Parenthesised single-argument form: prints the same text under the
        # Python 2 print statement and the Python 3 print() function.
        if self.stat_numeric_elements:
            print( '%d numeric elems' % self.stat_numeric_elements )
        if self.stat_named_elements:
            print( '%d named elems' % self.stat_named_elements )
        if self.stat_replacements:
            print( '%d replacements' % self.stat_replacements )


    def cosmetize( self, data ):
        """Run the full clean-up on ``data``, print the stats, return the text."""
        data = self.deelementize( data )
        data = self.do_replacements( data )
        self.print_stats()
        return data