#!/usr/bin/python2.4
# -*- coding: utf-8 -*-
import re
# Decimal numeric character references such as "&#8212;"; group 1 captures
# the decimal digits.  Hexadecimal references ("&#x2014;") are not matched.
numeric_elem_skelet = r'&#(\d+);'
# Named character references such as "&nbsp;"; group 1 captures the name.
# Digits are allowed after the first letter so that names like "sup2" and
# "sup3" can be matched at all; a letters-only pattern never finds them.
named_elem_skelet = r'&([a-zA-Z][a-zA-Z0-9]*);'
# Maps an HTML entity name (without the surrounding "&" and ";") to the
# single character it stands for.  Names missing from this table (for
# example "amp") are left untouched by the code that consults it.
# Invisible or easily confused characters are written as escapes so that
# they stay recognisable in an editor and need no runtime unichr() call.
named_elements = {
    u'aacute': u'á',
    u'alpha': u'α',
    u'apos': u"'",
    u'bull': u'•',
    u'copy': u'©',
    u'Dagger': u'‡',
    u'dagger': u'†',
    u'deg': u'°',
    u'euro': u'€',
    u'gt': u'>',
    u'iacute': u'í',
    u'lt': u'<',
    u'mdash': u'\u2014',  # em dash
    u'micro': u'µ',
    u'middot': u'·',
    u'nbsp': u'\xa0',  # no-break space
    u'ndash': u'\u2013',  # en dash
    u'permil': u'‰',
    u'plusmn': u'±',
    u'quot': u'"',
    u'reg': u'®',
    u'Scaron': u'Š',
    u'scaron': u'š',
    u'sect': u'§',
    u'shy': u'\xad',  # soft hyphen
    u'sup2': u'²',
    u'sup3': u'³',
    u'times': u'×',
    u'trade': u'™',
    u'yacute': u'ý',
}
# Start of an image link up to (and including) the "|" that introduces one
# option.  Group 1 is everything between the namespace and that "|"; it may
# not contain "]" so that a match can never run past the end of the image
# link into a later, unrelated link on the same line.
_IMAGE_LINK_PREFIX = r'(?i)\[\[' + u'Obrázok' + r':([^\]\n]*?)\|'
# An option keyword only counts when it is a complete option, i.e. directly
# followed by the next "|" or by the closing "]]".  This keeps "thumbnail"
# and caption text such as "left bank" from being rewritten.
_IMAGE_OPTION_END = r'(?=\||\]\])'


def _image_option_rule( english, slovak ):
    """Return the (pattern, replacement) pair translating one image option."""
    return (
        _IMAGE_LINK_PREFIX + english + _IMAGE_OPTION_END,
        u'[[Obrázok:' + r'\1|' + slovak,
    )


# Ordered list of (regex pattern, replacement) pairs.  The order matters:
# the "Image:" rename has to run before the image-option rules, which only
# look at links that already use the "Obrázok:" namespace.
replacements = [
    ( r'^(=+)([^ =].*?)[ \t]*(=+)$', r'\1 \2 \3' ),  # expand headings
    ( r'^(\:*)[ \t]*?(\*+)([^ *].*?)$', r'\1\2 \3' ),  # expand lists
    ( r'^(\:+)([^ :*].*?)$', r'\1 \2' ),  # expand indents
    ( r'^(=+)[ \t][ \t]+(.*?)[ \t]*(=+)$', r'\1 \2 \3' ),  # shrink headings
    ( r'^(\:*)[ \t]*?(\*+)[ \t][ \t]+(.*?)$', r'\1\2 \3' ),  # shrink lists
    ( r'^(\:+)[ \t][ \t]+(.*?)$', r'\1 \2' ),  # shrink indents
    ( r'(?i)\[\[Image:', u'[[Obrázok:' ),  # Image -> Obrázok
    ( r'(?i)\[\[Category:', u'[[Kategória:' ),  # Category -> Kategória
    _image_option_rule( 'thumb', u'náhľad' ),  # thumb -> náhľad
    _image_option_rule( 'left', u'vľavo' ),  # left -> vľavo
    _image_option_rule( 'center', u'stred' ),  # center -> stred
    _image_option_rule( 'right', u'vpravo' ),  # right -> vpravo
    ( r'\<br\>', r'<br />' ),  # <br> -> <br />
]
# unichr() only exists on Python 2; on Python 3 chr() does the same job.
try:
    _unichr = unichr
except NameError:
    _unichr = chr


class AtomobotCosmetics( object ):
    """Apply cosmetic clean-ups to a piece of wiki text.

    Two kinds of change are made: HTML character references are replaced
    by the characters they denote, and the module-level ``replacements``
    rules are applied in order.  Counters of the changes made accumulate
    on the instance across calls.
    """

    def __init__( self ):
        self.stat_numeric_elements = 0
        self.stat_named_elements = 0
        self.stat_replacements = 0
        self.compiled = []
        self.prepare()

    def prepare( self ):
        """Compile the entity patterns and every replacement rule.

        The rule list is rebuilt from scratch, so calling this again does
        not leave duplicate rules behind.
        """
        self.re_numeric_elem = re.compile( numeric_elem_skelet )
        self.re_named_elem = re.compile( named_elem_skelet )
        self.compiled = [
            ( re.compile( pat, re.MULTILINE ), repl )
            for pat, repl in replacements
        ]

    def numerical_deelementize( self, data ):
        """Return ``data`` with decimal references (``&#NNN;``) decoded.

        A reference whose number is not a valid code point is left in the
        text unchanged and is not counted.
        """
        def decode( mat ):
            try:
                char = _unichr( int( mat.group( 1 ) ) )
            except ( ValueError, OverflowError ):
                # Number outside the range the interpreter can represent.
                return mat.group( 0 )
            self.stat_numeric_elements += 1
            return char

        return self.re_numeric_elem.sub( decode, data )

    def alphabetical_deelementize( self, data ):
        """Return ``data`` with known named references (``&name;``) decoded.

        Only names present in ``named_elements`` are replaced; any other
        reference is left in the text unchanged and is not counted.
        """
        def decode( mat ):
            char = named_elements.get( mat.group( 1 ) )
            if not char:
                return mat.group( 0 )
            self.stat_named_elements += 1
            return char

        return self.re_named_elem.sub( decode, data )

    def deelementize( self, data ):
        """Decode numeric references first, then named ones.

        NOTE: the second pass scans the output of the first, so a numeric
        reference that decodes to "&" can complete a named reference that
        is then decoded as well.
        """
        data = self.numerical_deelementize( data )
        return self.alphabetical_deelementize( data )

    def do_replacements( self, data ):
        """Apply every compiled rule in order and return the result."""
        for pat, repl in self.compiled:
            data, count = pat.subn( repl, data )
            self.stat_replacements += count
        return data

    def print_stats( self ):
        """Print each non-zero counter on its own line."""
        counters = (
            ( '%d numeric elems', self.stat_numeric_elements ),
            ( '%d named elems', self.stat_named_elements ),
            ( '%d replacements', self.stat_replacements ),
        )
        for template, count in counters:
            if count:
                # Single parenthesised argument: prints the same text on
                # Python 2 (print statement) and Python 3 (print function).
                print( template % count )

    def cosmetize( self, data ):
        """Decode entities, apply the rules, print the stats, return the text."""
        data = self.deelementize( data )
        data = self.do_replacements( data )
        self.print_stats()
        return data