Preskočiť na obsah

Redaktor:Atomobot/atomobot language.py

z Wikipédie, slobodnej encyklopédie
#!/usr/bin/python2.4
# -*- coding: utf-8  -*-


class AtomobotLanguageError( Exception ):
    """Exception class defined for this module's language helpers.

    It adds nothing to ``Exception``; it only provides a distinct type.
    """



class LanguageText( object ):
    """Text that is compared and hashed through a language's collation.

    ``lang`` must provide ``decompose( text )`` (returning a sequence of
    collation numbers) and a ``code`` attribute (used by ``repr``).  Two
    instances are equal when their decomposed sequences are equal; a plain
    string on the other side is decomposed with this instance's language
    before comparing.

    Equality against any other kind of object is simply ``False``; ordering
    against such an object raises ``TypeError``.  Note that the hash only
    matches other ``LanguageText`` objects, not the plain strings that may
    compare equal.
    """

    def __init__( self, lang, text ):
        self.lang = lang
        self.text = text
        # Frozen as a tuple so it can be used as both hash and sort key.
        self.decomposed = tuple( self.lang.decompose( text ) )


    def __str__( self ):
        return self.text


    def __repr__( self ):
        return "LanguageText('%s', %s)" % ( self.lang.code, repr( self.text ) )


    def __hash__( self ):
        return hash( self.decomposed )


    def _key_for( self, other ):
        """Return the collation key of ``other``, or None if not comparable."""
        if isinstance( other, LanguageText ):
            return other.decomposed
        # ``basestring`` only exists on Python 2; fall back to ``str`` elsewhere.
        try:
            string_types = basestring
        except NameError:
            string_types = str
        if isinstance( other, string_types ):
            return tuple( self.lang.decompose( other ) )
        return None


    def _ordering_key( self, other ):
        """Like ``_key_for`` but raise TypeError for non-comparable objects."""
        key = self._key_for( other )
        if key is None:
            raise TypeError( 'cannot compare LanguageText with %s'
                             % type( other ).__name__ )
        return key


    def __eq__( self, other ):
        key = self._key_for( other )
        if key is None:
            # An unrelated object is never equal; do not raise, so that
            # ``x == None`` and mixed-key dict lookups stay safe.
            return False
        return self.decomposed == key


    def __ne__( self, other ):
        return not self.__eq__( other )


    def __lt__( self, other ):
        return self.decomposed < self._ordering_key( other )


    def __le__( self, other ):
        return self.decomposed <= self._ordering_key( other )


    def __gt__( self, other ):
        return self.decomposed > self._ordering_key( other )


    def __ge__( self, other ):
        return self.decomposed >= self._ordering_key( other )


    def __cmp__( self, other ):
        """Three-way comparison kept for callers that invoke it explicitly.

        Returns -1, 0 or 1; raises ``TypeError`` when ``other`` is neither a
        ``LanguageText`` nor a string.
        """
        key = self._ordering_key( other )
        return ( self.decomposed > key ) - ( self.decomposed < key )



class AtomobotLanguage( object ):
    """Base class holding a language's collation table and plural forms.

    Collation units ("letters", which may span several characters) are
    registered in sort order with ``add_letter``; each one receives the next
    number from ``new_number``.  Once ``tidy_letters`` has been called,
    ``decompose`` converts text into the list of those numbers, so ordinary
    sequence comparison gives the language's ordering.  Number 0 (letter
    ``u'?'``) stands for any character that was not registered.
    """

    # name -> sequence of plural forms indexed by count; index 5 is used for
    # every count of five or more (see ``plural``).
    plurals = {}


    def __init__( self ):
        self.number = 0                 # last collation number handed out
        self.letters = [ u'?' ]         # number -> letter; slot 0 = unknown
        self.number_by_letter = {}      # letter -> number
        self.cgroups_by_first = {}      # first char -> { length: [ letters ] }
        self.letters_by_first = {}      # first char -> letters, longest first
        self.ch_types = {}              # ch_type -> set of letters


    def new_number( self ):
        """Advance the collation counter and return the new value."""
        self.number += 1
        return self.number


    def plural( self, number, name ):
        """Return the form of ``name`` matching the count ``number``.

        ``name`` itself is returned when no forms are registered for it.
        The sign of ``number`` is ignored and counts of five or more share
        the form at index 5.
        """
        forms = self.plurals.get( name, None )
        if not forms:
            return name
        count = abs( number )
        if count >= 5:
            return forms[5]
        return forms[ count ]


    def add_letter( self, letter, ch_type='letter' ):
        """Register ``letter`` as the next collation unit.

        ``letter`` must be non-empty (its first character is used as an
        index key).  ``ch_type`` only groups the letter in ``ch_types``.
        Call ``tidy_letters`` afterwards so ``decompose`` sees the letter.
        """
        number = self.new_number()
        self.letters.append( letter )
        self.number_by_letter[ letter ] = number
        by_length = self.cgroups_by_first.setdefault( letter[0], {} )
        by_length.setdefault( len( letter ), [] ).append( letter )
        self.ch_types.setdefault( ch_type, set() ).add( letter )


    def tidy_letters( self ):
        """Rebuild ``letters_by_first`` from the registered letters.

        For every first character the candidates are listed longest first,
        so ``decompose`` prefers multi-character units.  The lists are built
        from scratch, which makes repeated calls harmless.
        """
        for first, by_length in self.cgroups_by_first.items():
            ordered = []
            for size in sorted( by_length.keys(), reverse=True ):
                ordered.extend( by_length[ size ] )
            self.letters_by_first[ first ] = ordered


    def decompose( self, text ):
        """Return the list of collation numbers for ``text``.

        The text is upper-cased first.  At each position the longest
        registered letter that matches is consumed; a character with no
        matching letter contributes 0 and is skipped.
        """
        text = text.upper()
        decomposed = []
        pos = 0
        length = len( text )
        while pos < length:
            matched = None
            for candidate in self.letters_by_first.get( text[ pos ], () ):
                # Slice comparison checks the whole candidate and copes with
                # candidates that would run past the end of the text.
                if text[ pos : pos + len( candidate ) ] == candidate:
                    matched = candidate
                    break
            if matched is None:
                # Always advance, even when candidates exist but none fits,
                # otherwise the loop would never terminate.
                decomposed.append( 0 )
                pos += 1
            else:
                decomposed.append( self.number_by_letter[ matched ] )
                pos += len( matched )
        return decomposed


    def compose( self, decotext ):
        """Join the letters for the collation numbers in ``decotext``."""
        return u''.join( [ self.letters[ number ] for number in decotext ] )


    def compare_ci( self, text1, text2 ):
        """Compare two texts by collation; return -1, 0 or 1.

        Case is ignored because ``decompose`` upper-cases its input.
        """
        dec1 = self.decompose( text1 )
        dec2 = self.decompose( text2 )
        return ( dec1 > dec2 ) - ( dec1 < dec2 )




class AtomobotLanguageSlovak( AtomobotLanguage ):
    """Slovak wording, number formatting and collation table."""

    code = 'sk'

    # Plural forms indexed by count 0..5 (index 5 = five or more).
    # NOTE(review): the keys are plain (non-``u``) literals while the values
    # are unicode; under Python 2 a unicode ``name`` would presumably not
    # match these keys -- confirm what callers pass to ``plural``.
    plurals = {
        'článok': ( u'článkov', u'článok', u'články', u'články', u'články', u'článkov' ),
        'kategória': ( u'kategórií', u'kategória', u'kategórie', u'kategórie', u'kategórie', u'kategórií' ),
        }

    TEXT_UPDATE = u'Atomobot :: aktualizácia'

    # Month number -> month name (nominative).
    MONTH_NAME = {
        1: u'január', 2: u'február', 3: u'marec', 4: u'apríl', 5: u'máj', 6: u'jún', 7: u'júl',
        8: u'august', 9: u'september', 10: u'október', 11: u'november', 12: u'december' }

    # Month number -> month name (genitive).
    MONTH_NAME_GEN = {
        1: u'januára', 2: u'februára', 3: u'marca', 4: u'apríla', 5: u'mája', 6: u'júna', 7: u'júla',
        8: u'augusta', 9: u'septembra', 10: u'októbra', 11: u'novembra', 12: u'decembra' }


    def __init__( self ):
        super( AtomobotLanguageSlovak, self ).__init__()
        self.init_collated_letters()
        self.tidy_letters()


    def format_number( self, number, places=0 ):
        """Format ``number`` with ``places`` decimals.

        The integer part is grouped in threes separated by a space and the
        decimal mark is a comma, e.g. ``1 234 567,89``.  ``number`` is
        converted with ``float()`` first.  A negative value that rounds to
        zero is shown without a minus sign.
        """
        numberstr = ( '%%.%sf' % places ) % float( number )
        minus = numberstr.startswith( '-' )
        if minus:
            numberstr = numberstr[ 1: ]
            # Only zeros (and the decimal point) left: the value rounded to
            # zero, so a sign would be misleading.
            if not numberstr.strip( '0.' ):
                minus = False
        parts = numberstr.split( '.', 1 )
        pre = parts[0]
        post = ''
        if len( parts ) > 1:
            post = parts[1]
        # Peel groups of three characters off the right-hand end.
        groups = []
        while pre:
            groups.insert( 0, pre[ -3: ] )
            pre = pre[ :-3 ]
        final_str = ' '.join( groups )
        if minus:
            final_str = '-' + final_str
        if post:
            final_str += ',' + post
        return final_str


    def init_collated_letters( self ):
        """Register symbols, digits and letters in collation order.

        The order of the entries below is the sort order; multi-character
        units (``DZ``, ``DŽ``, ``CH``) are registered as single letters.
        """
        symbols = ( u' ', u'!', u'"', u'#', u'$', u'.', u',', u'-', u'(', u')' )
        digits = u'0123456789'
        letters = (
            u'A', u'Á', u'Â', u'Ä', u'Å', u'Æ',
            u'B',
            u'C', u'Ć', u'Č', u'Ç',
            u'D', u'Ď', u'DZ', u'DŽ', u'Ð',
            u'E', u'É', u'Ě', u'Ę',
            u'F',
            u'G',
            u'H', u'CH',
            u'I', u'Ì', u'Í', u'Î', u'Ï', u'Į',
            u'J',
            u'K',
            u'L', u'Ĺ', u'Ľ', u'Ł',
            u'M',
            u'N', u'Ń', u'Ň', u'Ñ',
            u'O', u'Ó', u'Ô', u'Ö', u'Ő', u'Œ', u'Ø',
            u'P',
            u'Q',
            u'R', u'Ŕ', u'Ř',
            u'S', u'Ś', u'Š', u'Ş',
            u'T', u'Ť',
            u'U', u'Ú', u'Ů', u'Ü', u'Ű',
            u'V',
            u'W',
            u'X',
            u'Y', u'Ý',
            u'Z', u'Ź', u'Ž',
            )
        for symbol in symbols:
            self.add_letter( symbol, 'symbol' )
        for digit in digits:
            self.add_letter( digit, 'number' )
        for letter in letters:
            self.add_letter( letter )