Redaktor:Atomobot/atomobot makeindex.py

#!/usr/bin/python2.4
# -*- coding: utf-8  -*-
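"""Atomobot index generator: rebuilds a portal's article index, per-letter
lists, 'last update' note and 'newest articles' box."""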

from datetime import date, datetime, timedelta
import re
import wikipedia
import catlib
import atomobot_settings as config
from atomobot_date import AtomobotDate
from atomobot_language import AtomobotLanguageSlovak
from atomobot_misc import dequotize, dewikize


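# Slovak language helpers: alphabet decomposition, plural forms, month names, edit summary.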
lang = AtomobotLanguageSlovak()



class AtomobotIndex( object ):
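    """Builds the alphabetical article index for one portal: collects titles
    from the categories on the portal's categorization page, then regenerates
    the index page, the per-letter lists, the 'last update' note and the
    'newest articles' box."""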

    def __init__( self ):
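        # all_arts: every indexed article title seen in generate_index();
        # old_links: titles already linked from the existing list pages.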
        self.all_arts = set()
        self.old_links = set()


    def matches_list( self, text, lst ):
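        """Return True if text matches any of the compiled regexes in lst."""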
        for rexp in lst:
            m = rexp.match( text )
            if m:
                return True
        return False


    def make_index_letter( self, deco ):
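        """Map a decomposed title to its index letter: space and punctuation
        map to u'!', anything else to the upper-cased letter."""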
        letter = lang.letters[ deco[0] ]
        if letter in ' !"#$.,-()':
            return u'!'
        return letter.upper()


    def generate_index( self ):
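        """Collect article titles from the categories linked on the portal's
        categorization page, drop excluded titles, group the rest by index
        letter and rewrite the index page.  Fills self.arts_by_alpha,
        self.lets and self.grand_total for the later steps."""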
        site = self.site
        p = wikipedia.Page( site, self.portal_data[ 'categorization' ] )
        data = p.get()
        lines = data.split( '\n' )
        lines = [ line for line in lines if line[:1] in ( '*', ':' ) ]
        re1 = re.compile( ur'.*\[\[\:%s:(.*)\]\]' % config.kategoria )
        katset = set()
        for line in lines:
            m = re1.match( line )
            if not m:
                continue
            groups = m.groups()
            if not groups:
                continue
            name = groups[0]
            katset.add( name )

        s = set()

        for kat in sorted( katset ):
            try:
                p = catlib.Category( site, u'%s:%s' % ( config.kategoria, kat ) )
                arts = p.articles()
            except KeyboardInterrupt:
                raise
            except Exception, e:
                print str( e )
                continue
            except:
                print u"Problem with category '%s'" % kat
                continue
            for art in arts:
                s.add( art.title() )

        exclude_list_compiled = []
        for item in config.index_exclude_list:
            exclude_list_compiled.append( re.compile( item ) )

        arts_by_alpha = {}

        for art in s:
            if not art:
                continue
            if self.matches_list( art, exclude_list_compiled ):
                continue
            self.all_arts.add( art )
            deco = lang.decompose( art )
            alpha = arts_by_alpha.setdefault( self.make_index_letter( deco ), [] )
            alpha.append( ( deco, art ) )

        letters = [ ( lang.decompose( key ), key ) for key in arts_by_alpha.keys() ]
        letters.sort()
        
        lets = [ letter for deco, letter in letters ]
        
        zoznam = self.portal_data[ 'list' ]
        
        data = u'\n'.join( config.index_index_body % { 'letter': letter, 'zoznam': zoznam } for letter in lets ) + '\n'
        data = config.index_index_head + data + config.index_index_tail

        p = wikipedia.Page( site, self.portal_data[ 'index' ] )
        p.put( data, lang.TEXT_UPDATE )
        
        self.arts_by_alpha = arts_by_alpha
        self.lets = lets
        self.grand_total = len( s )


    def generate_list( self, letter, arts ):
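        """Rewrite the list page '<list>/<letter>', grouping the articles
        under two-letter '===' headings.  Titles linked from the old page are
        remembered in self.old_links; the page is saved only if its content
        changed."""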
        p = wikipedia.Page( self.site, u'%s/%s' % ( self.portal_data[ 'list' ], letter ) )
        try:
            olddata = p.get()
            links_to = p.linkedPages()
            self.old_links.update( [ item.title() for item in links_to ] )
        except wikipedia.NoPage:
            olddata = u''
        
        twoletters = {}
        for deco, art in arts:
            alpha = twoletters.setdefault( tuple( deco[:2] ), [] )
            alpha.append( ( deco, art ) )
        
        firsts = sorted( twoletters.keys() )
        
        body = u''
        for twolet in firsts:
            subarts = twoletters.get( twolet, None )
            if not subarts:
                continue
            body += u'=== %s ===\n' % ( lang.compose( twolet ).lower() )
            final_subarts = [ u'[[%s]]' % item for deco, item in sorted( subarts ) ]
            body += self.portal_data[ 'list separator' ].join( final_subarts ) + '\n\n'

        head = config.index_list_head % {
            'number': len( arts ),
            'clankov': lang.plural( len( arts ), u'článok' ),
            'kategorizacia': self.portal_data[ 'categorization' ],
            'date': date.today().strftime( '%d.%m.%Y' ) }
        tail = u"""\n\n[[%s]]""" % self.portal_data[ 'lists' ]
        
        data = head + body + tail
        
        if data != olddata:
            p.put( data, lang.TEXT_UPDATE )


    def generate_lists( self ):
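        """Regenerate the list page for every index letter."""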
        for letter in self.lets:
            arts = self.arts_by_alpha.get( letter, None )
            if not arts:
                continue
            self.generate_list( letter, arts )


    def generate_last_actualization( self ):
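        """Write today's date and the total article count to the portal's
        'last update' page."""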
        now = date.today()
        data = u'%s. %s - %s %s' % (
            now.day,
            lang.MONTH_NAME[ now.month ],
            self.grand_total,
            lang.plural( self.grand_total, u'článok' ) )
        p = wikipedia.Page( self.site, self.portal_data[ 'last update' ] )
        p.put( data, lang.TEXT_UPDATE )


    def load_settings( self, pagename, defaults=None ):
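        """Parse a wiki settings page: lines of the form '* key = value' are
        collected into a dict (keys lower-cased) on top of the optional
        defaults."""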
        settings = {}
        if defaults:
            settings.update( defaults )
        p_settings = wikipedia.Page( self.site, pagename )
        try:
            data = p_settings.get()
        except wikipedia.NoPage:
            return settings
        
        lines = data.split( '\n' )
        
        re1 = re.compile( ur'^\*+\s*(.*)\b\s*\=\s*(.*)$' )
        
        for line in lines:
            m = re1.match( line )
            if not m:
                continue
            groups = m.groups()
            if len( groups ) < 2:
                continue
            key = groups[0].lower()
            value = groups[1]
            settings[ key ] = value
        
        return settings


    def load_portal_settings( self, portal ):
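        """Load 'Portál:<portal>/Nastavenia' and map the Slovak setting names
        to the English keys used by the rest of the code."""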
        defaults = {}
        defaults[ u'separátor najnovších' ] = u' · '
        defaults[ u'separátor zoznamu' ] = u' · '
        defaults[ u'počet najnovších' ] = u'40'
        settings = self.load_settings( u'Portál:%s/Nastavenia' % portal, defaults )
        
        settings[ 'index' ] = dewikize( settings[ u'index' ] )
        settings[ 'categorization' ] = dewikize( settings[ u'kategorizácia' ] )
        settings[ 'last update' ] = dewikize( settings[ u'posledná aktualizácia' ] )
        settings[ 'list' ] = dequotize( settings[ u'zoznam' ] )
        settings[ 'list separator' ] = dequotize( settings[ u'separátor zoznamu' ] )
        settings[ 'lists' ] = dewikize( settings[ u'zoznamy' ] )[ 1: ]
        settings[ 'newest articles' ] = dewikize( settings[ u'najnovšie články' ] )
        settings[ 'newest count' ] = int( settings[ u'počet najnovších' ] )
        settings[ 'newest separator' ] = dequotize( settings[ u'separátor najnovších' ] )
        
        self.portal_data = settings


    def generate_newest( self ):
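        """Update the 'newest articles' page: indexed titles not yet linked
        from the lists or the box are checked against their page history, and
        only recently created ones (no more than seven days old) are
        prepended; the list is then trimmed to 'newest count' entries."""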
        p_naj = wikipedia.Page( self.site, self.portal_data[ 'newest articles' ] )
        links = p_naj.linkedPages()
        arts = [ item.title() for item in links ]
        old_newest_arts = set( arts )
        
        new_ones = self.all_arts.difference( self.old_links ).difference( old_newest_arts )
        real_new_ones = []
        
        now = datetime.today()
        
        for arttitle in sorted( new_ones ):
            try:
                p = wikipedia.Page( self.site, arttitle )
                if p.isRedirectPage():
                    continue
                history = p.getVersionHistory()
                lasttime = history[-1][0]
                dt = self.atomodate.wiki2python( lasttime )
                if dt + timedelta( days=7 ) < now:
                    continue
                real_new_ones.append( ( dt, arttitle ) )
            except KeyboardInterrupt:
                raise
            except Exception, e:
                print str( e )
            except:
                # Catches old-style exceptions not derived from Exception (Python 2.4).
                print 'UNKNOWN EXCEPTION'
        
        real_new_ones.sort()
        
        comment_atomobot = u'Atomobot :: '
        comment_added = u''
        
        for dt, arttitle in real_new_ones:
            arts.insert( 0, arttitle )
            comment_added = "+%s %s" % ( arttitle, comment_added )

        separator = self.portal_data[ 'newest separator' ]
        pocet = self.portal_data[ 'newest count' ]
        
        arts_removed = arts[ pocet: ]
        arts[ pocet: ] = []
        
        comment_removed = u' '.join( [ u'-%s' % item for item in arts_removed ] )
        
        arts = [ u'[[%s]]' % item for item in arts ]
        
        new_arts_data = separator.join( arts )
        comment = comment_atomobot + comment_added + comment_removed
        comment = comment.strip()
        
        print comment
        p_naj.put( new_arts_data, comment )


    def run( self, portal ):
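        """Log in, load the portal settings and run all generation steps for
        one portal."""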
        self.atomodate = AtomobotDate( lang )
        self.site = wikipedia.getSite()
        self.site.forceLogin()
        self.load_portal_settings( portal )
        self.generate_index()
        self.generate_lists()
        self.generate_last_actualization()
        self.generate_newest()
        wikipedia.stopme()



def main():
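    """Generate the indexes for every portal in config.index_portals."""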
    for portal in config.index_portals:
        AtomobotIndex().run( portal )


if __name__ == '__main__':
    main()