#!/usr/bin/python2.4
# -*- coding: utf-8 -*-
from datetime import date, datetime, timedelta
import re
import wikipedia
import catlib
import atomobot_settings as config
from atomobot_date import AtomobotDate
from atomobot_language import AtomobotLanguageSlovak
from atomobot_misc import dequotize, dewikize
# Module-wide language helper (Slovak): supplies collation decomposition,
# letter tables, pluralization, month names and edit-summary text used below.
lang = AtomobotLanguageSlovak()
class AtomobotIndex( object ):
    """Maintain per-portal article indexes on a MediaWiki site.

    For one portal the bot (driven by ``run``):
      1. reads the portal's "categorization" page to find its categories,
      2. collects all article titles from those categories,
      3. writes an alphabetical index page plus one list page per letter,
      4. updates a "last update" page with date and article count,
      5. updates a "newest articles" page with recently created articles.

    State shared between the steps:
      all_arts  -- set of all article titles kept after exclusion filtering
      old_links -- titles already linked from the existing per-letter lists
    """

    def __init__( self ):
        # Filled by generate_index(); read by generate_newest().
        self.all_arts = set()
        # Filled by generate_list(); read by generate_newest().
        self.old_links = set()

    def matches_list( self, text, lst ):
        """Return True if any compiled regex in `lst` matches `text` (anchored at start)."""
        for rexp in lst:
            m = rexp.match( text )
            if m:
                return True
        return False

    def make_index_letter( self, deco ):
        """Map a decomposed title `deco` to its index letter.

        `deco` is the sequence produced by lang.decompose(); its first element
        indexes lang.letters.  Titles starting with space or punctuation are
        bucketed under the special u'!' section.
        """
        letter = lang.letters[ deco[0] ]
        if letter in ' !"#$.,-()':
            return u'!'
        return letter.upper()

    def generate_index( self ):
        """Collect all portal articles and (re)write the index page.

        Side effects: writes the index wiki page, and stores
        arts_by_alpha / lets / grand_total on self for later steps.
        """
        site = self.site
        # The "categorization" page lists the portal's categories as wiki links.
        p = wikipedia.Page( site, self.portal_data[ 'categorization' ] )
        data = p.get()
        lines = data.split( '\n' )
        # Only bullet/indent lines can carry category links.
        lines = [ line for line in lines if line[:1] in ( '*', ':' ) ]
        # Extract the category name from links like [[:<kategoria>:Name]].
        # NOTE(review): greedy '.*' takes the last such link on a line.
        re1 = re.compile( ur'.*\[\[\:%s:(.*)\]\]' % config.kategoria )
        katset = set()
        for line in lines:
            m = re1.match( line )
            if not m:
                continue
            groups = m.groups()
            if not groups:
                continue
            name = groups[0]
            katset.add( name )
        # Union of article titles over all categories.
        s = set()
        for kat in sorted( katset ):
            try:
                p = catlib.Category( site, u'%s:%s' % ( config.kategoria, kat ) )
                arts = p.articles()
            except KeyboardInterrupt:
                raise
            except Exception, e:
                # Best-effort: log and skip a broken category, keep going.
                print str( e )
                continue
            except:
                # NOTE(review): in Python 2 this only catches raises that do
                # not derive from Exception (e.g. string exceptions) -- likely
                # dead in practice; kept for safety.
                print u"Problem with category '%s'" % kat
                continue
            for art in arts:
                s.add( art.title() )
        # Compile the configured exclusion patterns once, up front.
        exclude_list_compiled = []
        for item in config.index_exclude_list:
            exclude_list_compiled.append( re.compile( item ) )
        # Bucket titles by index letter; each entry keeps (decomposition, title)
        # so buckets can later be sorted by collation order.
        arts_by_alpha = {}
        for art in s:
            if not art:
                continue
            if self.matches_list( art, exclude_list_compiled ):
                continue
            self.all_arts.add( art )
            deco = lang.decompose( art )
            alpha = arts_by_alpha.setdefault( self.make_index_letter( deco ), [] )
            alpha.append( ( deco, art ) )
        # Sort the section letters themselves in collation order.
        letters = [ ( lang.decompose( key ), key ) for key in arts_by_alpha.keys() ]
        letters.sort()
        lets = [ letter for deco, letter in letters ]
        zoznam = self.portal_data[ 'list' ]
        # Render the index page: head + one body line per letter + tail.
        data = u'\n'.join( config.index_index_body % { 'letter': letter, 'zoznam': zoznam } for letter in lets ) + '\n'
        data = config.index_index_head + data + config.index_index_tail
        p = wikipedia.Page( site, self.portal_data[ 'index' ] )
        p.put( data, lang.TEXT_UPDATE )
        # Stash results for generate_lists() / generate_last_actualization().
        self.arts_by_alpha = arts_by_alpha
        self.lets = lets
        self.grand_total = len( s )

    def generate_list( self, letter, arts ):
        """(Re)write the list subpage for one letter.

        `arts` is a list of (decomposition, title) pairs for that letter.
        Also records the page's existing outgoing links into self.old_links,
        which generate_newest() uses to tell old articles from new ones.
        Only writes the page when its content actually changed.
        """
        p = wikipedia.Page( self.site, u'%s/%s' % ( self.portal_data[ 'list' ], letter ) )
        try:
            olddata = p.get()
            links_to = p.linkedPages()
            self.old_links.update( [ item.title() for item in links_to ] )
        except wikipedia.NoPage:
            # First run for this letter: no previous content to diff against.
            olddata = u''
        # Sub-group the letter's articles by their first TWO collation units,
        # producing "=== xy ===" subsections.
        twoletters = {}
        for deco, art in arts:
            alpha = twoletters.setdefault( tuple( deco[:2] ), [] )
            alpha.append( ( deco, art ) )
        # NOTE(review): redundant copy -- twoletters.keys() already returns a
        # list in Python 2.
        firsts = [ key for key in twoletters.keys() ]
        firsts.sort()
        body = u''
        for twolet in firsts:
            subarts = twoletters.get( twolet, None )
            if not subarts:
                continue
            body += u'=== %s ===\n' % ( lang.compose( twolet ).lower() )
            # Sort by decomposition (collation order), render as wiki links.
            final_subarts = [ u'[[%s]]' % item for deco, item in sorted( subarts ) ]
            body += self.portal_data[ 'list separator' ].join( final_subarts ) + '\n\n'
        head = config.index_list_head % {
            'number': len( arts ),
            'clankov': lang.plural( len( arts ), 'článok' ),
            'kategorizacia': self.portal_data[ 'categorization' ],
            'date': date.today().strftime( '%d.%m.%Y' ) }
        tail = u"""\n\n[[%s]]""" % self.portal_data[ 'lists' ]
        data = head + body + tail
        if data != olddata:
            p.put( data, lang.TEXT_UPDATE )

    def generate_lists( self ):
        """Write one list subpage per index letter collected by generate_index()."""
        for letter in self.lets:
            arts = self.arts_by_alpha.get( letter, None )
            if not arts:
                continue
            self.generate_list( letter, arts )

    def generate_last_actualization( self ):
        """Write the 'last update' page: today's date and the total article count."""
        now = date.today()
        data = u'%s. %s - %s %s' % (
            now.day,
            lang.MONTH_NAME[ now.month ],
            self.grand_total,
            lang.plural( self.grand_total, 'článok' ) )
        p = wikipedia.Page( self.site, self.portal_data[ 'last update' ] )
        p.put( data, lang.TEXT_UPDATE )

    def load_settings( self, pagename, defaults=None ):
        """Parse key=value settings from a wiki page into a dict.

        Lines of the form "* key = value" are parsed; keys are lowercased.
        `defaults` (if given) seeds the dict and is overridden by page values.
        Returns the defaults unchanged when the page does not exist.
        """
        settings = {}
        if defaults:
            settings.update( defaults )
        p_settings = wikipedia.Page( self.site, pagename )
        try:
            data = p_settings.get()
        except wikipedia.NoPage:
            return settings
        lines = data.split( '\n' )
        # "* key = value"; NOTE(review): greedy '.*' means the LAST '=' on the
        # line splits key from value.
        re1 = re.compile( ur'^\*+\s*(.*)\b\s*\=\s*(.*)$' )
        for line in lines:
            m = re1.match( line )
            if not m:
                continue
            groups = m.groups()
            if len( groups ) < 2:
                continue
            key = groups[0].lower()
            value = groups[1]
            settings[ key ] = value
        return settings

    def load_portal_settings( self, portal ):
        """Load the portal's settings page and normalize it into self.portal_data.

        Slovak-named raw settings are converted (dewikize strips wiki-link
        markup, dequotize strips quoting) into the English keys the rest of
        the class reads.  Raises KeyError if a mandatory setting is missing.
        """
        defaults = {}
        defaults[ u'separátor najnovších' ] = u' · '
        defaults[ u'separátor zoznamu' ] = u' · '
        defaults[ u'počet najnovších' ] = u'40'
        settings = self.load_settings( u'Portál:%s/Nastavenia' % portal, defaults )
        settings[ 'index' ] = dewikize( settings[ u'index' ] )
        settings[ 'categorization' ] = dewikize( settings[ u'kategorizácia' ] )
        settings[ 'last update' ] = dewikize( settings[ u'posledná aktualizácia' ] )
        settings[ 'list' ] = dequotize( settings[ u'zoznam' ] )
        settings[ 'list separator' ] = dequotize( settings[ u'separátor zoznamu' ] )
        # [1:] drops the leading character -- presumably a ':' from a
        # category-style link; TODO confirm against the settings page format.
        settings[ 'lists' ] = dewikize( settings[ u'zoznamy' ] )[ 1: ]
        settings[ 'newest articles' ] = dewikize( settings[ u'najnovšie články' ] )
        settings[ 'newest count' ] = int( settings[ u'počet najnovších' ] )
        settings[ 'newest separator' ] = dequotize( settings[ u'separátor najnovších' ] )
        self.portal_data = settings

    def generate_newest( self ):
        """Update the 'newest articles' page.

        A candidate is "new" when it is in all_arts but neither already linked
        from the letter lists (old_links) nor already on the newest page.
        Candidates are kept only if they are not redirects and their oldest
        revision (history[-1]) is less than 7 days old; they are prepended in
        chronological order and the list is trimmed to the configured count.
        Must run AFTER generate_index() and generate_lists().
        """
        p_naj = wikipedia.Page( self.site, self.portal_data[ 'newest articles' ] )
        links = p_naj.linkedPages()
        arts = [ item.title() for item in links ]
        old_newest_arts = set( arts )
        new_ones = self.all_arts.difference( self.old_links ).difference( old_newest_arts )
        real_new_ones = []
        now = datetime.today()
        for arttitle in sorted( new_ones ):
            try:
                p = wikipedia.Page( self.site, arttitle )
                if p.isRedirectPage():
                    continue
                # history rows start with a timestamp; [-1] is the oldest
                # revision, i.e. the article's creation time.
                history = p.getVersionHistory()
                lasttime = history[-1][0]
                dt = self.atomodate.wiki2python( lasttime )
                if dt + timedelta( days=7 ) < now:
                    continue
                real_new_ones.append( ( dt, arttitle ) )
            except KeyboardInterrupt:
                raise
            except Exception, e:
                # Best-effort: skip articles whose history cannot be fetched.
                print str( e )
                pass
            except:
                # NOTE(review): reachable only for non-Exception raises in
                # Python 2; likely dead in practice.
                print 'UNKNOWN EXCEPTION'
                pass
        # Oldest first, so the newest ends up at the head after the inserts.
        real_new_ones.sort()
        comment_atomobot = u'Atomobot :: '
        comment_added = u''
        for dt, arttitle in real_new_ones:
            arts.insert( 0, arttitle )
            comment_added = "+%s %s" % ( arttitle, comment_added )
        separator = self.portal_data[ 'newest separator' ]
        pocet = self.portal_data[ 'newest count' ]
        # Trim to the configured length; record what fell off for the summary.
        arts_removed = arts[ pocet: ]
        arts[ pocet: ] = []
        comment_removed = u' '.join( [ u'-%s' % item for item in arts_removed ] )
        arts = [ u'[[%s]]' % item for item in arts ]
        new_arts_data = separator.join( arts )
        # Edit summary lists additions and removals.
        comment = comment_atomobot + comment_added + comment_removed
        comment = comment.strip()
        print comment
        p_naj.put( new_arts_data, comment )

    def run( self, portal ):
        """Run the full update pipeline for one portal (order matters)."""
        self.atomodate = AtomobotDate( lang )
        self.site = wikipedia.getSite()
        self.site.forceLogin()
        self.load_portal_settings( portal )
        self.generate_index()
        self.generate_lists()
        self.generate_last_actualization()
        self.generate_newest()
        wikipedia.stopme()
def main():
    """Regenerate the index pages for every portal listed in the configuration."""
    for portal in config.index_portals:
        AtomobotIndex().run( portal )

# Guard the entry point so importing this module does not immediately start
# editing the wiki; behavior when run as a script is unchanged.
if __name__ == '__main__':
    main()