Redaktor:Atomobot/atomobot makeindex.py
Vzhled
#!/usr/bin/python2.4 # -*- coding: utf-8 -*- from datetime import date, datetime, timedelta import re import wikipedia import catlib import atomobot_settings as config from atomobot_date import AtomobotDate from atomobot_language import AtomobotLanguageSlovak from atomobot_misc import dequotize, dewikize lang = AtomobotLanguageSlovak() class AtomobotIndex( object ): def __init__( self ): self.all_arts = set() self.old_links = set() def matches_list( self, text, lst ): for rexp in lst: m = rexp.match( text ) if m: return True return False def make_index_letter( self, deco ): letter = lang.letters[ deco[0] ] if letter in ' !"#$.,-()': return u'!' return letter.upper() def generate_index( self ): site = self.site p = wikipedia.Page( site, self.portal_data[ 'categorization' ] ) data = p.get() lines = data.split( '\n' ) lines = [ line for line in lines if line[:1] in ( '*', ':' ) ] re1 = re.compile( ur'.*\[\[\:%s:(.*)\]\]' % config.kategoria ) katset = set() for line in lines: m = re1.match( line ) if not m: continue groups = m.groups() if not groups: continue name = groups[0] katset.add( name ) s = set() for kat in sorted( katset ): try: p = catlib.Category( site, u'%s:%s' % ( config.kategoria, kat ) ) arts = p.articles() except KeyboardInterrupt: raise except Exception, e: print str( e ) continue except: print u"Problem with category '%s'" % kat continue for art in arts: s.add( art.title() ) exclude_list_compiled = [] for item in config.index_exclude_list: exclude_list_compiled.append( re.compile( item ) ) arts_by_alpha = {} for art in s: if not art: continue if self.matches_list( art, exclude_list_compiled ): continue self.all_arts.add( art ) deco = lang.decompose( art ) alpha = arts_by_alpha.setdefault( self.make_index_letter( deco ), [] ) alpha.append( ( deco, art ) ) letters = [ ( lang.decompose( key ), key ) for key in arts_by_alpha.keys() ] letters.sort() lets = [ letter for deco, letter in letters ] zoznam = self.portal_data[ 'list' ] data = u'\n'.join( config.index_index_body % { 'letter': letter, 'zoznam': zoznam } for letter in lets ) + '\n' data = config.index_index_head + data + config.index_index_tail p = wikipedia.Page( site, self.portal_data[ 'index' ] ) p.put( data, lang.TEXT_UPDATE ) self.arts_by_alpha = arts_by_alpha self.lets = lets self.grand_total = len( s ) def generate_list( self, letter, arts ): p = wikipedia.Page( self.site, u'%s/%s' % ( self.portal_data[ 'list' ], letter ) ) try: olddata = p.get() links_to = p.linkedPages() self.old_links.update( [ item.title() for item in links_to ] ) except wikipedia.NoPage: olddata = u'' twoletters = {} for deco, art in arts: alpha = twoletters.setdefault( tuple( deco[:2] ), [] ) alpha.append( ( deco, art ) ) firsts = [ key for key in twoletters.keys() ] firsts.sort() body = u'' for twolet in firsts: subarts = twoletters.get( twolet, None ) if not subarts: continue body += u'=== %s ===\n' % ( lang.compose( twolet ).lower() ) final_subarts = [ u'[[%s]]' % item for deco, item in sorted( subarts ) ] body += self.portal_data[ 'list separator' ].join( final_subarts ) + '\n\n' head = config.index_list_head % { 'number': len( arts ), 'clankov': lang.plural( len( arts ), 'článok' ), 'kategorizacia': self.portal_data[ 'categorization' ], 'date': date.today().strftime( '%d.%m.%Y' ) } tail = u"""\n\n[[%s]]""" % self.portal_data[ 'lists' ] data = head + body + tail if data != olddata: p.put( data, lang.TEXT_UPDATE ) def generate_lists( self ): for letter in self.lets: arts = self.arts_by_alpha.get( letter, None ) if not arts: continue self.generate_list( letter, arts ) def generate_last_actualization( self ): now = date.today() data = u'%s. %s - %s %s' % ( now.day, lang.MONTH_NAME[ now.month ], self.grand_total, lang.plural( self.grand_total, 'článok' ) ) p = wikipedia.Page( self.site, self.portal_data[ 'last update' ] ) p.put( data, lang.TEXT_UPDATE ) def load_settings( self, pagename, defaults=None ): settings = {} if defaults: settings.update( defaults ) p_settings = wikipedia.Page( self.site, pagename ) try: data = p_settings.get() except wikipedia.NoPage: return settings lines = data.split( '\n' ) re1 = re.compile( ur'^\*+\s*(.*)\b\s*\=\s*(.*)$' ) for line in lines: m = re1.match( line ) if not m: continue groups = m.groups() if len( groups ) < 2: continue key = groups[0].lower() value = groups[1] settings[ key ] = value return settings def load_portal_settings( self, portal ): defaults = {} defaults[ u'separátor najnovších' ] = u' · ' defaults[ u'separátor zoznamu' ] = u' · ' defaults[ u'počet najnovších' ] = u'40' settings = self.load_settings( u'Portál:%s/Nastavenia' % portal, defaults ) settings[ 'index' ] = dewikize( settings[ u'index' ] ) settings[ 'categorization' ] = dewikize( settings[ u'kategorizácia' ] ) settings[ 'last update' ] = dewikize( settings[ u'posledná aktualizácia' ] ) settings[ 'list' ] = dequotize( settings[ u'zoznam' ] ) settings[ 'list separator' ] = dequotize( settings[ u'separátor zoznamu' ] ) settings[ 'lists' ] = dewikize( settings[ u'zoznamy' ] )[ 1: ] settings[ 'newest articles' ] = dewikize( settings[ u'najnovšie články' ] ) settings[ 'newest count' ] = int( settings[ u'počet najnovších' ] ) settings[ 'newest separator' ] = dequotize( settings[ u'separátor najnovších' ] ) self.portal_data = settings def generate_newest( self ): p_naj = wikipedia.Page( self.site, self.portal_data[ 'newest articles' ] ) links = p_naj.linkedPages() arts = [ item.title() for item in links ] old_newest_arts = set( arts ) new_ones = self.all_arts.difference( self.old_links ).difference( old_newest_arts ) real_new_ones = [] now = datetime.today() for arttitle in sorted( new_ones ): try: p = wikipedia.Page( self.site, arttitle ) if p.isRedirectPage(): continue history = p.getVersionHistory() lasttime = history[-1][0] dt = self.atomodate.wiki2python( lasttime ) if dt + timedelta( days=7 ) < now: continue real_new_ones.append( ( dt, arttitle ) ) except KeyboardInterrupt: raise except Exception, e: print str( e ) pass except: print 'UNKNOWN EXCEPTION' pass real_new_ones.sort() comment_atomobot = u'Atomobot :: ' comment_added = u'' for dt, arttitle in real_new_ones: arts.insert( 0, arttitle ) comment_added = "+%s %s" % ( arttitle, comment_added ) separator = self.portal_data[ 'newest separator' ] pocet = self.portal_data[ 'newest count' ] arts_removed = arts[ pocet: ] arts[ pocet: ] = [] comment_removed = u' '.join( [ u'-%s' % item for item in arts_removed ] ) arts = [ u'[[%s]]' % item for item in arts ] new_arts_data = separator.join( arts ) comment = comment_atomobot + comment_added + comment_removed comment = comment.strip() print comment p_naj.put( new_arts_data, comment ) def run( self, portal ): self.atomodate = AtomobotDate( lang ) self.site = wikipedia.getSite() self.site.forceLogin() self.load_portal_settings( portal ) self.generate_index() self.generate_lists() self.generate_last_actualization() self.generate_newest() wikipedia.stopme() def main(): for portal in config.index_portals: AtomobotIndex().run( portal ) main()