Szerkesztő:Cherybot/linkify.py
Megjelenés
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Linkifies the first occurrence of a keyword in every article.

Usage: linkify.py [OPTIONS] <keyword>

Available options:
    -file:PAGELIST   list of articles to check
    -cat:CATEGORY    category to check
    -xml:XMLFILE     XML dump to check
    -google:QUERY    check the pages returned by a Google search
    -namespace:ID    namespace to work on (may be given several times)
"""
#
# hu:User:Chery, January 23, 2007
# Public domain.
#

import wikipedia
import pagegenerators
import sys
import re
import catlib

#
# TODO: the first letter of the keyword should also match an accented
#       lowercase letter.
#


class XmlDumpLinkifyPageGenerator:
    """Yield pages from an XML dump that are candidates for linkifying.

    An entry is a candidate when its text matches ``regex`` (the keyword
    occurs in it) and does not match ``exception`` (the keyword is not
    already linked).
    """

    def __init__(self, regex, exception, xmlfilename):
        """Store the two compiled patterns and the path of the XML dump."""
        self.regex = regex
        self.exception = exception
        self.xmlfilename = xmlfilename

    def __iter__(self):
        # Imported lazily so the module is only needed when -xml is used.
        import xmlreader
        mysite = wikipedia.getSite()
        dump = xmlreader.XmlDump(self.xmlfilename)
        for entry in dump.parse():
            if (self.regex.search(entry.text)
                    and not self.exception.search(entry.text)):
                yield wikipedia.Page(mysite, entry.title)


def _build_patterns(keyword):
    """Return ``(regex, exception)`` compiled for a literal ``keyword``.

    ``regex`` matches the keyword where the lookaheads do not see a closing
    ``]`` or ``}}`` ahead without an intervening opener; ``exception``
    matches an existing ``[[keyword]]`` or ``[[keyword|...`` link.

    The keyword is escaped so that regex metacharacters in it (for example
    in "C++" or "Foo (bar)") are matched literally.
    """
    literal = re.escape(keyword)
    exception = re.compile(r'\[\[' + literal + r'(\||\]\])')
    regex = re.compile(literal + r'(?![^\[]*?\])(?![^{]*?}})')
    return regex, exception


def inspect(gen, regex, exception, keyword):
    """Linkify the first match of ``regex`` on every page of ``gen``.

    Pages that are missing, locked, or already matched by ``exception`` are
    skipped. Each change is shown as a diff and saved only after the
    operator confirms it; answering "All" confirms every later change too.
    """
    msg = {
        'en': u'Robot: Linkifying [[%s]]',
        'hu': u'Robot: Hivatkozás erre: [[%s]]',
    }
    link = '[[' + keyword + ']]'
    acceptall = False
    for page in gen:
        try:
            textold = page.get()
            if not page.canBeEdited():
                wikipedia.output(u'Skipping locked page %s' % page.title())
                continue
        except wikipedia.NoPage:
            wikipedia.output(u'Page %s not found' % page.title())
            continue
        if exception.search(textold):
            wikipedia.output(u'Page %s already includes a link; skipping.'
                             % page.title())
            continue
        # A function replacement inserts the link verbatim; a template
        # string would interpret backslashes and group references that
        # happen to occur in the keyword.
        textnew = regex.sub(lambda match: link, textold, 1)
        if textnew == textold:
            wikipedia.output(u'No changes were necessary in %s' % page.title())
            continue
        # Colour only the title inside the '\n>>> ' ... ' <<<' frame.
        colors = [None] * 5 + [13] * len(page.title()) + [None] * 4
        wikipedia.output(u'\n>>> %s <<<' % page.title(), colors=colors)
        wikipedia.showDiff(textold, textnew)
        choice = None
        if not acceptall:
            choice = wikipedia.inputChoice(
                u'Do you want to accept these changes?',
                ['Yes', 'No', 'All'], ['y', 'N', 'a'], 'N')
            if choice in ['a', 'A']:
                acceptall = True
        if acceptall or choice in ['y', 'Y']:
            try:
                wikipedia.setAction(
                    wikipedia.translate(wikipedia.getSite(), msg) % keyword)
                page.put(textnew)
            except wikipedia.EditConflict:
                wikipedia.output(u'Skipping %s because of edit conflict'
                                 % page.title())
            except wikipedia.SpamfilterError as url:
                wikipedia.output(
                    u'Cannot change %s because of blacklist entry %s'
                    % (page.title(), url))


def main():
    """Parse the command line, build the page generator and run the bot."""
    xmlfilename = ''
    namespaces = []
    keyword = ''
    gen = None
    try:
        for arg in wikipedia.handleArgs():
            if arg.startswith('-namespace:'):
                namespaces.append(int(arg[11:]))
            elif arg.startswith('-file:'):
                gen = pagegenerators.TextfilePageGenerator(arg[6:])
            elif arg.startswith('-cat:'):
                cat = catlib.Category(wikipedia.getSite(),
                                      'Category:%s' % arg[5:])
                gen = pagegenerators.CategorizedPageGenerator(cat)
            elif arg.startswith('-xml:'):
                xmlfilename = arg[5:]
            elif arg.startswith('-google'):
                # Only build the generator when a query was actually given;
                # a bare "-google" used to hit an unbound variable.
                googlequery = arg[8:]
                if googlequery:
                    gen = pagegenerators.GoogleSearchPageGenerator(googlequery)
                else:
                    wikipedia.output(
                        u'No search query given for -google; ignoring it.')
            else:
                if keyword:
                    # A second positional argument: only one keyword allowed.
                    wikipedia.output(
                        u'Wrong number of arguments; check header for usage.')
                    sys.exit()
                keyword = arg

        if not keyword:
            wikipedia.output(
                u'Wrong number of arguments; check header for usage.')
            sys.exit()

        regex, exception = _build_patterns(keyword)

        # The XML generator needs the compiled patterns, so it is created
        # only after the keyword is known.
        if xmlfilename:
            gen = XmlDumpLinkifyPageGenerator(regex, exception, xmlfilename)

        if not gen:
            wikipedia.output(u'I was unable to generate a pagelist; exiting.')
            sys.exit()

        if namespaces:
            gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)

        pregen = pagegenerators.PreloadingGenerator(gen, pageNumber=50)
        inspect(pregen, regex, exception, keyword)
    finally:
        # Runs on every exit path, including the sys.exit() calls above.
        wikipedia.stopme()


if __name__ == "__main__":
    main()