Szerkesztő:Cherybot/linkify.py

#!/usr/bin/python
# -*- coding: utf-8  -*-
"""
Linkifies the first occurrence of a keyword in every article.

Usage: linkify.py [OPTIONS] <keyword>

Available options:
  -file:PAGELIST    list of articles to check
  -cat:CATEGORY     category to check
  -xml:XMLFILE      XML dump to check
  -google:QUERY     Google search results to check
  -namespace:ID     namespace to work on
"""
#
# hu:User:Chery, January 23, 2007
# Public domain.
#
import wikipedia
import pagegenerators
import sys, re
import catlib

#
# TODO: the start of the keyword should also be able to match an accented lowercase letter
#

class XmlDumpLinkifyPageGenerator:
    """
    Page generator that reads candidate pages from an XML dump.

    Iterating yields a wikipedia.Page for every dump entry whose text
    matches `regex` and does not match `exception`.
    """

    def __init__(self, regex, exception, xmlfilename):
        """
        Store the filter patterns and the path of the dump.

        regex        compiled pattern an entry's text has to match
        exception    compiled pattern an entry's text must not match
        xmlfilename  name of the XML dump file to read
        """
        self.xmlfilename = xmlfilename
        self.exception = exception
        self.regex = regex

    def __iter__(self):
        # xmlreader is only needed when a dump is actually iterated.
        import xmlreader

        site = wikipedia.getSite()
        for entry in xmlreader.XmlDump(self.xmlfilename).parse():
            # No occurrence of the keyword: nothing to linkify.
            if not self.regex.search(entry.text):
                continue
            # The excluded pattern is present: leave this entry out.
            if self.exception.search(entry.text):
                continue
            yield wikipedia.Page(site, entry.title)

def inspect(gen, regex, exception, keyword):
    """
    Link the first occurrence of keyword on every page produced by gen.

    For each page the text is fetched; pages whose text matches
    `exception` are skipped, otherwise the first match of `regex` is
    replaced by [[keyword]].  The diff is shown and the page is saved only
    after the operator confirms it; answering 'All' saves this and every
    following change without asking again.

    Parameters:
      gen        iterable of page objects to process
      regex      compiled pattern whose first match gets replaced
      exception  compiled pattern; a page matching it is left untouched
      keyword    link target, inserted verbatim between [[ and ]]
    """
    msg = {
        'en': u'Robot: Linkifying [[%s]]',
        # \xe1 is "a" with acute accent; written as an escape so the edit
        # summary does not depend on how the source file is encoded.
        'hu': u'Robot: Hivatkoz\xe1s erre: [[%s]]',
    }
    link = '[[' + keyword + ']]'

    acceptall = False

    for page in gen:
        try:
            textold = page.get()
            if not page.canBeEdited():
                wikipedia.output(u'Skipping locked page %s' % page.title())
                continue
        except wikipedia.NoPage:
            wikipedia.output(u'Page %s not found' % page.title())
            continue
        except wikipedia.IsRedirectPage:
            # Redirects have no article text to linkify; skip them instead
            # of letting the exception abort the whole run.
            wikipedia.output(u'Page %s is a redirect;  skipping.' % page.title())
            continue

        if exception.search(textold):
            wikipedia.output(u'Page %s already includes a link;  skipping.' % page.title())
            continue

        # A callable replacement inserts the link verbatim; a plain string
        # would have backslashes in the keyword expanded as escapes or
        # group references.
        textnew = regex.sub(lambda match: link, textold, 1)
        if textnew == textold:
            wikipedia.output('No changes were necessary in %s' % page.title())
            continue

        colors = [None] * 5 + [13] * len(page.title()) + [None] * 4
        wikipedia.output(u'\n>>> %s <<<' % page.title(), colors = colors)
        wikipedia.showDiff(textold, textnew)

        choice = None
        if not acceptall:
            choice = wikipedia.inputChoice(u'Do you want to accept these changes?', ['Yes', 'No', 'All'], ['y', 'N', 'a'], 'N')
            if choice in ['a', 'A']:
                acceptall = True
        if not (acceptall or choice in ['y', 'Y']):
            continue

        try:
            wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), msg) % keyword)
            page.put(textnew)
        except wikipedia.EditConflict:
            wikipedia.output(u'Skipping %s because of edit conflict' % page.title())
        except wikipedia.SpamfilterError as url:
            wikipedia.output(u'Cannot change %s because of blacklist entry %s' % (page.title(), url))

# Script body: parse the command line, build the page generator and the two
# patterns, then hand everything to inspect().  The finally clause runs
# wikipedia.stopme() on every exit path, including sys.exit().
try:
    pagelistfile = ''
    xmlfilename = ''
    namespaces = []
    keyword = ''
    gen = None

    for arg in wikipedia.handleArgs():
        if arg.startswith('-namespace:'):
            namespaces.append(int(arg[11:]))
        elif arg.startswith('-file:'):
            pagelistfile = arg[6:]
            gen = pagegenerators.TextfilePageGenerator(pagelistfile)
        elif arg.startswith('-cat:'):
            categoryname = arg[5:]
            cat = catlib.Category(wikipedia.getSite(), 'Category:%s' % categoryname)
            gen = pagegenerators.CategorizedPageGenerator(cat)
        elif arg.startswith('-xml:'):
            xmlfilename = arg[5:]
        elif arg.startswith('-google'):
            # A bare '-google' (or '-google:') carries no query; ask for one
            # instead of using an unbound or empty search string.
            googlequery = arg[8:]
            if not googlequery:
                googlequery = wikipedia.input(u'What do you want to search for?')
            gen = pagegenerators.GoogleSearchPageGenerator(googlequery)
        else:
            # The only positional argument is the keyword; a second one is an error.
            if keyword:
                wikipedia.output(u'Wrong number of arguments;  check header for usage.')
                sys.exit()
            keyword = arg

    if not keyword:
        wikipedia.output(u'Wrong number of arguments;  check header for usage.')
        sys.exit()

    # The keyword is literal text, so escape it before splicing it into the
    # patterns; otherwise characters such as '+', '.' or '(' would be read
    # as regex syntax.
    escaped = re.escape(keyword)
    # Matches an existing link to the keyword: [[keyword]] or [[keyword|...
    exception = re.compile(r'\[\[' + escaped + r'(\||\]\])')
    # Matches the keyword unless the lookaheads find a ']' with no '['
    # before it, or a '}}' with no '{' before it, further on in the text.
    regex = re.compile(escaped + r'(?![^\[]*?\])(?![^{]*?}})')
    if xmlfilename:
        gen = XmlDumpLinkifyPageGenerator(regex, exception, xmlfilename)

    if not gen:
        wikipedia.output(u'I was unable to generate a pagelist;  exiting.')
        sys.exit()

    if namespaces:
        gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
    pregen = pagegenerators.PreloadingGenerator(gen, pageNumber = 50)
    inspect(pregen, regex, exception, keyword)

finally:
    wikipedia.stopme()