Prianka has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/184118
Change subject: Porting patrol.py to core.
......................................................................

Porting patrol.py to core.

Made the minor changes needed for compatibility with core (e.g. importing
pywikibot instead of the old wikipedia module). Added a returndict
parameter to site.recentchanges() so the script can consume the raw API
dicts. Updated patrol.api_feed_repeater accordingly.

Bug: T74206
Change-Id: I8612ce905d149d0e440d819f62f923385a583920
---
M pywikibot/site.py
A scripts/patrol.py
2 files changed, 475 insertions(+), 1 deletion(-)

  git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core refs/changes/18/184118/1
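For context on the new parameter: with returndict=True the recentchanges
generator is meant to yield the raw rc dicts from the API rather than
wrapped tuples, which is what the new api_feed_repeater in patrol.py
relies on. A minimal illustrative sketch, not part of the patch; the rc
field names come from the MediaWiki recentchanges list API:

    import pywikibot

    site = pywikibot.Site()
    # each item should be a dict with keys such as 'title', 'ns',
    # 'user', 'revid' and 'rcid'
    for rc in site.recentchanges(total=5, returndict=True):
        print(rc['rcid'], rc.get('user', u'userhidden'), rc['title'])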
diff --git a/pywikibot/site.py b/pywikibot/site.py
index c1fd08a..0955e58 100644
--- a/pywikibot/site.py
+++ b/pywikibot/site.py
@@ -3608,7 +3608,8 @@
                        namespaces=None, pagelist=None, changetype=None,
                        showMinor=None, showBot=None, showAnon=None,
                        showRedirects=None, showPatrolled=None, topOnly=False,
-                       step=None, total=None, user=None, excludeuser=None):
+                       step=None, total=None, user=None, excludeuser=None,
+                       returndict=False):
         """Iterate recent changes.
 
         @param start: Timestamp to start listing from
diff --git a/scripts/patrol.py b/scripts/patrol.py
new file mode 100644
index 0000000..ca687fa
--- /dev/null
+++ b/scripts/patrol.py
@@ -0,0 +1,473 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+"""
+This bot marks recent edits as patrolled, based on a whitelist.
+
+The bot obtains a list of recent changes and new pages and marks the
+edits as patrolled if the editing user is whitelisted for that page.
+See http://en.wikisource.org/wiki/User:JVbot/patrol_whitelist
+
+The following parameters are supported:
+
+&params;
+
+-ask               Confirm each patrol action.
+
+-autopatroluserns  Automatically patrol users editing pages in their own
+                   user space.
+
+-repeat            Check the feed again after a 60 second delay.
+
+-newpages          Patrol the new pages feed.
+
+-recentchanges     Patrol the recent changes feed.
+
+-namespace:n       Restrict the feed to namespace number n.
+
+-user:username     Only patrol edits made by this user.
+
+"""
+#
+# (C) Pywikipedia bot team, 2011-2013
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id$'
+#
+import time
+
+import mwlib.uparser  # used to parse the whitelist
+import mwlib.parser  # used to parse the whitelist
+
+import pywikibot
+
+from pywikibot import pagegenerators
+
+_logger = "patrol"
+
+# This is required for the text that is shown when you run this script
+# with the parameter -help.
+docuReplacements = {
+    '&params;': pagegenerators.parameterHelp
+}
+
+
+class PatrolBot:
+
+    # Localised name of the whitelist page
+    whitelist_subpage_name = {
+        'en': u'patrol_whitelist',
+    }
+
+    def __init__(self, feed, user=None, ask=True, whitelist=None):
+        """
+        Constructor.
+
+        Parameters:
+            * feed      - The changes feed to work on (Newpages or
+                          Recentchanges)
+            * user      - Limit whitelist parsing and patrolling to a
+                          specific user
+            * ask       - If True, confirm each patrol action
+            * whitelist - page title for whitelist (optional)
+        """
+        self.feed = feed
+        self.user = user
+        self.ask = ask
+        self.site = pywikibot.Site()
+        if whitelist:
+            self.whitelist_pagename = whitelist
+        else:
+            local_whitelist_subpage_name = pywikibot.translate(
+                self.site, self.whitelist_subpage_name)
+            self.whitelist_pagename = u'%s:%s/%s' \
+                                      % (self.site.namespace(2),
+                                         self.site.username(),
+                                         local_whitelist_subpage_name)
+        self.whitelist = None
+        self.whitelist_ts = 0
+        self.whitelist_load_ts = 0
+
+        self.autopatroluserns = False
+        self.highest_rcid = 0  # used to track loops
+        self.last_rcid = 0
+        self.repeat_start_ts = 0
+
+        self.rc_item_counter = 0  # counts how many items have been reviewed
+        self.patrol_counter = 0  # and how many times an action was taken
+
+    def load_whitelist(self):
+        """Load (or reload) the whitelist page if it has gone stale."""
+        # Check for a more recent version after 5 minutes
+        if self.whitelist_load_ts and (
+                (time.time() - self.whitelist_load_ts) < 300):
+            pywikibot.output(u'Whitelist not stale yet')
+            return
+
+        whitelist_page = pywikibot.Page(self.site,
+                                        self.whitelist_pagename)
+
+        if not self.whitelist:
+            pywikibot.output(u'Loading %s' % self.whitelist_pagename)
+
+        try:
+            if self.whitelist_ts:
+                # check for a more recent version
+                h = whitelist_page.getVersionHistory(total=1)
+                last_edit_ts = h[0][1]
+                if last_edit_ts == self.whitelist_ts:
+                    # As there hasn't been any change to the whitelist,
+                    # it has effectively been reloaded 'now'
+                    self.whitelist_load_ts = time.time()
+                    pywikibot.output(u'Whitelist not modified')
+                    return
+
+            if self.whitelist:
+                pywikibot.output(u'Reloading whitelist')
+
+            # Fetch whitelist
+            wikitext = whitelist_page.get()
+            # Parse whitelist
+            self.whitelist = self.parse_page_tuples(wikitext, self.user)
+            # Record timestamp
+            self.whitelist_ts = whitelist_page.editTime()
+            self.whitelist_load_ts = time.time()
+        except Exception as e:
+            # cascade if there isn't a whitelist to fall back on
+            if not self.whitelist:
+                raise
+            pywikibot.error(u'%s' % e)
+
+    def add_to_tuples(self, tuples, user, page):
+        """Add a page entry to the whitelist dict for the given user."""
+        # page is either a plain title string or a PatrolRule
+        title = page.title() if isinstance(page, PatrolRule) else page
+        pywikibot.output(u"Adding %s:%s" % (user, title))
+
+        if user in tuples:
+            tuples[user].append(page)
+        else:
+            tuples[user] = [page]
+
+    def in_list(self, pagelist, title, lazyload=True):
+        """Return the whitelist entry matching title, if there is one."""
+        pywikibot.output(u'Checking whitelist for: %s' % title)
+
+        # quick check for exact match
+        if title in pagelist:
+            return title
+
+        # quick check for wildcard
+        if '' in pagelist:
+            pywikibot.output(u"wildcarded")
+            return '.*'
+
+        for item in pagelist:
+            pywikibot.output(u"checking against whitelist item = %s" % item)
+
+            if isinstance(item, PatrolRule):
+                pywikibot.output(u"invoking programmed rule")
+                if item.match(title):
+                    return item
+
+            elif title_match(item, title):
+                return item
+
+        pywikibot.output(u'not found')
+
+    def parse_page_tuples(self, wikitext, user=None):
+        """Parse a whitelist page into a dict of {user: [page, ...]}."""
+        tuples = {}
+
+        # Within any list structure, the first 'User:' link found is
+        # registered as the user to whom the rest of the structure refers.
+        def process_children(obj, current_user):
+            pywikibot.debug(u'parsing node: %s' % obj, _logger)
+            for c in obj.children:
+                temp = process_node(c, current_user)
+                if temp and not current_user:
+                    current_user = temp
+
+        def process_node(obj, current_user):
+            # links are analysed; interwiki links are included because mwlib
+            # incorrectly calls 'Wikisource:' namespace links an interwiki
+            if isinstance(obj, mwlib.parser.NamespaceLink) or \
+               isinstance(obj, mwlib.parser.InterwikiLink) or \
+               isinstance(obj, mwlib.parser.ArticleLink):
+                if obj.namespace == -1:
+                    # the parser accepts 'special:prefixindex/' as a wildcard
+                    # this allows a prefix that doesn't match an existing page
+                    # to be a blue link, and can be clicked to see what pages
+                    # will be included in the whitelist
+                    if obj.target[:20].lower() == 'special:prefixindex/':
+                        if len(obj.target) == 20:
+                            pywikibot.output(u'Whitelist everything')
+                            page = ''
+                        else:
+                            page = obj.target[20:]
+                            pywikibot.output(u'Whitelist prefixindex hack '
+                                             u'for: %s' % page)
+                        # p = pywikibot.Page(self.site, obj.target[20:])
+                        # obj.namespace = p.namespace
+                        # obj.target = p.title()
+
+                elif obj.namespace == 2 and not current_user:
+                    # if a target user hasn't been found yet, and the link
+                    # is 'user:', that user will be the target of
+                    # subsequent rules
+                    page_prefix_len = len(self.site.namespace(2))
+                    current_user = obj.target[(page_prefix_len + 1):]
+                    pywikibot.output(u'Whitelist user: %s' % current_user)
+                    return current_user
+                else:
+                    page = obj.target
+
+                if current_user:
+                    if not user or current_user == user:
+                        if self.is_wikisource_author_page(page):
+                            pywikibot.output(u'Whitelist author: %s' % page)
+                            author = LinkedPagesRule(page)
+                            self.add_to_tuples(tuples, current_user, author)
+                        else:
+                            pywikibot.output(u'Whitelist page: %s' % page)
+                            self.add_to_tuples(tuples, current_user, page)
+                    else:
+                        pywikibot.output(u'Discarding whitelist page for '
+                                         u'another user: %s' % page)
+                else:
+                    raise Exception(u"No user set for page %s" % page)
+            else:
+                process_children(obj, current_user)
+
+        root = mwlib.uparser.parseString(title='Not used', raw=wikitext)
+        process_children(root, None)
+
+        return tuples
+
+    def is_wikisource_author_page(self, title):
+        """Return True if title is an author page on this Wikisource."""
+        if self.site.family.name != 'wikisource':
+            return
+
+        author_ns = 0
+        try:
+            author_ns = self.site.family.authornamespaces[self.site.lang][0]
+        except (AttributeError, KeyError):
+            pass
+        if author_ns:
+            author_ns_prefix = self.site.namespace(author_ns)
+            pywikibot.debug(u'Author ns: %d; name: %s'
+                            % (author_ns, author_ns_prefix), _logger)
+            if title.find(author_ns_prefix + ':') == 0:
+                author_page_name = title[len(author_ns_prefix) + 1:]
+                pywikibot.output(u'Found author %s' % author_page_name)
+                return True
+
+    def run(self, feed=None):
+        if self.whitelist is None:
+            self.load_whitelist()
+        if not feed:
+            feed = self.feed
+        for page in feed:
+            self.treat(page)
+
+    def treat(self, page):
+        """Check one feed item and patrol it if the user is whitelisted."""
+        choice = None
+        try:
+            # page is a tuple yielded by api_feed_repeater:
+            # (rc dict, username, revid, rcid)
+            username = page[1]
+            rcid = page[3]
+            if not rcid:
+                raise Exception('rcid not present')
+
+            # check whether we have wrapped around to higher rcids
+            # which indicates a new RC feed is being processed
+            if rcid > self.last_rcid:
+                # refresh the whitelist
+                self.load_whitelist()
+                self.repeat_start_ts = time.time()
+
+            title = page[0]['title']
+            if self.ask:
+                pywikibot.output(u"User %s has created or modified page %s"
+                                 % (username, title))
+
+            if self.autopatroluserns and page[0]['ns'] in (2, 3):
+                # simple rule to whitelist any user editing their own
+                # userspace
+                if title.partition(':')[2].startswith(username):
+                    pywikibot.output(u'%s is whitelisted to modify %s'
+                                     % (username, title))
+                    choice = 'y'
+
+            if choice != 'y' and username in self.whitelist:
+                if self.in_list(self.whitelist[username], title):
+                    pywikibot.output(u'%s is whitelisted to modify %s'
+                                     % (username, title))
+                    choice = 'y'
+
+            if self.ask:
+                # default to the automatic choice, if any
+                default = 'y' if choice == 'y' else 'n'
+                choice = pywikibot.input_choice(
+                    u'Do you want to mark the page as patrolled?',
+                    [('yes', 'y'), ('no', 'n')],
+                    default=default, automatic_quit=False)
+
+            # Patrol the page
+            if choice == 'y':
+                # list() consumes the generator returned by patrol()
+                list(self.site.patrol(rcid))
+                self.patrol_counter = self.patrol_counter + 1
+                pywikibot.output(u"Patrolled %s (rcid %d) by user %s"
+                                 % (title, rcid, username))
+            else:
+                pywikibot.output(u"skipped")
+
+            if rcid > self.highest_rcid:
+                self.highest_rcid = rcid
+            self.last_rcid = rcid
+            self.rc_item_counter = self.rc_item_counter + 1
+
+        except pywikibot.NoPage:
+            pywikibot.output(u"Page %s does not exist; skipping."
+                             % page[0]['title'])
+        except pywikibot.IsRedirectPage:
+            pywikibot.output(u"Page %s is a redirect; skipping."
+                             % page[0]['title'])
+
+
+def title_match(prefix, title):
+    """Return True if title starts with prefix."""
+    pywikibot.output(u'matching %s to prefix %s' % (title, prefix))
+    if title[:len(prefix)] == prefix:
+        pywikibot.output(u"substr match")
+        return True
+    return False
+
+
+class PatrolRule:
+
+    def __init__(self, page_title):
+        """
+        Constructor.
+
+        Parameters:
+            * page_title - The page title for this rule
+        """
+        self.page_title = page_title
+
+    def title(self):
+        return self.page_title
+
+    def match(self, page):
+        pass
+
+
+class LinkedPagesRule(PatrolRule):
+
+    def __init__(self, page_title):
+        self.site = pywikibot.Site()
+        self.page_title = page_title
+        self.linkedpages = None
+
+    def match(self, page_title):
+        if page_title == self.page_title:
+            return True
+
+        if self.site.family.name != 'wikisource':
+            raise Exception('This is a wikisource rule')
+
+        if not self.linkedpages:
+            pywikibot.output(u"loading page links on %s" % self.page_title)
+            p = pywikibot.Page(self.site, self.page_title)
+            linkedpages = []
+            for linkedpage in p.linkedPages():
+                linkedpages.append(linkedpage.title())
+
+            self.linkedpages = linkedpages
+            pywikibot.output(u"loaded %d page links" % len(linkedpages))
+
+        for p in self.linkedpages:
+            pywikibot.output(u"checking against '%s'" % p)
+            if title_match(p, page_title):
+                pywikibot.output(u"Matched.")
+                return p
+
+
+def api_feed_repeater(gen, delay=0, repeat=False, number=1000,
+                      namespaces=None, user=None):
+    """Yield (rc dict, username, revid, rcid) tuples from gen,
+    rechecking the feed after a delay if repeat is True."""
+    while True:
+        for item in gen(namespaces=namespaces, user=user, total=number,
+                        returndict=True):
+            # with returndict=True the generator yields the raw API dicts
+            yield (item, item.get(u'user', u'userhidden'),
+                   item['revid'], item['rcid'])
+        if repeat:
+            pywikibot.output(u'Sleeping for %d seconds' % delay)
+            time.sleep(delay)
+        else:
+            break
+
+
+def main():
+    """Process command line arguments and run PatrolBot."""
+    # This factory is responsible for processing command line arguments
+    # that are also used by other scripts and that determine which pages
+    # to work on.
+    genFactory = pagegenerators.GeneratorFactory()
+    ask = False
+    repeat = False
+    autopatroluserns = False
+    recentchanges = False
+    newpages = False
+    namespaces = None
+    user = None
+
+    # Parse command line arguments
+    for arg in pywikibot.handle_args():
+        if arg.startswith("-ask"):
+            ask = True
+        elif arg.startswith("-autopatroluserns"):
+            autopatroluserns = True
+        elif arg.startswith("-repeat"):
+            repeat = True
+        elif arg.startswith("-newpages"):
+            newpages = True
+        elif arg.startswith("-recentchanges"):
+            recentchanges = True
+        elif arg.startswith("-namespace:"):
+            namespaces = [int(arg[len("-namespace:"):])]
+        elif arg.startswith("-user:"):
+            user = arg[len("-user:"):]
+        else:
+            # check if a standard argument like
+            # -start:XYZ or -ref:Asdf was given.
+            genFactory.handleArg(arg)
+
+    site = pywikibot.Site()
+    site.login()
+
+    if user:
+        pywikibot.output(u"processing user: %s" % user)
+
+    newpage_count = 300
+    if not newpages and not recentchanges and not user:
+        if site.family.name == 'wikipedia':
+            newpages = True
+            newpage_count = 5000
+        else:
+            recentchanges = True
+
+    bot = PatrolBot(None, user, ask)
+    bot.autopatroluserns = autopatroluserns
+
+    if newpages or user:
+        pywikibot.output(u"Newpages:")
+        gen = site.newpages
+        feed = api_feed_repeater(gen, delay=60, repeat=repeat,
+                                 number=newpage_count,
+                                 namespaces=namespaces, user=user)
+        bot.run(feed)
+
+    if recentchanges or user:
+        pywikibot.output(u"Recentchanges:")
+        gen = site.recentchanges
+        feed = api_feed_repeater(gen, delay=60, repeat=repeat, number=1000,
+                                 namespaces=namespaces, user=user)
+        bot.run(feed)
+
+    pywikibot.output(u'%d/%d patrolled'
+                     % (bot.patrol_counter, bot.rc_item_counter))
+
+
+if __name__ == "__main__":
+    main()
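A side note on the whitelist mechanism: once parse_page_tuples() has
built the per-user dict, a patrol decision reduces to a prefix check per
whitelisted entry. A minimal self-contained sketch of that behaviour,
where every user and page name is an invented example:

    # expected shape of the parsed whitelist: {username: [page, ...]}
    whitelist = {u'ExampleUser': [u'Some Article', u'Some Prefix']}

    def title_match(prefix, title):
        # the same prefix rule the script applies
        return title[:len(prefix)] == prefix

    user, title = u'ExampleUser', u'Some Prefix/Subpage'
    print(any(title_match(p, title) for p in whitelist.get(user, [])))
    # prints: True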