Prianka has uploaded a new change for review.
https://gerrit.wikimedia.org/r/184118
Change subject: Port patrol.py to core
......................................................................
Port patrol.py to core.

Made the minor changes needed for compatibility with core (such as
using the pywikibot package). Added a returndict parameter to
site.recentchanges() so the script can consume the raw change records,
and adjusted patrol.api_feed_repeater accordingly.
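
Roughly, the new parameter is intended to be used as follows (a
sketch of the intended usage, not part of the diff below):

    import pywikibot

    site = pywikibot.Site()
    # With returndict=True each item is the raw rc attribute dict
    # (with 'title', 'rcid', 'revid', 'user', ...) from the API.
    for change in site.recentchanges(total=5, returndict=True):
        pywikibot.output('%(title)s (rcid %(rcid)s)' % change)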
Bug: T74206
Change-Id: I8612ce905d149d0e440d819f62f923385a583920
---
M pywikibot/site.py
A scripts/patrol.py
2 files changed, 475 insertions(+), 1 deletion(-)
git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core
refs/changes/18/184118/1
diff --git a/pywikibot/site.py b/pywikibot/site.py
index c1fd08a..0955e58 100644
--- a/pywikibot/site.py
+++ b/pywikibot/site.py
@@ -3608,7 +3608,8 @@
namespaces=None, pagelist=None, changetype=None,
showMinor=None, showBot=None, showAnon=None,
showRedirects=None, showPatrolled=None, topOnly=False,
- step=None, total=None, user=None, excludeuser=None):
+ step=None, total=None, user=None, excludeuser=None,
+ returndict=False):
"""Iterate recent changes.
@param start: Timestamp to start listing from
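+ @param returndict: if True, yield the raw attribute dict of each
+ change as returned by the API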
diff --git a/scripts/patrol.py b/scripts/patrol.py
new file mode 100644
index 0000000..ca687fa
--- /dev/null
+++ b/scripts/patrol.py
@@ -0,0 +1,473 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+"""
+The bot is meant to mark the edits based on info obtained by whitelist.
+
+This bot obtains a list of recent changes and newpages and marks the
+edits as patrolled based on a whitelist.
+See http://en.wikisource.org/wiki/User:JVbot/patrol_whitelist
+
+The following parameters are supported:
+
+&params;
+
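+Furthermore, the following command line parameters are supported:
+
+-ask              Confirm each patrol action
+-autopatroluserns Also whitelist users editing their own user space
+-repeat           Repeat the run, sleeping 60 seconds between passes
+-newpages         Work on the new pages feed
+-recentchanges    Work on the recent changes feed
+-namespace:N      Only process changes in the given namespace
+-user:USER        Limit whitelist parsing and patrolling to USER
+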
+"""
+#
+# (C) Pywikibot team, 2011-2015
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id$'
+
+import time
+
+import mwlib.parser  # used to parse the whitelist
+import mwlib.uparser  # used to parse the whitelist
+
+import pywikibot
+
+from pywikibot import pagegenerators
+
+_logger = "patrol"
+
+# This is required for the text that is shown when you run this script
+# with the parameter -help.
+docuReplacements = {
+    '&params;': pagegenerators.parameterHelp
+}
+
+
+class PatrolBot:
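+
+    """Bot that marks edits as patrolled based on a whitelist."""
+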
+ # Localised name of the whitelist page
+ whitelist_subpage_name = {
+ 'en': u'patrol_whitelist',
+ }
+
+ def __init__(self, feed, user=None, ask=True, whitelist=None):
+ """
+ Constructor. Parameters:
+ * feed - The changes feed to work on (Newpages or Recentchanges)
+ * user - Limit whitelist parsing and patrolling to a specific user
+ * ask - If True, confirm each patrol action
+ * whitelist - page title for whitelist (optional)
+
+ """
+ self.feed = feed
+ self.user = user
+ self.ask = ask
+ self.site = pywikibot.Site()
+ if whitelist:
+ self.whitelist_pagename = whitelist
+ else:
+ local_whitelist_subpage_name = pywikibot.translate(
+ self.site, self.whitelist_subpage_name)
+ self.whitelist_pagename = u'%s:%s/%s' \
+ % (self.site.namespace(2),
+ self.site.username(),
+ local_whitelist_subpage_name)
+ self.whitelist = None
+ self.whitelist_ts = 0
+ self.whitelist_load_ts = 0
+
+ self.autopatroluserns = False
+ self.highest_rcid = 0 # used to track loops
+ self.last_rcid = 0
+ self.repeat_start_ts = 0
+
+ self.rc_item_counter = 0 # counts how many items have been reviewed
+ self.patrol_counter = 0 # and how many times an action was taken
+
+ def load_whitelist(self):
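+        """Fetch and parse the whitelist page, caching it for 5 minutes."""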
+ # Check for a more recent version after 5 minutes
+ if self.whitelist_load_ts and (
+ (time.time() - self.whitelist_load_ts) < 300):
+ pywikibot.output(u'Whitelist not stale yet')
+ return
+
+ whitelist_page = pywikibot.Page(pywikibot.Site(),
+ self.whitelist_pagename)
+
+ if not self.whitelist:
+ pywikibot.output(u'Loading %s' % self.whitelist_pagename)
+
+ try:
+            if self.whitelist_ts:
+                # check for a more recent version; whitelist_page is
+                # re-created on each call, so editTime() fetches the
+                # timestamp of the current latest revision
+                last_edit_ts = whitelist_page.editTime()
+                if last_edit_ts == self.whitelist_ts:
+                    # As there hasn't been any change to the whitelist
+ # it has been effectively reloaded 'now'
+ self.whitelist_load_ts = time.time()
+ pywikibot.output(u'Whitelist not modified')
+ return
+
+ if self.whitelist:
+ pywikibot.output(u'Reloading whitelist')
+
+ # Fetch whitelist
+ wikitext = whitelist_page.get()
+ # Parse whitelist
+ self.whitelist = self.parse_page_tuples(wikitext, self.user)
+ # Record timestamp
+ self.whitelist_ts = whitelist_page.editTime()
+ self.whitelist_load_ts = time.time()
+ except Exception as e:
+            # cascade if there isn't a whitelist to fall back on
+ if not self.whitelist:
+ raise
+ pywikibot.error(u'%s' % e)
+
+ def add_to_tuples(self, tuples, user, page):
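+        """Add a page to the list of whitelisted pages for the user."""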
+ pywikibot.output(u"Adding %s:%s" % (user, page.title()))
+
+ if user in tuples:
+ tuples[user].append(page)
+ else:
+ tuples[user] = [page]
+
+    def in_list(self, pagelist, title, lazyload=True):
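+        """Return the whitelist entry that matches title, if any."""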
+ pywikibot.output(u'Checking whitelist for: %s' % title)
+
+ # quick check for exact match
+ if title in pagelist:
+ return title
+
+ # quick check for wildcard
+ if '' in pagelist:
+ pywikibot.output(u"wildcarded")
+ return '.*'
+
+ for item in pagelist:
+ pywikibot.output(u"checking against whitelist item = %s" % item)
+
+ if isinstance(item, PatrolRule):
+ pywikibot.output(u"invoking programmed rule")
+ if item.match(title):
+ return item
+
+ elif title_match(item, title):
+ return item
+
+ pywikibot.output(u'not found')
+
+ def parse_page_tuples(self, wikitext, user=None):
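+        """Parse whitelist wikitext into a dict of user -> page rules."""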
+ tuples = {}
+
+        # For any structure, only the first 'user:' page is registered
+        # as the user the rest of the structure refers to.
+ def process_children(obj, current_user):
+ pywikibot.debug(u'parsing node: %s' % obj, _logger)
+ for c in obj.children:
+ temp = process_node(c, current_user)
+ if temp and not current_user:
+ current_user = temp
+
+ def process_node(obj, current_user):
+ # links are analysed; interwiki links are included because mwlib
+ # incorrectly calls 'Wikisource:' namespace links an interwiki
+            if isinstance(obj, (mwlib.parser.NamespaceLink,
+                                mwlib.parser.InterwikiLink,
+                                mwlib.parser.ArticleLink)):
+ if obj.namespace == -1:
+ # the parser accepts 'special:prefixindex/' as a wildcard
+                    # this allows a prefix that doesn't match an existing page
+ # to be a blue link, and can be clicked to see what pages
+ # will be included in the whitelist
+ if obj.target[:20].lower() == 'special:prefixindex/':
+ if len(obj.target) == 20:
+ pywikibot.output(u'Whitelist everything')
+ page = ''
+ else:
+ page = obj.target[20:]
+ pywikibot.output(u'Whitelist prefixindex hack '
+ u'for: %s' % page)
+ # p = pywikibot.Page(self.site, obj.target[20:])
+ # obj.namespace = p.namespace
+ # obj.target = p.title()
+
+ elif obj.namespace == 2 and not current_user:
+ # if a target user hasn't been found yet, and the link is
+ # 'user:'
+ # the user will be the target of subsequent rules
+ page_prefix_len = len(self.site.namespace(2))
+ current_user = obj.target[(page_prefix_len + 1):]
+ pywikibot.output(u'Whitelist user: %s' % current_user)
+ return current_user
+ else:
+ page = obj.target
+
+ if current_user:
+ if not user or current_user == user:
+ if self.is_wikisource_author_page(page):
+ pywikibot.output(u'Whitelist author: %s' % page)
+ author = LinkedPagesRule(page)
+ self.add_to_tuples(tuples, current_user, author)
+                        else:
+                            pywikibot.output(u'Whitelist page: %s' % page)
+                            self.add_to_tuples(tuples, current_user, page)
+                    else:
+                        pywikibot.output(u'Discarding whitelist page for '
+                                         u'another user: %s' % page)
+ else:
+ raise Exception(u"No user set for page %s" % page)
+ else:
+ process_children(obj, current_user)
+
+ root = mwlib.uparser.parseString(title='Not used', raw=wikitext)
+ process_children(root, None)
+
+ return tuples
+
+ def is_wikisource_author_page(self, title):
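+        """Check whether title is an author page (Wikisource wikis only)."""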
+        if self.site.family.name != 'wikisource':
+            return
+
+ author_ns = 0
+ try:
+ author_ns = self.site.family.authornamespaces[self.site.lang][0]
+        except (AttributeError, KeyError):
+ pass
+        if author_ns:
+            author_ns_prefix = self.site.namespace(author_ns)
+            pywikibot.debug(u'Author ns: %d; name: %s'
+                            % (author_ns, author_ns_prefix), _logger)
+            if title.startswith(author_ns_prefix + ':'):
+                author_page_name = title[len(author_ns_prefix) + 1:]
+                pywikibot.output(u'Found author %s' % author_page_name)
+                return True
+
+ def run(self, feed=None):
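+        """Process the given feed, or the feed passed to the constructor."""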
+ if self.whitelist is None:
+ self.load_whitelist()
+ if not feed:
+ feed = self.feed
+ for page in feed:
+ self.treat(page)
+
+ def treat(self, page):
+        """Process a feed item, marking the edit patrolled if whitelisted."""
+ choice = None
+ try:
+            # page is a tuple: (rc attribute dict, username, revid, rcid)
+ username = page[1]
+            # when the feed isn't from the API, it used to contain
+ # '(not yet written)' or '(page does not exist)' when it was
+ # a redlink
+ rcid = page[3]
+ if not rcid:
+ raise Exception('rcid not present')
+
+ # check whether we have wrapped around to higher rcids
+ # which indicates a new RC feed is being processed
+ if rcid > self.last_rcid:
+ # refresh the whitelist
+ self.load_whitelist()
+ self.repeat_start_ts = time.time()
+
+ title = page[0]['title']
+ if self.ask:
+ pywikibot.output(u"User %s has created or modified page %s"
+ % (username, title))
+
+            # page[0] is the rc attribute dict returned by the API
+            if self.autopatroluserns and page[0]['ns'] in (2, 3):
+                # simple rule to whitelist any user editing their own userspace
+                if title.partition(':')[2].startswith(username):
+                    pywikibot.output(u'%s is whitelisted to modify %s'
+                                     % (username, title))
+ choice = 'y'
+
+ if choice != 'y' and username in self.whitelist:
+                if self.in_list(self.whitelist[username], title):
+                    pywikibot.output(u'%s is whitelisted to modify %s'
+                                     % (username, title))
+ choice = 'y'
+
+ if self.ask:
+ options = ['y', 'N']
+ # default to automatic choice
+ if choice == 'y':
+ options = ['Y', 'n']
+ else:
+ choice = 'N'
+
+                choice = pywikibot.inputChoice(
+ u'Do you want to mark page as patrolled?',
+ ['Yes', 'No'], options, choice)
+
+ # Patrol the page
+            if choice == 'y':
+                # mark the edit as patrolled via the API
+                self.site.patrol(rcid)
+                self.patrol_counter = self.patrol_counter + 1
+ pywikibot.output(u"Patrolled %s (rcid %d) by user %s"
+ % (title, rcid, username))
+ else:
+ pywikibot.output(u"skipped")
+
+ if rcid > self.highest_rcid:
+ self.highest_rcid = rcid
+ self.last_rcid = rcid
+ self.rc_item_counter = self.rc_item_counter + 1
+
+        except pywikibot.NoPage:
+            pywikibot.output(u"Page %s does not exist; skipping."
+                             % page[0]['title'])
+            return
+        except pywikibot.IsRedirectPage:
+            pywikibot.output(u"Page %s is a redirect; skipping."
+                             % page[0]['title'])
+            return
+
+
+def title_match(prefix, title):
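+    """Return True if title starts with the given prefix."""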
+ pywikibot.output(u'matching %s to prefix %s' % (title, prefix))
+ prefix_len = len(prefix)
+ title_trimmed = title[:prefix_len]
+ if title_trimmed == prefix:
+ pywikibot.output(u"substr match")
+ return True
+ return
+
+
+class PatrolRule:
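+
+    """Base class for whitelist page rules."""
+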
+ def __init__(self, page_title):
+ """
+ Constructor.
+
+ Parameters:
+ * page_title - The page title for this rule
+ """
+ self.page_title = page_title
+
+ def title(self):
+ return self.page_title
+
+    def match(self, page):
+        """Check whether page matches this rule; subclasses override this."""
+
+
+class LinkedPagesRule(PatrolRule):
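+
+    """Rule matching a page or any page linked from it."""
+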
+ def __init__(self, page_title):
+        self.site = pywikibot.Site()
+ self.page_title = page_title
+ self.linkedpages = None
+
+ def match(self, page_title):
+ if page_title == self.page_title:
+ return True
+
+ if not self.site.family.name == 'wikisource':
+ raise Exception('This is a wikisource rule')
+
+ if not self.linkedpages:
+ pywikibot.output(u"loading page links on %s" % self.page_title)
+ p = pywikibot.Page(self.site, self.page_title)
+ linkedpages = list()
+ for linkedpage in p.linkedPages():
+ linkedpages.append(linkedpage.title())
+
+ self.linkedpages = linkedpages
+ pywikibot.output(u"loaded %d page links" % len(linkedpages))
+
+ for p in self.linkedpages:
+ pywikibot.output(u"checking against '%s'" % p)
+ if title_match(p, page_title):
+ pywikibot.output(u"Matched.")
+ return p
+
+
+def api_feed_repeater(gen, delay=0, repeat=False, number=1000, namespaces=None,
+ user=None):
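+    """Yield (rc dict, username, revid, rcid) tuples from a site feed.
+
+    gen is a site generator such as site.newpages or site.recentchanges.
+    With repeat=True the feed is fetched again after each pass, sleeping
+    delay seconds in between; otherwise a single pass is made.
+    """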
+ while True:
+        for page in gen(namespaces=namespaces, user=user, total=number,
+                        returndict=True):
+            attrs = page
+            yield (page, attrs.get(u'user', 'userhidden'),
+                   attrs['revid'], attrs['rcid'])
+ if repeat:
+ pywikibot.output(u'Sleeping for %d seconds' % delay)
+ time.sleep(delay)
+ else:
+ break
+
+
+def main():
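+    """Parse command line arguments and run the patrol bot."""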
+    # This factory is responsible for processing command line arguments
+    # that are also used by other scripts and that determine which pages
+    # to work on.
+ genFactory = pagegenerators.GeneratorFactory()
+ # The generator gives the pages that should be worked upon.
+ gen = None
+    # This temporary list is used to collect the page title if a single
+    # page to work on is specified in the arguments.
+    pageTitleParts = []
+ ask = False
+ repeat = False
+ autopatroluserns = False
+ recentchanges = False
+ newpages = False
+ namespaces = None
+ user = None
+
+ # Parse command line arguments
+ for arg in pywikibot.handle_args():
+ if arg.startswith("-ask"):
+ ask = True
+ elif arg.startswith("-autopatroluserns"):
+ autopatroluserns = True
+ elif arg.startswith("-repeat"):
+ repeat = True
+ elif arg.startswith("-newpages"):
+ newpages = True
+ elif arg.startswith("-recentchanges"):
+ recentchanges = True
+ elif arg.startswith("-namespace:"):
+            namespaces = int(arg[len('-namespace:'):])
+ elif arg.startswith("-user:"):
+ user = arg[6:]
+        else:
+            # check if a standard argument like
+            # -start:XYZ or -ref:Asdf was given.
+            if not genFactory.handleArg(arg):
+                pageTitleParts.append(arg)
+
+ site = pywikibot.Site()
+ site.login()
+
+ if user:
+ pywikibot.output(u"processing user: %s" % user)
+
+ newpage_count = 300
+ if not newpages and not recentchanges and not user:
+ if site.family.name == 'wikipedia':
+ newpages = True
+ newpage_count = 5000
+ else:
+ recentchanges = True
+
+ bot = PatrolBot(None, user, ask)
+ bot.autopatroluserns = autopatroluserns
+
+ if newpages or user:
+ pywikibot.output(u"Newpages:")
+ gen = site.newpages
+        feed = api_feed_repeater(gen, delay=60, repeat=repeat,
+                                 number=newpage_count,
+                                 namespaces=namespaces, user=user)
+ bot.run(feed)
+
+ if recentchanges or user:
+ pywikibot.output(u"Recentchanges:")
+ gen = site.recentchanges
+ feed = api_feed_repeater(gen, delay=60, repeat=repeat, number=1000,
+ namespaces=namespaces, user=user)
+ bot.run(feed)
+
+ pywikibot.output(u'%d/%d patrolled'
+ % (bot.patrol_counter, bot.rc_item_counter))
+
+if __name__ == "__main__":
+ main()
--
To view, visit https://gerrit.wikimedia.org/r/184118
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I8612ce905d149d0e440d819f62f923385a583920
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Prianka <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits