NikiWiki has uploaded a new change for review.
https://gerrit.wikimedia.org/r/86377
Change subject: Pagegenerators: Add filter for article-bodies
......................................................................
Pagegenerators: Add filter for article-bodies
Add a filter that matches a regex against the bodies of all pages
returned by the following generators.
Change-Id: I8c659479f3591446fbea8b0ac5fde8a72c8a08e4
---
M pywikibot/pagegenerators.py
1 file changed, 32 insertions(+), 1 deletion(-)
git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core refs/changes/77/86377/1
diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py
index 5be6e16..2444c50 100644
--- a/pywikibot/pagegenerators.py
+++ b/pywikibot/pagegenerators.py
@@ -159,6 +159,21 @@
-page Work on a single page. Argument can also be given as
"-page:pagetitle".
+
+-articlefilterregex A regular expression that must match the article
+ text; otherwise the page won't be returned. The filter
+ applies to all subsequent generators.
+
+ Example:
+ pagegenerators.py \\
+ -family:wikipedia -lang:en \\
+ -recentchanges:5 \\
+ -articlefilterregex:'.*Thor.*' \\
+ -cat:Thunder_gods \\
+ -cat:Sky_and_weather_gods
+ This will find the five most recently edited pages
+ and all pages in the categories 'Thunder gods' and
+ 'Sky and weather gods' that refer to 'Thor'.
"""
docuReplacements = {'&params;': parameterHelp}
@@ -178,6 +193,7 @@
self.namespaces = []
self.step = None
self.limit = None
+ self.articlefilter = None
def getCombinedGenerator(self):
"""Return the combination of all accumulated generators.
@@ -453,12 +469,21 @@
else:
regex = arg[7:]
gen = RegexFilterPageGenerator(pywikibot.Site().allpages(), regex)
+ elif arg.startswith('-articlefilterregex'):
+ if len(arg) == 19:
+ self.articlefilter = pywikibot.input(u'Please enter your filter-expression:')
+ else:
+ self.articlefilter = arg[20:]
+ return True # No generator is returned, so just stop here.
elif arg.startswith('-yahoo'):
gen = YahooSearchPageGenerator(arg[7:])
else:
pass
if gen:
- self.gens.append(gen)
+ if self.articlefilter:
+ self.gens.append(RegexBodyFilterPageGenerator(gen, self.articlefilter))
+ else:
+ self.gens.append(gen)
return True
else:
return False
@@ -775,6 +800,12 @@
if reg.match(page.title(withNamespace=False)):
yield page
+def RegexBodyFilterPageGenerator(generator, regex):
+ """Yield pages from another generator whose body matches regex with options re.IGNORECASE|re.DOTALL."""
+ reg = re.compile(regex, re.IGNORECASE|re.DOTALL)
+ for page in generator:
+ if reg.match(page.text):
+ yield page
def CombinedPageGenerator(generators):
return itertools.chain(*generators)
--
To view, visit https://gerrit.wikimedia.org/r/86377
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I8c659479f3591446fbea8b0ac5fde8a72c8a08e4
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: NikiWiki <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits