Sn1per has uploaded a new change for review.
https://gerrit.wikimedia.org/r/258422
Change subject: Subpage filter generator
......................................................................
Subpage filter generator
Exclude subpages whose depth exceeds a given maximum,
where depth is the number of parent pages a page has.
Bug: T120587
Change-Id: Ia53580cf8ad7387c14d6ca3bf4fcf5b35f53edd4
---
M pywikibot/pagegenerators.py
M tests/pagegenerators_tests.py
2 files changed, 76 insertions(+), 3 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core
refs/changes/22/258422/1
diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py
index 9198dca..e1913ac 100644
--- a/pywikibot/pagegenerators.py
+++ b/pywikibot/pagegenerators.py
@@ -190,6 +190,10 @@
of pages, only retrieve n pages at a time from the wiki
server.
+-subpage:n Filters pages to only those that have a depth of at most n,
+ i.e. a depth of 0 filters out all pages that are subpages,
+ a depth of 1 filters out all pages that are subpages of subpages
+
-titleregex A regular expression that needs to match the article title
otherwise the page won't be returned.
Multiple -titleregex:regexpr can be provided and the page
will
@@ -339,6 +343,7 @@
self.titlefilter_list = []
self.claimfilter_list = []
self.intersect = False
+ self.subpage_max_depth = None
self._site = site
@property
@@ -406,9 +411,9 @@
if self.limit:
self.gens[i] = itertools.islice(self.gens[i], self.limit)
if len(self.gens) == 0:
- if self.titlefilter_list or self.articlefilter_list:
+ if self.titlefilter_list or self.articlefilter_list or self.subpage_max_depth is not None:
pywikibot.warning(
- 'grep/titleregex filters specified but no generators.')
+ 'grep/titleregex or subpage filters specified but no generators.')
return None
elif len(self.gens) == 1:
gensList = self.gens[0]
@@ -440,7 +445,12 @@
dupfiltergen = RegexBodyFilterPageGenerator(
PreloadingGenerator(dupfiltergen), self.articlefilter_list)
- return dupfiltergen
+ # Add on subpage filter generator
+ if self.subpage_max_depth is not None:
+ dupfiltergen = SubpageFilterGenerator(
+ dupfiltergen, self.subpage_max_depth)
+
+ return dupfiltergen
def getCategoryGen(self, arg, recurse=False, content=False,
gen_func=None):
@@ -799,6 +809,13 @@
gen = MySQLPageGenerator(query, site=self.site)
elif arg.startswith('-intersect'):
self.intersect = True
+ return True
+ elif arg.startswith('-subpage'):
+ max_depth = arg[len('-subpage:'):]
+ if not max_depth:
+ max_depth = pywikibot.input(
+ u'Maximum subpage depth:')
+ self.subpage_max_depth = int(max_depth)
return True
elif arg.startswith('-logevents:'):
gen = self._parse_log_events(*arg[len('-logevents:'):].split(','))
@@ -1376,6 +1393,38 @@
ItemClaimFilterPageGenerator = ItemClaimFilter.filter
+def SubpageFilterGenerator(generator, max_depth=0, show_filtered=False):
+ """
+ Generator which filters out subpages based on depth.
+
+ It checks whether the namespace of each page has subpages enabled.
+ If so, pages with more than max_depth slashes ('/') are excluded.
+
+ @param generator: A generator object
+ @type generator: any generator or iterator
+ @param max_depth: Max depth of subpages to yield, at least zero
+ @type max_depth: int
+ @param show_filtered: Output a message for each page not yielded
+ @type show_filtered: bool
+ """
+ for page in generator:
+ # Count how many '/'s we have in the title
+ depth = len(list(re.finditer('/', page.title())))
+
+ try:
+ # Check if the namespace allows subpages
+ if page.namespace().subpages == '':
+ if depth <= max_depth:
+ yield page
+ else:
+ pass
+ except AttributeError:
+ # Does not allow subpages, which means depth is always 0
+ yield page
+
+ pass
+
+
class RegexFilter(object):
"""Regex filter."""
diff --git a/tests/pagegenerators_tests.py b/tests/pagegenerators_tests.py
index 5c5e207..8378e8c 100755
--- a/tests/pagegenerators_tests.py
+++ b/tests/pagegenerators_tests.py
@@ -233,6 +233,30 @@
self.assertEqual(len(list(gen)), 0)
+class SubpageFilterGeneratorTestCase(TestCase):
+
+ """Test SubpageFilterGenerator."""
+
+ test_site = pywikibot.Site('test', 'test')
+ test_cat = pywikibot.Category('Subpage testing')
+
+ def test_subpage_filter(self):
+ gen = CategorizedPageGenerator(test_cat)
+ gen = pagegenerators.SubpageFilterGenerator(gen, 0)
+ expect_0 = (u'/home/lol')
+ self.assertPagelistTitles(gen, titles=expect_0, site=test_site)
+
+ gen = CategorizedPageGenerator(test_cat)
+ gen = pagegenerators.SubpageFilterGenerator(gen, 3)
+ expect_3 = (
+ u'/home/lol',
+ u'Sn1per/ProtectTest1/test',
+ u'Sn1per/ProtectTest1/test/test',
+ u'Sn1per/sandbox'
+ )
+ self.assertPagelistTitles(gen, titles=expect_3, site=test_site)
+
+
class TestRepeatingGenerator(RecentChangesTestCase):
"""Test RepeatingGenerator."""
--
To view, visit https://gerrit.wikimedia.org/r/258422
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Ia53580cf8ad7387c14d6ca3bf4fcf5b35f53edd4
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Sn1per <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits