jenkins-bot has submitted this change and it was merged. (
https://gerrit.wikimedia.org/r/315899 )
Change subject: Add -titleregexnot page filter
......................................................................
Add -titleregexnot page filter
- Add -titleregexnot page generator, completing -titleregex option
- Add corresponding tests, following -titleregex test example
- Add documentation on how -titleregexnot is useful to weblinkchecker.py
Change-Id: If580dd9734717fe74c91e1a373217c4046372606
---
M pywikibot/pagegenerators.py
M scripts/weblinkchecker.py
M tests/aspects.py
M tests/pagegenerators_tests.py
4 files changed, 68 insertions(+), 0 deletions(-)
Approvals:
Mpaa: Looks good to me, but someone else must approve
Merlijn van Deen: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py
index e60c98f..65e45b9 100644
--- a/pywikibot/pagegenerators.py
+++ b/pywikibot/pagegenerators.py
@@ -226,6 +226,9 @@
Case insensitive regular expressions will be used and
dot matches any character.
+-titleregexnot Like -titleregex, but return the page only if the regular
+ expression does not match.
+
-transcludes Work on all pages that use a certain template.
Argument can also be given as "-transcludes:Title".
@@ -384,6 +387,7 @@
self.qualityfilter_list = []
self.articlefilter_list = []
self.titlefilter_list = []
+ self.titlenotfilter_list = []
self.claimfilter_list = []
self.catfilter_list = []
self.intersect = False
@@ -456,6 +460,7 @@
self.gens[i] = itertools.islice(self.gens[i], self.limit)
if len(self.gens) == 0:
if (self.titlefilter_list or
+ self.titlenotfilter_list or
self.articlefilter_list or
self.claimfilter_list or
self.catfilter_list or
@@ -498,6 +503,10 @@
if self.titlefilter_list:
dupfiltergen = RegexFilterPageGenerator(
dupfiltergen, self.titlefilter_list)
+
+ if self.titlenotfilter_list:
+ dupfiltergen = RegexFilterPageGenerator(
+ dupfiltergen, self.titlenotfilter_list, 'none')
if self.articlefilter_list:
dupfiltergen = RegexBodyFilterPageGenerator(
@@ -855,6 +864,12 @@
'What page names are you looking for?')
self.titlefilter_list.append(value)
return True
+ elif arg == '-titleregexnot':
+ if not value:
+ value = pywikibot.input(
+ 'All pages except which ones?')
+ self.titlenotfilter_list.append(value)
+ return True
elif arg == '-grep':
if not value:
value = pywikibot.input('Which pattern do you want to grep?')
diff --git a/scripts/weblinkchecker.py b/scripts/weblinkchecker.py
index 44e3296..603a5f8 100755
--- a/scripts/weblinkchecker.py
+++ b/scripts/weblinkchecker.py
@@ -15,6 +15,10 @@
two times, with a time lag of at least one week. Such links will be logged to a
.txt file in the deadlinks subdirectory.
+The .txt file uses wiki markup and so it may be useful to post it on the
+wiki and then exclude that page from subsequent runs. For example, if the
+page is named Broken Links, exclude it with '-titleregexnot:^Broken Links$'.
+
After running the bot and waiting for at least one week, you can re-check those
pages where dead links were found, using the -repeat parameter.
diff --git a/tests/aspects.py b/tests/aspects.py
index e9a8788..4d30d14 100644
--- a/tests/aspects.py
+++ b/tests/aspects.py
@@ -85,6 +85,15 @@
"""
return self.assertRegexpMatches(*args, **kwargs)
+ if not hasattr(unittest.TestCase, 'assertNotRegex'):
+ def assertNotRegex(self, *args, **kwargs):
+ """
+ Wrapper of unittest.assertNotRegexpMatches for Python 2 unittest.
+
+ assertNotRegexpMatches is deprecated in Python 3.
+ """
+ return self.assertNotRegexpMatches(*args, **kwargs)
+
if not hasattr(unittest.TestCase, 'assertCountEqual'):
def assertCountEqual(self, *args, **kwargs):
diff --git a/tests/pagegenerators_tests.py b/tests/pagegenerators_tests.py
index 06380ef..098ad3c 100755
--- a/tests/pagegenerators_tests.py
+++ b/tests/pagegenerators_tests.py
@@ -786,6 +786,46 @@
self.assertLessEqual(len(pages), 10)
self.assertPagesInNamespaces(pages, 1)
+ def test_regexfilternot_default(self):
+ """Test allpages generator with titleregexnot filter."""
+ gf = pagegenerators.GeneratorFactory()
+ self.assertTrue(gf.handleArg('-start'))
# keeps only titles with fewer than 11 characters
+ self.assertTrue(gf.handleArg('-titleregexnot:.{11,}'))
+ gf.handleArg('-limit:10')
+ gen = gf.getCombinedGenerator()
+ self.assertIsNotNone(gen)
+ pages = list(gen)
+ self.assertLessEqual(len(pages), 10)
+ for page in pages:
+ self.assertIsInstance(page, pywikibot.Page)
+ self.assertNotRegex(page.title().lower(), '.{11,}')
+
+ def test_regexfilternot_ns_after(self):
+ """Test allpages generator with titleregexnot and namespace filter."""
+ gf = pagegenerators.GeneratorFactory()
+ self.assertTrue(gf.handleArg('-start'))
+ self.assertTrue(gf.handleArg('-titleregexnot:zzzz'))
+ gf.handleArg('-ns:1')
+ gf.handleArg('-limit:10')
+ gen = gf.getCombinedGenerator()
+ pages = list(gen)
+ self.assertLessEqual(len(pages), 10)
+ self.assertPagesInNamespaces(pages, 1)
+
+ def test_regexfilternot_ns_before(self):
+ """Test allpages generator with namespace and titleregexnot filter."""
+ gf = pagegenerators.GeneratorFactory()
+ self.assertTrue(gf.handleArg('-start'))
+ gf.handleArg('-ns:1')
+ self.assertTrue(gf.handleArg('-titleregexnot:zzzz'))
+ gf.handleArg('-limit:10')
+ gen = gf.getCombinedGenerator()
+ self.assertIsNotNone(gen)
+ pages = list(gen)
+ self.assertLessEqual(len(pages), 10)
+ self.assertPagesInNamespaces(pages, 1)
+
def test_allpages_with_two_ns(self):
"""Test that allpages fails with two ns as parameter."""
gf = pagegenerators.GeneratorFactory()
--
To view, visit https://gerrit.wikimedia.org/r/315899
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: If580dd9734717fe74c91e1a373217c4046372606
Gerrit-PatchSet: 3
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Ian Kelling <[email protected]>
Gerrit-Reviewer: Ian Kelling <[email protected]>
Gerrit-Reviewer: John Vandenberg <[email protected]>
Gerrit-Reviewer: Magul <[email protected]>
Gerrit-Reviewer: Matěj Suchánek <[email protected]>
Gerrit-Reviewer: Merlijn van Deen <[email protected]>
Gerrit-Reviewer: Mpaa <[email protected]>
Gerrit-Reviewer: Xqt <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits