jenkins-bot has submitted this change and it was merged.
Change subject: Filters needed for newitem.py
......................................................................
Filters needed for newitem.py
Added EdittimeFilterPageGenerator, which is functionally equivalent to
the compat version, but better.
Also added the filter WikibaseItemFilterPageGenerator and
Page.oldest_revision.
Renamed WikidataItemGenerator to WikibaseItemGenerator.
Bug: 55007
Change-Id: I71b051818773e8b2f78eab534c4dfb23072d0ee9
---
M pywikibot/page.py
M pywikibot/pagegenerators.py
M tests/pagegenerators_tests.py
3 files changed, 221 insertions(+), 31 deletions(-)
Approvals:
John Vandenberg: Looks good to me, but someone else must approve
Ladsgroup: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/page.py b/pywikibot/page.py
index 961fe6f..cc094bf 100644
--- a/pywikibot/page.py
+++ b/pywikibot/page.py
@@ -563,6 +563,9 @@
"""
return self.site.page_exists(self)
+ def oldest_revision(self):
+ return self.getVersionHistory(reverseOrder=True, total=1)[0]
+
def isRedirectPage(self):
"""Return True if this is a redirect, False if not or not existing."""
return self.site.page_isredirect(self)
diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py
index f946bfb..922c0a9 100644
--- a/pywikibot/pagegenerators.py
+++ b/pywikibot/pagegenerators.py
@@ -21,10 +21,11 @@
__version__ = '$Id$'
#
-import sys
import codecs
+import datetime
import itertools
import re
+import sys
import time
import pywikibot
@@ -977,13 +978,30 @@
yield page
-def RedirectFilterPageGenerator(generator, no_redirects=True):
- """Yield pages from another generator that are redirects or not."""
- for page in generator:
- if not page.isRedirectPage() and no_redirects:
- yield page
- elif page.isRedirectPage() and not no_redirects:
- yield page
+def RedirectFilterPageGenerator(generator, no_redirects=True,
+ show_filtered=False):
+ """
+ Yield pages from another generator that are redirects or not.
+
+ @param no_redirects: Exclude redirects if True, else only include
+ redirects.
+ @type no_redirects: bool
+ @param show_filtered: Output a message for each page not yielded
+ @type show_filtered: bool
+ """
+ for page in generator or []:
+ if no_redirects:
+ if not page.isRedirectPage():
+ yield page
+ elif show_filtered:
+ pywikibot.output(u'%s is a redirect page. Skipping.' % page)
+
+ else:
+ if page.isRedirectPage():
+ yield page
+ elif show_filtered:
+ pywikibot.output(u'%s is not a redirect page. Skipping.'
+ % page)
def DuplicateFilterPageGenerator(generator):
@@ -1079,6 +1097,74 @@
# name the generator methods
RegexFilterPageGenerator = RegexFilter.titlefilter
RegexBodyFilterPageGenerator = RegexFilter.contentfilter
+
+
+@deprecated_args(begintime='last_edit_start', endtime='last_edit_end')
+def EdittimeFilterPageGenerator(generator,
+ last_edit_start=None,
+ last_edit_end=None,
+ first_edit_start=None,
+ first_edit_end=None,
+ show_filtered=False):
+ """
+ Wrap a generator to filter pages outside last or first edit range.
+
+ @param generator: A generator object
+ @param last_edit_start: Only yield pages last edited after this time
+ @type last_edit_start: datetime
+ @param last_edit_end: Only yield pages last edited before this time
+ @type last_edit_end: datetime
+ @param first_edit_start: Only yield pages first edited after this time
+ @type first_edit_start: datetime
+ @param first_edit_end: Only yield pages first edited before this time
+ @type first_edit_end: datetime
+ @param show_filtered: Output a message for each page not yielded
+ @type show_filtered: bool
+
+ """
+ do_last_edit = last_edit_start or last_edit_end
+ do_first_edit = first_edit_start or first_edit_end
+
+ last_edit_start = last_edit_start or datetime.datetime.min
+ last_edit_end = last_edit_end or datetime.datetime.max
+ first_edit_start = first_edit_start or datetime.datetime.min
+ first_edit_end = first_edit_end or datetime.datetime.max
+
+ for page in generator or []:
+ if do_last_edit:
+ last_edit = page.editTime()
+
+ if last_edit < last_edit_start:
+ if show_filtered:
+ pywikibot.output(
+ u'Last edit on %s was on %s.\nToo old. Skipping.'
+ % (page, last_edit.isoformat()))
+ continue
+
+ if last_edit > last_edit_end:
+ if show_filtered:
+ pywikibot.output(
+ u'Last edit on %s was on %s.\nToo recent. Skipping.'
+ % (page, last_edit.isoformat()))
+ continue
+
+ if do_first_edit:
+ first_edit = page.oldest_revision().timestamp
+
+ if first_edit < first_edit_start:
+ if show_filtered:
+ pywikibot.output(
+ u'First edit on %s was on %s.\nToo old. Skipping.'
+ % (page, first_edit.isoformat()))
+
+ if first_edit > first_edit_end:
+ if show_filtered:
+ pywikibot.output(
+ u'First edit on %s was on %s.\nToo recent. Skipping.'
+ % (page, first_edit.isoformat()))
+ continue
+
+ yield page
def CombinedPageGenerator(generators):
@@ -1273,9 +1359,9 @@
yield entry.title()
-def WikidataItemGenerator(gen):
+def WikibaseItemGenerator(gen):
"""
- A wrapper generator used to yield Wikidata items of another generator.
+ A wrapper generator used to yield Wikibase items of another generator.
@param gen: Generator to wrap.
@type gen: generator
@@ -1286,13 +1372,52 @@
if isinstance(page, pywikibot.ItemPage):
yield page
elif page.site.data_repository() == page.site:
- # These are already items, just not item pages
+ # These are already items, as they have a DataSite in page.site.
+ # However generator is yielding Page, so convert to ItemPage.
# FIXME: If we've already fetched content, we should retain it
yield pywikibot.ItemPage(page.site, page.title())
else:
yield pywikibot.ItemPage.fromPage(page)
+WikidataItemGenerator = WikibaseItemGenerator
+
+
+def WikibaseItemFilterPageGenerator(generator, has_item=True,
+ show_filtered=False):
+ """
+ A wrapper generator used to filter pages by whether they have a wikibase item.
+
+ @param generator: Generator to wrap.
+ @type generator: generator
+ @param has_item: Exclude pages without an item if True, or only
+ include pages without an item if False
+ @type has_item: bool
+ @param show_filtered: Output a message for each page not yielded
+ @type show_filtered: bool
+ @return: Wrapped generator
+ @rtype: generator
+ """
+ for page in generator or []:
+ try:
+ page_item = pywikibot.ItemPage.fromPage(page, lazy_load=False)
+ except pywikibot.NoPage:
+ page_item = None
+
+ if page_item:
+ if not has_item:
+ if show_filtered:
+ pywikibot.output(
+ '%s has a wikidata item. Skipping.' % page)
+ continue
+ else:
+ if has_item:
+ if show_filtered:
+ pywikibot.output(
+ '%s doesn\'t have a wikidata item. Skipping.' % page)
+ continue
+
+
# TODO below
@deprecated_args(extension=None, number="total", repeat=None)
def UnusedFilesGenerator(total=100, site=None, extension=None):
diff --git a/tests/pagegenerators_tests.py b/tests/pagegenerators_tests.py
index 9492579..0ce3bba 100755
--- a/tests/pagegenerators_tests.py
+++ b/tests/pagegenerators_tests.py
@@ -7,11 +7,14 @@
# Distributed under the terms of the MIT license.
__version__ = '$Id$'
+import datetime
import os
import sys
import pywikibot
from pywikibot import pagegenerators
+
+from pywikibot.pagegenerators import PagesFromTitlesGenerator
from tests import _data_dir
from tests.aspects import (
@@ -23,7 +26,28 @@
from tests.thread_tests import GeneratorIntersectTestCase
-class TestPageGenerators(TestCase):
+en_wp_page_titles = (
+ # just a bunch of randomly selected titles for English Wikipedia tests
+ u"Eastern Sayan",
+ u"The Addams Family (pinball)",
+ u"Talk:Nowy Sącz",
+ u"Talk:Battle of Węgierska Górka",
+ u"Template:!",
+ u"Template:Template",
+)
+
+en_wp_nopage_titles = (
+ u"Cities in Burkina Faso",
+ u"Talk:Hispanic (U.S. Census)",
+ u"Talk:Stołpce",
+ u"Template:!/Doc",
+ u"Template:!/Meta",
+ u"Template:Template/Doc",
+ u"Template:Template/Meta",
+)
+
+
+class TestDryPageGenerators(TestCase):
"""Test pagegenerators methods."""
@@ -32,25 +56,10 @@
dry = True
- titles = (
- # just a bunch of randomly selected titles
- u"Cities in Burkina Faso",
- u"Eastern Sayan",
- u"The Addams Family (pinball)",
- u"Talk:Hispanic (U.S. Census)",
- u"Talk:Stołpce",
- u"Talk:Nowy Sącz",
- u"Talk:Battle of Węgierska Górka",
- u"Template:!",
- u"Template:!/Doc",
- u"Template:!/Meta",
- u"Template:Template",
- u"Template:Template/Doc",
- u"Template:Template/Meta",
- )
+ titles = en_wp_page_titles + en_wp_nopage_titles
def setUp(self):
- super(TestPageGenerators, self).setUp()
+ super(TestDryPageGenerators, self).setUp()
self.site = self.get_site()
def assertFunction(self, obj):
@@ -113,8 +122,8 @@
gen = pagegenerators.RegexFilterPageGenerator(gen, ['template',
'/meta'],
quantifier='any')
self.assertPagelistTitles(gen,
- ('Template:!/Meta',
- 'Template:Template',
+ ('Template:Template',
+ 'Template:!/Meta',
'Template:Template/Doc',
'Template:Template/Meta'))
gen = pagegenerators.PagesFromTitlesGenerator(self.titles,
@@ -156,6 +165,59 @@
self.assertEqual(len(tuple(gen)), 9)
+class EdittimeFilterPageGeneratorTestCase(TestCase):
+
+ """Test EdittimeFilterPageGenerator."""
+
+ family = 'wikipedia'
+ code = 'en'
+
+ titles = en_wp_page_titles
+
+ def test_first_edit(self):
+ expect = (
+ u'The Addams Family (pinball)',
+ u'Talk:Nowy Sącz',
+ u'Template:Template',
+ )
+ gen = PagesFromTitlesGenerator(self.titles, self.site)
+ gen = pagegenerators.EdittimeFilterPageGenerator(
+ gen, first_edit_end=datetime.datetime(2006, 1, 1))
+ self.assertPagelistTitles(gen, titles=expect, site=self.site)
+
+ gen = PagesFromTitlesGenerator(self.titles, self.site)
+ gen = pagegenerators.EdittimeFilterPageGenerator(
+ gen, first_edit_start=datetime.datetime(2006, 1, 1))
+ opposite_pages = list(gen)
+ self.assertTrue(all(isinstance(p, pywikibot.Page)
+ for p in opposite_pages))
+ self.assertTrue(all(p.title not in expect for p in opposite_pages))
+
+ def test_last_edit(self):
+ two_days_ago = datetime.datetime.now() - datetime.timedelta(days=2)
+ nine_days_ago = datetime.datetime.now() - datetime.timedelta(days=9)
+
+ gen = PagesFromTitlesGenerator(['Wikipedia:Sandbox'], self.site)
+ gen = pagegenerators.EdittimeFilterPageGenerator(
+ gen, last_edit_start=two_days_ago)
+ self.assertEqual(len(list(gen)), 1)
+
+ gen = PagesFromTitlesGenerator(['Wikipedia:Sandbox'], self.site)
+ gen = pagegenerators.EdittimeFilterPageGenerator(
+ gen, last_edit_end=two_days_ago)
+ self.assertEqual(len(list(gen)), 0)
+
+ gen = PagesFromTitlesGenerator(['Template:Sidebox'], self.site)
+ gen = pagegenerators.EdittimeFilterPageGenerator(
+ gen, last_edit_end=nine_days_ago)
+ self.assertEqual(len(list(gen)), 1)
+
+ gen = PagesFromTitlesGenerator(['Template:Sidebox'], self.site)
+ gen = pagegenerators.EdittimeFilterPageGenerator(
+ gen, last_edit_start=nine_days_ago)
+ self.assertEqual(len(list(gen)), 0)
+
+
class TestRepeatingGenerator(TestCase):
"""Test RepeatingGenerator."""
--
To view, visit https://gerrit.wikimedia.org/r/172493
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I71b051818773e8b2f78eab534c4dfb23072d0ee9
Gerrit-PatchSet: 4
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: John Vandenberg <[email protected]>
Gerrit-Reviewer: John Vandenberg <[email protected]>
Gerrit-Reviewer: Ladsgroup <[email protected]>
Gerrit-Reviewer: Merlijn van Deen <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
Pywikibot-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/pywikibot-commits