jenkins-bot has submitted this change and it was merged.
Change subject: filter_unique for pagegenerators
......................................................................
filter_unique for pagegenerators
This changeset introduces a more flexible de-duplication method,
especially needed to avoid memory problems for very long job runs.
It only provides a basic 100% backwards compatible implementation in
pagegenerators. The next step will be to switch to explicitly using
hash which improves performance and addresses the general memory
problem, and also sharing the seen list between unioned generators.
Change-Id: I8d72b148bcee50dba8645112af6f8e2ce3e30bca
---
M pywikibot/pagegenerators.py
M pywikibot/tools/__init__.py
M tests/tools_tests.py
3 files changed, 353 insertions(+), 32 deletions(-)
Approvals:
XZise: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py
index 830f2fa..92231c3 100644
--- a/pywikibot/pagegenerators.py
+++ b/pywikibot/pagegenerators.py
@@ -34,13 +34,15 @@
import pywikibot
-from pywikibot import date, config, i18n
from pywikibot.tools import (
deprecated,
deprecated_args,
DequeGenerator,
intersect_generators,
+ filter_unique,
)
+
+from pywikibot import date, config, i18n
from pywikibot.comms import http
from pywikibot.data import wikidataquery as wdquery
from pywikibot.exceptions import ArgumentDeprecationWarning
@@ -289,6 +291,15 @@
that are used by many scripts and that determine which pages to work on.
"""
+ # This is the function that will be used to de-duplicate iterators.
+ # See the documentation in L{pywikibot.tools.filter_unique} for reasons
+ # why this should be changed to improve space and time of execution.
+ _filter_unique = staticmethod(filter_unique)
+ # The seen list can not yet be shared at present, due to `intersect` mode
+ # not being known until after all generators have been created.
+ # When not in intersect mode, _filter_unique could be:
+ # functools.partial(filter_unique, container=global_seen_list)
+
def __init__(self, site=None):
"""
Constructor.
@@ -384,7 +395,7 @@
dupfiltergen = gensList
else:
gensList = CombinedPageGenerator(self.gens)
- dupfiltergen = DuplicateFilterPageGenerator(gensList)
+ dupfiltergen = self._filter_unique(gensList)
if self.claimfilter_list:
dupfiltergen = PreloadingItemGenerator(dupfiltergen)
@@ -530,14 +541,13 @@
gen = RandomPageGenerator(total=int(arg[8:]), site=self.site)
elif arg.startswith('-recentchanges'):
if len(arg) >= 15:
- gen = RecentChangesPageGenerator(namespaces=self.namespaces,
- total=int(arg[15:]),
- site=self.site)
+ total = int(arg[15:])
else:
- gen = RecentChangesPageGenerator(namespaces=self.namespaces,
- total=60,
- site=self.site)
- gen = DuplicateFilterPageGenerator(gen)
+ total = 60
+ gen = RecentChangesPageGenerator(namespaces=self.namespaces,
+ total=total,
+ site=self.site,
+                                                 _filter_unique=self._filter_unique)
elif arg.startswith('-liverecentchanges'):
if len(arg) >= 19:
gen = LiveRCPageGenerator(self.site, total=int(arg[19:]))
@@ -942,9 +952,10 @@
showBot=None, showAnon=None,
showRedirects=None, showPatrolled=None,
topOnly=False, step=None, total=None,
- user=None, excludeuser=None, site=None):
+ user=None, excludeuser=None, site=None,
+ _filter_unique=None):
"""
- Generate pages that are in the recent changes list.
+ Generate pages that are in the recent changes list, including duplicates.
@param start: Timestamp to start listing from
@type start: pywikibot.Timestamp
@@ -986,18 +997,22 @@
"""
if site is None:
site = pywikibot.Site()
- for item in site.recentchanges(start=start, end=end, reverse=reverse,
- namespaces=namespaces, pagelist=pagelist,
- changetype=changetype, showMinor=showMinor,
- showBot=showBot, showAnon=showAnon,
- showRedirects=showRedirects,
- showPatrolled=showPatrolled,
- topOnly=topOnly, step=step, total=total,
- user=user, excludeuser=excludeuser):
- # The title in a log entry may have been suppressed
- if 'title' not in item and item['type'] == 'log':
- continue
- yield pywikibot.Page(pywikibot.Link(item["title"], site))
+
+ gen = site.recentchanges(start=start, end=end, reverse=reverse,
+ namespaces=namespaces, pagelist=pagelist,
+ changetype=changetype, showMinor=showMinor,
+ showBot=showBot, showAnon=showAnon,
+ showRedirects=showRedirects,
+ showPatrolled=showPatrolled,
+ topOnly=topOnly, step=step, total=total,
+ user=user, excludeuser=excludeuser)
+
+ gen = (pywikibot.Page(site, x['title'])
+ for x in gen if x['type'] != 'log' or 'title' in x)
+
+ if _filter_unique:
+ gen = _filter_unique(gen)
+ return gen
def FileLinksGenerator(referredFilePage, step=None, total=None, content=False):
@@ -1143,7 +1158,8 @@
@deprecated_args(number="total")
def UserContributionsGenerator(username, namespaces=None, site=None,
- step=None, total=None):
+ step=None, total=None,
+ _filter_unique=filter_unique):
"""Yield unique pages edited by user:username.
@param step: Maximum number of pages to retrieve per API query
@@ -1158,7 +1174,7 @@
"""
if site is None:
site = pywikibot.Site()
- return DuplicateFilterPageGenerator(
+ return _filter_unique(
pywikibot.Page(pywikibot.Link(contrib["title"], source=site))
for contrib in site.usercontribs(user=username, namespaces=namespaces,
step=step, total=total)
@@ -1256,13 +1272,7 @@
% page)
-def DuplicateFilterPageGenerator(generator):
- """Yield all unique pages from another generator, omitting duplicates."""
- seenPages = {}
- for page in generator:
- if page not in seenPages:
- seenPages[page] = True
- yield page
+DuplicateFilterPageGenerator = filter_unique
class ItemClaimFilter(object):
diff --git a/pywikibot/tools/__init__.py b/pywikibot/tools/__init__.py
index 3a836db..403ccd9 100644
--- a/pywikibot/tools/__init__.py
+++ b/pywikibot/tools/__init__.py
@@ -620,6 +620,62 @@
return
+def filter_unique(iterable, container=None, key=None, add=None):
+ """
+ Yield unique items from an iterable, omitting duplicates.
+
+ By default, to provide uniqueness, it puts the generated items into
+ the keys of a dict created as a local variable, each with a value of True.
+ It only yields items which are not already present in the local dict.
+
+ For large collections, this is not memory efficient, as a strong reference
+ to every item is kept in a local dict which can not be cleared.
+
+ Also, the local dict can't be re-used when chaining unique operations on
+ multiple generators.
+
+ To avoid these issues, it is advisable for the caller to provide their own
+ container and set the key parameter to be the function L{hash}, or use a
+ L{weakref} as the key.
+
+ The container can be any object that supports __contains__.
+ If the container is a set or dict, the method add or __setitem__ will be
+ used automatically. Any other method may be provided explicitly using the
+ add parameter.
+
+ Note: This is not thread safe.
+
+ @param iterable: the source iterable
+ @type iterable: collections.Iterable
+ @param container: storage of seen items
+ @type container: type
+ @param key: function to convert the item to a key
+ @type key: callable
+ @param add: function to add an item to the container
+ @type add: callable
+ """
+ if container is None:
+ container = {}
+
+ if not add:
+ if hasattr(container, 'add'):
+ def container_add(x):
+ container.add(key(x) if key else x)
+
+ add = container_add
+ else:
+ def container_setitem(x):
+ container.__setitem__(key(x) if key else x,
+ True)
+
+ add = container_setitem
+
+ for item in iterable:
+ if (key(item) if key else item) not in container:
+ add(item)
+ yield item
+
+
class CombinedError(KeyError, IndexError):
"""An error that gets caught by both KeyError and IndexError."""
diff --git a/tests/tools_tests.py b/tests/tools_tests.py
index 434cab5..b60b2a5 100644
--- a/tests/tools_tests.py
+++ b/tests/tools_tests.py
@@ -9,13 +9,17 @@
__version__ = '$Id$'
+import collections
+import decimal
import os.path
import subprocess
+import sys
from pywikibot import tools
from tests import _data_dir
from tests.aspects import unittest, TestCase
+from tests.utils import expected_failure_if
_xml_data_dir = os.path.join(_data_dir, 'xml')
@@ -153,6 +157,257 @@
ValueError, '42', tools.merge_unique_dicts, self.dct1, **self.dct1)
+def passthrough(x):
+ """Return x."""
+ return x
+
+
+class SkipList(set):
+
+ """Container that ignores items."""
+
+ skip_list = [1, 3]
+
+ def __contains__(self, item):
+ """Override to not process some items."""
+ if item in self.skip_list:
+ return True
+ else:
+ return super(SkipList, self).__contains__(item)
+
+
+class ProcessAgainList(set):
+
+ """Container that keeps processing certain items."""
+
+ process_again_list = [1, 3]
+
+ def add(self, item):
+ """Override to not add some items."""
+ if item in self.process_again_list:
+ return
+ else:
+ return super(ProcessAgainList, self).add(item)
+
+
+class ContainsStopList(set):
+
+ """Container that stops when encountering items."""
+
+ stop_list = []
+
+ def __contains__(self, item):
+ """Override to stop on encountering items."""
+ if item in self.stop_list:
+ raise StopIteration
+ else:
+ return super(ContainsStopList, self).__contains__(item)
+
+
+class AddStopList(set):
+
+ """Container that stops when encountering items."""
+
+ stop_list = []
+
+ def add(self, item):
+ """Override to not continue on encountering items."""
+ if item in self.stop_list:
+ raise StopIteration
+ else:
+ super(AddStopList, self).add(item)
+
+
+class TestFilterUnique(TestCase):
+
+ """Test filter_unique."""
+
+ net = False
+
+ ints = [1, 3, 2, 1, 2, 1, 2, 4, 2]
+ strs = [str(i) for i in ints]
+ decs = [decimal.Decimal(i) for i in ints]
+
+ def _test_dedup_int(self, deduped, deduper, key=None):
+ """Test filter_unique results for int."""
+ if not key:
+ key = passthrough
+
+ self.assertEqual(len(deduped), 0)
+
+ self.assertEqual(next(deduper), 1)
+ self.assertEqual(next(deduper), 3)
+
+ if key in (hash, passthrough):
+ if isinstance(deduped, tools.OrderedDict):
+ self.assertEqual(list(deduped.keys()), [1, 3])
+ elif isinstance(deduped, collections.Mapping):
+ self.assertCountEqual(list(deduped.keys()), [1, 3])
+ else:
+ self.assertEqual(deduped, set([1, 3]))
+
+ self.assertEqual(next(deduper), 2)
+ self.assertEqual(next(deduper), 4)
+
+ if key in (hash, passthrough):
+ if isinstance(deduped, tools.OrderedDict):
+ self.assertEqual(list(deduped.keys()), [1, 3, 2, 4])
+ elif isinstance(deduped, collections.Mapping):
+ self.assertCountEqual(list(deduped.keys()), [1, 2, 3, 4])
+ else:
+ self.assertEqual(deduped, set([1, 2, 3, 4]))
+
+ self.assertRaises(StopIteration, next, deduper)
+
+ def _test_dedup_str(self, deduped, deduper, key=None):
+ """Test filter_unique results for str."""
+ if not key:
+ key = passthrough
+
+ self.assertEqual(len(deduped), 0)
+
+ self.assertEqual(next(deduper), '1')
+ self.assertEqual(next(deduper), '3')
+
+ if key in (hash, passthrough):
+ if isinstance(deduped, collections.Mapping):
+ self.assertEqual(deduped.keys(), [key('1'), key('3')])
+ else:
+ self.assertEqual(deduped, set([key('1'), key('3')]))
+
+ self.assertEqual(next(deduper), '2')
+ self.assertEqual(next(deduper), '4')
+
+ if key in (hash, passthrough):
+ if isinstance(deduped, collections.Mapping):
+ self.assertEqual(deduped.keys(), [key(i) for i in self.strs])
+ else:
+ self.assertEqual(deduped, set(key(i) for i in self.strs))
+
+ self.assertRaises(StopIteration, next, deduper)
+
+ def test_set(self):
+ """Test filter_unique with a set."""
+ deduped = set()
+ deduper = tools.filter_unique(self.ints, container=deduped)
+ self._test_dedup_int(deduped, deduper)
+
+ def test_dict(self):
+ """Test filter_unique with a dict."""
+ deduped = dict()
+ deduper = tools.filter_unique(self.ints, container=deduped)
+ self._test_dedup_int(deduped, deduper)
+
+ def test_OrderedDict(self):
+ """Test filter_unique with a OrderedDict."""
+ deduped = tools.OrderedDict()
+ deduper = tools.filter_unique(self.ints, container=deduped)
+ self._test_dedup_int(deduped, deduper)
+
+ def test_int_hash(self):
+ """Test filter_unique with ints using hash as key."""
+ deduped = set()
+ deduper = tools.filter_unique(self.ints, container=deduped, key=hash)
+ self._test_dedup_int(deduped, deduper, hash)
+
+ def test_int_id(self):
+ """Test filter_unique with ints using id as key."""
+ deduped = set()
+ deduper = tools.filter_unique(self.ints, container=deduped, key=id)
+ self._test_dedup_int(deduped, deduper, id)
+
+ def test_obj(self):
+ """Test filter_unique with objects."""
+ deduped = set()
+ deduper = tools.filter_unique(self.decs, container=deduped)
+ self._test_dedup_int(deduped, deduper)
+
+ def test_obj_hash(self):
+ """Test filter_unique with objects using hash as key."""
+ deduped = set()
+ deduper = tools.filter_unique(self.decs, container=deduped, key=hash)
+ self._test_dedup_int(deduped, deduper, hash)
+
+ @unittest.expectedFailure
+ def test_obj_id(self):
+ """Test filter_unique with objects using id as key, which fails."""
+ # Two objects which may be equal do not have the same id.
+ deduped = set()
+ deduper = tools.filter_unique(self.decs, container=deduped, key=id)
+ self._test_dedup_int(deduped, deduper, id)
+
+ def test_str(self):
+ """Test filter_unique with str."""
+ deduped = set()
+ deduper = tools.filter_unique(self.strs, container=deduped)
+ self._test_dedup_str(deduped, deduper)
+
+ def test_str_hash(self):
+ """Test filter_unique with str using hash as key."""
+ deduped = set()
+ deduper = tools.filter_unique(self.strs, container=deduped, key=hash)
+ self._test_dedup_str(deduped, deduper, hash)
+
+ @expected_failure_if(sys.version_info[0] >= 3)
+ def test_str_id(self):
+ """Test str using id as key fails on Python 3."""
+ # str in Python 3 behave like objects.
+ deduped = set()
+ deduper = tools.filter_unique(self.strs, container=deduped, key=id)
+ self._test_dedup_str(deduped, deduper, id)
+
+ def test_for_resumable(self):
+ """Test filter_unique is resumable after a for loop."""
+ gen2 = tools.filter_unique(self.ints)
+ deduped = []
+ for item in gen2:
+ deduped.append(item)
+ if len(deduped) == 3:
+ break
+ self.assertEqual(deduped, [1, 3, 2])
+ last = next(gen2)
+ self.assertEqual(last, 4)
+ self.assertRaises(StopIteration, next, gen2)
+
+ def test_skip(self):
+ """Test filter_unique with a container that skips items."""
+ deduped = SkipList()
+ deduper = tools.filter_unique(self.ints, container=deduped)
+ deduped_out = list(deduper)
+ self.assertCountEqual(deduped, deduped_out)
+ self.assertEqual(deduped, set([2, 4]))
+
+ def test_process_again(self):
+ """Test filter_unique with an ignoring container."""
+ deduped = ProcessAgainList()
+ deduper = tools.filter_unique(self.ints, container=deduped)
+ deduped_out = list(deduper)
+ self.assertEqual(deduped_out, [1, 3, 2, 1, 1, 4])
+ self.assertEqual(deduped, set([2, 4]))
+
+ def test_stop(self):
+ """Test filter_unique with an ignoring container."""
+ deduped = ContainsStopList()
+ deduped.stop_list = [2]
+ deduper = tools.filter_unique(self.ints, container=deduped)
+ deduped_out = list(deduper)
+ self.assertCountEqual(deduped, deduped_out)
+ self.assertEqual(deduped, set([1, 3]))
+
+ # And it should not resume
+ self.assertRaises(StopIteration, next, deduper)
+
+ deduped = AddStopList()
+ deduped.stop_list = [4]
+ deduper = tools.filter_unique(self.ints, container=deduped)
+ deduped_out = list(deduper)
+ self.assertCountEqual(deduped, deduped_out)
+ self.assertEqual(deduped, set([1, 2, 3]))
+
+ # And it should not resume
+ self.assertRaises(StopIteration, next, deduper)
+
+
if __name__ == '__main__':
try:
unittest.main()
--
To view, visit https://gerrit.wikimedia.org/r/183536
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I8d72b148bcee50dba8645112af6f8e2ce3e30bca
Gerrit-PatchSet: 10
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: John Vandenberg <[email protected]>
Gerrit-Reviewer: John Vandenberg <[email protected]>
Gerrit-Reviewer: Ladsgroup <[email protected]>
Gerrit-Reviewer: Merlijn van Deen <[email protected]>
Gerrit-Reviewer: XZise <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
Pywikibot-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/pywikibot-commits