jenkins-bot has submitted this change. ( 
https://gerrit.wikimedia.org/r/c/pywikibot/core/+/631282 )

Change subject: [bugfix] Avoid duplicates in intersect_generators()
......................................................................

[bugfix] Avoid duplicates in intersect_generators()

Avoid duplicates in intersect_generators().

Add also the possibility to have duplicates if all generators contain
the same page several times.

The solution uses a cache, so for infinite streams there will be memory
issues.

Bug: T263947
Change-Id: I86adcacded455ecd87a35b140dd371cc6caca5a4
---
M pywikibot/tools/__init__.py
M tests/thread_tests.py
2 files changed, 47 insertions(+), 15 deletions(-)

Approvals:
  Xqt: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/pywikibot/tools/__init__.py b/pywikibot/tools/__init__.py
index 10bf6bc..b44f246 100644
--- a/pywikibot/tools/__init__.py
+++ b/pywikibot/tools/__init__.py
@@ -692,7 +692,7 @@
                   .format(thd, thd.queue.qsize()), self._logger)


-def intersect_generators(genlist):
+def intersect_generators(genlist, allow_duplicates=False):
     """
     Intersect generators listed in genlist.

@@ -707,6 +707,8 @@

     @param genlist: list of page generators
     @type genlist: list
+    @param allow_duplicates: allow duplicates if present in all generators
+    @type allow_duplicates: bool
     """
     # If any generator is empty, no pages are going to be returned
     for source in genlist:
@@ -717,7 +719,8 @@

     # Item is cached to check that it is found n_gen
     # times before being yielded.
-    cache = collections.defaultdict(set)
+    from collections import Counter
+    cache = collections.defaultdict(Counter)
     n_gen = len(genlist)

     # Class to keep track of alive threads.
@@ -729,6 +732,9 @@
         threaded_gen.daemon = True
         thrlist.append(threaded_gen)

+    ones = Counter(thrlist)
+    seen = {}
+
     while True:
         # Get items from queues in a round-robin way.
         for t in thrlist:
@@ -736,14 +742,23 @@
                 # TODO: evaluate if True and timeout is necessary.
                 item = t.queue.get(True, 0.1)

-                # Cache entry is a set of thread.
-                # Duplicates from same thread are not counted twice.
-                cache[item].add(t)
+                if not allow_duplicates and hash(item) in seen:
+                    continue
+
+                # Cache entry is a Counter of ThreadedGenerator objects.
+                cache[item].update([t])
                 if len(cache[item]) == n_gen:
-                    yield item
-                    # Remove item from cache.
-                    # No chance of seeing it again (see later: early stop).
-                    cache.pop(item)
+                    if allow_duplicates:
+                        yield item
+                        # Remove item from cache if possible.
+                        if all(el == 1 for el in cache[item].values()):
+                            cache.pop(item)
+                        else:
+                            cache[item] -= ones
+                    else:
+                        yield item
+                        cache.pop(item)
+                        seen[hash(item)] = True

                 active = thrlist.active_count()
                 max_cache = n_gen
diff --git a/tests/thread_tests.py b/tests/thread_tests.py
index 615395b..1516b0e 100644
--- a/tests/thread_tests.py
+++ b/tests/thread_tests.py
@@ -5,6 +5,7 @@
 #
 # Distributed under the terms of the MIT license.
 #
+from collections import Counter
 from contextlib import suppress

 from tests.aspects import unittest, TestCase
@@ -53,6 +54,18 @@
         self.assertCountEqual(set(result), result)
         self.assertCountEqual(result, set_result)

+    def assertEqualItertoolsWithDuplicates(self, gens):
+        """Assert intersect_generators result equals Counter intersection."""
+        # If they are a generator, we need to convert to a list
+        # first otherwise the generator is empty the second time.
+        datasets = [list(gen) for gen in gens]
+        counter_result = Counter(datasets[0])
+        for dataset in datasets[1:]:
+            counter_result = counter_result & Counter(dataset)
+        counter_result = list(counter_result.elements())
+        result = list(intersect_generators(datasets, allow_duplicates=True))
+        self.assertCountEqual(counter_result, result)
+

 class BasicGeneratorIntersectTestCase(GeneratorIntersectTestCase):

@@ -61,17 +74,21 @@
     net = False

     def test_intersect_basic(self):
-        """Test basic interset without duplicates."""
+        """Test basic intersect without duplicates."""
         self.assertEqualItertools(['abc', 'db', 'ba'])
 
     def test_intersect_with_dups(self):
-        """Test basic interset with duplicates."""
+        """Test basic intersect with duplicates."""
         self.assertEqualItertools(['aabc', 'dddb', 'baa'])

-    @unittest.expectedFailure
-    def test_intersect_with_dups_failing(self):
-        """Failing basic intersect test with duplicates."""
-        self.assertEqualItertools(['abb', 'bb'])
+    def test_intersect_with_accepted_dups(self):
+        """Test intersect with duplicates accepted."""
+        self.assertEqualItertoolsWithDuplicates(['abc', 'db', 'ba'])
+        self.assertEqualItertoolsWithDuplicates(['aabc', 'dddb', 'baa'])
+        self.assertEqualItertoolsWithDuplicates(['abb', 'bb'])
+        self.assertEqualItertoolsWithDuplicates(['bb', 'abb'])
+        self.assertEqualItertoolsWithDuplicates(['abbcd', 'abcba'])
+        self.assertEqualItertoolsWithDuplicates(['abcba', 'abbcd'])


 if __name__ == '__main__':  # pragma: no cover

--
To view, visit https://gerrit.wikimedia.org/r/c/pywikibot/core/+/631282
To unsubscribe, or for help writing mail filters, visit 
https://gerrit.wikimedia.org/r/settings

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I86adcacded455ecd87a35b140dd371cc6caca5a4
Gerrit-Change-Number: 631282
Gerrit-PatchSet: 2
Gerrit-Owner: Mpaa <[email protected]>
Gerrit-Reviewer: Xqt <[email protected]>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged
_______________________________________________
Pywikibot-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/pywikibot-commits

Reply via email to