HyukjinKwon commented on code in PR #36701:
URL: https://github.com/apache/spark/pull/36701#discussion_r884365865


##########
python/pyspark/tests/test_shuffle.py:
##########
@@ -117,6 +171,34 @@ def legit_merge_combiners(x, y):
             m.mergeCombiners(map(lambda x_y1: (x_y1[0], [x_y1[1]]), data))
 
 
+class ExternalGroupByTests(unittest.TestCase):
+    def setUp(self):
+        self.N = 1 << 20
+        values = [i for i in range(self.N)]
+        keys = [i for i in range(2)]
+        import itertools
+
+        self.data = [value for value in itertools.product(keys, values)]
+        self.agg = Aggregator(
+            lambda x: [x], lambda x, y: x.append(y) or x, lambda x, y: x.extend(y) or x
+        )
+
+    def test_medium_dataset(self):
+        # SPARK-39179: Test external group by for medium dataset
+        m = ExternalGroupBy(self.agg, 5, partitions=3)
+        m.mergeValues(self.data)
+        self.assertTrue(m.spills >= 1)
+        self.assertEqual(sum(sum(v) for k, v in m.items()), 2 * sum(range(self.N)))
+
+    def test_dataset_with_keys_are_unsorted(self):
+        # SPARK-39179: Test external group by when the number of keys is greater than SORT_KEY_LIMIT.
+        m = ExternalGroupBy(self.agg, 5, partitions=3)
+        m.SORT_KEY_LIMIT = 1

Review Comment:
   Let's restore this to the original value via try-except.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to