HyukjinKwon commented on code in PR #36701:
URL: https://github.com/apache/spark/pull/36701#discussion_r888519926


##########
python/pyspark/tests/test_shuffle.py:
##########
@@ -117,6 +169,37 @@ def legit_merge_combiners(x, y):
             m.mergeCombiners(map(lambda x_y1: (x_y1[0], [x_y1[1]]), data))
 
 
+class ExternalGroupByTests(unittest.TestCase):
+    def setUp(self):
+        self.N = 1 << 20
+        values = [i for i in range(self.N)]
+        keys = [i for i in range(2)]
+        import itertools
+
+        self.data = [value for value in itertools.product(keys, values)]
+        self.agg = Aggregator(
+            lambda x: [x], lambda x, y: x.append(y) or x, lambda x, y: 
x.extend(y) or x
+        )
+
+    def test_medium_dataset(self):
+        # SPARK-39179: Test external group by for medium dataset
+        m = ExternalGroupBy(self.agg, 5, partitions=3)
+        m.mergeValues(self.data)
+        self.assertTrue(m.spills >= 1)
+        self.assertEqual(sum(sum(v) for k, v in m.items()), 2 * 
sum(range(self.N)))
+
+    def test_dataset_with_keys_are_unsorted(self):
+        # SPARK-39179: Test external group when numbers of keys are greater 
than SORT KEY Limit.
+        m = ExternalGroupBy(self.agg, 5, partitions=3)
+        try:
+            m.SORT_KEY_LIMIT = 1
+            m.mergeValues(self.data)
+            self.assertTrue(m.spills >= 1)
+            self.assertEqual(sum(sum(v) for k, v in m.items()), 2 * 
sum(range(self.N)))
+        finally:
+            m.SORT_KEY_LIMIT = 1000

Review Comment:
   Let's probably do it this way:
   
   ```python
   origin = m.SORT_KEY_LIMIT
   try:
       m.SORT_KEY_LIMIT = ...
   finally:
       m.SORT_KEY_LIMIT = origin
   ```
   
   This way, if somebody changes the default value in the main code, we 
won't have to update the test along with it.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to