Github user holdenk commented on a diff in the pull request:
https://github.com/apache/spark/pull/22275#discussion_r232420076
--- Diff: python/pyspark/sql/tests.py ---
@@ -4923,6 +4923,28 @@ def test_timestamp_dst(self):
         self.assertPandasEqual(pdf, df_from_python.toPandas())
         self.assertPandasEqual(pdf, df_from_pandas.toPandas())
 
+    def test_toPandas_batch_order(self):
+
+        # Collects Arrow RecordBatches out of order in driver JVM then re-orders in Python
+        def run_test(num_records, num_parts, max_records):
+            df = self.spark.range(num_records, numPartitions=num_parts).toDF("a")
+            with self.sql_conf({"spark.sql.execution.arrow.maxRecordsPerBatch": max_records}):
+                pdf, pdf_arrow = self._toPandas_arrow_toggle(df)
+                self.assertPandasEqual(pdf, pdf_arrow)
+
+        cases = [
+            (1024, 512, 2),  # Try large num partitions for good chance of not collecting in order
+            (512, 64, 2),    # Try medium num partitions to test out of order collection
+            (64, 8, 2),      # Try small number of partitions to test out of order collection
+            (64, 64, 1),     # Test single batch per partition
+            (64, 1, 64),     # Test single partition, single batch
+            (64, 1, 8),      # Test single partition, multiple batches
+            (30, 7, 2),      # Test different sized partitions
+        ]
--- End diff --
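The re-ordering the test's comment refers to can be pictured with a small standalone pyarrow sketch (names hypothetical, and the actual PySpark internals differ): batches arrive from the JVM tagged with their original index, and sorting by that index restores partition order before conversion to pandas.

```python
import pyarrow as pa

def reorder_batches(indexed_batches):
    # indexed_batches: iterable of (partition_index, pyarrow.RecordBatch)
    # pairs, possibly received out of order. Sorting by index restores the
    # original order before converting to a single pandas DataFrame.
    ordered = [batch for _, batch in sorted(indexed_batches, key=lambda ib: ib[0])]
    return pa.Table.from_batches(ordered).to_pandas()

# Two batches received in the wrong order still come back as rows 0..3.
b0 = pa.RecordBatch.from_arrays([pa.array([0, 1])], names=["a"])
b1 = pa.RecordBatch.from_arrays([pa.array([2, 3])], names=["a"])
print(reorder_batches([(1, b1), (0, b0)]))
```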
I like the new tests. I think a 0.1s delay on one of the partitions is enough.
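A minimal sketch of that suggestion (hypothetical helper, not the actual test code): sleeping 0.1s in one partition makes its Arrow batches likely to arrive at the driver after the others, which should exercise the out-of-order path.

```python
import time

def delay_first_partition(df):
    # Hypothetical helper: sleep 0.1s inside partition 0 so its batches
    # tend to be collected last by the driver JVM.
    def delay(index, itr):
        if index == 0:
            time.sleep(0.1)
        return itr
    return df.rdd.mapPartitionsWithIndex(delay).toDF()
```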
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]