This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new fac70f9 ARROW-8414: [Python] Fix non-deterministic row order failure in parquet tests
fac70f9 is described below
commit fac70f9ea4600ea4da31a6fb4c3be07c3d53107b
Author: Joris Van den Bossche <[email protected]>
AuthorDate: Mon Apr 13 09:44:05 2020 -0500
ARROW-8414: [Python] Fix non-deterministic row order failure in parquet tests
Closes #6903 from jorisvandenbossche/fix-dataset-failure
Authored-by: Joris Van den Bossche <[email protected]>
Signed-off-by: Wes McKinney <[email protected]>
---
python/pyarrow/tests/test_parquet.py | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 89131b4..3dee3ac 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -2691,12 +2691,15 @@ def _test_write_to_dataset_with_partitions(base_path,
input_df_cols = input_df.columns.tolist()
assert partition_by == input_df_cols[-1 * len(partition_by):]
- # Partitioned columns become 'categorical' dtypes
input_df = input_df[cols]
if use_legacy_dataset:
+ # Partitioned columns become 'categorical' dtypes
for col in partition_by:
output_df[col] = output_df[col].astype('category')
- assert output_df.equals(input_df)
+ else:
+ # ensure deterministic row order
+ input_df = input_df.sort_values(by=["num"]).reset_index(drop=True)
+ tm.assert_frame_equal(output_df, input_df)
def _test_write_to_dataset_no_partitions(base_path,