This is an automated email from the ASF dual-hosted git repository.
jorisvandenbossche pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 0a0d7fea95 ARROW-16420: [Python] pq.write_to_dataset always ignores partitioning
0a0d7fea95 is described below
commit 0a0d7fea9550b48706cf678036eaa8ddb1ef2be8
Author: Alenka Frim <[email protected]>
AuthorDate: Thu May 19 10:01:56 2022 +0200
ARROW-16420: [Python] pq.write_to_dataset always ignores partitioning
Remove the lines that unconditionally set `partitioning` and `file_visitor`
in `pq.write_to_dataset` to None. This is a leftover from
https://github.com/apache/arrow/pull/12811 where additional `pq.write_dataset`
keywords were exposed.
Closes #13062 from AlenkaF/ARROW-16420
Authored-by: Alenka Frim <[email protected]>
Signed-off-by: Joris Van den Bossche <[email protected]>
---
python/pyarrow/parquet/__init__.py | 7 +++----
python/pyarrow/tests/parquet/test_dataset.py | 27 +++++++++++++++++++++++++++
2 files changed, 30 insertions(+), 4 deletions(-)
diff --git a/python/pyarrow/parquet/__init__.py b/python/pyarrow/parquet/__init__.py
index 967d39d3db..a9437aec05 100644
--- a/python/pyarrow/parquet/__init__.py
+++ b/python/pyarrow/parquet/__init__.py
@@ -3125,7 +3125,6 @@ def write_to_dataset(table, root_path, partition_cols=None,
"implementation."
)
metadata_collector = kwargs.pop('metadata_collector', None)
- file_visitor = None
if metadata_collector is not None:
def file_visitor(written_file):
metadata_collector.append(written_file.metadata)
@@ -3140,15 +3139,15 @@ def write_to_dataset(table, root_path, partition_cols=None,
if filesystem is not None:
filesystem = _ensure_filesystem(filesystem)
- partitioning = None
if partition_cols:
part_schema = table.select(partition_cols).schema
partitioning = ds.partitioning(part_schema, flavor="hive")
if basename_template is None:
basename_template = guid() + '-{i}.parquet'
- if existing_data_behavior is None:
- existing_data_behavior = 'overwrite_or_ignore'
+
+ if existing_data_behavior is None:
+ existing_data_behavior = 'overwrite_or_ignore'
ds.write_dataset(
table, root_path, filesystem=filesystem,
diff --git a/python/pyarrow/tests/parquet/test_dataset.py b/python/pyarrow/tests/parquet/test_dataset.py
index 99d2f6d1de..14b97265d8 100644
--- a/python/pyarrow/tests/parquet/test_dataset.py
+++ b/python/pyarrow/tests/parquet/test_dataset.py
@@ -17,6 +17,7 @@
import datetime
import os
+import pathlib
import numpy as np
import pytest
@@ -1792,3 +1793,29 @@ def test_parquet_write_to_dataset_unsupported_keywards_in_legacy(tempdir):
with pytest.raises(ValueError, match="existing_data_behavior"):
pq.write_to_dataset(table, path, use_legacy_dataset=True,
existing_data_behavior='error')
+
+
[email protected]
+def test_parquet_write_to_dataset_exposed_keywords(tempdir):
+ table = pa.table({'a': [1, 2, 3]})
+ path = tempdir / 'partitioning'
+
+ paths_written = []
+
+ def file_visitor(written_file):
+ paths_written.append(written_file.path)
+
+ basename_template = 'part-{i}.parquet'
+
+ pq.write_to_dataset(table, path, partitioning=["a"],
+ file_visitor=file_visitor,
+ basename_template=basename_template,
+ use_legacy_dataset=False)
+
+ expected_paths = {
+ path / '1' / 'part-0.parquet',
+ path / '2' / 'part-0.parquet',
+ path / '3' / 'part-0.parquet'
+ }
+ paths_written_set = set(map(pathlib.Path, paths_written))
+ assert paths_written_set == expected_paths