This is an automated email from the ASF dual-hosted git repository.
alenka pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new f52ebbbf76 GH-37470: [Python][Parquet] Add missing arguments to
`ParquetFileWriteOptions` (#37469)
f52ebbbf76 is described below
commit f52ebbbf76df5d5a16257aae2504d23319723ae5
Author: Judah Rand <[email protected]>
AuthorDate: Wed Sep 20 09:52:16 2023 +0100
GH-37470: [Python][Parquet] Add missing arguments to
`ParquetFileWriteOptions` (#37469)
### Rationale for this change
I think this may have been missed when this feature was added.
### What changes are included in this PR?
### Are these changes tested?
### Are there any user-facing changes?
* Closes: #37470
Authored-by: Judah Rand <[email protected]>
Signed-off-by: AlenkaF <[email protected]>
---
python/pyarrow/_dataset_parquet.pyx | 8 ++++++++
python/pyarrow/tests/test_dataset.py | 32 ++++++++++++++++++++++++++++++++
2 files changed, 40 insertions(+)
diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx
index 79bd270ce5..cf5c44c1c9 100644
--- a/python/pyarrow/_dataset_parquet.pyx
+++ b/python/pyarrow/_dataset_parquet.pyx
@@ -595,6 +595,10 @@ cdef class ParquetFileWriteOptions(FileWriteOptions):
),
column_encoding=self._properties["column_encoding"],
data_page_version=self._properties["data_page_version"],
+ encryption_properties=self._properties["encryption_properties"],
+ write_batch_size=self._properties["write_batch_size"],
+ dictionary_pagesize_limit=self._properties["dictionary_pagesize_limit"],
+ write_page_index=self._properties["write_page_index"],
)
def _set_arrow_properties(self):
@@ -631,6 +635,10 @@ cdef class ParquetFileWriteOptions(FileWriteOptions):
coerce_timestamps=None,
allow_truncated_timestamps=False,
use_compliant_nested_type=True,
+ encryption_properties=None,
+ write_batch_size=None,
+ dictionary_pagesize_limit=None,
+ write_page_index=False,
)
self._set_properties()
self._set_arrow_properties()
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index b8a0c38089..e0988f2752 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -5291,6 +5291,38 @@ def test_write_dataset_preserve_field_metadata(tempdir):
assert dataset.to_table().schema.equals(schema_metadata,
check_metadata=True)
+def test_write_dataset_write_page_index(tempdir):
+ for write_statistics in [True, False]:
+ for write_page_index in [True, False]:
+ schema = pa.schema([
+ pa.field("x", pa.int64()),
+ pa.field("y", pa.int64())])
+
+ arrays = [[1, 2, 3], [None, 5, None]]
+ table = pa.Table.from_arrays(arrays, schema=schema)
+
+ file_format = ds.ParquetFileFormat()
+ base_dir = tempdir / f"write_page_index_{write_page_index}"
+ ds.write_dataset(
+ table,
+ base_dir,
+ format="parquet",
+ file_options=file_format.make_write_options(
+ write_statistics=write_statistics,
+ write_page_index=write_page_index,
+ ),
+ existing_data_behavior='overwrite_or_ignore',
+ )
+ ds1 = ds.dataset(base_dir, format="parquet")
+
+ for file in ds1.files:
+ # Can retrieve sorting columns from metadata
+ metadata = pq.read_metadata(file)
+ cc = metadata.row_group(0).column(0)
+ assert cc.has_offset_index is write_page_index
+ assert cc.has_column_index is write_page_index & write_statistics
+
+
@pytest.mark.parametrize('dstype', [
"fs", "mem"
])