This is an automated email from the ASF dual-hosted git repository.

alenka pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new f52ebbbf76 GH-37470: [Python][Parquet] Add missing arguments to `ParquetFileWriteOptions` (#37469)
f52ebbbf76 is described below

commit f52ebbbf76df5d5a16257aae2504d23319723ae5
Author: Judah Rand <[email protected]>
AuthorDate: Wed Sep 20 09:52:16 2023 +0100

    GH-37470: [Python][Parquet] Add missing arguments to `ParquetFileWriteOptions` (#37469)
    
    
    
    ### Rationale for this change
    
    I think this may have been missed when this feature was added.
    
    ### What changes are included in this PR?
    
    ### Are these changes tested?
    
    ### Are there any user-facing changes?
    
    * Closes: #37470
    
    Authored-by: Judah Rand <[email protected]>
    Signed-off-by: AlenkaF <[email protected]>
---
 python/pyarrow/_dataset_parquet.pyx  |  8 ++++++++
 python/pyarrow/tests/test_dataset.py | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+)

diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx
index 79bd270ce5..cf5c44c1c9 100644
--- a/python/pyarrow/_dataset_parquet.pyx
+++ b/python/pyarrow/_dataset_parquet.pyx
@@ -595,6 +595,10 @@ cdef class ParquetFileWriteOptions(FileWriteOptions):
             ),
             column_encoding=self._properties["column_encoding"],
             data_page_version=self._properties["data_page_version"],
+            encryption_properties=self._properties["encryption_properties"],
+            write_batch_size=self._properties["write_batch_size"],
+            dictionary_pagesize_limit=self._properties["dictionary_pagesize_limit"],
+            write_page_index=self._properties["write_page_index"],
         )
 
     def _set_arrow_properties(self):
@@ -631,6 +635,10 @@ cdef class ParquetFileWriteOptions(FileWriteOptions):
             coerce_timestamps=None,
             allow_truncated_timestamps=False,
             use_compliant_nested_type=True,
+            encryption_properties=None,
+            write_batch_size=None,
+            dictionary_pagesize_limit=None,
+            write_page_index=False,
         )
         self._set_properties()
         self._set_arrow_properties()
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index b8a0c38089..e0988f2752 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -5291,6 +5291,38 @@ def test_write_dataset_preserve_field_metadata(tempdir):
     assert dataset.to_table().schema.equals(schema_metadata, check_metadata=True)
 
 
+def test_write_dataset_write_page_index(tempdir):
+    for write_statistics in [True, False]:
+        for write_page_index in [True, False]:
+            schema = pa.schema([
+                pa.field("x", pa.int64()),
+                pa.field("y", pa.int64())])
+
+            arrays = [[1, 2, 3], [None, 5, None]]
+            table = pa.Table.from_arrays(arrays, schema=schema)
+
+            file_format = ds.ParquetFileFormat()
+            base_dir = tempdir / f"write_page_index_{write_page_index}"
+            ds.write_dataset(
+                table,
+                base_dir,
+                format="parquet",
+                file_options=file_format.make_write_options(
+                    write_statistics=write_statistics,
+                    write_page_index=write_page_index,
+                ),
+                existing_data_behavior='overwrite_or_ignore',
+            )
+            ds1 = ds.dataset(base_dir, format="parquet")
+
+            for file in ds1.files:
+                # Can retrieve sorting columns from metadata
+                metadata = pq.read_metadata(file)
+                cc = metadata.row_group(0).column(0)
+                assert cc.has_offset_index is write_page_index
+                assert cc.has_column_index is write_page_index & write_statistics
+
+
 @pytest.mark.parametrize('dstype', [
     "fs", "mem"
 ])

Reply via email to