This is an automated email from the ASF dual-hosted git repository.

jorisvandenbossche pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 7984cf7  ARROW-10130: [C++][Dataset] Ensure ParquetFileFragment::SplitByRowGroup preserves the 'has_complete_metadata' status
7984cf7 is described below

commit 7984cf75663b357500d7476c64a5001202ca0276
Author: Joris Van den Bossche <[email protected]>
AuthorDate: Wed Sep 30 10:29:40 2020 +0200

    ARROW-10130: [C++][Dataset] Ensure ParquetFileFragment::SplitByRowGroup preserves the 'has_complete_metadata' status
    
    By also passing the physical_schema, the ParquetFileFragment constructor
    will set `has_complete_metadata_` if the statistics are present in the row
    group infos.
    
    Closes #8298 from jorisvandenbossche/ARROW-10130-split_preserve_metadata
    
    Authored-by: Joris Van den Bossche <[email protected]>
    Signed-off-by: Joris Van den Bossche <[email protected]>
---
 cpp/src/arrow/dataset/file_parquet.cc | 6 +++---
 python/pyarrow/tests/test_dataset.py  | 5 +++++
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc
index a5e6b6f..33f7463 100644
--- a/cpp/src/arrow/dataset/file_parquet.cc
+++ b/cpp/src/arrow/dataset/file_parquet.cc
@@ -567,9 +567,9 @@ Result<FragmentVector> ParquetFileFragment::SplitByRowGroup(
   FragmentVector fragments(row_groups.size());
   auto fragment = fragments.begin();
   for (auto&& row_group : row_groups) {
-    ARROW_ASSIGN_OR_RAISE(*fragment++,
-                          parquet_format_.MakeFragment(source_, partition_expression(),
-                                                       {std::move(row_group)}));
+    ARROW_ASSIGN_OR_RAISE(*fragment++, parquet_format_.MakeFragment(
+                                           source_, partition_expression(),
+                                           {std::move(row_group)}, physical_schema_));
   }
 
   return fragments;
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index 293b5bd..cab6f70 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -2055,6 +2055,11 @@ def test_parquet_dataset_lazy_filtering(tempdir, open_logging_fs):
     with assert_opens([]):
         fragments[0].split_by_row_group(ds.field("f1") > 15)
 
+    # ensuring metadata of splitted fragment should also not open any file
+    with assert_opens([]):
+        rg_fragments = fragments[0].split_by_row_group()
+        rg_fragments[0].ensure_complete_metadata()
+
     # FIXME(bkietz) on Windows this results in FileNotFoundErrors.
     # but actually scanning does open files
     # with assert_opens([f.path for f in fragments]):

Reply via email to