This is an automated email from the ASF dual-hosted git repository.
jorisvandenbossche pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 7984cf7 ARROW-10130: [C++][Dataset] Ensure
ParquetFileFragment::SplitByRowGroup preserves the 'has_complete_metadata'
status
7984cf7 is described below
commit 7984cf75663b357500d7476c64a5001202ca0276
Author: Joris Van den Bossche <[email protected]>
AuthorDate: Wed Sep 30 10:29:40 2020 +0200
ARROW-10130: [C++][Dataset] Ensure ParquetFileFragment::SplitByRowGroup
preserves the 'has_complete_metadata' status
By also passing the physical_schema, the ParquetFileFragment constructor
will set `has_complete_metadata_` if the statistics are present in the row
group infos.
Closes #8298 from jorisvandenbossche/ARROW-10130-split_preserve_metadata
Authored-by: Joris Van den Bossche <[email protected]>
Signed-off-by: Joris Van den Bossche <[email protected]>
---
cpp/src/arrow/dataset/file_parquet.cc | 6 +++---
python/pyarrow/tests/test_dataset.py | 5 +++++
2 files changed, 8 insertions(+), 3 deletions(-)
diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc
index a5e6b6f..33f7463 100644
--- a/cpp/src/arrow/dataset/file_parquet.cc
+++ b/cpp/src/arrow/dataset/file_parquet.cc
@@ -567,9 +567,9 @@ Result<FragmentVector> ParquetFileFragment::SplitByRowGroup(
FragmentVector fragments(row_groups.size());
auto fragment = fragments.begin();
for (auto&& row_group : row_groups) {
- ARROW_ASSIGN_OR_RAISE(*fragment++,
- parquet_format_.MakeFragment(source_, partition_expression(),
- {std::move(row_group)}));
+ ARROW_ASSIGN_OR_RAISE(*fragment++, parquet_format_.MakeFragment(
+ source_, partition_expression(),
+ {std::move(row_group)}, physical_schema_));
}
return fragments;
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index 293b5bd..cab6f70 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -2055,6 +2055,11 @@ def test_parquet_dataset_lazy_filtering(tempdir, open_logging_fs):
with assert_opens([]):
fragments[0].split_by_row_group(ds.field("f1") > 15)
+ # ensuring metadata of splitted fragment should also not open any file
+ with assert_opens([]):
+ rg_fragments = fragments[0].split_by_row_group()
+ rg_fragments[0].ensure_complete_metadata()
+
# FIXME(bkietz) on Windows this results in FileNotFoundErrors.
# but actually scanning does open files
# with assert_opens([f.path for f in fragments]):