This is an automated email from the ASF dual-hosted git repository.
maplefu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 66cb7495d1 GH-43382: [C++][Parquet] min-max Statistics doesn't work
well when one of min-max is truncated (#43383)
66cb7495d1 is described below
commit 66cb7495d1a43f3539cf66f6d88bac40fb9d28d4
Author: mwish <[email protected]>
AuthorDate: Mon Aug 5 16:28:53 2024 +0800
GH-43382: [C++][Parquet] min-max Statistics doesn't work well when one of
min-max is truncated (#43383)
### Rationale for this change
See https://github.com/apache/arrow/issues/43382
### What changes are included in this PR?
Change stats has min-max from min || max to &&
### Are these changes tested?
* [x] TODO
### Are there any user-facing changes?
Might affect interface using HasMinMax
**This PR includes breaking changes to public APIs.**
* GitHub Issue: #43382
Authored-by: mwish <[email protected]>
Signed-off-by: mwish <[email protected]>
---
cpp/src/parquet/arrow/arrow_statistics_test.cc | 23 +++++++++++++++++++++++
cpp/src/parquet/metadata.cc | 4 ++--
cpp/src/parquet/statistics.h | 2 +-
cpp/src/parquet/statistics_test.cc | 25 +++++++++++++++++++++++++
4 files changed, 51 insertions(+), 3 deletions(-)
diff --git a/cpp/src/parquet/arrow/arrow_statistics_test.cc
b/cpp/src/parquet/arrow/arrow_statistics_test.cc
index ad4496933e..a19303c3dc 100644
--- a/cpp/src/parquet/arrow/arrow_statistics_test.cc
+++ b/cpp/src/parquet/arrow/arrow_statistics_test.cc
@@ -156,4 +156,27 @@ INSTANTIATE_TEST_SUITE_P(
/*expected_min=*/"z",
/*expected_max=*/"z"}));
+TEST(StatisticsTest, TruncateOnlyHalfMinMax) {
+ // GH-43382: Tests when we only have min or max, the `HasMinMax` should be
false.
+ std::shared_ptr<::arrow::ResizableBuffer> serialized_data = AllocateBuffer();
+ auto out_stream =
std::make_shared<::arrow::io::BufferOutputStream>(serialized_data);
+ auto schema = ::arrow::schema({::arrow::field("a", ::arrow::utf8())});
+ ::parquet::WriterProperties::Builder properties_builder;
+ properties_builder.max_statistics_size(2);
+ ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileWriter> writer,
+ FileWriter::Open(*schema, default_memory_pool(), out_stream,
+ properties_builder.build(),
default_arrow_writer_properties()));
+ auto table = Table::Make(schema, {ArrayFromJSON(::arrow::utf8(), R"(["a",
"abc"])")});
+ ASSERT_OK(writer->WriteTable(*table, std::numeric_limits<int64_t>::max()));
+ ASSERT_OK(writer->Close());
+ ASSERT_OK(out_stream->Close());
+
+ auto buffer_reader =
std::make_shared<::arrow::io::BufferReader>(serialized_data);
+ auto parquet_reader = ParquetFileReader::Open(std::move(buffer_reader));
+ std::shared_ptr<FileMetaData> metadata = parquet_reader->metadata();
+ std::shared_ptr<Statistics> stats =
metadata->RowGroup(0)->ColumnChunk(0)->statistics();
+ ASSERT_FALSE(stats->HasMinMax());
+}
+
} // namespace parquet::arrow
diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc
index 139793219d..fe16f5b76b 100644
--- a/cpp/src/parquet/metadata.cc
+++ b/cpp/src/parquet/metadata.cc
@@ -97,7 +97,7 @@ static std::shared_ptr<Statistics> MakeTypedColumnStats(
descr, metadata.statistics.min_value, metadata.statistics.max_value,
metadata.num_values - metadata.statistics.null_count,
metadata.statistics.null_count, metadata.statistics.distinct_count,
- metadata.statistics.__isset.max_value ||
metadata.statistics.__isset.min_value,
+ metadata.statistics.__isset.max_value &&
metadata.statistics.__isset.min_value,
metadata.statistics.__isset.null_count,
metadata.statistics.__isset.distinct_count);
}
@@ -106,7 +106,7 @@ static std::shared_ptr<Statistics> MakeTypedColumnStats(
descr, metadata.statistics.min, metadata.statistics.max,
metadata.num_values - metadata.statistics.null_count,
metadata.statistics.null_count, metadata.statistics.distinct_count,
- metadata.statistics.__isset.max || metadata.statistics.__isset.min,
+ metadata.statistics.__isset.max && metadata.statistics.__isset.min,
metadata.statistics.__isset.null_count,
metadata.statistics.__isset.distinct_count);
}
diff --git a/cpp/src/parquet/statistics.h b/cpp/src/parquet/statistics.h
index 0d6ea9898f..c5da44a7b6 100644
--- a/cpp/src/parquet/statistics.h
+++ b/cpp/src/parquet/statistics.h
@@ -246,7 +246,7 @@ class PARQUET_EXPORT Statistics {
/// \brief The number of non-null values in the column
virtual int64_t num_values() const = 0;
- /// \brief Return true if the min and max statistics are set. Obtain
+ /// \brief Return true if both min and max statistics are set. Obtain
/// with TypedStatistics<T>::min and max
virtual bool HasMinMax() const = 0;
diff --git a/cpp/src/parquet/statistics_test.cc
b/cpp/src/parquet/statistics_test.cc
index cb2e6455ab..dad414ac89 100644
--- a/cpp/src/parquet/statistics_test.cc
+++ b/cpp/src/parquet/statistics_test.cc
@@ -1602,5 +1602,30 @@ TEST(TestEncodedStatistics, CopySafe) {
EXPECT_EQ("abc", encoded_statistics.max());
}
+TEST(TestEncodedStatistics, ApplyStatSizeLimits) {
+ EncodedStatistics encoded_statistics;
+ encoded_statistics.set_min("a");
+ encoded_statistics.has_min = true;
+
+ encoded_statistics.set_max("abc");
+ encoded_statistics.has_max = true;
+
+ encoded_statistics.ApplyStatSizeLimits(2);
+
+ ASSERT_TRUE(encoded_statistics.has_min);
+ ASSERT_EQ("a", encoded_statistics.min());
+ ASSERT_FALSE(encoded_statistics.has_max);
+
+ NodePtr node =
+ PrimitiveNode::Make("StringColumn", Repetition::REQUIRED,
Type::BYTE_ARRAY);
+ ColumnDescriptor descr(node, 0, 0);
+ std::shared_ptr<TypedStatistics<::parquet::ByteArrayType>> statistics =
+ std::dynamic_pointer_cast<TypedStatistics<::parquet::ByteArrayType>>(
+ Statistics::Make(&descr, &encoded_statistics,
+ /*num_values=*/1000));
+ // GH-43382: HasMinMax should be false if one of min/max is not set.
+ EXPECT_FALSE(statistics->HasMinMax());
+}
+
} // namespace test
} // namespace parquet