This is an automated email from the ASF dual-hosted git repository.

maplefu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 66cb7495d1 GH-43382: [C++][Parquet] min-max Statistics doesn't work 
well when one of min-max is truncated (#43383)
66cb7495d1 is described below

commit 66cb7495d1a43f3539cf66f6d88bac40fb9d28d4
Author: mwish <[email protected]>
AuthorDate: Mon Aug 5 16:28:53 2024 +0800

    GH-43382: [C++][Parquet] min-max Statistics doesn't work well when one of 
min-max is truncated (#43383)
    
    
    
    ### Rationale for this change
    
    See https://github.com/apache/arrow/issues/43382
    
    ### What changes are included in this PR?
    
    Change stats has min-max from min || max to &&
    
    ### Are these changes tested?
    
    * [x] TODO
    
    ### Are there any user-facing changes?
    
    Might affect interface using HasMinMax
    
    **This PR includes breaking changes to public APIs.**
    
    * GitHub Issue: #43382
    
    Authored-by: mwish <[email protected]>
    Signed-off-by: mwish <[email protected]>
---
 cpp/src/parquet/arrow/arrow_statistics_test.cc | 23 +++++++++++++++++++++++
 cpp/src/parquet/metadata.cc                    |  4 ++--
 cpp/src/parquet/statistics.h                   |  2 +-
 cpp/src/parquet/statistics_test.cc             | 25 +++++++++++++++++++++++++
 4 files changed, 51 insertions(+), 3 deletions(-)

diff --git a/cpp/src/parquet/arrow/arrow_statistics_test.cc 
b/cpp/src/parquet/arrow/arrow_statistics_test.cc
index ad4496933e..a19303c3dc 100644
--- a/cpp/src/parquet/arrow/arrow_statistics_test.cc
+++ b/cpp/src/parquet/arrow/arrow_statistics_test.cc
@@ -156,4 +156,27 @@ INSTANTIATE_TEST_SUITE_P(
             /*expected_min=*/"z",
             /*expected_max=*/"z"}));
 
+TEST(StatisticsTest, TruncateOnlyHalfMinMax) {
+  // GH-43382: Tests when we only have min or max, the `HasMinMax` should be 
false.
+  std::shared_ptr<::arrow::ResizableBuffer> serialized_data = AllocateBuffer();
+  auto out_stream = 
std::make_shared<::arrow::io::BufferOutputStream>(serialized_data);
+  auto schema = ::arrow::schema({::arrow::field("a", ::arrow::utf8())});
+  ::parquet::WriterProperties::Builder properties_builder;
+  properties_builder.max_statistics_size(2);
+  ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<FileWriter> writer,
+      FileWriter::Open(*schema, default_memory_pool(), out_stream,
+                       properties_builder.build(), 
default_arrow_writer_properties()));
+  auto table = Table::Make(schema, {ArrayFromJSON(::arrow::utf8(), R"(["a", 
"abc"])")});
+  ASSERT_OK(writer->WriteTable(*table, std::numeric_limits<int64_t>::max()));
+  ASSERT_OK(writer->Close());
+  ASSERT_OK(out_stream->Close());
+
+  auto buffer_reader = 
std::make_shared<::arrow::io::BufferReader>(serialized_data);
+  auto parquet_reader = ParquetFileReader::Open(std::move(buffer_reader));
+  std::shared_ptr<FileMetaData> metadata = parquet_reader->metadata();
+  std::shared_ptr<Statistics> stats = 
metadata->RowGroup(0)->ColumnChunk(0)->statistics();
+  ASSERT_FALSE(stats->HasMinMax());
+}
+
 }  // namespace parquet::arrow
diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc
index 139793219d..fe16f5b76b 100644
--- a/cpp/src/parquet/metadata.cc
+++ b/cpp/src/parquet/metadata.cc
@@ -97,7 +97,7 @@ static std::shared_ptr<Statistics> MakeTypedColumnStats(
         descr, metadata.statistics.min_value, metadata.statistics.max_value,
         metadata.num_values - metadata.statistics.null_count,
         metadata.statistics.null_count, metadata.statistics.distinct_count,
-        metadata.statistics.__isset.max_value || 
metadata.statistics.__isset.min_value,
+        metadata.statistics.__isset.max_value && 
metadata.statistics.__isset.min_value,
         metadata.statistics.__isset.null_count,
         metadata.statistics.__isset.distinct_count);
   }
@@ -106,7 +106,7 @@ static std::shared_ptr<Statistics> MakeTypedColumnStats(
       descr, metadata.statistics.min, metadata.statistics.max,
       metadata.num_values - metadata.statistics.null_count,
       metadata.statistics.null_count, metadata.statistics.distinct_count,
-      metadata.statistics.__isset.max || metadata.statistics.__isset.min,
+      metadata.statistics.__isset.max && metadata.statistics.__isset.min,
       metadata.statistics.__isset.null_count, 
metadata.statistics.__isset.distinct_count);
 }
 
diff --git a/cpp/src/parquet/statistics.h b/cpp/src/parquet/statistics.h
index 0d6ea9898f..c5da44a7b6 100644
--- a/cpp/src/parquet/statistics.h
+++ b/cpp/src/parquet/statistics.h
@@ -246,7 +246,7 @@ class PARQUET_EXPORT Statistics {
   /// \brief The number of non-null values in the column
   virtual int64_t num_values() const = 0;
 
-  /// \brief Return true if the min and max statistics are set. Obtain
+  /// \brief Return true if both min and max statistics are set. Obtain
   /// with TypedStatistics<T>::min and max
   virtual bool HasMinMax() const = 0;
 
diff --git a/cpp/src/parquet/statistics_test.cc 
b/cpp/src/parquet/statistics_test.cc
index cb2e6455ab..dad414ac89 100644
--- a/cpp/src/parquet/statistics_test.cc
+++ b/cpp/src/parquet/statistics_test.cc
@@ -1602,5 +1602,30 @@ TEST(TestEncodedStatistics, CopySafe) {
   EXPECT_EQ("abc", encoded_statistics.max());
 }
 
+TEST(TestEncodedStatistics, ApplyStatSizeLimits) {
+  EncodedStatistics encoded_statistics;
+  encoded_statistics.set_min("a");
+  encoded_statistics.has_min = true;
+
+  encoded_statistics.set_max("abc");
+  encoded_statistics.has_max = true;
+
+  encoded_statistics.ApplyStatSizeLimits(2);
+
+  ASSERT_TRUE(encoded_statistics.has_min);
+  ASSERT_EQ("a", encoded_statistics.min());
+  ASSERT_FALSE(encoded_statistics.has_max);
+
+  NodePtr node =
+      PrimitiveNode::Make("StringColumn", Repetition::REQUIRED, 
Type::BYTE_ARRAY);
+  ColumnDescriptor descr(node, 0, 0);
+  std::shared_ptr<TypedStatistics<::parquet::ByteArrayType>> statistics =
+      std::dynamic_pointer_cast<TypedStatistics<::parquet::ByteArrayType>>(
+          Statistics::Make(&descr, &encoded_statistics,
+                           /*num_values=*/1000));
+  // GH-43382: HasMinMax should be false if one of min/max is not set.
+  EXPECT_FALSE(statistics->HasMinMax());
+}
+
 }  // namespace test
 }  // namespace parquet

Reply via email to