This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 32a112d05e GH-47102: [Statistics][C++] Implement Statistics 
specification attribute ARROW:max_byte_width:{exact,approximate} Component: C++ 
 (#47463)
32a112d05e is described below

commit 32a112d05eecb327c4880d38bd3d1477cc3dd2d6
Author: Arash Andishgar <42874930+andish...@users.noreply.github.com>
AuthorDate: Mon Sep 8 00:48:58 2025 +0330

    GH-47102: [Statistics][C++] Implement Statistics specification attribute 
ARROW:max_byte_width:{exact,approximate} Component: C++  (#47463)
    
    
    ### Rationale for this change
    Add the `ARROW:max_byte_width:{exact,approximate}` statistics attributes.
    ### What changes are included in this PR?
    Add `arrow::ArrayStatistics::max_byte_width` with relevant unit tests
    ### Are these changes tested?
    Yes, I ran the related unit tests
    ### Are there any user-facing changes?
    Yes, this adds `arrow::ArrayStatistics::max_byte_width`.
    
    * GitHub Issue: #47102
    
    Authored-by: Arash Andishgar <arashandishg...@gmail.com>
    Signed-off-by: Sutou Kouhei <k...@clear-code.com>
---
 cpp/src/arrow/array/array_test.cc      | 19 ++++++++++
 cpp/src/arrow/array/statistics.h       |  6 +++
 cpp/src/arrow/array/statistics_test.cc | 28 ++++++++++++++
 cpp/src/arrow/compare.cc               |  2 +
 cpp/src/arrow/record_batch.cc          | 16 ++++++++
 cpp/src/arrow/record_batch_test.cc     | 67 ++++++++++++++++++++++++++++++++++
 6 files changed, 138 insertions(+)

diff --git a/cpp/src/arrow/array/array_test.cc 
b/cpp/src/arrow/array/array_test.cc
index 4db76512d2..b40f14a554 100644
--- a/cpp/src/arrow/array/array_test.cc
+++ b/cpp/src/arrow/array/array_test.cc
@@ -3911,6 +3911,7 @@ class TestArrayDataStatistics : public ::testing::Test {
     valids_ = {1, 0, 1, 1};
     null_count_ = std::count(valids_.begin(), valids_.end(), 0);
     distinct_count_ = 3.0;
+    max_byte_width_ = 4.0;
     average_byte_width_ = 4.0;
     null_buffer_ = *internal::BytesToBits(valids_);
     values_ = {1, 0, 3, -4};
@@ -3922,6 +3923,7 @@ class TestArrayDataStatistics : public ::testing::Test {
     data_->statistics = std::make_shared<ArrayStatistics>();
     data_->statistics->null_count = null_count_;
     data_->statistics->distinct_count = distinct_count_;
+    data_->statistics->max_byte_width = max_byte_width_;
     data_->statistics->average_byte_width = average_byte_width_;
     data_->statistics->is_average_byte_width_exact = true;
     data_->statistics->min = min_;
@@ -3934,6 +3936,7 @@ class TestArrayDataStatistics : public ::testing::Test {
   std::vector<uint8_t> valids_;
   size_t null_count_;
   double distinct_count_;
+  double max_byte_width_;
   double average_byte_width_;
   std::shared_ptr<Buffer> null_buffer_;
   std::vector<int32_t> values_;
@@ -3954,6 +3957,10 @@ TEST_F(TestArrayDataStatistics, MoveConstructor) {
   ASSERT_DOUBLE_EQ(distinct_count_,
                    
std::get<double>(moved_data.statistics->distinct_count.value()));
 
+  ASSERT_TRUE(moved_data.statistics->max_byte_width.has_value());
+  ASSERT_DOUBLE_EQ(max_byte_width_,
+                   
std::get<double>(moved_data.statistics->max_byte_width.value()));
+
   ASSERT_TRUE(moved_data.statistics->average_byte_width.has_value());
   ASSERT_DOUBLE_EQ(average_byte_width_,
                    moved_data.statistics->average_byte_width.value());
@@ -3980,6 +3987,10 @@ TEST_F(TestArrayDataStatistics, CopyConstructor) {
   ASSERT_DOUBLE_EQ(distinct_count_,
                    
std::get<double>(copied_data.statistics->distinct_count.value()));
 
+  ASSERT_TRUE(copied_data.statistics->max_byte_width.has_value());
+  ASSERT_DOUBLE_EQ(max_byte_width_,
+                   
std::get<double>(copied_data.statistics->max_byte_width.value()));
+
   ASSERT_TRUE(copied_data.statistics->average_byte_width.has_value());
   ASSERT_DOUBLE_EQ(average_byte_width_,
                    copied_data.statistics->average_byte_width.value());
@@ -4008,6 +4019,10 @@ TEST_F(TestArrayDataStatistics, MoveAssignment) {
   ASSERT_DOUBLE_EQ(distinct_count_,
                    
std::get<double>(moved_data.statistics->distinct_count.value()));
 
+  ASSERT_TRUE(moved_data.statistics->max_byte_width.has_value());
+  ASSERT_DOUBLE_EQ(max_byte_width_,
+                   
std::get<double>(moved_data.statistics->max_byte_width.value()));
+
   ASSERT_TRUE(moved_data.statistics->average_byte_width.has_value());
   ASSERT_DOUBLE_EQ(average_byte_width_,
                    moved_data.statistics->average_byte_width.value());
@@ -4035,6 +4050,10 @@ TEST_F(TestArrayDataStatistics, CopyAssignment) {
   ASSERT_DOUBLE_EQ(distinct_count_,
                    
std::get<double>(copied_data.statistics->distinct_count.value()));
 
+  ASSERT_TRUE(copied_data.statistics->max_byte_width.has_value());
+  ASSERT_DOUBLE_EQ(max_byte_width_,
+                   
std::get<double>(copied_data.statistics->max_byte_width.value()));
+
   ASSERT_TRUE(copied_data.statistics->average_byte_width.has_value());
   ASSERT_DOUBLE_EQ(average_byte_width_,
                    copied_data.statistics->average_byte_width.value());
diff --git a/cpp/src/arrow/array/statistics.h b/cpp/src/arrow/array/statistics.h
index 7d2fd416bf..cbf0bc39e8 100644
--- a/cpp/src/arrow/array/statistics.h
+++ b/cpp/src/arrow/array/statistics.h
@@ -41,6 +41,7 @@ struct ARROW_EXPORT ArrayStatistics {
   using ValueType = std::variant<bool, int64_t, uint64_t, double, std::string>;
   using NumericType = std::variant<int64_t, double>;
   using CountType = NumericType;
+  using SizeType = NumericType;
 
   static const std::shared_ptr<DataType>& ValueToArrowType(
       const std::optional<ValueType>& value,
@@ -82,6 +83,11 @@ struct ARROW_EXPORT ArrayStatistics {
   /// and when set to `double`, it represents `approximate_distinct_count`.
   std::optional<CountType> distinct_count = std::nullopt;
 
+  /// \brief The maximum length in bytes of the rows in an array; may not be 
set
+  /// Note: when the type is `int64_t`, it represents `max_byte_width_exact`,
+  /// and when the type is `double`, it represents 
`max_byte_width_approximate`.
+  std::optional<SizeType> max_byte_width = std::nullopt;
+
   /// \brief The average size in bytes of a row in an array, may not be set.
   std::optional<double> average_byte_width = std::nullopt;
 
diff --git a/cpp/src/arrow/array/statistics_test.cc 
b/cpp/src/arrow/array/statistics_test.cc
index 2b09c20b73..607ee8aa09 100644
--- a/cpp/src/arrow/array/statistics_test.cc
+++ b/cpp/src/arrow/array/statistics_test.cc
@@ -49,6 +49,22 @@ TEST(TestArrayStatistics, DistinctCountApproximate) {
   ASSERT_DOUBLE_EQ(29.0, std::get<double>(statistics.distinct_count.value()));
 }
 
+TEST(TestArrayStatistics, MaxByteWidthExact) {
+  ArrayStatistics statistics;
+  ASSERT_FALSE(statistics.max_byte_width.has_value());
+  statistics.max_byte_width = static_cast<int64_t>(5);
+  ASSERT_TRUE(statistics.max_byte_width.has_value());
+  ASSERT_EQ(5, std::get<int64_t>(statistics.max_byte_width.value()));
+}
+
+TEST(TestArrayStatistics, MaxByteWidthApproximate) {
+  ArrayStatistics statistics;
+  ASSERT_FALSE(statistics.max_byte_width.has_value());
+  statistics.max_byte_width = 5.0;
+  ASSERT_TRUE(statistics.max_byte_width.has_value());
+  ASSERT_DOUBLE_EQ(5.0, std::get<double>(statistics.max_byte_width.value()));
+}
+
 TEST(TestArrayStatistics, AverageByteWidth) {
   ArrayStatistics statistics;
   ASSERT_FALSE(statistics.average_byte_width.has_value());
@@ -107,6 +123,18 @@ TEST(TestArrayStatistics, Equals) {
   statistics2.distinct_count = 2930.5;
   ASSERT_EQ(statistics1, statistics2);
 
+  // Test MAX_BYTE_WIDTH_EXACT
+  statistics1.max_byte_width = static_cast<int64_t>(5);
+  ASSERT_NE(statistics1, statistics2);
+  statistics2.max_byte_width = static_cast<int64_t>(5);
+  ASSERT_EQ(statistics1, statistics2);
+
+  // Test MAX_BYTE_WIDTH_APPROXIMATE
+  statistics1.max_byte_width = 5.0;
+  ASSERT_NE(statistics1, statistics2);
+  statistics2.max_byte_width = 5.0;
+  ASSERT_EQ(statistics1, statistics2);
+
   statistics1.average_byte_width = 2.9;
   ASSERT_NE(statistics1, statistics2);
   statistics2.average_byte_width = 2.9;
diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc
index d37325fa1a..a86d8ba673 100644
--- a/cpp/src/arrow/compare.cc
+++ b/cpp/src/arrow/compare.cc
@@ -1566,6 +1566,8 @@ bool ArrayStatisticsEqualsImpl(const ArrayStatistics& 
left, const ArrayStatistic
   return left.null_count == right.null_count &&
          ArrayStatisticsOptionalValueEquals(left.distinct_count, 
right.distinct_count,
                                             equal_options) &&
+         ArrayStatisticsOptionalValueEquals(left.max_byte_width, 
right.max_byte_width,
+                                            equal_options) &&
          left.is_average_byte_width_exact == right.is_average_byte_width_exact 
&&
          left.is_min_exact == right.is_min_exact &&
          left.is_max_exact == right.is_max_exact &&
diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc
index 2429113333..f39b40f02c 100644
--- a/cpp/src/arrow/record_batch.cc
+++ b/cpp/src/arrow/record_batch.cc
@@ -564,6 +564,22 @@ Status EnumerateStatistics(const RecordBatch& 
record_batch, OnStatistics on_stat
       statistics.start_new_column = false;
     }
 
+    if (column_statistics->max_byte_width.has_value()) {
+      statistics.nth_statistics++;
+      if 
(std::holds_alternative<int64_t>(column_statistics->max_byte_width.value())) {
+        statistics.key = ARROW_STATISTICS_KEY_MAX_BYTE_WIDTH_EXACT;
+        statistics.type = int64();
+        statistics.value = 
std::get<int64_t>(column_statistics->max_byte_width.value());
+      } else {
+        statistics.key = ARROW_STATISTICS_KEY_MAX_BYTE_WIDTH_APPROXIMATE;
+        statistics.type = float64();
+        statistics.value = 
std::get<double>(column_statistics->max_byte_width.value());
+      }
+
+      RETURN_NOT_OK(on_statistics(statistics));
+      statistics.start_new_column = false;
+    }
+
     if (column_statistics->average_byte_width.has_value()) {
       statistics.nth_statistics++;
       if (column_statistics->is_average_byte_width_exact) {
diff --git a/cpp/src/arrow/record_batch_test.cc 
b/cpp/src/arrow/record_batch_test.cc
index 4fec2ee686..5c5717ff3f 100644
--- a/cpp/src/arrow/record_batch_test.cc
+++ b/cpp/src/arrow/record_batch_test.cc
@@ -1538,6 +1538,73 @@ TEST_F(TestRecordBatch, 
MakeStatisticsArrayDistinctCountApproximate) {
   AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
 }
 
+TEST_F(TestRecordBatch, MakeStatisticsArrayMaxByteWidthExact) {
+  auto schema =
+      ::arrow::schema({field("no-statistics", boolean()), field("utf8", 
utf8())});
+  auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]");
+  auto string_array_data = ArrayFromJSON(utf8(), R"(["aa", null, 
"c"])")->data()->Copy();
+  string_array_data->statistics = std::make_shared<ArrayStatistics>();
+  string_array_data->statistics->null_count = 1;
+  string_array_data->statistics->max_byte_width = static_cast<int64_t>(2);
+  auto string_array = MakeArray(std::move(string_array_data));
+  auto batch = RecordBatch::Make(schema, string_array->length(),
+                                 {no_statistics_array, string_array});
+
+  ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray());
+
+  ASSERT_OK_AND_ASSIGN(auto expected_statistics_array,
+                       MakeStatisticsArray("[null, 1]",
+                                           {{
+                                                
ARROW_STATISTICS_KEY_ROW_COUNT_EXACT,
+                                            },
+                                            {
+                                                
ARROW_STATISTICS_KEY_NULL_COUNT_EXACT,
+                                                
ARROW_STATISTICS_KEY_MAX_BYTE_WIDTH_EXACT,
+                                            }},
+                                           {{
+                                                
ArrayStatistics::ValueType{int64_t{3}},
+                                            },
+                                            {
+                                                
ArrayStatistics::ValueType{int64_t{1}},
+                                                
ArrayStatistics::ValueType{int64_t{2}},
+                                            }}));
+  AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
+}
+
+TEST_F(TestRecordBatch, MakeStatisticsArrayMaxByteWidthApproximate) {
+  auto schema =
+      ::arrow::schema({field("no-statistics", boolean()), field("utf8", 
utf8())});
+  auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]");
+  auto string_array_data = ArrayFromJSON(utf8(), R"(["aa", null, 
"c"])")->data()->Copy();
+  string_array_data->statistics = std::make_shared<ArrayStatistics>();
+  string_array_data->statistics->null_count = 1;
+  string_array_data->statistics->max_byte_width = 2.0;
+  auto string_array = MakeArray(std::move(string_array_data));
+  auto batch = RecordBatch::Make(schema, string_array->length(),
+                                 {no_statistics_array, string_array});
+
+  ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray());
+
+  ASSERT_OK_AND_ASSIGN(
+      auto expected_statistics_array,
+      MakeStatisticsArray("[null, 1]",
+                          {{
+                               ARROW_STATISTICS_KEY_ROW_COUNT_EXACT,
+                           },
+                           {
+                               ARROW_STATISTICS_KEY_NULL_COUNT_EXACT,
+                               ARROW_STATISTICS_KEY_MAX_BYTE_WIDTH_APPROXIMATE,
+                           }},
+                          {{
+                               ArrayStatistics::ValueType{int64_t{3}},
+                           },
+                           {
+                               ArrayStatistics::ValueType{int64_t{1}},
+                               ArrayStatistics::ValueType{2.0},
+                           }}));
+  AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
+}
+
 TEST_F(TestRecordBatch, MakeStatisticsArrayAverageByteWidthApproximate) {
   auto schema =
       ::arrow::schema({field("no-statistics", boolean()), field("utf8", 
utf8())});

Reply via email to