This is an automated email from the ASF dual-hosted git repository. kou pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push: new 32a112d05e GH-47102: [Statistics][C++] Implement Statistics specification attribute ARROW:max_byte_width:{exact,approximate} Component: C++ (#47463) 32a112d05e is described below commit 32a112d05eecb327c4880d38bd3d1477cc3dd2d6 Author: Arash Andishgar <42874930+andish...@users.noreply.github.com> AuthorDate: Mon Sep 8 00:48:58 2025 +0330 GH-47102: [Statistics][C++] Implement Statistics specification attribute ARROW:max_byte_width:{exact,approximate} Component: C++ (#47463) ### Rationale for this change Add `max_byte_width:{exact,approximate}` statistics attributes ### What changes are included in this PR? Add `arrow::ArrayStatistics::max_byte_width` with relevant unit tests ### Are these changes tested? Yes, I ran the related unit tests ### Are there any user-facing changes? Yes, Add `arrow::ArrayStatistics::max_byte_width` * GitHub Issue: #47102 Authored-by: Arash Andishgar <arashandishg...@gmail.com> Signed-off-by: Sutou Kouhei <k...@clear-code.com> --- cpp/src/arrow/array/array_test.cc | 19 ++++++++++ cpp/src/arrow/array/statistics.h | 6 +++ cpp/src/arrow/array/statistics_test.cc | 28 ++++++++++++++ cpp/src/arrow/compare.cc | 2 + cpp/src/arrow/record_batch.cc | 16 ++++++++ cpp/src/arrow/record_batch_test.cc | 67 ++++++++++++++++++++++++++++++++++ 6 files changed, 138 insertions(+) diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 4db76512d2..b40f14a554 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -3911,6 +3911,7 @@ class TestArrayDataStatistics : public ::testing::Test { valids_ = {1, 0, 1, 1}; null_count_ = std::count(valids_.begin(), valids_.end(), 0); distinct_count_ = 3.0; + max_byte_width_ = 4.0; average_byte_width_ = 4.0; null_buffer_ = *internal::BytesToBits(valids_); values_ = {1, 0, 3, -4}; @@ -3922,6 +3923,7 @@ class TestArrayDataStatistics : public ::testing::Test { data_->statistics = 
std::make_shared<ArrayStatistics>(); data_->statistics->null_count = null_count_; data_->statistics->distinct_count = distinct_count_; + data_->statistics->max_byte_width = max_byte_width_; data_->statistics->average_byte_width = average_byte_width_; data_->statistics->is_average_byte_width_exact = true; data_->statistics->min = min_; @@ -3934,6 +3936,7 @@ class TestArrayDataStatistics : public ::testing::Test { std::vector<uint8_t> valids_; size_t null_count_; double distinct_count_; + double max_byte_width_; double average_byte_width_; std::shared_ptr<Buffer> null_buffer_; std::vector<int32_t> values_; @@ -3954,6 +3957,10 @@ TEST_F(TestArrayDataStatistics, MoveConstructor) { ASSERT_DOUBLE_EQ(distinct_count_, std::get<double>(moved_data.statistics->distinct_count.value())); + ASSERT_TRUE(moved_data.statistics->max_byte_width.has_value()); + ASSERT_DOUBLE_EQ(max_byte_width_, + std::get<double>(moved_data.statistics->max_byte_width.value())); + ASSERT_TRUE(moved_data.statistics->average_byte_width.has_value()); ASSERT_DOUBLE_EQ(average_byte_width_, moved_data.statistics->average_byte_width.value()); @@ -3980,6 +3987,10 @@ TEST_F(TestArrayDataStatistics, CopyConstructor) { ASSERT_DOUBLE_EQ(distinct_count_, std::get<double>(copied_data.statistics->distinct_count.value())); + ASSERT_TRUE(copied_data.statistics->max_byte_width.has_value()); + ASSERT_DOUBLE_EQ(max_byte_width_, + std::get<double>(copied_data.statistics->max_byte_width.value())); + ASSERT_TRUE(copied_data.statistics->average_byte_width.has_value()); ASSERT_DOUBLE_EQ(average_byte_width_, copied_data.statistics->average_byte_width.value()); @@ -4008,6 +4019,10 @@ TEST_F(TestArrayDataStatistics, MoveAssignment) { ASSERT_DOUBLE_EQ(distinct_count_, std::get<double>(moved_data.statistics->distinct_count.value())); + ASSERT_TRUE(moved_data.statistics->max_byte_width.has_value()); + ASSERT_DOUBLE_EQ(max_byte_width_, + std::get<double>(moved_data.statistics->max_byte_width.value())); + 
ASSERT_TRUE(moved_data.statistics->average_byte_width.has_value()); ASSERT_DOUBLE_EQ(average_byte_width_, moved_data.statistics->average_byte_width.value()); @@ -4035,6 +4050,10 @@ TEST_F(TestArrayDataStatistics, CopyAssignment) { ASSERT_DOUBLE_EQ(distinct_count_, std::get<double>(copied_data.statistics->distinct_count.value())); + ASSERT_TRUE(copied_data.statistics->max_byte_width.has_value()); + ASSERT_DOUBLE_EQ(max_byte_width_, + std::get<double>(copied_data.statistics->max_byte_width.value())); + ASSERT_TRUE(copied_data.statistics->average_byte_width.has_value()); ASSERT_DOUBLE_EQ(average_byte_width_, copied_data.statistics->average_byte_width.value()); diff --git a/cpp/src/arrow/array/statistics.h b/cpp/src/arrow/array/statistics.h index 7d2fd416bf..cbf0bc39e8 100644 --- a/cpp/src/arrow/array/statistics.h +++ b/cpp/src/arrow/array/statistics.h @@ -41,6 +41,7 @@ struct ARROW_EXPORT ArrayStatistics { using ValueType = std::variant<bool, int64_t, uint64_t, double, std::string>; using NumericType = std::variant<int64_t, double>; using CountType = NumericType; + using SizeType = NumericType; static const std::shared_ptr<DataType>& ValueToArrowType( const std::optional<ValueType>& value, @@ -82,6 +83,11 @@ struct ARROW_EXPORT ArrayStatistics { /// and when set to `double`, it represents `approximate_distinct_count`. std::optional<CountType> distinct_count = std::nullopt; + /// \brief The maximum length in bytes of the rows in an array; may not be set + /// Note: when the type is `int64_t`, it represents `max_byte_width_exact`, + /// and when the type is `double`, it represents `max_byte_width_approximate`. + std::optional<SizeType> max_byte_width = std::nullopt; + /// \brief The average size in bytes of a row in an array, may not be set. 
std::optional<double> average_byte_width = std::nullopt; diff --git a/cpp/src/arrow/array/statistics_test.cc b/cpp/src/arrow/array/statistics_test.cc index 2b09c20b73..607ee8aa09 100644 --- a/cpp/src/arrow/array/statistics_test.cc +++ b/cpp/src/arrow/array/statistics_test.cc @@ -49,6 +49,22 @@ TEST(TestArrayStatistics, DistinctCountApproximate) { ASSERT_DOUBLE_EQ(29.0, std::get<double>(statistics.distinct_count.value())); } +TEST(TestArrayStatistics, MaxByteWidthExact) { + ArrayStatistics statistics; + ASSERT_FALSE(statistics.max_byte_width.has_value()); + statistics.max_byte_width = static_cast<int64_t>(5); + ASSERT_TRUE(statistics.max_byte_width.has_value()); + ASSERT_EQ(5, std::get<int64_t>(statistics.max_byte_width.value())); +} + +TEST(TestArrayStatistics, MaxByteWidthApproximate) { + ArrayStatistics statistics; + ASSERT_FALSE(statistics.max_byte_width.has_value()); + statistics.max_byte_width = 5.0; + ASSERT_TRUE(statistics.max_byte_width.has_value()); + ASSERT_DOUBLE_EQ(5.0, std::get<double>(statistics.max_byte_width.value())); +} + TEST(TestArrayStatistics, AverageByteWidth) { ArrayStatistics statistics; ASSERT_FALSE(statistics.average_byte_width.has_value()); @@ -107,6 +123,18 @@ TEST(TestArrayStatistics, Equals) { statistics2.distinct_count = 2930.5; ASSERT_EQ(statistics1, statistics2); + // Test MAX_BYTE_WIDTH_EXACT + statistics1.max_byte_width = static_cast<int64_t>(5); + ASSERT_NE(statistics1, statistics2); + statistics2.max_byte_width = static_cast<int64_t>(5); + ASSERT_EQ(statistics1, statistics2); + + // Test MAX_BYTE_WIDTH_APPROXIMATE + statistics1.max_byte_width = 5.0; + ASSERT_NE(statistics1, statistics2); + statistics2.max_byte_width = 5.0; + ASSERT_EQ(statistics1, statistics2); + statistics1.average_byte_width = 2.9; ASSERT_NE(statistics1, statistics2); statistics2.average_byte_width = 2.9; diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index d37325fa1a..a86d8ba673 100644 --- a/cpp/src/arrow/compare.cc +++ 
b/cpp/src/arrow/compare.cc @@ -1566,6 +1566,8 @@ bool ArrayStatisticsEqualsImpl(const ArrayStatistics& left, const ArrayStatistic return left.null_count == right.null_count && ArrayStatisticsOptionalValueEquals(left.distinct_count, right.distinct_count, equal_options) && + ArrayStatisticsOptionalValueEquals(left.max_byte_width, right.max_byte_width, + equal_options) && left.is_average_byte_width_exact == right.is_average_byte_width_exact && left.is_min_exact == right.is_min_exact && left.is_max_exact == right.is_max_exact && diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc index 2429113333..f39b40f02c 100644 --- a/cpp/src/arrow/record_batch.cc +++ b/cpp/src/arrow/record_batch.cc @@ -564,6 +564,22 @@ Status EnumerateStatistics(const RecordBatch& record_batch, OnStatistics on_stat statistics.start_new_column = false; } + if (column_statistics->max_byte_width.has_value()) { + statistics.nth_statistics++; + if (std::holds_alternative<int64_t>(column_statistics->max_byte_width.value())) { + statistics.key = ARROW_STATISTICS_KEY_MAX_BYTE_WIDTH_EXACT; + statistics.type = int64(); + statistics.value = std::get<int64_t>(column_statistics->max_byte_width.value()); + } else { + statistics.key = ARROW_STATISTICS_KEY_MAX_BYTE_WIDTH_APPROXIMATE; + statistics.type = float64(); + statistics.value = std::get<double>(column_statistics->max_byte_width.value()); + } + + RETURN_NOT_OK(on_statistics(statistics)); + statistics.start_new_column = false; + } + if (column_statistics->average_byte_width.has_value()) { statistics.nth_statistics++; if (column_statistics->is_average_byte_width_exact) { diff --git a/cpp/src/arrow/record_batch_test.cc b/cpp/src/arrow/record_batch_test.cc index 4fec2ee686..5c5717ff3f 100644 --- a/cpp/src/arrow/record_batch_test.cc +++ b/cpp/src/arrow/record_batch_test.cc @@ -1538,6 +1538,73 @@ TEST_F(TestRecordBatch, MakeStatisticsArrayDistinctCountApproximate) { AssertArraysEqual(*expected_statistics_array, *statistics_array, true); } 
+TEST_F(TestRecordBatch, MakeStatisticsArrayMaxByteWidthExact) { + auto schema = + ::arrow::schema({field("no-statistics", boolean()), field("utf8", utf8())}); + auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]"); + auto string_array_data = ArrayFromJSON(utf8(), R"(["aa", null, "c"])")->data()->Copy(); + string_array_data->statistics = std::make_shared<ArrayStatistics>(); + string_array_data->statistics->null_count = 1; + string_array_data->statistics->max_byte_width = static_cast<int64_t>(2); + auto string_array = MakeArray(std::move(string_array_data)); + auto batch = RecordBatch::Make(schema, string_array->length(), + {no_statistics_array, string_array}); + + ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray()); + + ASSERT_OK_AND_ASSIGN(auto expected_statistics_array, + MakeStatisticsArray("[null, 1]", + {{ + ARROW_STATISTICS_KEY_ROW_COUNT_EXACT, + }, + { + ARROW_STATISTICS_KEY_NULL_COUNT_EXACT, + ARROW_STATISTICS_KEY_MAX_BYTE_WIDTH_EXACT, + }}, + {{ + ArrayStatistics::ValueType{int64_t{3}}, + }, + { + ArrayStatistics::ValueType{int64_t{1}}, + ArrayStatistics::ValueType{int64_t{2}}, + }})); + AssertArraysEqual(*expected_statistics_array, *statistics_array, true); +} + +TEST_F(TestRecordBatch, MakeStatisticsArrayMaxByteWidthApproximate) { + auto schema = + ::arrow::schema({field("no-statistics", boolean()), field("utf8", utf8())}); + auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]"); + auto string_array_data = ArrayFromJSON(utf8(), R"(["aa", null, "c"])")->data()->Copy(); + string_array_data->statistics = std::make_shared<ArrayStatistics>(); + string_array_data->statistics->null_count = 1; + string_array_data->statistics->max_byte_width = 2.0; + auto string_array = MakeArray(std::move(string_array_data)); + auto batch = RecordBatch::Make(schema, string_array->length(), + {no_statistics_array, string_array}); + + ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray()); + + 
ASSERT_OK_AND_ASSIGN( + auto expected_statistics_array, + MakeStatisticsArray("[null, 1]", + {{ + ARROW_STATISTICS_KEY_ROW_COUNT_EXACT, + }, + { + ARROW_STATISTICS_KEY_NULL_COUNT_EXACT, + ARROW_STATISTICS_KEY_MAX_BYTE_WIDTH_APPROXIMATE, + }}, + {{ + ArrayStatistics::ValueType{int64_t{3}}, + }, + { + ArrayStatistics::ValueType{int64_t{1}}, + ArrayStatistics::ValueType{2.0}, + }})); + AssertArraysEqual(*expected_statistics_array, *statistics_array, true); +} + TEST_F(TestRecordBatch, MakeStatisticsArrayAverageByteWidthApproximate) { auto schema = ::arrow::schema({field("no-statistics", boolean()), field("utf8", utf8())});