This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 673823b5a4 GH-45639: [C++][Statistics] Add support for
ARROW:average_byte_width:{exact,approximate} (#46385)
673823b5a4 is described below
commit 673823b5a4775a8eb0e3a67e4b3e549febf4b44a
Author: Arash Andishgar <[email protected]>
AuthorDate: Thu Jul 10 03:39:08 2025 +0330
GH-45639: [C++][Statistics] Add support for
ARROW:average_byte_width:{exact,approximate} (#46385)
### Rationale for this change
`ARROW:average_byte_width:exact` and `ARROW:average_byte_width:approximate`
statistics attributes are missing in `arrow::ArrayStatistics`.
### What changes are included in this PR?
Add `average_byte_width` and `is_average_byte_width_exact` member
variables to `arrow::ArrayStatistics`.
### Are these changes tested?
Yes, I ran the relevant unit tests.
### Are there any user-facing changes?
Yes
* GitHub Issue: #45639
Lead-authored-by: Arash Andishgar <[email protected]>
Co-authored-by: Sutou Kouhei <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
---
cpp/src/arrow/array/array_test.cc | 24 +++++++++++++
cpp/src/arrow/array/statistics.h | 6 ++++
cpp/src/arrow/array/statistics_test.cc | 23 ++++++++++++-
cpp/src/arrow/compare.cc | 3 ++
cpp/src/arrow/record_batch.cc | 15 ++++++++
cpp/src/arrow/record_batch_test.cc | 62 ++++++++++++++++++++++++++++++++++
6 files changed, 132 insertions(+), 1 deletion(-)
diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc
index e5a27d18d0..0dd75b01f6 100644
--- a/cpp/src/arrow/array/array_test.cc
+++ b/cpp/src/arrow/array/array_test.cc
@@ -3897,6 +3897,7 @@ class TestArrayDataStatistics : public ::testing::Test {
void SetUp() {
valids_ = {1, 0, 1, 1};
null_count_ = std::count(valids_.begin(), valids_.end(), 0);
+ average_byte_width_ = 4.0;
null_buffer_ = *internal::BytesToBits(valids_);
values_ = {1, 0, 3, -4};
min_ = *std::min_element(values_.begin(), values_.end());
@@ -3906,6 +3907,8 @@ class TestArrayDataStatistics : public ::testing::Test {
null_count_);
data_->statistics = std::make_shared<ArrayStatistics>();
data_->statistics->null_count = null_count_;
+ data_->statistics->average_byte_width = average_byte_width_;
+ data_->statistics->is_average_byte_width_exact = true;
data_->statistics->min = min_;
data_->statistics->is_min_exact = true;
data_->statistics->max = max_;
@@ -3915,6 +3918,7 @@ class TestArrayDataStatistics : public ::testing::Test {
protected:
std::vector<uint8_t> valids_;
size_t null_count_;
+ double average_byte_width_;
std::shared_ptr<Buffer> null_buffer_;
std::vector<int32_t> values_;
int64_t min_;
@@ -3930,6 +3934,11 @@ TEST_F(TestArrayDataStatistics, MoveConstructor) {
ASSERT_TRUE(moved_data.statistics->null_count.has_value());
ASSERT_EQ(null_count_, moved_data.statistics->null_count.value());
+ ASSERT_TRUE(moved_data.statistics->average_byte_width.has_value());
+ ASSERT_DOUBLE_EQ(average_byte_width_,
+ moved_data.statistics->average_byte_width.value());
+ ASSERT_TRUE(moved_data.statistics->is_average_byte_width_exact);
+
ASSERT_TRUE(moved_data.statistics->min.has_value());
ASSERT_TRUE(std::holds_alternative<int64_t>(moved_data.statistics->min.value()));
ASSERT_EQ(min_, std::get<int64_t>(moved_data.statistics->min.value()));
@@ -3947,6 +3956,11 @@ TEST_F(TestArrayDataStatistics, CopyConstructor) {
ASSERT_TRUE(copied_data.statistics->null_count.has_value());
ASSERT_EQ(null_count_, copied_data.statistics->null_count.value());
+ ASSERT_TRUE(copied_data.statistics->average_byte_width.has_value());
+ ASSERT_DOUBLE_EQ(average_byte_width_,
+ copied_data.statistics->average_byte_width.value());
+ ASSERT_TRUE(copied_data.statistics->is_average_byte_width_exact);
+
ASSERT_TRUE(copied_data.statistics->min.has_value());
ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data.statistics->min.value()));
ASSERT_EQ(min_, std::get<int64_t>(copied_data.statistics->min.value()));
@@ -3966,6 +3980,11 @@ TEST_F(TestArrayDataStatistics, MoveAssignment) {
ASSERT_TRUE(moved_data.statistics->null_count.has_value());
ASSERT_EQ(null_count_, moved_data.statistics->null_count.value());
+ ASSERT_TRUE(moved_data.statistics->average_byte_width.has_value());
+ ASSERT_DOUBLE_EQ(average_byte_width_,
+ moved_data.statistics->average_byte_width.value());
+ ASSERT_TRUE(moved_data.statistics->is_average_byte_width_exact);
+
ASSERT_TRUE(moved_data.statistics->min.has_value());
ASSERT_TRUE(std::holds_alternative<int64_t>(moved_data.statistics->min.value()));
ASSERT_EQ(min_, std::get<int64_t>(moved_data.statistics->min.value()));
@@ -3984,6 +4003,11 @@ TEST_F(TestArrayDataStatistics, CopyAssignment) {
ASSERT_TRUE(copied_data.statistics->null_count.has_value());
ASSERT_EQ(null_count_, copied_data.statistics->null_count.value());
+ ASSERT_TRUE(copied_data.statistics->average_byte_width.has_value());
+ ASSERT_DOUBLE_EQ(average_byte_width_,
+ copied_data.statistics->average_byte_width.value());
+ ASSERT_TRUE(copied_data.statistics->is_average_byte_width_exact);
+
ASSERT_TRUE(copied_data.statistics->min.has_value());
ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data.statistics->min.value()));
ASSERT_EQ(min_, std::get<int64_t>(copied_data.statistics->min.value()));
diff --git a/cpp/src/arrow/array/statistics.h b/cpp/src/arrow/array/statistics.h
index 6accd48af7..435c38e861 100644
--- a/cpp/src/arrow/array/statistics.h
+++ b/cpp/src/arrow/array/statistics.h
@@ -78,6 +78,12 @@ struct ARROW_EXPORT ArrayStatistics {
/// \brief The number of distinct values, may not be set
std::optional<int64_t> distinct_count = std::nullopt;
+ /// \brief The average size in bytes of a row in an array, may not be set.
+ std::optional<double> average_byte_width = std::nullopt;
+
+ /// \brief Whether the average size in bytes is exact or not.
+ bool is_average_byte_width_exact = false;
+
/// \brief The minimum value, may not be set
std::optional<ValueType> min = std::nullopt;
diff --git a/cpp/src/arrow/array/statistics_test.cc b/cpp/src/arrow/array/statistics_test.cc
index 250c4bb437..d7dbea7c0f 100644
--- a/cpp/src/arrow/array/statistics_test.cc
+++ b/cpp/src/arrow/array/statistics_test.cc
@@ -41,6 +41,17 @@ TEST(TestArrayStatistics, DistinctCount) {
ASSERT_EQ(29, statistics.distinct_count.value());
}
+TEST(TestArrayStatistics, AverageByteWidth) {
+ ArrayStatistics statistics;
+ ASSERT_FALSE(statistics.average_byte_width.has_value());
+ ASSERT_FALSE(statistics.is_average_byte_width_exact);
+ statistics.average_byte_width = 4.2;
+ ASSERT_TRUE(statistics.average_byte_width.has_value());
+ ASSERT_DOUBLE_EQ(4.2, statistics.average_byte_width.value());
+ statistics.is_average_byte_width_exact = true;
+ ASSERT_TRUE(statistics.is_average_byte_width_exact);
+}
+
TEST(TestArrayStatistics, Min) {
ArrayStatistics statistics;
ASSERT_FALSE(statistics.min.has_value());
@@ -65,7 +76,7 @@ TEST(TestArrayStatistics, Max) {
ASSERT_FALSE(statistics.is_max_exact);
}
-TEST(TestArrayStatistics, EqualityNonDoulbeValue) {
+TEST(TestArrayStatistics, Equals) {
ArrayStatistics statistics1;
ArrayStatistics statistics2;
@@ -81,6 +92,16 @@ TEST(TestArrayStatistics, EqualityNonDoulbeValue) {
statistics2.distinct_count = 2929;
ASSERT_EQ(statistics1, statistics2);
+ statistics1.average_byte_width = 2.9;
+ ASSERT_NE(statistics1, statistics2);
+ statistics2.average_byte_width = 2.9;
+ ASSERT_EQ(statistics1, statistics2);
+
+ statistics1.is_average_byte_width_exact = true;
+ ASSERT_NE(statistics1, statistics2);
+ statistics2.is_average_byte_width_exact = true;
+ ASSERT_EQ(statistics1, statistics2);
+
statistics1.min = std::string("world");
ASSERT_NE(statistics1, statistics2);
statistics2.min = std::string("world");
diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc
index 6ece1cb444..aa041a5bd5 100644
--- a/cpp/src/arrow/compare.cc
+++ b/cpp/src/arrow/compare.cc
@@ -1561,8 +1561,11 @@ bool ArrayStatisticsEqualsImpl(const ArrayStatistics&
left, const ArrayStatistic
const EqualOptions& equal_options) {
return left.null_count == right.null_count &&
left.distinct_count == right.distinct_count &&
+ left.is_average_byte_width_exact == right.is_average_byte_width_exact &&
left.is_min_exact == right.is_min_exact &&
left.is_max_exact == right.is_max_exact &&
+ ArrayStatisticsValueTypeEquals(left.average_byte_width, right.average_byte_width,
+ equal_options) &&
ArrayStatisticsValueTypeEquals(left.min, right.min, equal_options) &&
ArrayStatisticsValueTypeEquals(left.max, right.max, equal_options);
}
diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc
index 700e1bb2c9..04d6890d39 100644
--- a/cpp/src/arrow/record_batch.cc
+++ b/cpp/src/arrow/record_batch.cc
@@ -530,6 +530,19 @@ Status EnumerateStatistics(const RecordBatch&
record_batch, OnStatistics on_stat
statistics.start_new_column = false;
}
+ if (column_statistics->average_byte_width.has_value()) {
+ statistics.nth_statistics++;
+ if (column_statistics->is_average_byte_width_exact) {
+ statistics.key = ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_EXACT;
+ } else {
+ statistics.key = ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_APPROXIMATE;
+ }
+ statistics.type = float64();
+ statistics.value = column_statistics->average_byte_width.value();
+ RETURN_NOT_OK(on_statistics(statistics));
+ statistics.start_new_column = false;
+ }
+
if (column_statistics->min.has_value()) {
statistics.nth_statistics++;
if (column_statistics->is_min_exact) {
@@ -671,8 +684,10 @@ Result<std::shared_ptr<Array>>
RecordBatch::MakeStatisticsArray(
if (statistics.start_new_column) {
RETURN_NOT_OK(builder.Append());
if (statistics.nth_column.has_value()) {
+ // Add Columns
RETURN_NOT_OK(columns_builder->Append(statistics.nth_column.value()));
} else {
+ // Add RecordBatch
RETURN_NOT_OK(columns_builder->AppendNull());
}
RETURN_NOT_OK(values_builder->Append());
diff --git a/cpp/src/arrow/record_batch_test.cc b/cpp/src/arrow/record_batch_test.cc
index 0572883441..fab8137171 100644
--- a/cpp/src/arrow/record_batch_test.cc
+++ b/cpp/src/arrow/record_batch_test.cc
@@ -1345,6 +1345,68 @@ TEST_F(TestRecordBatch,
MakeStatisticsArrayDistinctCount) {
AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
}
+TEST_F(TestRecordBatch, MakeStatisticsArrayAverageByteWidthApproximate) {
+ auto schema =
+ ::arrow::schema({field("no-statistics", boolean()), field("utf8",
utf8())});
+ auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]");
+ auto string_array = ArrayFromJSON(utf8(), R"(["aa", "bb", "ccc"])");
+ string_array->data()->statistics = std::make_shared<ArrayStatistics>();
+ string_array->data()->statistics->average_byte_width = 2.3;
+ auto batch = RecordBatch::Make(schema, string_array->length(),
+ {no_statistics_array, string_array});
+
+ ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray());
+
+ ASSERT_OK_AND_ASSIGN(
+ auto expected_statistics_array,
+ MakeStatisticsArray("[null, 1]",
+ {{
+ ARROW_STATISTICS_KEY_ROW_COUNT_EXACT,
+ },
+ {
+
ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_APPROXIMATE,
+ }},
+ {{
+ ArrayStatistics::ValueType{int64_t{3}},
+ },
+ {
+ ArrayStatistics::ValueType{2.3},
+ }}));
+ AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
+}
+
+TEST_F(TestRecordBatch, MakeStatisticsArrayAverageByteWidthExact) {
+ auto schema =
+ ::arrow::schema({field("no-statistics", boolean()), field("float64",
float64())});
+ auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]");
+ auto float_array = ArrayFromJSON(float64(), R"([1.0, 2.0, 3.0])");
+ float_array->data()->statistics = std::make_shared<ArrayStatistics>();
+ float_array->data()->statistics->average_byte_width = 8.0;
+ float_array->data()->statistics->is_average_byte_width_exact = true;
+
+ auto batch = RecordBatch::Make(schema, float_array->length(),
+ {no_statistics_array, float_array});
+
+ ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray());
+
+ ASSERT_OK_AND_ASSIGN(
+ auto expected_statistics_array,
+ MakeStatisticsArray("[null, 1]",
+ {{
+ ARROW_STATISTICS_KEY_ROW_COUNT_EXACT,
+ },
+ {
+ ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_EXACT,
+ }},
+ {{
+ ArrayStatistics::ValueType{int64_t{3}},
+ },
+ {
+ ArrayStatistics::ValueType{8.0},
+ }}));
+ AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
+}
+
TEST_F(TestRecordBatch, MakeStatisticsArrayMinExact) {
auto schema =
::arrow::schema({field("no-statistics", boolean()), field("uint32",
uint32())});