kou commented on code in PR #45663:
URL: https://github.com/apache/arrow/pull/45663#discussion_r1992767086
##########
cpp/src/arrow/array/statistics_test.cc:
##########
@@ -91,6 +93,28 @@ TEST(ArrayStatisticsTest, TestEquality) {
ASSERT_NE(statistics1, statistics2);
statistics2.max = static_cast<int64_t>(-29);
ASSERT_EQ(statistics1, statistics2);
+ statistics2.max = static_cast<int64_t>(2);
+ ASSERT_NE(statistics1, statistics2);
+
+ statistics1.max = std::nullopt;
+ statistics2.max = std::nullopt;
+ // check the state of both of them are std::nullopt
+ ASSERT_EQ(statistics1.max, statistics2.max);
+ // the state of one of them is std::nullopt
Review Comment:
```suggestion
// the state of one of them is std::nullopt
```
##########
cpp/src/arrow/array/statistics_test.cc:
##########
@@ -91,6 +93,28 @@ TEST(ArrayStatisticsTest, TestEquality) {
ASSERT_NE(statistics1, statistics2);
statistics2.max = static_cast<int64_t>(-29);
ASSERT_EQ(statistics1, statistics2);
+ statistics2.max = static_cast<int64_t>(2);
+ ASSERT_NE(statistics1, statistics2);
+
+ statistics1.max = std::nullopt;
+ statistics2.max = std::nullopt;
+ // check the state of both of them are std::nullopt
+ ASSERT_EQ(statistics1.max, statistics2.max);
+ // the state of one of them is std::nullopt
+ statistics1.max = std::shared_ptr<Scalar>();
+ ASSERT_NE(statistics1, statistics2);
+ // the state of both of them are nullptr
Review Comment:
```suggestion
// the state of both of them are empty shared_ptr
```
##########
cpp/src/arrow/record_batch_test.cc:
##########
@@ -1454,6 +1462,94 @@ TEST_F(TestRecordBatch, MakeStatisticsArrayString) {
AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
}
+TEST_F(TestRecordBatch, MakeStatisticsArrayNestedType) {
+ auto struct_type = struct_({field("a", int64()), field("b", int64())});
+ auto struct_array = ArrayFromJSON(
+ struct_type,
+
R"([{"a":1,"b":6},{"a":2,"b":7},{"a":3,"b":8},{"a":4,"b":9},{"a":5,"b":10}])");
+ ASSERT_OK_AND_ASSIGN(auto struct_nested_stat,
Review Comment:
Can we use better name? This is not a statistics array, right?
##########
cpp/src/arrow/record_batch_test.cc:
##########
@@ -1454,6 +1462,94 @@ TEST_F(TestRecordBatch, MakeStatisticsArrayString) {
AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
}
+TEST_F(TestRecordBatch, MakeStatisticsArrayNestedType) {
+ auto struct_type = struct_({field("a", int64()), field("b", int64())});
+ auto struct_array = ArrayFromJSON(
+ struct_type,
+
R"([{"a":1,"b":6},{"a":2,"b":7},{"a":3,"b":8},{"a":4,"b":9},{"a":5,"b":10}])");
+ ASSERT_OK_AND_ASSIGN(auto struct_nested_stat,
+ struct_array->CopyTo(default_cpu_memory_manager()));
+ auto statistics_struct = std::make_shared<ArrayStatistics>();
+ ASSERT_OK_AND_ASSIGN(statistics_struct->max, struct_array->GetScalar(4));
+ statistics_struct->null_count = 0;
+ auto struct_array_data = struct_array->data();
+ auto statistics_struct_child_a = std::make_shared<ArrayStatistics>();
+ statistics_struct_child_a->min = int64_t{1};
+ struct_array_data->statistics = statistics_struct;
+ struct_array_data->child_data[0]->statistics = statistics_struct_child_a;
+ auto array_c = ArrayFromJSON(int64(), R"([11,12,13,14,15])");
+ array_c->data()->statistics = std::make_shared<ArrayStatistics>();
+ array_c->data()->statistics->max = int64_t{15};
+ auto array_d = ArrayFromJSON(int64(), R"([16,17,18,19,20])");
+ auto nested_child = struct_nested_stat->data()->child_data[0];
+ nested_child->statistics = std::make_shared<ArrayStatistics>();
+ nested_child->statistics->max = int64_t{5};
+ nested_child->statistics->is_max_exact = true;
+
+ auto rb_schema =
+ schema({field("struct_a_b", struct_type), field("c", int64()),
field("d", int64()),
+ field("struct_copy", struct_nested_stat->type())});
+ auto rb = RecordBatch::Make(rb_schema, 5,
+ {struct_array, array_c, array_d,
struct_nested_stat});
+
+ auto expected_scalar = internal::checked_pointer_cast<StructScalar>(
+ ScalarFromJSON(struct_type, R"([5,10])"));
+ auto a =
ArrayStatistics::ValueType{std::static_pointer_cast<Scalar>(expected_scalar)};
Review Comment:
Is this used?
##########
cpp/src/arrow/record_batch_test.cc:
##########
@@ -1454,6 +1462,94 @@ TEST_F(TestRecordBatch, MakeStatisticsArrayString) {
AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
}
+TEST_F(TestRecordBatch, MakeStatisticsArrayNestedType) {
+ auto struct_type = struct_({field("a", int64()), field("b", int64())});
Review Comment:
Could you add a comment that explains what record batch is building?
For example:
```cpp
// Schema:
// struct_a_b: struct {a: int64, b: int64}
// c: int64
// d: int64
// struct_copy: struct {a: int64, b: int64}
//
// Data:
// [
// {struct_a_b: {a: 1, b: 6}, c: 11, d: 16, struct_copy: {a: 1, b: 6}},
// ...,
// ]
//
// Statistics:
// struct_a_b:
// max: {a: 5, b: 10}
// ...
// ...
```
##########
cpp/src/arrow/array/statistics_test.cc:
##########
@@ -91,6 +93,28 @@ TEST(ArrayStatisticsTest, TestEquality) {
ASSERT_NE(statistics1, statistics2);
statistics2.max = static_cast<int64_t>(-29);
ASSERT_EQ(statistics1, statistics2);
+ statistics2.max = static_cast<int64_t>(2);
+ ASSERT_NE(statistics1, statistics2);
+
+ statistics1.max = std::nullopt;
+ statistics2.max = std::nullopt;
+ // check the state of both of them are std::nullopt
+ ASSERT_EQ(statistics1.max, statistics2.max);
+ // the state of one of them is std::nullopt
+ statistics1.max = std::shared_ptr<Scalar>();
+ ASSERT_NE(statistics1, statistics2);
+ // the state of both of them are nullptr
+ statistics2.max = std::shared_ptr<Scalar>();
+ ASSERT_EQ(statistics1, statistics2);
+ ASSERT_OK_AND_ASSIGN(statistics1.max, MakeScalar(int64(), 5));
+ // the state of one of them is nullptr
Review Comment:
```suggestion
// the state of one of them is empty shared_ptr
```
##########
cpp/src/arrow/record_batch.cc:
##########
@@ -487,6 +487,27 @@ struct EnumeratedStatistics {
};
using OnStatistics =
std::function<Status(const EnumeratedStatistics& enumerated_statistics)>;
+using ArrayStatisticsAndTypeVector =
+ std::vector<std::pair<std::shared_ptr<ArrayStatistics>,
std::shared_ptr<DataType>>>;
+ArrayStatisticsAndTypeVector ExtractArrayDataAndType(const RecordBatch&
record_batch) {
+ ArrayDataVector traverse;
+ ArrayStatisticsAndTypeVector result;
+ for (int i = 0; i < record_batch.num_columns(); ++i) {
+ auto column_array_data = record_batch.column(i)->data();
+ result.emplace_back(column_array_data->statistics,
column_array_data->type);
+ traverse.insert(traverse.end(), column_array_data->child_data.crbegin(),
+ column_array_data->child_data.crend());
+ while (!traverse.empty()) {
+ auto child_data = traverse.back();
+ traverse.pop_back();
+ result.emplace_back(child_data->statistics, child_data->type);
+ traverse.insert(traverse.end(), child_data->child_data.crbegin(),
+ child_data->child_data.crend());
+ }
Review Comment:
IPC writer has max recursion check:
https://github.com/apache/arrow/blob/c3e399af43c4a2e384a41fad0619589baa9045f0/cpp/src/arrow/ipc/writer.cc#L149-L151
https://github.com/apache/arrow/blob/c3e399af43c4a2e384a41fad0619589baa9045f0/cpp/src/arrow/ipc/options.h#L49-L50
https://github.com/apache/arrow/blob/c3e399af43c4a2e384a41fad0619589baa9045f0/cpp/src/arrow/ipc/options.h#L37-L40
We may need similar check but how to make it customizable? We may need
`MakeStatisticsOptions` or something...
##########
cpp/src/arrow/record_batch_test.cc:
##########
@@ -1454,6 +1462,94 @@ TEST_F(TestRecordBatch, MakeStatisticsArrayString) {
AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
}
+TEST_F(TestRecordBatch, MakeStatisticsArrayNestedType) {
+ auto struct_type = struct_({field("a", int64()), field("b", int64())});
+ auto struct_array = ArrayFromJSON(
+ struct_type,
+
R"([{"a":1,"b":6},{"a":2,"b":7},{"a":3,"b":8},{"a":4,"b":9},{"a":5,"b":10}])");
+ ASSERT_OK_AND_ASSIGN(auto struct_nested_stat,
+ struct_array->CopyTo(default_cpu_memory_manager()));
+ auto statistics_struct = std::make_shared<ArrayStatistics>();
+ ASSERT_OK_AND_ASSIGN(statistics_struct->max, struct_array->GetScalar(4));
+ statistics_struct->null_count = 0;
+ auto struct_array_data = struct_array->data();
+ auto statistics_struct_child_a = std::make_shared<ArrayStatistics>();
+ statistics_struct_child_a->min = int64_t{1};
+ struct_array_data->statistics = statistics_struct;
+ struct_array_data->child_data[0]->statistics = statistics_struct_child_a;
+ auto array_c = ArrayFromJSON(int64(), R"([11,12,13,14,15])");
+ array_c->data()->statistics = std::make_shared<ArrayStatistics>();
+ array_c->data()->statistics->max = int64_t{15};
+ auto array_d = ArrayFromJSON(int64(), R"([16,17,18,19,20])");
+ auto nested_child = struct_nested_stat->data()->child_data[0];
+ nested_child->statistics = std::make_shared<ArrayStatistics>();
+ nested_child->statistics->max = int64_t{5};
+ nested_child->statistics->is_max_exact = true;
+
+ auto rb_schema =
+ schema({field("struct_a_b", struct_type), field("c", int64()),
field("d", int64()),
+ field("struct_copy", struct_nested_stat->type())});
Review Comment:
Can we use better name than `struct_copy`?
##########
cpp/src/arrow/record_batch.cc:
##########
@@ -487,6 +487,27 @@ struct EnumeratedStatistics {
};
using OnStatistics =
std::function<Status(const EnumeratedStatistics& enumerated_statistics)>;
+using ArrayStatisticsAndTypeVector =
+ std::vector<std::pair<std::shared_ptr<ArrayStatistics>,
std::shared_ptr<DataType>>>;
+ArrayStatisticsAndTypeVector ExtractArrayDataAndType(const RecordBatch&
record_batch) {
+ ArrayDataVector traverse;
+ ArrayStatisticsAndTypeVector result;
+ for (int i = 0; i < record_batch.num_columns(); ++i) {
+ auto column_array_data = record_batch.column(i)->data();
+ result.emplace_back(column_array_data->statistics,
column_array_data->type);
Review Comment:
How about just returning `std::shared_ptr<ArrayData>` instead of
`std::pair<std::shared_ptr<ArrayStatistics>, std::shared_ptr<DataType>>`?
##########
cpp/src/arrow/record_batch_test.cc:
##########
@@ -1454,6 +1462,94 @@ TEST_F(TestRecordBatch, MakeStatisticsArrayString) {
AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
}
+TEST_F(TestRecordBatch, MakeStatisticsArrayNestedType) {
+ auto struct_type = struct_({field("a", int64()), field("b", int64())});
+ auto struct_array = ArrayFromJSON(
+ struct_type,
+
R"([{"a":1,"b":6},{"a":2,"b":7},{"a":3,"b":8},{"a":4,"b":9},{"a":5,"b":10}])");
+ ASSERT_OK_AND_ASSIGN(auto struct_nested_stat,
+ struct_array->CopyTo(default_cpu_memory_manager()));
+ auto statistics_struct = std::make_shared<ArrayStatistics>();
+ ASSERT_OK_AND_ASSIGN(statistics_struct->max, struct_array->GetScalar(4));
+ statistics_struct->null_count = 0;
+ auto struct_array_data = struct_array->data();
+ auto statistics_struct_child_a = std::make_shared<ArrayStatistics>();
+ statistics_struct_child_a->min = int64_t{1};
+ struct_array_data->statistics = statistics_struct;
+ struct_array_data->child_data[0]->statistics = statistics_struct_child_a;
+ auto array_c = ArrayFromJSON(int64(), R"([11,12,13,14,15])");
+ array_c->data()->statistics = std::make_shared<ArrayStatistics>();
+ array_c->data()->statistics->max = int64_t{15};
+ auto array_d = ArrayFromJSON(int64(), R"([16,17,18,19,20])");
+ auto nested_child = struct_nested_stat->data()->child_data[0];
Review Comment:
Can we use better name? This is `struct_copy.a`, right?
##########
cpp/src/arrow/record_batch_test.cc:
##########
@@ -1454,6 +1462,94 @@ TEST_F(TestRecordBatch, MakeStatisticsArrayString) {
AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
}
+TEST_F(TestRecordBatch, MakeStatisticsArrayNestedType) {
+ auto struct_type = struct_({field("a", int64()), field("b", int64())});
+ auto struct_array = ArrayFromJSON(
+ struct_type,
+
R"([{"a":1,"b":6},{"a":2,"b":7},{"a":3,"b":8},{"a":4,"b":9},{"a":5,"b":10}])");
+ ASSERT_OK_AND_ASSIGN(auto struct_nested_stat,
+ struct_array->CopyTo(default_cpu_memory_manager()));
+ auto statistics_struct = std::make_shared<ArrayStatistics>();
+ ASSERT_OK_AND_ASSIGN(statistics_struct->max, struct_array->GetScalar(4));
+ statistics_struct->null_count = 0;
+ auto struct_array_data = struct_array->data();
+ auto statistics_struct_child_a = std::make_shared<ArrayStatistics>();
+ statistics_struct_child_a->min = int64_t{1};
+ struct_array_data->statistics = statistics_struct;
+ struct_array_data->child_data[0]->statistics = statistics_struct_child_a;
+ auto array_c = ArrayFromJSON(int64(), R"([11,12,13,14,15])");
+ array_c->data()->statistics = std::make_shared<ArrayStatistics>();
+ array_c->data()->statistics->max = int64_t{15};
+ auto array_d = ArrayFromJSON(int64(), R"([16,17,18,19,20])");
+ auto nested_child = struct_nested_stat->data()->child_data[0];
+ nested_child->statistics = std::make_shared<ArrayStatistics>();
+ nested_child->statistics->max = int64_t{5};
+ nested_child->statistics->is_max_exact = true;
+
+ auto rb_schema =
Review Comment:
Could you use `batch_` prefix instead of `rb_` prefix like other codes?
##########
cpp/src/arrow/record_batch_test.cc:
##########
@@ -1454,6 +1462,94 @@ TEST_F(TestRecordBatch, MakeStatisticsArrayString) {
AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
}
+TEST_F(TestRecordBatch, MakeStatisticsArrayNestedType) {
+ auto struct_type = struct_({field("a", int64()), field("b", int64())});
+ auto struct_array = ArrayFromJSON(
+ struct_type,
+
R"([{"a":1,"b":6},{"a":2,"b":7},{"a":3,"b":8},{"a":4,"b":9},{"a":5,"b":10}])");
+ ASSERT_OK_AND_ASSIGN(auto struct_nested_stat,
+ struct_array->CopyTo(default_cpu_memory_manager()));
+ auto statistics_struct = std::make_shared<ArrayStatistics>();
+ ASSERT_OK_AND_ASSIGN(statistics_struct->max, struct_array->GetScalar(4));
+ statistics_struct->null_count = 0;
+ auto struct_array_data = struct_array->data();
+ auto statistics_struct_child_a = std::make_shared<ArrayStatistics>();
+ statistics_struct_child_a->min = int64_t{1};
+ struct_array_data->statistics = statistics_struct;
+ struct_array_data->child_data[0]->statistics = statistics_struct_child_a;
+ auto array_c = ArrayFromJSON(int64(), R"([11,12,13,14,15])");
+ array_c->data()->statistics = std::make_shared<ArrayStatistics>();
+ array_c->data()->statistics->max = int64_t{15};
+ auto array_d = ArrayFromJSON(int64(), R"([16,17,18,19,20])");
+ auto nested_child = struct_nested_stat->data()->child_data[0];
+ nested_child->statistics = std::make_shared<ArrayStatistics>();
+ nested_child->statistics->max = int64_t{5};
+ nested_child->statistics->is_max_exact = true;
+
+ auto rb_schema =
+ schema({field("struct_a_b", struct_type), field("c", int64()),
field("d", int64()),
+ field("struct_copy", struct_nested_stat->type())});
+ auto rb = RecordBatch::Make(rb_schema, 5,
+ {struct_array, array_c, array_d,
struct_nested_stat});
+
+ auto expected_scalar = internal::checked_pointer_cast<StructScalar>(
+ ScalarFromJSON(struct_type, R"([5,10])"));
+ auto a =
ArrayStatistics::ValueType{std::static_pointer_cast<Scalar>(expected_scalar)};
+
+ ASSERT_OK_AND_ASSIGN(
+ auto expected_array,
Review Comment:
```suggestion
auto expected_statistics_array,
```
##########
cpp/src/arrow/record_batch_test.cc:
##########
@@ -1454,6 +1462,94 @@ TEST_F(TestRecordBatch, MakeStatisticsArrayString) {
AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
}
+TEST_F(TestRecordBatch, MakeStatisticsArrayNestedType) {
+ auto struct_type = struct_({field("a", int64()), field("b", int64())});
+ auto struct_array = ArrayFromJSON(
+ struct_type,
+
R"([{"a":1,"b":6},{"a":2,"b":7},{"a":3,"b":8},{"a":4,"b":9},{"a":5,"b":10}])");
+ ASSERT_OK_AND_ASSIGN(auto struct_nested_stat,
+ struct_array->CopyTo(default_cpu_memory_manager()));
+ auto statistics_struct = std::make_shared<ArrayStatistics>();
+ ASSERT_OK_AND_ASSIGN(statistics_struct->max, struct_array->GetScalar(4));
+ statistics_struct->null_count = 0;
+ auto struct_array_data = struct_array->data();
+ auto statistics_struct_child_a = std::make_shared<ArrayStatistics>();
+ statistics_struct_child_a->min = int64_t{1};
+ struct_array_data->statistics = statistics_struct;
+ struct_array_data->child_data[0]->statistics = statistics_struct_child_a;
+ auto array_c = ArrayFromJSON(int64(), R"([11,12,13,14,15])");
+ array_c->data()->statistics = std::make_shared<ArrayStatistics>();
+ array_c->data()->statistics->max = int64_t{15};
+ auto array_d = ArrayFromJSON(int64(), R"([16,17,18,19,20])");
+ auto nested_child = struct_nested_stat->data()->child_data[0];
+ nested_child->statistics = std::make_shared<ArrayStatistics>();
+ nested_child->statistics->max = int64_t{5};
+ nested_child->statistics->is_max_exact = true;
+
+ auto rb_schema =
+ schema({field("struct_a_b", struct_type), field("c", int64()),
field("d", int64()),
+ field("struct_copy", struct_nested_stat->type())});
+ auto rb = RecordBatch::Make(rb_schema, 5,
+ {struct_array, array_c, array_d,
struct_nested_stat});
+
+ auto expected_scalar = internal::checked_pointer_cast<StructScalar>(
+ ScalarFromJSON(struct_type, R"([5,10])"));
+ auto a =
ArrayStatistics::ValueType{std::static_pointer_cast<Scalar>(expected_scalar)};
+
+ ASSERT_OK_AND_ASSIGN(
+ auto expected_array,
+ MakeStatisticsArray("[null,0,1,3,6]",
+ {{ARROW_STATISTICS_KEY_ROW_COUNT_EXACT},
+ {ARROW_STATISTICS_KEY_NULL_COUNT_EXACT,
+ ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE},
+ {ARROW_STATISTICS_KEY_MIN_VALUE_APPROXIMATE},
+ {ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE},
+ {ARROW_STATISTICS_KEY_MAX_VALUE_EXACT}},
+ {{ArrayStatistics::ValueType{int64_t{5}}},
+ {ArrayStatistics::ValueType{int64_t{0}},
+ ArrayStatistics::ValueType{
+
std::static_pointer_cast<Scalar>(expected_scalar)}},
+ {ArrayStatistics::ValueType{int64_t{1}}},
+ {ArrayStatistics::ValueType{int64_t{15}}},
+ {ArrayStatistics::ValueType{int64_t{5}}}}));
+ ASSERT_OK_AND_ASSIGN(auto rb_stat, rb->MakeStatisticsArray());
Review Comment:
```suggestion
ASSERT_OK_AND_ASSIGN(auto rb_statistics_array, rb->MakeStatisticsArray());
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]