kou commented on code in PR #46031: URL: https://github.com/apache/arrow/pull/46031#discussion_r2030553026
########## cpp/src/arrow/record_batch.cc: ########## @@ -556,6 +558,21 @@ Status EnumerateStatistics(const RecordBatch& record_batch, OnStatistics on_stat } return Status::OK(); } +struct StringVisitorBuilder { + template <typename DataType, + typename Builder = typename TypeTraits<DataType>::BuilderType> + enable_if_has_string_view<DataType, Status> Visit(const DataType&, + ArrayBuilder* raw_builder, + const std::string& value) { + Builder* builder = static_cast<Builder*>(raw_builder); + return builder->Append(value); + } + + Status Visit(const DataType& type, ArrayBuilder*, const std::string&) { + return Status::NotImplemented( Review Comment: How about using `Status::Invalid()` instead? We never support other types in this class, right? ########## cpp/src/arrow/record_batch_test.cc: ########## @@ -1137,6 +1175,10 @@ Result<std::shared_ptr<Array>> MakeStatisticsArray( for (size_t i = 0; i < nested_statistics_keys.size(); ++i) { const auto& statistics_keys = nested_statistics_keys[i]; const auto& statistics_values = nested_statistics_values[i]; + auto string_type = std::shared_ptr<DataType>(nullptr); Review Comment: ```suggestion auto array_type = arrow::null(); ``` ########## cpp/src/arrow/record_batch_test.cc: ########## @@ -1080,17 +1118,17 @@ Result<std::shared_ptr<Array>> BuildArray(const std::vector<ValueType>& values) } Result<std::shared_ptr<Array>> operator()(const std::string&) { auto values = StatisticsValuesToRawValues<std::string>(values_); - return BuildArray<StringType>(values); + return BuildArray(string_type, values); } - } builder(values); + } builder(values, string_type); return std::visit(builder, values[0]); } Result<std::shared_ptr<Array>> MakeStatisticsArray( const std::string& columns_json, const std::vector<std::vector<std::string>>& nested_statistics_keys, - const std::vector<std::vector<ArrayStatistics::ValueType>>& - nested_statistics_values) { + const std::vector<std::vector<ArrayStatistics::ValueType>>& nested_statistics_values, + std::vector<std::shared_ptr<DataType>> string_types = {}) { Review Comment: ```suggestion const std::vector<std::shared_ptr<DataType>>& array_types = {}) { ``` ########## cpp/src/arrow/record_batch.cc: ########## @@ -556,6 +558,21 @@ Status EnumerateStatistics(const RecordBatch& record_batch, OnStatistics on_stat } return Status::OK(); } +struct StringVisitorBuilder { Review Comment: How about renaming this to `StringBuilderVisitor` because this is a visitor not a builder? ########## cpp/src/arrow/record_batch_test.cc: ########## @@ -1033,15 +1039,44 @@ Result<std::shared_ptr<Array>> BuildArray( } return builder.Finish(); } +struct StringBuilderVisitor { + template <typename DataType, + typename Builder = typename TypeTraits<DataType>::BuilderType> + enable_if_t<is_base_binary_type<DataType>::value, Status> Visit( + const DataType&, ArrayBuilder* raw_builder, + const std::vector<std::string>& values) { + Builder* builder = static_cast<Builder*>(raw_builder); + ARROW_RETURN_NOT_OK(builder->AppendValues(values)); + return Status::OK(); + } -template <typename ArrowType, typename = enable_if_string<ArrowType>> -Result<std::shared_ptr<Array>> BuildArray(const std::vector<std::string>& values) { - using BuilderType = typename TypeTraits<ArrowType>::BuilderType; - BuilderType builder; - for (const auto& value : values) { - ARROW_RETURN_NOT_OK(builder.Append(value)); + template <typename DataType, + typename Builder = typename TypeTraits<DataType>::BuilderType> + enable_if_t<is_binary_view_like_type<DataType>::value || + std::is_same_v<DataType, FixedSizeBinaryType>, + Status> + Visit(const DataType&, ArrayBuilder* raw_builder, + const std::vector<std::string>& values) { + Builder* builder = static_cast<Builder*>(raw_builder); + // FixedSizeBinary, StringView, BinaryView do not have AppendValues method Review Comment: Can we simplify this code by using `Append()` for `Binary`/`String`? This is a test. So we can use unoptimized methods. ########## cpp/src/arrow/record_batch.cc: ########## @@ -556,6 +558,21 @@ Status EnumerateStatistics(const RecordBatch& record_batch, OnStatistics on_stat } return Status::OK(); } +struct StringVisitorBuilder { + template <typename DataType, + typename Builder = typename TypeTraits<DataType>::BuilderType> + enable_if_has_string_view<DataType, Status> Visit(const DataType&, + ArrayBuilder* raw_builder, + const std::string& value) { + Builder* builder = static_cast<Builder*>(raw_builder); Review Comment: Can we move `Builder` to `Visit()`? ```suggestion template <typename DataType> enable_if_has_string_view<DataType, Status> Visit(const DataType&, ArrayBuilder* raw_builder, const std::string& value) { using Builder = typename TypeTraits<DataType>::BuilderType; auto builder = static_cast<Builder*>(raw_builder); ``` ########## cpp/src/arrow/record_batch_test.cc: ########## @@ -1137,6 +1175,10 @@ Result<std::shared_ptr<Array>> MakeStatisticsArray( for (size_t i = 0; i < nested_statistics_keys.size(); ++i) { const auto& statistics_keys = nested_statistics_keys[i]; const auto& statistics_values = nested_statistics_values[i]; + auto string_type = std::shared_ptr<DataType>(nullptr); + if (!string_types.empty()) { + string_type = string_types[i]; + } Review Comment: ```suggestion const auto& array_type = (i < array_types.size()) ? array_types[i] : arrow::null(); ``` ########## cpp/src/arrow/record_batch_test.cc: ########## @@ -1424,33 +1471,86 @@ TEST_F(TestRecordBatch, MakeStatisticsArrayMaxApproximate) { } TEST_F(TestRecordBatch, MakeStatisticsArrayString) { - auto schema = - ::arrow::schema({field("no-statistics", boolean()), field("string", utf8())}); - auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]"); - auto string_array_data = ArrayFromJSON(utf8(), "[\"a\", null, \"c\"]")->data()->Copy(); - string_array_data->statistics = std::make_shared<ArrayStatistics>(); - string_array_data->statistics->is_max_exact = true; - string_array_data->statistics->max = "c"; - auto string_array = MakeArray(std::move(string_array_data)); - auto batch = RecordBatch::Make(schema, string_array->length(), - {no_statistics_array, string_array}); + auto CheckString = [](const std::shared_ptr<DataType>& type) { + auto schema = + ::arrow::schema({field("no-statistics", boolean()), field("string", type)}); + auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]"); + auto string_array_data = ArrayFromJSON(type, "[\"a\", null, \"c\"]")->data()->Copy(); + string_array_data->statistics = std::make_shared<ArrayStatistics>(); + string_array_data->statistics->is_max_exact = true; + string_array_data->statistics->max = "c"; + auto string_array = MakeArray(std::move(string_array_data)); + auto batch = RecordBatch::Make(schema, string_array->length(), + {no_statistics_array, string_array}); + + ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray()); + + ASSERT_OK_AND_ASSIGN(auto expected_statistics_array, + MakeStatisticsArray("[null, 1]", + {{ + ARROW_STATISTICS_KEY_ROW_COUNT_EXACT, + }, + { + ARROW_STATISTICS_KEY_MAX_VALUE_EXACT, + }}, + {{ + ArrayStatistics::ValueType{int64_t{3}}, + }, + { + ArrayStatistics::ValueType{"c"}, + }}, + {nullptr, type})); + AssertArraysEqual(*expected_statistics_array, *statistics_array, true); + }; + // Check each type + CheckString(binary()); + CheckString(utf8()); + CheckString(large_binary()); + CheckString(large_utf8()); + CheckString(binary_view()); + CheckString(utf8_view()); + CheckString(fixed_size_binary(1)); Review Comment: Could you use `TYPED_TEST_SUITE()` instead of parameterizing manually? ```cpp template <typename DataType> class TestRecordBatchMakeStatisticsArrayString : public ::testing::Test { public: void TestMax() { // ... } }; // BaseBinaryOrBinaryViewLikeArrowTypes is defined in cpp/src/arrow/testing/gtest_util.h. TYPED_TEST_SUITE(TestRecordBatchMakeStatisticsArrayString, BaseBinaryOrBinaryViewLikeArrowTypes); TYPED_TEST(TestRecordBatchMakeStatisticsArrayString, Max) { this->TestMax();} ``` ########## cpp/src/arrow/record_batch_test.cc: ########## @@ -1424,33 +1471,86 @@ TEST_F(TestRecordBatch, MakeStatisticsArrayMaxApproximate) { } TEST_F(TestRecordBatch, MakeStatisticsArrayString) { - auto schema = - ::arrow::schema({field("no-statistics", boolean()), field("string", utf8())}); - auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]"); - auto string_array_data = ArrayFromJSON(utf8(), "[\"a\", null, \"c\"]")->data()->Copy(); - string_array_data->statistics = std::make_shared<ArrayStatistics>(); - string_array_data->statistics->is_max_exact = true; - string_array_data->statistics->max = "c"; - auto string_array = MakeArray(std::move(string_array_data)); - auto batch = RecordBatch::Make(schema, string_array->length(), - {no_statistics_array, string_array}); + auto CheckString = [](const std::shared_ptr<DataType>& type) { + auto schema = + ::arrow::schema({field("no-statistics", boolean()), field("string", type)}); + auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]"); + auto string_array_data = ArrayFromJSON(type, "[\"a\", null, \"c\"]")->data()->Copy(); + string_array_data->statistics = std::make_shared<ArrayStatistics>(); + string_array_data->statistics->is_max_exact = true; + string_array_data->statistics->max = "c"; + auto string_array = MakeArray(std::move(string_array_data)); + auto batch = RecordBatch::Make(schema, string_array->length(), + {no_statistics_array, string_array}); + + ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray()); + + ASSERT_OK_AND_ASSIGN(auto expected_statistics_array, + MakeStatisticsArray("[null, 1]", + {{ + ARROW_STATISTICS_KEY_ROW_COUNT_EXACT, + }, + { + ARROW_STATISTICS_KEY_MAX_VALUE_EXACT, + }}, + {{ + ArrayStatistics::ValueType{int64_t{3}}, + }, + { + ArrayStatistics::ValueType{"c"}, + }}, + {nullptr, type})); + AssertArraysEqual(*expected_statistics_array, *statistics_array, true); + }; + // Check each type + CheckString(binary()); + CheckString(utf8()); + CheckString(large_binary()); + CheckString(large_utf8()); + CheckString(binary_view()); + CheckString(utf8_view()); + CheckString(fixed_size_binary(1)); + + // Check the combination of various string types Review Comment: Is this combination case important? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org