kou commented on code in PR #46031: URL: https://github.com/apache/arrow/pull/46031#discussion_r2053187736
########## cpp/src/arrow/record_batch.cc: ########## @@ -680,8 +697,8 @@ Result<std::shared_ptr<Array>> RecordBatch::MakeStatisticsArray( return static_cast<DoubleBuilder*>(builder)->Append(value); } Status operator()(const std::string& value) { - return static_cast<StringBuilder*>(builder)->Append( - value.data(), static_cast<int32_t>(value.size())); + StringBuilderVisitor visitor_builder; + return VisitTypeInline(*builder->type(), &visitor_builder, builder, value); Review Comment: Could you use `builder_visitor` or `visitor`? ```suggestion StringBuilderVisitor visitor; return VisitTypeInline(*builder->type(), &visitor, builder, value); ``` ########## cpp/src/arrow/record_batch_test.cc: ########## @@ -1423,37 +1453,140 @@ TEST_F(TestRecordBatch, MakeStatisticsArrayMaxApproximate) { AssertArraysEqual(*expected_statistics_array, *statistics_array, true); } -TEST_F(TestRecordBatch, MakeStatisticsArrayString) { - auto schema = - ::arrow::schema({field("no-statistics", boolean()), field("string", utf8())}); - auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]"); - auto string_array_data = ArrayFromJSON(utf8(), "[\"a\", null, \"c\"]")->data()->Copy(); - string_array_data->statistics = std::make_shared<ArrayStatistics>(); - string_array_data->statistics->is_max_exact = true; - string_array_data->statistics->max = "c"; - auto string_array = MakeArray(std::move(string_array_data)); - auto batch = RecordBatch::Make(schema, string_array->length(), - {no_statistics_array, string_array}); +template <typename DataType> +class TestRecordBatchMakeStatisticsArrayStringBase { + public: + std::shared_ptr<::arrow::DataType> type(int byte_width = 1) { Review Comment: ```suggestion std::shared_ptr<::arrow::DataType> type(int32_t byte_width = 1) { ``` ########## cpp/src/arrow/record_batch_test.cc: ########## @@ -1423,37 +1453,140 @@ TEST_F(TestRecordBatch, MakeStatisticsArrayMaxApproximate) { AssertArraysEqual(*expected_statistics_array, *statistics_array, true); } -TEST_F(TestRecordBatch, MakeStatisticsArrayString) { - auto schema = - ::arrow::schema({field("no-statistics", boolean()), field("string", utf8())}); - auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]"); - auto string_array_data = ArrayFromJSON(utf8(), "[\"a\", null, \"c\"]")->data()->Copy(); - string_array_data->statistics = std::make_shared<ArrayStatistics>(); - string_array_data->statistics->is_max_exact = true; - string_array_data->statistics->max = "c"; - auto string_array = MakeArray(std::move(string_array_data)); - auto batch = RecordBatch::Make(schema, string_array->length(), - {no_statistics_array, string_array}); +template <typename DataType> +class TestRecordBatchMakeStatisticsArrayStringBase { + public: + std::shared_ptr<::arrow::DataType> type(int byte_width = 1) { + if constexpr (std::is_same_v<DataType, FixedSizeBinaryType>) { + return fixed_size_binary(byte_width); + } else { + return TypeTraits<DataType>::type_singleton(); + } + } + std::shared_ptr<Array> GenerateString( + const std::shared_ptr<::arrow::DataType>& dataType) { + if (dataType->id() == Type::FIXED_SIZE_BINARY) { + FixedSizeBinaryType* type = static_cast<FixedSizeBinaryType*>(dataType.get()); + int byte_width = type->byte_width(); + auto a = std::string(byte_width, 'a'); + auto b = std::string(byte_width, 'b'); + auto c = std::string(byte_width, 'c'); + std::stringstream ss; + ss << R"([")" << a << R"(",")" << b << R"(",")" << c << R"("])"; + return ArrayFromJSON(dataType, ss.str()); + } + return ArrayFromJSON(dataType, R"(["a","b","c"])"); + } +}; +template <typename DataType> +class TestRecordBatchMakeStatisticsArrayEachStringType + : public TestRecordBatchMakeStatisticsArrayStringBase<DataType>, + public ::testing::Test { + public: + void TestEachType() { + auto schema = ::arrow::schema( + {field("no-statistics", boolean()), field("string", this->type())}); + auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]"); + auto string_array_data = this->GenerateString(this->type())->data()->Copy(); + string_array_data->statistics = std::make_shared<ArrayStatistics>(); + string_array_data->statistics->is_max_exact = true; + string_array_data->statistics->max = "c"; + auto string_array = MakeArray(std::move(string_array_data)); + auto batch = RecordBatch::Make(schema, string_array->length(), + {no_statistics_array, string_array}); + + ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray()); + + ASSERT_OK_AND_ASSIGN(auto expected_statistics_array, + MakeStatisticsArray("[null, 1]", + {{ + ARROW_STATISTICS_KEY_ROW_COUNT_EXACT, + }, + { + ARROW_STATISTICS_KEY_MAX_VALUE_EXACT, + }}, + {{ + ArrayStatistics::ValueType{int64_t{3}}, + }, + { + ArrayStatistics::ValueType{"c"}, + }}, + {null(), this->type()})); + AssertArraysEqual(*expected_statistics_array, *statistics_array, true); + } +}; - ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray()); +class TestRecordBatchMakeStatisticsArrayFixedSizeBinaryCombination + : public TestRecordBatchMakeStatisticsArrayStringBase<FixedSizeBinaryType>, + public ::testing::TestWithParam<std::tuple<int, int>> { + public: + void TestCombination() { + auto [byte_width_1, byte_width_2] = GetParam(); + auto f0 = GenerateString(type(byte_width_1)); + f0->data()->statistics = std::make_shared<ArrayStatistics>(); + f0->data()->statistics->max = std::string(byte_width_1, 'c'); + + auto f1 = ArrayFromJSON(int64(), R"([1, 2, 3])"); + + auto f2 = GenerateString(type(byte_width_2)); + f2->data()->statistics = std::make_shared<ArrayStatistics>(); + f2->data()->statistics->max = std::string(byte_width_2, 'c'); + + auto schema = ::arrow::schema({field("f0", type(byte_width_1)), field("f1", int64()), + field("f2", fixed_size_binary(byte_width_2))}); + auto batch = RecordBatch::Make(schema, f0->length(), {f0, f1, f2}); + ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray()); + ASSERT_OK_AND_ASSIGN( + auto expected_statistics_array, + MakeStatisticsArray( + "[null, 0, 2]", + {{ + ARROW_STATISTICS_KEY_ROW_COUNT_EXACT, + }, + { + ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE, + }, + { + ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE, + }}, + {{ + ArrayStatistics::ValueType{int64_t{3}}, + }, + { + ArrayStatistics::ValueType{std::string(byte_width_1, 'c')}, + }, + { + ArrayStatistics::ValueType{std::string(byte_width_2, 'c')}, + }}, + {null(), type(byte_width_1), fixed_size_binary(byte_width_2)})); + + AssertArraysEqual(*expected_statistics_array, *statistics_array, true); + } +}; - ASSERT_OK_AND_ASSIGN(auto expected_statistics_array, - MakeStatisticsArray("[null, 1]", - {{ - ARROW_STATISTICS_KEY_ROW_COUNT_EXACT, - }, - { - ARROW_STATISTICS_KEY_MAX_VALUE_EXACT, - }}, - {{ - ArrayStatistics::ValueType{int64_t{3}}, - }, - { - ArrayStatistics::ValueType{"c"}, - }})); - AssertArraysEqual(*expected_statistics_array, *statistics_array, true); +using BaseBinaryOrBinaryViewOrFixedSizeBinaryLikeArrowTypes = + ::testing::Types<BinaryType, LargeBinaryType, BinaryViewType, FixedSizeBinaryType, + StringType, LargeStringType, StringViewType>; + +TYPED_TEST_SUITE(TestRecordBatchMakeStatisticsArrayEachStringType, + BaseBinaryOrBinaryViewOrFixedSizeBinaryLikeArrowTypes); + +TYPED_TEST(TestRecordBatchMakeStatisticsArrayEachStringType, EachType) { + this->TestEachType(); } +TEST_P(TestRecordBatchMakeStatisticsArrayFixedSizeBinaryCombination, + FixedSizeBinaryCombination) { + this->TestCombination(); +} + +// tuple(1, 2) tests whether RecordBatch::MakeStatistics can handle cases involving +// different fixed_size_binary types. +// tuple(2, 2) verifies if a single union array is created to include both values. Review Comment: How about creating 2 separated tests for them? The former should use separated type objects and the latter should use the same type object. It will clarify what we want to check. It seems that we don't need to use parameterized test for this. ########## cpp/src/arrow/record_batch_test.cc: ########## @@ -1033,15 +1040,32 @@ Result<std::shared_ptr<Array>> BuildArray( } return builder.Finish(); } +struct StringBuilderVisitor { + template <typename DataType> + enable_if_t<has_string_view<DataType>::value, Status> Visit( + const DataType&, ArrayBuilder* raw_builder, + const std::vector<std::string>& values) { + using Builder = typename TypeTraits<DataType>::BuilderType; + Builder* builder = static_cast<Builder*>(raw_builder); + for (const auto& value : values) { + ARROW_RETURN_NOT_OK(builder->Append(value)); + } Review Comment: ```suggestion ARROW_RETURN_NOT_OK(builder->AppendValues(values)); ``` ########## cpp/src/arrow/record_batch_test.cc: ########## @@ -1033,15 +1040,32 @@ Result<std::shared_ptr<Array>> BuildArray( } return builder.Finish(); } +struct StringBuilderVisitor { + template <typename DataType> + enable_if_t<has_string_view<DataType>::value, Status> Visit( + const DataType&, ArrayBuilder* raw_builder, + const std::vector<std::string>& values) { + using Builder = typename TypeTraits<DataType>::BuilderType; + Builder* builder = static_cast<Builder*>(raw_builder); Review Comment: ```suggestion auto builder = static_cast<Builder*>(raw_builder); ``` ########## cpp/src/arrow/record_batch_test.cc: ########## @@ -1423,37 +1453,140 @@ TEST_F(TestRecordBatch, MakeStatisticsArrayMaxApproximate) { AssertArraysEqual(*expected_statistics_array, *statistics_array, true); } -TEST_F(TestRecordBatch, MakeStatisticsArrayString) { - auto schema = - ::arrow::schema({field("no-statistics", boolean()), field("string", utf8())}); - auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]"); - auto string_array_data = ArrayFromJSON(utf8(), "[\"a\", null, \"c\"]")->data()->Copy(); - string_array_data->statistics = std::make_shared<ArrayStatistics>(); - string_array_data->statistics->is_max_exact = true; - string_array_data->statistics->max = "c"; - auto string_array = MakeArray(std::move(string_array_data)); - auto batch = RecordBatch::Make(schema, string_array->length(), - {no_statistics_array, string_array}); +template <typename DataType> +class TestRecordBatchMakeStatisticsArrayStringBase { + public: + std::shared_ptr<::arrow::DataType> type(int byte_width = 1) { + if constexpr (std::is_same_v<DataType, FixedSizeBinaryType>) { + return fixed_size_binary(byte_width); + } else { + return TypeTraits<DataType>::type_singleton(); + } + } + std::shared_ptr<Array> GenerateString( + const std::shared_ptr<::arrow::DataType>& dataType) { Review Comment: ```suggestion const std::shared_ptr<::arrow::DataType>& data_type) { ``` ########## cpp/src/arrow/record_batch_test.cc: ########## @@ -1056,11 +1080,14 @@ std::vector<RawType> StatisticsValuesToRawValues( template <typename ValueType, typename = std::enable_if_t<std::is_same< ArrayStatistics::ValueType, ValueType>::value>> -Result<std::shared_ptr<Array>> BuildArray(const std::vector<ValueType>& values) { +Result<std::shared_ptr<Array>> BuildArray(const std::vector<ValueType>& values, + const std::shared_ptr<DataType>& array_type) { struct Builder { const std::vector<ArrayStatistics::ValueType>& values_; - explicit Builder(const std::vector<ArrayStatistics::ValueType>& values) - : values_(values) {} + const std::shared_ptr<DataType>& array_type; Review Comment: Could you add `_` suffix for member name? ```suggestion const std::shared_ptr<DataType>& array_type_; ``` ########## cpp/src/arrow/record_batch_test.cc: ########## @@ -1423,37 +1453,140 @@ TEST_F(TestRecordBatch, MakeStatisticsArrayMaxApproximate) { AssertArraysEqual(*expected_statistics_array, *statistics_array, true); } -TEST_F(TestRecordBatch, MakeStatisticsArrayString) { - auto schema = - ::arrow::schema({field("no-statistics", boolean()), field("string", utf8())}); - auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]"); - auto string_array_data = ArrayFromJSON(utf8(), "[\"a\", null, \"c\"]")->data()->Copy(); - string_array_data->statistics = std::make_shared<ArrayStatistics>(); - string_array_data->statistics->is_max_exact = true; - string_array_data->statistics->max = "c"; - auto string_array = MakeArray(std::move(string_array_data)); - auto batch = RecordBatch::Make(schema, string_array->length(), - {no_statistics_array, string_array}); +template <typename DataType> +class TestRecordBatchMakeStatisticsArrayStringBase { + public: + std::shared_ptr<::arrow::DataType> type(int byte_width = 1) { + if constexpr (std::is_same_v<DataType, FixedSizeBinaryType>) { + return fixed_size_binary(byte_width); + } else { + return TypeTraits<DataType>::type_singleton(); + } + } + std::shared_ptr<Array> GenerateString( + const std::shared_ptr<::arrow::DataType>& dataType) { + if (dataType->id() == Type::FIXED_SIZE_BINARY) { + FixedSizeBinaryType* type = static_cast<FixedSizeBinaryType*>(dataType.get()); + int byte_width = type->byte_width(); + auto a = std::string(byte_width, 'a'); + auto b = std::string(byte_width, 'b'); + auto c = std::string(byte_width, 'c'); + std::stringstream ss; + ss << R"([")" << a << R"(",")" << b << R"(",")" << c << R"("])"; + return ArrayFromJSON(dataType, ss.str()); + } + return ArrayFromJSON(dataType, R"(["a","b","c"])"); + } +}; +template <typename DataType> +class TestRecordBatchMakeStatisticsArrayEachStringType + : public TestRecordBatchMakeStatisticsArrayStringBase<DataType>, + public ::testing::Test { + public: + void TestEachType() { + auto schema = ::arrow::schema( + {field("no-statistics", boolean()), field("string", this->type())}); + auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]"); + auto string_array_data = this->GenerateString(this->type())->data()->Copy(); + string_array_data->statistics = std::make_shared<ArrayStatistics>(); + string_array_data->statistics->is_max_exact = true; + string_array_data->statistics->max = "c"; + auto string_array = MakeArray(std::move(string_array_data)); + auto batch = RecordBatch::Make(schema, string_array->length(), + {no_statistics_array, string_array}); + + ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray()); + + ASSERT_OK_AND_ASSIGN(auto expected_statistics_array, + MakeStatisticsArray("[null, 1]", + {{ + ARROW_STATISTICS_KEY_ROW_COUNT_EXACT, + }, + { + ARROW_STATISTICS_KEY_MAX_VALUE_EXACT, + }}, + {{ + ArrayStatistics::ValueType{int64_t{3}}, + }, + { + ArrayStatistics::ValueType{"c"}, + }}, + {null(), this->type()})); + AssertArraysEqual(*expected_statistics_array, *statistics_array, true); + } +}; - ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray()); +class TestRecordBatchMakeStatisticsArrayFixedSizeBinaryCombination + : public TestRecordBatchMakeStatisticsArrayStringBase<FixedSizeBinaryType>, + public ::testing::TestWithParam<std::tuple<int, int>> { + public: + void TestCombination() { + auto [byte_width_1, byte_width_2] = GetParam(); + auto f0 = GenerateString(type(byte_width_1)); + f0->data()->statistics = std::make_shared<ArrayStatistics>(); + f0->data()->statistics->max = std::string(byte_width_1, 'c'); + + auto f1 = ArrayFromJSON(int64(), R"([1, 2, 3])"); + + auto f2 = GenerateString(type(byte_width_2)); + f2->data()->statistics = std::make_shared<ArrayStatistics>(); + f2->data()->statistics->max = std::string(byte_width_2, 'c'); + + auto schema = ::arrow::schema({field("f0", type(byte_width_1)), field("f1", int64()), + field("f2", fixed_size_binary(byte_width_2))}); + auto batch = RecordBatch::Make(schema, f0->length(), {f0, f1, f2}); + ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray()); + ASSERT_OK_AND_ASSIGN( + auto expected_statistics_array, + MakeStatisticsArray( + "[null, 0, 2]", + {{ + ARROW_STATISTICS_KEY_ROW_COUNT_EXACT, + }, + { + ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE, + }, + { + ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE, + }}, + {{ + ArrayStatistics::ValueType{int64_t{3}}, + }, + { + ArrayStatistics::ValueType{std::string(byte_width_1, 'c')}, + }, + { + ArrayStatistics::ValueType{std::string(byte_width_2, 'c')}, + }}, + {null(), type(byte_width_1), fixed_size_binary(byte_width_2)})); + + AssertArraysEqual(*expected_statistics_array, *statistics_array, true); + } +}; - ASSERT_OK_AND_ASSIGN(auto expected_statistics_array, - MakeStatisticsArray("[null, 1]", - {{ - ARROW_STATISTICS_KEY_ROW_COUNT_EXACT, - }, - { - ARROW_STATISTICS_KEY_MAX_VALUE_EXACT, - }}, - {{ - ArrayStatistics::ValueType{int64_t{3}}, - }, - { - ArrayStatistics::ValueType{"c"}, - }})); - AssertArraysEqual(*expected_statistics_array, *statistics_array, true); +using BaseBinaryOrBinaryViewOrFixedSizeBinaryLikeArrowTypes = + ::testing::Types<BinaryType, LargeBinaryType, BinaryViewType, FixedSizeBinaryType, + StringType, LargeStringType, StringViewType>; Review Comment: Could you move this to `cpp/src/arrow/testing/gtest_util.h`? How about renaming this to `AllBinaryOrBinrayViewLikeArrowTypes`? ########## cpp/src/arrow/record_batch_test.cc: ########## @@ -1423,37 +1453,140 @@ TEST_F(TestRecordBatch, MakeStatisticsArrayMaxApproximate) { AssertArraysEqual(*expected_statistics_array, *statistics_array, true); } -TEST_F(TestRecordBatch, MakeStatisticsArrayString) { - auto schema = - ::arrow::schema({field("no-statistics", boolean()), field("string", utf8())}); - auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]"); - auto string_array_data = ArrayFromJSON(utf8(), "[\"a\", null, \"c\"]")->data()->Copy(); - string_array_data->statistics = std::make_shared<ArrayStatistics>(); - string_array_data->statistics->is_max_exact = true; - string_array_data->statistics->max = "c"; - auto string_array = MakeArray(std::move(string_array_data)); - auto batch = RecordBatch::Make(schema, string_array->length(), - {no_statistics_array, string_array}); +template <typename DataType> +class TestRecordBatchMakeStatisticsArrayStringBase { + public: + std::shared_ptr<::arrow::DataType> type(int byte_width = 1) { + if constexpr (std::is_same_v<DataType, FixedSizeBinaryType>) { + return fixed_size_binary(byte_width); + } else { + return TypeTraits<DataType>::type_singleton(); + } + } + std::shared_ptr<Array> GenerateString( + const std::shared_ptr<::arrow::DataType>& dataType) { + if (dataType->id() == Type::FIXED_SIZE_BINARY) { + FixedSizeBinaryType* type = static_cast<FixedSizeBinaryType*>(dataType.get()); + int byte_width = type->byte_width(); Review Comment: Do we need to cast it to use `byte_width()`? ```suggestion auto byte_width = data_type->byte_width(); ``` ########## cpp/src/arrow/record_batch.cc: ########## @@ -556,6 +558,21 @@ Status EnumerateStatistics(const RecordBatch& record_batch, OnStatistics on_stat } return Status::OK(); } +struct StringBuilderVisitor { + template <typename DataType> + enable_if_has_string_view<DataType, Status> Visit(const DataType&, + ArrayBuilder* raw_builder, + const std::string& value) { + using Builder = typename TypeTraits<DataType>::BuilderType; + Builder* builder = static_cast<Builder*>(raw_builder); Review Comment: ```suggestion auto builder = static_cast<Builder*>(raw_builder); ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org