kou commented on code in PR #46031:
URL: https://github.com/apache/arrow/pull/46031#discussion_r2053187736


##########
cpp/src/arrow/record_batch.cc:
##########
@@ -680,8 +697,8 @@ Result<std::shared_ptr<Array>> 
RecordBatch::MakeStatisticsArray(
         return static_cast<DoubleBuilder*>(builder)->Append(value);
       }
       Status operator()(const std::string& value) {
-        return static_cast<StringBuilder*>(builder)->Append(
-            value.data(), static_cast<int32_t>(value.size()));
+        StringBuilderVisitor visitor_builder;
+        return VisitTypeInline(*builder->type(), &visitor_builder, builder, 
value);

Review Comment:
   Could you use `builder_visitor` or `visitor`?
   
   ```suggestion
           StringBuilderVisitor visitor;
           return VisitTypeInline(*builder->type(), &visitor, builder, value);
   ```



##########
cpp/src/arrow/record_batch_test.cc:
##########
@@ -1423,37 +1453,140 @@ TEST_F(TestRecordBatch, 
MakeStatisticsArrayMaxApproximate) {
   AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
 }
 
-TEST_F(TestRecordBatch, MakeStatisticsArrayString) {
-  auto schema =
-      ::arrow::schema({field("no-statistics", boolean()), field("string", 
utf8())});
-  auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]");
-  auto string_array_data = ArrayFromJSON(utf8(), "[\"a\", null, 
\"c\"]")->data()->Copy();
-  string_array_data->statistics = std::make_shared<ArrayStatistics>();
-  string_array_data->statistics->is_max_exact = true;
-  string_array_data->statistics->max = "c";
-  auto string_array = MakeArray(std::move(string_array_data));
-  auto batch = RecordBatch::Make(schema, string_array->length(),
-                                 {no_statistics_array, string_array});
+template <typename DataType>
+class TestRecordBatchMakeStatisticsArrayStringBase {
+ public:
+  std::shared_ptr<::arrow::DataType> type(int byte_width = 1) {

Review Comment:
   ```suggestion
     std::shared_ptr<::arrow::DataType> type(int32_t byte_width = 1) {
   ```



##########
cpp/src/arrow/record_batch_test.cc:
##########
@@ -1423,37 +1453,140 @@ TEST_F(TestRecordBatch, 
MakeStatisticsArrayMaxApproximate) {
   AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
 }
 
-TEST_F(TestRecordBatch, MakeStatisticsArrayString) {
-  auto schema =
-      ::arrow::schema({field("no-statistics", boolean()), field("string", 
utf8())});
-  auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]");
-  auto string_array_data = ArrayFromJSON(utf8(), "[\"a\", null, 
\"c\"]")->data()->Copy();
-  string_array_data->statistics = std::make_shared<ArrayStatistics>();
-  string_array_data->statistics->is_max_exact = true;
-  string_array_data->statistics->max = "c";
-  auto string_array = MakeArray(std::move(string_array_data));
-  auto batch = RecordBatch::Make(schema, string_array->length(),
-                                 {no_statistics_array, string_array});
+template <typename DataType>
+class TestRecordBatchMakeStatisticsArrayStringBase {
+ public:
+  std::shared_ptr<::arrow::DataType> type(int byte_width = 1) {
+    if constexpr (std::is_same_v<DataType, FixedSizeBinaryType>) {
+      return fixed_size_binary(byte_width);
+    } else {
+      return TypeTraits<DataType>::type_singleton();
+    }
+  }
+  std::shared_ptr<Array> GenerateString(
+      const std::shared_ptr<::arrow::DataType>& dataType) {
+    if (dataType->id() == Type::FIXED_SIZE_BINARY) {
+      FixedSizeBinaryType* type = 
static_cast<FixedSizeBinaryType*>(dataType.get());
+      int byte_width = type->byte_width();
+      auto a = std::string(byte_width, 'a');
+      auto b = std::string(byte_width, 'b');
+      auto c = std::string(byte_width, 'c');
+      std::stringstream ss;
+      ss << R"([")" << a << R"(",")" << b << R"(",")" << c << R"("])";
+      return ArrayFromJSON(dataType, ss.str());
+    }
+    return ArrayFromJSON(dataType, R"(["a","b","c"])");
+  }
+};
+template <typename DataType>
+class TestRecordBatchMakeStatisticsArrayEachStringType
+    : public TestRecordBatchMakeStatisticsArrayStringBase<DataType>,
+      public ::testing::Test {
+ public:
+  void TestEachType() {
+    auto schema = ::arrow::schema(
+        {field("no-statistics", boolean()), field("string", this->type())});
+    auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]");
+    auto string_array_data = 
this->GenerateString(this->type())->data()->Copy();
+    string_array_data->statistics = std::make_shared<ArrayStatistics>();
+    string_array_data->statistics->is_max_exact = true;
+    string_array_data->statistics->max = "c";
+    auto string_array = MakeArray(std::move(string_array_data));
+    auto batch = RecordBatch::Make(schema, string_array->length(),
+                                   {no_statistics_array, string_array});
+
+    ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray());
+
+    ASSERT_OK_AND_ASSIGN(auto expected_statistics_array,
+                         MakeStatisticsArray("[null, 1]",
+                                             {{
+                                                  
ARROW_STATISTICS_KEY_ROW_COUNT_EXACT,
+                                              },
+                                              {
+                                                  
ARROW_STATISTICS_KEY_MAX_VALUE_EXACT,
+                                              }},
+                                             {{
+                                                  
ArrayStatistics::ValueType{int64_t{3}},
+                                              },
+                                              {
+                                                  
ArrayStatistics::ValueType{"c"},
+                                              }},
+                                             {null(), this->type()}));
+    AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
+  }
+};
 
-  ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray());
+class TestRecordBatchMakeStatisticsArrayFixedSizeBinaryCombination
+    : public TestRecordBatchMakeStatisticsArrayStringBase<FixedSizeBinaryType>,
+      public ::testing::TestWithParam<std::tuple<int, int>> {
+ public:
+  void TestCombination() {
+    auto [byte_width_1, byte_width_2] = GetParam();
+    auto f0 = GenerateString(type(byte_width_1));
+    f0->data()->statistics = std::make_shared<ArrayStatistics>();
+    f0->data()->statistics->max = std::string(byte_width_1, 'c');
+
+    auto f1 = ArrayFromJSON(int64(), R"([1, 2, 3])");
+
+    auto f2 = GenerateString(type(byte_width_2));
+    f2->data()->statistics = std::make_shared<ArrayStatistics>();
+    f2->data()->statistics->max = std::string(byte_width_2, 'c');
+
+    auto schema = ::arrow::schema({field("f0", type(byte_width_1)), 
field("f1", int64()),
+                                   field("f2", 
fixed_size_binary(byte_width_2))});
+    auto batch = RecordBatch::Make(schema, f0->length(), {f0, f1, f2});
+    ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray());
+    ASSERT_OK_AND_ASSIGN(
+        auto expected_statistics_array,
+        MakeStatisticsArray(
+            "[null, 0, 2]",
+            {{
+                 ARROW_STATISTICS_KEY_ROW_COUNT_EXACT,
+             },
+             {
+                 ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE,
+             },
+             {
+                 ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE,
+             }},
+            {{
+                 ArrayStatistics::ValueType{int64_t{3}},
+             },
+             {
+                 ArrayStatistics::ValueType{std::string(byte_width_1, 'c')},
+             },
+             {
+                 ArrayStatistics::ValueType{std::string(byte_width_2, 'c')},
+             }},
+            {null(), type(byte_width_1), fixed_size_binary(byte_width_2)}));
+
+    AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
+  }
+};
 
-  ASSERT_OK_AND_ASSIGN(auto expected_statistics_array,
-                       MakeStatisticsArray("[null, 1]",
-                                           {{
-                                                
ARROW_STATISTICS_KEY_ROW_COUNT_EXACT,
-                                            },
-                                            {
-                                                
ARROW_STATISTICS_KEY_MAX_VALUE_EXACT,
-                                            }},
-                                           {{
-                                                
ArrayStatistics::ValueType{int64_t{3}},
-                                            },
-                                            {
-                                                
ArrayStatistics::ValueType{"c"},
-                                            }}));
-  AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
+using BaseBinaryOrBinaryViewOrFixedSizeBinaryLikeArrowTypes =
+    ::testing::Types<BinaryType, LargeBinaryType, BinaryViewType, 
FixedSizeBinaryType,
+                     StringType, LargeStringType, StringViewType>;
+
+TYPED_TEST_SUITE(TestRecordBatchMakeStatisticsArrayEachStringType,
+                 BaseBinaryOrBinaryViewOrFixedSizeBinaryLikeArrowTypes);
+
+TYPED_TEST(TestRecordBatchMakeStatisticsArrayEachStringType, EachType) {
+  this->TestEachType();
 }
 
+TEST_P(TestRecordBatchMakeStatisticsArrayFixedSizeBinaryCombination,
+       FixedSizeBinaryCombination) {
+  this->TestCombination();
+}
+
+// tuple(1, 2) tests whether RecordBatch::MakeStatistics can handle cases 
involving
+// different fixed_size_binary types.
+// tuple(2, 2) verifies if a single union array is created to include both 
values.

Review Comment:
   How about creating 2 separated tests for them?
   
   The former should use separated type objects and the latter should use the 
same type object. It will clarify what we want to check.
   
   It seems that we don't need to use parameterized test for this.



##########
cpp/src/arrow/record_batch_test.cc:
##########
@@ -1033,15 +1040,32 @@ Result<std::shared_ptr<Array>> BuildArray(
   }
   return builder.Finish();
 }
+struct StringBuilderVisitor {
+  template <typename DataType>
+  enable_if_t<has_string_view<DataType>::value, Status> Visit(
+      const DataType&, ArrayBuilder* raw_builder,
+      const std::vector<std::string>& values) {
+    using Builder = typename TypeTraits<DataType>::BuilderType;
+    Builder* builder = static_cast<Builder*>(raw_builder);
+    for (const auto& value : values) {
+      ARROW_RETURN_NOT_OK(builder->Append(value));
+    }

Review Comment:
   ```suggestion
       ARROW_RETURN_NOT_OK(builder->AppendValues(values));
   ```



##########
cpp/src/arrow/record_batch_test.cc:
##########
@@ -1033,15 +1040,32 @@ Result<std::shared_ptr<Array>> BuildArray(
   }
   return builder.Finish();
 }
+struct StringBuilderVisitor {
+  template <typename DataType>
+  enable_if_t<has_string_view<DataType>::value, Status> Visit(
+      const DataType&, ArrayBuilder* raw_builder,
+      const std::vector<std::string>& values) {
+    using Builder = typename TypeTraits<DataType>::BuilderType;
+    Builder* builder = static_cast<Builder*>(raw_builder);

Review Comment:
   ```suggestion
       auto builder = static_cast<Builder*>(raw_builder);
   ```



##########
cpp/src/arrow/record_batch_test.cc:
##########
@@ -1423,37 +1453,140 @@ TEST_F(TestRecordBatch, 
MakeStatisticsArrayMaxApproximate) {
   AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
 }
 
-TEST_F(TestRecordBatch, MakeStatisticsArrayString) {
-  auto schema =
-      ::arrow::schema({field("no-statistics", boolean()), field("string", 
utf8())});
-  auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]");
-  auto string_array_data = ArrayFromJSON(utf8(), "[\"a\", null, 
\"c\"]")->data()->Copy();
-  string_array_data->statistics = std::make_shared<ArrayStatistics>();
-  string_array_data->statistics->is_max_exact = true;
-  string_array_data->statistics->max = "c";
-  auto string_array = MakeArray(std::move(string_array_data));
-  auto batch = RecordBatch::Make(schema, string_array->length(),
-                                 {no_statistics_array, string_array});
+template <typename DataType>
+class TestRecordBatchMakeStatisticsArrayStringBase {
+ public:
+  std::shared_ptr<::arrow::DataType> type(int byte_width = 1) {
+    if constexpr (std::is_same_v<DataType, FixedSizeBinaryType>) {
+      return fixed_size_binary(byte_width);
+    } else {
+      return TypeTraits<DataType>::type_singleton();
+    }
+  }
+  std::shared_ptr<Array> GenerateString(
+      const std::shared_ptr<::arrow::DataType>& dataType) {

Review Comment:
   ```suggestion
         const std::shared_ptr<::arrow::DataType>& data_type) {
   ```



##########
cpp/src/arrow/record_batch_test.cc:
##########
@@ -1056,11 +1080,14 @@ std::vector<RawType> StatisticsValuesToRawValues(
 
 template <typename ValueType, typename = std::enable_if_t<std::is_same<
                                   ArrayStatistics::ValueType, 
ValueType>::value>>
-Result<std::shared_ptr<Array>> BuildArray(const std::vector<ValueType>& 
values) {
+Result<std::shared_ptr<Array>> BuildArray(const std::vector<ValueType>& values,
+                                          const std::shared_ptr<DataType>& 
array_type) {
   struct Builder {
     const std::vector<ArrayStatistics::ValueType>& values_;
-    explicit Builder(const std::vector<ArrayStatistics::ValueType>& values)
-        : values_(values) {}
+    const std::shared_ptr<DataType>& array_type;

Review Comment:
   Could you add `_` suffix for member name?
   
   ```suggestion
       const std::shared_ptr<DataType>& array_type_;
   ```



##########
cpp/src/arrow/record_batch_test.cc:
##########
@@ -1423,37 +1453,140 @@ TEST_F(TestRecordBatch, 
MakeStatisticsArrayMaxApproximate) {
   AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
 }
 
-TEST_F(TestRecordBatch, MakeStatisticsArrayString) {
-  auto schema =
-      ::arrow::schema({field("no-statistics", boolean()), field("string", 
utf8())});
-  auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]");
-  auto string_array_data = ArrayFromJSON(utf8(), "[\"a\", null, 
\"c\"]")->data()->Copy();
-  string_array_data->statistics = std::make_shared<ArrayStatistics>();
-  string_array_data->statistics->is_max_exact = true;
-  string_array_data->statistics->max = "c";
-  auto string_array = MakeArray(std::move(string_array_data));
-  auto batch = RecordBatch::Make(schema, string_array->length(),
-                                 {no_statistics_array, string_array});
+template <typename DataType>
+class TestRecordBatchMakeStatisticsArrayStringBase {
+ public:
+  std::shared_ptr<::arrow::DataType> type(int byte_width = 1) {
+    if constexpr (std::is_same_v<DataType, FixedSizeBinaryType>) {
+      return fixed_size_binary(byte_width);
+    } else {
+      return TypeTraits<DataType>::type_singleton();
+    }
+  }
+  std::shared_ptr<Array> GenerateString(
+      const std::shared_ptr<::arrow::DataType>& dataType) {
+    if (dataType->id() == Type::FIXED_SIZE_BINARY) {
+      FixedSizeBinaryType* type = 
static_cast<FixedSizeBinaryType*>(dataType.get());
+      int byte_width = type->byte_width();
+      auto a = std::string(byte_width, 'a');
+      auto b = std::string(byte_width, 'b');
+      auto c = std::string(byte_width, 'c');
+      std::stringstream ss;
+      ss << R"([")" << a << R"(",")" << b << R"(",")" << c << R"("])";
+      return ArrayFromJSON(dataType, ss.str());
+    }
+    return ArrayFromJSON(dataType, R"(["a","b","c"])");
+  }
+};
+template <typename DataType>
+class TestRecordBatchMakeStatisticsArrayEachStringType
+    : public TestRecordBatchMakeStatisticsArrayStringBase<DataType>,
+      public ::testing::Test {
+ public:
+  void TestEachType() {
+    auto schema = ::arrow::schema(
+        {field("no-statistics", boolean()), field("string", this->type())});
+    auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]");
+    auto string_array_data = 
this->GenerateString(this->type())->data()->Copy();
+    string_array_data->statistics = std::make_shared<ArrayStatistics>();
+    string_array_data->statistics->is_max_exact = true;
+    string_array_data->statistics->max = "c";
+    auto string_array = MakeArray(std::move(string_array_data));
+    auto batch = RecordBatch::Make(schema, string_array->length(),
+                                   {no_statistics_array, string_array});
+
+    ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray());
+
+    ASSERT_OK_AND_ASSIGN(auto expected_statistics_array,
+                         MakeStatisticsArray("[null, 1]",
+                                             {{
+                                                  
ARROW_STATISTICS_KEY_ROW_COUNT_EXACT,
+                                              },
+                                              {
+                                                  
ARROW_STATISTICS_KEY_MAX_VALUE_EXACT,
+                                              }},
+                                             {{
+                                                  
ArrayStatistics::ValueType{int64_t{3}},
+                                              },
+                                              {
+                                                  
ArrayStatistics::ValueType{"c"},
+                                              }},
+                                             {null(), this->type()}));
+    AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
+  }
+};
 
-  ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray());
+class TestRecordBatchMakeStatisticsArrayFixedSizeBinaryCombination
+    : public TestRecordBatchMakeStatisticsArrayStringBase<FixedSizeBinaryType>,
+      public ::testing::TestWithParam<std::tuple<int, int>> {
+ public:
+  void TestCombination() {
+    auto [byte_width_1, byte_width_2] = GetParam();
+    auto f0 = GenerateString(type(byte_width_1));
+    f0->data()->statistics = std::make_shared<ArrayStatistics>();
+    f0->data()->statistics->max = std::string(byte_width_1, 'c');
+
+    auto f1 = ArrayFromJSON(int64(), R"([1, 2, 3])");
+
+    auto f2 = GenerateString(type(byte_width_2));
+    f2->data()->statistics = std::make_shared<ArrayStatistics>();
+    f2->data()->statistics->max = std::string(byte_width_2, 'c');
+
+    auto schema = ::arrow::schema({field("f0", type(byte_width_1)), 
field("f1", int64()),
+                                   field("f2", 
fixed_size_binary(byte_width_2))});
+    auto batch = RecordBatch::Make(schema, f0->length(), {f0, f1, f2});
+    ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray());
+    ASSERT_OK_AND_ASSIGN(
+        auto expected_statistics_array,
+        MakeStatisticsArray(
+            "[null, 0, 2]",
+            {{
+                 ARROW_STATISTICS_KEY_ROW_COUNT_EXACT,
+             },
+             {
+                 ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE,
+             },
+             {
+                 ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE,
+             }},
+            {{
+                 ArrayStatistics::ValueType{int64_t{3}},
+             },
+             {
+                 ArrayStatistics::ValueType{std::string(byte_width_1, 'c')},
+             },
+             {
+                 ArrayStatistics::ValueType{std::string(byte_width_2, 'c')},
+             }},
+            {null(), type(byte_width_1), fixed_size_binary(byte_width_2)}));
+
+    AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
+  }
+};
 
-  ASSERT_OK_AND_ASSIGN(auto expected_statistics_array,
-                       MakeStatisticsArray("[null, 1]",
-                                           {{
-                                                
ARROW_STATISTICS_KEY_ROW_COUNT_EXACT,
-                                            },
-                                            {
-                                                
ARROW_STATISTICS_KEY_MAX_VALUE_EXACT,
-                                            }},
-                                           {{
-                                                
ArrayStatistics::ValueType{int64_t{3}},
-                                            },
-                                            {
-                                                
ArrayStatistics::ValueType{"c"},
-                                            }}));
-  AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
+using BaseBinaryOrBinaryViewOrFixedSizeBinaryLikeArrowTypes =
+    ::testing::Types<BinaryType, LargeBinaryType, BinaryViewType, 
FixedSizeBinaryType,
+                     StringType, LargeStringType, StringViewType>;

Review Comment:
   Could you move this to `cpp/src/arrow/testing/gtest_util.h`?
   
   How about renaming this to `AllBinaryOrBinrayViewLikeArrowTypes`?



##########
cpp/src/arrow/record_batch_test.cc:
##########
@@ -1423,37 +1453,140 @@ TEST_F(TestRecordBatch, 
MakeStatisticsArrayMaxApproximate) {
   AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
 }
 
-TEST_F(TestRecordBatch, MakeStatisticsArrayString) {
-  auto schema =
-      ::arrow::schema({field("no-statistics", boolean()), field("string", 
utf8())});
-  auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]");
-  auto string_array_data = ArrayFromJSON(utf8(), "[\"a\", null, 
\"c\"]")->data()->Copy();
-  string_array_data->statistics = std::make_shared<ArrayStatistics>();
-  string_array_data->statistics->is_max_exact = true;
-  string_array_data->statistics->max = "c";
-  auto string_array = MakeArray(std::move(string_array_data));
-  auto batch = RecordBatch::Make(schema, string_array->length(),
-                                 {no_statistics_array, string_array});
+template <typename DataType>
+class TestRecordBatchMakeStatisticsArrayStringBase {
+ public:
+  std::shared_ptr<::arrow::DataType> type(int byte_width = 1) {
+    if constexpr (std::is_same_v<DataType, FixedSizeBinaryType>) {
+      return fixed_size_binary(byte_width);
+    } else {
+      return TypeTraits<DataType>::type_singleton();
+    }
+  }
+  std::shared_ptr<Array> GenerateString(
+      const std::shared_ptr<::arrow::DataType>& dataType) {
+    if (dataType->id() == Type::FIXED_SIZE_BINARY) {
+      FixedSizeBinaryType* type = 
static_cast<FixedSizeBinaryType*>(dataType.get());
+      int byte_width = type->byte_width();

Review Comment:
   Do we need to cast it to use `byte_width()`?
   
   ```suggestion
         auto byte_width = data_type->byte_width();
   ```



##########
cpp/src/arrow/record_batch.cc:
##########
@@ -556,6 +558,21 @@ Status EnumerateStatistics(const RecordBatch& 
record_batch, OnStatistics on_stat
   }
   return Status::OK();
 }
+struct StringBuilderVisitor {
+  template <typename DataType>
+  enable_if_has_string_view<DataType, Status> Visit(const DataType&,
+                                                    ArrayBuilder* raw_builder,
+                                                    const std::string& value) {
+    using Builder = typename TypeTraits<DataType>::BuilderType;
+    Builder* builder = static_cast<Builder*>(raw_builder);

Review Comment:
   ```suggestion
       auto builder = static_cast<Builder*>(raw_builder);
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to