kou commented on code in PR #46031:
URL: https://github.com/apache/arrow/pull/46031#discussion_r2030553026


##########
cpp/src/arrow/record_batch.cc:
##########
@@ -556,6 +558,21 @@ Status EnumerateStatistics(const RecordBatch& 
record_batch, OnStatistics on_stat
   }
   return Status::OK();
 }
+struct StringVisitorBuilder {
+  template <typename DataType,
+            typename Builder = typename TypeTraits<DataType>::BuilderType>
+  enable_if_has_string_view<DataType, Status> Visit(const DataType&,
+                                                    ArrayBuilder* raw_builder,
+                                                    const std::string& value) {
+    Builder* builder = static_cast<Builder*>(raw_builder);
+    return builder->Append(value);
+  }
+
+  Status Visit(const DataType& type, ArrayBuilder*, const std::string&) {
+    return Status::NotImplemented(

Review Comment:
   How about using `Status::Invalid()` instead? We never support other types in 
this class, right?



##########
cpp/src/arrow/record_batch_test.cc:
##########
@@ -1137,6 +1175,10 @@ Result<std::shared_ptr<Array>> MakeStatisticsArray(
   for (size_t i = 0; i < nested_statistics_keys.size(); ++i) {
     const auto& statistics_keys = nested_statistics_keys[i];
     const auto& statistics_values = nested_statistics_values[i];
+    auto string_type = std::shared_ptr<DataType>(nullptr);

Review Comment:
   ```suggestion
       auto array_type = arrow::null();
   ```



##########
cpp/src/arrow/record_batch_test.cc:
##########
@@ -1080,17 +1118,17 @@ Result<std::shared_ptr<Array>> BuildArray(const 
std::vector<ValueType>& values)
     }
     Result<std::shared_ptr<Array>> operator()(const std::string&) {
       auto values = StatisticsValuesToRawValues<std::string>(values_);
-      return BuildArray<StringType>(values);
+      return BuildArray(string_type, values);
     }
-  } builder(values);
+  } builder(values, string_type);
   return std::visit(builder, values[0]);
 }
 
 Result<std::shared_ptr<Array>> MakeStatisticsArray(
     const std::string& columns_json,
     const std::vector<std::vector<std::string>>& nested_statistics_keys,
-    const std::vector<std::vector<ArrayStatistics::ValueType>>&
-        nested_statistics_values) {
+    const std::vector<std::vector<ArrayStatistics::ValueType>>& 
nested_statistics_values,
+    std::vector<std::shared_ptr<DataType>> string_types = {}) {

Review Comment:
   ```suggestion
       const std::vector<std::shared_ptr<DataType>>& array_types = {}) {
   ```



##########
cpp/src/arrow/record_batch.cc:
##########
@@ -556,6 +558,21 @@ Status EnumerateStatistics(const RecordBatch& 
record_batch, OnStatistics on_stat
   }
   return Status::OK();
 }
+struct StringVisitorBuilder {

Review Comment:
   How about renaming this to `StringBuilderVisitor` because this is a visitor 
not a builder?



##########
cpp/src/arrow/record_batch_test.cc:
##########
@@ -1033,15 +1039,44 @@ Result<std::shared_ptr<Array>> BuildArray(
   }
   return builder.Finish();
 }
+struct StringBuilderVisitor {
+  template <typename DataType,
+            typename Builder = typename TypeTraits<DataType>::BuilderType>
+  enable_if_t<is_base_binary_type<DataType>::value, Status> Visit(
+      const DataType&, ArrayBuilder* raw_builder,
+      const std::vector<std::string>& values) {
+    Builder* builder = static_cast<Builder*>(raw_builder);
+    ARROW_RETURN_NOT_OK(builder->AppendValues(values));
+    return Status::OK();
+  }
 
-template <typename ArrowType, typename = enable_if_string<ArrowType>>
-Result<std::shared_ptr<Array>> BuildArray(const std::vector<std::string>& 
values) {
-  using BuilderType = typename TypeTraits<ArrowType>::BuilderType;
-  BuilderType builder;
-  for (const auto& value : values) {
-    ARROW_RETURN_NOT_OK(builder.Append(value));
+  template <typename DataType,
+            typename Builder = typename TypeTraits<DataType>::BuilderType>
+  enable_if_t<is_binary_view_like_type<DataType>::value ||
+                  std::is_same_v<DataType, FixedSizeBinaryType>,
+              Status>
+  Visit(const DataType&, ArrayBuilder* raw_builder,
+        const std::vector<std::string>& values) {
+    Builder* builder = static_cast<Builder*>(raw_builder);
+    // FixedSizeBinary, StringView, BinaryView do not have AppendValues method

Review Comment:
   Can we simplify this code by using `Append()` for `Binary`/`String`?
   This is a test. So we can use unoptimized methods.



##########
cpp/src/arrow/record_batch.cc:
##########
@@ -556,6 +558,21 @@ Status EnumerateStatistics(const RecordBatch& 
record_batch, OnStatistics on_stat
   }
   return Status::OK();
 }
+struct StringVisitorBuilder {
+  template <typename DataType,
+            typename Builder = typename TypeTraits<DataType>::BuilderType>
+  enable_if_has_string_view<DataType, Status> Visit(const DataType&,
+                                                    ArrayBuilder* raw_builder,
+                                                    const std::string& value) {
+    Builder* builder = static_cast<Builder*>(raw_builder);

Review Comment:
   Can we move `Builder` to `Visit()`?
   
   ```suggestion
     template <typename DataType>
     enable_if_has_string_view<DataType, Status> Visit(const DataType&,
                                                       ArrayBuilder* 
raw_builder,
                                                       const std::string& 
value) {
       using Builder = typename TypeTraits<DataType>::BuilderType;
       auto builder = static_cast<Builder*>(raw_builder);
   ```



##########
cpp/src/arrow/record_batch_test.cc:
##########
@@ -1137,6 +1175,10 @@ Result<std::shared_ptr<Array>> MakeStatisticsArray(
   for (size_t i = 0; i < nested_statistics_keys.size(); ++i) {
     const auto& statistics_keys = nested_statistics_keys[i];
     const auto& statistics_values = nested_statistics_values[i];
+    auto string_type = std::shared_ptr<DataType>(nullptr);
+    if (!string_types.empty()) {
+      string_type = string_types[i];
+    }

Review Comment:
   ```suggestion
       const auto& array_type = (i < array_types.size()) ? array_types[i] : 
arrow::null();
   ```



##########
cpp/src/arrow/record_batch_test.cc:
##########
@@ -1424,33 +1471,86 @@ TEST_F(TestRecordBatch, 
MakeStatisticsArrayMaxApproximate) {
 }
 
 TEST_F(TestRecordBatch, MakeStatisticsArrayString) {
-  auto schema =
-      ::arrow::schema({field("no-statistics", boolean()), field("string", 
utf8())});
-  auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]");
-  auto string_array_data = ArrayFromJSON(utf8(), "[\"a\", null, 
\"c\"]")->data()->Copy();
-  string_array_data->statistics = std::make_shared<ArrayStatistics>();
-  string_array_data->statistics->is_max_exact = true;
-  string_array_data->statistics->max = "c";
-  auto string_array = MakeArray(std::move(string_array_data));
-  auto batch = RecordBatch::Make(schema, string_array->length(),
-                                 {no_statistics_array, string_array});
+  auto CheckString = [](const std::shared_ptr<DataType>& type) {
+    auto schema =
+        ::arrow::schema({field("no-statistics", boolean()), field("string", 
type)});
+    auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]");
+    auto string_array_data = ArrayFromJSON(type, "[\"a\", null, 
\"c\"]")->data()->Copy();
+    string_array_data->statistics = std::make_shared<ArrayStatistics>();
+    string_array_data->statistics->is_max_exact = true;
+    string_array_data->statistics->max = "c";
+    auto string_array = MakeArray(std::move(string_array_data));
+    auto batch = RecordBatch::Make(schema, string_array->length(),
+                                   {no_statistics_array, string_array});
+
+    ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray());
+
+    ASSERT_OK_AND_ASSIGN(auto expected_statistics_array,
+                         MakeStatisticsArray("[null, 1]",
+                                             {{
+                                                  
ARROW_STATISTICS_KEY_ROW_COUNT_EXACT,
+                                              },
+                                              {
+                                                  
ARROW_STATISTICS_KEY_MAX_VALUE_EXACT,
+                                              }},
+                                             {{
+                                                  
ArrayStatistics::ValueType{int64_t{3}},
+                                              },
+                                              {
+                                                  
ArrayStatistics::ValueType{"c"},
+                                              }},
+                                             {nullptr, type}));
+    AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
+  };
 
+  // Check each type
+  CheckString(binary());
+  CheckString(utf8());
+  CheckString(large_binary());
+  CheckString(large_utf8());
+  CheckString(binary_view());
+  CheckString(utf8_view());
+  CheckString(fixed_size_binary(1));

Review Comment:
   Could you use `TYPED_TEST_SUITE()` instead of parameterizing manually?
   
   ```cpp
   template <typename DataType>
   class TestRecordBatchMakeStatisticsArrayString : public ::testing::Test {
    public:
     void TestMax() {
       // ...
     }
   };
   // BaseBinaryOrBinaryViewLikeArrowTypes is defined in 
cpp/src/arrow/testing/gtest_util.h.
   TYPED_TEST_SUITE(TestRecordBatchMakeStatisticsArrayString, 
BaseBinaryOrBinaryViewLikeArrowTypes);
   TYPED_TEST(TestRecordBatchMakeStatisticsArrayString, Max) { this->TestMax();}
   ```



##########
cpp/src/arrow/record_batch_test.cc:
##########
@@ -1424,33 +1471,86 @@ TEST_F(TestRecordBatch, 
MakeStatisticsArrayMaxApproximate) {
 }
 
 TEST_F(TestRecordBatch, MakeStatisticsArrayString) {
-  auto schema =
-      ::arrow::schema({field("no-statistics", boolean()), field("string", 
utf8())});
-  auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]");
-  auto string_array_data = ArrayFromJSON(utf8(), "[\"a\", null, 
\"c\"]")->data()->Copy();
-  string_array_data->statistics = std::make_shared<ArrayStatistics>();
-  string_array_data->statistics->is_max_exact = true;
-  string_array_data->statistics->max = "c";
-  auto string_array = MakeArray(std::move(string_array_data));
-  auto batch = RecordBatch::Make(schema, string_array->length(),
-                                 {no_statistics_array, string_array});
+  auto CheckString = [](const std::shared_ptr<DataType>& type) {
+    auto schema =
+        ::arrow::schema({field("no-statistics", boolean()), field("string", 
type)});
+    auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]");
+    auto string_array_data = ArrayFromJSON(type, "[\"a\", null, 
\"c\"]")->data()->Copy();
+    string_array_data->statistics = std::make_shared<ArrayStatistics>();
+    string_array_data->statistics->is_max_exact = true;
+    string_array_data->statistics->max = "c";
+    auto string_array = MakeArray(std::move(string_array_data));
+    auto batch = RecordBatch::Make(schema, string_array->length(),
+                                   {no_statistics_array, string_array});
+
+    ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray());
+
+    ASSERT_OK_AND_ASSIGN(auto expected_statistics_array,
+                         MakeStatisticsArray("[null, 1]",
+                                             {{
+                                                  
ARROW_STATISTICS_KEY_ROW_COUNT_EXACT,
+                                              },
+                                              {
+                                                  
ARROW_STATISTICS_KEY_MAX_VALUE_EXACT,
+                                              }},
+                                             {{
+                                                  
ArrayStatistics::ValueType{int64_t{3}},
+                                              },
+                                              {
+                                                  
ArrayStatistics::ValueType{"c"},
+                                              }},
+                                             {nullptr, type}));
+    AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
+  };
 
+  // Check each type
+  CheckString(binary());
+  CheckString(utf8());
+  CheckString(large_binary());
+  CheckString(large_utf8());
+  CheckString(binary_view());
+  CheckString(utf8_view());
+  CheckString(fixed_size_binary(1));
+
+  // Check the combination of various string types

Review Comment:
   Is this combination case important?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to