This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new fd1bf8e688 GH-44579: [C++] Use array type to compute min/max 
statistics Arrow type (#45094)
fd1bf8e688 is described below

commit fd1bf8e6886646ccd71a03de940190d5baca1ad2
Author: Sutou Kouhei <[email protected]>
AuthorDate: Tue Dec 31 12:05:43 2024 +0900

    GH-44579: [C++] Use array type to compute min/max statistics Arrow type 
(#45094)
    
    ### Rationale for this change
    
    `arrow::ArrayStatistics` uses raw C++ types such as `int64_t` and 
`std::string` for its min/max values. We need to convert these raw C++ types 
to Arrow types when we use `arrow::ArrayStatistics` to generate a statistics 
array (GH-45038).
    
    We can't map `std::string` to a single Arrow type because it may 
represent `arrow::binary()`, `arrow::utf8()`, or another binary-like type.
    
    ### What changes are included in this PR?
    
    Use the `arrow::DataType` of the associated array when converting 
`arrow::ArrayStatistics`'s min/max raw C++ types to Arrow types.
    
    ### Are these changes tested?
    
    Yes.
    
    ### Are there any user-facing changes?
    
    No.
    * GitHub Issue: #44579
    
    Authored-by: Sutou Kouhei <[email protected]>
    Signed-off-by: Sutou Kouhei <[email protected]>
---
 cpp/src/arrow/array/statistics.h   | 64 +++++++++++++++++++++++++++++++++-----
 cpp/src/arrow/record_batch.cc      |  8 +++--
 cpp/src/arrow/record_batch_test.cc |  2 +-
 3 files changed, 63 insertions(+), 11 deletions(-)

diff --git a/cpp/src/arrow/array/statistics.h b/cpp/src/arrow/array/statistics.h
index 99b853ab0f..6ccd2f4766 100644
--- a/cpp/src/arrow/array/statistics.h
+++ b/cpp/src/arrow/array/statistics.h
@@ -22,7 +22,7 @@
 #include <string>
 #include <variant>
 
-#include "arrow/type_fwd.h"
+#include "arrow/type.h"
 #include "arrow/util/visibility.h"
 
 namespace arrow {
@@ -34,22 +34,38 @@ namespace arrow {
 /// as Apache Parquet may have statistics. Statistics associated with
 /// data source can be read unified API via this class.
 struct ARROW_EXPORT ArrayStatistics {
+  /// \brief The type for maximum and minimum values. If the target
+  /// value exists, one of them is used. `std::nullopt` is used
+  /// otherwise.
   using ValueType = std::variant<bool, int64_t, uint64_t, double, std::string>;
 
   static const std::shared_ptr<DataType>& ValueToArrowType(
-      const std::optional<ValueType>& value) {
+      const std::optional<ValueType>& value,
+      const std::shared_ptr<DataType>& array_type) {
     if (!value.has_value()) {
       return null();
     }
 
     struct Visitor {
+      const std::shared_ptr<DataType>& array_type;
+
       const std::shared_ptr<DataType>& operator()(const bool&) { return 
boolean(); }
       const std::shared_ptr<DataType>& operator()(const int64_t&) { return 
int64(); }
       const std::shared_ptr<DataType>& operator()(const uint64_t&) { return 
uint64(); }
       const std::shared_ptr<DataType>& operator()(const double&) { return 
float64(); }
-      // GH-44579: How to support binary data?
-      const std::shared_ptr<DataType>& operator()(const std::string&) { return 
utf8(); }
-    } visitor;
+      const std::shared_ptr<DataType>& operator()(const std::string&) {
+        switch (array_type->id()) {
+          case Type::STRING:
+          case Type::BINARY:
+          case Type::FIXED_SIZE_BINARY:
+          case Type::LARGE_STRING:
+          case Type::LARGE_BINARY:
+            return array_type;
+          default:
+            return utf8();
+        }
+      }
+    } visitor{array_type};
     return std::visit(visitor, value.value());
   }
 
@@ -62,7 +78,24 @@ struct ARROW_EXPORT ArrayStatistics {
   /// \brief The minimum value, may not be set
   std::optional<ValueType> min = std::nullopt;
 
-  const std::shared_ptr<DataType>& MinArrowType() { return 
ValueToArrowType(min); }
+  /// \brief Compute Arrow type of the minimum value.
+  ///
+  /// If \ref ValueType is `std::string`, `array_type` may be
+  /// used. If `array_type` is a binary-like type such as \ref
+  /// arrow::binary and \ref arrow::large_utf8, `array_type` is
+  /// returned. \ref arrow::utf8 is returned otherwise.
+  ///
+  /// If \ref ValueType isn't `std::string`, `array_type` isn't used.
+  ///
+  /// \param array_type The Arrow type of the associated array.
+  ///
+  /// \return \ref arrow::null if the minimum value is `std::nullopt`,
+  ///         Arrow type based on \ref ValueType of the \ref min
+  ///         otherwise.
+  const std::shared_ptr<DataType>& MinArrowType(
+      const std::shared_ptr<DataType>& array_type) {
+    return ValueToArrowType(min, array_type);
+  }
 
   /// \brief Whether the minimum value is exact or not
   bool is_min_exact = false;
@@ -70,7 +103,24 @@ struct ARROW_EXPORT ArrayStatistics {
   /// \brief The maximum value, may not be set
   std::optional<ValueType> max = std::nullopt;
 
-  const std::shared_ptr<DataType>& MaxArrowType() { return 
ValueToArrowType(max); }
+  /// \brief Compute Arrow type of the maximum value.
+  ///
+  /// If \ref ValueType is `std::string`, `array_type` may be
+  /// used. If `array_type` is a binary-like type such as \ref
+  /// arrow::binary and \ref arrow::large_utf8, `array_type` is
+  /// returned. \ref arrow::utf8 is returned otherwise.
+  ///
+  /// If \ref ValueType isn't `std::string`, `array_type` isn't used.
+  ///
+  /// \param array_type The Arrow type of the associated array.
+  ///
+  /// \return \ref arrow::null if the maximum value is `std::nullopt`,
+  ///         Arrow type based on \ref ValueType of the \ref max
+  ///         otherwise.
+  const std::shared_ptr<DataType>& MaxArrowType(
+      const std::shared_ptr<DataType>& array_type) {
+    return ValueToArrowType(max, array_type);
+  }
 
   /// \brief Whether the maximum value is exact or not
   bool is_max_exact = false;
diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc
index 3f8237188d..5ce33a3731 100644
--- a/cpp/src/arrow/record_batch.cc
+++ b/cpp/src/arrow/record_batch.cc
@@ -493,8 +493,10 @@ Status EnumerateStatistics(const RecordBatch& 
record_batch, OnStatistics on_stat
   RETURN_NOT_OK(on_statistics(statistics));
   statistics.start_new_column = false;
 
-  const auto num_fields = record_batch.schema()->num_fields();
+  const auto& schema = record_batch.schema();
+  const auto num_fields = schema->num_fields();
   for (int nth_column = 0; nth_column < num_fields; ++nth_column) {
+    const auto& field = schema->field(nth_column);
     auto column_statistics = record_batch.column(nth_column)->statistics();
     if (!column_statistics) {
       continue;
@@ -527,7 +529,7 @@ Status EnumerateStatistics(const RecordBatch& record_batch, 
OnStatistics on_stat
       } else {
         statistics.key = ARROW_STATISTICS_KEY_MIN_VALUE_APPROXIMATE;
       }
-      statistics.type = column_statistics->MinArrowType();
+      statistics.type = column_statistics->MinArrowType(field->type());
       statistics.value = column_statistics->min.value();
       RETURN_NOT_OK(on_statistics(statistics));
       statistics.start_new_column = false;
@@ -540,7 +542,7 @@ Status EnumerateStatistics(const RecordBatch& record_batch, 
OnStatistics on_stat
       } else {
         statistics.key = ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE;
       }
-      statistics.type = column_statistics->MaxArrowType();
+      statistics.type = column_statistics->MaxArrowType(field->type());
       statistics.value = column_statistics->max.value();
       RETURN_NOT_OK(on_statistics(statistics));
       statistics.start_new_column = false;
diff --git a/cpp/src/arrow/record_batch_test.cc 
b/cpp/src/arrow/record_batch_test.cc
index 21202c6acb..21d51ae506 100644
--- a/cpp/src/arrow/record_batch_test.cc
+++ b/cpp/src/arrow/record_batch_test.cc
@@ -1116,7 +1116,7 @@ Result<std::shared_ptr<Array>> MakeStatisticsArray(
       }
       keys_indices.push_back(key_index);
 
-      auto values_type = ArrayStatistics::ValueToArrowType(value);
+      auto values_type = ArrayStatistics::ValueToArrowType(value, 
arrow::null());
       int8_t values_type_code = 0;
       for (; values_type_code < static_cast<int32_t>(values_types.size());
            ++values_type_code) {

Reply via email to