This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new d748acee35 GH-44010: [C++] Add 
`arrow::RecordBatch::MakeStatisticsArray()` (#44252)
d748acee35 is described below

commit d748acee35ecaf88cf6191048c2cac43007a76b7
Author: Sutou Kouhei <[email protected]>
AuthorDate: Sat Nov 9 05:37:27 2024 +0900

    GH-44010: [C++] Add `arrow::RecordBatch::MakeStatisticsArray()` (#44252)
    
    ### Rationale for this change
    
    The statistics schema for the Arrow C data interface (GH-43553) is complex 
because it uses nested types (struct, map and union), so a reusable 
implementation for building the statistics array is useful.
    
    ### What changes are included in this PR?
    
    `arrow::RecordBatch::MakeStatisticsArray()` is a convenience function that 
converts the `arrow::ArrayStatistics` in an `arrow::RecordBatch` to an 
`arrow::Array` for the Arrow C data interface.
    
    ### Are these changes tested?
    
    Yes.
    
    ### Are there any user-facing changes?
    
    Yes.
    * GitHub Issue: #44010
    
    Authored-by: Sutou Kouhei <[email protected]>
    Signed-off-by: Sutou Kouhei <[email protected]>
---
 cpp/src/arrow/array/statistics.h   |  22 ++
 cpp/src/arrow/c/abi.h              |  18 ++
 cpp/src/arrow/record_batch.cc      | 219 +++++++++++++++++++
 cpp/src/arrow/record_batch.h       |  12 +
 cpp/src/arrow/record_batch_test.cc | 436 +++++++++++++++++++++++++++++++++++++
 5 files changed, 707 insertions(+)

diff --git a/cpp/src/arrow/array/statistics.h b/cpp/src/arrow/array/statistics.h
index 523f877bbe..e7365a9d7f 100644
--- a/cpp/src/arrow/array/statistics.h
+++ b/cpp/src/arrow/array/statistics.h
@@ -22,6 +22,7 @@
 #include <string>
 #include <variant>
 
+#include "arrow/type_fwd.h"
 #include "arrow/util/visibility.h"
 
 namespace arrow {
@@ -34,6 +35,23 @@ namespace arrow {
 struct ARROW_EXPORT ArrayStatistics {
   using ValueType = std::variant<bool, int64_t, uint64_t, double, std::string>;
 
+  static const std::shared_ptr<DataType>& ValueToArrowType(
+      const std::optional<ValueType>& value) {
+    if (!value.has_value()) {
+      return null();
+    }
+
+    struct Visitor {
+      const std::shared_ptr<DataType>& operator()(const bool&) { return 
boolean(); }
+      const std::shared_ptr<DataType>& operator()(const int64_t&) { return 
int64(); }
+      const std::shared_ptr<DataType>& operator()(const uint64_t&) { return 
uint64(); }
+      const std::shared_ptr<DataType>& operator()(const double&) { return 
float64(); }
+      // GH-44579: How to support binary data?
+      const std::shared_ptr<DataType>& operator()(const std::string&) { return 
utf8(); }
+    } visitor;
+    return std::visit(visitor, value.value());
+  }
+
   /// \brief The number of null values, may not be set
   std::optional<int64_t> null_count = std::nullopt;
 
@@ -43,12 +61,16 @@ struct ARROW_EXPORT ArrayStatistics {
   /// \brief The minimum value, may not be set
   std::optional<ValueType> min = std::nullopt;
 
+  const std::shared_ptr<DataType>& MinArrowType() { return 
ValueToArrowType(min); }
+
   /// \brief Whether the minimum value is exact or not
   bool is_min_exact = false;
 
   /// \brief The maximum value, may not be set
   std::optional<ValueType> max = std::nullopt;
 
+  const std::shared_ptr<DataType>& MaxArrowType() { return 
ValueToArrowType(max); }
+
   /// \brief Whether the maximum value is exact or not
   bool is_max_exact = false;
 
diff --git a/cpp/src/arrow/c/abi.h b/cpp/src/arrow/c/abi.h
index 9dc142bd08..e44933a6af 100644
--- a/cpp/src/arrow/c/abi.h
+++ b/cpp/src/arrow/c/abi.h
@@ -80,6 +80,24 @@ struct ArrowArray {
   void* private_data;
 };
 
+#  define ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_EXACT 
"ARROW:average_byte_width:exact"
+#  define ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_APPROXIMATE \
+    "ARROW:average_byte_width:approximate"
+#  define ARROW_STATISTICS_KEY_DISTINCT_COUNT_EXACT 
"ARROW:distinct_count:exact"
+#  define ARROW_STATISTICS_KEY_DISTINCT_COUNT_APPROXIMATE \
+    "ARROW:distinct_count:approximate"
+#  define ARROW_STATISTICS_KEY_MAX_BYTE_WIDTH_EXACT 
"ARROW:max_byte_width:exact"
+#  define ARROW_STATISTICS_KEY_MAX_BYTE_WIDTH_APPROXIMATE \
+    "ARROW:max_byte_width:approximate"
+#  define ARROW_STATISTICS_KEY_MAX_VALUE_EXACT "ARROW:max_value:exact"
+#  define ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE 
"ARROW:max_value:approximate"
+#  define ARROW_STATISTICS_KEY_MIN_VALUE_EXACT "ARROW:min_value:exact"
+#  define ARROW_STATISTICS_KEY_MIN_VALUE_APPROXIMATE 
"ARROW:min_value:approximate"
+#  define ARROW_STATISTICS_KEY_NULL_COUNT_EXACT "ARROW:null_count:exact"
+#  define ARROW_STATISTICS_KEY_NULL_COUNT_APPROXIMATE 
"ARROW:null_count:approximate"
+#  define ARROW_STATISTICS_KEY_ROW_COUNT_EXACT "ARROW:row_count:exact"
+#  define ARROW_STATISTICS_KEY_ROW_COUNT_APPROXIMATE 
"ARROW:row_count:approximate"
+
 #endif  // ARROW_C_DATA_INTERFACE
 
 #ifndef ARROW_C_DEVICE_DATA_INTERFACE
diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc
index e3a8c0d710..3f8237188d 100644
--- a/cpp/src/arrow/record_batch.cc
+++ b/cpp/src/arrow/record_batch.cc
@@ -26,8 +26,13 @@
 #include <utility>
 
 #include "arrow/array.h"
+#include "arrow/array/builder_binary.h"
+#include "arrow/array/builder_dict.h"
+#include "arrow/array/builder_nested.h"
+#include "arrow/array/builder_union.h"
 #include "arrow/array/concatenate.h"
 #include "arrow/array/validate.h"
+#include "arrow/c/abi.h"
 #include "arrow/pretty_print.h"
 #include "arrow/status.h"
 #include "arrow/table.h"
@@ -465,6 +470,220 @@ Result<std::shared_ptr<RecordBatch>> 
RecordBatch::ViewOrCopyTo(
   return Make(schema_, num_rows(), std::move(copied_columns));
 }
 
+namespace {
+struct EnumeratedStatistics {
+  int nth_statistics = 0;
+  bool start_new_column = false;
+  std::optional<int32_t> nth_column = std::nullopt;
+  const char* key = nullptr;
+  std::shared_ptr<DataType> type = nullptr;
+  ArrayStatistics::ValueType value = false;
+};
+using OnStatistics =
+    std::function<Status(const EnumeratedStatistics& enumerated_statistics)>;
+Status EnumerateStatistics(const RecordBatch& record_batch, OnStatistics 
on_statistics) {
+  EnumeratedStatistics statistics;
+  statistics.nth_statistics = 0;
+  statistics.start_new_column = true;
+  statistics.nth_column = std::nullopt;
+
+  statistics.key = ARROW_STATISTICS_KEY_ROW_COUNT_EXACT;
+  statistics.type = int64();
+  statistics.value = record_batch.num_rows();
+  RETURN_NOT_OK(on_statistics(statistics));
+  statistics.start_new_column = false;
+
+  const auto num_fields = record_batch.schema()->num_fields();
+  for (int nth_column = 0; nth_column < num_fields; ++nth_column) {
+    auto column_statistics = record_batch.column(nth_column)->statistics();
+    if (!column_statistics) {
+      continue;
+    }
+
+    statistics.start_new_column = true;
+    statistics.nth_column = nth_column;
+    if (column_statistics->null_count.has_value()) {
+      statistics.nth_statistics++;
+      statistics.key = ARROW_STATISTICS_KEY_NULL_COUNT_EXACT;
+      statistics.type = int64();
+      statistics.value = column_statistics->null_count.value();
+      RETURN_NOT_OK(on_statistics(statistics));
+      statistics.start_new_column = false;
+    }
+
+    if (column_statistics->distinct_count.has_value()) {
+      statistics.nth_statistics++;
+      statistics.key = ARROW_STATISTICS_KEY_DISTINCT_COUNT_EXACT;
+      statistics.type = int64();
+      statistics.value = column_statistics->distinct_count.value();
+      RETURN_NOT_OK(on_statistics(statistics));
+      statistics.start_new_column = false;
+    }
+
+    if (column_statistics->min.has_value()) {
+      statistics.nth_statistics++;
+      if (column_statistics->is_min_exact) {
+        statistics.key = ARROW_STATISTICS_KEY_MIN_VALUE_EXACT;
+      } else {
+        statistics.key = ARROW_STATISTICS_KEY_MIN_VALUE_APPROXIMATE;
+      }
+      statistics.type = column_statistics->MinArrowType();
+      statistics.value = column_statistics->min.value();
+      RETURN_NOT_OK(on_statistics(statistics));
+      statistics.start_new_column = false;
+    }
+
+    if (column_statistics->max.has_value()) {
+      statistics.nth_statistics++;
+      if (column_statistics->is_max_exact) {
+        statistics.key = ARROW_STATISTICS_KEY_MAX_VALUE_EXACT;
+      } else {
+        statistics.key = ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE;
+      }
+      statistics.type = column_statistics->MaxArrowType();
+      statistics.value = column_statistics->max.value();
+      RETURN_NOT_OK(on_statistics(statistics));
+      statistics.start_new_column = false;
+    }
+  }
+  return Status::OK();
+}
+}  // namespace
+
+Result<std::shared_ptr<Array>> RecordBatch::MakeStatisticsArray(
+    MemoryPool* memory_pool) const {
+  // Statistics schema:
+  // struct<
+  //   column: int32,
+  //   statistics: map<
+  //     key: dictionary<
+  //       indices: int32,
+  //       dictionary: utf8,
+  //     >,
+  //     items: dense_union<...all needed types...>,
+  //   >
+  // >
+
+  // Statistics schema doesn't define a static dense union type for
+  // values. Each statistics schema has a dense union type that has
+  // the needed value types. The following block collects these types.
+  std::vector<std::shared_ptr<Field>> values_types;
+  std::vector<int8_t> values_type_indexes;
+  RETURN_NOT_OK(EnumerateStatistics(*this, [&](const EnumeratedStatistics& 
statistics) {
+    int8_t i = 0;
+    for (const auto& field : values_types) {
+      if (field->type()->id() == statistics.type->id()) {
+        break;
+      }
+      i++;
+    }
+    if (i == static_cast<int8_t>(values_types.size())) {
+      values_types.push_back(field(statistics.type->name(), statistics.type));
+    }
+    values_type_indexes.push_back(i);
+    return Status::OK();
+  }));
+
+  // statistics.key: dictionary<indices: int32, dictionary: utf8>
+  auto keys_type = dictionary(int32(), utf8(), false);
+  // statistics.items: dense_union<...all needed types...>
+  auto values_type = dense_union(values_types);
+  // struct<
+  //   column: int32,
+  //   statistics: map<
+  //     key: dictionary<
+  //       indices: int32,
+  //       dictionary: utf8,
+  //     >,
+  //     items: dense_union<...all needed types...>,
+  //   >
+  // >
+  auto statistics_type =
+      struct_({field("column", int32()),
+               field("statistics", map(keys_type, values_type, false))});
+
+  std::vector<std::shared_ptr<ArrayBuilder>> field_builders;
+  // column: int32
+  auto columns_builder = std::make_shared<Int32Builder>(memory_pool);
+  
field_builders.push_back(std::static_pointer_cast<ArrayBuilder>(columns_builder));
+  // statistics.key: dictionary<indices: int32, dictionary: utf8>
+  auto keys_builder = std::make_shared<StringDictionary32Builder>();
+  // statistics.items: dense_union<...all needed types...>
+  std::vector<std::shared_ptr<ArrayBuilder>> values_builders;
+  for (const auto& values_type : values_types) {
+    std::unique_ptr<ArrayBuilder> values_builder;
+    RETURN_NOT_OK(MakeBuilder(memory_pool, values_type->type(), 
&values_builder));
+    
values_builders.push_back(std::shared_ptr<ArrayBuilder>(std::move(values_builder)));
+  }
+  auto items_builder = std::make_shared<DenseUnionBuilder>(
+      memory_pool, std::move(values_builders), values_type);
+  // statistics:
+  //   map<
+  //     key: dictionary<
+  //       indices: int32,
+  //       dictionary: utf8,
+  //     >,
+  //     items: dense_union<...all needed types...>,
+  //   >
+  auto values_builder = std::make_shared<MapBuilder>(
+      memory_pool, std::static_pointer_cast<ArrayBuilder>(keys_builder),
+      std::static_pointer_cast<ArrayBuilder>(items_builder));
+  
field_builders.push_back(std::static_pointer_cast<ArrayBuilder>(values_builder));
+  // struct<
+  //   column: int32,
+  //   statistics: map<
+  //     key: dictionary<
+  //       indices: int32,
+  //       dictionary: utf8,
+  //     >,
+  //     items: dense_union<...all needed types...>,
+  //   >
+  // >
+  StructBuilder builder(statistics_type, memory_pool, 
std::move(field_builders));
+
+  // Append statistics.
+  RETURN_NOT_OK(EnumerateStatistics(*this, [&](const EnumeratedStatistics& 
statistics) {
+    if (statistics.start_new_column) {
+      RETURN_NOT_OK(builder.Append());
+      if (statistics.nth_column.has_value()) {
+        RETURN_NOT_OK(columns_builder->Append(statistics.nth_column.value()));
+      } else {
+        RETURN_NOT_OK(columns_builder->AppendNull());
+      }
+      RETURN_NOT_OK(values_builder->Append());
+    }
+    RETURN_NOT_OK(keys_builder->Append(statistics.key,
+                                       
static_cast<int32_t>(strlen(statistics.key))));
+    const auto values_type_index = 
values_type_indexes[statistics.nth_statistics];
+    RETURN_NOT_OK(items_builder->Append(values_type_index));
+    struct Visitor {
+      ArrayBuilder* builder;
+
+      Status operator()(const bool& value) {
+        return static_cast<BooleanBuilder*>(builder)->Append(value);
+      }
+      Status operator()(const int64_t& value) {
+        return static_cast<Int64Builder*>(builder)->Append(value);
+      }
+      Status operator()(const uint64_t& value) {
+        return static_cast<UInt64Builder*>(builder)->Append(value);
+      }
+      Status operator()(const double& value) {
+        return static_cast<DoubleBuilder*>(builder)->Append(value);
+      }
+      Status operator()(const std::string& value) {
+        return static_cast<StringBuilder*>(builder)->Append(
+            value.data(), static_cast<int32_t>(value.size()));
+      }
+    } visitor;
+    visitor.builder = values_builders[values_type_index].get();
+    RETURN_NOT_OK(std::visit(visitor, statistics.value));
+    return Status::OK();
+  }));
+
+  return builder.Finish();
+}
+
 Status RecordBatch::Validate() const {
   return ValidateBatch(*this, /*full_validation=*/false);
 }
diff --git a/cpp/src/arrow/record_batch.h b/cpp/src/arrow/record_batch.h
index 95596e9c15..edbefc1c77 100644
--- a/cpp/src/arrow/record_batch.h
+++ b/cpp/src/arrow/record_batch.h
@@ -282,6 +282,18 @@ class ARROW_EXPORT RecordBatch {
 
   virtual DeviceAllocationType device_type() const = 0;
 
+  /// \brief Create a statistics array of this record batch
+  ///
+  /// The created array follows the C data interface statistics
+  /// specification. See
+  /// https://arrow.apache.org/docs/format/CDataInterfaceStatistics.html
+  /// for details.
+  ///
+  /// \param[in] pool the memory pool to allocate memory from
+  /// \return the statistics array of this record batch
+  Result<std::shared_ptr<Array>> MakeStatisticsArray(
+      MemoryPool* pool = default_memory_pool()) const;
+
  protected:
   RecordBatch(const std::shared_ptr<Schema>& schema, int64_t num_rows);
 
diff --git a/cpp/src/arrow/record_batch_test.cc 
b/cpp/src/arrow/record_batch_test.cc
index daf7109075..21202c6acb 100644
--- a/cpp/src/arrow/record_batch_test.cc
+++ b/cpp/src/arrow/record_batch_test.cc
@@ -25,9 +25,11 @@
 #include <vector>
 
 #include "arrow/array/array_base.h"
+#include "arrow/array/array_dict.h"
 #include "arrow/array/array_nested.h"
 #include "arrow/array/data.h"
 #include "arrow/array/util.h"
+#include "arrow/c/abi.h"
 #include "arrow/chunked_array.h"
 #include "arrow/status.h"
 #include "arrow/table.h"
@@ -980,6 +982,440 @@ TEST_F(TestRecordBatch, ToTensorUnsupportedMixedFloat16) {
       batch1->ToTensor());
 }
 
+namespace {
+template <typename ArrowType,
+          typename = std::enable_if_t<is_boolean_type<ArrowType>::value ||
+                                      is_number_type<ArrowType>::value>>
+Result<std::shared_ptr<Array>> BuildArray(
+    const std::vector<typename TypeTraits<ArrowType>::CType>& values) {
+  using BuilderType = typename TypeTraits<ArrowType>::BuilderType;
+  BuilderType builder;
+  for (const auto& value : values) {
+    ARROW_RETURN_NOT_OK(builder.Append(value));
+  }
+  return builder.Finish();
+}
+
+template <typename ArrowType, typename = enable_if_string<ArrowType>>
+Result<std::shared_ptr<Array>> BuildArray(const std::vector<std::string>& 
values) {
+  using BuilderType = typename TypeTraits<ArrowType>::BuilderType;
+  BuilderType builder;
+  for (const auto& value : values) {
+    ARROW_RETURN_NOT_OK(builder.Append(value));
+  }
+  return builder.Finish();
+}
+
+template <typename RawType>
+std::vector<RawType> StatisticsValuesToRawValues(
+    const std::vector<ArrayStatistics::ValueType>& values) {
+  std::vector<RawType> raw_values;
+  for (const auto& value : values) {
+    raw_values.push_back(std::get<RawType>(value));
+  }
+  return raw_values;
+}
+
+template <typename ValueType, typename = std::enable_if_t<std::is_same<
+                                  ArrayStatistics::ValueType, 
ValueType>::value>>
+Result<std::shared_ptr<Array>> BuildArray(const std::vector<ValueType>& 
values) {
+  struct Builder {
+    const std::vector<ArrayStatistics::ValueType>& values_;
+    explicit Builder(const std::vector<ArrayStatistics::ValueType>& values)
+        : values_(values) {}
+
+    Result<std::shared_ptr<Array>> operator()(const bool&) {
+      auto values = StatisticsValuesToRawValues<bool>(values_);
+      return BuildArray<BooleanType>(values);
+    }
+    Result<std::shared_ptr<Array>> operator()(const int64_t&) {
+      auto values = StatisticsValuesToRawValues<int64_t>(values_);
+      return BuildArray<Int64Type>(values);
+    }
+    Result<std::shared_ptr<Array>> operator()(const uint64_t&) {
+      auto values = StatisticsValuesToRawValues<uint64_t>(values_);
+      return BuildArray<UInt64Type>(values);
+    }
+    Result<std::shared_ptr<Array>> operator()(const double&) {
+      auto values = StatisticsValuesToRawValues<double>(values_);
+      return BuildArray<DoubleType>(values);
+    }
+    Result<std::shared_ptr<Array>> operator()(const std::string&) {
+      auto values = StatisticsValuesToRawValues<std::string>(values_);
+      return BuildArray<StringType>(values);
+    }
+  } builder(values);
+  return std::visit(builder, values[0]);
+}
+
+Result<std::shared_ptr<Array>> MakeStatisticsArray(
+    const std::string& columns_json,
+    const std::vector<std::vector<std::string>>& nested_statistics_keys,
+    const std::vector<std::vector<ArrayStatistics::ValueType>>&
+        nested_statistics_values) {
+  auto columns_type = int32();
+  auto columns_array = ArrayFromJSON(columns_type, columns_json);
+  const auto n_columns = columns_array->length();
+
+  // nested_statistics_keys:
+  //   {
+  //     {"ARROW:row_count:exact", "ARROW:null_count:exact"},
+  //     {"ARROW:max_value:exact"},
+  //     {"ARROW:max_value:exact", "ARROW:distinct_count:exact"},
+  //   }
+  // nested_statistics_values:
+  //   {
+  //     {int64_t{29}, int64_t{1}},
+  //     {double{2.9}},
+  //     {double{-2.9}, int64_t{2}},
+  //   }
+  // ->
+  // keys_dictionary:
+  //   {
+  //     "ARROW:row_count:exact",
+  //     "ARROW:null_count:exact",
+  //     "ARROW:max_value:exact",
+  //     "ARROW:distinct_count:exact",
+  //   }
+  // keys_indices: {0, 1, 2, 2, 3}
+  // values_types: {int64(), float64()}
+  // values_type_codes: {0, 1}
+  // values_values[0]: {int64_t{29}, int64_t{1}, int64_t{2}}
+  // values_values[1]: {double{2.9}, double{-2.9}}
+  // values_value_type_ids: {0, 0, 1, 1, 0}
+  // values_value_offsets: {0, 1, 0, 1, 2}
+  // statistics_offsets: {0, 2, 3, 5, 5}
+  std::vector<std::string> keys_dictionary;
+  std::vector<int32_t> keys_indices;
+  std::vector<std::shared_ptr<DataType>> values_types;
+  std::vector<int8_t> values_type_codes;
+  std::vector<std::vector<ArrayStatistics::ValueType>> values_values;
+  std::vector<int8_t> values_value_type_ids;
+  std::vector<int32_t> values_value_offsets;
+  std::vector<int32_t> statistics_offsets;
+
+  int32_t offset = 0;
+  std::vector<int32_t> values_value_offset_counters;
+  for (size_t i = 0; i < nested_statistics_keys.size(); ++i) {
+    const auto& statistics_keys = nested_statistics_keys[i];
+    const auto& statistics_values = nested_statistics_values[i];
+    statistics_offsets.push_back(offset);
+    for (size_t j = 0; j < statistics_keys.size(); ++j) {
+      const auto& key = statistics_keys[j];
+      const auto& value = statistics_values[j];
+      ++offset;
+
+      int32_t key_index = 0;
+      for (; key_index < static_cast<int32_t>(keys_dictionary.size()); 
++key_index) {
+        if (keys_dictionary[key_index] == key) {
+          break;
+        }
+      }
+      if (key_index == static_cast<int32_t>(keys_dictionary.size())) {
+        keys_dictionary.push_back(key);
+      }
+      keys_indices.push_back(key_index);
+
+      auto values_type = ArrayStatistics::ValueToArrowType(value);
+      int8_t values_type_code = 0;
+      for (; values_type_code < static_cast<int32_t>(values_types.size());
+           ++values_type_code) {
+        if (values_types[values_type_code] == values_type) {
+          break;
+        }
+      }
+      if (values_type_code == static_cast<int32_t>(values_types.size())) {
+        values_types.push_back(values_type);
+        values_type_codes.push_back(values_type_code);
+        values_values.emplace_back();
+        values_value_offset_counters.push_back(0);
+      }
+      values_values[values_type_code].push_back(value);
+      values_value_type_ids.push_back(values_type_code);
+      
values_value_offsets.push_back(values_value_offset_counters[values_type_code]++);
+    }
+  }
+  statistics_offsets.push_back(offset);
+
+  auto keys_type = dictionary(int32(), utf8(), false);
+  std::vector<std::shared_ptr<Field>> values_fields;
+  for (const auto& type : values_types) {
+    values_fields.push_back(field(type->name(), type));
+  }
+  auto values_type = dense_union(values_fields);
+  auto statistics_type = map(keys_type, values_type, false);
+  auto struct_type =
+      struct_({field("column", columns_type), field("statistics", 
statistics_type)});
+
+  ARROW_ASSIGN_OR_RAISE(auto keys_indices_array, 
BuildArray<Int32Type>(keys_indices));
+  ARROW_ASSIGN_OR_RAISE(auto keys_dictionary_array,
+                        BuildArray<StringType>(keys_dictionary));
+  ARROW_ASSIGN_OR_RAISE(
+      auto keys_array,
+      DictionaryArray::FromArrays(keys_type, keys_indices_array, 
keys_dictionary_array));
+
+  std::vector<std::shared_ptr<Array>> values_arrays;
+  for (const auto& values : values_values) {
+    ARROW_ASSIGN_OR_RAISE(auto values_array,
+                          BuildArray<ArrayStatistics::ValueType>(values));
+    values_arrays.push_back(values_array);
+  }
+  ARROW_ASSIGN_OR_RAISE(auto values_value_type_ids_array,
+                        BuildArray<Int8Type>(values_value_type_ids));
+  ARROW_ASSIGN_OR_RAISE(auto values_value_offsets_array,
+                        BuildArray<Int32Type>(values_value_offsets));
+  auto values_array = std::make_shared<DenseUnionArray>(
+      values_type, values_value_offsets_array->length(), values_arrays,
+      values_value_type_ids_array->data()->buffers[1],
+      values_value_offsets_array->data()->buffers[1]);
+  ARROW_ASSIGN_OR_RAISE(auto statistics_offsets_array,
+                        BuildArray<Int32Type>(statistics_offsets));
+  ARROW_ASSIGN_OR_RAISE(auto statistics_array,
+                        MapArray::FromArrays(statistics_type, 
statistics_offsets_array,
+                                             keys_array, values_array));
+  std::vector<std::shared_ptr<Array>> struct_arrays = 
{std::move(columns_array),
+                                                       
std::move(statistics_array)};
+  return std::make_shared<StructArray>(struct_type, n_columns, struct_arrays);
+}
+};  // namespace
+
+TEST_F(TestRecordBatch, MakeStatisticsArrayRowCount) {
+  auto schema = ::arrow::schema({field("int32", int32())});
+  auto int32_array = ArrayFromJSON(int32(), "[1, null, -1]");
+  auto batch = RecordBatch::Make(schema, int32_array->length(), {int32_array});
+
+  ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray());
+
+  ASSERT_OK_AND_ASSIGN(auto expected_statistics_array,
+                       MakeStatisticsArray("[null]",
+                                           {{
+                                               
ARROW_STATISTICS_KEY_ROW_COUNT_EXACT,
+                                           }},
+                                           {{
+                                               
ArrayStatistics::ValueType{int64_t{3}},
+                                           }}));
+  AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
+}
+
+TEST_F(TestRecordBatch, MakeStatisticsArrayNullCount) {
+  auto schema =
+      ::arrow::schema({field("no-statistics", boolean()), field("int32", 
int32())});
+  auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]");
+  auto int32_array_data = ArrayFromJSON(int32(), "[1, null, 
-1]")->data()->Copy();
+  int32_array_data->statistics = std::make_shared<ArrayStatistics>();
+  int32_array_data->statistics->null_count = 1;
+  auto int32_array = MakeArray(std::move(int32_array_data));
+  auto batch = RecordBatch::Make(schema, int32_array->length(),
+                                 {no_statistics_array, int32_array});
+
+  ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray());
+
+  ASSERT_OK_AND_ASSIGN(auto expected_statistics_array,
+                       MakeStatisticsArray("[null, 1]",
+                                           {{
+                                                
ARROW_STATISTICS_KEY_ROW_COUNT_EXACT,
+                                            },
+                                            {
+                                                
ARROW_STATISTICS_KEY_NULL_COUNT_EXACT,
+                                            }},
+                                           {{
+                                                
ArrayStatistics::ValueType{int64_t{3}},
+                                            },
+                                            {
+                                                
ArrayStatistics::ValueType{int64_t{1}},
+                                            }}));
+  AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
+}
+
+TEST_F(TestRecordBatch, MakeStatisticsArrayDistinctCount) {
+  auto schema =
+      ::arrow::schema({field("no-statistics", boolean()), field("int32", 
int32())});
+  auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]");
+  auto int32_array_data = ArrayFromJSON(int32(), "[1, null, 
-1]")->data()->Copy();
+  int32_array_data->statistics = std::make_shared<ArrayStatistics>();
+  int32_array_data->statistics->null_count = 1;
+  int32_array_data->statistics->distinct_count = 2;
+  auto int32_array = MakeArray(std::move(int32_array_data));
+  auto batch = RecordBatch::Make(schema, int32_array->length(),
+                                 {no_statistics_array, int32_array});
+
+  ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray());
+
+  ASSERT_OK_AND_ASSIGN(auto expected_statistics_array,
+                       MakeStatisticsArray("[null, 1]",
+                                           {{
+                                                
ARROW_STATISTICS_KEY_ROW_COUNT_EXACT,
+                                            },
+                                            {
+                                                
ARROW_STATISTICS_KEY_NULL_COUNT_EXACT,
+                                                
ARROW_STATISTICS_KEY_DISTINCT_COUNT_EXACT,
+                                            }},
+                                           {{
+                                                
ArrayStatistics::ValueType{int64_t{3}},
+                                            },
+                                            {
+                                                
ArrayStatistics::ValueType{int64_t{1}},
+                                                
ArrayStatistics::ValueType{int64_t{2}},
+                                            }}));
+  AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
+}
+
+TEST_F(TestRecordBatch, MakeStatisticsArrayMinExact) {
+  auto schema =
+      ::arrow::schema({field("no-statistics", boolean()), field("uint32", 
uint32())});
+  auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]");
+  auto uint32_array_data = ArrayFromJSON(uint32(), "[100, null, 
1]")->data()->Copy();
+  uint32_array_data->statistics = std::make_shared<ArrayStatistics>();
+  uint32_array_data->statistics->is_min_exact = true;
+  uint32_array_data->statistics->min = uint64_t{1};
+  auto uint32_array = MakeArray(std::move(uint32_array_data));
+  auto batch = RecordBatch::Make(schema, uint32_array->length(),
+                                 {no_statistics_array, uint32_array});
+
+  ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray());
+
+  ASSERT_OK_AND_ASSIGN(auto expected_statistics_array,
+                       MakeStatisticsArray("[null, 1]",
+                                           {{
+                                                
ARROW_STATISTICS_KEY_ROW_COUNT_EXACT,
+                                            },
+                                            {
+                                                
ARROW_STATISTICS_KEY_MIN_VALUE_EXACT,
+                                            }},
+                                           {{
+                                                
ArrayStatistics::ValueType{int64_t{3}},
+                                            },
+                                            {
+                                                
ArrayStatistics::ValueType{uint64_t{1}},
+                                            }}));
+  AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
+}
+
+TEST_F(TestRecordBatch, MakeStatisticsArrayMinApproximate) {
+  auto schema =
+      ::arrow::schema({field("no-statistics", boolean()), field("int32", 
int32())});
+  auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]");
+  auto int32_array_data = ArrayFromJSON(int32(), "[1, null, 
-1]")->data()->Copy();
+  int32_array_data->statistics = std::make_shared<ArrayStatistics>();
+  int32_array_data->statistics->min = -1.0;
+  auto int32_array = MakeArray(std::move(int32_array_data));
+  auto batch = RecordBatch::Make(schema, int32_array->length(),
+                                 {no_statistics_array, int32_array});
+
+  ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray());
+
+  ASSERT_OK_AND_ASSIGN(
+      auto expected_statistics_array,
+      MakeStatisticsArray("[null, 1]",
+                          {{
+                               ARROW_STATISTICS_KEY_ROW_COUNT_EXACT,
+                           },
+                           {
+                               ARROW_STATISTICS_KEY_MIN_VALUE_APPROXIMATE,
+                           }},
+                          {{
+                               ArrayStatistics::ValueType{int64_t{3}},
+                           },
+                           {
+                               ArrayStatistics::ValueType{-1.0},
+                           }}));
+  AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
+}
+
+TEST_F(TestRecordBatch, MakeStatisticsArrayMaxExact) {
+  // The second boolean column carries statistics with a maximum that is
+  // flagged as exact; the first boolean column has no statistics attached.
+  auto annotated_data = ArrayFromJSON(boolean(), "[true, null, false]")->data()->Copy();
+  annotated_data->statistics = std::make_shared<ArrayStatistics>();
+  annotated_data->statistics->max = true;
+  annotated_data->statistics->is_max_exact = true;
+  auto annotated_column = MakeArray(std::move(annotated_data));
+  auto plain_column = ArrayFromJSON(boolean(), "[true, false, true]");
+
+  auto batch_schema =
+      ::arrow::schema({field("no-statistics", boolean()), field("boolean", boolean())});
+  auto record_batch = RecordBatch::Make(batch_schema, annotated_column->length(),
+                                        {plain_column, annotated_column});
+
+  ASSERT_OK_AND_ASSIGN(auto actual, record_batch->MakeStatisticsArray());
+
+  // Expected statistics, built with the MakeStatisticsArray test helper: an
+  // exact row count of 3 and an exact maximum of true.
+  ASSERT_OK_AND_ASSIGN(
+      auto expected,
+      MakeStatisticsArray("[null, 1]",
+                          {{ARROW_STATISTICS_KEY_ROW_COUNT_EXACT},
+                           {ARROW_STATISTICS_KEY_MAX_VALUE_EXACT}},
+                          {{ArrayStatistics::ValueType{int64_t{3}}},
+                           {ArrayStatistics::ValueType{true}}}));
+  AssertArraysEqual(*expected, *actual, true);
+}
+
+TEST_F(TestRecordBatch, MakeStatisticsArrayMaxApproximate) {
+  // Checks that a maximum which is not flagged as exact is reported under the
+  // approximate max-value key. The float64 column carries the statistics; the
+  // boolean column has none attached.
+  auto schema =
+      ::arrow::schema({field("no-statistics", boolean()), field("float64", float64())});
+  auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]");
+  auto float64_array_data = ArrayFromJSON(float64(), "[1.0, null, -1.0]")->data()->Copy();
+  float64_array_data->statistics = std::make_shared<ArrayStatistics>();
+  // Set `max` (not `min`) and leave `is_max_exact` untouched so that this test
+  // exercises the approximate-maximum path its name promises.
+  // NOTE(review): assumes `is_max_exact` defaults to false -- confirm in
+  // ArrayStatistics.
+  float64_array_data->statistics->max = 1.0;
+  auto float64_array = MakeArray(std::move(float64_array_data));
+  auto batch = RecordBatch::Make(schema, float64_array->length(),
+                                 {no_statistics_array, float64_array});
+
+  ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray());
+
+  // Expected statistics: an exact row count of 3 and an approximate maximum
+  // of 1.0.
+  ASSERT_OK_AND_ASSIGN(
+      auto expected_statistics_array,
+      MakeStatisticsArray("[null, 1]",
+                          {{
+                               ARROW_STATISTICS_KEY_ROW_COUNT_EXACT,
+                           },
+                           {
+                               ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE,
+                           }},
+                          {{
+                               ArrayStatistics::ValueType{int64_t{3}},
+                           },
+                           {
+                               ArrayStatistics::ValueType{1.0},
+                           }}));
+  AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
+}
+
+TEST_F(TestRecordBatch, MakeStatisticsArrayString) {
+  // Checks that a string-valued exact maximum is carried into the statistics
+  // array. The utf8 column carries the statistics; the boolean column has
+  // none attached.
+  auto schema =
+      ::arrow::schema({field("no-statistics", boolean()), field("string", utf8())});
+  auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]");
+  auto string_array_data = ArrayFromJSON(utf8(), "[\"a\", null, \"c\"]")->data()->Copy();
+  string_array_data->statistics = std::make_shared<ArrayStatistics>();
+  string_array_data->statistics->is_max_exact = true;
+  // Spell out std::string: the statistics value type also accepts bool, and on
+  // standard libraries that predate P0608 a bare string literal (const char*)
+  // converts to bool in preference to std::string, which would silently turn
+  // this into a boolean-statistics test.
+  string_array_data->statistics->max = std::string("c");
+  auto string_array = MakeArray(std::move(string_array_data));
+  auto batch = RecordBatch::Make(schema, string_array->length(),
+                                 {no_statistics_array, string_array});
+
+  ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray());
+
+  // Expected statistics: an exact row count of 3 and an exact maximum of "c"
+  // (again constructed explicitly as std::string).
+  ASSERT_OK_AND_ASSIGN(
+      auto expected_statistics_array,
+      MakeStatisticsArray("[null, 1]",
+                          {{
+                               ARROW_STATISTICS_KEY_ROW_COUNT_EXACT,
+                           },
+                           {
+                               ARROW_STATISTICS_KEY_MAX_VALUE_EXACT,
+                           }},
+                          {{
+                               ArrayStatistics::ValueType{int64_t{3}},
+                           },
+                           {
+                               ArrayStatistics::ValueType{std::string("c")},
+                           }}));
+  AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
+}
+
 // Empty GoogleTest fixture templated on DataType; it adds no members of its
 // own. Presumably the fixture for typed batch-to-tensor (column-major) tests.
 template <typename DataType>
 class TestBatchToTensorColumnMajor : public ::testing::Test {};
 


Reply via email to