This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new d748acee35 GH-44010: [C++] Add
`arrow::RecordBatch::MakeStatisticsArray()` (#44252)
d748acee35 is described below
commit d748acee35ecaf88cf6191048c2cac43007a76b7
Author: Sutou Kouhei <[email protected]>
AuthorDate: Sat Nov 9 05:37:27 2024 +0900
GH-44010: [C++] Add `arrow::RecordBatch::MakeStatisticsArray()` (#44252)
### Rationale for this change
The statistics schema for the Arrow C data interface (GH-43553) is complex
because it uses nested types (struct, map and union), so a reusable
implementation that builds the statistics array is useful.
### What changes are included in this PR?
`arrow::RecordBatch::MakeStatisticsArray()` is a convenience function that
converts the `arrow::ArrayStatistics` in an `arrow::RecordBatch` to an
`arrow::Array` for the Arrow C data interface.
### Are these changes tested?
Yes.
### Are there any user-facing changes?
Yes.
* GitHub Issue: #44010
Authored-by: Sutou Kouhei <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
---
cpp/src/arrow/array/statistics.h | 22 ++
cpp/src/arrow/c/abi.h | 18 ++
cpp/src/arrow/record_batch.cc | 219 +++++++++++++++++++
cpp/src/arrow/record_batch.h | 12 +
cpp/src/arrow/record_batch_test.cc | 436 +++++++++++++++++++++++++++++++++++++
5 files changed, 707 insertions(+)
diff --git a/cpp/src/arrow/array/statistics.h b/cpp/src/arrow/array/statistics.h
index 523f877bbe..e7365a9d7f 100644
--- a/cpp/src/arrow/array/statistics.h
+++ b/cpp/src/arrow/array/statistics.h
@@ -22,6 +22,7 @@
#include <string>
#include <variant>
+#include "arrow/type_fwd.h"
#include "arrow/util/visibility.h"
namespace arrow {
@@ -34,6 +35,23 @@ namespace arrow {
struct ARROW_EXPORT ArrayStatistics {
using ValueType = std::variant<bool, int64_t, uint64_t, double, std::string>;
+ static const std::shared_ptr<DataType>& ValueToArrowType(
+ const std::optional<ValueType>& value) {
+ if (!value.has_value()) {
+ return null();
+ }
+
+ struct Visitor {
+ const std::shared_ptr<DataType>& operator()(const bool&) { return
boolean(); }
+ const std::shared_ptr<DataType>& operator()(const int64_t&) { return
int64(); }
+ const std::shared_ptr<DataType>& operator()(const uint64_t&) { return
uint64(); }
+ const std::shared_ptr<DataType>& operator()(const double&) { return
float64(); }
+ // GH-44579: How to support binary data?
+ const std::shared_ptr<DataType>& operator()(const std::string&) { return
utf8(); }
+ } visitor;
+ return std::visit(visitor, value.value());
+ }
+
/// \brief The number of null values, may not be set
std::optional<int64_t> null_count = std::nullopt;
@@ -43,12 +61,16 @@ struct ARROW_EXPORT ArrayStatistics {
/// \brief The minimum value, may not be set
std::optional<ValueType> min = std::nullopt;
+ const std::shared_ptr<DataType>& MinArrowType() { return
ValueToArrowType(min); }
+
/// \brief Whether the minimum value is exact or not
bool is_min_exact = false;
/// \brief The maximum value, may not be set
std::optional<ValueType> max = std::nullopt;
+ const std::shared_ptr<DataType>& MaxArrowType() { return
ValueToArrowType(max); }
+
/// \brief Whether the maximum value is exact or not
bool is_max_exact = false;
diff --git a/cpp/src/arrow/c/abi.h b/cpp/src/arrow/c/abi.h
index 9dc142bd08..e44933a6af 100644
--- a/cpp/src/arrow/c/abi.h
+++ b/cpp/src/arrow/c/abi.h
@@ -80,6 +80,24 @@ struct ArrowArray {
void* private_data;
};
+# define ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_EXACT
"ARROW:average_byte_width:exact"
+# define ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_APPROXIMATE \
+ "ARROW:average_byte_width:approximate"
+# define ARROW_STATISTICS_KEY_DISTINCT_COUNT_EXACT
"ARROW:distinct_count:exact"
+# define ARROW_STATISTICS_KEY_DISTINCT_COUNT_APPROXIMATE \
+ "ARROW:distinct_count:approximate"
+# define ARROW_STATISTICS_KEY_MAX_BYTE_WIDTH_EXACT
"ARROW:max_byte_width:exact"
+# define ARROW_STATISTICS_KEY_MAX_BYTE_WIDTH_APPROXIMATE \
+ "ARROW:max_byte_width:approximate"
+# define ARROW_STATISTICS_KEY_MAX_VALUE_EXACT "ARROW:max_value:exact"
+# define ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE
"ARROW:max_value:approximate"
+# define ARROW_STATISTICS_KEY_MIN_VALUE_EXACT "ARROW:min_value:exact"
+# define ARROW_STATISTICS_KEY_MIN_VALUE_APPROXIMATE
"ARROW:min_value:approximate"
+# define ARROW_STATISTICS_KEY_NULL_COUNT_EXACT "ARROW:null_count:exact"
+# define ARROW_STATISTICS_KEY_NULL_COUNT_APPROXIMATE
"ARROW:null_count:approximate"
+# define ARROW_STATISTICS_KEY_ROW_COUNT_EXACT "ARROW:row_count:exact"
+# define ARROW_STATISTICS_KEY_ROW_COUNT_APPROXIMATE
"ARROW:row_count:approximate"
+
#endif // ARROW_C_DATA_INTERFACE
#ifndef ARROW_C_DEVICE_DATA_INTERFACE
diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc
index e3a8c0d710..3f8237188d 100644
--- a/cpp/src/arrow/record_batch.cc
+++ b/cpp/src/arrow/record_batch.cc
@@ -26,8 +26,13 @@
#include <utility>
#include "arrow/array.h"
+#include "arrow/array/builder_binary.h"
+#include "arrow/array/builder_dict.h"
+#include "arrow/array/builder_nested.h"
+#include "arrow/array/builder_union.h"
#include "arrow/array/concatenate.h"
#include "arrow/array/validate.h"
+#include "arrow/c/abi.h"
#include "arrow/pretty_print.h"
#include "arrow/status.h"
#include "arrow/table.h"
@@ -465,6 +470,220 @@ Result<std::shared_ptr<RecordBatch>>
RecordBatch::ViewOrCopyTo(
return Make(schema_, num_rows(), std::move(copied_columns));
}
+namespace {
+struct EnumeratedStatistics {
+ int nth_statistics = 0;
+ bool start_new_column = false;
+ std::optional<int32_t> nth_column = std::nullopt;
+ const char* key = nullptr;
+ std::shared_ptr<DataType> type = nullptr;
+ ArrayStatistics::ValueType value = false;
+};
+using OnStatistics =
+ std::function<Status(const EnumeratedStatistics& enumerated_statistics)>;
+Status EnumerateStatistics(const RecordBatch& record_batch, OnStatistics
on_statistics) {
+ EnumeratedStatistics statistics;
+ statistics.nth_statistics = 0;
+ statistics.start_new_column = true;
+ statistics.nth_column = std::nullopt;
+
+ statistics.key = ARROW_STATISTICS_KEY_ROW_COUNT_EXACT;
+ statistics.type = int64();
+ statistics.value = record_batch.num_rows();
+ RETURN_NOT_OK(on_statistics(statistics));
+ statistics.start_new_column = false;
+
+ const auto num_fields = record_batch.schema()->num_fields();
+ for (int nth_column = 0; nth_column < num_fields; ++nth_column) {
+ auto column_statistics = record_batch.column(nth_column)->statistics();
+ if (!column_statistics) {
+ continue;
+ }
+
+ statistics.start_new_column = true;
+ statistics.nth_column = nth_column;
+ if (column_statistics->null_count.has_value()) {
+ statistics.nth_statistics++;
+ statistics.key = ARROW_STATISTICS_KEY_NULL_COUNT_EXACT;
+ statistics.type = int64();
+ statistics.value = column_statistics->null_count.value();
+ RETURN_NOT_OK(on_statistics(statistics));
+ statistics.start_new_column = false;
+ }
+
+ if (column_statistics->distinct_count.has_value()) {
+ statistics.nth_statistics++;
+ statistics.key = ARROW_STATISTICS_KEY_DISTINCT_COUNT_EXACT;
+ statistics.type = int64();
+ statistics.value = column_statistics->distinct_count.value();
+ RETURN_NOT_OK(on_statistics(statistics));
+ statistics.start_new_column = false;
+ }
+
+ if (column_statistics->min.has_value()) {
+ statistics.nth_statistics++;
+ if (column_statistics->is_min_exact) {
+ statistics.key = ARROW_STATISTICS_KEY_MIN_VALUE_EXACT;
+ } else {
+ statistics.key = ARROW_STATISTICS_KEY_MIN_VALUE_APPROXIMATE;
+ }
+ statistics.type = column_statistics->MinArrowType();
+ statistics.value = column_statistics->min.value();
+ RETURN_NOT_OK(on_statistics(statistics));
+ statistics.start_new_column = false;
+ }
+
+ if (column_statistics->max.has_value()) {
+ statistics.nth_statistics++;
+ if (column_statistics->is_max_exact) {
+ statistics.key = ARROW_STATISTICS_KEY_MAX_VALUE_EXACT;
+ } else {
+ statistics.key = ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE;
+ }
+ statistics.type = column_statistics->MaxArrowType();
+ statistics.value = column_statistics->max.value();
+ RETURN_NOT_OK(on_statistics(statistics));
+ statistics.start_new_column = false;
+ }
+ }
+ return Status::OK();
+}
+} // namespace
+
+Result<std::shared_ptr<Array>> RecordBatch::MakeStatisticsArray(
+ MemoryPool* memory_pool) const {
+ // Statistics schema:
+ // struct<
+ // column: int32,
+ // statistics: map<
+ // key: dictionary<
+ // indices: int32,
+ // dictionary: utf8,
+ // >,
+ // items: dense_union<...all needed types...>,
+ // >
+ // >
+
+ // The statistics schema doesn't define a static dense union type for
+ // values. Each statistics schema has a dense union type that contains
+ // only the needed value types. The following block collects these types.
+ std::vector<std::shared_ptr<Field>> values_types;
+ std::vector<int8_t> values_type_indexes;
+ RETURN_NOT_OK(EnumerateStatistics(*this, [&](const EnumeratedStatistics&
statistics) {
+ int8_t i = 0;
+ for (const auto& field : values_types) {
+ if (field->type()->id() == statistics.type->id()) {
+ break;
+ }
+ i++;
+ }
+ if (i == static_cast<int8_t>(values_types.size())) {
+ values_types.push_back(field(statistics.type->name(), statistics.type));
+ }
+ values_type_indexes.push_back(i);
+ return Status::OK();
+ }));
+
+ // statistics.key: dictionary<indices: int32, dictionary: utf8>
+ auto keys_type = dictionary(int32(), utf8(), false);
+ // statistics.items: dense_union<...all needed types...>
+ auto values_type = dense_union(values_types);
+ // struct<
+ // column: int32,
+ // statistics: map<
+ // key: dictionary<
+ // indices: int32,
+ // dictionary: utf8,
+ // >,
+ // items: dense_union<...all needed types...>,
+ // >
+ // >
+ auto statistics_type =
+ struct_({field("column", int32()),
+ field("statistics", map(keys_type, values_type, false))});
+
+ std::vector<std::shared_ptr<ArrayBuilder>> field_builders;
+ // column: int32
+ auto columns_builder = std::make_shared<Int32Builder>(memory_pool);
+
field_builders.push_back(std::static_pointer_cast<ArrayBuilder>(columns_builder));
+ // statistics.key: dictionary<indices: int32, dictionary: utf8>
+ auto keys_builder = std::make_shared<StringDictionary32Builder>();
+ // statistics.items: dense_union<...all needed types...>
+ std::vector<std::shared_ptr<ArrayBuilder>> values_builders;
+ for (const auto& values_type : values_types) {
+ std::unique_ptr<ArrayBuilder> values_builder;
+ RETURN_NOT_OK(MakeBuilder(memory_pool, values_type->type(),
&values_builder));
+
values_builders.push_back(std::shared_ptr<ArrayBuilder>(std::move(values_builder)));
+ }
+ auto items_builder = std::make_shared<DenseUnionBuilder>(
+ memory_pool, std::move(values_builders), values_type);
+ // statistics:
+ // map<
+ // key: dictionary<
+ // indices: int32,
+ // dictionary: utf8,
+ // >,
+ // items: dense_union<...all needed types...>,
+ // >
+ auto values_builder = std::make_shared<MapBuilder>(
+ memory_pool, std::static_pointer_cast<ArrayBuilder>(keys_builder),
+ std::static_pointer_cast<ArrayBuilder>(items_builder));
+
field_builders.push_back(std::static_pointer_cast<ArrayBuilder>(values_builder));
+ // struct<
+ // column: int32,
+ // statistics: map<
+ // key: dictionary<
+ // indices: int32,
+ // dictionary: utf8,
+ // >,
+ // items: dense_union<...all needed types...>,
+ // >
+ // >
+ StructBuilder builder(statistics_type, memory_pool,
std::move(field_builders));
+
+ // Append statistics.
+ RETURN_NOT_OK(EnumerateStatistics(*this, [&](const EnumeratedStatistics&
statistics) {
+ if (statistics.start_new_column) {
+ RETURN_NOT_OK(builder.Append());
+ if (statistics.nth_column.has_value()) {
+ RETURN_NOT_OK(columns_builder->Append(statistics.nth_column.value()));
+ } else {
+ RETURN_NOT_OK(columns_builder->AppendNull());
+ }
+ RETURN_NOT_OK(values_builder->Append());
+ }
+ RETURN_NOT_OK(keys_builder->Append(statistics.key,
+
static_cast<int32_t>(strlen(statistics.key))));
+ const auto values_type_index =
values_type_indexes[statistics.nth_statistics];
+ RETURN_NOT_OK(items_builder->Append(values_type_index));
+ struct Visitor {
+ ArrayBuilder* builder;
+
+ Status operator()(const bool& value) {
+ return static_cast<BooleanBuilder*>(builder)->Append(value);
+ }
+ Status operator()(const int64_t& value) {
+ return static_cast<Int64Builder*>(builder)->Append(value);
+ }
+ Status operator()(const uint64_t& value) {
+ return static_cast<UInt64Builder*>(builder)->Append(value);
+ }
+ Status operator()(const double& value) {
+ return static_cast<DoubleBuilder*>(builder)->Append(value);
+ }
+ Status operator()(const std::string& value) {
+ return static_cast<StringBuilder*>(builder)->Append(
+ value.data(), static_cast<int32_t>(value.size()));
+ }
+ } visitor;
+ visitor.builder = values_builders[values_type_index].get();
+ RETURN_NOT_OK(std::visit(visitor, statistics.value));
+ return Status::OK();
+ }));
+
+ return builder.Finish();
+}
+
Status RecordBatch::Validate() const {
return ValidateBatch(*this, /*full_validation=*/false);
}
diff --git a/cpp/src/arrow/record_batch.h b/cpp/src/arrow/record_batch.h
index 95596e9c15..edbefc1c77 100644
--- a/cpp/src/arrow/record_batch.h
+++ b/cpp/src/arrow/record_batch.h
@@ -282,6 +282,18 @@ class ARROW_EXPORT RecordBatch {
virtual DeviceAllocationType device_type() const = 0;
+ /// \brief Create a statistics array of this record batch
+ ///
+ /// The created array follows the C data interface statistics
+ /// specification. See
+ /// https://arrow.apache.org/docs/format/CDataInterfaceStatistics.html
+ /// for details.
+ ///
+ /// \param[in] pool the memory pool to allocate memory from
+ /// \return the statistics array of this record batch
+ Result<std::shared_ptr<Array>> MakeStatisticsArray(
+ MemoryPool* pool = default_memory_pool()) const;
+
protected:
RecordBatch(const std::shared_ptr<Schema>& schema, int64_t num_rows);
diff --git a/cpp/src/arrow/record_batch_test.cc
b/cpp/src/arrow/record_batch_test.cc
index daf7109075..21202c6acb 100644
--- a/cpp/src/arrow/record_batch_test.cc
+++ b/cpp/src/arrow/record_batch_test.cc
@@ -25,9 +25,11 @@
#include <vector>
#include "arrow/array/array_base.h"
+#include "arrow/array/array_dict.h"
#include "arrow/array/array_nested.h"
#include "arrow/array/data.h"
#include "arrow/array/util.h"
+#include "arrow/c/abi.h"
#include "arrow/chunked_array.h"
#include "arrow/status.h"
#include "arrow/table.h"
@@ -980,6 +982,440 @@ TEST_F(TestRecordBatch, ToTensorUnsupportedMixedFloat16) {
batch1->ToTensor());
}
+namespace {
+template <typename ArrowType,
+ typename = std::enable_if_t<is_boolean_type<ArrowType>::value ||
+ is_number_type<ArrowType>::value>>
+Result<std::shared_ptr<Array>> BuildArray(
+ const std::vector<typename TypeTraits<ArrowType>::CType>& values) {
+ using BuilderType = typename TypeTraits<ArrowType>::BuilderType;
+ BuilderType builder;
+ for (const auto& value : values) {
+ ARROW_RETURN_NOT_OK(builder.Append(value));
+ }
+ return builder.Finish();
+}
+
+template <typename ArrowType, typename = enable_if_string<ArrowType>>
+Result<std::shared_ptr<Array>> BuildArray(const std::vector<std::string>&
values) {
+ using BuilderType = typename TypeTraits<ArrowType>::BuilderType;
+ BuilderType builder;
+ for (const auto& value : values) {
+ ARROW_RETURN_NOT_OK(builder.Append(value));
+ }
+ return builder.Finish();
+}
+
+template <typename RawType>
+std::vector<RawType> StatisticsValuesToRawValues(
+ const std::vector<ArrayStatistics::ValueType>& values) {
+ std::vector<RawType> raw_values;
+ for (const auto& value : values) {
+ raw_values.push_back(std::get<RawType>(value));
+ }
+ return raw_values;
+}
+
+template <typename ValueType, typename = std::enable_if_t<std::is_same<
+ ArrayStatistics::ValueType,
ValueType>::value>>
+Result<std::shared_ptr<Array>> BuildArray(const std::vector<ValueType>&
values) {
+ struct Builder {
+ const std::vector<ArrayStatistics::ValueType>& values_;
+ explicit Builder(const std::vector<ArrayStatistics::ValueType>& values)
+ : values_(values) {}
+
+ Result<std::shared_ptr<Array>> operator()(const bool&) {
+ auto values = StatisticsValuesToRawValues<bool>(values_);
+ return BuildArray<BooleanType>(values);
+ }
+ Result<std::shared_ptr<Array>> operator()(const int64_t&) {
+ auto values = StatisticsValuesToRawValues<int64_t>(values_);
+ return BuildArray<Int64Type>(values);
+ }
+ Result<std::shared_ptr<Array>> operator()(const uint64_t&) {
+ auto values = StatisticsValuesToRawValues<uint64_t>(values_);
+ return BuildArray<UInt64Type>(values);
+ }
+ Result<std::shared_ptr<Array>> operator()(const double&) {
+ auto values = StatisticsValuesToRawValues<double>(values_);
+ return BuildArray<DoubleType>(values);
+ }
+ Result<std::shared_ptr<Array>> operator()(const std::string&) {
+ auto values = StatisticsValuesToRawValues<std::string>(values_);
+ return BuildArray<StringType>(values);
+ }
+ } builder(values);
+ return std::visit(builder, values[0]);
+}
+
+Result<std::shared_ptr<Array>> MakeStatisticsArray(
+ const std::string& columns_json,
+ const std::vector<std::vector<std::string>>& nested_statistics_keys,
+ const std::vector<std::vector<ArrayStatistics::ValueType>>&
+ nested_statistics_values) {
+ auto columns_type = int32();
+ auto columns_array = ArrayFromJSON(columns_type, columns_json);
+ const auto n_columns = columns_array->length();
+
+ // nested_statistics_keys:
+ // {
+ // {"ARROW:row_count:exact", "ARROW:null_count:exact"},
+ // {"ARROW:max_value:exact"},
+ // {"ARROW:max_value:exact", "ARROW:distinct_count:exact"},
+ // }
+ // nested_statistics_values:
+ // {
+ // {int64_t{29}, int64_t{1}},
+ // {double{2.9}},
+ // {double{-2.9}, int64_t{2}},
+ // }
+ // ->
+ // keys_dictionary:
+ // {
+ // "ARROW:row_count:exact",
+ // "ARROW:null_count:exact",
+ // "ARROW:max_value:exact",
+ // "ARROW:distinct_count:exact",
+ // }
+ // keys_indices: {0, 1, 2, 2, 3}
+ // values_types: {int64(), float64()}
+ // values_type_codes: {0, 1}
+ // values_values[0]: {int64_t{29}, int64_t{1}, int64_t{2}}
+ // values_values[1]: {double{2.9}, double{-2.9}}
+ // values_value_type_ids: {0, 0, 1, 1, 0}
+ // values_value_offsets: {0, 1, 0, 1, 2}
+ // statistics_offsets: {0, 2, 3, 5, 5}
+ std::vector<std::string> keys_dictionary;
+ std::vector<int32_t> keys_indices;
+ std::vector<std::shared_ptr<DataType>> values_types;
+ std::vector<int8_t> values_type_codes;
+ std::vector<std::vector<ArrayStatistics::ValueType>> values_values;
+ std::vector<int8_t> values_value_type_ids;
+ std::vector<int32_t> values_value_offsets;
+ std::vector<int32_t> statistics_offsets;
+
+ int32_t offset = 0;
+ std::vector<int32_t> values_value_offset_counters;
+ for (size_t i = 0; i < nested_statistics_keys.size(); ++i) {
+ const auto& statistics_keys = nested_statistics_keys[i];
+ const auto& statistics_values = nested_statistics_values[i];
+ statistics_offsets.push_back(offset);
+ for (size_t j = 0; j < statistics_keys.size(); ++j) {
+ const auto& key = statistics_keys[j];
+ const auto& value = statistics_values[j];
+ ++offset;
+
+ int32_t key_index = 0;
+ for (; key_index < static_cast<int32_t>(keys_dictionary.size());
++key_index) {
+ if (keys_dictionary[key_index] == key) {
+ break;
+ }
+ }
+ if (key_index == static_cast<int32_t>(keys_dictionary.size())) {
+ keys_dictionary.push_back(key);
+ }
+ keys_indices.push_back(key_index);
+
+ auto values_type = ArrayStatistics::ValueToArrowType(value);
+ int8_t values_type_code = 0;
+ for (; values_type_code < static_cast<int32_t>(values_types.size());
+ ++values_type_code) {
+ if (values_types[values_type_code] == values_type) {
+ break;
+ }
+ }
+ if (values_type_code == static_cast<int32_t>(values_types.size())) {
+ values_types.push_back(values_type);
+ values_type_codes.push_back(values_type_code);
+ values_values.emplace_back();
+ values_value_offset_counters.push_back(0);
+ }
+ values_values[values_type_code].push_back(value);
+ values_value_type_ids.push_back(values_type_code);
+
values_value_offsets.push_back(values_value_offset_counters[values_type_code]++);
+ }
+ }
+ statistics_offsets.push_back(offset);
+
+ auto keys_type = dictionary(int32(), utf8(), false);
+ std::vector<std::shared_ptr<Field>> values_fields;
+ for (const auto& type : values_types) {
+ values_fields.push_back(field(type->name(), type));
+ }
+ auto values_type = dense_union(values_fields);
+ auto statistics_type = map(keys_type, values_type, false);
+ auto struct_type =
+ struct_({field("column", columns_type), field("statistics",
statistics_type)});
+
+ ARROW_ASSIGN_OR_RAISE(auto keys_indices_array,
BuildArray<Int32Type>(keys_indices));
+ ARROW_ASSIGN_OR_RAISE(auto keys_dictionary_array,
+ BuildArray<StringType>(keys_dictionary));
+ ARROW_ASSIGN_OR_RAISE(
+ auto keys_array,
+ DictionaryArray::FromArrays(keys_type, keys_indices_array,
keys_dictionary_array));
+
+ std::vector<std::shared_ptr<Array>> values_arrays;
+ for (const auto& values : values_values) {
+ ARROW_ASSIGN_OR_RAISE(auto values_array,
+ BuildArray<ArrayStatistics::ValueType>(values));
+ values_arrays.push_back(values_array);
+ }
+ ARROW_ASSIGN_OR_RAISE(auto values_value_type_ids_array,
+ BuildArray<Int8Type>(values_value_type_ids));
+ ARROW_ASSIGN_OR_RAISE(auto values_value_offsets_array,
+ BuildArray<Int32Type>(values_value_offsets));
+ auto values_array = std::make_shared<DenseUnionArray>(
+ values_type, values_value_offsets_array->length(), values_arrays,
+ values_value_type_ids_array->data()->buffers[1],
+ values_value_offsets_array->data()->buffers[1]);
+ ARROW_ASSIGN_OR_RAISE(auto statistics_offsets_array,
+ BuildArray<Int32Type>(statistics_offsets));
+ ARROW_ASSIGN_OR_RAISE(auto statistics_array,
+ MapArray::FromArrays(statistics_type,
statistics_offsets_array,
+ keys_array, values_array));
+ std::vector<std::shared_ptr<Array>> struct_arrays =
{std::move(columns_array),
+
std::move(statistics_array)};
+ return std::make_shared<StructArray>(struct_type, n_columns, struct_arrays);
+}
+}; // namespace
+
+TEST_F(TestRecordBatch, MakeStatisticsArrayRowCount) {
+ auto schema = ::arrow::schema({field("int32", int32())});
+ auto int32_array = ArrayFromJSON(int32(), "[1, null, -1]");
+ auto batch = RecordBatch::Make(schema, int32_array->length(), {int32_array});
+
+ ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray());
+
+ ASSERT_OK_AND_ASSIGN(auto expected_statistics_array,
+ MakeStatisticsArray("[null]",
+ {{
+
ARROW_STATISTICS_KEY_ROW_COUNT_EXACT,
+ }},
+ {{
+
ArrayStatistics::ValueType{int64_t{3}},
+ }}));
+ AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
+}
+
+TEST_F(TestRecordBatch, MakeStatisticsArrayNullCount) {
+ auto schema =
+ ::arrow::schema({field("no-statistics", boolean()), field("int32",
int32())});
+ auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]");
+ auto int32_array_data = ArrayFromJSON(int32(), "[1, null,
-1]")->data()->Copy();
+ int32_array_data->statistics = std::make_shared<ArrayStatistics>();
+ int32_array_data->statistics->null_count = 1;
+ auto int32_array = MakeArray(std::move(int32_array_data));
+ auto batch = RecordBatch::Make(schema, int32_array->length(),
+ {no_statistics_array, int32_array});
+
+ ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray());
+
+ ASSERT_OK_AND_ASSIGN(auto expected_statistics_array,
+ MakeStatisticsArray("[null, 1]",
+ {{
+
ARROW_STATISTICS_KEY_ROW_COUNT_EXACT,
+ },
+ {
+
ARROW_STATISTICS_KEY_NULL_COUNT_EXACT,
+ }},
+ {{
+
ArrayStatistics::ValueType{int64_t{3}},
+ },
+ {
+
ArrayStatistics::ValueType{int64_t{1}},
+ }}));
+ AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
+}
+
+TEST_F(TestRecordBatch, MakeStatisticsArrayDistinctCount) {
+ auto schema =
+ ::arrow::schema({field("no-statistics", boolean()), field("int32",
int32())});
+ auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]");
+ auto int32_array_data = ArrayFromJSON(int32(), "[1, null,
-1]")->data()->Copy();
+ int32_array_data->statistics = std::make_shared<ArrayStatistics>();
+ int32_array_data->statistics->null_count = 1;
+ int32_array_data->statistics->distinct_count = 2;
+ auto int32_array = MakeArray(std::move(int32_array_data));
+ auto batch = RecordBatch::Make(schema, int32_array->length(),
+ {no_statistics_array, int32_array});
+
+ ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray());
+
+ ASSERT_OK_AND_ASSIGN(auto expected_statistics_array,
+ MakeStatisticsArray("[null, 1]",
+ {{
+
ARROW_STATISTICS_KEY_ROW_COUNT_EXACT,
+ },
+ {
+
ARROW_STATISTICS_KEY_NULL_COUNT_EXACT,
+
ARROW_STATISTICS_KEY_DISTINCT_COUNT_EXACT,
+ }},
+ {{
+
ArrayStatistics::ValueType{int64_t{3}},
+ },
+ {
+
ArrayStatistics::ValueType{int64_t{1}},
+
ArrayStatistics::ValueType{int64_t{2}},
+ }}));
+ AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
+}
+
+TEST_F(TestRecordBatch, MakeStatisticsArrayMinExact) {
+ auto schema =
+ ::arrow::schema({field("no-statistics", boolean()), field("uint32",
uint32())});
+ auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]");
+ auto uint32_array_data = ArrayFromJSON(uint32(), "[100, null,
1]")->data()->Copy();
+ uint32_array_data->statistics = std::make_shared<ArrayStatistics>();
+ uint32_array_data->statistics->is_min_exact = true;
+ uint32_array_data->statistics->min = uint64_t{1};
+ auto uint32_array = MakeArray(std::move(uint32_array_data));
+ auto batch = RecordBatch::Make(schema, uint32_array->length(),
+ {no_statistics_array, uint32_array});
+
+ ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray());
+
+ ASSERT_OK_AND_ASSIGN(auto expected_statistics_array,
+ MakeStatisticsArray("[null, 1]",
+ {{
+
ARROW_STATISTICS_KEY_ROW_COUNT_EXACT,
+ },
+ {
+
ARROW_STATISTICS_KEY_MIN_VALUE_EXACT,
+ }},
+ {{
+
ArrayStatistics::ValueType{int64_t{3}},
+ },
+ {
+
ArrayStatistics::ValueType{uint64_t{1}},
+ }}));
+ AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
+}
+
+TEST_F(TestRecordBatch, MakeStatisticsArrayMinApproximate) {
+ auto schema =
+ ::arrow::schema({field("no-statistics", boolean()), field("int32",
int32())});
+ auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]");
+ auto int32_array_data = ArrayFromJSON(int32(), "[1, null,
-1]")->data()->Copy();
+ int32_array_data->statistics = std::make_shared<ArrayStatistics>();
+ int32_array_data->statistics->min = -1.0;
+ auto int32_array = MakeArray(std::move(int32_array_data));
+ auto batch = RecordBatch::Make(schema, int32_array->length(),
+ {no_statistics_array, int32_array});
+
+ ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray());
+
+ ASSERT_OK_AND_ASSIGN(
+ auto expected_statistics_array,
+ MakeStatisticsArray("[null, 1]",
+ {{
+ ARROW_STATISTICS_KEY_ROW_COUNT_EXACT,
+ },
+ {
+ ARROW_STATISTICS_KEY_MIN_VALUE_APPROXIMATE,
+ }},
+ {{
+ ArrayStatistics::ValueType{int64_t{3}},
+ },
+ {
+ ArrayStatistics::ValueType{-1.0},
+ }}));
+ AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
+}
+
+TEST_F(TestRecordBatch, MakeStatisticsArrayMaxExact) {
+ auto schema =
+ ::arrow::schema({field("no-statistics", boolean()), field("boolean",
boolean())});
+ auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]");
+ auto boolean_array_data =
+ ArrayFromJSON(boolean(), "[true, null, false]")->data()->Copy();
+ boolean_array_data->statistics = std::make_shared<ArrayStatistics>();
+ boolean_array_data->statistics->is_max_exact = true;
+ boolean_array_data->statistics->max = true;
+ auto boolean_array = MakeArray(std::move(boolean_array_data));
+ auto batch = RecordBatch::Make(schema, boolean_array->length(),
+ {no_statistics_array, boolean_array});
+
+ ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray());
+
+ ASSERT_OK_AND_ASSIGN(auto expected_statistics_array,
+ MakeStatisticsArray("[null, 1]",
+ {{
+
ARROW_STATISTICS_KEY_ROW_COUNT_EXACT,
+ },
+ {
+
ARROW_STATISTICS_KEY_MAX_VALUE_EXACT,
+ }},
+ {{
+
ArrayStatistics::ValueType{int64_t{3}},
+ },
+ {
+
ArrayStatistics::ValueType{true},
+ }}));
+ AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
+}
+
+TEST_F(TestRecordBatch, MakeStatisticsArrayMaxApproximate) {
+ auto schema =
+ ::arrow::schema({field("no-statistics", boolean()), field("float64",
float64())});
+ auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]");
+ auto float64_array_data = ArrayFromJSON(float64(), "[1.0, null,
-1.0]")->data()->Copy();
+ float64_array_data->statistics = std::make_shared<ArrayStatistics>();
+ float64_array_data->statistics->min = -1.0;
+ auto float64_array = MakeArray(std::move(float64_array_data));
+ auto batch = RecordBatch::Make(schema, float64_array->length(),
+ {no_statistics_array, float64_array});
+
+ ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray());
+
+ ASSERT_OK_AND_ASSIGN(
+ auto expected_statistics_array,
+ MakeStatisticsArray("[null, 1]",
+ {{
+ ARROW_STATISTICS_KEY_ROW_COUNT_EXACT,
+ },
+ {
+ ARROW_STATISTICS_KEY_MIN_VALUE_APPROXIMATE,
+ }},
+ {{
+ ArrayStatistics::ValueType{int64_t{3}},
+ },
+ {
+ ArrayStatistics::ValueType{-1.0},
+ }}));
+ AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
+}
+
+TEST_F(TestRecordBatch, MakeStatisticsArrayString) {
+ auto schema =
+ ::arrow::schema({field("no-statistics", boolean()), field("string",
utf8())});
+ auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]");
+ auto string_array_data = ArrayFromJSON(utf8(), "[\"a\", null,
\"c\"]")->data()->Copy();
+ string_array_data->statistics = std::make_shared<ArrayStatistics>();
+ string_array_data->statistics->is_max_exact = true;
+ string_array_data->statistics->max = "c";
+ auto string_array = MakeArray(std::move(string_array_data));
+ auto batch = RecordBatch::Make(schema, string_array->length(),
+ {no_statistics_array, string_array});
+
+ ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray());
+
+ ASSERT_OK_AND_ASSIGN(auto expected_statistics_array,
+ MakeStatisticsArray("[null, 1]",
+ {{
+
ARROW_STATISTICS_KEY_ROW_COUNT_EXACT,
+ },
+ {
+
ARROW_STATISTICS_KEY_MAX_VALUE_EXACT,
+ }},
+ {{
+
ArrayStatistics::ValueType{int64_t{3}},
+ },
+ {
+
ArrayStatistics::ValueType{"c"},
+ }}));
+ AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
+}
+
template <typename DataType>
class TestBatchToTensorColumnMajor : public ::testing::Test {};