This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new fd1bf8e688 GH-44579: [C++] Use array type to compute min/max
statistics Arrow type (#45094)
fd1bf8e688 is described below
commit fd1bf8e6886646ccd71a03de940190d5baca1ad2
Author: Sutou Kouhei <[email protected]>
AuthorDate: Tue Dec 31 12:05:43 2024 +0900
GH-44579: [C++] Use array type to compute min/max statistics Arrow type
(#45094)
### Rationale for this change
`arrow::ArrayStatistics` uses raw C++ types such as `int64_t` and
`std::string` for its min/max values. We need to convert these raw C++ types
to Arrow types when we use `arrow::ArrayStatistics` to generate a statistics
array (GH-45038).
We can't map `std::string` to a single Arrow type because it may correspond
to `arrow::binary()`, `arrow::utf8()` or another binary-like type.
### What changes are included in this PR?
Use the `arrow::DataType` of the associated array when we convert
`arrow::ArrayStatistics`'s min/max raw C++ types to Arrow types.
### Are these changes tested?
Yes.
### Are there any user-facing changes?
No.
* GitHub Issue: #44579
Authored-by: Sutou Kouhei <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
---
cpp/src/arrow/array/statistics.h | 64 +++++++++++++++++++++++++++++++++-----
cpp/src/arrow/record_batch.cc | 8 +++--
cpp/src/arrow/record_batch_test.cc | 2 +-
3 files changed, 63 insertions(+), 11 deletions(-)
diff --git a/cpp/src/arrow/array/statistics.h b/cpp/src/arrow/array/statistics.h
index 99b853ab0f..6ccd2f4766 100644
--- a/cpp/src/arrow/array/statistics.h
+++ b/cpp/src/arrow/array/statistics.h
@@ -22,7 +22,7 @@
#include <string>
#include <variant>
-#include "arrow/type_fwd.h"
+#include "arrow/type.h"
#include "arrow/util/visibility.h"
namespace arrow {
@@ -34,22 +34,38 @@ namespace arrow {
/// as Apache Parquet may have statistics. Statistics associated with
/// data source can be read unified API via this class.
struct ARROW_EXPORT ArrayStatistics {
+ /// \brief The type for maximum and minimum values. If the target
+ /// value exists, one of them is used. `std::nullopt` is used
+ /// otherwise.
using ValueType = std::variant<bool, int64_t, uint64_t, double, std::string>;
static const std::shared_ptr<DataType>& ValueToArrowType(
- const std::optional<ValueType>& value) {
+ const std::optional<ValueType>& value,
+ const std::shared_ptr<DataType>& array_type) {
if (!value.has_value()) {
return null();
}
struct Visitor {
+ const std::shared_ptr<DataType>& array_type;
+
const std::shared_ptr<DataType>& operator()(const bool&) { return
boolean(); }
const std::shared_ptr<DataType>& operator()(const int64_t&) { return
int64(); }
const std::shared_ptr<DataType>& operator()(const uint64_t&) { return
uint64(); }
const std::shared_ptr<DataType>& operator()(const double&) { return
float64(); }
- // GH-44579: How to support binary data?
- const std::shared_ptr<DataType>& operator()(const std::string&) { return
utf8(); }
- } visitor;
+ const std::shared_ptr<DataType>& operator()(const std::string&) {
+ switch (array_type->id()) {
+ case Type::STRING:
+ case Type::BINARY:
+ case Type::FIXED_SIZE_BINARY:
+ case Type::LARGE_STRING:
+ case Type::LARGE_BINARY:
+ return array_type;
+ default:
+ return utf8();
+ }
+ }
+ } visitor{array_type};
return std::visit(visitor, value.value());
}
@@ -62,7 +78,24 @@ struct ARROW_EXPORT ArrayStatistics {
/// \brief The minimum value, may not be set
std::optional<ValueType> min = std::nullopt;
- const std::shared_ptr<DataType>& MinArrowType() { return
ValueToArrowType(min); }
+ /// \brief Compute Arrow type of the minimum value.
+ ///
+ /// If \ref ValueType is `std::string`, `array_type` may be
+ /// used. If `array_type` is a binary-like type such as \ref
+ /// arrow::binary and \ref arrow::large_utf8, `array_type` is
+ /// returned. \ref arrow::utf8 is returned otherwise.
+ ///
+ /// If \ref ValueType isn't `std::string`, `array_type` isn't used.
+ ///
+ /// \param array_type The Arrow type of the associated array.
+ ///
+ /// \return \ref arrow::null if the minimum value is `std::nullopt`,
+ /// Arrow type based on \ref ValueType of the \ref min
+ /// otherwise.
+ const std::shared_ptr<DataType>& MinArrowType(
+ const std::shared_ptr<DataType>& array_type) {
+ return ValueToArrowType(min, array_type);
+ }
/// \brief Whether the minimum value is exact or not
bool is_min_exact = false;
@@ -70,7 +103,24 @@ struct ARROW_EXPORT ArrayStatistics {
/// \brief The maximum value, may not be set
std::optional<ValueType> max = std::nullopt;
- const std::shared_ptr<DataType>& MaxArrowType() { return
ValueToArrowType(max); }
+ /// \brief Compute Arrow type of the maximum value.
+ ///
+ /// If \ref ValueType is `std::string`, `array_type` may be
+ /// used. If `array_type` is a binary-like type such as \ref
+ /// arrow::binary and \ref arrow::large_utf8, `array_type` is
+ /// returned. \ref arrow::utf8 is returned otherwise.
+ ///
+ /// If \ref ValueType isn't `std::string`, `array_type` isn't used.
+ ///
+ /// \param array_type The Arrow type of the associated array.
+ ///
+ /// \return \ref arrow::null if the maximum value is `std::nullopt`,
+ /// Arrow type based on \ref ValueType of the \ref max
+ /// otherwise.
+ const std::shared_ptr<DataType>& MaxArrowType(
+ const std::shared_ptr<DataType>& array_type) {
+ return ValueToArrowType(max, array_type);
+ }
/// \brief Whether the maximum value is exact or not
bool is_max_exact = false;
diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc
index 3f8237188d..5ce33a3731 100644
--- a/cpp/src/arrow/record_batch.cc
+++ b/cpp/src/arrow/record_batch.cc
@@ -493,8 +493,10 @@ Status EnumerateStatistics(const RecordBatch&
record_batch, OnStatistics on_stat
RETURN_NOT_OK(on_statistics(statistics));
statistics.start_new_column = false;
- const auto num_fields = record_batch.schema()->num_fields();
+ const auto& schema = record_batch.schema();
+ const auto num_fields = schema->num_fields();
for (int nth_column = 0; nth_column < num_fields; ++nth_column) {
+ const auto& field = schema->field(nth_column);
auto column_statistics = record_batch.column(nth_column)->statistics();
if (!column_statistics) {
continue;
@@ -527,7 +529,7 @@ Status EnumerateStatistics(const RecordBatch& record_batch,
OnStatistics on_stat
} else {
statistics.key = ARROW_STATISTICS_KEY_MIN_VALUE_APPROXIMATE;
}
- statistics.type = column_statistics->MinArrowType();
+ statistics.type = column_statistics->MinArrowType(field->type());
statistics.value = column_statistics->min.value();
RETURN_NOT_OK(on_statistics(statistics));
statistics.start_new_column = false;
@@ -540,7 +542,7 @@ Status EnumerateStatistics(const RecordBatch& record_batch,
OnStatistics on_stat
} else {
statistics.key = ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE;
}
- statistics.type = column_statistics->MaxArrowType();
+ statistics.type = column_statistics->MaxArrowType(field->type());
statistics.value = column_statistics->max.value();
RETURN_NOT_OK(on_statistics(statistics));
statistics.start_new_column = false;
diff --git a/cpp/src/arrow/record_batch_test.cc
b/cpp/src/arrow/record_batch_test.cc
index 21202c6acb..21d51ae506 100644
--- a/cpp/src/arrow/record_batch_test.cc
+++ b/cpp/src/arrow/record_batch_test.cc
@@ -1116,7 +1116,7 @@ Result<std::shared_ptr<Array>> MakeStatisticsArray(
}
keys_indices.push_back(key_index);
- auto values_type = ArrayStatistics::ValueToArrowType(value);
+ auto values_type = ArrayStatistics::ValueToArrowType(value,
arrow::null());
int8_t values_type_code = 0;
for (; values_type_code < static_cast<int32_t>(values_types.size());
++values_type_code) {