This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 4d625b37cc GH-45732: [C++][Compute] Accept more pivot key types (#45945)
4d625b37cc is described below
commit 4d625b37ccb6d1cff1f1c47138318acc6751d4b0
Author: Antoine Pitrou <[email protected]>
AuthorDate: Wed Apr 2 14:10:04 2025 +0200
GH-45732: [C++][Compute] Accept more pivot key types (#45945)
### Rationale for this change
Allow the `pivot_wider` and `hash_pivot_wider` functions to accept an
integral pivot key column, in addition to binary-like.
Since the `key_names` option is a vector of strings, they are cast to the
appropriate pivot key type for matching.
### Are these changes tested?
Yes, by new unit tests.
### Are there any user-facing changes?
No.
* GitHub Issue: #45732
Lead-authored-by: Antoine Pitrou <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/arrow/acero/hash_aggregate_test.cc | 60 +++++-
cpp/src/arrow/compute/api_aggregate.h | 7 +-
cpp/src/arrow/compute/exec.h | 2 +-
cpp/src/arrow/compute/kernels/aggregate_pivot.cc | 83 +++++----
cpp/src/arrow/compute/kernels/aggregate_test.cc | 42 ++++-
.../arrow/compute/kernels/hash_aggregate_pivot.cc | 70 ++++---
cpp/src/arrow/compute/kernels/pivot_internal.cc | 173 +++++++++--------
cpp/src/arrow/compute/kernels/pivot_internal.h | 15 +-
docs/source/cpp/compute.rst | 204 ++++++++++-----------
9 files changed, 408 insertions(+), 248 deletions(-)
diff --git a/cpp/src/arrow/acero/hash_aggregate_test.cc b/cpp/src/arrow/acero/hash_aggregate_test.cc
index 1c456c2fd7..dce0e44eb1 100644
--- a/cpp/src/arrow/acero/hash_aggregate_test.cc
+++ b/cpp/src/arrow/acero/hash_aggregate_test.cc
@@ -4440,7 +4440,7 @@ TEST_P(GroupBy, PivotBasics) {
}
}
-TEST_P(GroupBy, PivotAllKeyTypes) {
+TEST_P(GroupBy, PivotBinaryKeyTypes) {
auto value_type = float32();
std::vector<std::string> table_json = {R"([
[1, "width", 10.5],
@@ -4462,6 +4462,49 @@ TEST_P(GroupBy, PivotAllKeyTypes) {
ARROW_SCOPED_TRACE("key_type = ", *key_type);
TestPivot(key_type, value_type, options, table_json, expected_json);
}
+
+ auto key_type = fixed_size_binary(3);
+ table_json = {R"([
+ [1, "wid", 10.5],
+ [2, "wid", 11.5]
+ ])",
+ R"([
+ [2, "hei", 12.5],
+ [3, "wid", 13.5],
+ [1, "hei", 14.5]
+ ])"};
+ expected_json = R"([
+ [1, {"hei": 14.5, "wid": 10.5} ],
+ [2, {"hei": 12.5, "wid": 11.5} ],
+ [3, {"hei": null, "wid": 13.5} ]
+ ])";
+ options.key_names = {"hei", "wid"};
+ ARROW_SCOPED_TRACE("key_type = ", *key_type);
+ TestPivot(key_type, value_type, options, table_json, expected_json);
+}
+
+TEST_P(GroupBy, PivotIntegerKeyTypes) {
+ auto value_type = float32();
+ std::vector<std::string> table_json = {R"([
+ [1, 78, 10.5],
+ [2, 78, 11.5]
+ ])",
+ R"([
+ [2, 56, 12.5],
+ [3, 78, 13.5],
+ [1, 56, 14.5]
+ ])"};
+ std::string expected_json = R"([
+ [1, {"56": 14.5, "78": 10.5} ],
+ [2, {"56": 12.5, "78": 11.5} ],
+ [3, {"56": null, "78": 13.5} ]
+ ])";
+ PivotWiderOptions options(/*key_names=*/{"56", "78"});
+
+ for (const auto& key_type : IntTypes()) {
+ ARROW_SCOPED_TRACE("key_type = ", *key_type);
+ TestPivot(key_type, value_type, options, table_json, expected_json);
+ }
}
TEST_P(GroupBy, PivotNumericValues) {
@@ -4749,6 +4792,21 @@ TEST_P(GroupBy, PivotDuplicateKeys) {
RunPivot(key_type, value_type, options, table_json));
}
+TEST_P(GroupBy, PivotInvalidKeys) {
+ // Integer key type, but key names cannot be converted to int
+ auto key_type = int32();
+ auto value_type = float32();
+ std::vector<std::string> table_json = {R"([])"};
+ PivotWiderOptions options(/*key_names=*/{"123", "width"});
+ EXPECT_RAISES_WITH_MESSAGE_THAT(
+ Invalid, HasSubstr("Failed to parse string: 'width' as a scalar of type int32"),
+ RunPivot(key_type, value_type, options, table_json));
+ options.key_names = {"12.3", "45"};
+ EXPECT_RAISES_WITH_MESSAGE_THAT(
+ Invalid, HasSubstr("Failed to parse string: '12.3' as a scalar of type int32"),
+ RunPivot(key_type, value_type, options, table_json));
+}
+
TEST_P(GroupBy, PivotDuplicateValues) {
auto key_type = utf8();
auto value_type = float32();
diff --git a/cpp/src/arrow/compute/api_aggregate.h b/cpp/src/arrow/compute/api_aggregate.h
index 1d9076f6ba..8930d04de5 100644
--- a/cpp/src/arrow/compute/api_aggregate.h
+++ b/cpp/src/arrow/compute/api_aggregate.h
@@ -202,9 +202,10 @@ class ARROW_EXPORT TDigestOptions : public FunctionOptions {
/// - The corresponding `Aggregate::target` must have two FieldRef elements;
/// the first one points to the pivot key column, the second points to the
/// pivoted data column.
-/// - The pivot key column must be string-like; its values will be matched
-/// against `key_names` in order to dispatch the pivoted data into the
-/// output.
+/// - The pivot key column can be string, binary or integer; its values will be
+/// matched against `key_names` in order to dispatch the pivoted data into
+/// the output. If the pivot key column is not string-like, the `key_names`
+/// will be cast to the pivot key type.
///
/// "pivot_wider" example
/// ---------------------
diff --git a/cpp/src/arrow/compute/exec.h b/cpp/src/arrow/compute/exec.h
index 3fbefe4a1a..dae7e1ea68 100644
--- a/cpp/src/arrow/compute/exec.h
+++ b/cpp/src/arrow/compute/exec.h
@@ -276,7 +276,7 @@ struct ExecValue {
ArraySpan array = {};
const Scalar* scalar = NULLPTR;
- ExecValue(Scalar* scalar) // NOLINT implicit conversion
+ ExecValue(const Scalar* scalar) // NOLINT implicit conversion
: scalar(scalar) {}
ExecValue(ArraySpan array) // NOLINT implicit conversion
diff --git a/cpp/src/arrow/compute/kernels/aggregate_pivot.cc b/cpp/src/arrow/compute/kernels/aggregate_pivot.cc
index 3ff6327ec9..f3571621e4 100644
--- a/cpp/src/arrow/compute/kernels/aggregate_pivot.cc
+++ b/cpp/src/arrow/compute/kernels/aggregate_pivot.cc
@@ -22,6 +22,7 @@
#include "arrow/scalar.h"
#include "arrow/util/bit_run_reader.h"
#include "arrow/util/logging.h"
+#include "arrow/visit_data_inline.h"
namespace arrow::compute::internal {
namespace {
@@ -30,7 +31,8 @@ using arrow::internal::VisitSetBitRunsVoid;
using arrow::util::span;
struct PivotImpl : public ScalarAggregator {
- Status Init(const PivotWiderOptions& options, const std::vector<TypeHolder>&
in_types) {
+ Status Init(const PivotWiderOptions& options, const std::vector<TypeHolder>&
in_types,
+ ExecContext* ctx) {
options_ = &options;
key_type_ = in_types[0].GetSharedPtr();
auto value_type = in_types[1].GetSharedPtr();
@@ -42,47 +44,57 @@ struct PivotImpl : public ScalarAggregator {
values_.push_back(MakeNullScalar(value_type));
}
out_type_ = struct_(std::move(fields));
- ARROW_ASSIGN_OR_RAISE(key_mapper_, PivotWiderKeyMapper::Make(*key_type_,
options_));
+ ARROW_ASSIGN_OR_RAISE(key_mapper_,
+ PivotWiderKeyMapper::Make(*key_type_, options_,
ctx));
return Status::OK();
}
Status Consume(KernelContext*, const ExecSpan& batch) override {
DCHECK_EQ(batch.num_values(), 2);
if (batch[0].is_array()) {
- ARROW_ASSIGN_OR_RAISE(span<const PivotWiderKeyIndex> keys,
- key_mapper_->MapKeys(batch[0].array));
+ ARROW_ASSIGN_OR_RAISE(auto keys_array,
key_mapper_->MapKeys(batch[0].array));
+ DCHECK_EQ(keys_array->type->id(), Type::UINT32);
+ ArraySpan keys_span(*keys_array);
if (batch[1].is_array()) {
// Array keys, array values
auto values = batch[1].array.ToArray();
- for (int64_t i = 0; i < batch.length; ++i) {
- PivotWiderKeyIndex key = keys[i];
- if (key != kNullPivotKey && !values->IsNull(i)) {
- if (ARROW_PREDICT_FALSE(values_[key]->is_valid)) {
- return DuplicateValue();
- }
- ARROW_ASSIGN_OR_RAISE(values_[key], values->GetScalar(i));
- DCHECK(values_[key]->is_valid);
- }
- }
+ int64_t i = 0;
+ RETURN_NOT_OK(VisitArraySpanInline<UInt32Type>(
+ keys_span,
+ [&](uint32_t key) {
+ if (!values->IsNull(i)) {
+ if (ARROW_PREDICT_FALSE(values_[key]->is_valid)) {
+ return DuplicateValue();
+ }
+ ARROW_ASSIGN_OR_RAISE(values_[key], values->GetScalar(i));
+ }
+ ++i;
+ return Status::OK();
+ },
+ [&]() {
+ ++i;
+ return Status::OK();
+ }));
} else {
// Array keys, scalar value
const Scalar* value = batch[1].scalar;
if (value->is_valid) {
- for (int64_t i = 0; i < batch.length; ++i) {
- PivotWiderKeyIndex key = keys[i];
- if (key != kNullPivotKey) {
- if (ARROW_PREDICT_FALSE(values_[key]->is_valid)) {
- return DuplicateValue();
- }
- values_[key] = value->GetSharedPtr();
- }
- }
+ RETURN_NOT_OK(VisitArraySpanInline<UInt32Type>(
+ keys_span,
+ [&](uint32_t key) {
+ if (ARROW_PREDICT_FALSE(values_[key]->is_valid)) {
+ return DuplicateValue();
+ }
+ values_[key] = value->GetSharedPtr();
+ return Status::OK();
+ },
+ [] { return Status::OK(); }));
}
}
} else {
- ARROW_ASSIGN_OR_RAISE(PivotWiderKeyIndex key,
- key_mapper_->MapKey(*batch[0].scalar));
- if (key != kNullPivotKey) {
+ ARROW_ASSIGN_OR_RAISE(auto maybe_key,
key_mapper_->MapKey(*batch[0].scalar));
+ if (maybe_key.has_value()) {
+ PivotWiderKeyIndex key = maybe_key.value();
if (batch[1].is_array()) {
// Scalar key, array values
auto values = batch[1].array.ToArray();
@@ -145,10 +157,8 @@ struct PivotImpl : public ScalarAggregator {
Result<std::unique_ptr<KernelState>> PivotInit(KernelContext* ctx,
const KernelInitArgs& args) {
const auto& options = checked_cast<const PivotWiderOptions&>(*args.options);
- DCHECK_EQ(args.inputs.size(), 2);
- DCHECK(is_base_binary_like(args.inputs[0].id()));
auto state = std::make_unique<PivotImpl>();
- RETURN_NOT_OK(state->Init(options, args.inputs));
+ RETURN_NOT_OK(state->Init(options, args.inputs, ctx->exec_context()));
// GH-45718: This can be simplified once we drop the R openSUSE155 crossbow
// job
// R build with openSUSE155 requires an explicit shared_ptr construction
@@ -167,6 +177,8 @@ const FunctionDoc pivot_doc{
"is emitted. If a pivot key doesn't appear, null is emitted.\n"
"If more than one non-null value is encountered for a given pivot key,\n"
"Invalid is raised.\n"
+ "The pivot key column can be string, binary or integer. The `key_names`\n"
+ "will be cast to the pivot key column type for matching.\n"
"Behavior of unexpected pivot keys is controlled by
`unexpected_key_behavior`\n"
"in PivotWiderOptions."),
{"pivot_keys", "pivot_values"},
@@ -179,12 +191,19 @@ void RegisterScalarAggregatePivot(FunctionRegistry* registry) {
auto func = std::make_shared<ScalarAggregateFunction>(
"pivot_wider", Arity::Binary(), pivot_doc, &default_pivot_options);
-
- for (auto key_type : BaseBinaryTypes()) {
- auto sig = KernelSignature::Make({key_type->id(), InputType::Any()},
+ auto add_kernel = [&](InputType key_type) {
+ auto sig = KernelSignature::Make({key_type, InputType::Any()},
OutputType(ResolveOutputType));
AddAggKernel(std::move(sig), PivotInit, func.get());
+ };
+
+ for (const auto& key_type : BaseBinaryTypes()) {
+ add_kernel(key_type->id());
+ }
+ for (const auto& key_type : IntTypes()) {
+ add_kernel(key_type->id());
}
+ add_kernel(Type::FIXED_SIZE_BINARY);
DCHECK_OK(registry->AddFunction(std::move(func)));
}
diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc
index ec012a42cd..d821fc7e2c 100644
--- a/cpp/src/arrow/compute/kernels/aggregate_test.cc
+++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc
@@ -4504,10 +4504,9 @@ TEST_F(TestPivotKernel, Basics) {
PivotWiderOptions(/*key_names=*/{"height", "width"}));
}
-TEST_F(TestPivotKernel, AllKeyTypes) {
+TEST_F(TestPivotKernel, BinaryKeyTypes) {
+ auto value_type = float32();
for (auto key_type : BaseBinaryTypes()) {
- auto value_type = float32();
-
auto keys = ArrayFromJSON(key_type, R"(["width", "height"])");
auto values = ArrayFromJSON(value_type, "[10.5, 11.5]");
auto expected =
@@ -4516,6 +4515,25 @@ TEST_F(TestPivotKernel, AllKeyTypes) {
AssertPivot(keys, values, *expected,
PivotWiderOptions(/*key_names=*/{"height", "width"}));
}
+ auto key_type = fixed_size_binary(3);
+ auto keys = ArrayFromJSON(key_type, R"(["wid", "hei"])");
+ auto values = ArrayFromJSON(value_type, "[10.5, 11.5]");
+ auto expected = ScalarFromJSON(
+ struct_({field("hei", value_type), field("wid", value_type)}), "[11.5, 10.5]");
+ AssertPivot(keys, values, *expected, PivotWiderOptions(/*key_names=*/{"hei",
"wid"}));
+}
+
+TEST_F(TestPivotKernel, IntegerKeyTypes) {
+ // It is possible to use an integer key column, while passing its string equivalent
+ // in PivotWiderOptions::key_names.
+ auto value_type = float32();
+ for (auto key_type : IntTypes()) {
+ auto keys = ArrayFromJSON(key_type, "[34, 12]");
+ auto values = ArrayFromJSON(value_type, "[10.5, 11.5]");
+ auto expected = ScalarFromJSON(
+ struct_({field("12", value_type), field("34", value_type)}), "[11.5, 10.5]");
+ AssertPivot(keys, values, *expected,
PivotWiderOptions(/*key_names=*/{"12", "34"}));
+ }
}
TEST_F(TestPivotKernel, Numbers) {
@@ -4724,6 +4742,24 @@ TEST_F(TestPivotKernel, DuplicateKeyNames) {
CallFunction("pivot_wider", {keys, values}, &options));
}
+TEST_F(TestPivotKernel, InvalidKeyName) {
+ auto key_type = int32();
+ auto value_type = float32();
+
+ auto keys = ArrayFromJSON(key_type, "[]");
+ auto values = ArrayFromJSON(value_type, "[]");
+ auto options = PivotWiderOptions(/*key_names=*/{"123", "width"});
+ EXPECT_RAISES_WITH_MESSAGE_THAT(
+ Invalid,
+ ::testing::HasSubstr("Failed to parse string: 'width' as a scalar of type int32"),
+ CallFunction("pivot_wider", {keys, values}, &options));
+ options.key_names = {"12.3", "45"};
+ EXPECT_RAISES_WITH_MESSAGE_THAT(
+ Invalid,
+ ::testing::HasSubstr("Failed to parse string: '12.3' as a scalar of type int32"),
+ CallFunction("pivot_wider", {keys, values}, &options));
+}
+
TEST_F(TestPivotKernel, DuplicateValues) {
auto key_type = utf8();
auto value_type = float32();
diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate_pivot.cc b/cpp/src/arrow/compute/kernels/hash_aggregate_pivot.cc
index c3dc070e4f..3833d4ddb7 100644
--- a/cpp/src/arrow/compute/kernels/hash_aggregate_pivot.cc
+++ b/cpp/src/arrow/compute/kernels/hash_aggregate_pivot.cc
@@ -29,6 +29,7 @@
#include "arrow/compute/kernels/hash_aggregate_internal.h"
#include "arrow/compute/kernels/pivot_internal.h"
#include "arrow/compute/row/grouper.h"
+#include "arrow/util/bit_block_counter.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/span.h"
#include "arrow/visit_type_inline.h"
@@ -54,7 +55,7 @@ struct GroupedPivotAccumulator {
return Status::OK();
}
- Status Consume(span<const uint32_t> groups, span<const PivotWiderKeyIndex>
keys,
+ Status Consume(span<const uint32_t> groups, const
std::shared_ptr<ArrayData>& keys,
const ArraySpan& values) {
// To dispatch the values into the right (group, key) coordinates,
// we first compute a vector of take indices for each output column.
@@ -78,7 +79,8 @@ struct GroupedPivotAccumulator {
// respective take_indices for the column's keys.
//
- DCHECK_EQ(groups.size(), keys.size());
+ DCHECK_EQ(keys->type->id(), Type::UINT32);
+ DCHECK_EQ(groups.size(), static_cast<size_t>(keys->length));
DCHECK_EQ(groups.size(), static_cast<size_t>(values.length));
std::shared_ptr<DataType> take_index_type;
@@ -118,20 +120,28 @@ struct GroupedPivotAccumulator {
DCHECK_LE(offset, scratch_buffer_.capacity());
// Populate the take_indices for each output column
- for (int64_t i = 0; i < values.length; ++i) {
- const PivotWiderKeyIndex key = keys[i];
- if (key != kNullPivotKey && !values.IsNull(i)) {
- DCHECK_LT(static_cast<int>(key), num_keys_);
- const uint32_t group = groups[i];
- if (bit_util::GetBit(take_bitmap_data[key], group)) {
- return DuplicateValue();
- }
- // For row #group in column #key, we are going to take the value at index #i
- bit_util::SetBit(take_bitmap_data[key], group);
- take_indices_data[key][group] = static_cast<TakeIndex>(i);
- }
- }
- return Status::OK();
+ const uint8_t* keys_null_bitmap =
+ (keys->GetNullCount() != 0) ? keys->GetValues<uint8_t>(0, 0) :
nullptr;
+ const uint32_t* key_values = keys->GetValues<uint32_t>(1);
+ const uint8_t* values_null_bitmap =
+ (values.GetNullCount() != 0) ? values.GetValues<uint8_t>(0, 0) :
nullptr;
+ return ::arrow::internal::VisitTwoBitBlocks(
+ keys_null_bitmap, keys->offset, values_null_bitmap, values.offset,
+ values.length,
+ [&](int64_t i) {
+ // Non-null key, non-null value
+ const uint32_t group = groups[i];
+ const uint32_t key = key_values[i];
+ DCHECK_LT(static_cast<int>(key), num_keys_);
+ if (ARROW_PREDICT_FALSE(bit_util::GetBit(take_bitmap_data[key],
group))) {
+ return DuplicateValue();
+ }
+ // For row #group in column #key, we are going to take the value at index #i
+ bit_util::SetBit(take_bitmap_data[key], group);
+ take_indices_data[key][group] = static_cast<TakeIndex>(i);
+ return Status::OK();
+ },
+ [] { return Status::OK(); });
};
// Call compute_take_indices with the optimal integer width
@@ -166,12 +176,13 @@ struct GroupedPivotAccumulator {
return MergeColumns(std::move(new_columns));
}
- Status Consume(span<const uint32_t> groups, const PivotWiderKeyIndex key,
+ Status Consume(span<const uint32_t> groups,
std::optional<PivotWiderKeyIndex> maybe_key,
const ArraySpan& values) {
- if (key == kNullPivotKey) {
+ if (!maybe_key.has_value()) {
// Nothing to update
return Status::OK();
}
+ const auto key = maybe_key.value();
DCHECK_LT(static_cast<int>(key), num_keys_);
DCHECK_EQ(groups.size(), static_cast<size_t>(values.length));
@@ -381,7 +392,8 @@ struct GroupedPivotImpl : public GroupedAggregator {
}
out_type_ = struct_(std::move(fields));
out_struct_type_ = checked_cast<const StructType*>(out_type_.get());
- ARROW_ASSIGN_OR_RAISE(key_mapper_, PivotWiderKeyMapper::Make(*key_type_,
options_));
+ ARROW_ASSIGN_OR_RAISE(key_mapper_,
+ PivotWiderKeyMapper::Make(*key_type_, options_,
ctx));
RETURN_NOT_OK(accumulator_.Init(ctx, value_type, options_));
return Status::OK();
}
@@ -404,11 +416,11 @@ struct GroupedPivotImpl : public GroupedAggregator {
return Status::NotImplemented("Consuming scalar pivot value");
}
if (batch[0].is_array()) {
- ARROW_ASSIGN_OR_RAISE(span<const PivotWiderKeyIndex> keys,
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ArrayData> keys,
key_mapper_->MapKeys(batch[0].array));
return accumulator_.Consume(groups, keys, batch[1].array);
} else {
- ARROW_ASSIGN_OR_RAISE(PivotWiderKeyIndex key,
+ ARROW_ASSIGN_OR_RAISE(std::optional<PivotWiderKeyIndex> key,
key_mapper_->MapKey(*batch[0].scalar));
return accumulator_.Consume(groups, key, batch[1].array);
}
@@ -444,6 +456,8 @@ const FunctionDoc hash_pivot_doc{
"is emitted. If a pivot key doesn't appear in a given group, null is
emitted.\n"
"If more than one non-null value is encountered in the same group for a\n"
"given pivot key, Invalid is raised.\n"
+ "The pivot key column can be string, binary or integer. The `key_names`\n"
+ "will be cast to the pivot key column type for matching.\n"
"Behavior of unexpected pivot keys is controlled by
`unexpected_key_behavior`\n"
"in PivotWiderOptions."),
{"pivot_keys", "pivot_values", "group_id_array"},
@@ -457,14 +471,20 @@ void RegisterHashAggregatePivot(FunctionRegistry* registry) {
{
auto func = std::make_shared<HashAggregateFunction>(
"hash_pivot_wider", Arity::Ternary(), hash_pivot_doc,
&default_pivot_options);
- for (auto key_type : BaseBinaryTypes()) {
+ auto add_kernel = [&](InputType type) {
// Anything that scatter() (i.e. take()) accepts can be passed as values
- auto sig = KernelSignature::Make(
- {key_type->id(), InputType::Any(), InputType(Type::UINT32)},
- OutputType(ResolveGroupOutputType));
+ auto sig = KernelSignature::Make({type, InputType::Any(),
InputType(Type::UINT32)},
+ OutputType(ResolveGroupOutputType));
DCHECK_OK(func->AddKernel(
MakeKernel(std::move(sig), HashAggregateInit<GroupedPivotImpl>)));
+ };
+ for (const auto& key_type : BaseBinaryTypes()) {
+ add_kernel(key_type->id());
+ }
+ for (const auto& key_type : IntTypes()) {
+ add_kernel(key_type->id());
}
+ add_kernel(Type::FIXED_SIZE_BINARY);
DCHECK_OK(registry->AddFunction(std::move(func)));
}
}
diff --git a/cpp/src/arrow/compute/kernels/pivot_internal.cc b/cpp/src/arrow/compute/kernels/pivot_internal.cc
index 7a65ddc212..72d96213c9 100644
--- a/cpp/src/arrow/compute/kernels/pivot_internal.cc
+++ b/cpp/src/arrow/compute/kernels/pivot_internal.cc
@@ -18,110 +18,139 @@
#include "arrow/compute/kernels/pivot_internal.h"
#include <cstdint>
+#include <string_view>
+#include <unordered_set>
+#include "arrow/array/array_primitive.h"
+#include "arrow/array/builder_binary.h"
+#include "arrow/compute/cast.h"
#include "arrow/compute/exec.h"
#include "arrow/compute/kernels/codegen_internal.h"
+#include "arrow/compute/row/grouper.h"
#include "arrow/scalar.h"
#include "arrow/type_traits.h"
+#include "arrow/util/bit_run_reader.h"
#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/unreachable.h"
#include "arrow/visit_type_inline.h"
namespace arrow::compute::internal {
using ::arrow::util::span;
-struct BasePivotKeyMapper : public PivotWiderKeyMapper {
- Status Init(const PivotWiderOptions* options) override {
- if (options->key_names.size() > static_cast<size_t>(kMaxPivotKey) + 1) {
+struct ConcretePivotWiderKeyMapper : public PivotWiderKeyMapper {
+ Status Init(const DataType& key_type, const PivotWiderOptions* options,
+ ExecContext* ctx) {
+ if (options->key_names.size() > static_cast<size_t>(kMaxPivotKey)) {
return Status::NotImplemented("Pivoting to more than ",
- static_cast<size_t>(kMaxPivotKey) + 1,
- " columns: got ",
options->key_names.size());
+ static_cast<size_t>(kMaxPivotKey), " columns: got ",
+ options->key_names.size());
}
- key_name_map_.reserve(options->key_names.size());
- PivotWiderKeyIndex index = 0;
- for (const auto& key_name : options->key_names) {
- bool inserted =
- key_name_map_.try_emplace(std::string_view(key_name),
index++).second;
- if (!inserted) {
- return Status::KeyError("Duplicate key name '", key_name,
- "' in PivotWiderOptions");
+ unexpected_key_behavior_ = options->unexpected_key_behavior;
+ ARROW_ASSIGN_OR_RAISE(grouper_, Grouper::Make({&key_type}, ctx));
+ // Build a binary array of the pivot key values, and cast it to the desired key type
+ BinaryBuilder builder(ctx->memory_pool());
+ RETURN_NOT_OK(builder.Reserve(options->key_names.size()));
+ int64_t total_length = 0;
+ for (const auto& key : options->key_names) {
+ total_length += static_cast<int64_t>(key.length());
+ }
+ RETURN_NOT_OK(builder.ReserveData(total_length));
+ for (const auto& key : options->key_names) {
+ builder.UnsafeAppend(key);
+ }
+ ARROW_ASSIGN_OR_RAISE(auto binary_key_array, builder.Finish());
+ ARROW_ASSIGN_OR_RAISE(auto key_array,
+ Cast(*binary_key_array, &key_type,
CastOptions::Safe(), ctx));
+ // Populate the grouper with the keys from the array
+ ExecSpan batch({ExecValue(*key_array->data())}, key_array->length());
+ RETURN_NOT_OK(grouper_->Populate(batch));
+ if (grouper_->num_groups() != options->key_names.size()) {
+ // There's a duplicate key, find it to emit a nicer error message
+ std::unordered_set<std::string_view> seen;
+ for (const auto& key : options->key_names) {
+ auto [_, inserted] = seen.emplace(key);
+ if (!inserted) {
+ return Status::KeyError("Duplicate key name '", key, "' in PivotWiderOptions");
+ }
}
+ Unreachable("Grouper doesn't agree with std::unordered_set");
}
- unexpected_key_behavior_ = options->unexpected_key_behavior;
return Status::OK();
}
- protected:
- Result<PivotWiderKeyIndex> KeyNotFound(std::string_view key_name) {
- if (unexpected_key_behavior_ == PivotWiderOptions::kIgnore) {
- return kNullPivotKey;
+ Result<std::shared_ptr<ArrayData>> MapKeys(const ArraySpan& array) override {
+ if (array.GetNullCount() != 0) {
+ return NullKeyName();
}
- DCHECK_EQ(unexpected_key_behavior_, PivotWiderOptions::kRaise);
- return Status::KeyError("Unexpected pivot key: ", key_name);
+ return MapKeysInternal(array, array.length);
}
- Result<PivotWiderKeyIndex> LookupKey(std::string_view key_name) {
- const auto it = this->key_name_map_.find(key_name);
- if (ARROW_PREDICT_FALSE(it == this->key_name_map_.end())) {
- return KeyNotFound(key_name);
+ Result<std::optional<PivotWiderKeyIndex>> MapKey(const Scalar& scalar)
override {
+ if (!scalar.is_valid) {
+ return NullKeyName();
+ }
+ ARROW_ASSIGN_OR_RAISE(auto group_id_array, MapKeysInternal(&scalar,
/*length=*/1));
+ DCHECK_EQ(group_id_array->length, 1);
+ if (group_id_array->GetNullCount() == 0) {
+ return group_id_array->GetValues<uint32_t>(1)[0];
} else {
- return it->second;
+ // For UnexpectedKeyBehavior::kIgnore
+ return std::nullopt;
+ }
+ }
+
+ protected:
+ Result<std::shared_ptr<ArrayData>> MapKeysInternal(const ExecValue& values,
+ int64_t length) {
+ ARROW_ASSIGN_OR_RAISE(auto result, grouper_->Lookup(ExecSpan({values},
length)));
+ DCHECK(result.is_array());
+ DCHECK_EQ(result.type()->id(), Type::UINT32);
+ auto group_id_array = result.array();
+ const bool has_nulls = (group_id_array->GetNullCount() != 0);
+ if (ARROW_PREDICT_FALSE(has_nulls) &&
+ unexpected_key_behavior_ == PivotWiderOptions::kRaise) {
+ // Extract unexpected key name, to emit a nicer error message
+ int64_t null_pos = 0;
+ DCHECK_NE(group_id_array->buffers[0], nullptr);
+ ::arrow::internal::BitRunReader
bit_run_reader(group_id_array->buffers[0]->data(),
+ group_id_array->offset,
+ group_id_array->length);
+ // Search the first unset validity bit, indicating the first unexpected key
+ for (;;) {
+ auto run = bit_run_reader.NextRun();
+ if (run.length == 0 || !run.set) {
+ break;
+ }
+ null_pos += run.length;
+ }
+ DCHECK_LT(null_pos, group_id_array->length);
+ DCHECK_LT(null_pos, values.length());
+ std::shared_ptr<Scalar> key_scalar;
+ if (values.is_scalar()) {
+ DCHECK_EQ(null_pos, 0);
+ key_scalar = values.scalar->GetSharedPtr();
+ } else {
+ ARROW_ASSIGN_OR_RAISE(key_scalar,
values.array.ToArray()->GetScalar(null_pos));
+ }
+ return Status::KeyError("Unexpected pivot key: ",
key_scalar->ToString());
}
+ return group_id_array;
}
Status NullKeyName() { return Status::KeyError("pivot key name cannot be null"); }
- // The strings backing the string_views should be kept alive by PivotWiderOptions.
- std::unordered_map<std::string_view, PivotWiderKeyIndex> key_name_map_;
+ std::unique_ptr<Grouper> grouper_;
PivotWiderOptions::UnexpectedKeyBehavior unexpected_key_behavior_;
- TypedBufferBuilder<PivotWiderKeyIndex> key_indices_buffer_;
-};
-
-template <typename KeyType>
-struct TypedPivotKeyMapper : public BasePivotKeyMapper {
- Result<span<const PivotWiderKeyIndex>> MapKeys(const ArraySpan& array)
override {
- // XXX Should use a faster hashing facility than unordered_map, for example
- // Grouper or SwissTable.
- RETURN_NOT_OK(this->key_indices_buffer_.Reserve(array.length));
- PivotWiderKeyIndex* key_indices = this->key_indices_buffer_.mutable_data();
- int64_t i = 0;
- RETURN_NOT_OK(VisitArrayValuesInline<KeyType>(
- array,
- [&](std::string_view key_name) {
- ARROW_ASSIGN_OR_RAISE(key_indices[i], LookupKey(key_name));
- ++i;
- return Status::OK();
- },
- [&]() { return NullKeyName(); }));
- return span(key_indices, array.length);
- }
-
- Result<PivotWiderKeyIndex> MapKey(const Scalar& scalar) override {
- if (!scalar.is_valid) {
- return NullKeyName();
- }
- const auto& binary_scalar = checked_cast<const BaseBinaryScalar&>(scalar);
- return LookupKey(binary_scalar.view());
- }
+ std::shared_ptr<Buffer> last_group_ids_;
};
Result<std::unique_ptr<PivotWiderKeyMapper>> PivotWiderKeyMapper::Make(
- const DataType& key_type, const PivotWiderOptions* options) {
- std::unique_ptr<PivotWiderKeyMapper> instance;
-
- auto visit_key_type =
- [&](auto&& key_type) -> Result<std::unique_ptr<PivotWiderKeyMapper>> {
- using T = std::decay_t<decltype(key_type)>;
- // Only binary-like keys are supported for now
- if constexpr (is_base_binary_type<T>::value) {
- instance = std::make_unique<TypedPivotKeyMapper<T>>();
- RETURN_NOT_OK(instance->Init(options));
- return std::move(instance);
- }
- return Status::NotImplemented("Pivot key type: ", key_type);
- };
-
- return VisitType(key_type, visit_key_type);
+ const DataType& key_type, const PivotWiderOptions* options, ExecContext*
ctx) {
+ auto instance = std::make_unique<ConcretePivotWiderKeyMapper>();
+ RETURN_NOT_OK(instance->Init(key_type, options, ctx));
+ return instance;
}
} // namespace arrow::compute::internal
diff --git a/cpp/src/arrow/compute/kernels/pivot_internal.h b/cpp/src/arrow/compute/kernels/pivot_internal.h
index faa808b7a2..9504e9f0a8 100644
--- a/cpp/src/arrow/compute/kernels/pivot_internal.h
+++ b/cpp/src/arrow/compute/kernels/pivot_internal.h
@@ -20,32 +20,29 @@
#include <cstdint>
#include <limits>
#include <memory>
+#include <optional>
#include "arrow/compute/api_aggregate.h"
#include "arrow/compute/type_fwd.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type_fwd.h"
-#include "arrow/util/span.h"
namespace arrow::compute::internal {
-using PivotWiderKeyIndex = uint8_t;
+using PivotWiderKeyIndex = uint32_t;
-constexpr PivotWiderKeyIndex kNullPivotKey =
+constexpr PivotWiderKeyIndex kMaxPivotKey =
std::numeric_limits<PivotWiderKeyIndex>::max();
-constexpr PivotWiderKeyIndex kMaxPivotKey = kNullPivotKey - 1;
struct PivotWiderKeyMapper {
virtual ~PivotWiderKeyMapper() = default;
- virtual Status Init(const PivotWiderOptions* options) = 0;
- virtual Result<::arrow::util::span<const PivotWiderKeyIndex>> MapKeys(
- const ArraySpan&) = 0;
- virtual Result<PivotWiderKeyIndex> MapKey(const Scalar&) = 0;
+ virtual Result<std::shared_ptr<ArrayData>> MapKeys(const ArraySpan&) = 0;
+ virtual Result<std::optional<PivotWiderKeyIndex>> MapKey(const Scalar&) = 0;
static Result<std::unique_ptr<PivotWiderKeyMapper>> Make(
- const DataType& key_type, const PivotWiderOptions* options);
+ const DataType& key_type, const PivotWiderOptions* options, ExecContext*
ctx);
};
} // namespace arrow::compute::internal
diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst
index 6bbcac0074..051ed7da4e 100644
--- a/docs/source/cpp/compute.rst
+++ b/docs/source/cpp/compute.rst
@@ -199,57 +199,57 @@ Aggregations
Scalar aggregations operate on a (chunked) array or scalar value and reduce
the input to a single output value.
-+--------------------+---------+------------------+------------------------+----------------------------------+-------+
-| Function name | Arity | Input types | Output type |
Options class | Notes |
-+====================+=========+==================+========================+==================================+=======+
-| all | Unary | Boolean | Scalar Boolean |
:struct:`ScalarAggregateOptions` | \(1) |
-+--------------------+---------+------------------+------------------------+----------------------------------+-------+
-| any | Unary | Boolean | Scalar Boolean |
:struct:`ScalarAggregateOptions` | \(1) |
-+--------------------+---------+------------------+------------------------+----------------------------------+-------+
-| approximate_median | Unary | Numeric | Scalar Float64 |
:struct:`ScalarAggregateOptions` | |
-+--------------------+---------+------------------+------------------------+----------------------------------+-------+
-| count | Unary | Any | Scalar Int64 |
:struct:`CountOptions` | \(2) |
-+--------------------+---------+------------------+------------------------+----------------------------------+-------+
-| count_all | Nullary | | Scalar Int64 |
| |
-+--------------------+---------+------------------+------------------------+----------------------------------+-------+
-| count_distinct | Unary | Non-nested types | Scalar Int64 |
:struct:`CountOptions` | \(2) |
-+--------------------+---------+------------------+------------------------+----------------------------------+-------+
-| first | Unary | Numeric, Binary | Scalar Input type |
:struct:`ScalarAggregateOptions` | \(3) |
-+--------------------+---------+------------------+------------------------+----------------------------------+-------+
-| first_last | Unary | Numeric, Binary | Scalar Struct |
:struct:`ScalarAggregateOptions` | \(3) |
-+--------------------+---------+------------------+------------------------+----------------------------------+-------+
-| index | Unary | Any | Scalar Int64 |
:struct:`IndexOptions` | \(4) |
-+--------------------+---------+------------------+------------------------+----------------------------------+-------+
-| kurtosis | Unary | Numeric | Scalar Float64 |
:struct:`SkewOptions` | \(11) |
-+--------------------+---------+------------------+------------------------+----------------------------------+-------+
-| last | Unary | Numeric, Binary | Scalar Input type |
:struct:`ScalarAggregateOptions` | \(3) |
-+--------------------+---------+------------------+------------------------+----------------------------------+-------+
-| max | Unary | Non-nested types | Scalar Input type |
:struct:`ScalarAggregateOptions` | |
-+--------------------+---------+------------------+------------------------+----------------------------------+-------+
-| mean | Unary | Numeric | Scalar Decimal/Float64 |
:struct:`ScalarAggregateOptions` | \(5) |
-+--------------------+---------+------------------+------------------------+----------------------------------+-------+
-| min | Unary | Non-nested types | Scalar Input type |
:struct:`ScalarAggregateOptions` | |
-+--------------------+---------+------------------+------------------------+----------------------------------+-------+
-| min_max | Unary | Non-nested types | Scalar Struct |
:struct:`ScalarAggregateOptions` | \(6) |
-+--------------------+---------+------------------+------------------------+----------------------------------+-------+
-| mode | Unary | Numeric | Struct |
:struct:`ModeOptions` | \(7) |
-+--------------------+---------+------------------+------------------------+----------------------------------+-------+
-| pivot_wider | Binary | Binary, Any | Scalar Struct |
:struct:`PivotWiderOptions` | \(8) |
-+--------------------+---------+------------------+------------------------+----------------------------------+-------+
-| product | Unary | Numeric | Scalar Numeric |
:struct:`ScalarAggregateOptions` | \(9) |
-+--------------------+---------+------------------+------------------------+----------------------------------+-------+
-| quantile | Unary | Numeric | Scalar Numeric |
:struct:`QuantileOptions` | \(10) |
-+--------------------+---------+------------------+------------------------+----------------------------------+-------+
-| skew | Unary | Numeric | Scalar Float64 |
:struct:`SkewOptions` | \(11) |
-+--------------------+---------+------------------+------------------------+----------------------------------+-------+
-| stddev | Unary | Numeric | Scalar Float64 |
:struct:`VarianceOptions` | \(11) |
-+--------------------+---------+------------------+------------------------+----------------------------------+-------+
-| sum | Unary | Numeric | Scalar Numeric |
:struct:`ScalarAggregateOptions` | \(9) |
-+--------------------+---------+------------------+------------------------+----------------------------------+-------+
-| tdigest | Unary | Numeric | Float64 |
:struct:`TDigestOptions` | \(12) |
-+--------------------+---------+------------------+------------------------+----------------------------------+-------+
-| variance | Unary | Numeric | Scalar Float64 |
:struct:`VarianceOptions` | \(11) |
-+--------------------+---------+------------------+------------------------+----------------------------------+-------+
++--------------------+---------+-----------------------------------------------+------------------------+----------------------------------+-------+
+| Function name | Arity | Input types
| Output type | Options class | Notes |
++====================+=========+===============================================+========================+==================================+=======+
+| all | Unary | Boolean
| Scalar Boolean | :struct:`ScalarAggregateOptions` | \(1) |
++--------------------+---------+-----------------------------------------------+------------------------+----------------------------------+-------+
+| any | Unary | Boolean
| Scalar Boolean | :struct:`ScalarAggregateOptions` | \(1) |
++--------------------+---------+-----------------------------------------------+------------------------+----------------------------------+-------+
+| approximate_median | Unary | Numeric
| Scalar Float64 | :struct:`ScalarAggregateOptions` | |
++--------------------+---------+-----------------------------------------------+------------------------+----------------------------------+-------+
+| count | Unary | Any
| Scalar Int64 | :struct:`CountOptions` | \(2) |
++--------------------+---------+-----------------------------------------------+------------------------+----------------------------------+-------+
+| count_all | Nullary |
| Scalar Int64 | | |
++--------------------+---------+-----------------------------------------------+------------------------+----------------------------------+-------+
+| count_distinct | Unary | Non-nested types
| Scalar Int64 | :struct:`CountOptions` | \(2) |
++--------------------+---------+-----------------------------------------------+------------------------+----------------------------------+-------+
+| first | Unary | Numeric, Binary
| Scalar Input type | :struct:`ScalarAggregateOptions` | \(3) |
++--------------------+---------+-----------------------------------------------+------------------------+----------------------------------+-------+
+| first_last | Unary | Numeric, Binary
| Scalar Struct | :struct:`ScalarAggregateOptions` | \(3) |
++--------------------+---------+-----------------------------------------------+------------------------+----------------------------------+-------+
+| index | Unary | Any
| Scalar Int64 | :struct:`IndexOptions` | \(4) |
++--------------------+---------+-----------------------------------------------+------------------------+----------------------------------+-------+
+| kurtosis | Unary | Numeric
| Scalar Float64 | :struct:`SkewOptions` | \(11) |
++--------------------+---------+-----------------------------------------------+------------------------+----------------------------------+-------+
+| last | Unary | Numeric, Binary
| Scalar Input type | :struct:`ScalarAggregateOptions` | \(3) |
++--------------------+---------+-----------------------------------------------+------------------------+----------------------------------+-------+
+| max | Unary | Non-nested types
| Scalar Input type | :struct:`ScalarAggregateOptions` | |
++--------------------+---------+-----------------------------------------------+------------------------+----------------------------------+-------+
+| mean | Unary | Numeric
| Scalar Decimal/Float64 | :struct:`ScalarAggregateOptions` | \(5) |
++--------------------+---------+-----------------------------------------------+------------------------+----------------------------------+-------+
+| min | Unary | Non-nested types
| Scalar Input type | :struct:`ScalarAggregateOptions` | |
++--------------------+---------+-----------------------------------------------+------------------------+----------------------------------+-------+
+| min_max | Unary | Non-nested types
| Scalar Struct | :struct:`ScalarAggregateOptions` | \(6) |
++--------------------+---------+-----------------------------------------------+------------------------+----------------------------------+-------+
+| mode | Unary | Numeric
| Struct | :struct:`ModeOptions` | \(7) |
++--------------------+---------+-----------------------------------------------+------------------------+----------------------------------+-------+
+| pivot_wider | Binary | Binary, String, Integer (Arg 0); Any (Arg 1)
| Scalar Struct | :struct:`PivotWiderOptions` | \(8) |
++--------------------+---------+-----------------------------------------------+------------------------+----------------------------------+-------+
+| product | Unary | Numeric
| Scalar Numeric | :struct:`ScalarAggregateOptions` | \(9) |
++--------------------+---------+-----------------------------------------------+------------------------+----------------------------------+-------+
+| quantile | Unary | Numeric
| Scalar Numeric | :struct:`QuantileOptions` | \(10) |
++--------------------+---------+-----------------------------------------------+------------------------+----------------------------------+-------+
+| skew | Unary | Numeric
| Scalar Float64 | :struct:`SkewOptions` | \(11) |
++--------------------+---------+-----------------------------------------------+------------------------+----------------------------------+-------+
+| stddev | Unary | Numeric
| Scalar Float64 | :struct:`VarianceOptions` | \(11) |
++--------------------+---------+-----------------------------------------------+------------------------+----------------------------------+-------+
+| sum | Unary | Numeric
| Scalar Numeric | :struct:`ScalarAggregateOptions` | \(9) |
++--------------------+---------+-----------------------------------------------+------------------------+----------------------------------+-------+
+| tdigest | Unary | Numeric
| Float64 | :struct:`TDigestOptions` | \(12) |
++--------------------+---------+-----------------------------------------------+------------------------+----------------------------------+-------+
+| variance | Unary | Numeric
| Scalar Float64 | :struct:`VarianceOptions` | \(11) |
++--------------------+---------+-----------------------------------------------+------------------------+----------------------------------+-------+
* \(1) If null values are taken into account, by setting the
ScalarAggregateOptions parameter skip_nulls = false, then `Kleene logic`_
@@ -343,57 +343,57 @@ The supported aggregation functions are as follows. All function names are
prefixed with ``hash_``, which differentiates them from their scalar
equivalents above and reflects how they are implemented internally.
-+-------------------------+---------+------------------------------------+------------------------+----------------------------------+-----------+
-| Function name | Arity | Input types |
Output type | Options class | Notes |
-+=========================+=========+====================================+========================+==================================+===========+
-| hash_all | Unary | Boolean |
Boolean | :struct:`ScalarAggregateOptions` | \(1) |
-+-------------------------+---------+------------------------------------+------------------------+----------------------------------+-----------+
-| hash_any | Unary | Boolean |
Boolean | :struct:`ScalarAggregateOptions` | \(1) |
-+-------------------------+---------+------------------------------------+------------------------+----------------------------------+-----------+
-| hash_approximate_median | Unary | Numeric |
Float64 | :struct:`ScalarAggregateOptions` | |
-+-------------------------+---------+------------------------------------+------------------------+----------------------------------+-----------+
-| hash_count | Unary | Any |
Int64 | :struct:`CountOptions` | \(2) |
-+-------------------------+---------+------------------------------------+------------------------+----------------------------------+-----------+
-| hash_count_all | Nullary | |
Int64 | | |
-+-------------------------+---------+------------------------------------+------------------------+----------------------------------+-----------+
-| hash_count_distinct | Unary | Any |
Int64 | :struct:`CountOptions` | \(2) |
-+-------------------------+---------+------------------------------------+------------------------+----------------------------------+-----------+
-| hash_distinct | Unary | Any |
List of input type | :struct:`CountOptions` | \(2) \(3) |
-+-------------------------+---------+------------------------------------+------------------------+----------------------------------+-----------+
-| hash_first | Unary | Numeric, Binary |
Input type | :struct:`ScalarAggregateOptions` | \(11) |
-+-------------------------+---------+------------------------------------+------------------------+----------------------------------+-----------+
-| hash_first_last | Unary | Numeric, Binary |
Struct | :struct:`ScalarAggregateOptions` | \(11) |
-+-------------------------+---------+------------------------------------+------------------------+----------------------------------+-----------+
-| hash_kurtosis | Unary | Numeric |
Float64 | :struct:`SkewOptions` | \(9) |
-+-------------------------+---------+------------------------------------+------------------------+----------------------------------+-----------+
-| hash_last | Unary | Numeric, Binary |
Input type | :struct:`ScalarAggregateOptions` | \(11) |
-+-------------------------+---------+------------------------------------+------------------------+----------------------------------+-----------+
-| hash_list | Unary | Any |
List of input type | | \(3) |
-+-------------------------+---------+------------------------------------+------------------------+----------------------------------+-----------+
-| hash_max | Unary | Non-nested, non-binary/string-like |
Input type | :struct:`ScalarAggregateOptions` | |
-+-------------------------+---------+------------------------------------+------------------------+----------------------------------+-----------+
-| hash_mean | Unary | Numeric |
Decimal/Float64 | :struct:`ScalarAggregateOptions` | \(4) |
-+-------------------------+---------+------------------------------------+------------------------+----------------------------------+-----------+
-| hash_min | Unary | Non-nested, non-binary/string-like |
Input type | :struct:`ScalarAggregateOptions` | |
-+-------------------------+---------+------------------------------------+------------------------+----------------------------------+-----------+
-| hash_min_max | Unary | Non-nested types |
Struct | :struct:`ScalarAggregateOptions` | \(5) |
-+-------------------------+---------+------------------------------------+------------------------+----------------------------------+-----------+
-| hash_one | Unary | Any |
Input type | | \(6) |
-+-------------------------+---------+------------------------------------+------------------------+----------------------------------+-----------+
-| hash_pivot_wider | Binary | Binary, Any |
Struct | :struct:`PivotWiderOptions` | \(7) |
-+-------------------------+---------+------------------------------------+------------------------+----------------------------------+-----------+
-| hash_product | Unary | Numeric |
Numeric | :struct:`ScalarAggregateOptions` | \(8) |
-+-------------------------+---------+------------------------------------+------------------------+----------------------------------+-----------+
-| hash_skew | Unary | Numeric |
Float64 | :struct:`SkewOptions` | \(9) |
-+-------------------------+---------+------------------------------------+------------------------+----------------------------------+-----------+
-| hash_stddev | Unary | Numeric |
Float64 | :struct:`VarianceOptions` | \(9) |
-+-------------------------+---------+------------------------------------+------------------------+----------------------------------+-----------+
-| hash_sum | Unary | Numeric |
Numeric | :struct:`ScalarAggregateOptions` | \(8) |
-+-------------------------+---------+------------------------------------+------------------------+----------------------------------+-----------+
-| hash_tdigest | Unary | Numeric |
FixedSizeList[Float64] | :struct:`TDigestOptions` | \(10) |
-+-------------------------+---------+------------------------------------+------------------------+----------------------------------+-----------+
-| hash_variance | Unary | Numeric |
Float64 | :struct:`VarianceOptions` | \(9) |
-+-------------------------+---------+------------------------------------+------------------------+----------------------------------+-----------+
++-------------------------+---------+----------------------------------------------+------------------------+----------------------------------+-----------+
+| Function name | Arity | Input types
| Output type | Options class | Notes |
++=========================+=========+==============================================+========================+==================================+===========+
+| hash_all | Unary | Boolean
| Boolean | :struct:`ScalarAggregateOptions` | \(1) |
++-------------------------+---------+----------------------------------------------+------------------------+----------------------------------+-----------+
+| hash_any | Unary | Boolean
| Boolean | :struct:`ScalarAggregateOptions` | \(1) |
++-------------------------+---------+----------------------------------------------+------------------------+----------------------------------+-----------+
+| hash_approximate_median | Unary | Numeric
| Float64 | :struct:`ScalarAggregateOptions` | |
++-------------------------+---------+----------------------------------------------+------------------------+----------------------------------+-----------+
+| hash_count | Unary | Any
| Int64 | :struct:`CountOptions` | \(2) |
++-------------------------+---------+----------------------------------------------+------------------------+----------------------------------+-----------+
+| hash_count_all | Nullary |
| Int64 | | |
++-------------------------+---------+----------------------------------------------+------------------------+----------------------------------+-----------+
+| hash_count_distinct | Unary | Any
| Int64 | :struct:`CountOptions` | \(2) |
++-------------------------+---------+----------------------------------------------+------------------------+----------------------------------+-----------+
+| hash_distinct | Unary | Any
| List of input type | :struct:`CountOptions` | \(2) \(3) |
++-------------------------+---------+----------------------------------------------+------------------------+----------------------------------+-----------+
+| hash_first | Unary | Numeric, Binary
| Input type | :struct:`ScalarAggregateOptions` | \(11) |
++-------------------------+---------+----------------------------------------------+------------------------+----------------------------------+-----------+
+| hash_first_last | Unary | Numeric, Binary
| Struct | :struct:`ScalarAggregateOptions` | \(11) |
++-------------------------+---------+----------------------------------------------+------------------------+----------------------------------+-----------+
+| hash_kurtosis | Unary | Numeric
| Float64 | :struct:`SkewOptions` | \(9) |
++-------------------------+---------+----------------------------------------------+------------------------+----------------------------------+-----------+
+| hash_last | Unary | Numeric, Binary
| Input type | :struct:`ScalarAggregateOptions` | \(11) |
++-------------------------+---------+----------------------------------------------+------------------------+----------------------------------+-----------+
+| hash_list | Unary | Any
| List of input type | | \(3) |
++-------------------------+---------+----------------------------------------------+------------------------+----------------------------------+-----------+
+| hash_max | Unary | Non-nested, non-binary/string-like
| Input type | :struct:`ScalarAggregateOptions` | |
++-------------------------+---------+----------------------------------------------+------------------------+----------------------------------+-----------+
+| hash_mean | Unary | Numeric
| Decimal/Float64 | :struct:`ScalarAggregateOptions` | \(4) |
++-------------------------+---------+----------------------------------------------+------------------------+----------------------------------+-----------+
+| hash_min | Unary | Non-nested, non-binary/string-like
| Input type | :struct:`ScalarAggregateOptions` | |
++-------------------------+---------+----------------------------------------------+------------------------+----------------------------------+-----------+
+| hash_min_max | Unary | Non-nested types
| Struct | :struct:`ScalarAggregateOptions` | \(5) |
++-------------------------+---------+----------------------------------------------+------------------------+----------------------------------+-----------+
+| hash_one | Unary | Any
| Input type | | \(6) |
++-------------------------+---------+----------------------------------------------+------------------------+----------------------------------+-----------+
+| hash_pivot_wider | Binary | Binary, String, Integer (Arg 0); Any (Arg 1) | Struct | :struct:`PivotWiderOptions` | \(7) |
++-------------------------+---------+----------------------------------------------+------------------------+----------------------------------+-----------+
+| hash_product | Unary | Numeric
| Numeric | :struct:`ScalarAggregateOptions` | \(8) |
++-------------------------+---------+----------------------------------------------+------------------------+----------------------------------+-----------+
+| hash_skew | Unary | Numeric
| Float64 | :struct:`SkewOptions` | \(9) |
++-------------------------+---------+----------------------------------------------+------------------------+----------------------------------+-----------+
+| hash_stddev | Unary | Numeric
| Float64 | :struct:`VarianceOptions` | \(9) |
++-------------------------+---------+----------------------------------------------+------------------------+----------------------------------+-----------+
+| hash_sum | Unary | Numeric
| Numeric | :struct:`ScalarAggregateOptions` | \(8) |
++-------------------------+---------+----------------------------------------------+------------------------+----------------------------------+-----------+
+| hash_tdigest | Unary | Numeric
| FixedSizeList[Float64] | :struct:`TDigestOptions` | \(10) |
++-------------------------+---------+----------------------------------------------+------------------------+----------------------------------+-----------+
+| hash_variance | Unary | Numeric
| Float64 | :struct:`VarianceOptions` | \(9) |
++-------------------------+---------+----------------------------------------------+------------------------+----------------------------------+-----------+
* \(1) If null values are taken into account, by setting the
:member:`ScalarAggregateOptions::skip_nulls` to false, then `Kleene logic`_