js8544 commented on code in PR #37100:
URL: https://github.com/apache/arrow/pull/37100#discussion_r1299848219
##########
cpp/src/arrow/compute/kernels/aggregate_basic.cc:
##########
@@ -492,11 +492,24 @@ Result<std::unique_ptr<KernelState>>
MinMaxInit(KernelContext* ctx,
return visitor.Create();
}
+namespace {
+
+Result<TypeHolder> DictionaryValueType(KernelContext*,
+ const std::vector<TypeHolder>& types) {
+ // T -> T.value_type
+ auto ty = types.front();
+ const DictionaryType& ty_dict = checked_cast<const DictionaryType&>(*ty);
+ return ty_dict.value_type();
+}
+
+} // namespace
+
// For "min" and "max" functions: override finalize and return the actual value
template <MinOrMax min_or_max>
-void AddMinOrMaxAggKernel(ScalarAggregateFunction* func,
- ScalarAggregateFunction* min_max_func) {
- auto sig = KernelSignature::Make({InputType::Any()}, FirstType);
+void AddMinOrMaxAggKernels(ScalarAggregateFunction* func,
+ ScalarAggregateFunction* min_max_func) {
+ std::shared_ptr<arrow::compute::KernelSignature> sig =
+ KernelSignature::Make({InputType::Any()}, FirstType);
Review Comment:
Both kernels now accept dictionary inputs, so the correctness of dispatching
relies on the order in which kernels are added, and the fact that
DispatchExactImpl chooses the last kernel that matches. Can you make this
signature match any type other than dictionary?
##########
cpp/src/arrow/compute/kernels/aggregate_basic.cc:
##########
@@ -492,11 +492,24 @@ Result<std::unique_ptr<KernelState>>
MinMaxInit(KernelContext* ctx,
return visitor.Create();
}
+namespace {
+
+Result<TypeHolder> DictionaryValueType(KernelContext*,
Review Comment:
Since this can be potentially useful for other kernels, can you put it in
codegen_internal.h along with `FirstType` etc. ?
##########
cpp/src/arrow/compute/kernels/aggregate_basic_internal.h:
##########
@@ -891,6 +891,105 @@ struct NullMinMaxImpl : public ScalarAggregator {
}
};
+template <SimdLevel::type SimdLevel>
+struct DictionaryMinMaxImpl : public ScalarAggregator {
+ using ThisType = DictionaryMinMaxImpl<SimdLevel>;
+
+ DictionaryMinMaxImpl(std::shared_ptr<DataType> out_type,
ScalarAggregateOptions options)
+ : options(std::move(options)),
+ out_type(std::move(out_type)),
+ has_nulls(false),
+ count(0),
+ min(nullptr),
+ max(nullptr) {
+ this->options.min_count = std::max<uint32_t>(1, this->options.min_count);
+ }
+
+ Status Consume(KernelContext*, const ExecSpan& batch) override {
+ if (batch[0].is_scalar()) {
+ return Status::NotImplemented("No min/max implemented for
DictionaryScalar");
+ }
+
+ DictionaryArray arr(batch[0].array.ToArrayData());
+ this->has_nulls = arr.null_count() > 0;
+ this->count += arr.length() - arr.null_count();
+
+ Datum dict_values(arr.dictionary());
+ ARROW_ASSIGN_OR_RAISE(Datum result, MinMax(std::move(dict_values)));
Review Comment:
You'll probably want to "Filter" rather than "Validate". You can use 'is_in'
and 'filter' to do such filtration.
##########
cpp/src/arrow/compute/kernels/aggregate_basic_internal.h:
##########
@@ -891,6 +891,111 @@ struct NullMinMaxImpl : public ScalarAggregator {
}
};
+template <SimdLevel::type SimdLevel>
+struct DictionaryMinMaxImpl : public ScalarAggregator {
+ using ThisType = DictionaryMinMaxImpl<SimdLevel>;
+
+ DictionaryMinMaxImpl(std::shared_ptr<DataType> out_type,
ScalarAggregateOptions options)
+ : options(std::move(options)),
+ out_type(std::move(out_type)),
+ has_nulls(false),
+ count(0),
+ min(nullptr),
+ max(nullptr) {
+ this->options.min_count = std::max<uint32_t>(1, this->options.min_count);
+ }
+
+ Status Consume(KernelContext*, const ExecSpan& batch) override {
+ if (batch[0].is_scalar()) {
+ return Status::NotImplemented("No min/max implemented for
DictionaryScalar");
+ }
+
+ DictionaryArray arr(batch[0].array.ToArrayData());
+ this->has_nulls = arr.null_count() > 0;
+ this->count += arr.length() - arr.null_count();
+
+ const std::shared_ptr<Array>& dict = arr.dictionary();
+ if (dict->length() == 0) {
+ return Status::OK();
+ }
+
+ Datum dict_values(std::move(dict));
+ ARROW_ASSIGN_OR_RAISE(Datum result, MinMax(std::move(dict_values)));
+ const StructScalar& struct_result =
+ checked_cast<const StructScalar&>(*result.scalar());
+ ARROW_ASSIGN_OR_RAISE(auto min_, struct_result.field(FieldRef("min")));
+ ARROW_ASSIGN_OR_RAISE(auto max_, struct_result.field(FieldRef("max")));
+ ARROW_RETURN_NOT_OK(CompareMinMax(std::move(min_), std::move(max_)));
+ return Status::OK();
+ }
+
+ Status MergeFrom(KernelContext*, KernelState&& src) override {
+ const auto& other = checked_cast<const ThisType&>(src);
+
+ ARROW_RETURN_NOT_OK(CompareMinMax(other.min, other.max));
+ this->has_nulls = this->has_nulls || other.has_nulls;
+ this->count += other.count;
+ return Status::OK();
+ }
+
+ Status Finalize(KernelContext*, Datum* out) override {
+ const auto& struct_type = checked_cast<const StructType&>(*out_type);
+ const auto& child_type = struct_type.field(0)->type();
+
+ std::vector<std::shared_ptr<Scalar>> values;
+ // Physical type != result type
+ if ((this->has_nulls && !options.skip_nulls) || (this->count <
options.min_count) ||
+ this->min == nullptr || this->min->type->id() == Type::NA) {
+ // (null, null)
+ std::shared_ptr<Scalar> null_scalar = MakeNullScalar(child_type);
+ values = {null_scalar, null_scalar};
+ } else {
+ values = {std::move(this->min), std::move(this->max)};
+ }
+
+ out->value = std::make_shared<StructScalar>(std::move(values),
this->out_type);
+ return Status::OK();
+ }
+
+ ScalarAggregateOptions options;
+ std::shared_ptr<DataType> out_type;
+ bool has_nulls;
+ int64_t count;
+ std::shared_ptr<Scalar> min;
+ std::shared_ptr<Scalar> max;
+
+ private:
+ Status CompareMinMax(const std::shared_ptr<Scalar>& min_,
Review Comment:
We follow the google code style[1], in which only class members have
underscores at the end [2]. So the naming `min_` and `max_` for function args
may cause confusion.
[1]
https://arrow.apache.org/docs/developers/cpp/development.html#code-style-linting-and-ci
[2] https://google.github.io/styleguide/cppguide.html#Variable_Names
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]