mapleFU commented on code in PR #37100:
URL: https://github.com/apache/arrow/pull/37100#discussion_r1360709319
##########
cpp/src/arrow/compute/kernels/aggregate_basic_internal.h:
##########
@@ -912,6 +912,116 @@ struct NullMinMaxImpl : public ScalarAggregator {
}
};
+template <SimdLevel::type SimdLevel>
+struct DictionaryMinMaxImpl : public ScalarAggregator {
+ using ThisType = DictionaryMinMaxImpl<SimdLevel>;
+
+ DictionaryMinMaxImpl(std::shared_ptr<DataType> out_type,
ScalarAggregateOptions options)
+ : options(std::move(options)),
+ out_type(std::move(out_type)),
+ has_nulls(false),
+ count(0),
+ min(nullptr),
+ max(nullptr) {
+ this->options.min_count = std::max<uint32_t>(1, this->options.min_count);
+ }
+
+ Status Consume(KernelContext* ctx, const ExecSpan& batch) override {
+ if (batch[0].is_scalar()) {
+ return Status::NotImplemented("No min/max implemented for
DictionaryScalar");
+ }
+
+ DictionaryArray dict_arr(batch[0].array.ToArrayData());
+ ARROW_ASSIGN_OR_RAISE(auto compacted_arr,
dict_arr.Compact(ctx->memory_pool()));
+ const DictionaryArray& compacted_dict_arr =
+ checked_cast<const DictionaryArray&>(*compacted_arr);
+ const std::shared_ptr<Array>& dict = compacted_dict_arr.dictionary();
+ if (dict->length() == 0) {
+ return Status::OK();
+ }
+ this->has_nulls |= compacted_dict_arr.null_count() > 0;
+ this->count += compacted_dict_arr.length() -
compacted_dict_arr.null_count();
+
+ Datum dict_values(dict);
+ ARROW_ASSIGN_OR_RAISE(
+ Datum result, MinMax(std::move(dict_values),
ScalarAggregateOptions::Defaults(),
+ ctx->exec_context()));
+ const StructScalar& struct_result =
+ checked_cast<const StructScalar&>(*result.scalar());
+ ARROW_ASSIGN_OR_RAISE(auto dict_min, struct_result.field(FieldRef("min")));
+ ARROW_ASSIGN_OR_RAISE(auto dict_max, struct_result.field(FieldRef("max")));
+ ARROW_RETURN_NOT_OK(UpdateMinMaxState(std::move(dict_min),
std::move(dict_max), ctx));
+ return Status::OK();
+ }
+
+ Status MergeFrom(KernelContext* ctx, KernelState&& src) override {
+ const auto& other = checked_cast<const ThisType&>(src);
+
+ this->has_nulls |= other.has_nulls;
+ this->count += other.count;
+ ARROW_RETURN_NOT_OK(UpdateMinMaxState(other.min, other.max, ctx));
+ return Status::OK();
+ }
+
+ Status Finalize(KernelContext*, Datum* out) override {
+ const auto& struct_type = checked_cast<const StructType&>(*out_type);
+ const auto& child_type = struct_type.field(0)->type();
+
+ std::vector<std::shared_ptr<Scalar>> values;
+ if ((this->has_nulls && !options.skip_nulls) || (this->count <
options.min_count)) {
+ // (null, null)
+ std::shared_ptr<Scalar> null_scalar = MakeNullScalar(child_type);
+ values = {null_scalar, null_scalar};
+ } else {
+ values = {std::move(this->min), std::move(this->max)};
+ }
+
+ out->value = std::make_shared<StructScalar>(std::move(values),
this->out_type);
+ return Status::OK();
+ }
+
+ ScalarAggregateOptions options;
+ std::shared_ptr<DataType> out_type;
+ bool has_nulls;
+ int64_t count;
+ std::shared_ptr<Scalar> min;
+ std::shared_ptr<Scalar> max;
+
+ private:
+ Status UpdateMinMaxState(const std::shared_ptr<Scalar>& other_min,
+ const std::shared_ptr<Scalar>& other_max,
KernelContext* ctx) {
Review Comment:
UpdateMinMaxState and `std::move` is used in L835, but here seems that it's
a `const std::shared_ptr<Scalar>&`?
##########
cpp/src/arrow/compute/kernels/aggregate_basic_internal.h:
##########
@@ -1002,6 +1115,11 @@ struct MinMaxInitState {
return Status::OK();
}
+ Status Visit(const DictionaryType&) {
+ state.reset(new DictionaryMinMaxImpl<SimdLevel>(out_type, options));
Review Comment:
Good idea, this could reduce the time that calling `MinMax`
##########
cpp/src/arrow/compute/kernels/aggregate_basic_internal.h:
##########
@@ -912,6 +912,116 @@ struct NullMinMaxImpl : public ScalarAggregator {
}
};
+template <SimdLevel::type SimdLevel>
+struct DictionaryMinMaxImpl : public ScalarAggregator {
+ using ThisType = DictionaryMinMaxImpl<SimdLevel>;
+
+ DictionaryMinMaxImpl(std::shared_ptr<DataType> out_type,
ScalarAggregateOptions options)
+ : options(std::move(options)),
+ out_type(std::move(out_type)),
+ has_nulls(false),
+ count(0),
+ min(nullptr),
+ max(nullptr) {
+ this->options.min_count = std::max<uint32_t>(1, this->options.min_count);
+ }
+
+ Status Consume(KernelContext* ctx, const ExecSpan& batch) override {
+ if (batch[0].is_scalar()) {
+ return Status::NotImplemented("No min/max implemented for
DictionaryScalar");
+ }
+
+ DictionaryArray dict_arr(batch[0].array.ToArrayData());
+ ARROW_ASSIGN_OR_RAISE(auto compacted_arr,
dict_arr.Compact(ctx->memory_pool()));
+ const DictionaryArray& compacted_dict_arr =
+ checked_cast<const DictionaryArray&>(*compacted_arr);
+ const std::shared_ptr<Array>& dict = compacted_dict_arr.dictionary();
+ if (dict->length() == 0) {
+ return Status::OK();
+ }
+ this->has_nulls |= compacted_dict_arr.null_count() > 0;
+ this->count += compacted_dict_arr.length() -
compacted_dict_arr.null_count();
+
Review Comment:
Can we avoid calling `MinMax` when compaction dictionary length is exactly
`1`? (Maybe not required or as an optimization in the future?)
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]