jorisvandenbossche commented on a change in pull request #9758: URL: https://github.com/apache/arrow/pull/9758#discussion_r615652404
########## File path: cpp/src/arrow/compute/api_aggregate.h ########## @@ -37,43 +37,17 @@ class ExecContext; // ---------------------------------------------------------------------- // Aggregate functions -/// \addtogroup compute-concrete-options -/// @{ - -/// \brief Control Count kernel behavior -/// -/// By default, all non-null values are counted. -struct ARROW_EXPORT CountOptions : public FunctionOptions { - enum Mode { - /// Count all non-null values. - COUNT_NON_NULL = 0, - /// Count all null values. - COUNT_NULL, - }; - - explicit CountOptions(enum Mode count_mode = COUNT_NON_NULL) : count_mode(count_mode) {} - - static CountOptions Defaults() { return CountOptions(COUNT_NON_NULL); } - - enum Mode count_mode; -}; - -/// \brief Control MinMax kernel behavior +/// \brief Control general scalar aggregate kernel behavior /// /// By default, null values are ignored -struct ARROW_EXPORT MinMaxOptions : public FunctionOptions { - enum Mode { - /// Skip null values - SKIP = 0, - /// Any nulls will result in null output - EMIT_NULL - }; +struct ARROW_EXPORT ScalarAggregateOptions : public FunctionOptions { + explicit ScalarAggregateOptions(bool skip_nulls = true, uint32_t min_count = 1) + : skip_nulls(skip_nulls), min_count(min_count) {} - explicit MinMaxOptions(enum Mode null_handling = SKIP) : null_handling(null_handling) {} + static ScalarAggregateOptions Defaults() { return ScalarAggregateOptions{}; } - static MinMaxOptions Defaults() { return MinMaxOptions{}; } - - enum Mode null_handling; + bool skip_nulls = true; + uint32_t min_count = 1; Review comment: Was there any former discussion on what would be the best default here? (eg both pandas and R would set this to min_count=0, I think) Not that it matters too much as long as it's an option to change. ########## File path: cpp/src/arrow/compute/kernels/aggregate_test.cc ########## @@ -104,71 +104,122 @@ static Datum NaiveSum(const Array& array) { } template <typename ArrowType> -void ValidateSum(const Array& input, Datum expected) { +void ValidateSum( + const Array& input, Datum expected, + const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults()) { using OutputType = typename FindAccumulatorType<ArrowType>::Type; - ASSERT_OK_AND_ASSIGN(Datum result, Sum(input)); + ASSERT_OK_AND_ASSIGN(Datum result, Sum(input, options)); DatumEqual<OutputType>::EnsureEqual(result, expected); } template <typename ArrowType> -void ValidateSum(const std::shared_ptr<ChunkedArray>& input, Datum expected) { +void ValidateSum(const std::shared_ptr<ChunkedArray>& input, Datum expected, + const ScalarAggregateOptions& options) { using OutputType = typename FindAccumulatorType<ArrowType>::Type; - ASSERT_OK_AND_ASSIGN(Datum result, Sum(input)); + ASSERT_OK_AND_ASSIGN(Datum result, Sum(input, options)); DatumEqual<OutputType>::EnsureEqual(result, expected); } template <typename ArrowType> -void ValidateSum(const char* json, Datum expected) { +void ValidateSum( + const char* json, Datum expected, + const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults()) { auto array = ArrayFromJSON(TypeTraits<ArrowType>::type_singleton(), json); - ValidateSum<ArrowType>(*array, expected); + ValidateSum<ArrowType>(*array, expected, options); } template <typename ArrowType> -void ValidateSum(const std::vector<std::string>& json, Datum expected) { +void ValidateSum( + const std::vector<std::string>& json, Datum expected, + const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults()) { auto array = ChunkedArrayFromJSON(TypeTraits<ArrowType>::type_singleton(), json); - ValidateSum<ArrowType>(array, expected); + ValidateSum<ArrowType>(array, expected, options); } template <typename ArrowType> void ValidateSum(const Array& array) { ValidateSum<ArrowType>(array, NaiveSum<ArrowType>(array)); } -using UnaryOp = Result<Datum>(const Datum&, ExecContext*); +using UnaryOp = Result<Datum>(const Datum&, const ScalarAggregateOptions&, ExecContext*); -template <UnaryOp& Op, typename ScalarType> +template <UnaryOp& Op, typename ScalarAggregateOptions, typename ScalarType> void ValidateBooleanAgg(const std::string& json, - const std::shared_ptr<ScalarType>& expected) { + const std::shared_ptr<ScalarType>& expected, + const ScalarAggregateOptions& options) { auto array = ArrayFromJSON(boolean(), json); auto exp = Datum(expected); - ASSERT_OK_AND_ASSIGN(Datum result, Op(array, nullptr)); + ASSERT_OK_AND_ASSIGN(Datum result, Op(array, options, nullptr)); ASSERT_TRUE(result.Equals(exp)); } TEST(TestBooleanAggregation, Sum) { - ValidateBooleanAgg<Sum>("[]", std::make_shared<UInt64Scalar>()); - ValidateBooleanAgg<Sum>("[null]", std::make_shared<UInt64Scalar>()); - ValidateBooleanAgg<Sum>("[null, false]", std::make_shared<UInt64Scalar>(0)); - ValidateBooleanAgg<Sum>("[true]", std::make_shared<UInt64Scalar>(1)); - ValidateBooleanAgg<Sum>("[true, false, true]", std::make_shared<UInt64Scalar>(2)); + const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(); + ValidateBooleanAgg<Sum>("[]", std::make_shared<UInt64Scalar>(), options); + ValidateBooleanAgg<Sum>("[null]", std::make_shared<UInt64Scalar>(), options); + ValidateBooleanAgg<Sum>("[null, false]", std::make_shared<UInt64Scalar>(0), options); + ValidateBooleanAgg<Sum>("[true]", std::make_shared<UInt64Scalar>(1), options); + ValidateBooleanAgg<Sum>("[true, false, true]", std::make_shared<UInt64Scalar>(2), + options); ValidateBooleanAgg<Sum>("[true, false, true, true, null]", - std::make_shared<UInt64Scalar>(3)); + std::make_shared<UInt64Scalar>(3), options); + + const ScalarAggregateOptions& options_min_count_zero = ScalarAggregateOptions(true, 0); Review comment: Can you add the keyword names when constructing the option (now that the values itself are no longer descriptive), like: ```suggestion const ScalarAggregateOptions& options_min_count_zero = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/0); ``` ########## File path: cpp/src/arrow/compute/kernels/aggregate_test.cc ########## @@ -399,6 +498,13 @@ TYPED_TEST_SUITE(TestMeanKernelNumeric, NumericArrowTypes); TYPED_TEST(TestMeanKernelNumeric, SimpleMean) { using ScalarType = typename TypeTraits<DoubleType>::ScalarType; + const ScalarAggregateOptions& options = + ScalarAggregateOptions(ScalarAggregateOptions::SKIPNA, 0); + + ValidateMean<TypeParam>("[]", Datum(std::make_shared<ScalarType>(0)), options); Review comment: It seems you changed it to return null now? Should it rather return NaN? (I assume it could be consistent with a manual mean from sum/count with `min_count=0`) ########## File path: cpp/src/arrow/compute/kernels/aggregate_basic_internal.h ########## @@ -71,41 +72,51 @@ struct SumImpl : public ScalarAggregator { void MergeFrom(KernelContext*, KernelState&& src) override { const auto& other = checked_cast<const ThisType&>(src); + this->length += other.length; this->count += other.count; this->sum += other.sum; } void Finalize(KernelContext*, Datum* out) override { - if (this->count == 0) { + if (this->count < options.min_count) { out->value = std::make_shared<OutputType>(); } else { out->value = MakeScalar(this->sum); } } + size_t length = 0; size_t count = 0; typename SumType::c_type sum = 0; + ScalarAggregateOptions options; }; template <typename ArrowType, SimdLevel::type SimdLevel> struct MeanImpl : public SumImpl<ArrowType, SimdLevel> { void Finalize(KernelContext*, Datum* out) override { - if (this->count == 0) { + if (this->count == 0 || this->count < options.min_count) { Review comment: Should min_counts check replace `this->count == 0 `, instead of adding it? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org