pitrou commented on a change in pull request #11204:
URL: https://github.com/apache/arrow/pull/11204#discussion_r714109708
##########
File path: cpp/src/arrow/compute/kernels/aggregate_tdigest.cc
##########
@@ -175,10 +182,52 @@ std::shared_ptr<ScalarAggregateFunction>
AddTDigestAggKernels() {
return func;
}
+std::shared_ptr<ScalarAggregateFunction> AddApproximateMedianAggKernels(
+ const ScalarAggregateFunction* tdigest_func) {
+ static ScalarAggregateOptions default_scalar_aggregate_options;
+
+ auto median = std::make_shared<ScalarAggregateFunction>(
+ "approximate_median", Arity::Unary(), &approximate_median_doc,
+ &default_scalar_aggregate_options);
+
+ auto sig =
+ KernelSignature::Make({InputType(ValueDescr::ANY)},
ValueDescr::Scalar(float64()));
+
+ auto init = [tdigest_func](
+ KernelContext* ctx,
+ const KernelInitArgs& args) ->
Result<std::unique_ptr<KernelState>> {
+ std::vector<ValueDescr> inputs = args.inputs;
+ ARROW_ASSIGN_OR_RAISE(auto kernel, tdigest_func->DispatchBest(&inputs));
+ const auto& scalar_options =
+ checked_cast<const ScalarAggregateOptions&>(*args.options);
+ TDigestOptions options;
+ // Default q = 0.5
+ options.min_count = scalar_options.min_count;
+ options.skip_nulls = scalar_options.skip_nulls;
+ KernelInitArgs new_args{kernel, inputs, &options};
+ return kernel->init(ctx, new_args);
Review comment:
Will it make a copy of the options struct? `&options` points to
stack-allocated space that will soon disappear...
##########
File path: cpp/src/arrow/compute/kernels/hash_aggregate.cc
##########
@@ -1734,6 +1734,37 @@ struct GroupedTDigestFactory {
InputType argument_type;
};
+HashAggregateKernel MakeApproximateMedianKernel(HashAggregateFunction*
tdigest_func) {
+ HashAggregateKernel kernel;
+ kernel.init = [tdigest_func](
+ KernelContext* ctx,
+ const KernelInitArgs& args) ->
Result<std::unique_ptr<KernelState>> {
+ std::vector<ValueDescr> inputs = args.inputs;
+ ARROW_ASSIGN_OR_RAISE(auto kernel, tdigest_func->DispatchBest(&inputs));
+ const auto& scalar_options =
+ checked_cast<const ScalarAggregateOptions&>(*args.options);
+ TDigestOptions options;
+ // Default q = 0.5
+ options.min_count = scalar_options.min_count;
+ options.skip_nulls = scalar_options.skip_nulls;
+ KernelInitArgs new_args{kernel, inputs, &options};
+ return kernel->init(ctx, new_args);
Review comment:
Same question wrt. options pointer.
##########
File path: docs/source/cpp/compute.rst
##########
@@ -188,39 +188,41 @@ Aggregations
Scalar aggregations operate on a (chunked) array or scalar value and reduce
the input to a single output value.
-+---------------+-------+------------------+------------------------+----------------------------------+-------+
-| Function name | Arity | Input types | Output type | Options
class | Notes |
-+===============+=======+==================+========================+==================================+=======+
-| all | Unary | Boolean | Scalar Boolean |
:struct:`ScalarAggregateOptions` | \(1) |
-+---------------+-------+------------------+------------------------+----------------------------------+-------+
-| any | Unary | Boolean | Scalar Boolean |
:struct:`ScalarAggregateOptions` | \(1) |
-+---------------+-------+------------------+------------------------+----------------------------------+-------+
-| count | Unary | Any | Scalar Int64 |
:struct:`CountOptions` | \(2) |
-+---------------+-------+------------------+------------------------+----------------------------------+-------+
-| index | Unary | Any | Scalar Int64 |
:struct:`IndexOptions` | |
-+---------------+-------+------------------+------------------------+----------------------------------+-------+
-| max | Unary | Non-nested types | Scalar Input type |
:struct:`ScalarAggregateOptions` | |
-+---------------+-------+------------------+------------------------+----------------------------------+-------+
-| mean | Unary | Numeric | Scalar Decimal/Float64 |
:struct:`ScalarAggregateOptions` | |
-+---------------+-------+------------------+------------------------+----------------------------------+-------+
-| min | Unary | Non-nested types | Scalar Input type |
:struct:`ScalarAggregateOptions` | |
-+---------------+-------+------------------+------------------------+----------------------------------+-------+
-| min_max | Unary | Non-nested types | Scalar Struct |
:struct:`ScalarAggregateOptions` | \(3) |
-+---------------+-------+------------------+------------------------+----------------------------------+-------+
-| mode | Unary | Numeric | Struct |
:struct:`ModeOptions` | \(4) |
-+---------------+-------+------------------+------------------------+----------------------------------+-------+
-| product | Unary | Numeric | Scalar Numeric |
:struct:`ScalarAggregateOptions` | \(5) |
-+---------------+-------+------------------+------------------------+----------------------------------+-------+
-| quantile | Unary | Numeric | Scalar Numeric |
:struct:`QuantileOptions` | \(6) |
-+---------------+-------+------------------+------------------------+----------------------------------+-------+
-| stddev | Unary | Numeric | Scalar Float64 |
:struct:`VarianceOptions` | |
-+---------------+-------+------------------+------------------------+----------------------------------+-------+
-| sum | Unary | Numeric | Scalar Numeric |
:struct:`ScalarAggregateOptions` | \(5) |
-+---------------+-------+------------------+------------------------+----------------------------------+-------+
-| tdigest | Unary | Numeric | Scalar Float64 |
:struct:`TDigestOptions` | \(7) |
-+---------------+-------+------------------+------------------------+----------------------------------+-------+
-| variance | Unary | Numeric | Scalar Float64 |
:struct:`VarianceOptions` | |
-+---------------+-------+------------------+------------------------+----------------------------------+-------+
++--------------------+-------+------------------+------------------------+----------------------------------+-------+
+| Function name | Arity | Input types | Output type |
Options class | Notes |
++====================+=======+==================+========================+==================================+=======+
+| all | Unary | Boolean | Scalar Boolean |
:struct:`ScalarAggregateOptions` | \(1) |
++--------------------+-------+------------------+------------------------+----------------------------------+-------+
+| any | Unary | Boolean | Scalar Boolean |
:struct:`ScalarAggregateOptions` | \(1) |
++--------------------+-------+------------------+------------------------+----------------------------------+-------+
+| approximate_median | Unary | Numeric | Scalar Float64 |
:struct:`TDigestOptions` | |
Review comment:
Should be ScalarAggregateOptions, no?
##########
File path: cpp/src/arrow/compute/kernels/aggregate_test.cc
##########
@@ -3447,5 +3447,55 @@ TEST(TestTDigestKernel, Options) {
ResultWith(ArrayFromJSON(ty, "[null]")));
}
+TEST(TestTDigestKernel, ApproximateMedian) {
+ // This is a wrapper for TDigest
+ for (const auto& ty : {float64(), int64(), uint16()}) {
+ ScalarAggregateOptions keep_nulls(/*skip_nulls=*/false, /*min_count=*/0);
+ ScalarAggregateOptions min_count(/*skip_nulls=*/true, /*min_count=*/3);
+ ScalarAggregateOptions keep_nulls_min_count(/*skip_nulls=*/false,
/*min_count=*/3);
+
+ EXPECT_THAT(
+ CallFunction("approximate_median", {ArrayFromJSON(ty, "[1, 2, 3]")},
&keep_nulls),
+ ResultWith(ScalarFromJSON(float64(), "2.0")));
+ EXPECT_THAT(CallFunction("approximate_median", {ArrayFromJSON(ty, "[1, 2,
3, null]")},
+ &keep_nulls),
+ ResultWith(ScalarFromJSON(float64(), "null")));
+ EXPECT_THAT(
+ CallFunction("approximate_median", {ScalarFromJSON(ty, "1")},
&keep_nulls),
+ ResultWith(ScalarFromJSON(float64(), "1.0")));
+ EXPECT_THAT(
+ CallFunction("approximate_median", {ScalarFromJSON(ty, "null")},
&keep_nulls),
+ ResultWith(ScalarFromJSON(float64(), "null")));
+
+ EXPECT_THAT(CallFunction("approximate_median", {ArrayFromJSON(ty, "[1, 2,
3, null]")},
+ &min_count),
+ ResultWith(ScalarFromJSON(float64(), "2.0")));
+ EXPECT_THAT(CallFunction("approximate_median", {ArrayFromJSON(ty, "[1, 2,
null]")},
+ &min_count),
+ ResultWith(ScalarFromJSON(float64(), "null")));
+ EXPECT_THAT(CallFunction("approximate_median", {ScalarFromJSON(ty, "1")},
&min_count),
+ ResultWith(ScalarFromJSON(float64(), "null")));
+ EXPECT_THAT(
+ CallFunction("approximate_median", {ScalarFromJSON(ty, "null")},
&min_count),
+ ResultWith(ScalarFromJSON(float64(), "null")));
+
+ EXPECT_THAT(CallFunction("approximate_median", {ArrayFromJSON(ty, "[1, 2,
3]")},
+ &keep_nulls_min_count),
+ ResultWith(ScalarFromJSON(float64(), "2.0")));
+ EXPECT_THAT(CallFunction("approximate_median", {ArrayFromJSON(ty, "[1,
2]")},
+ &keep_nulls_min_count),
+ ResultWith(ScalarFromJSON(float64(), "null")));
+ EXPECT_THAT(CallFunction("approximate_median", {ArrayFromJSON(ty, "[1, 2,
3, null]")},
+ &keep_nulls_min_count),
+ ResultWith(ScalarFromJSON(float64(), "null")));
+ EXPECT_THAT(CallFunction("approximate_median", {ScalarFromJSON(ty, "1")},
+ &keep_nulls_min_count),
+ ResultWith(ScalarFromJSON(float64(), "null")));
+ EXPECT_THAT(CallFunction("approximate_median", {ScalarFromJSON(ty,
"null")},
+ &keep_nulls_min_count),
+ ResultWith(ScalarFromJSON(float64(), "null")));
+ }
+}
Review comment:
Does it also work on a chunked array?
##########
File path: docs/source/cpp/compute.rst
##########
@@ -298,37 +300,39 @@ The supported aggregation functions are as follows. All
function names are
prefixed with ``hash_``, which differentiates them from their scalar
equivalents above and reflects how they are implemented internally.
-+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+
-| Function name | Arity | Input types | Output
type | Options class | Notes |
-+=====================+=======+====================================+=================+==================================+=======+
-| hash_all | Unary | Boolean | Boolean
| :struct:`ScalarAggregateOptions` | \(1) |
-+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+
-| hash_any | Unary | Boolean | Boolean
| :struct:`ScalarAggregateOptions` | \(1) |
-+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+
-| hash_count | Unary | Any | Int64
| :struct:`CountOptions` | \(2) |
-+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+
-| hash_count_distinct | Unary | Any | Int64
| :struct:`CountOptions` | \(2) |
-+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+
-| hash_distinct | Unary | Any | Input
type | :struct:`CountOptions` | \(2) |
-+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+
-| hash_max | Unary | Non-nested, non-binary/string-like | Input
type | :struct:`ScalarAggregateOptions` | |
-+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+
-| hash_mean | Unary | Numeric |
Decimal/Float64 | :struct:`ScalarAggregateOptions` | |
-+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+
-| hash_min | Unary | Non-nested, non-binary/string-like | Input
type | :struct:`ScalarAggregateOptions` | |
-+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+
-| hash_min_max | Unary | Non-nested, non-binary/string-like | Struct
| :struct:`ScalarAggregateOptions` | \(3) |
-+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+
-| hash_product | Unary | Numeric | Numeric
| :struct:`ScalarAggregateOptions` | \(4) |
-+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+
-| hash_stddev | Unary | Numeric | Float64
| :struct:`VarianceOptions` | |
-+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+
-| hash_sum | Unary | Numeric | Numeric
| :struct:`ScalarAggregateOptions` | \(4) |
-+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+
-| hash_tdigest | Unary | Numeric | Float64
| :struct:`TDigestOptions` | \(5) |
-+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+
-| hash_variance | Unary | Numeric | Float64
| :struct:`VarianceOptions` | |
-+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+
++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+
+| Function name | Arity | Input types |
Output type | Options class | Notes |
++=========================+=======+====================================+========================+==================================+=======+
+| hash_all | Unary | Boolean |
Boolean | :struct:`ScalarAggregateOptions` | \(1) |
++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+
+| hash_any | Unary | Boolean |
Boolean | :struct:`ScalarAggregateOptions` | \(1) |
++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+
+| hash_approximate_median | Unary | Numeric |
Float64 | :struct:`TDigestOptions` | |
Review comment:
Same here.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]