pitrou commented on a change in pull request #11204:
URL: https://github.com/apache/arrow/pull/11204#discussion_r714109708



##########
File path: cpp/src/arrow/compute/kernels/aggregate_tdigest.cc
##########
@@ -175,10 +182,52 @@ std::shared_ptr<ScalarAggregateFunction> 
AddTDigestAggKernels() {
   return func;
 }
 
+std::shared_ptr<ScalarAggregateFunction> AddApproximateMedianAggKernels(
+    const ScalarAggregateFunction* tdigest_func) {
+  static ScalarAggregateOptions default_scalar_aggregate_options;
+
+  auto median = std::make_shared<ScalarAggregateFunction>(
+      "approximate_median", Arity::Unary(), &approximate_median_doc,
+      &default_scalar_aggregate_options);
+
+  auto sig =
+      KernelSignature::Make({InputType(ValueDescr::ANY)}, 
ValueDescr::Scalar(float64()));
+
+  auto init = [tdigest_func](
+                  KernelContext* ctx,
+                  const KernelInitArgs& args) -> 
Result<std::unique_ptr<KernelState>> {
+    std::vector<ValueDescr> inputs = args.inputs;
+    ARROW_ASSIGN_OR_RAISE(auto kernel, tdigest_func->DispatchBest(&inputs));
+    const auto& scalar_options =
+        checked_cast<const ScalarAggregateOptions&>(*args.options);
+    TDigestOptions options;
+    // Default q = 0.5
+    options.min_count = scalar_options.min_count;
+    options.skip_nulls = scalar_options.skip_nulls;
+    KernelInitArgs new_args{kernel, inputs, &options};
+    return kernel->init(ctx, new_args);

Review comment:
       Will it make a copy of the options struct? `&options` points to 
stack-allocated space that will soon disappear...

##########
File path: cpp/src/arrow/compute/kernels/hash_aggregate.cc
##########
@@ -1734,6 +1734,37 @@ struct GroupedTDigestFactory {
   InputType argument_type;
 };
 
+HashAggregateKernel MakeApproximateMedianKernel(HashAggregateFunction* 
tdigest_func) {
+  HashAggregateKernel kernel;
+  kernel.init = [tdigest_func](
+                    KernelContext* ctx,
+                    const KernelInitArgs& args) -> 
Result<std::unique_ptr<KernelState>> {
+    std::vector<ValueDescr> inputs = args.inputs;
+    ARROW_ASSIGN_OR_RAISE(auto kernel, tdigest_func->DispatchBest(&inputs));
+    const auto& scalar_options =
+        checked_cast<const ScalarAggregateOptions&>(*args.options);
+    TDigestOptions options;
+    // Default q = 0.5
+    options.min_count = scalar_options.min_count;
+    options.skip_nulls = scalar_options.skip_nulls;
+    KernelInitArgs new_args{kernel, inputs, &options};
+    return kernel->init(ctx, new_args);

Review comment:
       Same question wrt. options pointer.

##########
File path: docs/source/cpp/compute.rst
##########
@@ -188,39 +188,41 @@ Aggregations
 Scalar aggregations operate on a (chunked) array or scalar value and reduce
 the input to a single output value.
 
-+---------------+-------+------------------+------------------------+----------------------------------+-------+
-| Function name | Arity | Input types      | Output type            | Options 
class                    | Notes |
-+===============+=======+==================+========================+==================================+=======+
-| all           | Unary | Boolean          | Scalar Boolean         | 
:struct:`ScalarAggregateOptions` | \(1)  |
-+---------------+-------+------------------+------------------------+----------------------------------+-------+
-| any           | Unary | Boolean          | Scalar Boolean         | 
:struct:`ScalarAggregateOptions` | \(1)  |
-+---------------+-------+------------------+------------------------+----------------------------------+-------+
-| count         | Unary | Any              | Scalar Int64           | 
:struct:`CountOptions`           | \(2)  |
-+---------------+-------+------------------+------------------------+----------------------------------+-------+
-| index         | Unary | Any              | Scalar Int64           | 
:struct:`IndexOptions`           |       |
-+---------------+-------+------------------+------------------------+----------------------------------+-------+
-| max           | Unary | Non-nested types | Scalar Input type      | 
:struct:`ScalarAggregateOptions` |       |
-+---------------+-------+------------------+------------------------+----------------------------------+-------+
-| mean          | Unary | Numeric          | Scalar Decimal/Float64 | 
:struct:`ScalarAggregateOptions` |       |
-+---------------+-------+------------------+------------------------+----------------------------------+-------+
-| min           | Unary | Non-nested types | Scalar Input type      | 
:struct:`ScalarAggregateOptions` |       |
-+---------------+-------+------------------+------------------------+----------------------------------+-------+
-| min_max       | Unary | Non-nested types | Scalar Struct          | 
:struct:`ScalarAggregateOptions` | \(3)  |
-+---------------+-------+------------------+------------------------+----------------------------------+-------+
-| mode          | Unary | Numeric          | Struct                 | 
:struct:`ModeOptions`            | \(4)  |
-+---------------+-------+------------------+------------------------+----------------------------------+-------+
-| product       | Unary | Numeric          | Scalar Numeric         | 
:struct:`ScalarAggregateOptions` | \(5)  |
-+---------------+-------+------------------+------------------------+----------------------------------+-------+
-| quantile      | Unary | Numeric          | Scalar Numeric         | 
:struct:`QuantileOptions`        | \(6)  |
-+---------------+-------+------------------+------------------------+----------------------------------+-------+
-| stddev        | Unary | Numeric          | Scalar Float64         | 
:struct:`VarianceOptions`        |       |
-+---------------+-------+------------------+------------------------+----------------------------------+-------+
-| sum           | Unary | Numeric          | Scalar Numeric         | 
:struct:`ScalarAggregateOptions` | \(5)  |
-+---------------+-------+------------------+------------------------+----------------------------------+-------+
-| tdigest       | Unary | Numeric          | Scalar Float64         | 
:struct:`TDigestOptions`         | \(7)  |
-+---------------+-------+------------------+------------------------+----------------------------------+-------+
-| variance      | Unary | Numeric          | Scalar Float64         | 
:struct:`VarianceOptions`        |       |
-+---------------+-------+------------------+------------------------+----------------------------------+-------+
++--------------------+-------+------------------+------------------------+----------------------------------+-------+
+| Function name      | Arity | Input types      | Output type            | 
Options class                    | Notes |
++====================+=======+==================+========================+==================================+=======+
+| all                | Unary | Boolean          | Scalar Boolean         | 
:struct:`ScalarAggregateOptions` | \(1)  |
++--------------------+-------+------------------+------------------------+----------------------------------+-------+
+| any                | Unary | Boolean          | Scalar Boolean         | 
:struct:`ScalarAggregateOptions` | \(1)  |
++--------------------+-------+------------------+------------------------+----------------------------------+-------+
+| approximate_median | Unary | Numeric          | Scalar Float64         | 
:struct:`TDigestOptions`         |       |

Review comment:
       Should be ScalarAggregateOptions, no?

##########
File path: cpp/src/arrow/compute/kernels/aggregate_test.cc
##########
@@ -3447,5 +3447,55 @@ TEST(TestTDigestKernel, Options) {
               ResultWith(ArrayFromJSON(ty, "[null]")));
 }
 
+TEST(TestTDigestKernel, ApproximateMedian) {
+  // This is a wrapper for TDigest
+  for (const auto& ty : {float64(), int64(), uint16()}) {
+    ScalarAggregateOptions keep_nulls(/*skip_nulls=*/false, /*min_count=*/0);
+    ScalarAggregateOptions min_count(/*skip_nulls=*/true, /*min_count=*/3);
+    ScalarAggregateOptions keep_nulls_min_count(/*skip_nulls=*/false, 
/*min_count=*/3);
+
+    EXPECT_THAT(
+        CallFunction("approximate_median", {ArrayFromJSON(ty, "[1, 2, 3]")}, 
&keep_nulls),
+        ResultWith(ScalarFromJSON(float64(), "2.0")));
+    EXPECT_THAT(CallFunction("approximate_median", {ArrayFromJSON(ty, "[1, 2, 
3, null]")},
+                             &keep_nulls),
+                ResultWith(ScalarFromJSON(float64(), "null")));
+    EXPECT_THAT(
+        CallFunction("approximate_median", {ScalarFromJSON(ty, "1")}, 
&keep_nulls),
+        ResultWith(ScalarFromJSON(float64(), "1.0")));
+    EXPECT_THAT(
+        CallFunction("approximate_median", {ScalarFromJSON(ty, "null")}, 
&keep_nulls),
+        ResultWith(ScalarFromJSON(float64(), "null")));
+
+    EXPECT_THAT(CallFunction("approximate_median", {ArrayFromJSON(ty, "[1, 2, 
3, null]")},
+                             &min_count),
+                ResultWith(ScalarFromJSON(float64(), "2.0")));
+    EXPECT_THAT(CallFunction("approximate_median", {ArrayFromJSON(ty, "[1, 2, 
null]")},
+                             &min_count),
+                ResultWith(ScalarFromJSON(float64(), "null")));
+    EXPECT_THAT(CallFunction("approximate_median", {ScalarFromJSON(ty, "1")}, 
&min_count),
+                ResultWith(ScalarFromJSON(float64(), "null")));
+    EXPECT_THAT(
+        CallFunction("approximate_median", {ScalarFromJSON(ty, "null")}, 
&min_count),
+        ResultWith(ScalarFromJSON(float64(), "null")));
+
+    EXPECT_THAT(CallFunction("approximate_median", {ArrayFromJSON(ty, "[1, 2, 
3]")},
+                             &keep_nulls_min_count),
+                ResultWith(ScalarFromJSON(float64(), "2.0")));
+    EXPECT_THAT(CallFunction("approximate_median", {ArrayFromJSON(ty, "[1, 
2]")},
+                             &keep_nulls_min_count),
+                ResultWith(ScalarFromJSON(float64(), "null")));
+    EXPECT_THAT(CallFunction("approximate_median", {ArrayFromJSON(ty, "[1, 2, 
3, null]")},
+                             &keep_nulls_min_count),
+                ResultWith(ScalarFromJSON(float64(), "null")));
+    EXPECT_THAT(CallFunction("approximate_median", {ScalarFromJSON(ty, "1")},
+                             &keep_nulls_min_count),
+                ResultWith(ScalarFromJSON(float64(), "null")));
+    EXPECT_THAT(CallFunction("approximate_median", {ScalarFromJSON(ty, 
"null")},
+                             &keep_nulls_min_count),
+                ResultWith(ScalarFromJSON(float64(), "null")));
+  }
+}

Review comment:
       Does it also work on a chunked array?

##########
File path: docs/source/cpp/compute.rst
##########
@@ -298,37 +300,39 @@ The supported aggregation functions are as follows. All 
function names are
 prefixed with ``hash_``, which differentiates them from their scalar
 equivalents above and reflects how they are implemented internally.
 
-+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+
-| Function name       | Arity | Input types                        | Output 
type     | Options class                    | Notes |
-+=====================+=======+====================================+=================+==================================+=======+
-| hash_all            | Unary | Boolean                            | Boolean   
      | :struct:`ScalarAggregateOptions` | \(1)  |
-+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+
-| hash_any            | Unary | Boolean                            | Boolean   
      | :struct:`ScalarAggregateOptions` | \(1)  |
-+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+
-| hash_count          | Unary | Any                                | Int64     
      | :struct:`CountOptions`           | \(2)  |
-+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+
-| hash_count_distinct | Unary | Any                                | Int64     
      | :struct:`CountOptions`           | \(2)  |
-+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+
-| hash_distinct       | Unary | Any                                | Input 
type      | :struct:`CountOptions`           | \(2)  |
-+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+
-| hash_max            | Unary | Non-nested, non-binary/string-like | Input 
type      | :struct:`ScalarAggregateOptions` |       |
-+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+
-| hash_mean           | Unary | Numeric                            | 
Decimal/Float64 | :struct:`ScalarAggregateOptions` |       |
-+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+
-| hash_min            | Unary | Non-nested, non-binary/string-like | Input 
type      | :struct:`ScalarAggregateOptions` |       |
-+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+
-| hash_min_max        | Unary | Non-nested, non-binary/string-like | Struct    
      | :struct:`ScalarAggregateOptions` | \(3)  |
-+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+
-| hash_product        | Unary | Numeric                            | Numeric   
      | :struct:`ScalarAggregateOptions` | \(4)  |
-+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+
-| hash_stddev         | Unary | Numeric                            | Float64   
      | :struct:`VarianceOptions`        |       |
-+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+
-| hash_sum            | Unary | Numeric                            | Numeric   
      | :struct:`ScalarAggregateOptions` | \(4)  |
-+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+
-| hash_tdigest        | Unary | Numeric                            | Float64   
      | :struct:`TDigestOptions`         | \(5)  |
-+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+
-| hash_variance       | Unary | Numeric                            | Float64   
      | :struct:`VarianceOptions`        |       |
-+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+
++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+
+| Function name           | Arity | Input types                        | 
Output type            | Options class                    | Notes |
++=========================+=======+====================================+========================+==================================+=======+
+| hash_all                | Unary | Boolean                            | 
Boolean                | :struct:`ScalarAggregateOptions` | \(1)  |
++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+
+| hash_any                | Unary | Boolean                            | 
Boolean                | :struct:`ScalarAggregateOptions` | \(1)  |
++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+
+| hash_approximate_median | Unary | Numeric                            | 
Float64                | :struct:`TDigestOptions`         |       |

Review comment:
       Same here.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to