This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 17a0ff5566 GH-45190: [C++][Compute] Add rank_quantile function (#45259)
17a0ff5566 is described below

commit 17a0ff556610c5fc0666f4220a6a0e87c487770c
Author: Antoine Pitrou <[email protected]>
AuthorDate: Thu Jan 23 20:32:28 2025 +0100

    GH-45190: [C++][Compute] Add rank_quantile function (#45259)
    
    ### Rationale for this change
    
    Add a "rank_quantile" function following the Wikipedia definition:
    https://en.wikipedia.org/wiki/Percentile_rank
    
    ### Are these changes tested?
    
    Yes.
    
    ### Are there any user-facing changes?
    
    Yes, an additional compute function.
    
    * GitHub Issue: #45190
    
    Lead-authored-by: Antoine Pitrou <[email protected]>
    Co-authored-by: Rossi Sun <[email protected]>
    Co-authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/arrow/compute/api_vector.cc                |  11 +
 cpp/src/arrow/compute/api_vector.h                 |  19 +
 cpp/src/arrow/compute/kernels/vector_rank.cc       | 383 +++++++++++++--------
 .../arrow/compute/kernels/vector_sort_internal.h   |   7 +
 cpp/src/arrow/compute/kernels/vector_sort_test.cc  | 146 +++++++-
 docs/source/cpp/compute.rst                        |  36 +-
 6 files changed, 429 insertions(+), 173 deletions(-)

diff --git a/cpp/src/arrow/compute/api_vector.cc 
b/cpp/src/arrow/compute/api_vector.cc
index 22ecf1cc87..61335de6ac 100644
--- a/cpp/src/arrow/compute/api_vector.cc
+++ b/cpp/src/arrow/compute/api_vector.cc
@@ -48,6 +48,7 @@ using compute::DictionaryEncodeOptions;
 using compute::FilterOptions;
 using compute::NullPlacement;
 using compute::RankOptions;
+using compute::RankQuantileOptions;
 
 template <>
 struct EnumTraits<FilterOptions::NullSelectionBehavior>
@@ -151,6 +152,9 @@ static auto kRankOptionsType = 
GetFunctionOptionsType<RankOptions>(
     DataMember("sort_keys", &RankOptions::sort_keys),
     DataMember("null_placement", &RankOptions::null_placement),
     DataMember("tiebreaker", &RankOptions::tiebreaker));
+static auto kRankQuantileOptionsType = 
GetFunctionOptionsType<RankQuantileOptions>(
+    DataMember("sort_keys", &RankQuantileOptions::sort_keys),
+    DataMember("null_placement", &RankQuantileOptions::null_placement));
 static auto kPairwiseOptionsType = GetFunctionOptionsType<PairwiseOptions>(
     DataMember("periods", &PairwiseOptions::periods));
 static auto kListFlattenOptionsType = 
GetFunctionOptionsType<ListFlattenOptions>(
@@ -228,6 +232,13 @@ RankOptions::RankOptions(std::vector<SortKey> sort_keys, 
NullPlacement null_plac
       tiebreaker(tiebreaker) {}
 constexpr char RankOptions::kTypeName[];
 
+RankQuantileOptions::RankQuantileOptions(std::vector<SortKey> sort_keys,
+                                         NullPlacement null_placement)
+    : FunctionOptions(internal::kRankQuantileOptionsType),
+      sort_keys(std::move(sort_keys)),
+      null_placement(null_placement) {}
+constexpr char RankQuantileOptions::kTypeName[];
+
 PairwiseOptions::PairwiseOptions(int64_t periods)
     : FunctionOptions(internal::kPairwiseOptionsType), periods(periods) {}
 constexpr char PairwiseOptions::kTypeName[];
diff --git a/cpp/src/arrow/compute/api_vector.h 
b/cpp/src/arrow/compute/api_vector.h
index ada1665b3e..22bb164719 100644
--- a/cpp/src/arrow/compute/api_vector.h
+++ b/cpp/src/arrow/compute/api_vector.h
@@ -195,6 +195,25 @@ class ARROW_EXPORT RankOptions : public FunctionOptions {
   Tiebreaker tiebreaker;
 };
 
+/// \brief Quantile rank options
+class ARROW_EXPORT RankQuantileOptions : public FunctionOptions {
+ public:
+  explicit RankQuantileOptions(std::vector<SortKey> sort_keys = {},
+                               NullPlacement null_placement = 
NullPlacement::AtEnd);
+  /// Convenience constructor for array inputs
+  explicit RankQuantileOptions(SortOrder order,
+                               NullPlacement null_placement = 
NullPlacement::AtEnd)
+      : RankQuantileOptions({SortKey("", order)}, null_placement) {}
+
+  static constexpr char const kTypeName[] = "RankQuantileOptions";
+  static RankQuantileOptions Defaults() { return RankQuantileOptions(); }
+
+  /// Column key(s) to order by and how to order by these sort keys.
+  std::vector<SortKey> sort_keys;
+  /// Whether nulls and NaNs are placed at the start or at the end
+  NullPlacement null_placement;
+};
+
 /// \brief Partitioning options for NthToIndices
 class ARROW_EXPORT PartitionNthOptions : public FunctionOptions {
  public:
diff --git a/cpp/src/arrow/compute/kernels/vector_rank.cc 
b/cpp/src/arrow/compute/kernels/vector_rank.cc
index 4fdc83788c..2efc61c2e6 100644
--- a/cpp/src/arrow/compute/kernels/vector_rank.cc
+++ b/cpp/src/arrow/compute/kernels/vector_rank.cc
@@ -15,6 +15,9 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#include <functional>
+#include <memory>
+
 #include "arrow/compute/function.h"
 #include "arrow/compute/kernels/vector_sort_internal.h"
 #include "arrow/compute/registry.h"
@@ -32,10 +35,6 @@ namespace {
 // is the same as the value at the previous sort index.
 constexpr uint64_t kDuplicateMask = 1ULL << 63;
 
-constexpr bool NeedsDuplicates(RankOptions::Tiebreaker tiebreaker) {
-  return tiebreaker != RankOptions::First;
-}
-
 template <typename ValueSelector>
 void MarkDuplicates(const NullPartitionResult& sorted, ValueSelector&& 
value_selector) {
   using T = decltype(value_selector(int64_t{}));
@@ -63,81 +62,68 @@ void MarkDuplicates(const NullPartitionResult& sorted, 
ValueSelector&& value_sel
   }
 }
 
-Result<Datum> CreateRankings(ExecContext* ctx, const NullPartitionResult& 
sorted,
-                             const NullPlacement null_placement,
-                             const RankOptions::Tiebreaker tiebreaker) {
-  auto length = sorted.overall_end() - sorted.overall_begin();
-  ARROW_ASSIGN_OR_RAISE(auto rankings,
-                        MakeMutableUInt64Array(length, ctx->memory_pool()));
-  auto out_begin = rankings->GetMutableValues<uint64_t>(1);
-  uint64_t rank;
-
-  auto is_duplicate = [](uint64_t index) { return (index & kDuplicateMask) != 
0; };
-  auto original_index = [](uint64_t index) { return index & ~kDuplicateMask; };
-
-  switch (tiebreaker) {
-    case RankOptions::Dense: {
-      rank = 0;
-      for (auto it = sorted.overall_begin(); it < sorted.overall_end(); ++it) {
-        if (!is_duplicate(*it)) {
-          ++rank;
-        }
-        out_begin[original_index(*it)] = rank;
-      }
-      break;
-    }
-
-    case RankOptions::First: {
-      rank = 0;
-      for (auto it = sorted.overall_begin(); it < sorted.overall_end(); it++) {
-        // No duplicate marks expected for RankOptions::First
-        DCHECK(!is_duplicate(*it));
-        out_begin[*it] = ++rank;
-      }
-      break;
-    }
+const RankOptions* GetDefaultRankOptions() {
+  static const auto kDefaultRankOptions = RankOptions::Defaults();
+  return &kDefaultRankOptions;
+}
 
-    case RankOptions::Min: {
-      rank = 0;
-      for (auto it = sorted.overall_begin(); it < sorted.overall_end(); ++it) {
-        if (!is_duplicate(*it)) {
-          rank = (it - sorted.overall_begin()) + 1;
-        }
-        out_begin[original_index(*it)] = rank;
-      }
-      break;
-    }
+const RankQuantileOptions* GetDefaultQuantileRankOptions() {
+  static const auto kDefaultQuantileRankOptions = 
RankQuantileOptions::Defaults();
+  return &kDefaultQuantileRankOptions;
+}
 
-    case RankOptions::Max: {
-      rank = length;
-      for (auto it = sorted.overall_end() - 1; it >= sorted.overall_begin(); 
--it) {
-        out_begin[original_index(*it)] = rank;
-        // If the current index isn't marked as duplicate, then it's the last
-        // tie in a row (since we iterate in reverse order), so update rank
-        // for the next row of ties.
-        if (!is_duplicate(*it)) {
-          rank = it - sorted.overall_begin();
-        }
-      }
-      break;
-    }
+template <typename ArrowType>
+Result<NullPartitionResult> DoSortAndMarkDuplicate(
+    ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end, const 
Array& input,
+    const std::shared_ptr<DataType>& physical_type, const SortOrder order,
+    const NullPlacement null_placement, bool needs_duplicates) {
+  using GetView = GetViewType<ArrowType>;
+  using ArrayType = typename TypeTraits<ArrowType>::ArrayType;
+
+  ARROW_ASSIGN_OR_RAISE(auto array_sorter, GetArraySorter(*physical_type));
+
+  ArrayType array(input.data());
+  ARROW_ASSIGN_OR_RAISE(auto sorted,
+                        array_sorter(indices_begin, indices_end, array, 0,
+                                     ArraySortOptions(order, null_placement), 
ctx));
+
+  if (needs_duplicates) {
+    auto value_selector = [&array](int64_t index) {
+      return GetView::LogicalValue(array.GetView(index));
+    };
+    MarkDuplicates(sorted, value_selector);
   }
-
-  return Datum(rankings);
+  return sorted;
 }
 
-const RankOptions* GetDefaultRankOptions() {
-  static const auto kDefaultRankOptions = RankOptions::Defaults();
-  return &kDefaultRankOptions;
+template <typename ArrowType>
+Result<NullPartitionResult> DoSortAndMarkDuplicate(
+    ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end,
+    const ChunkedArray& input, const std::shared_ptr<DataType>& physical_type,
+    const SortOrder order, const NullPlacement null_placement, bool 
needs_duplicates) {
+  auto physical_chunks = GetPhysicalChunks(input, physical_type);
+  if (physical_chunks.empty()) {
+    return NullPartitionResult{};
+  }
+  ARROW_ASSIGN_OR_RAISE(auto sorted,
+                        SortChunkedArray(ctx, indices_begin, indices_end, 
physical_type,
+                                         physical_chunks, order, 
null_placement));
+  if (needs_duplicates) {
+    const auto arrays = GetArrayPointers(physical_chunks);
+    auto value_selector = [resolver = 
ChunkedArrayResolver(span(arrays))](int64_t index) {
+      return resolver.Resolve(index).Value<ArrowType>();
+    };
+    MarkDuplicates(sorted, value_selector);
+  }
+  return sorted;
 }
 
-template <typename InputType, typename RankerType>
-class RankerMixin : public TypeVisitor {
+template <typename InputType>
+class SortAndMarkDuplicate : public TypeVisitor {
  public:
-  RankerMixin(ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end,
-              const InputType& input, const SortOrder order,
-              const NullPlacement null_placement,
-              const RankOptions::Tiebreaker tiebreaker, Datum* output)
+  SortAndMarkDuplicate(ExecContext* ctx, uint64_t* indices_begin, uint64_t* 
indices_end,
+                       const InputType& input, const SortOrder order,
+                       const NullPlacement null_placement, const bool 
needs_duplicate)
       : TypeVisitor(),
         ctx_(ctx),
         indices_begin_(indices_begin),
@@ -145,104 +131,145 @@ class RankerMixin : public TypeVisitor {
         input_(input),
         order_(order),
         null_placement_(null_placement),
-        tiebreaker_(tiebreaker),
-        physical_type_(GetPhysicalType(input.type())),
-        output_(output) {}
+        needs_duplicates_(needs_duplicate),
+        physical_type_(GetPhysicalType(input.type())) {}
 
-  Status Run() { return physical_type_->Accept(this); }
+  Result<NullPartitionResult> Run() {
+    RETURN_NOT_OK(physical_type_->Accept(this));
+    return sorted_;
+  }
 
-#define VISIT(TYPE)                                                       \
-  Status Visit(const TYPE& type) {                                        \
-    return static_cast<RankerType*>(this)->template RankInternal<TYPE>(); \
+#define VISIT(TYPE)                                                            
     \
+  Status Visit(const TYPE& type) {                                             
     \
+    ARROW_ASSIGN_OR_RAISE(                                                     
     \
+        sorted_, DoSortAndMarkDuplicate<TYPE>(ctx_, indices_begin_, 
indices_end_,   \
+                                              input_, physical_type_, order_,  
     \
+                                              null_placement_, 
needs_duplicates_)); \
+    return Status::OK();                                                       
     \
   }
 
   VISIT_SORTABLE_PHYSICAL_TYPES(VISIT)
 
 #undef VISIT
 
- protected:
+ private:
   ExecContext* ctx_;
   uint64_t* indices_begin_;
   uint64_t* indices_end_;
   const InputType& input_;
   const SortOrder order_;
   const NullPlacement null_placement_;
-  const RankOptions::Tiebreaker tiebreaker_;
+  const bool needs_duplicates_;
   const std::shared_ptr<DataType> physical_type_;
-  Datum* output_;
+  NullPartitionResult sorted_{};
 };
 
-template <typename T>
-class Ranker;
-
-template <>
-class Ranker<Array> : public RankerMixin<Array, Ranker<Array>> {
- public:
-  using RankerMixin::RankerMixin;
-
-  template <typename InType>
-  Status RankInternal() {
-    using GetView = GetViewType<InType>;
-    using ArrayType = typename TypeTraits<InType>::ArrayType;
+// A helper class that emits rankings for the "rank_quantile" function
+struct QuantileRanker {
+  Result<Datum> CreateRankings(ExecContext* ctx, const NullPartitionResult& 
sorted) {
+    const int64_t length = sorted.overall_end() - sorted.overall_begin();
+    ARROW_ASSIGN_OR_RAISE(auto rankings,
+                          MakeMutableFloat64Array(length, ctx->memory_pool()));
+    auto out_begin = rankings->GetMutableValues<double>(1);
+
+    auto is_duplicate = [](uint64_t index) { return (index & kDuplicateMask) 
!= 0; };
+    auto original_index = [](uint64_t index) { return index & ~kDuplicateMask; 
};
+
+    // The count of values strictly less than the value being considered
+    int64_t cum_freq = 0;
+    auto it = sorted.overall_begin();
+
+    while (it < sorted.overall_end()) {
+      // Look for a run of duplicate values
+      DCHECK(!is_duplicate(*it));
+      auto run_end = it;
+      while (++run_end < sorted.overall_end() && is_duplicate(*run_end)) {
+      }
+      // The run length, i.e. the frequency of the current value
+      int64_t freq = run_end - it;
+      double quantile = (cum_freq + 0.5 * freq) / static_cast<double>(length);
+      // Output quantile rank values
+      for (; it < run_end; ++it) {
+        out_begin[original_index(*it)] = quantile;
+      }
+      cum_freq += freq;
+    }
+    DCHECK_EQ(cum_freq, length);
+    return Datum(rankings);
+  }
+};
 
-    ARROW_ASSIGN_OR_RAISE(auto array_sorter, GetArraySorter(*physical_type_));
+// A helper class that emits rankings for the "rank" function
+struct OrdinalRanker {
+  explicit OrdinalRanker(RankOptions::Tiebreaker tiebreaker) : 
tiebreaker_(tiebreaker) {}
 
-    ArrayType array(input_.data());
-    ARROW_ASSIGN_OR_RAISE(NullPartitionResult sorted,
-                          array_sorter(indices_begin_, indices_end_, array, 0,
-                                       ArraySortOptions(order_, 
null_placement_), ctx_));
+  Result<Datum> CreateRankings(ExecContext* ctx, const NullPartitionResult& 
sorted) {
+    const int64_t length = sorted.overall_end() - sorted.overall_begin();
+    ARROW_ASSIGN_OR_RAISE(auto rankings,
+                          MakeMutableUInt64Array(length, ctx->memory_pool()));
+    auto out_begin = rankings->GetMutableValues<uint64_t>(1);
+    uint64_t rank;
+
+    auto is_duplicate = [](uint64_t index) { return (index & kDuplicateMask) 
!= 0; };
+    auto original_index = [](uint64_t index) { return index & ~kDuplicateMask; 
};
+
+    switch (tiebreaker_) {
+      case RankOptions::Dense: {
+        rank = 0;
+        for (auto it = sorted.overall_begin(); it < sorted.overall_end(); 
++it) {
+          if (!is_duplicate(*it)) {
+            ++rank;
+          }
+          out_begin[original_index(*it)] = rank;
+        }
+        break;
+      }
 
-    if (NeedsDuplicates(tiebreaker_)) {
-      auto value_selector = [&array](int64_t index) {
-        return GetView::LogicalValue(array.GetView(index));
-      };
-      MarkDuplicates(sorted, value_selector);
-    }
-    ARROW_ASSIGN_OR_RAISE(*output_,
-                          CreateRankings(ctx_, sorted, null_placement_, 
tiebreaker_));
+      case RankOptions::First: {
+        rank = 0;
+        for (auto it = sorted.overall_begin(); it < sorted.overall_end(); 
it++) {
+          // No duplicate marks expected for RankOptions::First
+          DCHECK(!is_duplicate(*it));
+          out_begin[*it] = ++rank;
+        }
+        break;
+      }
 
-    return Status::OK();
-  }
-};
+      case RankOptions::Min: {
+        rank = 0;
+        for (auto it = sorted.overall_begin(); it < sorted.overall_end(); 
++it) {
+          if (!is_duplicate(*it)) {
+            rank = (it - sorted.overall_begin()) + 1;
+          }
+          out_begin[original_index(*it)] = rank;
+        }
+        break;
+      }
 
-template <>
-class Ranker<ChunkedArray> : public RankerMixin<ChunkedArray, 
Ranker<ChunkedArray>> {
- public:
-  template <typename... Args>
-  explicit Ranker(Args&&... args)
-      : RankerMixin(std::forward<Args>(args)...),
-        physical_chunks_(GetPhysicalChunks(input_, physical_type_)) {}
-
-  template <typename InType>
-  Status RankInternal() {
-    if (physical_chunks_.empty()) {
-      return Status::OK();
+      case RankOptions::Max: {
+        rank = length;
+        for (auto it = sorted.overall_end() - 1; it >= sorted.overall_begin(); 
--it) {
+          out_begin[original_index(*it)] = rank;
+          // If the current index isn't marked as duplicate, then it's the last
+          // tie in a row (since we iterate in reverse order), so update rank
+          // for the next row of ties.
+          if (!is_duplicate(*it)) {
+            rank = it - sorted.overall_begin();
+          }
+        }
+        break;
+      }
     }
 
-    ARROW_ASSIGN_OR_RAISE(
-        NullPartitionResult sorted,
-        SortChunkedArray(ctx_, indices_begin_, indices_end_, physical_type_,
-                         physical_chunks_, order_, null_placement_));
-
-    if (NeedsDuplicates(tiebreaker_)) {
-      const auto arrays = GetArrayPointers(physical_chunks_);
-      auto value_selector = [resolver =
-                                 ChunkedArrayResolver(span(arrays))](int64_t 
index) {
-        return resolver.Resolve(index).Value<InType>();
-      };
-      MarkDuplicates(sorted, value_selector);
-    }
-    ARROW_ASSIGN_OR_RAISE(*output_,
-                          CreateRankings(ctx_, sorted, null_placement_, 
tiebreaker_));
-    return Status::OK();
+    return Datum(rankings);
   }
 
  private:
-  const ArrayVector physical_chunks_;
+  const RankOptions::Tiebreaker tiebreaker_;
 };
 
 const FunctionDoc rank_doc(
-    "Compute numerical ranks of an array (1-based)",
+    "Compute ordinal ranks of an array (1-based)",
     ("This function computes a rank of the input array.\n"
      "By default, null values are considered greater than any other value 
and\n"
      "are therefore sorted at the end of the input. For floating-point 
types,\n"
@@ -253,21 +280,34 @@ const FunctionDoc rank_doc(
      "The handling of nulls, NaNs and tiebreakers can be changed in 
RankOptions."),
     {"input"}, "RankOptions");
 
-class RankMetaFunction : public MetaFunction {
+const FunctionDoc rank_quantile_doc(
+    "Compute quantile ranks of an array",
+    ("This function computes a quantile rank of the input array.\n"
+     "By default, null values are considered greater than any other value 
and\n"
+     "are therefore sorted at the end of the input. For floating-point 
types,\n"
+     "NaNs are considered greater than any other non-null value, but smaller\n"
+     "than null values.\n"
+     "The results are real values strictly between 0 and 1. They are\n"
+     "computed as in https://en.wikipedia.org/wiki/Quantile_rank\n";
+     "but without multiplying by 100.\n"
+     "\n"
+     "The handling of nulls and NaNs can be changed in RankQuantileOptions."),
+    {"input"}, "RankQuantileOptions");
+
+template <typename Derived>
+class RankMetaFunctionBase : public MetaFunction {
  public:
-  RankMetaFunction()
-      : MetaFunction("rank", Arity::Unary(), rank_doc, 
GetDefaultRankOptions()) {}
+  using MetaFunction::MetaFunction;
 
   Result<Datum> ExecuteImpl(const std::vector<Datum>& args,
                             const FunctionOptions* options,
                             ExecContext* ctx) const override {
-    const auto& rank_options = checked_cast<const RankOptions&>(*options);
     switch (args[0].kind()) {
       case Datum::ARRAY: {
-        return Rank(*args[0].make_array(), rank_options, ctx);
+        return Rank(*args[0].make_array(), *options, ctx);
       } break;
       case Datum::CHUNKED_ARRAY: {
-        return Rank(*args[0].chunked_array(), rank_options, ctx);
+        return Rank(*args[0].chunked_array(), *options, ctx);
       } break;
       default:
         break;
@@ -278,10 +318,13 @@ class RankMetaFunction : public MetaFunction {
         args[0].ToString());
   }
 
- private:
+ protected:
   template <typename T>
-  static Result<Datum> Rank(const T& input, const RankOptions& options,
-                            ExecContext* ctx) {
+  Result<Datum> Rank(const T& input, const FunctionOptions& function_options,
+                     ExecContext* ctx) const {
+    const auto& options =
+        checked_cast<const typename 
Derived::FunctionOptionsType&>(function_options);
+
     SortOrder order = SortOrder::Ascending;
     if (!options.sort_keys.empty()) {
       order = options.sort_keys[0].order;
@@ -293,19 +336,53 @@ class RankMetaFunction : public MetaFunction {
     auto* indices_begin = indices->GetMutableValues<uint64_t>(1);
     auto* indices_end = indices_begin + length;
     std::iota(indices_begin, indices_end, 0);
+    auto needs_duplicates = Derived::NeedsDuplicates(options);
+    ARROW_ASSIGN_OR_RAISE(
+        auto sorted, SortAndMarkDuplicate(ctx, indices_begin, indices_end, 
input, order,
+                                          options.null_placement, 
needs_duplicates)
+                         .Run());
+
+    auto ranker = Derived::GetRanker(options);
+    return ranker.CreateRankings(ctx, sorted);
+  }
+};
+
+class RankMetaFunction : public RankMetaFunctionBase<RankMetaFunction> {
+ public:
+  using FunctionOptionsType = RankOptions;
+  using RankerType = OrdinalRanker;
 
-    Datum output;
-    Ranker<T> ranker(ctx, indices_begin, indices_end, input, order,
-                     options.null_placement, options.tiebreaker, &output);
-    ARROW_RETURN_NOT_OK(ranker.Run());
-    return output;
+  static bool NeedsDuplicates(const RankOptions& options) {
+    return options.tiebreaker != RankOptions::First;
   }
+
+  static RankerType GetRanker(const RankOptions& options) {
+    return RankerType(options.tiebreaker);
+  }
+
+  RankMetaFunction()
+      : RankMetaFunctionBase("rank", Arity::Unary(), rank_doc, 
GetDefaultRankOptions()) {}
+};
+
+class RankQuantileMetaFunction : public 
RankMetaFunctionBase<RankQuantileMetaFunction> {
+ public:
+  using FunctionOptionsType = RankQuantileOptions;
+  using RankerType = QuantileRanker;
+
+  static bool NeedsDuplicates(const RankQuantileOptions&) { return true; }
+
+  static RankerType GetRanker(const RankQuantileOptions& options) { return 
RankerType(); }
+
+  RankQuantileMetaFunction()
+      : RankMetaFunctionBase("rank_quantile", Arity::Unary(), 
rank_quantile_doc,
+                             GetDefaultQuantileRankOptions()) {}
 };
 
 }  // namespace
 
 void RegisterVectorRank(FunctionRegistry* registry) {
   DCHECK_OK(registry->AddFunction(std::make_shared<RankMetaFunction>()));
+  
DCHECK_OK(registry->AddFunction(std::make_shared<RankQuantileMetaFunction>()));
 }
 
 }  // namespace arrow::compute::internal
diff --git a/cpp/src/arrow/compute/kernels/vector_sort_internal.h 
b/cpp/src/arrow/compute/kernels/vector_sort_internal.h
index cc6b7834a3..6288aa26ea 100644
--- a/cpp/src/arrow/compute/kernels/vector_sort_internal.h
+++ b/cpp/src/arrow/compute/kernels/vector_sort_internal.h
@@ -806,4 +806,11 @@ inline Result<std::shared_ptr<ArrayData>> 
MakeMutableUInt64Array(
   return ArrayData::Make(uint64(), length, {nullptr, std::move(data)}, 
/*null_count=*/0);
 }
 
+inline Result<std::shared_ptr<ArrayData>> MakeMutableFloat64Array(
+    int64_t length, MemoryPool* memory_pool) {
+  auto buffer_size = length * sizeof(double);
+  ARROW_ASSIGN_OR_RAISE(auto data, AllocateBuffer(buffer_size, memory_pool));
+  return ArrayData::Make(float64(), length, {nullptr, std::move(data)}, 
/*null_count=*/0);
+}
+
 }  // namespace arrow::compute::internal
diff --git a/cpp/src/arrow/compute/kernels/vector_sort_test.cc 
b/cpp/src/arrow/compute/kernels/vector_sort_test.cc
index 7f0ef641f6..dc1c055705 100644
--- a/cpp/src/arrow/compute/kernels/vector_sort_test.cc
+++ b/cpp/src/arrow/compute/kernels/vector_sort_test.cc
@@ -2205,9 +2205,9 @@ TEST_F(TestNestedSortIndices, SortRecordBatch) { 
TestSort(GetRecordBatch()); }
 TEST_F(TestNestedSortIndices, SortTable) { TestSort(GetTable()); }
 
 // ----------------------------------------------------------------------
-// Tests for Rank
+// Tests for Rank and Quantile Rank
 
-class TestRank : public ::testing::Test {
+class BaseTestRank : public ::testing::Test {
  protected:
   // Create several test datums from `array`. One of which is the unmodified 
Array
   // while the rest are chunked variants based on it.
@@ -2236,6 +2236,11 @@ class TestRank : public ::testing::Test {
     datums_ = {chunked_array};
   }
 
+  DatumVector datums_;
+};
+
+class TestRank : public BaseTestRank {
+ protected:
   static void AssertRank(const DatumVector& datums, SortOrder order,
                          NullPlacement null_placement, RankOptions::Tiebreaker 
tiebreaker,
                          const std::shared_ptr<Array>& expected) {
@@ -2310,8 +2315,6 @@ class TestRank : public ::testing::Test {
     AssertRank(SortOrder::Descending, NullPlacement::AtStart, 
RankOptions::Dense,
                ArrayFromJSON(uint64(), "[3, 4, 2, 1, 2, 1, 4]"));
   }
-
-  DatumVector datums_;
 };
 
 TEST_F(TestRank, Real) {
@@ -2466,5 +2469,140 @@ TEST_F(TestRank, EmptyChunks) {
   }
 }
 
+class TestRankQuantile : public BaseTestRank {
+ public:
+  void AssertRankQuantile(const DatumVector& datums, SortOrder order,
+                          NullPlacement null_placement,
+                          const std::shared_ptr<Array>& expected) {
+    const std::vector<SortKey> sort_keys{SortKey("foo", order)};
+    RankQuantileOptions options(sort_keys, null_placement);
+    ARROW_SCOPED_TRACE("options = ", options.ToString());
+    for (const auto& datum : datums) {
+      ASSERT_OK_AND_ASSIGN(auto actual, CallFunction("rank_quantile", {datum}, 
&options));
+      ValidateOutput(actual);
+      AssertDatumsEqual(expected, actual, /*verbose=*/true);
+    }
+  }
+
+  void AssertRankQuantile(const DatumVector& datums, SortOrder order,
+                          NullPlacement null_placement, const std::string& 
expected) {
+    AssertRankQuantile(datums, order, null_placement, ArrayFromJSON(float64(), 
expected));
+  }
+
+  void AssertRankQuantile(SortOrder order, NullPlacement null_placement,
+                          const std::shared_ptr<Array>& expected) {
+    AssertRankQuantile(datums_, order, null_placement, expected);
+  }
+
+  void AssertRankQuantile(SortOrder order, NullPlacement null_placement,
+                          const std::string& expected) {
+    AssertRankQuantile(datums_, order, null_placement,
+                       ArrayFromJSON(float64(), expected));
+  }
+
+  void AssertRankQuantileEmpty(std::shared_ptr<DataType> type) {
+    for (auto null_placement : AllNullPlacements()) {
+      for (auto order : AllOrders()) {
+        AssertRankQuantile({ArrayFromJSON(type, "[]")}, order, null_placement, 
"[]");
+        AssertRankQuantile({ArrayFromJSON(type, "[null]")}, order, 
null_placement,
+                           "[0.5]");
+        AssertRankQuantile({ArrayFromJSON(type, "[null, null, null]")}, order,
+                           null_placement, "[0.5, 0.5, 0.5]");
+      }
+    }
+  }
+
+  // Expecting an input ordered like [1, 2, 1, 2, 1]
+  void AssertRankQuantile_12121() {
+    for (auto null_placement : AllNullPlacements()) {
+      AssertRankQuantile(SortOrder::Ascending, null_placement,
+                         "[0.3, 0.8, 0.3, 0.8, 0.3]");
+      AssertRankQuantile(SortOrder::Descending, null_placement,
+                         "[0.7, 0.2, 0.7, 0.2, 0.7]");
+    }
+  }
+
+  // Expecting an input ordered like [null, 1, null, 2, null]
+  void AssertRankQuantile_N1N2N() {
+    AssertRankQuantile(SortOrder::Ascending, NullPlacement::AtStart,
+                       "[0.3, 0.7, 0.3, 0.9, 0.3]");
+    AssertRankQuantile(SortOrder::Ascending, NullPlacement::AtEnd,
+                       "[0.7, 0.1, 0.7, 0.3, 0.7]");
+    AssertRankQuantile(SortOrder::Descending, NullPlacement::AtStart,
+                       "[0.3, 0.9, 0.3, 0.7, 0.3]");
+    AssertRankQuantile(SortOrder::Descending, NullPlacement::AtEnd,
+                       "[0.7, 0.3, 0.7, 0.1, 0.7]");
+  }
+
+  void AssertRankQuantileNumeric(std::shared_ptr<DataType> type) {
+    ARROW_SCOPED_TRACE("type = ", type->ToString());
+    AssertRankQuantileEmpty(type);
+
+    // Reproduce the example from https://en.wikipedia.org/wiki/Percentile_rank
+    SetInput(ArrayFromJSON(type, "[7, 5, 5, 4, 4, 3, 3, 3, 2, 1]"));
+    for (auto null_placement : AllNullPlacements()) {
+      AssertRankQuantile(SortOrder::Ascending, null_placement,
+                         "[0.95, 0.8, 0.8, 0.6, 0.6, 0.35, 0.35, 0.35, 0.15, 
0.05]");
+      AssertRankQuantile(SortOrder::Descending, null_placement,
+                         "[0.05, 0.2, 0.2, 0.4, 0.4, 0.65, 0.65, 0.65, 0.85, 
0.95]");
+    }
+
+    // With nulls
+    SetInput(ArrayFromJSON(type, "[null, 1, null, 2, null]"));
+    AssertRankQuantile_N1N2N();
+  }
+
+  void AssertRankQuantileBinaryLike(std::shared_ptr<DataType> type) {
+    ARROW_SCOPED_TRACE("type = ", type->ToString());
+    AssertRankQuantileEmpty(type);
+
+    SetInput(ArrayFromJSON(type, R"(["", "ab", "", "ab", ""])"));
+    AssertRankQuantile_12121();
+    // With nulls
+    SetInput(ArrayFromJSON(type, R"([null, "", null, "ab", null])"));
+    AssertRankQuantile_N1N2N();
+  }
+};
+
+TEST_F(TestRankQuantile, Real) {
+  for (auto type : ::arrow::FloatingPointTypes()) {
+    AssertRankQuantileNumeric(type);
+  }
+}
+
+TEST_F(TestRankQuantile, Integral) {
+  for (auto type : ::arrow::IntTypes()) {
+    AssertRankQuantileNumeric(type);
+  }
+}
+
+TEST_F(TestRankQuantile, Boolean) {
+  auto type = boolean();
+  AssertRankQuantileEmpty(type);
+
+  SetInput(ArrayFromJSON(type, "[false, true, false, true, false]"));
+  AssertRankQuantile_12121();
+  // With nulls
+  SetInput(ArrayFromJSON(type, "[null, false, null, true, null]"));
+  AssertRankQuantile_N1N2N();
+}
+
+TEST_F(TestRankQuantile, BinaryLike) {
+  for (auto type : BaseBinaryTypes()) {
+    AssertRankQuantileBinaryLike(type);
+  }
+}
+
+TEST_F(TestRankQuantile, FixedSizeBinary) {
+  auto type = fixed_size_binary(3);
+  AssertRankQuantileEmpty(type);
+
+  SetInput(ArrayFromJSON(type, R"(["abc", "def", "abc", "def", "abc"])"));
+  AssertRankQuantile_12121();
+  // With nulls
+  SetInput(ArrayFromJSON(type, R"([null, "abc", null, "def", null])"));
+  AssertRankQuantile_N1N2N();
+}
+
 }  // namespace compute
 }  // namespace arrow
diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst
index 92f3e44039..6acc9e31a5 100644
--- a/docs/source/cpp/compute.rst
+++ b/docs/source/cpp/compute.rst
@@ -1796,19 +1796,21 @@ in the respective option classes.
    Binary- and String-like inputs are ordered lexicographically as bytestrings,
    even for String types.
 
-+-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+
-| Function name         | Arity      | Input types                             
                | Output type       | Options class                  | Notes    
      |
-+=======================+============+=========================================================+===================+================================+================+
-| array_sort_indices    | Unary      | Boolean, Numeric, Temporal, Binary- and 
String-like     | UInt64            | :struct:`ArraySortOptions`     | \(1) 
\(2)      |
-+-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+
-| partition_nth_indices | Unary      | Boolean, Numeric, Temporal, Binary- and 
String-like     | UInt64            | :struct:`PartitionNthOptions`  | \(3)     
      |
-+-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+
-| rank                  | Unary      | Boolean, Numeric, Temporal, Binary- and 
String-like     | UInt64            | :struct:`RankOptions`          | \(4)     
      |
-+-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+
-| select_k_unstable     | Unary      | Boolean, Numeric, Temporal, Binary- and 
String-like     | UInt64            | :struct:`SelectKOptions`       | \(5) 
\(6)      |
-+-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+
-| sort_indices          | Unary      | Boolean, Numeric, Temporal, Binary- and 
String-like     | UInt64            | :struct:`SortOptions`          | \(1) 
\(5)      |
-+-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+
++-----------------------+------------+---------------------------------------------------------+-------------------+-------------------------------+----------------+
+| Function name         | Arity      | Input types                             
                | Output type       | Options class                 | Notes     
     |
++=======================+============+=========================================================+===================+===============================+================+
+| array_sort_indices    | Unary      | Boolean, Numeric, Temporal, Binary- and 
String-like     | UInt64            | :struct:`ArraySortOptions`    | \(1) \(2) 
     |
++-----------------------+------------+---------------------------------------------------------+-------------------+-------------------------------+----------------+
+| partition_nth_indices | Unary      | Boolean, Numeric, Temporal, Binary- and 
String-like     | UInt64            | :struct:`PartitionNthOptions` | \(3)      
     |
++-----------------------+------------+---------------------------------------------------------+-------------------+-------------------------------+----------------+
+| rank                  | Unary      | Boolean, Numeric, Temporal, Binary- and 
String-like     | UInt64            | :struct:`RankOptions`         | \(4)      
     |
++-----------------------+------------+---------------------------------------------------------+-------------------+-------------------------------+----------------+
+| rank_quantile         | Unary      | Boolean, Numeric, Temporal, Binary- and 
String-like     | Float64           | :struct:`RankQuantileOptions` | \(5)      
     |
++-----------------------+------------+---------------------------------------------------------+-------------------+-------------------------------+----------------+
+| select_k_unstable     | Unary      | Boolean, Numeric, Temporal, Binary- and 
String-like     | UInt64            | :struct:`SelectKOptions`      | \(6) \(7) 
     |
++-----------------------+------------+---------------------------------------------------------+-------------------+-------------------------------+----------------+
+| sort_indices          | Unary      | Boolean, Numeric, Temporal, Binary- and 
String-like     | UInt64            | :struct:`SortOptions`         | \(1) \(6) 
     |
++-----------------------+------------+---------------------------------------------------------+-------------------+-------------------------------+----------------+
 
 
 * \(1) The output is an array of indices into the input, that define a
@@ -1823,13 +1825,15 @@ in the respective option classes.
   :func:`std::nth_element`).  *N* is given in
   :member:`PartitionNthOptions::pivot`.
 
-* \(4) The output is a one-based numerical array of ranks
+* \(4) The output is a one-based numerical array of ranks.
 
-* \(5) The input can be an array, chunked array, record batch or
+* \(5) The output is an array of quantiles strictly between 0 and 1.
+
+* \(6) The input can be an array, chunked array, record batch or
   table. If the input is a record batch or table, one or more sort
   keys must be specified.
 
-* \(6) The output is an array of indices into the input, that define a
+* \(7) The output is an array of indices into the input, that define a
   non-stable sort of the input.
 
 .. _cpp-compute-vector-structural-transforms:


Reply via email to