This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 17a0ff5566 GH-45190: [C++][Compute] Add rank_quantile function (#45259)
17a0ff5566 is described below
commit 17a0ff556610c5fc0666f4220a6a0e87c487770c
Author: Antoine Pitrou <[email protected]>
AuthorDate: Thu Jan 23 20:32:28 2025 +0100
GH-45190: [C++][Compute] Add rank_quantile function (#45259)
### Rationale for this change
Add a "rank_quantile" function following the Wikipedia definition:
https://en.wikipedia.org/wiki/Percentile_rank
### Are these changes tested?
Yes.
### Are there any user-facing changes?
Yes, an additional compute function.
* GitHub Issue: #45190
Lead-authored-by: Antoine Pitrou <[email protected]>
Co-authored-by: Rossi Sun <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/arrow/compute/api_vector.cc | 11 +
cpp/src/arrow/compute/api_vector.h | 19 +
cpp/src/arrow/compute/kernels/vector_rank.cc | 383 +++++++++++++--------
.../arrow/compute/kernels/vector_sort_internal.h | 7 +
cpp/src/arrow/compute/kernels/vector_sort_test.cc | 146 +++++++-
docs/source/cpp/compute.rst | 36 +-
6 files changed, 429 insertions(+), 173 deletions(-)
diff --git a/cpp/src/arrow/compute/api_vector.cc
b/cpp/src/arrow/compute/api_vector.cc
index 22ecf1cc87..61335de6ac 100644
--- a/cpp/src/arrow/compute/api_vector.cc
+++ b/cpp/src/arrow/compute/api_vector.cc
@@ -48,6 +48,7 @@ using compute::DictionaryEncodeOptions;
using compute::FilterOptions;
using compute::NullPlacement;
using compute::RankOptions;
+using compute::RankQuantileOptions;
template <>
struct EnumTraits<FilterOptions::NullSelectionBehavior>
@@ -151,6 +152,9 @@ static auto kRankOptionsType =
GetFunctionOptionsType<RankOptions>(
DataMember("sort_keys", &RankOptions::sort_keys),
DataMember("null_placement", &RankOptions::null_placement),
DataMember("tiebreaker", &RankOptions::tiebreaker));
+static auto kRankQuantileOptionsType =
GetFunctionOptionsType<RankQuantileOptions>(
+ DataMember("sort_keys", &RankQuantileOptions::sort_keys),
+ DataMember("null_placement", &RankQuantileOptions::null_placement));
static auto kPairwiseOptionsType = GetFunctionOptionsType<PairwiseOptions>(
DataMember("periods", &PairwiseOptions::periods));
static auto kListFlattenOptionsType =
GetFunctionOptionsType<ListFlattenOptions>(
@@ -228,6 +232,13 @@ RankOptions::RankOptions(std::vector<SortKey> sort_keys,
NullPlacement null_plac
tiebreaker(tiebreaker) {}
constexpr char RankOptions::kTypeName[];
+RankQuantileOptions::RankQuantileOptions(std::vector<SortKey> sort_keys,
+ NullPlacement null_placement)
+ : FunctionOptions(internal::kRankQuantileOptionsType),
+ sort_keys(std::move(sort_keys)),
+ null_placement(null_placement) {}
+constexpr char RankQuantileOptions::kTypeName[];
+
PairwiseOptions::PairwiseOptions(int64_t periods)
: FunctionOptions(internal::kPairwiseOptionsType), periods(periods) {}
constexpr char PairwiseOptions::kTypeName[];
diff --git a/cpp/src/arrow/compute/api_vector.h
b/cpp/src/arrow/compute/api_vector.h
index ada1665b3e..22bb164719 100644
--- a/cpp/src/arrow/compute/api_vector.h
+++ b/cpp/src/arrow/compute/api_vector.h
@@ -195,6 +195,25 @@ class ARROW_EXPORT RankOptions : public FunctionOptions {
Tiebreaker tiebreaker;
};
+/// \brief Quantile rank options
+class ARROW_EXPORT RankQuantileOptions : public FunctionOptions {
+ public:
+ explicit RankQuantileOptions(std::vector<SortKey> sort_keys = {},
+ NullPlacement null_placement =
NullPlacement::AtEnd);
+ /// Convenience constructor for array inputs
+ explicit RankQuantileOptions(SortOrder order,
+ NullPlacement null_placement =
NullPlacement::AtEnd)
+ : RankQuantileOptions({SortKey("", order)}, null_placement) {}
+
+ static constexpr char const kTypeName[] = "RankQuantileOptions";
+ static RankQuantileOptions Defaults() { return RankQuantileOptions(); }
+
+ /// Column key(s) to order by and how to order by these sort keys.
+ std::vector<SortKey> sort_keys;
+ /// Whether nulls and NaNs are placed at the start or at the end
+ NullPlacement null_placement;
+};
+
/// \brief Partitioning options for NthToIndices
class ARROW_EXPORT PartitionNthOptions : public FunctionOptions {
public:
diff --git a/cpp/src/arrow/compute/kernels/vector_rank.cc
b/cpp/src/arrow/compute/kernels/vector_rank.cc
index 4fdc83788c..2efc61c2e6 100644
--- a/cpp/src/arrow/compute/kernels/vector_rank.cc
+++ b/cpp/src/arrow/compute/kernels/vector_rank.cc
@@ -15,6 +15,9 @@
// specific language governing permissions and limitations
// under the License.
+#include <functional>
+#include <memory>
+
#include "arrow/compute/function.h"
#include "arrow/compute/kernels/vector_sort_internal.h"
#include "arrow/compute/registry.h"
@@ -32,10 +35,6 @@ namespace {
// is the same as the value at the previous sort index.
constexpr uint64_t kDuplicateMask = 1ULL << 63;
-constexpr bool NeedsDuplicates(RankOptions::Tiebreaker tiebreaker) {
- return tiebreaker != RankOptions::First;
-}
-
template <typename ValueSelector>
void MarkDuplicates(const NullPartitionResult& sorted, ValueSelector&&
value_selector) {
using T = decltype(value_selector(int64_t{}));
@@ -63,81 +62,68 @@ void MarkDuplicates(const NullPartitionResult& sorted,
ValueSelector&& value_sel
}
}
-Result<Datum> CreateRankings(ExecContext* ctx, const NullPartitionResult&
sorted,
- const NullPlacement null_placement,
- const RankOptions::Tiebreaker tiebreaker) {
- auto length = sorted.overall_end() - sorted.overall_begin();
- ARROW_ASSIGN_OR_RAISE(auto rankings,
- MakeMutableUInt64Array(length, ctx->memory_pool()));
- auto out_begin = rankings->GetMutableValues<uint64_t>(1);
- uint64_t rank;
-
- auto is_duplicate = [](uint64_t index) { return (index & kDuplicateMask) !=
0; };
- auto original_index = [](uint64_t index) { return index & ~kDuplicateMask; };
-
- switch (tiebreaker) {
- case RankOptions::Dense: {
- rank = 0;
- for (auto it = sorted.overall_begin(); it < sorted.overall_end(); ++it) {
- if (!is_duplicate(*it)) {
- ++rank;
- }
- out_begin[original_index(*it)] = rank;
- }
- break;
- }
-
- case RankOptions::First: {
- rank = 0;
- for (auto it = sorted.overall_begin(); it < sorted.overall_end(); it++) {
- // No duplicate marks expected for RankOptions::First
- DCHECK(!is_duplicate(*it));
- out_begin[*it] = ++rank;
- }
- break;
- }
+const RankOptions* GetDefaultRankOptions() {
+ static const auto kDefaultRankOptions = RankOptions::Defaults();
+ return &kDefaultRankOptions;
+}
- case RankOptions::Min: {
- rank = 0;
- for (auto it = sorted.overall_begin(); it < sorted.overall_end(); ++it) {
- if (!is_duplicate(*it)) {
- rank = (it - sorted.overall_begin()) + 1;
- }
- out_begin[original_index(*it)] = rank;
- }
- break;
- }
+const RankQuantileOptions* GetDefaultQuantileRankOptions() {
+ static const auto kDefaultQuantileRankOptions =
RankQuantileOptions::Defaults();
+ return &kDefaultQuantileRankOptions;
+}
- case RankOptions::Max: {
- rank = length;
- for (auto it = sorted.overall_end() - 1; it >= sorted.overall_begin();
--it) {
- out_begin[original_index(*it)] = rank;
- // If the current index isn't marked as duplicate, then it's the last
- // tie in a row (since we iterate in reverse order), so update rank
- // for the next row of ties.
- if (!is_duplicate(*it)) {
- rank = it - sorted.overall_begin();
- }
- }
- break;
- }
+template <typename ArrowType>
+Result<NullPartitionResult> DoSortAndMarkDuplicate(
+ ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end, const
Array& input,
+ const std::shared_ptr<DataType>& physical_type, const SortOrder order,
+ const NullPlacement null_placement, bool needs_duplicates) {
+ using GetView = GetViewType<ArrowType>;
+ using ArrayType = typename TypeTraits<ArrowType>::ArrayType;
+
+ ARROW_ASSIGN_OR_RAISE(auto array_sorter, GetArraySorter(*physical_type));
+
+ ArrayType array(input.data());
+ ARROW_ASSIGN_OR_RAISE(auto sorted,
+ array_sorter(indices_begin, indices_end, array, 0,
+ ArraySortOptions(order, null_placement),
ctx));
+
+ if (needs_duplicates) {
+ auto value_selector = [&array](int64_t index) {
+ return GetView::LogicalValue(array.GetView(index));
+ };
+ MarkDuplicates(sorted, value_selector);
}
-
- return Datum(rankings);
+ return sorted;
}
-const RankOptions* GetDefaultRankOptions() {
- static const auto kDefaultRankOptions = RankOptions::Defaults();
- return &kDefaultRankOptions;
+template <typename ArrowType>
+Result<NullPartitionResult> DoSortAndMarkDuplicate(
+ ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end,
+ const ChunkedArray& input, const std::shared_ptr<DataType>& physical_type,
+ const SortOrder order, const NullPlacement null_placement, bool
needs_duplicates) {
+ auto physical_chunks = GetPhysicalChunks(input, physical_type);
+ if (physical_chunks.empty()) {
+ return NullPartitionResult{};
+ }
+ ARROW_ASSIGN_OR_RAISE(auto sorted,
+ SortChunkedArray(ctx, indices_begin, indices_end,
physical_type,
+ physical_chunks, order,
null_placement));
+ if (needs_duplicates) {
+ const auto arrays = GetArrayPointers(physical_chunks);
+ auto value_selector = [resolver =
ChunkedArrayResolver(span(arrays))](int64_t index) {
+ return resolver.Resolve(index).Value<ArrowType>();
+ };
+ MarkDuplicates(sorted, value_selector);
+ }
+ return sorted;
}
-template <typename InputType, typename RankerType>
-class RankerMixin : public TypeVisitor {
+template <typename InputType>
+class SortAndMarkDuplicate : public TypeVisitor {
public:
- RankerMixin(ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end,
- const InputType& input, const SortOrder order,
- const NullPlacement null_placement,
- const RankOptions::Tiebreaker tiebreaker, Datum* output)
+ SortAndMarkDuplicate(ExecContext* ctx, uint64_t* indices_begin, uint64_t*
indices_end,
+ const InputType& input, const SortOrder order,
+ const NullPlacement null_placement, const bool
needs_duplicate)
: TypeVisitor(),
ctx_(ctx),
indices_begin_(indices_begin),
@@ -145,104 +131,145 @@ class RankerMixin : public TypeVisitor {
input_(input),
order_(order),
null_placement_(null_placement),
- tiebreaker_(tiebreaker),
- physical_type_(GetPhysicalType(input.type())),
- output_(output) {}
+ needs_duplicates_(needs_duplicate),
+ physical_type_(GetPhysicalType(input.type())) {}
- Status Run() { return physical_type_->Accept(this); }
+ Result<NullPartitionResult> Run() {
+ RETURN_NOT_OK(physical_type_->Accept(this));
+ return sorted_;
+ }
-#define VISIT(TYPE) \
- Status Visit(const TYPE& type) { \
- return static_cast<RankerType*>(this)->template RankInternal<TYPE>(); \
+#define VISIT(TYPE)
\
+ Status Visit(const TYPE& type) {
\
+ ARROW_ASSIGN_OR_RAISE(
\
+ sorted_, DoSortAndMarkDuplicate<TYPE>(ctx_, indices_begin_,
indices_end_, \
+ input_, physical_type_, order_,
\
+ null_placement_,
needs_duplicates_)); \
+ return Status::OK();
\
}
VISIT_SORTABLE_PHYSICAL_TYPES(VISIT)
#undef VISIT
- protected:
+ private:
ExecContext* ctx_;
uint64_t* indices_begin_;
uint64_t* indices_end_;
const InputType& input_;
const SortOrder order_;
const NullPlacement null_placement_;
- const RankOptions::Tiebreaker tiebreaker_;
+ const bool needs_duplicates_;
const std::shared_ptr<DataType> physical_type_;
- Datum* output_;
+ NullPartitionResult sorted_{};
};
-template <typename T>
-class Ranker;
-
-template <>
-class Ranker<Array> : public RankerMixin<Array, Ranker<Array>> {
- public:
- using RankerMixin::RankerMixin;
-
- template <typename InType>
- Status RankInternal() {
- using GetView = GetViewType<InType>;
- using ArrayType = typename TypeTraits<InType>::ArrayType;
+// A helper class that emits rankings for the "rank_quantile" function
+struct QuantileRanker {
+ Result<Datum> CreateRankings(ExecContext* ctx, const NullPartitionResult&
sorted) {
+ const int64_t length = sorted.overall_end() - sorted.overall_begin();
+ ARROW_ASSIGN_OR_RAISE(auto rankings,
+ MakeMutableFloat64Array(length, ctx->memory_pool()));
+ auto out_begin = rankings->GetMutableValues<double>(1);
+
+ auto is_duplicate = [](uint64_t index) { return (index & kDuplicateMask)
!= 0; };
+ auto original_index = [](uint64_t index) { return index & ~kDuplicateMask;
};
+
+ // The count of values strictly less than the value being considered
+ int64_t cum_freq = 0;
+ auto it = sorted.overall_begin();
+
+ while (it < sorted.overall_end()) {
+ // Look for a run of duplicate values
+ DCHECK(!is_duplicate(*it));
+ auto run_end = it;
+ while (++run_end < sorted.overall_end() && is_duplicate(*run_end)) {
+ }
+ // The run length, i.e. the frequency of the current value
+ int64_t freq = run_end - it;
+ double quantile = (cum_freq + 0.5 * freq) / static_cast<double>(length);
+ // Output quantile rank values
+ for (; it < run_end; ++it) {
+ out_begin[original_index(*it)] = quantile;
+ }
+ cum_freq += freq;
+ }
+ DCHECK_EQ(cum_freq, length);
+ return Datum(rankings);
+ }
+};
- ARROW_ASSIGN_OR_RAISE(auto array_sorter, GetArraySorter(*physical_type_));
+// A helper class that emits rankings for the "rank" function
+struct OrdinalRanker {
+ explicit OrdinalRanker(RankOptions::Tiebreaker tiebreaker) :
tiebreaker_(tiebreaker) {}
- ArrayType array(input_.data());
- ARROW_ASSIGN_OR_RAISE(NullPartitionResult sorted,
- array_sorter(indices_begin_, indices_end_, array, 0,
- ArraySortOptions(order_,
null_placement_), ctx_));
+ Result<Datum> CreateRankings(ExecContext* ctx, const NullPartitionResult&
sorted) {
+ const int64_t length = sorted.overall_end() - sorted.overall_begin();
+ ARROW_ASSIGN_OR_RAISE(auto rankings,
+ MakeMutableUInt64Array(length, ctx->memory_pool()));
+ auto out_begin = rankings->GetMutableValues<uint64_t>(1);
+ uint64_t rank;
+
+ auto is_duplicate = [](uint64_t index) { return (index & kDuplicateMask)
!= 0; };
+ auto original_index = [](uint64_t index) { return index & ~kDuplicateMask;
};
+
+ switch (tiebreaker_) {
+ case RankOptions::Dense: {
+ rank = 0;
+ for (auto it = sorted.overall_begin(); it < sorted.overall_end();
++it) {
+ if (!is_duplicate(*it)) {
+ ++rank;
+ }
+ out_begin[original_index(*it)] = rank;
+ }
+ break;
+ }
- if (NeedsDuplicates(tiebreaker_)) {
- auto value_selector = [&array](int64_t index) {
- return GetView::LogicalValue(array.GetView(index));
- };
- MarkDuplicates(sorted, value_selector);
- }
- ARROW_ASSIGN_OR_RAISE(*output_,
- CreateRankings(ctx_, sorted, null_placement_,
tiebreaker_));
+ case RankOptions::First: {
+ rank = 0;
+ for (auto it = sorted.overall_begin(); it < sorted.overall_end();
it++) {
+ // No duplicate marks expected for RankOptions::First
+ DCHECK(!is_duplicate(*it));
+ out_begin[*it] = ++rank;
+ }
+ break;
+ }
- return Status::OK();
- }
-};
+ case RankOptions::Min: {
+ rank = 0;
+ for (auto it = sorted.overall_begin(); it < sorted.overall_end();
++it) {
+ if (!is_duplicate(*it)) {
+ rank = (it - sorted.overall_begin()) + 1;
+ }
+ out_begin[original_index(*it)] = rank;
+ }
+ break;
+ }
-template <>
-class Ranker<ChunkedArray> : public RankerMixin<ChunkedArray,
Ranker<ChunkedArray>> {
- public:
- template <typename... Args>
- explicit Ranker(Args&&... args)
- : RankerMixin(std::forward<Args>(args)...),
- physical_chunks_(GetPhysicalChunks(input_, physical_type_)) {}
-
- template <typename InType>
- Status RankInternal() {
- if (physical_chunks_.empty()) {
- return Status::OK();
+ case RankOptions::Max: {
+ rank = length;
+ for (auto it = sorted.overall_end() - 1; it >= sorted.overall_begin();
--it) {
+ out_begin[original_index(*it)] = rank;
+ // If the current index isn't marked as duplicate, then it's the last
+ // tie in a row (since we iterate in reverse order), so update rank
+ // for the next row of ties.
+ if (!is_duplicate(*it)) {
+ rank = it - sorted.overall_begin();
+ }
+ }
+ break;
+ }
}
- ARROW_ASSIGN_OR_RAISE(
- NullPartitionResult sorted,
- SortChunkedArray(ctx_, indices_begin_, indices_end_, physical_type_,
- physical_chunks_, order_, null_placement_));
-
- if (NeedsDuplicates(tiebreaker_)) {
- const auto arrays = GetArrayPointers(physical_chunks_);
- auto value_selector = [resolver =
- ChunkedArrayResolver(span(arrays))](int64_t
index) {
- return resolver.Resolve(index).Value<InType>();
- };
- MarkDuplicates(sorted, value_selector);
- }
- ARROW_ASSIGN_OR_RAISE(*output_,
- CreateRankings(ctx_, sorted, null_placement_,
tiebreaker_));
- return Status::OK();
+ return Datum(rankings);
}
private:
- const ArrayVector physical_chunks_;
+ const RankOptions::Tiebreaker tiebreaker_;
};
const FunctionDoc rank_doc(
- "Compute numerical ranks of an array (1-based)",
+ "Compute ordinal ranks of an array (1-based)",
("This function computes a rank of the input array.\n"
"By default, null values are considered greater than any other value
and\n"
"are therefore sorted at the end of the input. For floating-point
types,\n"
@@ -253,21 +280,34 @@ const FunctionDoc rank_doc(
"The handling of nulls, NaNs and tiebreakers can be changed in
RankOptions."),
{"input"}, "RankOptions");
-class RankMetaFunction : public MetaFunction {
+const FunctionDoc rank_quantile_doc(
+ "Compute quantile ranks of an array",
+ ("This function computes a quantile rank of the input array.\n"
+ "By default, null values are considered greater than any other value
and\n"
+ "are therefore sorted at the end of the input. For floating-point
types,\n"
+ "NaNs are considered greater than any other non-null value, but smaller\n"
+ "than null values.\n"
+ "The results are real values strictly between 0 and 1. They are\n"
+ "computed as in https://en.wikipedia.org/wiki/Quantile_rank\n"
+ "but without multiplying by 100.\n"
+ "\n"
+ "The handling of nulls and NaNs can be changed in RankQuantileOptions."),
+ {"input"}, "RankQuantileOptions");
+
+template <typename Derived>
+class RankMetaFunctionBase : public MetaFunction {
public:
- RankMetaFunction()
- : MetaFunction("rank", Arity::Unary(), rank_doc,
GetDefaultRankOptions()) {}
+ using MetaFunction::MetaFunction;
Result<Datum> ExecuteImpl(const std::vector<Datum>& args,
const FunctionOptions* options,
ExecContext* ctx) const override {
- const auto& rank_options = checked_cast<const RankOptions&>(*options);
switch (args[0].kind()) {
case Datum::ARRAY: {
- return Rank(*args[0].make_array(), rank_options, ctx);
+ return Rank(*args[0].make_array(), *options, ctx);
} break;
case Datum::CHUNKED_ARRAY: {
- return Rank(*args[0].chunked_array(), rank_options, ctx);
+ return Rank(*args[0].chunked_array(), *options, ctx);
} break;
default:
break;
@@ -278,10 +318,13 @@ class RankMetaFunction : public MetaFunction {
args[0].ToString());
}
- private:
+ protected:
template <typename T>
- static Result<Datum> Rank(const T& input, const RankOptions& options,
- ExecContext* ctx) {
+ Result<Datum> Rank(const T& input, const FunctionOptions& function_options,
+ ExecContext* ctx) const {
+ const auto& options =
+ checked_cast<const typename
Derived::FunctionOptionsType&>(function_options);
+
SortOrder order = SortOrder::Ascending;
if (!options.sort_keys.empty()) {
order = options.sort_keys[0].order;
@@ -293,19 +336,53 @@ class RankMetaFunction : public MetaFunction {
auto* indices_begin = indices->GetMutableValues<uint64_t>(1);
auto* indices_end = indices_begin + length;
std::iota(indices_begin, indices_end, 0);
+ auto needs_duplicates = Derived::NeedsDuplicates(options);
+ ARROW_ASSIGN_OR_RAISE(
+ auto sorted, SortAndMarkDuplicate(ctx, indices_begin, indices_end,
input, order,
+ options.null_placement,
needs_duplicates)
+ .Run());
+
+ auto ranker = Derived::GetRanker(options);
+ return ranker.CreateRankings(ctx, sorted);
+ }
+};
+
+class RankMetaFunction : public RankMetaFunctionBase<RankMetaFunction> {
+ public:
+ using FunctionOptionsType = RankOptions;
+ using RankerType = OrdinalRanker;
- Datum output;
- Ranker<T> ranker(ctx, indices_begin, indices_end, input, order,
- options.null_placement, options.tiebreaker, &output);
- ARROW_RETURN_NOT_OK(ranker.Run());
- return output;
+ static bool NeedsDuplicates(const RankOptions& options) {
+ return options.tiebreaker != RankOptions::First;
}
+
+ static RankerType GetRanker(const RankOptions& options) {
+ return RankerType(options.tiebreaker);
+ }
+
+ RankMetaFunction()
+ : RankMetaFunctionBase("rank", Arity::Unary(), rank_doc,
GetDefaultRankOptions()) {}
+};
+
+class RankQuantileMetaFunction : public
RankMetaFunctionBase<RankQuantileMetaFunction> {
+ public:
+ using FunctionOptionsType = RankQuantileOptions;
+ using RankerType = QuantileRanker;
+
+ static bool NeedsDuplicates(const RankQuantileOptions&) { return true; }
+
+ static RankerType GetRanker(const RankQuantileOptions& options) { return
RankerType(); }
+
+ RankQuantileMetaFunction()
+ : RankMetaFunctionBase("rank_quantile", Arity::Unary(),
rank_quantile_doc,
+ GetDefaultQuantileRankOptions()) {}
};
} // namespace
void RegisterVectorRank(FunctionRegistry* registry) {
DCHECK_OK(registry->AddFunction(std::make_shared<RankMetaFunction>()));
+
DCHECK_OK(registry->AddFunction(std::make_shared<RankQuantileMetaFunction>()));
}
} // namespace arrow::compute::internal
diff --git a/cpp/src/arrow/compute/kernels/vector_sort_internal.h
b/cpp/src/arrow/compute/kernels/vector_sort_internal.h
index cc6b7834a3..6288aa26ea 100644
--- a/cpp/src/arrow/compute/kernels/vector_sort_internal.h
+++ b/cpp/src/arrow/compute/kernels/vector_sort_internal.h
@@ -806,4 +806,11 @@ inline Result<std::shared_ptr<ArrayData>>
MakeMutableUInt64Array(
return ArrayData::Make(uint64(), length, {nullptr, std::move(data)},
/*null_count=*/0);
}
+inline Result<std::shared_ptr<ArrayData>> MakeMutableFloat64Array(
+ int64_t length, MemoryPool* memory_pool) {
+ auto buffer_size = length * sizeof(double);
+ ARROW_ASSIGN_OR_RAISE(auto data, AllocateBuffer(buffer_size, memory_pool));
+ return ArrayData::Make(float64(), length, {nullptr, std::move(data)},
/*null_count=*/0);
+}
+
} // namespace arrow::compute::internal
diff --git a/cpp/src/arrow/compute/kernels/vector_sort_test.cc
b/cpp/src/arrow/compute/kernels/vector_sort_test.cc
index 7f0ef641f6..dc1c055705 100644
--- a/cpp/src/arrow/compute/kernels/vector_sort_test.cc
+++ b/cpp/src/arrow/compute/kernels/vector_sort_test.cc
@@ -2205,9 +2205,9 @@ TEST_F(TestNestedSortIndices, SortRecordBatch) {
TestSort(GetRecordBatch()); }
TEST_F(TestNestedSortIndices, SortTable) { TestSort(GetTable()); }
// ----------------------------------------------------------------------
-// Tests for Rank
+// Tests for Rank and Quantile Rank
-class TestRank : public ::testing::Test {
+class BaseTestRank : public ::testing::Test {
protected:
// Create several test datums from `array`. One of which is the unmodified
Array
// while the rest are chunked variants based on it.
@@ -2236,6 +2236,11 @@ class TestRank : public ::testing::Test {
datums_ = {chunked_array};
}
+ DatumVector datums_;
+};
+
+class TestRank : public BaseTestRank {
+ protected:
static void AssertRank(const DatumVector& datums, SortOrder order,
NullPlacement null_placement, RankOptions::Tiebreaker
tiebreaker,
const std::shared_ptr<Array>& expected) {
@@ -2310,8 +2315,6 @@ class TestRank : public ::testing::Test {
AssertRank(SortOrder::Descending, NullPlacement::AtStart,
RankOptions::Dense,
ArrayFromJSON(uint64(), "[3, 4, 2, 1, 2, 1, 4]"));
}
-
- DatumVector datums_;
};
TEST_F(TestRank, Real) {
@@ -2466,5 +2469,140 @@ TEST_F(TestRank, EmptyChunks) {
}
}
+class TestRankQuantile : public BaseTestRank {
+ public:
+ void AssertRankQuantile(const DatumVector& datums, SortOrder order,
+ NullPlacement null_placement,
+ const std::shared_ptr<Array>& expected) {
+ const std::vector<SortKey> sort_keys{SortKey("foo", order)};
+ RankQuantileOptions options(sort_keys, null_placement);
+ ARROW_SCOPED_TRACE("options = ", options.ToString());
+ for (const auto& datum : datums) {
+ ASSERT_OK_AND_ASSIGN(auto actual, CallFunction("rank_quantile", {datum},
&options));
+ ValidateOutput(actual);
+ AssertDatumsEqual(expected, actual, /*verbose=*/true);
+ }
+ }
+
+ void AssertRankQuantile(const DatumVector& datums, SortOrder order,
+ NullPlacement null_placement, const std::string&
expected) {
+ AssertRankQuantile(datums, order, null_placement, ArrayFromJSON(float64(),
expected));
+ }
+
+ void AssertRankQuantile(SortOrder order, NullPlacement null_placement,
+ const std::shared_ptr<Array>& expected) {
+ AssertRankQuantile(datums_, order, null_placement, expected);
+ }
+
+ void AssertRankQuantile(SortOrder order, NullPlacement null_placement,
+ const std::string& expected) {
+ AssertRankQuantile(datums_, order, null_placement,
+ ArrayFromJSON(float64(), expected));
+ }
+
+ void AssertRankQuantileEmpty(std::shared_ptr<DataType> type) {
+ for (auto null_placement : AllNullPlacements()) {
+ for (auto order : AllOrders()) {
+ AssertRankQuantile({ArrayFromJSON(type, "[]")}, order, null_placement,
"[]");
+ AssertRankQuantile({ArrayFromJSON(type, "[null]")}, order,
null_placement,
+ "[0.5]");
+ AssertRankQuantile({ArrayFromJSON(type, "[null, null, null]")}, order,
+ null_placement, "[0.5, 0.5, 0.5]");
+ }
+ }
+ }
+
+ // Expecting an input ordered like [1, 2, 1, 2, 1]
+ void AssertRankQuantile_12121() {
+ for (auto null_placement : AllNullPlacements()) {
+ AssertRankQuantile(SortOrder::Ascending, null_placement,
+ "[0.3, 0.8, 0.3, 0.8, 0.3]");
+ AssertRankQuantile(SortOrder::Descending, null_placement,
+ "[0.7, 0.2, 0.7, 0.2, 0.7]");
+ }
+ }
+
+ // Expecting an input ordered like [null, 1, null, 2, null]
+ void AssertRankQuantile_N1N2N() {
+ AssertRankQuantile(SortOrder::Ascending, NullPlacement::AtStart,
+ "[0.3, 0.7, 0.3, 0.9, 0.3]");
+ AssertRankQuantile(SortOrder::Ascending, NullPlacement::AtEnd,
+ "[0.7, 0.1, 0.7, 0.3, 0.7]");
+ AssertRankQuantile(SortOrder::Descending, NullPlacement::AtStart,
+ "[0.3, 0.9, 0.3, 0.7, 0.3]");
+ AssertRankQuantile(SortOrder::Descending, NullPlacement::AtEnd,
+ "[0.7, 0.3, 0.7, 0.1, 0.7]");
+ }
+
+ void AssertRankQuantileNumeric(std::shared_ptr<DataType> type) {
+ ARROW_SCOPED_TRACE("type = ", type->ToString());
+ AssertRankQuantileEmpty(type);
+
+ // Reproduce the example from https://en.wikipedia.org/wiki/Percentile_rank
+ SetInput(ArrayFromJSON(type, "[7, 5, 5, 4, 4, 3, 3, 3, 2, 1]"));
+ for (auto null_placement : AllNullPlacements()) {
+ AssertRankQuantile(SortOrder::Ascending, null_placement,
+ "[0.95, 0.8, 0.8, 0.6, 0.6, 0.35, 0.35, 0.35, 0.15,
0.05]");
+ AssertRankQuantile(SortOrder::Descending, null_placement,
+ "[0.05, 0.2, 0.2, 0.4, 0.4, 0.65, 0.65, 0.65, 0.85,
0.95]");
+ }
+
+ // With nulls
+ SetInput(ArrayFromJSON(type, "[null, 1, null, 2, null]"));
+ AssertRankQuantile_N1N2N();
+ }
+
+ void AssertRankQuantileBinaryLike(std::shared_ptr<DataType> type) {
+ ARROW_SCOPED_TRACE("type = ", type->ToString());
+ AssertRankQuantileEmpty(type);
+
+ SetInput(ArrayFromJSON(type, R"(["", "ab", "", "ab", ""])"));
+ AssertRankQuantile_12121();
+ // With nulls
+ SetInput(ArrayFromJSON(type, R"([null, "", null, "ab", null])"));
+ AssertRankQuantile_N1N2N();
+ }
+};
+
+TEST_F(TestRankQuantile, Real) {
+ for (auto type : ::arrow::FloatingPointTypes()) {
+ AssertRankQuantileNumeric(type);
+ }
+}
+
+TEST_F(TestRankQuantile, Integral) {
+ for (auto type : ::arrow::IntTypes()) {
+ AssertRankQuantileNumeric(type);
+ }
+}
+
+TEST_F(TestRankQuantile, Boolean) {
+ auto type = boolean();
+ AssertRankQuantileEmpty(type);
+
+ SetInput(ArrayFromJSON(type, "[false, true, false, true, false]"));
+ AssertRankQuantile_12121();
+ // With nulls
+ SetInput(ArrayFromJSON(type, "[null, false, null, true, null]"));
+ AssertRankQuantile_N1N2N();
+}
+
+TEST_F(TestRankQuantile, BinaryLike) {
+ for (auto type : BaseBinaryTypes()) {
+ AssertRankQuantileBinaryLike(type);
+ }
+}
+
+TEST_F(TestRankQuantile, FixedSizeBinary) {
+ auto type = fixed_size_binary(3);
+ AssertRankQuantileEmpty(type);
+
+ SetInput(ArrayFromJSON(type, R"(["abc", "def", "abc", "def", "abc"])"));
+ AssertRankQuantile_12121();
+ // With nulls
+ SetInput(ArrayFromJSON(type, R"([null, "abc", null, "def", null])"));
+ AssertRankQuantile_N1N2N();
+}
+
} // namespace compute
} // namespace arrow
diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst
index 92f3e44039..6acc9e31a5 100644
--- a/docs/source/cpp/compute.rst
+++ b/docs/source/cpp/compute.rst
@@ -1796,19 +1796,21 @@ in the respective option classes.
Binary- and String-like inputs are ordered lexicographically as bytestrings,
even for String types.
-+-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+
-| Function name | Arity | Input types
| Output type | Options class | Notes
|
-+=======================+============+=========================================================+===================+================================+================+
-| array_sort_indices | Unary | Boolean, Numeric, Temporal, Binary- and
String-like | UInt64 | :struct:`ArraySortOptions` | \(1)
\(2) |
-+-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+
-| partition_nth_indices | Unary | Boolean, Numeric, Temporal, Binary- and
String-like | UInt64 | :struct:`PartitionNthOptions` | \(3)
|
-+-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+
-| rank | Unary | Boolean, Numeric, Temporal, Binary- and
String-like | UInt64 | :struct:`RankOptions` | \(4)
|
-+-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+
-| select_k_unstable | Unary | Boolean, Numeric, Temporal, Binary- and
String-like | UInt64 | :struct:`SelectKOptions` | \(5)
\(6) |
-+-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+
-| sort_indices | Unary | Boolean, Numeric, Temporal, Binary- and
String-like | UInt64 | :struct:`SortOptions` | \(1)
\(5) |
-+-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+
++-----------------------+------------+---------------------------------------------------------+-------------------+-------------------------------+----------------+
+| Function name | Arity | Input types
| Output type | Options class | Notes
|
++=======================+============+=========================================================+===================+===============================+================+
+| array_sort_indices | Unary | Boolean, Numeric, Temporal, Binary- and
String-like | UInt64 | :struct:`ArraySortOptions` | \(1) \(2)
|
++-----------------------+------------+---------------------------------------------------------+-------------------+-------------------------------+----------------+
+| partition_nth_indices | Unary | Boolean, Numeric, Temporal, Binary- and
String-like | UInt64 | :struct:`PartitionNthOptions` | \(3)
|
++-----------------------+------------+---------------------------------------------------------+-------------------+-------------------------------+----------------+
+| rank | Unary | Boolean, Numeric, Temporal, Binary- and
String-like | UInt64 | :struct:`RankOptions` | \(4)
|
++-----------------------+------------+---------------------------------------------------------+-------------------+-------------------------------+----------------+
+| rank_quantile | Unary | Boolean, Numeric, Temporal, Binary- and
String-like | Float64 | :struct:`RankQuantileOptions` | \(5)
|
++-----------------------+------------+---------------------------------------------------------+-------------------+-------------------------------+----------------+
+| select_k_unstable | Unary | Boolean, Numeric, Temporal, Binary- and
String-like | UInt64 | :struct:`SelectKOptions` | \(6) \(7)
|
++-----------------------+------------+---------------------------------------------------------+-------------------+-------------------------------+----------------+
+| sort_indices | Unary | Boolean, Numeric, Temporal, Binary- and
String-like | UInt64 | :struct:`SortOptions` | \(1) \(6)
|
++-----------------------+------------+---------------------------------------------------------+-------------------+-------------------------------+----------------+
* \(1) The output is an array of indices into the input, that define a
@@ -1823,13 +1825,15 @@ in the respective option classes.
:func:`std::nth_element`). *N* is given in
:member:`PartitionNthOptions::pivot`.
-* \(4) The output is a one-based numerical array of ranks
+* \(4) The output is a one-based numerical array of ranks.
-* \(5) The input can be an array, chunked array, record batch or
+* \(5) The output is an array of quantiles strictly between 0 and 1.
+
+* \(6) The input can be an array, chunked array, record batch or
table. If the input is a record batch or table, one or more sort
keys must be specified.
-* \(6) The output is an array of indices into the input, that define a
+* \(7) The output is an array of indices into the input, that define a
non-stable sort of the input.
.. _cpp-compute-vector-structural-transforms: