pitrou commented on code in PR #33846:
URL: https://github.com/apache/arrow/pull/33846#discussion_r1087859308
##########
cpp/src/arrow/compute/kernels/vector_sort_test.cc:
##########
@@ -2182,5 +2185,146 @@ TEST(TestRankForFixedSizeBinary, RankFixedSizeBinary) {
ArrayFromJSON(binary_type, R"(["aaa", " ", "eee", null, "eee", null, "
"])"));
}
+TEST(TestRankForReal, RankRealChunked) {
Review Comment:
Instead of essentially duplicating existing tests, perhaps we can generate
`ChunkedArray` checks from the `Array` tests, or vice-versa?
##########
cpp/src/arrow/compute/kernels/vector_rank.cc:
##########
@@ -77,121 +191,90 @@ class ArrayRanker : public TypeVisitor {
NullPartitionResult sorted =
array_sorter(sort_begin, sort_end, arr, 0, array_options);
- uint64_t rank;
-
- ARROW_ASSIGN_OR_RAISE(auto rankings,
- MakeMutableUInt64Array(length, ctx_->memory_pool()));
- auto out_begin = rankings->GetMutableValues<uint64_t>(1);
-
- switch (tiebreaker_) {
- case RankOptions::Dense: {
- T curr_value, prev_value{};
- rank = 0;
-
- if (null_placement_ == NullPlacement::AtStart && sorted.null_count() >
0) {
- rank++;
- for (auto it = sorted.nulls_begin; it < sorted.nulls_end; it++) {
- out_begin[*it] = rank;
- }
- }
-
- for (auto it = sorted.non_nulls_begin; it < sorted.non_nulls_end;
it++) {
- curr_value = GetView::LogicalValue(arr.GetView(*it));
- if (it == sorted.non_nulls_begin || curr_value != prev_value) {
- rank++;
- }
-
- out_begin[*it] = rank;
- prev_value = curr_value;
- }
-
- if (null_placement_ == NullPlacement::AtEnd) {
- rank++;
- for (auto it = sorted.nulls_begin; it < sorted.nulls_end; it++) {
- out_begin[*it] = rank;
- }
- }
- break;
- }
-
- case RankOptions::First: {
- rank = 0;
- for (auto it = sorted.overall_begin(); it < sorted.overall_end();
it++) {
- out_begin[*it] = ++rank;
- }
- break;
- }
- case RankOptions::Min: {
- T curr_value, prev_value{};
- rank = 0;
+ auto value_selector = [&arr](int64_t index) {
+ return GetView::LogicalValue(arr.GetView(index));
+ };
+ auto rankings =
+ CreateRankings<T>(ctx_, sorted, null_placement_, tiebreaker_,
value_selector);
+ ARROW_ASSIGN_OR_RAISE(auto output, rankings);
+ *output_ = output;
+ return Status::OK();
+ }
- if (null_placement_ == NullPlacement::AtStart) {
- rank++;
- for (auto it = sorted.nulls_begin; it < sorted.nulls_end; it++) {
- out_begin[*it] = rank;
- }
- }
+ ExecContext* ctx_;
+ const Array& array_;
+ const RankOptions& options_;
+ const NullPlacement null_placement_;
+ const RankOptions::Tiebreaker tiebreaker_;
+ const std::shared_ptr<DataType> physical_type_;
+ Datum* output_;
+};
- for (auto it = sorted.non_nulls_begin; it < sorted.non_nulls_end;
it++) {
- curr_value = GetView::LogicalValue(arr.GetView(*it));
- if (it == sorted.non_nulls_begin || curr_value != prev_value) {
- rank = (it - sorted.overall_begin()) + 1;
- }
- out_begin[*it] = rank;
- prev_value = curr_value;
- }
+class ChunkedArrayRanker : public TypeVisitor {
+ public:
+ // TODO: here we accept order / null_placement / tiebreaker as separate
arguments
+ // whereas the ArrayRanker accepts them as the RankOptions struct; this is
consistent
+ // with ArraySorter / ChunkedArraySorter, so likely should refactor
ArrayRanker
Review Comment:
If the refactor is small enough, why not do it in this PR?
##########
cpp/src/arrow/compute/kernels/vector_rank.cc:
##########
@@ -237,6 +323,32 @@ class RankMetaFunction : public MetaFunction {
ARROW_RETURN_NOT_OK(ranker.Run());
return output;
}
+
+ Result<Datum> Rank(const ChunkedArray& chunked_array, const RankOptions&
options,
+ ExecContext* ctx) const {
+ SortOrder order = SortOrder::Ascending;
+ if (!options.sort_keys.empty()) {
+ order = options.sort_keys[0].order;
+ }
+
+ auto sort_type = uint64();
+ auto length = chunked_array.length();
+ auto buffer_size = bit_util::BytesForBits(
+ length * std::static_pointer_cast<UInt64Type>(sort_type)->bit_width());
+ std::vector<std::shared_ptr<Buffer>> buffers(2);
+ ARROW_ASSIGN_OR_RAISE(buffers[1],
+ AllocateResizableBuffer(buffer_size,
ctx->memory_pool()));
+ auto out = std::make_shared<ArrayData>(sort_type, length, buffers, 0);
+ auto sort_begin = out->GetMutableValues<uint64_t>(1);
+ auto sort_end = sort_begin + length;
+ std::iota(sort_begin, sort_end, 0);
Review Comment:
Most of this should probably be shared with the `Array` code path?
##########
cpp/src/arrow/compute/kernels/vector_rank.cc:
##########
@@ -77,121 +191,90 @@ class ArrayRanker : public TypeVisitor {
NullPartitionResult sorted =
array_sorter(sort_begin, sort_end, arr, 0, array_options);
- uint64_t rank;
-
- ARROW_ASSIGN_OR_RAISE(auto rankings,
- MakeMutableUInt64Array(length, ctx_->memory_pool()));
- auto out_begin = rankings->GetMutableValues<uint64_t>(1);
-
- switch (tiebreaker_) {
- case RankOptions::Dense: {
- T curr_value, prev_value{};
- rank = 0;
-
- if (null_placement_ == NullPlacement::AtStart && sorted.null_count() >
0) {
- rank++;
- for (auto it = sorted.nulls_begin; it < sorted.nulls_end; it++) {
- out_begin[*it] = rank;
- }
- }
-
- for (auto it = sorted.non_nulls_begin; it < sorted.non_nulls_end;
it++) {
- curr_value = GetView::LogicalValue(arr.GetView(*it));
- if (it == sorted.non_nulls_begin || curr_value != prev_value) {
- rank++;
- }
-
- out_begin[*it] = rank;
- prev_value = curr_value;
- }
-
- if (null_placement_ == NullPlacement::AtEnd) {
- rank++;
- for (auto it = sorted.nulls_begin; it < sorted.nulls_end; it++) {
- out_begin[*it] = rank;
- }
- }
- break;
- }
-
- case RankOptions::First: {
- rank = 0;
- for (auto it = sorted.overall_begin(); it < sorted.overall_end();
it++) {
- out_begin[*it] = ++rank;
- }
- break;
- }
- case RankOptions::Min: {
- T curr_value, prev_value{};
- rank = 0;
+ auto value_selector = [&arr](int64_t index) {
+ return GetView::LogicalValue(arr.GetView(index));
+ };
+ auto rankings =
+ CreateRankings<T>(ctx_, sorted, null_placement_, tiebreaker_,
value_selector);
+ ARROW_ASSIGN_OR_RAISE(auto output, rankings);
Review Comment:
Also, this can perhaps even be `ARROW_ASSIGN_OR_RAISE(*output_, ....`
##########
cpp/src/arrow/compute/kernels/vector_sort_test.cc:
##########
@@ -2182,5 +2185,146 @@ TEST(TestRankForFixedSizeBinary, RankFixedSizeBinary) {
ArrayFromJSON(binary_type, R"(["aaa", " ", "eee", null, "eee", null, "
"])"));
}
+TEST(TestRankForReal, RankRealChunked) {
+ for (auto real_type : ::arrow::FloatingPointTypes()) {
+ for (auto null_placement : AllNullPlacements()) {
+ for (auto tiebreaker : AllTiebreakers()) {
+ for (auto order : AllOrders()) {
+ AssertRankEmpty(real_type, order, null_placement, tiebreaker);
+ }
+
+ AssertRankSimple(
+ ChunkedArrayFromJSON(real_type, {"[2.1, 3.2]", "[1.0, 0.0, 5.5]"}),
+ null_placement, tiebreaker);
Review Comment:
Regardless of what strategy is chosen, it would be nice to exercise empty
chunks as well.
##########
cpp/src/arrow/compute/kernels/vector_rank.cc:
##########
@@ -77,121 +191,90 @@ class ArrayRanker : public TypeVisitor {
NullPartitionResult sorted =
array_sorter(sort_begin, sort_end, arr, 0, array_options);
- uint64_t rank;
-
- ARROW_ASSIGN_OR_RAISE(auto rankings,
- MakeMutableUInt64Array(length, ctx_->memory_pool()));
- auto out_begin = rankings->GetMutableValues<uint64_t>(1);
-
- switch (tiebreaker_) {
- case RankOptions::Dense: {
- T curr_value, prev_value{};
- rank = 0;
-
- if (null_placement_ == NullPlacement::AtStart && sorted.null_count() >
0) {
- rank++;
- for (auto it = sorted.nulls_begin; it < sorted.nulls_end; it++) {
- out_begin[*it] = rank;
- }
- }
-
- for (auto it = sorted.non_nulls_begin; it < sorted.non_nulls_end;
it++) {
- curr_value = GetView::LogicalValue(arr.GetView(*it));
- if (it == sorted.non_nulls_begin || curr_value != prev_value) {
- rank++;
- }
-
- out_begin[*it] = rank;
- prev_value = curr_value;
- }
-
- if (null_placement_ == NullPlacement::AtEnd) {
- rank++;
- for (auto it = sorted.nulls_begin; it < sorted.nulls_end; it++) {
- out_begin[*it] = rank;
- }
- }
- break;
- }
-
- case RankOptions::First: {
- rank = 0;
- for (auto it = sorted.overall_begin(); it < sorted.overall_end();
it++) {
- out_begin[*it] = ++rank;
- }
- break;
- }
- case RankOptions::Min: {
- T curr_value, prev_value{};
- rank = 0;
+ auto value_selector = [&arr](int64_t index) {
+ return GetView::LogicalValue(arr.GetView(index));
+ };
+ auto rankings =
+ CreateRankings<T>(ctx_, sorted, null_placement_, tiebreaker_,
value_selector);
+ ARROW_ASSIGN_OR_RAISE(auto output, rankings);
Review Comment:
I'm curious, why didn't you collapse these two lines in one? The `rankings`
temporary doesn't seem useful...
##########
cpp/src/arrow/compute/kernels/vector_rank.cc:
##########
@@ -25,6 +25,120 @@ namespace {
// ----------------------------------------------------------------------
// Rank implementation
+template <typename T>
+Result<Datum> CreateRankings(ExecContext* ctx, const NullPartitionResult&
sorted,
+ const NullPlacement null_placement,
+ const RankOptions::Tiebreaker tiebreaker,
+ std::function<T(int64_t)> value_selector) {
Review Comment:
Ideally, the `std::function` overhead would be removed by the compiler after
inlining, but perhaps it's better to avoid it upfront (including in debug
builds) by passing this as a template parameter instead?
```c++
template <typename ValueSelector, typename T =
std::invoke_result_t<ValueSelector, int64_t>>
Result<Datum> CreateRankings(ExecContext* ctx, const NullPartitionResult&
sorted,
const NullPlacement null_placement,
const RankOptions::Tiebreaker tiebreaker,
ValueSelector&& value_selector) {
```
##########
cpp/src/arrow/compute/kernels/vector_rank.cc:
##########
@@ -77,121 +191,90 @@ class ArrayRanker : public TypeVisitor {
NullPartitionResult sorted =
array_sorter(sort_begin, sort_end, arr, 0, array_options);
- uint64_t rank;
-
- ARROW_ASSIGN_OR_RAISE(auto rankings,
- MakeMutableUInt64Array(length, ctx_->memory_pool()));
- auto out_begin = rankings->GetMutableValues<uint64_t>(1);
-
- switch (tiebreaker_) {
- case RankOptions::Dense: {
- T curr_value, prev_value{};
- rank = 0;
-
- if (null_placement_ == NullPlacement::AtStart && sorted.null_count() >
0) {
- rank++;
- for (auto it = sorted.nulls_begin; it < sorted.nulls_end; it++) {
- out_begin[*it] = rank;
- }
- }
-
- for (auto it = sorted.non_nulls_begin; it < sorted.non_nulls_end;
it++) {
- curr_value = GetView::LogicalValue(arr.GetView(*it));
- if (it == sorted.non_nulls_begin || curr_value != prev_value) {
- rank++;
- }
-
- out_begin[*it] = rank;
- prev_value = curr_value;
- }
-
- if (null_placement_ == NullPlacement::AtEnd) {
- rank++;
- for (auto it = sorted.nulls_begin; it < sorted.nulls_end; it++) {
- out_begin[*it] = rank;
- }
- }
- break;
- }
-
- case RankOptions::First: {
- rank = 0;
- for (auto it = sorted.overall_begin(); it < sorted.overall_end();
it++) {
- out_begin[*it] = ++rank;
- }
- break;
- }
- case RankOptions::Min: {
- T curr_value, prev_value{};
- rank = 0;
+ auto value_selector = [&arr](int64_t index) {
+ return GetView::LogicalValue(arr.GetView(index));
+ };
+ auto rankings =
+ CreateRankings<T>(ctx_, sorted, null_placement_, tiebreaker_,
value_selector);
+ ARROW_ASSIGN_OR_RAISE(auto output, rankings);
Review Comment:
(you might need to parenthesize the `CreateRankings` call because of syntax
limitations, though)
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]