pitrou commented on code in PR #46926:
URL: https://github.com/apache/arrow/pull/46926#discussion_r3413592302


##########
python/pyarrow/_compute.pyx:
##########
@@ -2499,17 +2527,26 @@ class RankOptions(_RankOptions):
                    number of distinct values in the input.
     """
 
-    def __init__(self, sort_keys="ascending", *, null_placement="at_end", 
tiebreaker="first"):
+    def __init__(self, sort_keys="ascending", *, null_placement=None, 
tiebreaker="first"):
         self._set_options(sort_keys, null_placement, tiebreaker)

Review Comment:
   I don't think so (except fixing the advertised deprecation version, sorry!)



##########
cpp/src/arrow/compute/kernels/vector_select_k.cc:
##########
@@ -315,73 +473,124 @@ class RecordBatchSelector : public TypeVisitor {
         *status = maybe_array.status();
         return {};
       }
-      resolved.emplace_back(*std::move(maybe_array), key.order);
+      resolved.emplace_back(*std::move(maybe_array), key.order, 
key.null_placement);
     }
     return resolved;
   }
 
-  template <typename InType, SortOrder sort_order>
-  Status SelectKthInternal() {
-    using GetView = GetViewType<InType>;
-    using ArrayType = typename TypeTraits<InType>::ArrayType;
-    auto& comparator = comparator_;
-    const auto& first_sort_key = sort_keys_[0];
-    const auto& arr = checked_cast<const ArrayType&>(first_sort_key.array);
+  class SelectKForKey : public TypeVisitor {
+   public:
+    SelectKForKey(RecordBatchSelector* selector, size_t start_sort_key_index,
+                  std::span<uint64_t> input_indices, std::span<uint64_t> 
output_indices)
+        : TypeVisitor(),
+          selector_(selector),
+          start_sort_key_index_(start_sort_key_index),
+          input_indices_(input_indices),
+          output_indices_(output_indices) {}
+
+   private:
+    template <typename InType>
+    Status Do() {
+      using ArrayType = typename TypeTraits<InType>::ArrayType;
+      const auto& first_remaining_sort_key = 
selector_->sort_keys_[start_sort_key_index_];
+      const auto& arr = checked_cast<const 
ArrayType&>(first_remaining_sort_key.array);
+
+      uint64_t* input_indices_begin = input_indices_.data();
+      uint64_t* input_indices_end = input_indices_.data() + 
input_indices_.size();
+
+      const auto p = PartitionNullsAndNans<ArrayType, NonStablePartitioner>(
+          input_indices_begin, input_indices_end, arr, 0,
+          first_remaining_sort_key.null_placement);
+
+      // From k = output_range.size(), calculate

Review Comment:
   Nit
   ```suggestion
         // From k = output_indices.size(), calculate
   ```



##########
cpp/src/arrow/compute/kernels/vector_select_k.cc:
##########
@@ -205,19 +339,38 @@ class ChunkedArraySelector : public TypeVisitor {
     if (k_ > chunked_array_.length()) {
       k_ = chunked_array_.length();
     }
+
+    ARROW_ASSIGN_OR_RAISE(auto take_indices,
+                          MakeMutableUInt64Array(k_, ctx_->memory_pool()));
+    auto* output_begin = take_indices->GetMutableValues<uint64_t>(1);
+
+    int64_t null_count = chunked_array_.null_count();
+    int64_t nan_count = ComputeNanCount<InType>();
+    int64_t non_null_like_count = chunked_array_.length() - null_count - 
nan_count;
+

Review Comment:
   > The temporary storage of all `indices` for the individual chunks could 
still be a bit better. Maybe I should introduce a struct that holds all three 
of `array`, `indices` and `partitions`?
   
   I don't think we care in this PR. Otherwise we have 
`CompressedChunkLocation` that can make things more compact.
   
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to