WillAyd commented on code in PR #45217:
URL: https://github.com/apache/arrow/pull/45217#discussion_r1909410555


##########
cpp/src/arrow/compute/kernels/vector_rank.cc:
##########
@@ -28,114 +28,97 @@ namespace {
 // ----------------------------------------------------------------------
 // Rank implementation
 
-template <typename ValueSelector,
-          typename T = std::decay_t<std::invoke_result_t<ValueSelector, 
int64_t>>>
+// A bit that is set in the sort indices when the value at the current sort 
index
+// is the same as the value at the previous sort index.
+constexpr uint64_t kDuplicateMask = 1ULL << 63;
+
+constexpr bool NeedsDuplicates(RankOptions::Tiebreaker tiebreaker) {
+  return tiebreaker != RankOptions::First;
+}
+
+template <typename ValueSelector>
+void MarkDuplicates(const NullPartitionResult& sorted, ValueSelector&& 
value_selector) {
+  using T = decltype(value_selector(int64_t{}));
+
+  // Process non-nulls
+  if (sorted.non_nulls_end != sorted.non_nulls_begin) {
+    auto it = sorted.non_nulls_begin;
+    T prev_value = value_selector(*it);
+    while (++it < sorted.non_nulls_end) {
+      T curr_value = value_selector(*it);
+      if (curr_value == prev_value) {
+        *it |= kDuplicateMask;

Review Comment:
   Is this leading bit never used by `it` in the current implementation, or are 
we just assuming that to be highly unlikely?



##########
cpp/src/arrow/compute/kernels/vector_sort_internal.h:
##########
@@ -238,33 +198,28 @@ NullPartitionResult PartitionNullsOnly(uint64_t* 
indices_begin, uint64_t* indice
 //
 // `offset` is used when this is called on a chunk of a chunked array
 template <typename ArrayType, typename Partitioner>
-enable_if_t<!has_null_like_values<typename ArrayType::TypeClass>::value,
-            NullPartitionResult>
-PartitionNullLikes(uint64_t* indices_begin, uint64_t* indices_end,
-                   const ArrayType& values, int64_t offset,
-                   NullPlacement null_placement) {
-  return NullPartitionResult::NoNulls(indices_begin, indices_end, 
null_placement);
-}
-
-template <typename ArrayType, typename Partitioner>
-enable_if_t<has_null_like_values<typename ArrayType::TypeClass>::value,
-            NullPartitionResult>
-PartitionNullLikes(uint64_t* indices_begin, uint64_t* indices_end,
-                   const ArrayType& values, int64_t offset,
-                   NullPlacement null_placement) {
-  Partitioner partitioner;
-  if (null_placement == NullPlacement::AtStart) {
-    auto null_likes_end =
-        partitioner(indices_begin, indices_end, [&values, &offset](uint64_t 
ind) {
-          return std::isnan(values.GetView(ind - offset));
-        });
-    return NullPartitionResult::NullsAtStart(indices_begin, indices_end, 
null_likes_end);
+NullPartitionResult PartitionNullLikes(uint64_t* indices_begin, uint64_t* 
indices_end,
+                                       const ArrayType& values, int64_t offset,
+                                       NullPlacement null_placement) {
+  if constexpr (has_null_like_values<typename ArrayType::TypeClass>()) {
+    Partitioner partitioner;
+    if (null_placement == NullPlacement::AtStart) {
+      auto null_likes_end =

Review Comment:
   So the float types are explicitly looking at nan as a value instead of the 
mask right? That's something you are looking to align in a follow up? Or its 
still an open question as to how to handle?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to