Jefffrey commented on code in PR #9991:
URL: https://github.com/apache/arrow-rs/pull/9991#discussion_r3486986883


##########
arrow-ord/src/sort.rs:
##########
@@ -949,40 +949,55 @@ pub fn lexsort_to_indices(
         ));
     };
 
-    let mut value_indices = (0..row_count).collect::<Vec<usize>>();
-    let mut len = value_indices.len();
+    let len = limit.unwrap_or(row_count).min(row_count);
 
-    if let Some(limit) = limit {
-        len = limit.min(len);
+    if len == 0 {
+        return Ok(UInt32Array::from(Vec::<u32>::new()));
     }
 
-    // Instantiate specialized versions of comparisons for small numbers
-    // of columns as it helps the compiler generate better code.
-    match columns.len() {
-        2 => {
-            sort_fixed_column::<2>(columns, &mut value_indices, len)?;
-        }
-        3 => {
-            sort_fixed_column::<3>(columns, &mut value_indices, len)?;
-        }
-        4 => {
-            sort_fixed_column::<4>(columns, &mut value_indices, len)?;
-        }
-        5 => {
-            sort_fixed_column::<5>(columns, &mut value_indices, len)?;
+    // The heap path avoids allocating and partially sorting all row indices
+    // when the requested limit is a small fraction of the input. For larger
+    // limits, the existing partial-sort path is preferred because heap
+    // maintenance costs grow with the requested limit.
+    let value_indices = if limit.is_some() && len <= row_count / 10 {

Review Comment:
   ```suggestion
       let value_indices = if let Some(limit) = limit
           && limit <= row_count / 10
       {
   ```
   
   i found it a little confusing since `len` could technically be either 
`row_count` or `limit` from above; perhaps like this its clearer



##########
arrow-ord/src/sort.rs:
##########
@@ -1000,6 +1015,100 @@ fn sort_fixed_column<const N: usize>(
     Ok(())
 }
 
+// Uses the fixed-column comparator for the bounded heap path.
+fn lexsort_topk_fixed<const N: usize>(
+    columns: &[SortColumn],
+    row_count: usize,
+    limit: usize,
+) -> Result<Vec<usize>, ArrowError> {
+    let lexicographical_comparator = 
FixedLexicographicalComparator::<N>::try_new(columns)?;
+    Ok(lexsort_topk(row_count, limit, |a, b| {
+        lexicographical_comparator.compare(a, b)
+    }))
+}
+
+// Keeps the smallest `limit` indices in a bounded max-heap.
+// The root is the largest retained index according to `compare`.
+fn lexsort_topk(
+    row_count: usize,
+    limit: usize,
+    mut compare: impl FnMut(usize, usize) -> Ordering,
+) -> Vec<usize> {
+    if limit >= row_count {
+        let mut value_indices = (0..row_count).collect::<Vec<_>>();
+        value_indices.sort_unstable_by(|a, b| compare(*a, *b).then_with(|| 
a.cmp(b)));
+        return value_indices;
+    }
+
+    // Break ties by row index so equal sort keys are selected consistently
+    // during heap updates and final sorting.
+    let mut compare_total = |a: usize, b: usize| compare(a, b).then_with(|| 
a.cmp(&b));

Review Comment:
   could you expand on this need for a stable sort here? is there an edge case 
regarding heap updates if we dont have this stable fix?



##########
arrow-ord/src/sort.rs:
##########
@@ -1000,6 +1015,100 @@ fn sort_fixed_column<const N: usize>(
     Ok(())
 }
 
+// Uses the fixed-column comparator for the bounded heap path.
+fn lexsort_topk_fixed<const N: usize>(
+    columns: &[SortColumn],
+    row_count: usize,
+    limit: usize,
+) -> Result<Vec<usize>, ArrowError> {
+    let lexicographical_comparator = 
FixedLexicographicalComparator::<N>::try_new(columns)?;
+    Ok(lexsort_topk(row_count, limit, |a, b| {
+        lexicographical_comparator.compare(a, b)
+    }))
+}
+
+// Keeps the smallest `limit` indices in a bounded max-heap.
+// The root is the largest retained index according to `compare`.
+fn lexsort_topk(
+    row_count: usize,
+    limit: usize,
+    mut compare: impl FnMut(usize, usize) -> Ordering,
+) -> Vec<usize> {
+    if limit >= row_count {
+        let mut value_indices = (0..row_count).collect::<Vec<_>>();
+        value_indices.sort_unstable_by(|a, b| compare(*a, *b).then_with(|| 
a.cmp(b)));
+        return value_indices;
+    }

Review Comment:
   ```suggestion
   ```
   
   we probably don't need this check? ideally we should always have `limit < 
row_count`
   
   would this be a safety issue (returns incorrect results or panics) if we 
omit this check?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to