This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 1f1c637  Use partition for bool sort (#448)
1f1c637 is described below

commit 1f1c63729955d87a0083719f87b651a9c9adc6b4
Author: Jiayu Liu <[email protected]>
AuthorDate: Sat Jun 19 19:44:49 2021 +0800

    Use partition for bool sort (#448)
    
    * optimize boolean sort using parition
    
    * add docs
---
 arrow/src/compute/kernels/sort.rs | 59 +++++++++++++++++++++++++--------------
 1 file changed, 38 insertions(+), 21 deletions(-)

diff --git a/arrow/src/compute/kernels/sort.rs 
b/arrow/src/compute/kernels/sort.rs
index b0eecb9..d6c4107 100644
--- a/arrow/src/compute/kernels/sort.rs
+++ b/arrow/src/compute/kernels/sort.rs
@@ -405,7 +405,13 @@ impl Default for SortOptions {
     }
 }
 
-/// Sort primitive values
+/// Sort boolean values
+///
+/// when a limit is present, the sort is pair-comparison based as k-select 
might be more efficient,
+/// when the limit is absent, binary partition is used to speed up (which is 
linear).
+///
+/// TODO maybe partition_validity call can be eliminated in this case and 
tri-color sort can be used
+/// instead. https://en.wikipedia.org/wiki/Dutch_national_flag_problem
 fn sort_boolean(
     values: &ArrayRef,
     value_indices: Vec<u32>,
@@ -419,30 +425,41 @@ fn sort_boolean(
         .expect("Unable to downcast to boolean array");
     let descending = options.descending;
 
-    // create tuples that are used for sorting
-    let mut valids = value_indices
-        .into_iter()
-        .map(|index| (index, values.value(index as usize)))
-        .collect::<Vec<(u32, bool)>>();
-
-    let mut nulls = null_indices;
-
-    let valids_len = valids.len();
-    let nulls_len = nulls.len();
+    let valids_len = value_indices.len();
+    let nulls_len = null_indices.len();
 
     let mut len = values.len();
-    if let Some(limit) = limit {
+    let valids = if let Some(limit) = limit {
         len = limit.min(len);
-    }
-    if !descending {
-        sort_by(&mut valids, len.saturating_sub(nulls_len), |a, b| {
-            cmp(a.1, b.1)
-        });
+        // create tuples that are used for sorting
+        let mut valids = value_indices
+            .into_iter()
+            .map(|index| (index, values.value(index as usize)))
+            .collect::<Vec<(u32, bool)>>();
+        if !descending {
+            sort_by(&mut valids, len.saturating_sub(nulls_len), |a, b| {
+                cmp(a.1, b.1)
+            });
+        } else {
+            sort_by(&mut valids, len.saturating_sub(nulls_len), |a, b| {
+                cmp(a.1, b.1).reverse()
+            });
+        }
+        valids
     } else {
-        sort_by(&mut valids, len.saturating_sub(nulls_len), |a, b| {
-            cmp(a.1, b.1).reverse()
-        });
-        // reverse to keep a stable ordering
+        // when limit is not present, we have a better way than sorting: we 
can just partition
+        // the vec into [false..., true...] or [true..., false...] when 
descending
+        // TODO when https://github.com/rust-lang/rust/issues/62543 is merged 
we can use partition_in_place
+        let (mut a, b): (Vec<(u32, bool)>, Vec<(u32, bool)>) = value_indices
+            .into_iter()
+            .map(|index| (index, values.value(index as usize)))
+            .partition(|(_, value)| *value == descending);
+        a.extend(b);
+        a
+    };
+
+    let mut nulls = null_indices;
+    if descending {
         nulls.reverse();
     }
 

Reply via email to