This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 1f1c637 Use partition for bool sort (#448)
1f1c637 is described below
commit 1f1c63729955d87a0083719f87b651a9c9adc6b4
Author: Jiayu Liu <[email protected]>
AuthorDate: Sat Jun 19 19:44:49 2021 +0800
Use partition for bool sort (#448)
* optimize boolean sort using parition
* add docs
---
arrow/src/compute/kernels/sort.rs | 59 +++++++++++++++++++++++++--------------
1 file changed, 38 insertions(+), 21 deletions(-)
diff --git a/arrow/src/compute/kernels/sort.rs
b/arrow/src/compute/kernels/sort.rs
index b0eecb9..d6c4107 100644
--- a/arrow/src/compute/kernels/sort.rs
+++ b/arrow/src/compute/kernels/sort.rs
@@ -405,7 +405,13 @@ impl Default for SortOptions {
}
}
-/// Sort primitive values
+/// Sort boolean values
+///
+/// when a limit is present, the sort is pair-comparison based as k-select
might be more efficient,
+/// when the limit is absent, binary partition is used to speed up (which is
linear).
+///
+/// TODO maybe partition_validity call can be eliminated in this case and
tri-color sort can be used
+/// instead. https://en.wikipedia.org/wiki/Dutch_national_flag_problem
fn sort_boolean(
values: &ArrayRef,
value_indices: Vec<u32>,
@@ -419,30 +425,41 @@ fn sort_boolean(
.expect("Unable to downcast to boolean array");
let descending = options.descending;
- // create tuples that are used for sorting
- let mut valids = value_indices
- .into_iter()
- .map(|index| (index, values.value(index as usize)))
- .collect::<Vec<(u32, bool)>>();
-
- let mut nulls = null_indices;
-
- let valids_len = valids.len();
- let nulls_len = nulls.len();
+ let valids_len = value_indices.len();
+ let nulls_len = null_indices.len();
let mut len = values.len();
- if let Some(limit) = limit {
+ let valids = if let Some(limit) = limit {
len = limit.min(len);
- }
- if !descending {
- sort_by(&mut valids, len.saturating_sub(nulls_len), |a, b| {
- cmp(a.1, b.1)
- });
+ // create tuples that are used for sorting
+ let mut valids = value_indices
+ .into_iter()
+ .map(|index| (index, values.value(index as usize)))
+ .collect::<Vec<(u32, bool)>>();
+ if !descending {
+ sort_by(&mut valids, len.saturating_sub(nulls_len), |a, b| {
+ cmp(a.1, b.1)
+ });
+ } else {
+ sort_by(&mut valids, len.saturating_sub(nulls_len), |a, b| {
+ cmp(a.1, b.1).reverse()
+ });
+ }
+ valids
} else {
- sort_by(&mut valids, len.saturating_sub(nulls_len), |a, b| {
- cmp(a.1, b.1).reverse()
- });
- // reverse to keep a stable ordering
+ // when limit is not present, we have a better way than sorting: we
can just partition
+ // the vec into [false..., true...] or [true..., false...] when
descending
+ // TODO when https://github.com/rust-lang/rust/issues/62543 is merged
we can use partition_in_place
+ let (mut a, b): (Vec<(u32, bool)>, Vec<(u32, bool)>) = value_indices
+ .into_iter()
+ .map(|index| (index, values.value(index as usize)))
+ .partition(|(_, value)| *value == descending);
+ a.extend(b);
+ a
+ };
+
+ let mut nulls = null_indices;
+ if descending {
nulls.reverse();
}