tustvold commented on code in PR #4613:
URL: https://github.com/apache/arrow-rs/pull/4613#discussion_r1280465930
##########
arrow-ord/src/sort.rs:
##########
@@ -422,231 +237,190 @@ pub fn sort_to_indices(
})
}
-/// Sort boolean values
-///
-/// when a limit is present, the sort is pair-comparison based as k-select
might be more efficient,
-/// when the limit is absent, binary partition is used to speed up (which is
linear).
-///
-/// TODO maybe partition_validity call can be eliminated in this case
-/// and [tri-color
sort](https://en.wikipedia.org/wiki/Dutch_national_flag_problem)
-/// can be used instead.
fn sort_boolean(
- values: &dyn Array,
+ values: &BooleanArray,
value_indices: Vec<u32>,
- mut null_indices: Vec<u32>,
- options: &SortOptions,
+ null_indices: Vec<u32>,
+ options: SortOptions,
limit: Option<usize>,
) -> UInt32Array {
- let values = values
- .as_any()
- .downcast_ref::<BooleanArray>()
- .expect("Unable to downcast to boolean array");
- let descending = options.descending;
-
- let valids_len = value_indices.len();
- let nulls_len = null_indices.len();
-
- let mut len = values.len();
- let valids = if let Some(limit) = limit {
- len = limit.min(len);
- // create tuples that are used for sorting
- let mut valids = value_indices
- .into_iter()
- .map(|index| (index, values.value(index as usize)))
- .collect::<Vec<(u32, bool)>>();
-
- sort_valids(descending, &mut valids, len, cmp);
- valids
- } else {
- // when limit is not present, we have a better way than sorting: we
can just partition
- // the vec into [false..., true...] or [true..., false...] when
descending
- // TODO when https://github.com/rust-lang/rust/issues/62543 is merged
we can use partition_in_place
- let (mut a, b): (Vec<_>, Vec<_>) = value_indices
- .into_iter()
- .map(|index| (index, values.value(index as usize)))
- .partition(|(_, value)| *value == descending);
- a.extend(b);
- if descending {
- null_indices.reverse();
- }
- a
- };
-
- let nulls = null_indices;
-
- // collect results directly into a buffer instead of a vec to avoid
another aligned allocation
- let result_capacity = len * std::mem::size_of::<u32>();
- let mut result = MutableBuffer::new(result_capacity);
- // sets len to capacity so we can access the whole buffer as a typed slice
- result.resize(result_capacity, 0);
- let result_slice: &mut [u32] = result.typed_data_mut();
-
- if options.nulls_first {
- let size = nulls_len.min(len);
- result_slice[0..size].copy_from_slice(&nulls[0..size]);
- if nulls_len < len {
- insert_valid_values(result_slice, nulls_len, &valids[0..len -
size]);
- }
- } else {
- // nulls last
- let size = valids.len().min(len);
- insert_valid_values(result_slice, 0, &valids[0..size]);
- if len > size {
- result_slice[valids_len..].copy_from_slice(&nulls[0..(len -
valids_len)]);
- }
- }
-
- let result_data = unsafe {
- ArrayData::new_unchecked(
- DataType::UInt32,
- len,
- Some(0),
- None,
- 0,
- vec![result.into()],
- vec![],
- )
- };
+ let mut valids = value_indices
+ .into_iter()
+ .map(|index| (index, values.value(index as usize)))
+ .collect::<Vec<(u32, bool)>>();
+ sort_impl(options, &mut valids, &null_indices, limit, |a, b|
a.cmp(&b)).into()
+}
- UInt32Array::from(result_data)
+fn sort_primitive<T: ArrowPrimitiveType>(
+ values: &PrimitiveArray<T>,
+ value_indices: Vec<u32>,
+ nulls: Vec<u32>,
+ options: SortOptions,
+ limit: Option<usize>,
+) -> UInt32Array {
+ let mut valids = value_indices
+ .into_iter()
+ .map(|index| (index, values.value(index as usize)))
+ .collect::<Vec<(u32, T::Native)>>();
+ sort_impl(options, &mut valids, &nulls, limit, T::Native::compare).into()
}
-/// Sort primitive values
-fn sort_primitive<T, F>(
- values: &dyn Array,
+fn sort_bytes<T: ByteArrayType>(
+ values: &GenericByteArray<T>,
value_indices: Vec<u32>,
- null_indices: Vec<u32>,
- cmp: F,
- options: &SortOptions,
+ nulls: Vec<u32>,
+ options: SortOptions,
limit: Option<usize>,
-) -> UInt32Array
-where
- T: ArrowPrimitiveType,
- T::Native: PartialOrd,
- F: Fn(T::Native, T::Native) -> Ordering,
-{
- // create tuples that are used for sorting
- let valids = {
- let values = values.as_primitive::<T>();
- value_indices
- .into_iter()
- .map(|index| (index, values.value(index as usize)))
- .collect::<Vec<(u32, T::Native)>>()
- };
- sort_primitive_inner(values.len(), null_indices, cmp, options, limit,
valids)
+) -> UInt32Array {
+ let mut valids = value_indices
+ .into_iter()
+ .map(|index| (index, values.value(index as usize).as_ref()))
+ .collect::<Vec<(u32, &[u8])>>();
+
+ sort_impl(options, &mut valids, &nulls, limit, Ord::cmp).into()
}
-/// Given a list of indices that yield a sorted order, returns the ordered
-/// rank of each index
-///
-/// e.g. [2, 4, 3, 1, 0] -> [4, 3, 0, 2, 1]
-fn sorted_rank(sorted_value_indices: &UInt32Array) -> Vec<u32> {
- assert_eq!(sorted_value_indices.null_count(), 0);
- let sorted_indices = sorted_value_indices.values();
- let mut out: Vec<_> = vec![0_u32; sorted_indices.len()];
- for (ix, val) in sorted_indices.iter().enumerate() {
- out[*val as usize] = ix as u32;
- }
- out
+fn sort_fixed_size_binary(
+ values: &FixedSizeBinaryArray,
+ value_indices: Vec<u32>,
+ nulls: Vec<u32>,
+ options: SortOptions,
+ limit: Option<usize>,
+) -> UInt32Array {
+ let mut valids = value_indices
+ .iter()
+ .copied()
+ .map(|index| (index, values.value(index as usize)))
+ .collect::<Vec<(u32, &[u8])>>();
+ sort_impl(options, &mut valids, &nulls, limit, Ord::cmp).into()
}
-/// Sort dictionary given the sorted rank of each key
fn sort_dictionary<K: ArrowDictionaryKeyType>(
dict: &DictionaryArray<K>,
- rank: &[u32],
value_indices: Vec<u32>,
null_indices: Vec<u32>,
options: SortOptions,
limit: Option<usize>,
-) -> UInt32Array {
+) -> Result<UInt32Array, ArrowError> {
let keys: &PrimitiveArray<K> = dict.keys();
+ let rank = child_rank(dict.values().as_ref(), options)?;
// create tuples that are used for sorting
- let valids = value_indices
+ let mut valids = value_indices
.into_iter()
.map(|index| {
let key: K::Native = keys.value(index as usize);
(index, rank[key.as_usize()])
})
.collect::<Vec<(u32, u32)>>();
- sort_primitive_inner::<_, _>(keys.len(), null_indices, cmp, &options,
limit, valids)
+ Ok(sort_impl(options, &mut valids, &null_indices, limit, |a, b|
a.cmp(&b)).into())
}
-// sort is instantiated a lot so we only compile this inner version for each
native type
-fn sort_primitive_inner<T, F>(
- value_len: usize,
- nulls: Vec<u32>,
- cmp: F,
- options: &SortOptions,
+fn sort_list<O: OffsetSizeTrait>(
+ array: &GenericListArray<O>,
+ value_indices: Vec<u32>,
+ null_indices: Vec<u32>,
+ options: SortOptions,
limit: Option<usize>,
- mut valids: Vec<(u32, T)>,
-) -> UInt32Array
+) -> Result<UInt32Array, ArrowError> {
+ let rank = child_rank(array.values().as_ref(), options)?;
+ let offsets = array.value_offsets();
+ let mut valids = value_indices
Review Comment:
The duplication in extracting these valids arrays is somewhat unfortunate, I
have some ideas of how to fix it, but wanted to keep the PR size down
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]