This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new caa37fed52 Simplify dictionary sort (#4605)
caa37fed52 is described below
commit caa37fed52478c19dfd7ac49b651fb1c891e27a9
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Tue Aug 1 09:00:55 2023 +0100
Simplify dictionary sort (#4605)
---
arrow-ord/src/sort.rs | 73 +++++++++++----------------------------------------
1 file changed, 16 insertions(+), 57 deletions(-)
diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs
index 147af1e301..c623475c0b 100644
--- a/arrow-ord/src/sort.rs
+++ b/arrow-ord/src/sort.rs
@@ -390,28 +390,15 @@ pub fn sort_to_indices(
descending: false,
nulls_first: value_null_first,
});
- downcast_dictionary_array!(
- values => match values.values().data_type() {
- dt if DataType::is_primitive(dt) => {
- let dict_values = values.values();
- let sorted_value_indices =
sort_to_indices(dict_values, value_options, None)?;
- let value_indices_map =
sorted_rank(&sorted_value_indices);
- sort_primitive_dictionary::<_, _>(values,
&value_indices_map, v, n, options, limit, cmp)
- },
- DataType::Utf8 => {
- let dict_values = values.values();
- let sorted_value_indices =
sort_to_indices(dict_values, value_options, None)?;
- let value_indices_map =
sorted_rank(&sorted_value_indices);
- sort_string_dictionary::<_>(values,
&value_indices_map, v, n, &options, limit)
- },
- t => return Err(ArrowError::ComputeError(format!(
- "Unsupported dictionary value type {t}"
- ))),
- },
- t => return Err(ArrowError::ComputeError(format!(
- "Unsupported datatype {t}"
- ))),
- )
+ downcast_dictionary_array! {
+ values => {
+ let dict_values = values.values();
+ let sorted_value_indices = sort_to_indices(dict_values,
value_options, None)?;
+ let rank = sorted_rank(&sorted_value_indices);
+ sort_dictionary(values, &rank, v, n, options, limit)
+ }
+ _ => unreachable!(),
+ }
}
DataType::Binary | DataType::FixedSizeBinary(_) => {
sort_binary::<i32>(values, v, n, &options, limit)
@@ -563,28 +550,23 @@ fn sorted_rank(sorted_value_indices: &UInt32Array) ->
Vec<u32> {
out
}
-/// Sort dictionary encoded primitive values
-fn sort_primitive_dictionary<K, F>(
- values: &DictionaryArray<K>,
- value_indices_map: &[u32],
+/// Sort dictionary given the sorted rank of each key
+fn sort_dictionary<K: ArrowDictionaryKeyType>(
+ dict: &DictionaryArray<K>,
+ rank: &[u32],
value_indices: Vec<u32>,
null_indices: Vec<u32>,
options: SortOptions,
limit: Option<usize>,
- cmp: F,
-) -> UInt32Array
-where
- K: ArrowDictionaryKeyType,
- F: Fn(u32, u32) -> Ordering,
-{
- let keys: &PrimitiveArray<K> = values.keys();
+) -> UInt32Array {
+ let keys: &PrimitiveArray<K> = dict.keys();
// create tuples that are used for sorting
let valids = value_indices
.into_iter()
.map(|index| {
let key: K::Native = keys.value(index as usize);
- (index, value_indices_map[key.as_usize()])
+ (index, rank[key.as_usize()])
})
.collect::<Vec<(u32, u32)>>();
@@ -877,29 +859,6 @@ fn sort_string<Offset: OffsetSizeTrait>(
)
}
-/// Sort dictionary encoded strings
-fn sort_string_dictionary<T: ArrowDictionaryKeyType>(
- values: &DictionaryArray<T>,
- value_indices_map: &[u32],
- value_indices: Vec<u32>,
- null_indices: Vec<u32>,
- options: &SortOptions,
- limit: Option<usize>,
-) -> UInt32Array {
- let keys: &PrimitiveArray<T> = values.keys();
-
- // create tuples that are used for sorting
- let valids = value_indices
- .into_iter()
- .map(|index| {
- let key: T::Native = keys.value(index as usize);
- (index, value_indices_map[key.as_usize()])
- })
- .collect::<Vec<(u32, u32)>>();
-
- sort_primitive_inner::<_, _>(keys.len(), null_indices, cmp, options,
limit, valids)
-}
-
/// shared implementation between dictionary encoded and plain string arrays
#[inline]
fn sort_string_helper<'a, A: Array, F>(