This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new bb363dc26 fix: lexsort_to_indices should not fallback to non-lexical sort if the datatype is not supported (#6225)
bb363dc26 is described below

commit bb363dc261b12beed8af9f3d4b9e535a860fa4f3
Author: Liang-Chi Hsieh <[email protected]>
AuthorDate: Mon Aug 12 13:49:51 2024 -0700

    fix: lexsort_to_indices should not fallback to non-lexical sort if the datatype is not supported (#6225)
    
    * fix: lexsort_to_indices should not fallback to non-lexical sort if the datatype is not supported
    
    * fix clippy
    
    * Check error message
---
 arrow-ord/src/sort.rs | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 78 insertions(+), 1 deletion(-)

diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs
index 885f23644..140d878f3 100644
--- a/arrow-ord/src/sort.rs
+++ b/arrow-ord/src/sort.rs
@@ -190,6 +190,39 @@ fn partition_validity(array: &dyn Array) -> (Vec<u32>, Vec<u32>) {
     }
 }
 
+/// Whether `arrow_ord::rank` can rank an array of given data type.
+fn can_rank(data_type: &DataType) -> bool {
+    data_type.is_primitive()
+        || matches!(
+            data_type,
+            DataType::Utf8 | DataType::LargeUtf8 | DataType::Binary | DataType::LargeBinary
+        )
+}
+
+/// Whether `sort_to_indices` can sort an array of given data type.
+fn can_sort_to_indices(data_type: &DataType) -> bool {
+    data_type.is_primitive()
+        || matches!(
+            data_type,
+            DataType::Boolean
+                | DataType::Utf8
+                | DataType::LargeUtf8
+                | DataType::Utf8View
+                | DataType::Binary
+                | DataType::LargeBinary
+                | DataType::BinaryView
+                | DataType::FixedSizeBinary(_)
+        )
+        || match data_type {
+            DataType::List(f) if can_rank(f.data_type()) => true,
+            DataType::LargeList(f) if can_rank(f.data_type()) => true,
+            DataType::FixedSizeList(f, _) if can_rank(f.data_type()) => true,
+            DataType::Dictionary(_, values) if can_rank(values.as_ref()) => true,
+            DataType::RunEndEncoded(_, f) if can_sort_to_indices(f.data_type()) => true,
+            _ => false,
+        }
+}
+
 /// Sort elements from `ArrayRef` into an unsigned integer (`UInt32Array`) of indices.
 /// Floats are sorted using IEEE 754 totalOrder.  `limit` is an option for [partial_sort].
 pub fn sort_to_indices(
@@ -678,7 +711,7 @@ pub fn lexsort_to_indices(
             "Sort requires at least one column".to_string(),
         ));
     }
-    if columns.len() == 1 {
+    if columns.len() == 1 && can_sort_to_indices(columns[0].values.data_type()) {
         // fallback to non-lexical sort
         let column = &columns[0];
         return sort_to_indices(&column.values, column.options, limit);
@@ -762,6 +795,7 @@ mod tests {
         FixedSizeListBuilder, Int64Builder, ListBuilder, PrimitiveRunBuilder,
     };
     use arrow_buffer::{i256, NullBuffer};
+    use arrow_schema::Field;
     use half::f16;
     use rand::rngs::StdRng;
     use rand::{Rng, RngCore, SeedableRng};
@@ -4203,4 +4237,47 @@ mod tests {
         let sort_indices = sort_to_indices(&a, None, None).unwrap();
         assert_eq!(sort_indices.values(), &[1, 2, 0]);
     }
+
+    #[test]
+    fn sort_struct_fallback_to_lexsort() {
+        let float = Arc::new(Float32Array::from(vec![1.0, -0.1, 3.5, 1.0]));
+        let int = Arc::new(Int32Array::from(vec![42, 28, 19, 31]));
+
+        let struct_array = StructArray::from(vec![
+            (
+                Arc::new(Field::new("b", DataType::Float32, false)),
+                float.clone() as ArrayRef,
+            ),
+            (
+                Arc::new(Field::new("c", DataType::Int32, false)),
+                int.clone() as ArrayRef,
+            ),
+        ]);
+
+        assert!(!can_sort_to_indices(struct_array.data_type()));
+        assert!(sort_to_indices(&struct_array, None, None)
+            .err()
+            .unwrap()
+            .to_string()
+            .contains("Sort not supported for data type"));
+
+        let sort_columns = vec![SortColumn {
+            values: Arc::new(struct_array.clone()) as ArrayRef,
+            options: None,
+        }];
+        let sorted = lexsort(&sort_columns, None).unwrap();
+
+        let expected_struct_array = Arc::new(StructArray::from(vec![
+            (
+                Arc::new(Field::new("b", DataType::Float32, false)),
+                Arc::new(Float32Array::from(vec![-0.1, 1.0, 1.0, 3.5])) as ArrayRef,
+            ),
+            (
+                Arc::new(Field::new("c", DataType::Int32, false)),
+                Arc::new(Int32Array::from(vec![28, 31, 42, 19])) as ArrayRef,
+            ),
+        ])) as ArrayRef;
+
+        assert_eq!(&sorted[0], &expected_struct_array);
+    }
 }

Reply via email to