alamb commented on code in PR #9234:
URL: https://github.com/apache/arrow-datafusion/pull/9234#discussion_r1492366835


##########
datafusion/common/src/scalar/mod.rs:
##########
@@ -2143,23 +2142,50 @@ impl ScalarValue {
     /// use datafusion_common::ScalarValue;
     /// use arrow::array::ListArray;
     /// use arrow::datatypes::{DataType, Int32Type};
+    /// use datafusion_common::utils::array_into_list_array;
+    /// use std::sync::Arc;
     ///
     /// let list_arr = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
     ///    Some(vec![Some(1), Some(2), Some(3)]),
-    ///    None,
     ///    Some(vec![Some(4), Some(5)])
     /// ]);
     ///
     /// let scalar_vec = 
ScalarValue::convert_array_to_scalar_vec(&list_arr).unwrap();
     ///
     /// let expected = vec![
-    ///   vec![
+    /// vec![
     ///     ScalarValue::Int32(Some(1)),
     ///     ScalarValue::Int32(Some(2)),
     ///     ScalarValue::Int32(Some(3)),
+    /// ],
+    /// vec![
+    ///    ScalarValue::Int32(Some(4)),
+    ///    ScalarValue::Int32(Some(5)),
+    /// ],
+    /// ];
+    ///
+    /// assert_eq!(scalar_vec, expected);

Review Comment:
   Thank you fro these comments -- I suggest we improve the docs even more by:
   1. Split the examples up (in this case add a second `# Example of using 
array_into_list_array` heading and a second doc example
   2. Add some comments explaining what each line is doing. For you it is 
likely obvious, but for a causal reader,. understanding what 
`array_into_list_array` is doing or why this is different from the previous 
example is likely not as easy



##########
datafusion/sqllogictest/test_files/aggregate.slt:
##########
@@ -139,6 +139,61 @@ AggregateExec: mode=Final, gby=[], 
aggr=[ARRAY_AGG(agg_order.c1)]
 --------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
 ----------CsvExec: file_groups={1 group: 
[[WORKSPACE_ROOT/datafusion/core/tests/data/aggregate_agg_multi_order.csv]]}, 
projection=[c1, c2, c3], has_header=true
 
+# test array_agg_order with list data type
+statement ok
+CREATE TABLE array_agg_order_list_table AS VALUES
+  ('w', 2, [1,2,3], 10),
+  ('w', 1, [9,5,2], 20),
+  ('w', 1, [3,2,5], 30),
+  ('b', 2, [4,5,6], 20),
+  ('b', 1, [7,8,9], 30)
+;
+
+query T? rowsort
+select column1, array_agg(column3 order by column2, column4 desc) from 
array_agg_order_list_table group by column1;
+----
+b [[7, 8, 9], [4, 5, 6]]
+w [[3, 2, 5], [9, 5, 2], [1, 2, 3]]
+
+query T?? rowsort
+select column1, first_value(column3 order by column2, column4 desc), 
last_value(column3 order by column2, column4 desc) from 
array_agg_order_list_table group by column1;
+----
+b [7, 8, 9] [4, 5, 6]
+w [3, 2, 5] [1, 2, 3]
+
+query T? rowsort
+select column1, nth_value(column3, 2 order by column2, column4 desc) from 
array_agg_order_list_table group by column1;
+----
+b [4, 5, 6]
+w [9, 5, 2]
+
+statement ok
+drop table array_agg_order_list_table;
+
+# test array_agg_distinct with list data type
+statement ok
+CREATE TABLE array_agg_distinct_list_table AS VALUES
+  ('w', [0,1]),
+  ('w', [0,1]),
+  ('w', [1,0]),
+  ('b', [1,0]),
+  ('b', [1,0]),
+  ('b', [1,0]),
+  ('b', [0,1])
+;
+
+# Apply array_sort to have determinisitic result, higher dimension nested 
array also works but not for array sort,
+# so they are covered in 
`datafusion/physical-expr/src/aggregate/array_agg_distinct.rs`
+query ??
+select array_sort(c1), array_sort(c2) from (

Review Comment:
   Yeah, we would need to use a structure for deduplicating that preserved 
input order



##########
datafusion/physical-expr/src/aggregate/array_agg_distinct.rs:
##########
@@ -151,7 +153,11 @@ impl Accumulator for DistinctArrayAggAccumulator {
             return Ok(());
         }
 
-        self.update_batch(states)
+        let array = &states[0];
+
+        // Unwrap outer ListArray then do update batch
+        let inner_array = array.as_list::<i32>().value(0);

Review Comment:
   How do we know there is only a single row in the states array? Can we please 
add an assert here to make sure we don't accidentally lose data?
   
   Something like `assert_eq!(array.len(), 1)`?



##########
datafusion/physical-expr/src/aggregate/array_agg_distinct.rs:
##########
@@ -181,47 +187,55 @@ mod tests {
     use arrow_array::Array;
     use arrow_array::ListArray;
     use arrow_buffer::OffsetBuffer;
-    use datafusion_common::utils::array_into_list_array;
     use datafusion_common::{internal_err, DataFusionError};
 
-    // arrow::compute::sort cann't sort ListArray directly, so we need to sort 
the inner primitive array and wrap it back into ListArray.
-    fn sort_list_inner(arr: ScalarValue) -> ScalarValue {
-        let arr = match arr {
-            ScalarValue::List(arr) => arr.value(0),
-            _ => {
-                panic!("Expected ScalarValue::List, got {:?}", arr)
-            }
-        };
+    // arrow::compute::sort can't sort nested ListArray directly, so we 
compare the scalar values pair-wise.

Review Comment:
   What do you think about filing a ticket upstream in arrow-rs to properly 
support sorting nested lists?
   
   It seems like it would be valuable eventually and could likely be done much 
more efficiently than using `ScalarValue`



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to