alamb commented on code in PR #14402:
URL: https://github.com/apache/datafusion/pull/14402#discussion_r1941935727


##########
datafusion/functions-aggregate/src/first_last.rs:
##########
@@ -259,19 +260,17 @@ impl FirstValueAccumulator {
             })
             .collect::<Vec<_>>();
 
-        if self.ignore_nulls {
-            let indices = lexsort_to_indices(&sort_columns, None)?;
-            // If ignoring nulls, find the first non-null value.
-            for index in indices.iter().flatten() {
-                if !value.is_null(index as usize) {
-                    return Ok(Some(index as usize));
-                }
-            }
-            Ok(None)
+        let comparator = LexicographicalComparator::try_new(&sort_columns)?;
+
+        let min_index = if self.ignore_nulls {
+            (0..value.len())
+                .filter(|&index| !value.is_null(index))
+                .min_by(|&a, &b| comparator.compare(a, b))

Review Comment:
   I double checked that `min_by` does not require the elements to be sorted 
already
   
   https://doc.rust-lang.org/std/iter/trait.Iterator.html#method.min_by



##########
datafusion/sqllogictest/test_files/group_by.slt:
##########
@@ -3003,7 +3003,7 @@ SELECT FIRST_VALUE(amount ORDER BY ts ASC) AS fv1,
   LAST_VALUE(amount ORDER BY ts ASC) AS fv2
   FROM sales_global
 ----
-30 100
+30 80

Review Comment:
   
   Here is the table definition 
   ```sql
   CREATE TABLE sales_global (zip_code INT,
             country VARCHAR(3),
             sn INT,
             ts TIMESTAMP,
             currency VARCHAR(3),
             amount FLOAT
           ) as VALUES
             (0, 'GRC', 0, '2022-01-01 06:00:00'::timestamp, 'EUR', 30.0),
             (1, 'FRA', 1, '2022-01-01 08:00:00'::timestamp, 'EUR', 50.0),
             (1, 'TUR', 2, '2022-01-01 11:30:00'::timestamp, 'TRY', 75.0),
             (1, 'FRA', 3, '2022-01-02 12:00:00'::timestamp, 'EUR', 200.0),
             (1, 'TUR', 4, '2022-01-03 10:00:00'::timestamp, 'TRY', 100.0),
             (0, 'GRC', 4, '2022-01-03 10:00:00'::timestamp, 'EUR', 80.0)
   ```
   
   And indeed when I ran it locally, both `80` and `100` have the  same `ts` 
value, and thus I agree either is a valid answer. 
   
   In fact when I ran the query `80` actually comes out last so as @blaginin 
says it makes more sense
   
   ```sql
   > select * from sales_global ORDER BY ts ASC;
   +----------+---------+----+---------------------+----------+--------+
   | zip_code | country | sn | ts                  | currency | amount |
   +----------+---------+----+---------------------+----------+--------+
   | 0        | GRC     | 0  | 2022-01-01T06:00:00 | EUR      | 30.0   |
   | 1        | FRA     | 1  | 2022-01-01T08:00:00 | EUR      | 50.0   |
   | 1        | TUR     | 2  | 2022-01-01T11:30:00 | TRY      | 75.0   |
   | 1        | FRA     | 3  | 2022-01-02T12:00:00 | EUR      | 200.0  |
   | 1        | TUR     | 4  | 2022-01-03T10:00:00 | TRY      | 100.0  |
   | 0        | GRC     | 4  | 2022-01-03T10:00:00 | EUR      | 80.0   |
   +----------+---------+----+---------------------+----------+--------+
   ```



##########
datafusion/functions-aggregate/src/first_last.rs:
##########
@@ -627,24 +607,19 @@ impl Accumulator for LastValueAccumulator {
         // last index contains is_set flag.
         let is_set_idx = states.len() - 1;
         let flags = states[is_set_idx].as_boolean();
-        let filtered_states = filter_states_according_to_is_set(states, 
flags)?;
+        let filtered_states =
+            filter_states_according_to_is_set(&states[0..is_set_idx], flags)?;
         // 1..is_set_idx range corresponds to ordering section
-        let sort_cols = convert_to_sort_cols(
+        let sort_columns = convert_to_sort_cols(
             &filtered_states[1..is_set_idx],
             self.ordering_req.as_ref(),
         );
 
-        let ordered_states = if sort_cols.is_empty() {
-            // When no ordering is given, use existing state as is:
-            filtered_states
-        } else {
-            let indices = lexsort_to_indices(&sort_cols, None)?;
-            take_arrays(&filtered_states, &indices, None)?
-        };
+        let comparator = LexicographicalComparator::try_new(&sort_columns)?;

Review Comment:
   I think the lexographic comparator is good for this usecase 👍 
   
   Another potential thing to try is use the `RowConverter` but that incurs 
overhead to copy each row



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

Reply via email to