alamb commented on code in PR #6501: URL: https://github.com/apache/arrow-datafusion/pull/6501#discussion_r1214775206
########## datafusion/core/tests/sqllogictests/test_files/window.slt: ########## @@ -2405,6 +2405,30 @@ GlobalLimitExec: skip=0, fetch=5 ------SortExec: expr=[c9@0 DESC] --------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], has_header=true +# This test shows that ordering equivalence can keep track of complex expressions (not just Column expressions) Review Comment: ❤️ ########## datafusion/physical-expr/src/equivalence.rs: ########## @@ -34,14 +32,14 @@ use std::sync::Arc; /// This is used to represent both: /// /// 1. Equality conditions (like `A=B`), when `T` = [`Column`] -/// 2. Ordering (like `A ASC = B ASC`), when `T` = [`OrderedColumn`] +/// 2. Ordering (like `A ASC = B ASC`), when `T` = [`PhysicalSortExpr`] #[derive(Debug, Clone)] pub struct EquivalenceProperties<T = Column> { Review Comment: 🤔 maybe eventually we will do the same thing for `equality analysis as well` ########## datafusion/physical-expr/src/equivalence.rs: ########## @@ -115,6 +113,53 @@ impl<T: Eq + Hash + Clone> EquivalenceProperties<T> { } } +/// Remove duplicates inside the `in_data` vector, returned vector would consist of unique entries +fn deduplicate_vector<T: PartialEq>(in_data: Vec<T>) -> Vec<T> { + let mut result = vec![]; + for elem in in_data { + if !result.contains(&elem) { + result.push(elem); + } + } + result +} + +/// Find the position of `entry` inside `in_data`, if `entry` is not found return `None`. +fn get_entry_position<T: PartialEq>(in_data: &[T], entry: &T) -> Option<usize> { + in_data.iter().position(|item| item.eq(entry)) +} + +/// Remove `entry` for the `in_data`, returns `true` if removal is successful (e.g `entry` is indeed in the `in_data`) +/// Otherwise return `false` +fn remove_from_vec<T: PartialEq>(in_data: &mut Vec<T>, entry: &T) -> bool { Review Comment: Perhaps a more idiomatic way would be for this function to return `Option<T>` (which is what `Some(in_data.remove())` returns ) That might allow you to avoid some of the other changes to `remove` later ########## datafusion/physical-expr/src/equivalence.rs: ########## @@ -551,4 +555,52 @@ mod tests { Ok(()) } + + #[test] Review Comment: 👍 ########## datafusion/physical-expr/src/equivalence.rs: ########## @@ -115,6 +113,53 @@ impl<T: Eq + Hash + Clone> EquivalenceProperties<T> { } } +/// Remove duplicates inside the `in_data` vector, returned vector would consist of unique entries +fn deduplicate_vector<T: PartialEq>(in_data: Vec<T>) -> Vec<T> { + let mut result = vec![]; + for elem in in_data { + if !result.contains(&elem) { + result.push(elem); + } + } + result +} + +/// Find the position of `entry` inside `in_data`, if `entry` is not found return `None`. +fn get_entry_position<T: PartialEq>(in_data: &[T], entry: &T) -> Option<usize> { + in_data.iter().position(|item| item.eq(entry)) +} + +/// Remove `entry` for the `in_data`, returns `true` if removal is successful (e.g `entry` is indeed in the `in_data`) +/// Otherwise return `false` +fn remove_from_vec<T: PartialEq>(in_data: &mut Vec<T>, entry: &T) -> bool { + if let Some(idx) = get_entry_position(in_data, entry) { + in_data.remove(idx); + true + } else { + false + } +} + +// Helper function to calculate column info recursively +fn get_column_infos_helper( + indices: &mut Vec<(usize, String)>, + expr: &Arc<dyn PhysicalExpr>, +) { + if let Some(col) = expr.as_any().downcast_ref::<Column>() { + indices.push((col.index(), col.name().to_string())) + } else if let Some(binary_expr) = expr.as_any().downcast_ref::<BinaryExpr>() { + get_column_infos_helper(indices, binary_expr.left()); + get_column_infos_helper(indices, binary_expr.right()); + }; +} + +/// Get index and name of each column that is in the expression (Can return multiple entries for `BinaryExpr`s) +fn get_column_infos(expr: &Arc<dyn PhysicalExpr>) -> Vec<(usize, String)> { Review Comment: Maybe 'get_column_index` would better explain what this function is doing ########## datafusion/physical-expr/src/equivalence.rs: ########## @@ -198,58 +247,7 @@ impl<T: Eq + Hash + Clone> EquivalentClass<T> { } } -/// This object represents a [`Column`] with a definite ordering, for Review Comment: 🎉 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org