yahoNanJing commented on code in PR #4043: URL: https://github.com/apache/arrow-datafusion/pull/4043#discussion_r1013553521
########## datafusion/physical-expr/src/equivalence.rs: ########## @@ -0,0 +1,256 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::expressions::Column; + +use arrow::datatypes::SchemaRef; + +use std::collections::HashMap; +use std::collections::HashSet; + +/// Equivalence Properties is a vec of EquivalentClass. +#[derive(Debug, Default, Clone)] +pub struct EquivalenceProperties { + classes: Vec<EquivalentClass>, +} + +impl EquivalenceProperties { + pub fn new() -> Self { + EquivalenceProperties { classes: vec![] } + } + + pub fn classes(&self) -> &[EquivalentClass] { + &self.classes + } + + pub fn extend<I: IntoIterator<Item = EquivalentClass>>(&mut self, iter: I) { + self.classes.extend(iter) + } + + /// Add new equal conditions into the EquivalenceProperties, the new equal conditions are usually comming from the + /// equality predicates in Join or Filter + pub fn add_equal_conditions(&mut self, new_conditions: (&Column, &Column)) { + let mut idx1: Option<usize> = None; + let mut idx2: Option<usize> = None; + for (idx, class) in self.classes.iter_mut().enumerate() { + let contains_first = class.contains(new_conditions.0); + let contains_second = class.contains(new_conditions.1); + match (contains_first, contains_second) { + (true, false) => { + class.insert(new_conditions.1.clone()); + idx1 = Some(idx); + } + (false, true) => { + class.insert(new_conditions.0.clone()); + idx2 = Some(idx); + } + (true, true) => { + idx1 = Some(idx); + idx2 = Some(idx); + break; + } + (false, false) => {} + } + } + + match (idx1, idx2) { + (Some(idx_1), Some(idx_2)) if idx_1 != idx_2 => { + // need to merge the two existing EquivalentClasses + let second_eq_class = self.classes.get(idx_2).unwrap().clone(); + let first_eq_class = self.classes.get_mut(idx_1).unwrap(); + for prop in second_eq_class.iter() { + if !first_eq_class.contains(prop) { + first_eq_class.insert(prop.clone()); + } + } + self.classes.remove(idx_2); + } + (None, None) => { + // adding new pairs + self.classes.push(EquivalentClass::new( + new_conditions.0.clone(), + vec![new_conditions.1.clone()], + )); + } + _ => {} + } + } + + pub fn merge_properties_with_alias( + &mut self, + alias_map: &HashMap<Column, Vec<Column>>, + ) { + for (column, columns) in alias_map { + let mut find_match = false; + for class in self.classes.iter_mut() { + if class.contains(column) { + for col in columns { + class.insert(col.clone()); + } + find_match = true; + break; + } + } + if !find_match { + self.classes + .push(EquivalentClass::new(column.clone(), columns.clone())); + } + } + } + + pub fn truncate_properties_not_in_schema(&mut self, schema: &SchemaRef) { + for class in self.classes.iter_mut() { + let mut columns_to_remove = vec![]; + for column in class.iter() { + if let Ok(idx) = schema.index_of(column.name()) { + if idx != column.index() { + columns_to_remove.push(column.clone()); + } + } else { + columns_to_remove.push(column.clone()); + } + } + for column in columns_to_remove { + class.remove(&column); + } + } + self.classes.retain(|props| props.len() > 1); + } +} + +/// Equivalent Class is a set of Columns that are known to have the same value in all tuples in a relation +/// Equivalence Class is generated by equality predicates, typically equijoin conditions and equality conditions in filters. Review Comment: I like this abstraction an the comments. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
