alamb commented on code in PR #8034:
URL: https://github.com/apache/arrow-datafusion/pull/8034#discussion_r1391458012


##########
datafusion/physical-expr/src/equivalence.rs:
##########
@@ -20,26 +20,114 @@ use std::hash::Hash;
 use std::sync::Arc;
 
 use crate::expressions::Column;
-use crate::physical_expr::{deduplicate_physical_exprs, have_common_entries};
 use crate::sort_properties::{ExprOrdering, SortProperties};
 use crate::{
-    physical_exprs_contains, LexOrdering, LexOrderingRef, LexRequirement,
-    LexRequirementRef, PhysicalExpr, PhysicalSortExpr, PhysicalSortRequirement,
+    physical_exprs_bag_equal, physical_exprs_contains, physical_exprs_equal, 
LexOrdering,
+    LexOrderingRef, LexRequirement, LexRequirementRef, PhysicalExpr, 
PhysicalSortExpr,
+    PhysicalSortRequirement,
 };
 
 use arrow::datatypes::SchemaRef;
 use arrow_schema::SortOptions;
 use datafusion_common::tree_node::{Transformed, TreeNode};
 use datafusion_common::{JoinSide, JoinType, Result};
 
+use crate::physical_expr::deduplicate_physical_exprs;
 use indexmap::map::Entry;
 use indexmap::IndexMap;
 
 /// An `EquivalenceClass` is a set of [`Arc<dyn PhysicalExpr>`]s that are known
 /// to have the same value for all tuples in a relation. These are generated by
-/// equality predicates, typically equi-join conditions and equality conditions
-/// in filters.
-pub type EquivalenceClass = Vec<Arc<dyn PhysicalExpr>>;
+/// equality predicates (e.g. `a = b`), typically equi-join conditions and
+/// equality conditions in filters.
+#[derive(Debug, Clone)]
+pub struct EquivalenceClass {
+    inner: Vec<Arc<dyn PhysicalExpr>>,
+}
+
+impl PartialEq for EquivalenceClass {
+    fn eq(&self, other: &Self) -> bool {
+        physical_exprs_equal(&self.inner, &other.inner)
+    }
+}
+
+impl EquivalenceClass {
+    /// Create a new empty equivalence class
+    pub fn new_empty() -> Self {
+        Self { inner: vec![] }
+    }
+
+    // Create a new equivalence class from a pre-existing `Vec`
+    pub fn new_from_vec(inner: Vec<Arc<dyn PhysicalExpr>>) -> Self {
+        Self { inner }
+    }
+
+    /// Return the "canonical" expression for this class (the first element)
+    /// if any
+    fn canonical_expr(&self) -> Option<Arc<dyn PhysicalExpr>> {
+        self.inner.first().cloned()
+    }
+
+    /// Insert the expression into this class, meaning it is known to be equal 
to
+    /// all other expressions in this class
+    pub fn push(&mut self, expr: Arc<dyn PhysicalExpr>) {
+        self.inner.push(expr);
+    }
+
+    /// Inserts all the expressions from other into this class
+    pub fn extend(&mut self, other: Self) {
+        self.inner.extend(other.inner);
+    }
+
+    /// Returns true if this equivalence class contains t expression
+    pub fn contains(&self, expr: &Arc<dyn PhysicalExpr>) -> bool {
+        physical_exprs_contains(&self.inner, expr)
+    }
+
+    /// Returns true if this equivalence class has any entries in common with 
other
+    pub fn contains_any(&self, other: &Self) -> bool {
+        self.inner.iter().any(|e| other.contains(e))
+    }
+
+    /// return the number of items in this class
+    pub fn len(&self) -> usize {
+        self.inner.len()
+    }
+
+    /// return true if this class is empty
+    pub fn is_empty(&self) -> bool {
+        self.inner.is_empty()
+    }
+
+    /// Removes all duplicated exprs in this class
+    // TODO should we deduplicate on insert?

Review Comment:
   This fits well with the definition of this being a set (rather than an 
ordered check). I will try it



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to