ozankabak commented on code in PR #8034:
URL: https://github.com/apache/arrow-datafusion/pull/8034#discussion_r1389895645


##########
datafusion/physical-expr/src/equivalence.rs:
##########
@@ -20,26 +20,114 @@ use std::hash::Hash;
 use std::sync::Arc;
 
 use crate::expressions::Column;
-use crate::physical_expr::{deduplicate_physical_exprs, have_common_entries};
 use crate::sort_properties::{ExprOrdering, SortProperties};
 use crate::{
-    physical_exprs_contains, LexOrdering, LexOrderingRef, LexRequirement,
-    LexRequirementRef, PhysicalExpr, PhysicalSortExpr, PhysicalSortRequirement,
+    physical_exprs_bag_equal, physical_exprs_contains, physical_exprs_equal, 
LexOrdering,
+    LexOrderingRef, LexRequirement, LexRequirementRef, PhysicalExpr, 
PhysicalSortExpr,
+    PhysicalSortRequirement,
 };
 
 use arrow::datatypes::SchemaRef;
 use arrow_schema::SortOptions;
 use datafusion_common::tree_node::{Transformed, TreeNode};
 use datafusion_common::{JoinSide, JoinType, Result};
 
+use crate::physical_expr::deduplicate_physical_exprs;
 use indexmap::map::Entry;
 use indexmap::IndexMap;
 
 /// An `EquivalenceClass` is a set of [`Arc<dyn PhysicalExpr>`]s that are known
 /// to have the same value for all tuples in a relation. These are generated by
-/// equality predicates, typically equi-join conditions and equality conditions
-/// in filters.
-pub type EquivalenceClass = Vec<Arc<dyn PhysicalExpr>>;
+/// equality predicates (e.g. `a = b`), typically equi-join conditions and
+/// equality conditions in filters.
+#[derive(Debug, Clone)]
+pub struct EquivalenceClass {
+    inner: Vec<Arc<dyn PhysicalExpr>>,
+}
+
+impl PartialEq for EquivalenceClass {
+    fn eq(&self, other: &Self) -> bool {
+        physical_exprs_equal(&self.inner, &other.inner)
+    }

Review Comment:
   Mathematically, an equivalence class is a set of expressions. Therefore, 
`eq` implementation should use set equality. AFAIK we only check for 
list-equality during testing (and if we don't, that could be sign of a bug or a 
bad design!). Therefore, I suggest changing this `eq_strict` and implementing 
`eq` with set equality (and getting rid of `eq_bag`).



##########
datafusion/physical-expr/src/equivalence.rs:
##########
@@ -20,26 +20,114 @@ use std::hash::Hash;
 use std::sync::Arc;
 
 use crate::expressions::Column;
-use crate::physical_expr::{deduplicate_physical_exprs, have_common_entries};
 use crate::sort_properties::{ExprOrdering, SortProperties};
 use crate::{
-    physical_exprs_contains, LexOrdering, LexOrderingRef, LexRequirement,
-    LexRequirementRef, PhysicalExpr, PhysicalSortExpr, PhysicalSortRequirement,
+    physical_exprs_bag_equal, physical_exprs_contains, physical_exprs_equal, 
LexOrdering,
+    LexOrderingRef, LexRequirement, LexRequirementRef, PhysicalExpr, 
PhysicalSortExpr,
+    PhysicalSortRequirement,
 };
 
 use arrow::datatypes::SchemaRef;
 use arrow_schema::SortOptions;
 use datafusion_common::tree_node::{Transformed, TreeNode};
 use datafusion_common::{JoinSide, JoinType, Result};
 
+use crate::physical_expr::deduplicate_physical_exprs;
 use indexmap::map::Entry;
 use indexmap::IndexMap;
 
 /// An `EquivalenceClass` is a set of [`Arc<dyn PhysicalExpr>`]s that are known
 /// to have the same value for all tuples in a relation. These are generated by
-/// equality predicates, typically equi-join conditions and equality conditions
-/// in filters.
-pub type EquivalenceClass = Vec<Arc<dyn PhysicalExpr>>;
+/// equality predicates (e.g. `a = b`), typically equi-join conditions and
+/// equality conditions in filters.
+#[derive(Debug, Clone)]
+pub struct EquivalenceClass {
+    inner: Vec<Arc<dyn PhysicalExpr>>,

Review Comment:
   Maybe `exprs` or `items` is a better name here as this is the vector 
containing elements (expressions) that belong to this equivalence class.



##########
datafusion/physical-expr/src/equivalence.rs:
##########
@@ -20,26 +20,114 @@ use std::hash::Hash;
 use std::sync::Arc;
 
 use crate::expressions::Column;
-use crate::physical_expr::{deduplicate_physical_exprs, have_common_entries};
 use crate::sort_properties::{ExprOrdering, SortProperties};
 use crate::{
-    physical_exprs_contains, LexOrdering, LexOrderingRef, LexRequirement,
-    LexRequirementRef, PhysicalExpr, PhysicalSortExpr, PhysicalSortRequirement,
+    physical_exprs_bag_equal, physical_exprs_contains, physical_exprs_equal, 
LexOrdering,
+    LexOrderingRef, LexRequirement, LexRequirementRef, PhysicalExpr, 
PhysicalSortExpr,
+    PhysicalSortRequirement,
 };
 
 use arrow::datatypes::SchemaRef;
 use arrow_schema::SortOptions;
 use datafusion_common::tree_node::{Transformed, TreeNode};
 use datafusion_common::{JoinSide, JoinType, Result};
 
+use crate::physical_expr::deduplicate_physical_exprs;
 use indexmap::map::Entry;
 use indexmap::IndexMap;
 
 /// An `EquivalenceClass` is a set of [`Arc<dyn PhysicalExpr>`]s that are known
 /// to have the same value for all tuples in a relation. These are generated by
-/// equality predicates, typically equi-join conditions and equality conditions
-/// in filters.
-pub type EquivalenceClass = Vec<Arc<dyn PhysicalExpr>>;
+/// equality predicates (e.g. `a = b`), typically equi-join conditions and
+/// equality conditions in filters.
+#[derive(Debug, Clone)]
+pub struct EquivalenceClass {
+    inner: Vec<Arc<dyn PhysicalExpr>>,
+}
+
+impl PartialEq for EquivalenceClass {
+    fn eq(&self, other: &Self) -> bool {
+        physical_exprs_equal(&self.inner, &other.inner)
+    }
+}
+
+impl EquivalenceClass {
+    /// Create a new empty equivalence class
+    pub fn new_empty() -> Self {
+        Self { inner: vec![] }
+    }
+
+    // Create a new equivalence class from a pre-existing `Vec`
+    pub fn new_from_vec(inner: Vec<Arc<dyn PhysicalExpr>>) -> Self {
+        Self { inner }
+    }
+
+    /// Return the "canonical" expression for this class (the first element)
+    /// if any
+    fn canonical_expr(&self) -> Option<Arc<dyn PhysicalExpr>> {
+        self.inner.first().cloned()
+    }
+
+    /// Insert the expression into this class, meaning it is known to be equal 
to
+    /// all other expressions in this class
+    pub fn push(&mut self, expr: Arc<dyn PhysicalExpr>) {
+        self.inner.push(expr);
+    }
+
+    /// Inserts all the expressions from other into this class
+    pub fn extend(&mut self, other: Self) {
+        self.inner.extend(other.inner);
+    }
+
+    /// Returns true if this equivalence class contains t expression
+    pub fn contains(&self, expr: &Arc<dyn PhysicalExpr>) -> bool {
+        physical_exprs_contains(&self.inner, expr)
+    }
+
+    /// Returns true if this equivalence class has any entries in common with 
other
+    pub fn contains_any(&self, other: &Self) -> bool {
+        self.inner.iter().any(|e| other.contains(e))
+    }
+
+    /// return the number of items in this class
+    pub fn len(&self) -> usize {
+        self.inner.len()
+    }
+
+    /// return true if this class is empty
+    pub fn is_empty(&self) -> bool {
+        self.inner.is_empty()
+    }
+
+    /// Removes all duplicated exprs in this class
+    // TODO should we deduplicate on insert?

Review Comment:
   We can do a quick `contains` check on `push`, and avoid adding the 
expression if the former check returns `true`. And you can call 
`deduplicate_physical_exprs` in `new`, and there would be no need for a 
`deduplicate` function anymore.



##########
datafusion/physical-expr/src/equivalence.rs:
##########
@@ -20,26 +20,114 @@ use std::hash::Hash;
 use std::sync::Arc;
 
 use crate::expressions::Column;
-use crate::physical_expr::{deduplicate_physical_exprs, have_common_entries};
 use crate::sort_properties::{ExprOrdering, SortProperties};
 use crate::{
-    physical_exprs_contains, LexOrdering, LexOrderingRef, LexRequirement,
-    LexRequirementRef, PhysicalExpr, PhysicalSortExpr, PhysicalSortRequirement,
+    physical_exprs_bag_equal, physical_exprs_contains, physical_exprs_equal, 
LexOrdering,
+    LexOrderingRef, LexRequirement, LexRequirementRef, PhysicalExpr, 
PhysicalSortExpr,
+    PhysicalSortRequirement,
 };
 
 use arrow::datatypes::SchemaRef;
 use arrow_schema::SortOptions;
 use datafusion_common::tree_node::{Transformed, TreeNode};
 use datafusion_common::{JoinSide, JoinType, Result};
 
+use crate::physical_expr::deduplicate_physical_exprs;
 use indexmap::map::Entry;
 use indexmap::IndexMap;
 
 /// An `EquivalenceClass` is a set of [`Arc<dyn PhysicalExpr>`]s that are known
 /// to have the same value for all tuples in a relation. These are generated by
-/// equality predicates, typically equi-join conditions and equality conditions
-/// in filters.
-pub type EquivalenceClass = Vec<Arc<dyn PhysicalExpr>>;
+/// equality predicates (e.g. `a = b`), typically equi-join conditions and
+/// equality conditions in filters.
+#[derive(Debug, Clone)]
+pub struct EquivalenceClass {
+    inner: Vec<Arc<dyn PhysicalExpr>>,
+}
+
+impl PartialEq for EquivalenceClass {
+    fn eq(&self, other: &Self) -> bool {
+        physical_exprs_equal(&self.inner, &other.inner)
+    }
+}
+
+impl EquivalenceClass {
+    /// Create a new empty equivalence class
+    pub fn new_empty() -> Self {
+        Self { inner: vec![] }
+    }
+
+    // Create a new equivalence class from a pre-existing `Vec`
+    pub fn new_from_vec(inner: Vec<Arc<dyn PhysicalExpr>>) -> Self {

Review Comment:
   If we have a `new_empty`, why don't we simply call this `new(exprs: 
Vec<Arc<dyn PhysicalExpr>>)`?



##########
datafusion/physical-expr/src/equivalence.rs:
##########
@@ -20,26 +20,114 @@ use std::hash::Hash;
 use std::sync::Arc;
 
 use crate::expressions::Column;
-use crate::physical_expr::{deduplicate_physical_exprs, have_common_entries};
 use crate::sort_properties::{ExprOrdering, SortProperties};
 use crate::{
-    physical_exprs_contains, LexOrdering, LexOrderingRef, LexRequirement,
-    LexRequirementRef, PhysicalExpr, PhysicalSortExpr, PhysicalSortRequirement,
+    physical_exprs_bag_equal, physical_exprs_contains, physical_exprs_equal, 
LexOrdering,
+    LexOrderingRef, LexRequirement, LexRequirementRef, PhysicalExpr, 
PhysicalSortExpr,
+    PhysicalSortRequirement,
 };
 
 use arrow::datatypes::SchemaRef;
 use arrow_schema::SortOptions;
 use datafusion_common::tree_node::{Transformed, TreeNode};
 use datafusion_common::{JoinSide, JoinType, Result};
 
+use crate::physical_expr::deduplicate_physical_exprs;
 use indexmap::map::Entry;
 use indexmap::IndexMap;
 
 /// An `EquivalenceClass` is a set of [`Arc<dyn PhysicalExpr>`]s that are known
 /// to have the same value for all tuples in a relation. These are generated by
-/// equality predicates, typically equi-join conditions and equality conditions
-/// in filters.
-pub type EquivalenceClass = Vec<Arc<dyn PhysicalExpr>>;
+/// equality predicates (e.g. `a = b`), typically equi-join conditions and
+/// equality conditions in filters.
+#[derive(Debug, Clone)]
+pub struct EquivalenceClass {
+    inner: Vec<Arc<dyn PhysicalExpr>>,
+}
+
+impl PartialEq for EquivalenceClass {
+    fn eq(&self, other: &Self) -> bool {
+        physical_exprs_equal(&self.inner, &other.inner)
+    }
+}
+
+impl EquivalenceClass {
+    /// Create a new empty equivalence class
+    pub fn new_empty() -> Self {
+        Self { inner: vec![] }
+    }
+
+    // Create a new equivalence class from a pre-existing `Vec`
+    pub fn new_from_vec(inner: Vec<Arc<dyn PhysicalExpr>>) -> Self {
+        Self { inner }
+    }
+
+    /// Return the "canonical" expression for this class (the first element)
+    /// if any
+    fn canonical_expr(&self) -> Option<Arc<dyn PhysicalExpr>> {
+        self.inner.first().cloned()
+    }
+
+    /// Insert the expression into this class, meaning it is known to be equal 
to
+    /// all other expressions in this class
+    pub fn push(&mut self, expr: Arc<dyn PhysicalExpr>) {
+        self.inner.push(expr);
+    }
+
+    /// Inserts all the expressions from other into this class
+    pub fn extend(&mut self, other: Self) {
+        self.inner.extend(other.inner);
+    }
+
+    /// Returns true if this equivalence class contains t expression
+    pub fn contains(&self, expr: &Arc<dyn PhysicalExpr>) -> bool {
+        physical_exprs_contains(&self.inner, expr)
+    }
+
+    /// Returns true if this equivalence class has any entries in common with 
other

Review Comment:
   ```suggestion
       /// Returns true if this equivalence class has any entries in common 
with `other`
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to