[GitHub] [arrow-datafusion] houqp commented on a change in pull request #605: fix join column handling logic for `On` and `Using` constraints

GitBox Mon, 12 Jul 2021 02:23:02 -0700


houqp commented on a change in pull request #605:
URL: https://github.com/apache/arrow-datafusion/pull/605#discussion_r667387968




##########
File path: datafusion/src/logical_plan/dfschema.rs
##########
@@ -149,47 +147,80 @@ impl DFSchema {
         )))
     }
 
-    /// Find the index of the column with the given qualifer and name
-    pub fn index_of_column(&self, col: &Column) -> Result<usize> {
-        for i in 0..self.fields.len() {
-            let field = &self.fields[i];
-            if field.qualifier() == col.relation.as_ref() && field.name() == 
&col.name {
-                return Ok(i);
-            }
+    fn index_of_column_by_name(
+        &self,
+        qualifier: Option<&str>,
+        name: &str,
+    ) -> Result<usize> {
+        let matches: Vec<usize> = self
+            .fields
+            .iter()
+            .enumerate()
+            .filter(|(_, field)| match (qualifier, &field.qualifier) {
+                // field to lookup is qualified.
+                // current field is qualified and not shared between 
relations, compare both
+                // qualifer and name.
+                (Some(q), Some(field_q)) => q == field_q && field.name() == 
name,
+                // field to lookup is qualified but current field is 
unqualified.
+                (Some(_), None) => false,
+                // field to lookup is unqualified, no need to compare qualifier
+                _ => field.name() == name,
+            })
+            .map(|(idx, _)| idx)
+            .collect();
+

Review comment:
       Good suggestion, fixed in 
https://github.com/apache/arrow-datafusion/pull/703.

##########
File path: datafusion/src/logical_plan/dfschema.rs
##########
@@ -149,47 +147,80 @@ impl DFSchema {
         )))
     }
 
-    /// Find the index of the column with the given qualifer and name
-    pub fn index_of_column(&self, col: &Column) -> Result<usize> {
-        for i in 0..self.fields.len() {
-            let field = &self.fields[i];
-            if field.qualifier() == col.relation.as_ref() && field.name() == 
&col.name {
-                return Ok(i);
-            }
+    fn index_of_column_by_name(
+        &self,
+        qualifier: Option<&str>,
+        name: &str,
+    ) -> Result<usize> {
+        let matches: Vec<usize> = self
+            .fields
+            .iter()
+            .enumerate()
+            .filter(|(_, field)| match (qualifier, &field.qualifier) {
+                // field to lookup is qualified.
+                // current field is qualified and not shared between 
relations, compare both
+                // qualifer and name.
+                (Some(q), Some(field_q)) => q == field_q && field.name() == 
name,
+                // field to lookup is qualified but current field is 
unqualified.
+                (Some(_), None) => false,
+                // field to lookup is unqualified, no need to compare qualifier
+                _ => field.name() == name,
+            })
+            .map(|(idx, _)| idx)
+            .collect();
+

Review comment:
       Good suggestion, fixed in 
https://github.com/apache/arrow-datafusion/pull/703.

##########
File path: datafusion/src/logical_plan/plan.rs
##########
@@ -354,6 +356,43 @@ impl LogicalPlan {
             | LogicalPlan::CreateExternalTable { .. } => vec![],
         }
     }
+
+    /// returns all `Using` join columns in a logical plan
+    pub fn using_columns(&self) -> Result<Vec<HashSet<Column>>, 
DataFusionError> {
+        struct UsingJoinColumnVisitor {
+            using_columns: Vec<HashSet<Column>>,
+        }
+
+        impl PlanVisitor for UsingJoinColumnVisitor {
+            type Error = DataFusionError;
+
+            fn pre_visit(&mut self, plan: &LogicalPlan) -> Result<bool, 
Self::Error> {
+                if let LogicalPlan::Join {
+                    join_constraint: JoinConstraint::Using,
+                    on,
+                    ..
+                } = plan
+                {
+                    self.using_columns.push(
+                        on.iter()
+                            .map(|entry| {
+                                std::iter::once(entry.0.clone())
+                                    .chain(std::iter::once(entry.1.clone()))

Review comment:
       @Jimexist sent https://github.com/apache/arrow-datafusion/pull/704 to 
address this. I found a comprise that's more readable and avoids the extra 
memory allocation.
   
   Here is my test to compare the code gen with different approaches: 
https://godbolt.org/z/j3dcjbnvM.

##########
File path: datafusion/src/logical_plan/dfschema.rs
##########
@@ -149,47 +147,80 @@ impl DFSchema {
         )))
     }
 
-    /// Find the index of the column with the given qualifer and name
-    pub fn index_of_column(&self, col: &Column) -> Result<usize> {
-        for i in 0..self.fields.len() {
-            let field = &self.fields[i];
-            if field.qualifier() == col.relation.as_ref() && field.name() == 
&col.name {
-                return Ok(i);
-            }
+    fn index_of_column_by_name(
+        &self,
+        qualifier: Option<&str>,
+        name: &str,
+    ) -> Result<usize> {
+        let matches: Vec<usize> = self
+            .fields
+            .iter()
+            .enumerate()
+            .filter(|(_, field)| match (qualifier, &field.qualifier) {
+                // field to lookup is qualified.
+                // current field is qualified and not shared between 
relations, compare both
+                // qualifer and name.
+                (Some(q), Some(field_q)) => q == field_q && field.name() == 
name,
+                // field to lookup is qualified but current field is 
unqualified.
+                (Some(_), None) => false,
+                // field to lookup is unqualified, no need to compare qualifier
+                _ => field.name() == name,
+            })
+            .map(|(idx, _)| idx)
+            .collect();
+

Review comment:
       Good suggestion, fixed in 
https://github.com/apache/arrow-datafusion/pull/703.

##########
File path: datafusion/src/logical_plan/plan.rs
##########
@@ -354,6 +356,43 @@ impl LogicalPlan {
             | LogicalPlan::CreateExternalTable { .. } => vec![],
         }
     }
+
+    /// returns all `Using` join columns in a logical plan
+    pub fn using_columns(&self) -> Result<Vec<HashSet<Column>>, 
DataFusionError> {
+        struct UsingJoinColumnVisitor {
+            using_columns: Vec<HashSet<Column>>,
+        }
+
+        impl PlanVisitor for UsingJoinColumnVisitor {
+            type Error = DataFusionError;
+
+            fn pre_visit(&mut self, plan: &LogicalPlan) -> Result<bool, 
Self::Error> {
+                if let LogicalPlan::Join {
+                    join_constraint: JoinConstraint::Using,
+                    on,
+                    ..
+                } = plan
+                {
+                    self.using_columns.push(
+                        on.iter()
+                            .map(|entry| {
+                                std::iter::once(entry.0.clone())
+                                    .chain(std::iter::once(entry.1.clone()))

Review comment:
       @Jimexist sent https://github.com/apache/arrow-datafusion/pull/704 to 
address this. I found a comprise that's more readable and avoids the extra 
memory allocation.
   
   Here is my test to compare the code gen with different approaches: 
https://godbolt.org/z/j3dcjbnvM.

##########
File path: datafusion/src/logical_plan/dfschema.rs
##########
@@ -149,47 +147,80 @@ impl DFSchema {
         )))
     }
 
-    /// Find the index of the column with the given qualifer and name
-    pub fn index_of_column(&self, col: &Column) -> Result<usize> {
-        for i in 0..self.fields.len() {
-            let field = &self.fields[i];
-            if field.qualifier() == col.relation.as_ref() && field.name() == 
&col.name {
-                return Ok(i);
-            }
+    fn index_of_column_by_name(
+        &self,
+        qualifier: Option<&str>,
+        name: &str,
+    ) -> Result<usize> {
+        let matches: Vec<usize> = self
+            .fields
+            .iter()
+            .enumerate()
+            .filter(|(_, field)| match (qualifier, &field.qualifier) {
+                // field to lookup is qualified.
+                // current field is qualified and not shared between 
relations, compare both
+                // qualifer and name.
+                (Some(q), Some(field_q)) => q == field_q && field.name() == 
name,
+                // field to lookup is qualified but current field is 
unqualified.
+                (Some(_), None) => false,
+                // field to lookup is unqualified, no need to compare qualifier
+                _ => field.name() == name,
+            })
+            .map(|(idx, _)| idx)
+            .collect();
+

Review comment:
       Good suggestion, fixed in 
https://github.com/apache/arrow-datafusion/pull/703.

##########
File path: datafusion/src/logical_plan/plan.rs
##########
@@ -354,6 +356,43 @@ impl LogicalPlan {
             | LogicalPlan::CreateExternalTable { .. } => vec![],
         }
     }
+
+    /// returns all `Using` join columns in a logical plan
+    pub fn using_columns(&self) -> Result<Vec<HashSet<Column>>, 
DataFusionError> {
+        struct UsingJoinColumnVisitor {
+            using_columns: Vec<HashSet<Column>>,
+        }
+
+        impl PlanVisitor for UsingJoinColumnVisitor {
+            type Error = DataFusionError;
+
+            fn pre_visit(&mut self, plan: &LogicalPlan) -> Result<bool, 
Self::Error> {
+                if let LogicalPlan::Join {
+                    join_constraint: JoinConstraint::Using,
+                    on,
+                    ..
+                } = plan
+                {
+                    self.using_columns.push(
+                        on.iter()
+                            .map(|entry| {
+                                std::iter::once(entry.0.clone())
+                                    .chain(std::iter::once(entry.1.clone()))

Review comment:
       @Jimexist sent https://github.com/apache/arrow-datafusion/pull/704 to 
address this. I found a comprise that's more readable and avoids the extra 
memory allocation.
   
   Here is my test to compare the code gen with different approaches: 
https://godbolt.org/z/j3dcjbnvM.

##########
File path: datafusion/src/logical_plan/dfschema.rs
##########
@@ -149,47 +147,80 @@ impl DFSchema {
         )))
     }
 
-    /// Find the index of the column with the given qualifer and name
-    pub fn index_of_column(&self, col: &Column) -> Result<usize> {
-        for i in 0..self.fields.len() {
-            let field = &self.fields[i];
-            if field.qualifier() == col.relation.as_ref() && field.name() == 
&col.name {
-                return Ok(i);
-            }
+    fn index_of_column_by_name(
+        &self,
+        qualifier: Option<&str>,
+        name: &str,
+    ) -> Result<usize> {
+        let matches: Vec<usize> = self
+            .fields
+            .iter()
+            .enumerate()
+            .filter(|(_, field)| match (qualifier, &field.qualifier) {
+                // field to lookup is qualified.
+                // current field is qualified and not shared between 
relations, compare both
+                // qualifer and name.
+                (Some(q), Some(field_q)) => q == field_q && field.name() == 
name,
+                // field to lookup is qualified but current field is 
unqualified.
+                (Some(_), None) => false,
+                // field to lookup is unqualified, no need to compare qualifier
+                _ => field.name() == name,
+            })
+            .map(|(idx, _)| idx)
+            .collect();
+

Review comment:
       Good suggestion, fixed in 
https://github.com/apache/arrow-datafusion/pull/703.

##########
File path: datafusion/src/logical_plan/plan.rs
##########
@@ -354,6 +356,43 @@ impl LogicalPlan {
             | LogicalPlan::CreateExternalTable { .. } => vec![],
         }
     }
+
+    /// returns all `Using` join columns in a logical plan
+    pub fn using_columns(&self) -> Result<Vec<HashSet<Column>>, 
DataFusionError> {
+        struct UsingJoinColumnVisitor {
+            using_columns: Vec<HashSet<Column>>,
+        }
+
+        impl PlanVisitor for UsingJoinColumnVisitor {
+            type Error = DataFusionError;
+
+            fn pre_visit(&mut self, plan: &LogicalPlan) -> Result<bool, 
Self::Error> {
+                if let LogicalPlan::Join {
+                    join_constraint: JoinConstraint::Using,
+                    on,
+                    ..
+                } = plan
+                {
+                    self.using_columns.push(
+                        on.iter()
+                            .map(|entry| {
+                                std::iter::once(entry.0.clone())
+                                    .chain(std::iter::once(entry.1.clone()))

Review comment:
       @Jimexist sent https://github.com/apache/arrow-datafusion/pull/704 to 
address this. I found a comprise that's more readable and avoids the extra 
memory allocation.
   
   Here is my test to compare the code gen with different approaches: 
https://godbolt.org/z/j3dcjbnvM.

##########
File path: datafusion/src/logical_plan/dfschema.rs
##########
@@ -149,47 +147,80 @@ impl DFSchema {
         )))
     }
 
-    /// Find the index of the column with the given qualifer and name
-    pub fn index_of_column(&self, col: &Column) -> Result<usize> {
-        for i in 0..self.fields.len() {
-            let field = &self.fields[i];
-            if field.qualifier() == col.relation.as_ref() && field.name() == 
&col.name {
-                return Ok(i);
-            }
+    fn index_of_column_by_name(
+        &self,
+        qualifier: Option<&str>,
+        name: &str,
+    ) -> Result<usize> {
+        let matches: Vec<usize> = self
+            .fields
+            .iter()
+            .enumerate()
+            .filter(|(_, field)| match (qualifier, &field.qualifier) {
+                // field to lookup is qualified.
+                // current field is qualified and not shared between 
relations, compare both
+                // qualifer and name.
+                (Some(q), Some(field_q)) => q == field_q && field.name() == 
name,
+                // field to lookup is qualified but current field is 
unqualified.
+                (Some(_), None) => false,
+                // field to lookup is unqualified, no need to compare qualifier
+                _ => field.name() == name,
+            })
+            .map(|(idx, _)| idx)
+            .collect();
+

Review comment:
       Good suggestion, fixed in 
https://github.com/apache/arrow-datafusion/pull/703.

##########
File path: datafusion/src/logical_plan/plan.rs
##########
@@ -354,6 +356,43 @@ impl LogicalPlan {
             | LogicalPlan::CreateExternalTable { .. } => vec![],
         }
     }
+
+    /// returns all `Using` join columns in a logical plan
+    pub fn using_columns(&self) -> Result<Vec<HashSet<Column>>, 
DataFusionError> {
+        struct UsingJoinColumnVisitor {
+            using_columns: Vec<HashSet<Column>>,
+        }
+
+        impl PlanVisitor for UsingJoinColumnVisitor {
+            type Error = DataFusionError;
+
+            fn pre_visit(&mut self, plan: &LogicalPlan) -> Result<bool, 
Self::Error> {
+                if let LogicalPlan::Join {
+                    join_constraint: JoinConstraint::Using,
+                    on,
+                    ..
+                } = plan
+                {
+                    self.using_columns.push(
+                        on.iter()
+                            .map(|entry| {
+                                std::iter::once(entry.0.clone())
+                                    .chain(std::iter::once(entry.1.clone()))

Review comment:
       @Jimexist sent https://github.com/apache/arrow-datafusion/pull/704 to 
address this. I found a comprise that's more readable and avoids the extra 
memory allocation.
   
   Here is my test to compare the code gen with different approaches: 
https://godbolt.org/z/j3dcjbnvM.

##########
File path: datafusion/src/logical_plan/dfschema.rs
##########
@@ -149,47 +147,80 @@ impl DFSchema {
         )))
     }
 
-    /// Find the index of the column with the given qualifer and name
-    pub fn index_of_column(&self, col: &Column) -> Result<usize> {
-        for i in 0..self.fields.len() {
-            let field = &self.fields[i];
-            if field.qualifier() == col.relation.as_ref() && field.name() == 
&col.name {
-                return Ok(i);
-            }
+    fn index_of_column_by_name(
+        &self,
+        qualifier: Option<&str>,
+        name: &str,
+    ) -> Result<usize> {
+        let matches: Vec<usize> = self
+            .fields
+            .iter()
+            .enumerate()
+            .filter(|(_, field)| match (qualifier, &field.qualifier) {
+                // field to lookup is qualified.
+                // current field is qualified and not shared between 
relations, compare both
+                // qualifer and name.
+                (Some(q), Some(field_q)) => q == field_q && field.name() == 
name,
+                // field to lookup is qualified but current field is 
unqualified.
+                (Some(_), None) => false,
+                // field to lookup is unqualified, no need to compare qualifier
+                _ => field.name() == name,
+            })
+            .map(|(idx, _)| idx)
+            .collect();
+

Review comment:
       Good suggestion, fixed in 
https://github.com/apache/arrow-datafusion/pull/703.

##########
File path: datafusion/src/logical_plan/plan.rs
##########
@@ -354,6 +356,43 @@ impl LogicalPlan {
             | LogicalPlan::CreateExternalTable { .. } => vec![],
         }
     }
+
+    /// returns all `Using` join columns in a logical plan
+    pub fn using_columns(&self) -> Result<Vec<HashSet<Column>>, 
DataFusionError> {
+        struct UsingJoinColumnVisitor {
+            using_columns: Vec<HashSet<Column>>,
+        }
+
+        impl PlanVisitor for UsingJoinColumnVisitor {
+            type Error = DataFusionError;
+
+            fn pre_visit(&mut self, plan: &LogicalPlan) -> Result<bool, 
Self::Error> {
+                if let LogicalPlan::Join {
+                    join_constraint: JoinConstraint::Using,
+                    on,
+                    ..
+                } = plan
+                {
+                    self.using_columns.push(
+                        on.iter()
+                            .map(|entry| {
+                                std::iter::once(entry.0.clone())
+                                    .chain(std::iter::once(entry.1.clone()))

Review comment:
       @Jimexist sent https://github.com/apache/arrow-datafusion/pull/704 to 
address this. I found a comprise that's more readable and avoids the extra 
memory allocation.
   
   Here is my test to compare the code gen with different approaches: 
https://godbolt.org/z/j3dcjbnvM.

##########
File path: datafusion/src/logical_plan/dfschema.rs
##########
@@ -149,47 +147,80 @@ impl DFSchema {
         )))
     }
 
-    /// Find the index of the column with the given qualifer and name
-    pub fn index_of_column(&self, col: &Column) -> Result<usize> {
-        for i in 0..self.fields.len() {
-            let field = &self.fields[i];
-            if field.qualifier() == col.relation.as_ref() && field.name() == 
&col.name {
-                return Ok(i);
-            }
+    fn index_of_column_by_name(
+        &self,
+        qualifier: Option<&str>,
+        name: &str,
+    ) -> Result<usize> {
+        let matches: Vec<usize> = self
+            .fields
+            .iter()
+            .enumerate()
+            .filter(|(_, field)| match (qualifier, &field.qualifier) {
+                // field to lookup is qualified.
+                // current field is qualified and not shared between 
relations, compare both
+                // qualifer and name.
+                (Some(q), Some(field_q)) => q == field_q && field.name() == 
name,
+                // field to lookup is qualified but current field is 
unqualified.
+                (Some(_), None) => false,
+                // field to lookup is unqualified, no need to compare qualifier
+                _ => field.name() == name,
+            })
+            .map(|(idx, _)| idx)
+            .collect();
+

Review comment:
       Good suggestion, fixed in 
https://github.com/apache/arrow-datafusion/pull/703.

##########
File path: datafusion/src/logical_plan/plan.rs
##########
@@ -354,6 +356,43 @@ impl LogicalPlan {
             | LogicalPlan::CreateExternalTable { .. } => vec![],
         }
     }
+
+    /// returns all `Using` join columns in a logical plan
+    pub fn using_columns(&self) -> Result<Vec<HashSet<Column>>, 
DataFusionError> {
+        struct UsingJoinColumnVisitor {
+            using_columns: Vec<HashSet<Column>>,
+        }
+
+        impl PlanVisitor for UsingJoinColumnVisitor {
+            type Error = DataFusionError;
+
+            fn pre_visit(&mut self, plan: &LogicalPlan) -> Result<bool, 
Self::Error> {
+                if let LogicalPlan::Join {
+                    join_constraint: JoinConstraint::Using,
+                    on,
+                    ..
+                } = plan
+                {
+                    self.using_columns.push(
+                        on.iter()
+                            .map(|entry| {
+                                std::iter::once(entry.0.clone())
+                                    .chain(std::iter::once(entry.1.clone()))

Review comment:
       @Jimexist sent https://github.com/apache/arrow-datafusion/pull/704 to 
address this. I found a comprise that's more readable and avoids the extra 
memory allocation.
   
   Here is my test to compare the code gen with different approaches: 
https://godbolt.org/z/j3dcjbnvM.

##########
File path: datafusion/src/logical_plan/dfschema.rs
##########
@@ -149,47 +147,80 @@ impl DFSchema {
         )))
     }
 
-    /// Find the index of the column with the given qualifer and name
-    pub fn index_of_column(&self, col: &Column) -> Result<usize> {
-        for i in 0..self.fields.len() {
-            let field = &self.fields[i];
-            if field.qualifier() == col.relation.as_ref() && field.name() == 
&col.name {
-                return Ok(i);
-            }
+    fn index_of_column_by_name(
+        &self,
+        qualifier: Option<&str>,
+        name: &str,
+    ) -> Result<usize> {
+        let matches: Vec<usize> = self
+            .fields
+            .iter()
+            .enumerate()
+            .filter(|(_, field)| match (qualifier, &field.qualifier) {
+                // field to lookup is qualified.
+                // current field is qualified and not shared between 
relations, compare both
+                // qualifer and name.
+                (Some(q), Some(field_q)) => q == field_q && field.name() == 
name,
+                // field to lookup is qualified but current field is 
unqualified.
+                (Some(_), None) => false,
+                // field to lookup is unqualified, no need to compare qualifier
+                _ => field.name() == name,
+            })
+            .map(|(idx, _)| idx)
+            .collect();
+

Review comment:
       Good suggestion, fixed in 
https://github.com/apache/arrow-datafusion/pull/703.

##########
File path: datafusion/src/logical_plan/dfschema.rs
##########
@@ -149,47 +147,80 @@ impl DFSchema {
         )))
     }
 
-    /// Find the index of the column with the given qualifer and name
-    pub fn index_of_column(&self, col: &Column) -> Result<usize> {
-        for i in 0..self.fields.len() {
-            let field = &self.fields[i];
-            if field.qualifier() == col.relation.as_ref() && field.name() == 
&col.name {
-                return Ok(i);
-            }
+    fn index_of_column_by_name(
+        &self,
+        qualifier: Option<&str>,
+        name: &str,
+    ) -> Result<usize> {
+        let matches: Vec<usize> = self
+            .fields
+            .iter()
+            .enumerate()
+            .filter(|(_, field)| match (qualifier, &field.qualifier) {
+                // field to lookup is qualified.
+                // current field is qualified and not shared between 
relations, compare both
+                // qualifer and name.
+                (Some(q), Some(field_q)) => q == field_q && field.name() == 
name,
+                // field to lookup is qualified but current field is 
unqualified.
+                (Some(_), None) => false,
+                // field to lookup is unqualified, no need to compare qualifier
+                _ => field.name() == name,
+            })
+            .map(|(idx, _)| idx)
+            .collect();
+

Review comment:
       Good suggestion, fixed in 
https://github.com/apache/arrow-datafusion/pull/703.

##########
File path: datafusion/src/logical_plan/plan.rs
##########
@@ -354,6 +356,43 @@ impl LogicalPlan {
             | LogicalPlan::CreateExternalTable { .. } => vec![],
         }
     }
+
+    /// returns all `Using` join columns in a logical plan
+    pub fn using_columns(&self) -> Result<Vec<HashSet<Column>>, 
DataFusionError> {
+        struct UsingJoinColumnVisitor {
+            using_columns: Vec<HashSet<Column>>,
+        }
+
+        impl PlanVisitor for UsingJoinColumnVisitor {
+            type Error = DataFusionError;
+
+            fn pre_visit(&mut self, plan: &LogicalPlan) -> Result<bool, 
Self::Error> {
+                if let LogicalPlan::Join {
+                    join_constraint: JoinConstraint::Using,
+                    on,
+                    ..
+                } = plan
+                {
+                    self.using_columns.push(
+                        on.iter()
+                            .map(|entry| {
+                                std::iter::once(entry.0.clone())
+                                    .chain(std::iter::once(entry.1.clone()))

Review comment:
       @Jimexist sent https://github.com/apache/arrow-datafusion/pull/704 to 
address this. I found a comprise that's more readable and avoids the extra 
memory allocation.
   
   Here is my test to compare the code gen with different approaches: 
https://godbolt.org/z/j3dcjbnvM.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow-datafusion] houqp commented on a change in pull request #605: fix join column handling logic for `On` and `Using` constraints

Reply via email to