uros-db commented on code in PR #46722:
URL: https://github.com/apache/spark/pull/46722#discussion_r1620319639


##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/RewriteCollationJoin.scala:
##########
@@ -17,29 +17,68 @@
 
 package org.apache.spark.sql.catalyst.analysis
 
-import org.apache.spark.sql.catalyst.expressions.{AttributeReference, CollationKey, Equality}
+import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical.{Join, LogicalPlan}
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.catalyst.util.CollationFactory
+import org.apache.spark.sql.types._
 import org.apache.spark.sql.types.StringType
+import org.apache.spark.util.ArrayImplicits.SparkArrayOps
 
 object RewriteCollationJoin extends Rule[LogicalPlan] {
   def apply(plan: LogicalPlan): LogicalPlan = plan transform {
     case j @ Join(_, _, _, Some(condition), _) =>
       val newCondition = condition transform {
         case e @ Equality(l: AttributeReference, r: AttributeReference) =>
-          (l.dataType, r.dataType) match {
-            case (st: StringType, _: StringType)
-              if !CollationFactory.fetchCollation(st.collationId).supportsBinaryEquality =>
-                e.withNewChildren(Seq(CollationKey(l), CollationKey(r)))
-            case _ =>
-              e
-          }
+          e.withNewChildren(Seq(processExpression(l, l.dataType), processExpression(r, r.dataType)))
       }
       if (!newCondition.fastEquals(condition)) {
         j.copy(condition = Some(newCondition))
       } else {
         j
       }
   }
+
+  private def processExpression(expr: Expression, dt: DataType): Expression = {
+    dt match {
+      case st: StringType
+        if !CollationFactory.fetchCollation(st.collationId).supportsBinaryEquality =>
+          CollationKey(expr)
+
+      case StructType(fields) =>
+        processStruct(expr, fields)
+
+      case ArrayType(et, containsNull) =>
+        processArray(expr, et, containsNull)
+

Review Comment:
   I don't think joins and aggregations are supported for map types at this time.
   
   join:
   ```
   Failed to analyze query: org.apache.spark.sql.AnalysisException: [DATATYPE_MISMATCH.INVALID_ORDERING_TYPE] Cannot resolve "(m = m)" due to data type mismatch: The `=` does not support ordering on type "MAP<STRING, STRING>"
   ```
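
   For reference, a minimal sketch (not from this PR) of how I'd expect to hit that error, assuming a spark-shell session and a hypothetical `MAP<STRING, STRING>` column named `m`:
   ```
   // Hypothetical repro, not part of this PR: self-join keyed on a map column.
   import org.apache.spark.sql.functions.col
   import spark.implicits._

   val df = Seq(Map("a" -> "b")).toDF("m")

   // Equality on MAP<STRING, STRING> is not orderable, so analysis fails with
   // DATATYPE_MISMATCH.INVALID_ORDERING_TYPE for "(m = m)".
   df.as("l").join(df.as("r"), col("l.m") === col("r.m")).show()
   ```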
   
   agg:
   ```
   Failed to analyze query: org.apache.spark.sql.AnalysisException: [UNSUPPORTED_FEATURE.SET_OPERATION_ON_MAP_TYPE] The feature is not supported: Cannot have MAP type columns in DataFrame which calls set operations (INTERSECT, EXCEPT, etc.), but the type of column `m` is "MAP<STRING, STRING>".
   ```
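
   A distinct()/set-style operation is my guess at what triggers the second error; reusing the `df` from the sketch above:
   ```
   // Hypothetical repro, not part of this PR: Distinct is checked like a set
   // operation, so a MAP column fails analysis with
   // UNSUPPORTED_FEATURE.SET_OPERATION_ON_MAP_TYPE.
   df.select(col("m")).distinct().show()
   ```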



