This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 5fc6b713874a [SPARK-50405][SQL] Handle collation type coercion of 
complex data types properly
5fc6b713874a is described below

commit 5fc6b713874a0617cc8ff4149f01e17c40623528
Author: Stefan Kandic <[email protected]>
AuthorDate: Wed Dec 4 09:29:40 2024 +0100

    [SPARK-50405][SQL] Handle collation type coercion of complex data types 
properly
    
    ### What changes were proposed in this pull request?
    This pull request generalizes collation type coercion so that it supports not 
only casting all children to a single string type but also handling complex data 
types such as structs, maps, and arrays (arrays already partially worked).
    
    The core idea is to recursively analyze the entire data type of an 
expression, annotating each StringType within it with the highest-priority 
collation and its strength. This annotation propagates upward through the 
expression tree. Once the root of the expression is reached, the annotations 
are removed, and the expression is cast to the desired data type.
    
    For the root expression `e`, the collation data type context is computed by 
first calculating the context for all its children and then merging those 
results into the data type of `e`.
    
    ### Why are the changes needed?
    In #48663, a new approach to calculating collation precedence was 
introduced. This approach recursively examines the children of an expression 
and propagates the collation with the highest priority upward.
    
    However, the current implementation of collation coercion is limited to 
determining the StringType that all children should be cast to. This approach 
falls short when dealing with complex types like structs, maps, and arrays, 
which can also contain collations. To address this limitation, we need a more 
general mechanism that allows coercion of any data type, not just simple 
strings.
    
    ### Does this PR introduce _any_ user-facing change?
    No.
    
    ### How was this patch tested?
    With new unit tests.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    No.
    
    Closes #48936 from stefankandic/collationCoercionComplex.
    
    Lead-authored-by: Stefan Kandic <[email protected]>
    Co-authored-by: Stefan Kandic 
<[email protected]>
    Signed-off-by: Max Gekk <[email protected]>
---
 .../catalyst/analysis/CollationTypeCoercion.scala  | 350 +++++++++++++--------
 .../sql-tests/analyzer-results/collations.sql.out  |  78 ++---
 .../resources/sql-tests/results/collations.sql.out |  28 +-
 .../apache/spark/sql/CollationSQLRegexpSuite.scala |   3 +-
 .../collation/CollationTypePrecedenceSuite.scala   | 158 +++++++++-
 5 files changed, 433 insertions(+), 184 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCoercion.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCoercion.scala
index cca1d21df3a7..02640aba2d28 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCoercion.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCoercion.scala
@@ -17,14 +17,12 @@
 
 package org.apache.spark.sql.catalyst.analysis
 
-import scala.annotation.tailrec
-
 import org.apache.spark.sql.catalyst.analysis.CollationStrength.{Default, 
Explicit, Implicit}
-import org.apache.spark.sql.catalyst.analysis.TypeCoercion.{hasStringType, 
haveSameType}
+import org.apache.spark.sql.catalyst.analysis.TypeCoercion.haveSameType
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.trees.TreeNodeTag
 import org.apache.spark.sql.errors.QueryCompilationErrors
-import org.apache.spark.sql.types.{ArrayType, DataType, StringType}
+import org.apache.spark.sql.types.{ArrayType, DataType, MapType, StringType, 
StructType}
 import org.apache.spark.sql.util.SchemaUtils
 
 /**
@@ -32,7 +30,7 @@ import org.apache.spark.sql.util.SchemaUtils
  */
 object CollationTypeCoercion {
 
-  private val COLLATION_CONTEXT_TAG = new 
TreeNodeTag[CollationContext]("collationContext")
+  private val COLLATION_CONTEXT_TAG = new 
TreeNodeTag[DataType]("collationContext")
 
   private def hasCollationContextTag(expr: Expression): Boolean = {
     expr.getTagValue(COLLATION_CONTEXT_TAG).isDefined
@@ -53,10 +51,10 @@ object CollationTypeCoercion {
       outputStringType match {
         case Some(st) =>
           val newBranches = caseWhenExpr.branches.map { case (condition, 
value) =>
-            (condition, castStringType(value, st))
+            (condition, changeType(value, st))
           }
           val newElseValue =
-            caseWhenExpr.elseValue.map(e => castStringType(e, st))
+            caseWhenExpr.elseValue.map(e => changeType(e, st))
           CaseWhen(newBranches, newElseValue)
 
         case _ =>
@@ -105,11 +103,9 @@ object CollationTypeCoercion {
       val newValues = collateToSingleType(mapCreate.values)
       mapCreate.withNewChildren(newKeys.zip(newValues).flatMap(pair => 
Seq(pair._1, pair._2)))
 
-    case namedStruct: CreateNamedStruct if namedStruct.children.size % 2 == 0 
=>
-      val newNames = collateToSingleType(namedStruct.nameExprs)
-      val newValues = collateToSingleType(namedStruct.valExprs)
-      val interleaved = newNames.zip(newValues).flatMap(pair => Seq(pair._1, 
pair._2))
-      namedStruct.withNewChildren(interleaved)
+    case namedStruct: CreateNamedStruct =>
+      // since each child is separate we should not coerce them at all
+      namedStruct
 
     case splitPart: SplitPart =>
       val Seq(str, delimiter, partNum) = splitPart.children
@@ -164,31 +160,89 @@ object CollationTypeCoercion {
   }
 
   /**
-   * Extracts StringTypes from filtered hasStringType
+   * Changes the data type of the expression to the given `newType`.
    */
-  @tailrec
-  private def extractStringType(dt: DataType): Option[StringType] = dt match {
-    case st: StringType => Some(st)
-    case ArrayType(et, _) => extractStringType(et)
-    case _ => None
+  private def changeType(expr: Expression, newType: DataType): Expression = {
+    mergeTypes(expr.dataType, newType) match {
+      case Some(newDataType) if newDataType != expr.dataType =>
+        
assert(!newDataType.existsRecursively(_.isInstanceOf[StringTypeWithContext]))
+
+        val exprWithNewType = expr match {
+          case lit: Literal => lit.copy(dataType = newDataType)
+          case cast: Cast => cast.copy(dataType = newDataType)
+          case _ => Cast(expr, newDataType)
+        }
+
+        // also copy the collation context tag
+        if (hasCollationContextTag(expr)) {
+          exprWithNewType.setTagValue(
+            COLLATION_CONTEXT_TAG, expr.getTagValue(COLLATION_CONTEXT_TAG).get)
+        }
+        exprWithNewType
+
+      case _ =>
+        expr
+    }
   }
 
   /**
-   * Casts given expression to collated StringType with id equal to 
collationId only
-   * if expression has StringType in the first place.
+   * If possible, returns the new data type from `inType` by applying
+   * the collation of `castType`.
    */
-  def castStringType(expr: Expression, st: StringType): Expression = {
-    castStringType(expr.dataType, st)
-      .map(dt => Cast(expr, dt))
-      .getOrElse(expr)
+  private def mergeTypes(inType: DataType, castType: DataType): 
Option[DataType] = {
+    val outType = mergeStructurally(inType, castType) {
+      case (_: StringType, right: StringTypeWithContext) =>
+        right.stringType
+    }
+
+    outType
   }
 
-  private def castStringType(inType: DataType, castType: StringType): 
Option[DataType] = {
-    inType match {
-      case st: StringType if st.collationId != castType.collationId =>
-        Some(castType)
-      case ArrayType(arrType, nullable) =>
-        castStringType(arrType, castType).map(ArrayType(_, nullable))
+  /**
+   * Merges two data types structurally according to the given base case.
+   */
+  private def mergeStructurally(
+      leftType: DataType,
+      rightType: DataType)
+      (baseCase: PartialFunction[(DataType, DataType), DataType]): 
Option[DataType] = {
+    (leftType, rightType) match {
+
+      // handle the base cases first
+      case _ if baseCase.isDefinedAt((leftType, rightType)) =>
+        Option(baseCase(leftType, rightType))
+
+      case _ if leftType == rightType =>
+        Some(leftType)
+
+      case (ArrayType(leftElemType, nullable), ArrayType(rightElemType, _)) =>
+        mergeStructurally(leftElemType, 
rightElemType)(baseCase).map(ArrayType(_, nullable))
+
+      case (MapType(leftKey, leftValue, nullable), MapType(rightKey, 
rightValue, _)) =>
+        for {
+          newKeyType <- mergeStructurally(leftKey, rightKey)(baseCase)
+          newValueType <- mergeStructurally(leftValue, rightValue)(baseCase)
+        } yield MapType(newKeyType, newValueType, nullable)
+
+      case (ArrayType(elementType, nullable), right) =>
+        mergeStructurally(elementType, right)(baseCase).map(ArrayType(_, 
nullable))
+
+      case (left, ArrayType(elementType, _)) =>
+        mergeStructurally(left, elementType)(baseCase)
+
+      case (StructType(leftFields), StructType(rightFields)) =>
+        if (leftFields.length != rightFields.length) {
+          return None
+        }
+        val newFields = leftFields.zip(rightFields).map {
+          case (leftField, rightField) =>
+            val newType = mergeStructurally(leftField.dataType, 
rightField.dataType)(baseCase)
+            if (newType.isEmpty) {
+              return None
+            }
+            leftField.copy(dataType = newType.get)
+        }
+        Some(StructType(newFields))
+
       case _ => None
     }
   }
@@ -201,7 +255,7 @@ object CollationTypeCoercion {
 
     lctOpt match {
       case Some(lct) =>
-        expressions.map(e => castStringType(e, lct))
+        expressions.map(e => changeType(e, lct))
       case _ =>
         expressions
     }
@@ -210,7 +264,7 @@ object CollationTypeCoercion {
   /**
    * Tries to find the least common StringType among the given expressions.
    */
-  private def findLeastCommonStringType(expressions: Seq[Expression]): 
Option[StringType] = {
+  private def findLeastCommonStringType(expressions: Seq[Expression]): 
Option[DataType] = {
     if (!expressions.exists(e => 
SchemaUtils.hasNonUTF8BinaryCollation(e.dataType))) {
       // if there are no collated types we don't need to do anything
       return None
@@ -223,62 +277,70 @@ object CollationTypeCoercion {
     val collationContextWinner = 
expressions.foldLeft(findCollationContext(expressions.head)) {
       case (Some(left), right) =>
         findCollationContext(right).flatMap { ctx =>
-          collationPrecedenceWinner(left, ctx)
+          mergeWinner(left, ctx)
         }
-      case (None, _) => return None
-    }
-
-    collationContextWinner.flatMap { cc =>
-      extractStringType(cc.dataType)
+      case (None, _) => None
     }
+    collationContextWinner
   }
 
   /**
-   * Tries to find the collation context for the given expression.
+   * Tries to find the data type with the collation context for the given 
expression.
    * If found, it will also set the [[COLLATION_CONTEXT_TAG]] on the 
expression,
    * so that the context can be reused later.
    */
-  private def findCollationContext(expr: Expression): Option[CollationContext] 
= {
+  private def findCollationContext(expr: Expression): Option[DataType] = {
     val contextOpt = expr match {
-      case _ if hasCollationContextTag(expr) =>
-        Some(expr.getTagValue(COLLATION_CONTEXT_TAG).get)
-
-      // if `expr` doesn't have a string in its dataType then it doesn't
-      // have the collation context either
-      case _ if !expr.dataType.existsRecursively(_.isInstanceOf[StringType]) =>
-        None
 
-      case collate: Collate =>
-        Some(CollationContext(collate.dataType, Explicit))
+      case _ if collationStrengthBaseCases.isDefinedAt(expr) =>
+        collationStrengthBaseCases(expr)
 
-      case _: Alias | _: SubqueryExpression | _: AttributeReference | _: 
VariableReference =>
-        Some(CollationContext(expr.dataType, Implicit))
+      case getStruct: GetStructField =>
+        val childContext = findCollationContext(getStruct.child)
+        childContext match {
+          case Some(struct: StructType) =>
+            val field = struct.fields(getStruct.ordinal)
+            Some(field.dataType)
+          case _ => None
+        }
 
-      case _: Literal =>
-        Some(CollationContext(expr.dataType, Default))
+      case getMapValue: GetMapValue =>
+        findCollationContext(getMapValue.child) match {
+          case Some(MapType(_, valueType, _)) =>
+            mergeWinner(getMapValue.dataType, valueType)
+          case _ =>
+            None
+        }
 
-      // if it does have a string type but none of its children do
-      // then the collation context strength is default
-      case _ if 
!expr.children.exists(_.dataType.existsRecursively(_.isInstanceOf[StringType])) 
=>
-        Some(CollationContext(expr.dataType, Default))
+      case struct: CreateNamedStruct =>
+        val childrenContexts = struct.valExprs.map(findCollationContext)
+        if (childrenContexts.isEmpty) {
+          return None
+        }
+        val newFields = struct.dataType.fields.zip(childrenContexts).map {
+          case (field, Some(context)) =>
+            field.copy(dataType = context)
+          case (field, None) => field
+        }
+        Some(StructType(newFields))
 
-      case _ =>
-        val contextWinnerOpt = getContextRelevantChildren(expr)
-          .flatMap(findCollationContext)
-          .foldLeft(Option.empty[CollationContext]) {
-            case (Some(left), right) =>
-              collationPrecedenceWinner(left, right)
-            case (None, right) =>
-              Some(right)
-          }
+      case map: CreateMap =>
+        val keyContexts = map.keys.flatMap(findCollationContext)
+        val valueContexts = map.values.flatMap(findCollationContext)
+        if (keyContexts.length + valueContexts.length != map.children.length) {
+          return None
+        }
 
-        contextWinnerOpt.map { context =>
-          if (hasStringType(expr.dataType)) {
-            CollationContext(expr.dataType, context.strength)
-          } else {
-            context
-          }
+        val keyContextWinner = mergeWinners(map.dataType.keyType, keyContexts)
+        val valueContextWinner = mergeWinners(map.dataType.valueType, 
valueContexts)
+        if (keyContextWinner.isEmpty || valueContextWinner.isEmpty) {
+          return None
         }
+        Some(MapType(keyContextWinner.get, valueContextWinner.get))
+
+      case _ =>
+        val childContexts = expr.children.flatMap(findCollationContext)
+        mergeWinners(expr.dataType, childContexts)
     }
 
     contextOpt.foreach(expr.setTagValue(COLLATION_CONTEXT_TAG, _))
@@ -286,69 +348,91 @@ object CollationTypeCoercion {
   }
 
   /**
-   * Returns the children of the given expression that should be used for 
calculating the
-   * winning collation context.
+   * Base cases for determining the strength of the collation.
    */
-  private def getContextRelevantChildren(expression: Expression): 
Seq[Expression] = {
-    expression match {
-      // collation context for named struct should be calculated based on its 
values only
-      case createStruct: CreateNamedStruct =>
-        createStruct.valExprs
+  private def collationStrengthBaseCases: PartialFunction[Expression, 
Option[DataType]] = {
+    case expr if hasCollationContextTag(expr) =>
+      Some(expr.getTagValue(COLLATION_CONTEXT_TAG).get)
 
-      // collation context does not depend on the key for extracting the value
-      case extract: ExtractValue =>
-        Seq(extract.child)
+    // if `expr` doesn't have a string in its dataType then it doesn't
+    // have the collation context either
+    case expr if !expr.dataType.existsRecursively(_.isInstanceOf[StringType]) 
=>
+      None
 
-      // we currently don't support collation precedence for maps,
-      // as this would involve calculating them for keys and values separately
-      case _: CreateMap =>
-        Seq.empty
+    case collate: Collate =>
+      Some(addContextToStringType(collate.dataType, Explicit))
 
-      case _ =>
-        expression.children
+    case expr @ (_: Alias | _: SubqueryExpression | _: AttributeReference | _: 
VariableReference) =>
+      Some(addContextToStringType(expr.dataType, Implicit))
+
+    case lit: Literal =>
+      Some(addContextToStringType(lit.dataType, Default))
+
+    // if it does have a string type but none of its children do
+    // then the collation context strength is default
+    case expr if 
!expr.children.exists(_.dataType.existsRecursively(_.isInstanceOf[StringType])) 
=>
+      Some(addContextToStringType(expr.dataType, Default))
+  }
+
+  /**
+   * Adds collation context to the given string type so we can know its 
strength.
+   */
+  private def addContextToStringType(dt: DataType, strength: 
CollationStrength): DataType = {
+    dt.transformRecursively {
+      case st: StringType => StringTypeWithContext(st, strength)
     }
   }
 
   /**
-   * Returns the collation context that wins in precedence between left and 
right.
+   * Merges multiple data types structurally according to strength of the 
collations into the
+   * data type of the `start`.
+   *
+   * If any of the data types cannot be merged, it returns None.
    */
-  private def collationPrecedenceWinner(
-      left: CollationContext,
-      right: CollationContext): Option[CollationContext] = {
-
-    val (leftStringType, rightStringType) =
-      (extractStringType(left.dataType), extractStringType(right.dataType)) 
match {
-        case (Some(l), Some(r)) =>
-          (l, r)
-        case (None, None) =>
-          return None
-        case (Some(_), None) =>
-          return Some(left)
-        case (None, Some(_)) =>
-          return Some(right)
-      }
+  private def mergeWinners(start: DataType, rest: Seq[DataType]): 
Option[DataType] = {
+    rest.foldLeft(Option(start)) {
+      case (Some(acc), childContext) =>
+        mergeWinner(acc, childContext)
+      case (None, _) =>
+        None
+    }
+  }
 
-    (left.strength, right.strength) match {
-      case (Explicit, Explicit) if leftStringType != rightStringType =>
-        throw QueryCompilationErrors.explicitCollationMismatchError(
-          Seq(leftStringType, rightStringType))
+  /**
+   * Merges two data types structurally according to strength of the 
collations.
+   */
+  private def mergeWinner(left: DataType, right: DataType): Option[DataType] = 
{
+    mergeStructurally(left, right) {
+      case (left: StringTypeWithContext, right: StringTypeWithContext) =>
+        getWinningStringType(left, right)
 
-      case (Explicit, _) => Some(left)
-      case (_, Explicit) => Some(right)
+      case (_: StringType, right: StringTypeWithContext) =>
+        right
+    }
+  }
 
-      case (Implicit, Implicit) if leftStringType != rightStringType =>
+  /** Determines the winning StringTypeWithContext based on the strength of 
the collation. */
+  private def getWinningStringType(
+      left: StringTypeWithContext,
+      right: StringTypeWithContext): StringTypeWithContext = {
+    def handleMismatch(): Nothing = {
+      if (left.strength == Explicit) {
+        throw QueryCompilationErrors.explicitCollationMismatchError(
+          Seq(left.stringType, right.stringType))
+      } else {
         throw QueryCompilationErrors.implicitCollationMismatchError(
-          Seq(leftStringType, rightStringType))
-
-      case (Implicit, _) => Some(left)
-      case (_, Implicit) => Some(right)
+          Seq(left.stringType, right.stringType))
+      }
+    }
 
-      case (Default, Default) if leftStringType != rightStringType =>
-        throw QueryCompilationErrors.implicitCollationMismatchError(
-          Seq(leftStringType, rightStringType))
+    (left.strength.priority, right.strength.priority) match {
+      case (leftPriority, rightPriority) if leftPriority == rightPriority =>
+        if (left.sameType(right)) left
+        else handleMismatch()
 
-      case _ =>
-        Some(left)
+      case (leftPriority, rightPriority) =>
+        if (leftPriority < rightPriority) left
+        else right
     }
   }
 }
@@ -356,18 +440,32 @@ object CollationTypeCoercion {
 /**
  * Represents the strength of collation used for determining precedence in 
collation resolution.
  */
-private sealed trait CollationStrength {}
+private sealed trait CollationStrength {
+  val priority: Int
+}
 
   private object CollationStrength {
-  case object Explicit extends CollationStrength {}
-  case object Implicit extends CollationStrength {}
-  case object Default extends CollationStrength {}
+  case object Explicit extends CollationStrength {
+    override val priority: Int = 0
+  }
+  case object Implicit extends CollationStrength {
+    override val priority: Int = 1
+  }
+  case object Default extends CollationStrength {
+    override val priority: Int = 2
+  }
 }
 
 /**
  * Encapsulates the context for collation, including data type and strength.
  *
- * @param dataType The data type associated with this collation context.
+ * @param stringType StringType.
  * @param strength The strength level of the collation, which determines its 
precedence.
  */
-private case class CollationContext(dataType: DataType, strength: 
CollationStrength) {}
+private case class StringTypeWithContext(stringType: StringType, strength: 
CollationStrength)
+  extends DataType {
+
+  override def defaultSize: Int = stringType.defaultSize
+
+  override private[spark] def asNullable: DataType = this
+}
diff --git 
a/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out 
b/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out
index 45ab1cdcff79..0d5c414416d4 100644
--- a/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out
+++ b/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out
@@ -442,77 +442,77 @@ Project [array_except(array(collate(aaa, utf8_lcase)), 
array(collate(AAA, utf8_l
 -- !query
 select 'a' collate unicode < 'A'
 -- !query analysis
-Project [(collate(a, unicode) < cast(A as string collate UNICODE)) AS 
(collate(a, unicode) < A)#x]
+Project [(collate(a, unicode) < A) AS (collate(a, unicode) < 'A' collate 
UNICODE)#x]
 +- OneRowRelation
 
 
 -- !query
 select 'a' collate unicode_ci = 'A'
 -- !query analysis
-Project [(collate(a, unicode_ci) = cast(A as string collate UNICODE_CI)) AS 
(collate(a, unicode_ci) = A)#x]
+Project [(collate(a, unicode_ci) = A) AS (collate(a, unicode_ci) = 'A' collate 
UNICODE_CI)#x]
 +- OneRowRelation
 
 
 -- !query
 select 'a' collate unicode_ai = 'å'
 -- !query analysis
-Project [(collate(a, unicode_ai) = cast(å as string collate UNICODE_AI)) AS 
(collate(a, unicode_ai) = å)#x]
+Project [(collate(a, unicode_ai) = å) AS (collate(a, unicode_ai) = 'å' collate 
UNICODE_AI)#x]
 +- OneRowRelation
 
 
 -- !query
 select 'a' collate unicode_ci_ai = 'Å'
 -- !query analysis
-Project [(collate(a, unicode_ci_ai) = cast(Å as string collate UNICODE_CI_AI)) 
AS (collate(a, unicode_ci_ai) = Å)#x]
+Project [(collate(a, unicode_ci_ai) = Å) AS (collate(a, unicode_ci_ai) = 'Å' 
collate UNICODE_CI_AI)#x]
 +- OneRowRelation
 
 
 -- !query
 select 'a' collate en < 'A'
 -- !query analysis
-Project [(collate(a, en) < cast(A as string collate en)) AS (collate(a, en) < 
A)#x]
+Project [(collate(a, en) < A) AS (collate(a, en) < 'A' collate en)#x]
 +- OneRowRelation
 
 
 -- !query
 select 'a' collate en_ci = 'A'
 -- !query analysis
-Project [(collate(a, en_ci) = cast(A as string collate en_CI)) AS (collate(a, 
en_ci) = A)#x]
+Project [(collate(a, en_ci) = A) AS (collate(a, en_ci) = 'A' collate en_CI)#x]
 +- OneRowRelation
 
 
 -- !query
 select 'a' collate en_ai = 'å'
 -- !query analysis
-Project [(collate(a, en_ai) = cast(å as string collate en_AI)) AS (collate(a, 
en_ai) = å)#x]
+Project [(collate(a, en_ai) = å) AS (collate(a, en_ai) = 'å' collate en_AI)#x]
 +- OneRowRelation
 
 
 -- !query
 select 'a' collate en_ci_ai = 'Å'
 -- !query analysis
-Project [(collate(a, en_ci_ai) = cast(Å as string collate en_CI_AI)) AS 
(collate(a, en_ci_ai) = Å)#x]
+Project [(collate(a, en_ci_ai) = Å) AS (collate(a, en_ci_ai) = 'Å' collate 
en_CI_AI)#x]
 +- OneRowRelation
 
 
 -- !query
 select 'Kypper' collate sv < 'Köpfe'
 -- !query analysis
-Project [(collate(Kypper, sv) < cast(Köpfe as string collate sv)) AS 
(collate(Kypper, sv) < Köpfe)#x]
+Project [(collate(Kypper, sv) < Köpfe) AS (collate(Kypper, sv) < 'Köpfe' 
collate sv)#x]
 +- OneRowRelation
 
 
 -- !query
 select 'Kypper' collate de > 'Köpfe'
 -- !query analysis
-Project [(collate(Kypper, de) > cast(Köpfe as string collate de)) AS 
(collate(Kypper, de) > Köpfe)#x]
+Project [(collate(Kypper, de) > Köpfe) AS (collate(Kypper, de) > 'Köpfe' 
collate de)#x]
 +- OneRowRelation
 
 
 -- !query
 select 'I' collate tr_ci = 'ı'
 -- !query analysis
-Project [(collate(I, tr_ci) = cast(ı as string collate tr_CI)) AS (collate(I, 
tr_ci) = ı)#x]
+Project [(collate(I, tr_ci) = ı) AS (collate(I, tr_ci) = 'ı' collate tr_CI)#x]
 +- OneRowRelation
 
 
@@ -826,7 +826,7 @@ InsertIntoHadoopFsRelationCommand file:[not included in 
comparison]/{warehouse_d
 -- !query
 select concat_ws(' ', utf8_lcase, utf8_lcase) from t5
 -- !query analysis
-Project [concat_ws(cast(  as string collate UTF8_LCASE), utf8_lcase#x, 
utf8_lcase#x) AS concat_ws( , utf8_lcase, utf8_lcase)#x]
+Project [concat_ws( , utf8_lcase#x, utf8_lcase#x) AS concat_ws( , utf8_lcase, 
utf8_lcase)#x]
 +- SubqueryAlias spark_catalog.default.t5
    +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
 
@@ -868,7 +868,7 @@ Project [concat_ws(collate( , utf8_lcase), 
cast(utf8_binary#x as string collate
 -- !query
 select concat_ws(',', utf8_lcase, 'word'), concat_ws(',', utf8_binary, 'word') 
from t5
 -- !query analysis
-Project [concat_ws(cast(, as string collate UTF8_LCASE), utf8_lcase#x, 
cast(word as string collate UTF8_LCASE)) AS concat_ws(,, utf8_lcase, word)#x, 
concat_ws(,, utf8_binary#x, word) AS concat_ws(,, utf8_binary, word)#x]
+Project [concat_ws(,, utf8_lcase#x, word) AS concat_ws(,, utf8_lcase, word)#x, 
concat_ws(,, utf8_binary#x, word) AS concat_ws(,, utf8_binary, word)#x]
 +- SubqueryAlias spark_catalog.default.t5
    +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
 
@@ -876,7 +876,7 @@ Project [concat_ws(cast(, as string collate UTF8_LCASE), 
utf8_lcase#x, cast(word
 -- !query
 select concat_ws(',', utf8_lcase, 'word' collate utf8_binary), concat_ws(',', 
utf8_binary, 'word' collate utf8_lcase) from t5
 -- !query analysis
-Project [concat_ws(,, cast(utf8_lcase#x as string), collate(word, 
utf8_binary)) AS concat_ws(,, utf8_lcase, collate(word, utf8_binary))#x, 
concat_ws(cast(, as string collate UTF8_LCASE), cast(utf8_binary#x as string 
collate UTF8_LCASE), collate(word, utf8_lcase)) AS concat_ws(,, utf8_binary, 
collate(word, utf8_lcase))#x]
+Project [concat_ws(,, cast(utf8_lcase#x as string), collate(word, 
utf8_binary)) AS concat_ws(,, utf8_lcase, collate(word, utf8_binary))#x, 
concat_ws(,, cast(utf8_binary#x as string collate UTF8_LCASE), collate(word, 
utf8_lcase)) AS concat_ws(,, utf8_binary, collate(word, utf8_lcase))#x]
 +- SubqueryAlias spark_catalog.default.t5
    +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
 
@@ -934,7 +934,7 @@ Project [elt(1, collate(utf8_binary#x, utf8_binary), 
cast(utf8_lcase#x as string
 -- !query
 select elt(1, utf8_binary, 'word'), elt(1, utf8_lcase, 'word') from t5
 -- !query analysis
-Project [elt(1, utf8_binary#x, word, true) AS elt(1, utf8_binary, word)#x, 
elt(1, utf8_lcase#x, cast(word as string collate UTF8_LCASE), true) AS elt(1, 
utf8_lcase, word)#x]
+Project [elt(1, utf8_binary#x, word, true) AS elt(1, utf8_binary, word)#x, 
elt(1, utf8_lcase#x, word, true) AS elt(1, utf8_lcase, 'word' collate 
UTF8_LCASE)#x]
 +- SubqueryAlias spark_catalog.default.t5
    +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
 
@@ -1024,7 +1024,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
 -- !query
 select split_part(utf8_binary, 'a', 3), split_part(utf8_lcase, 'a', 3) from t5
 -- !query analysis
-Project [split_part(utf8_binary#x, a, 3) AS split_part(utf8_binary, a, 3)#x, 
split_part(utf8_lcase#x, cast(a as string collate UTF8_LCASE), 3) AS 
split_part(utf8_lcase, a, 3)#x]
+Project [split_part(utf8_binary#x, a, 3) AS split_part(utf8_binary, a, 3)#x, 
split_part(utf8_lcase#x, a, 3) AS split_part(utf8_lcase, a, 3)#x]
 +- SubqueryAlias spark_catalog.default.t5
    +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
 
@@ -1122,7 +1122,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
 -- !query
 select contains(utf8_binary, 'a'), contains(utf8_lcase, 'a') from t5
 -- !query analysis
-Project [Contains(utf8_binary#x, a) AS contains(utf8_binary, a)#x, 
Contains(utf8_lcase#x, cast(a as string collate UTF8_LCASE)) AS 
contains(utf8_lcase, a)#x]
+Project [Contains(utf8_binary#x, a) AS contains(utf8_binary, a)#x, 
Contains(utf8_lcase#x, a) AS contains(utf8_lcase, a)#x]
 +- SubqueryAlias spark_catalog.default.t5
    +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
 
@@ -1220,7 +1220,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
 -- !query
 select substring_index(utf8_binary, 'a', 2), substring_index(utf8_lcase, 'a', 
2) from t5
 -- !query analysis
-Project [substring_index(utf8_binary#x, a, 2) AS substring_index(utf8_binary, 
a, 2)#x, substring_index(utf8_lcase#x, cast(a as string collate UTF8_LCASE), 2) 
AS substring_index(utf8_lcase, a, 2)#x]
+Project [substring_index(utf8_binary#x, a, 2) AS substring_index(utf8_binary, 
a, 2)#x, substring_index(utf8_lcase#x, a, 2) AS substring_index(utf8_lcase, a, 
2)#x]
 +- SubqueryAlias spark_catalog.default.t5
    +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
 
@@ -1318,7 +1318,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
 -- !query
 select instr(utf8_binary, 'a'), instr(utf8_lcase, 'a') from t5
 -- !query analysis
-Project [instr(utf8_binary#x, a) AS instr(utf8_binary, a)#x, 
instr(utf8_lcase#x, cast(a as string collate UTF8_LCASE)) AS instr(utf8_lcase, 
a)#x]
+Project [instr(utf8_binary#x, a) AS instr(utf8_binary, a)#x, 
instr(utf8_lcase#x, a) AS instr(utf8_lcase, a)#x]
 +- SubqueryAlias spark_catalog.default.t5
    +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
 
@@ -1384,7 +1384,7 @@ Project [find_in_set(collate(utf8_binary#x, utf8_lcase), 
collate(utf8_lcase#x, u
 -- !query
 select find_in_set(utf8_binary, 'aaAaaAaA,i̇o'), find_in_set(utf8_lcase, 
'aaAaaAaA,i̇o') from t5
 -- !query analysis
-Project [find_in_set(utf8_binary#x, aaAaaAaA,i̇o) AS find_in_set(utf8_binary, 
aaAaaAaA,i̇o)#x, find_in_set(utf8_lcase#x, cast(aaAaaAaA,i̇o as string collate 
UTF8_LCASE)) AS find_in_set(utf8_lcase, aaAaaAaA,i̇o)#x]
+Project [find_in_set(utf8_binary#x, aaAaaAaA,i̇o) AS find_in_set(utf8_binary, 
aaAaaAaA,i̇o)#x, find_in_set(utf8_lcase#x, aaAaaAaA,i̇o) AS 
find_in_set(utf8_lcase, aaAaaAaA,i̇o)#x]
 +- SubqueryAlias spark_catalog.default.t5
    +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
 
@@ -1482,7 +1482,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
 -- !query
 select startswith(utf8_binary, 'aaAaaAaA'), startswith(utf8_lcase, 'aaAaaAaA') 
from t5
 -- !query analysis
-Project [StartsWith(utf8_binary#x, aaAaaAaA) AS startswith(utf8_binary, 
aaAaaAaA)#x, StartsWith(utf8_lcase#x, cast(aaAaaAaA as string collate 
UTF8_LCASE)) AS startswith(utf8_lcase, aaAaaAaA)#x]
+Project [StartsWith(utf8_binary#x, aaAaaAaA) AS startswith(utf8_binary, 
aaAaaAaA)#x, StartsWith(utf8_lcase#x, aaAaaAaA) AS startswith(utf8_lcase, 
aaAaaAaA)#x]
 +- SubqueryAlias spark_catalog.default.t5
    +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
 
@@ -1506,7 +1506,7 @@ Project [StartsWith(cast(utf8_binary#x as string collate 
UTF8_LCASE_RTRIM), coll
 -- !query
 select translate(utf8_lcase, utf8_lcase, '12345') from t5
 -- !query analysis
-Project [translate(utf8_lcase#x, utf8_lcase#x, cast(12345 as string collate 
UTF8_LCASE)) AS translate(utf8_lcase, utf8_lcase, 12345)#x]
+Project [translate(utf8_lcase#x, utf8_lcase#x, 12345) AS translate(utf8_lcase, 
utf8_lcase, 12345)#x]
 +- SubqueryAlias spark_catalog.default.t5
    +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
 
@@ -1572,7 +1572,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
 -- !query
 select translate(utf8_lcase, 'aaAaaAaA', '12345'), translate(utf8_binary, 
'aaAaaAaA', '12345') from t5
 -- !query analysis
-Project [translate(utf8_lcase#x, cast(aaAaaAaA as string collate UTF8_LCASE), 
cast(12345 as string collate UTF8_LCASE)) AS translate(utf8_lcase, aaAaaAaA, 
12345)#x, translate(utf8_binary#x, aaAaaAaA, 12345) AS translate(utf8_binary, 
aaAaaAaA, 12345)#x]
+Project [translate(utf8_lcase#x, aaAaaAaA, 12345) AS translate(utf8_lcase, 
aaAaaAaA, 12345)#x, translate(utf8_binary#x, aaAaaAaA, 12345) AS 
translate(utf8_binary, aaAaaAaA, 12345)#x]
 +- SubqueryAlias spark_catalog.default.t5
    +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
 
@@ -1580,7 +1580,7 @@ Project [translate(utf8_lcase#x, cast(aaAaaAaA as string 
collate UTF8_LCASE), ca
 -- !query
 select translate(utf8_lcase, 'aBc' collate utf8_binary, '12345'), 
translate(utf8_binary, 'aBc' collate utf8_lcase, '12345') from t5
 -- !query analysis
-Project [translate(cast(utf8_lcase#x as string), collate(aBc, utf8_binary), 
12345) AS translate(utf8_lcase, collate(aBc, utf8_binary), 12345)#x, 
translate(cast(utf8_binary#x as string collate UTF8_LCASE), collate(aBc, 
utf8_lcase), cast(12345 as string collate UTF8_LCASE)) AS 
translate(utf8_binary, collate(aBc, utf8_lcase), 12345)#x]
+Project [translate(cast(utf8_lcase#x as string), collate(aBc, utf8_binary), 
12345) AS translate(utf8_lcase, collate(aBc, utf8_binary), 12345)#x, 
translate(cast(utf8_binary#x as string collate UTF8_LCASE), collate(aBc, 
utf8_lcase), 12345) AS translate(utf8_binary, collate(aBc, utf8_lcase), 
12345)#x]
 +- SubqueryAlias spark_catalog.default.t5
    +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
 
@@ -1588,7 +1588,7 @@ Project [translate(cast(utf8_lcase#x as string), 
collate(aBc, utf8_binary), 1234
 -- !query
 select translate(utf8_lcase, 'aBc ' collate utf8_binary_rtrim, '12345'), 
translate(utf8_binary, 'aBc' collate utf8_lcase, '12345') from t5
 -- !query analysis
-Project [translate(cast(utf8_lcase#x as string collate UTF8_BINARY_RTRIM), 
collate(aBc , utf8_binary_rtrim), cast(12345 as string collate 
UTF8_BINARY_RTRIM)) AS translate(utf8_lcase, collate(aBc , utf8_binary_rtrim), 
12345)#x, translate(cast(utf8_binary#x as string collate UTF8_LCASE), 
collate(aBc, utf8_lcase), cast(12345 as string collate UTF8_LCASE)) AS 
translate(utf8_binary, collate(aBc, utf8_lcase), 12345)#x]
+Project [translate(cast(utf8_lcase#x as string collate UTF8_BINARY_RTRIM), 
collate(aBc , utf8_binary_rtrim), 12345) AS translate(utf8_lcase, collate(aBc , 
utf8_binary_rtrim), 12345)#x, translate(cast(utf8_binary#x as string collate 
UTF8_LCASE), collate(aBc, utf8_lcase), 12345) AS translate(utf8_binary, 
collate(aBc, utf8_lcase), 12345)#x]
 +- SubqueryAlias spark_catalog.default.t5
    +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
 
@@ -1638,7 +1638,7 @@ Project [replace(utf8_binary#x, collate(utf8_lcase#x, 
utf8_binary), abc) AS repl
 -- !query
 select replace(utf8_binary collate utf8_lcase, utf8_lcase collate utf8_lcase, 
'abc') from t5
 -- !query analysis
-Project [replace(collate(utf8_binary#x, utf8_lcase), collate(utf8_lcase#x, 
utf8_lcase), cast(abc as string collate UTF8_LCASE)) AS 
replace(collate(utf8_binary, utf8_lcase), collate(utf8_lcase, utf8_lcase), 
abc)#x]
+Project [replace(collate(utf8_binary#x, utf8_lcase), collate(utf8_lcase#x, 
utf8_lcase), abc) AS replace(collate(utf8_binary, utf8_lcase), 
collate(utf8_lcase, utf8_lcase), abc)#x]
 +- SubqueryAlias spark_catalog.default.t5
    +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
 
@@ -1655,7 +1655,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
     "inputType" : "\"STRING COLLATE UNICODE_AI\"",
     "paramIndex" : "first",
     "requiredType" : "\"STRING\"",
-    "sqlExpr" : "\"replace(collate(utf8_binary, unicode_ai), 
collate(utf8_lcase, unicode_ai), abc)\""
+    "sqlExpr" : "\"replace(collate(utf8_binary, unicode_ai), 
collate(utf8_lcase, unicode_ai), 'abc' collate UNICODE_AI)\""
   },
   "queryContext" : [ {
     "objectType" : "",
@@ -1670,7 +1670,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
 -- !query
 select replace(utf8_binary, 'aaAaaAaA', 'abc'), replace(utf8_lcase, 
'aaAaaAaA', 'abc') from t5
 -- !query analysis
-Project [replace(utf8_binary#x, aaAaaAaA, abc) AS replace(utf8_binary, 
aaAaaAaA, abc)#x, replace(utf8_lcase#x, cast(aaAaaAaA as string collate 
UTF8_LCASE), cast(abc as string collate UTF8_LCASE)) AS replace(utf8_lcase, 
aaAaaAaA, abc)#x]
+Project [replace(utf8_binary#x, aaAaaAaA, abc) AS replace(utf8_binary, 
aaAaaAaA, abc)#x, replace(utf8_lcase#x, aaAaaAaA, abc) AS replace(utf8_lcase, 
aaAaaAaA, abc)#x]
 +- SubqueryAlias spark_catalog.default.t5
    +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
 
@@ -1678,7 +1678,7 @@ Project [replace(utf8_binary#x, aaAaaAaA, abc) AS 
replace(utf8_binary, aaAaaAaA,
 -- !query
 select replace(utf8_binary, 'aaAaaAaA' collate utf8_lcase, 'abc'), 
replace(utf8_lcase, 'aaAaaAaA' collate utf8_binary, 'abc') from t5
 -- !query analysis
-Project [replace(cast(utf8_binary#x as string collate UTF8_LCASE), 
collate(aaAaaAaA, utf8_lcase), cast(abc as string collate UTF8_LCASE)) AS 
replace(utf8_binary, collate(aaAaaAaA, utf8_lcase), abc)#x, 
replace(cast(utf8_lcase#x as string), collate(aaAaaAaA, utf8_binary), abc) AS 
replace(utf8_lcase, collate(aaAaaAaA, utf8_binary), abc)#x]
+Project [replace(cast(utf8_binary#x as string collate UTF8_LCASE), 
collate(aaAaaAaA, utf8_lcase), abc) AS replace(utf8_binary, collate(aaAaaAaA, 
utf8_lcase), abc)#x, replace(cast(utf8_lcase#x as string), collate(aaAaaAaA, 
utf8_binary), abc) AS replace(utf8_lcase, collate(aaAaaAaA, utf8_binary), 
abc)#x]
 +- SubqueryAlias spark_catalog.default.t5
    +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
 
@@ -1686,7 +1686,7 @@ Project [replace(cast(utf8_binary#x as string collate 
UTF8_LCASE), collate(aaAaa
 -- !query
 select replace(utf8_binary, 'aaAaaAaA ' collate utf8_lcase_rtrim, 'abc'), 
replace(utf8_lcase, 'aaAaaAaA' collate utf8_binary, 'abc') from t5
 -- !query analysis
-Project [replace(cast(utf8_binary#x as string collate UTF8_LCASE_RTRIM), 
collate(aaAaaAaA , utf8_lcase_rtrim), cast(abc as string collate 
UTF8_LCASE_RTRIM)) AS replace(utf8_binary, collate(aaAaaAaA , 
utf8_lcase_rtrim), abc)#x, replace(cast(utf8_lcase#x as string), 
collate(aaAaaAaA, utf8_binary), abc) AS replace(utf8_lcase, collate(aaAaaAaA, 
utf8_binary), abc)#x]
+Project [replace(cast(utf8_binary#x as string collate UTF8_LCASE_RTRIM), 
collate(aaAaaAaA , utf8_lcase_rtrim), abc) AS replace(utf8_binary, 
collate(aaAaaAaA , utf8_lcase_rtrim), abc)#x, replace(cast(utf8_lcase#x as 
string), collate(aaAaaAaA, utf8_binary), abc) AS replace(utf8_lcase, 
collate(aaAaaAaA, utf8_binary), abc)#x]
 +- SubqueryAlias spark_catalog.default.t5
    +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
 
@@ -1768,7 +1768,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
 -- !query
 select endswith(utf8_binary, 'aaAaaAaA'), endswith(utf8_lcase, 'aaAaaAaA') 
from t5
 -- !query analysis
-Project [EndsWith(utf8_binary#x, aaAaaAaA) AS endswith(utf8_binary, 
aaAaaAaA)#x, EndsWith(utf8_lcase#x, cast(aaAaaAaA as string collate 
UTF8_LCASE)) AS endswith(utf8_lcase, aaAaaAaA)#x]
+Project [EndsWith(utf8_binary#x, aaAaaAaA) AS endswith(utf8_binary, 
aaAaaAaA)#x, EndsWith(utf8_lcase#x, aaAaaAaA) AS endswith(utf8_lcase, 
aaAaaAaA)#x]
 +- SubqueryAlias spark_catalog.default.t5
    +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
 
@@ -2042,7 +2042,7 @@ Project [overlay(collate(utf8_binary#x, utf8_lcase), 
collate(utf8_lcase#x, utf8_
 -- !query
 select overlay(utf8_binary, 'a', 2), overlay(utf8_lcase, 'a', 2) from t5
 -- !query analysis
-Project [overlay(utf8_binary#x, a, 2, -1) AS overlay(utf8_binary, a, 2, -1)#x, 
overlay(utf8_lcase#x, cast(a as string collate UTF8_LCASE), 2, -1) AS 
overlay(utf8_lcase, a, 2, -1)#x]
+Project [overlay(utf8_binary#x, a, 2, -1) AS overlay(utf8_binary, a, 2, -1)#x, 
overlay(utf8_lcase#x, a, 2, -1) AS overlay(utf8_lcase, 'a' collate UTF8_LCASE, 
2, -1)#x]
 +- SubqueryAlias spark_catalog.default.t5
    +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
 
@@ -2204,7 +2204,7 @@ Project [levenshtein(collate(utf8_binary#x, utf8_lcase), 
collate(utf8_lcase#x, u
 -- !query
 select levenshtein(utf8_binary, 'a'), levenshtein(utf8_lcase, 'a') from t5
 -- !query analysis
-Project [levenshtein(utf8_binary#x, a, None) AS levenshtein(utf8_binary, a)#x, 
levenshtein(utf8_lcase#x, cast(a as string collate UTF8_LCASE), None) AS 
levenshtein(utf8_lcase, a)#x]
+Project [levenshtein(utf8_binary#x, a, None) AS levenshtein(utf8_binary, a)#x, 
levenshtein(utf8_lcase#x, a, None) AS levenshtein(utf8_lcase, a)#x]
 +- SubqueryAlias spark_catalog.default.t5
    +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
 
@@ -2390,7 +2390,7 @@ Project [lpad(collate(utf8_binary#x, utf8_binary_rtrim), 
8, collate(utf8_lcase#x
 -- !query
 select rpad(utf8_binary, 8, 'a'), rpad(utf8_lcase, 8, 'a') from t5
 -- !query analysis
-Project [rpad(utf8_binary#x, 8, a) AS rpad(utf8_binary, 8, a)#x, 
rpad(utf8_lcase#x, 8, cast(a as string collate UTF8_LCASE)) AS rpad(utf8_lcase, 
8, a)#x]
+Project [rpad(utf8_binary#x, 8, a) AS rpad(utf8_binary, 8, a)#x, 
rpad(utf8_lcase#x, 8, a) AS rpad(utf8_lcase, 8, a)#x]
 +- SubqueryAlias spark_catalog.default.t5
    +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
 
@@ -2464,7 +2464,7 @@ Project [lpad(collate(utf8_binary#x, utf8_binary_rtrim), 
8, collate(utf8_lcase#x
 -- !query
 select lpad(utf8_binary, 8, 'a'), lpad(utf8_lcase, 8, 'a') from t5
 -- !query analysis
-Project [lpad(utf8_binary#x, 8, a) AS lpad(utf8_binary, 8, a)#x, 
lpad(utf8_lcase#x, 8, cast(a as string collate UTF8_LCASE)) AS lpad(utf8_lcase, 
8, a)#x]
+Project [lpad(utf8_binary#x, 8, a) AS lpad(utf8_binary, 8, a)#x, 
lpad(utf8_lcase#x, 8, a) AS lpad(utf8_lcase, 8, a)#x]
 +- SubqueryAlias spark_catalog.default.t5
    +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
 
@@ -2554,7 +2554,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
 -- !query
 select locate(utf8_binary, 'a'), locate(utf8_lcase, 'a') from t5
 -- !query analysis
-Project [locate(utf8_binary#x, a, 1) AS locate(utf8_binary, a, 1)#x, 
locate(utf8_lcase#x, cast(a as string collate UTF8_LCASE), 1) AS 
locate(utf8_lcase, a, 1)#x]
+Project [locate(utf8_binary#x, a, 1) AS locate(utf8_binary, a, 1)#x, 
locate(utf8_lcase#x, a, 1) AS locate(utf8_lcase, a, 1)#x]
 +- SubqueryAlias spark_catalog.default.t5
    +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
 
@@ -2660,7 +2660,7 @@ Project [trim(collate(utf8_lcase#x, utf8_binary_rtrim), 
Some(collate(utf8_binary
 -- !query
 select TRIM('ABc', utf8_binary), TRIM('ABc', utf8_lcase) from t5
 -- !query analysis
-Project [trim(utf8_binary#x, Some(ABc)) AS TRIM(BOTH ABc FROM utf8_binary)#x, 
trim(utf8_lcase#x, Some(cast(ABc as string collate UTF8_LCASE))) AS TRIM(BOTH 
ABc FROM utf8_lcase)#x]
+Project [trim(utf8_binary#x, Some(ABc)) AS TRIM(BOTH ABc FROM utf8_binary)#x, 
trim(utf8_lcase#x, Some(ABc)) AS TRIM(BOTH ABc FROM utf8_lcase)#x]
 +- SubqueryAlias spark_catalog.default.t5
    +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
 
@@ -2856,7 +2856,7 @@ Project [ltrim(collate(utf8_lcase#x, utf8_binary_rtrim), 
Some(collate(utf8_binar
 -- !query
 select LTRIM('ABc', utf8_binary), LTRIM('ABc', utf8_lcase) from t5
 -- !query analysis
-Project [ltrim(utf8_binary#x, Some(ABc)) AS TRIM(LEADING ABc FROM 
utf8_binary)#x, ltrim(utf8_lcase#x, Some(cast(ABc as string collate 
UTF8_LCASE))) AS TRIM(LEADING ABc FROM utf8_lcase)#x]
+Project [ltrim(utf8_binary#x, Some(ABc)) AS TRIM(LEADING ABc FROM 
utf8_binary)#x, ltrim(utf8_lcase#x, Some(ABc)) AS TRIM(LEADING ABc FROM 
utf8_lcase)#x]
 +- SubqueryAlias spark_catalog.default.t5
    +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
 
@@ -2954,7 +2954,7 @@ Project [rtrim(collate(utf8_lcase#x, utf8_binary_rtrim), 
Some(collate(utf8_binar
 -- !query
 select RTRIM('ABc', utf8_binary), RTRIM('ABc', utf8_lcase) from t5
 -- !query analysis
-Project [rtrim(utf8_binary#x, Some(ABc)) AS TRIM(TRAILING ABc FROM 
utf8_binary)#x, rtrim(utf8_lcase#x, Some(cast(ABc as string collate 
UTF8_LCASE))) AS TRIM(TRAILING ABc FROM utf8_lcase)#x]
+Project [rtrim(utf8_binary#x, Some(ABc)) AS TRIM(TRAILING ABc FROM 
utf8_binary)#x, rtrim(utf8_lcase#x, Some(ABc)) AS TRIM(TRAILING ABc FROM 
utf8_lcase)#x]
 +- SubqueryAlias spark_catalog.default.t5
    +- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
 
diff --git a/sql/core/src/test/resources/sql-tests/results/collations.sql.out 
b/sql/core/src/test/resources/sql-tests/results/collations.sql.out
index f92fc5de8c3f..e96549f00d6e 100644
--- a/sql/core/src/test/resources/sql-tests/results/collations.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/collations.sql.out
@@ -479,7 +479,7 @@ struct<array_except(array(collate(aaa, utf8_lcase)), 
array(collate(AAA, utf8_lca
 -- !query
 select 'a' collate unicode < 'A'
 -- !query schema
-struct<(collate(a, unicode) < A):boolean>
+struct<(collate(a, unicode) < 'A' collate UNICODE):boolean>
 -- !query output
 true
 
@@ -487,7 +487,7 @@ true
 -- !query
 select 'a' collate unicode_ci = 'A'
 -- !query schema
-struct<(collate(a, unicode_ci) = A):boolean>
+struct<(collate(a, unicode_ci) = 'A' collate UNICODE_CI):boolean>
 -- !query output
 true
 
@@ -495,7 +495,7 @@ true
 -- !query
 select 'a' collate unicode_ai = 'å'
 -- !query schema
-struct<(collate(a, unicode_ai) = å):boolean>
+struct<(collate(a, unicode_ai) = 'å' collate UNICODE_AI):boolean>
 -- !query output
 true
 
@@ -503,7 +503,7 @@ true
 -- !query
 select 'a' collate unicode_ci_ai = 'Å'
 -- !query schema
-struct<(collate(a, unicode_ci_ai) = Å):boolean>
+struct<(collate(a, unicode_ci_ai) = 'Å' collate UNICODE_CI_AI):boolean>
 -- !query output
 true
 
@@ -511,7 +511,7 @@ true
 -- !query
 select 'a' collate en < 'A'
 -- !query schema
-struct<(collate(a, en) < A):boolean>
+struct<(collate(a, en) < 'A' collate en):boolean>
 -- !query output
 true
 
@@ -519,7 +519,7 @@ true
 -- !query
 select 'a' collate en_ci = 'A'
 -- !query schema
-struct<(collate(a, en_ci) = A):boolean>
+struct<(collate(a, en_ci) = 'A' collate en_CI):boolean>
 -- !query output
 true
 
@@ -527,7 +527,7 @@ true
 -- !query
 select 'a' collate en_ai = 'å'
 -- !query schema
-struct<(collate(a, en_ai) = å):boolean>
+struct<(collate(a, en_ai) = 'å' collate en_AI):boolean>
 -- !query output
 true
 
@@ -535,7 +535,7 @@ true
 -- !query
 select 'a' collate en_ci_ai = 'Å'
 -- !query schema
-struct<(collate(a, en_ci_ai) = Å):boolean>
+struct<(collate(a, en_ci_ai) = 'Å' collate en_CI_AI):boolean>
 -- !query output
 true
 
@@ -543,7 +543,7 @@ true
 -- !query
 select 'Kypper' collate sv < 'Köpfe'
 -- !query schema
-struct<(collate(Kypper, sv) < Köpfe):boolean>
+struct<(collate(Kypper, sv) < 'Köpfe' collate sv):boolean>
 -- !query output
 true
 
@@ -551,7 +551,7 @@ true
 -- !query
 select 'Kypper' collate de > 'Köpfe'
 -- !query schema
-struct<(collate(Kypper, de) > Köpfe):boolean>
+struct<(collate(Kypper, de) > 'Köpfe' collate de):boolean>
 -- !query output
 true
 
@@ -559,7 +559,7 @@ true
 -- !query
 select 'I' collate tr_ci = 'ı'
 -- !query schema
-struct<(collate(I, tr_ci) = ı):boolean>
+struct<(collate(I, tr_ci) = 'ı' collate tr_CI):boolean>
 -- !query output
 true
 
@@ -1109,7 +1109,7 @@ kitten
 -- !query
 select elt(1, utf8_binary, 'word'), elt(1, utf8_lcase, 'word') from t5
 -- !query schema
-struct<elt(1, utf8_binary, word):string,elt(1, utf8_lcase, word):string 
collate UTF8_LCASE>
+struct<elt(1, utf8_binary, word):string,elt(1, utf8_lcase, 'word' collate 
UTF8_LCASE):string collate UTF8_LCASE>
 -- !query output
 Hello, world! Nice day.        Hello, world! Nice day.
 Something else. Nothing here.  Something else. Nothing here.
@@ -2492,7 +2492,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
     "inputType" : "\"STRING COLLATE UNICODE_AI\"",
     "paramIndex" : "first",
     "requiredType" : "\"STRING\"",
-    "sqlExpr" : "\"replace(collate(utf8_binary, unicode_ai), 
collate(utf8_lcase, unicode_ai), abc)\""
+    "sqlExpr" : "\"replace(collate(utf8_binary, unicode_ai), 
collate(utf8_lcase, unicode_ai), 'abc' collate UNICODE_AI)\""
   },
   "queryContext" : [ {
     "objectType" : "",
@@ -3342,7 +3342,7 @@ ksitTing
 -- !query
 select overlay(utf8_binary, 'a', 2), overlay(utf8_lcase, 'a', 2) from t5
 -- !query schema
-struct<overlay(utf8_binary, a, 2, -1):string,overlay(utf8_lcase, a, 2, 
-1):string collate UTF8_LCASE>
+struct<overlay(utf8_binary, a, 2, -1):string,overlay(utf8_lcase, 'a' collate 
UTF8_LCASE, 2, -1):string collate UTF8_LCASE>
 -- !query output
 Hallo, world! Nice day.        Hallo, world! Nice day.
 Saark  SaL
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala
index 7cafb999ffcf..8d831e4ca166 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala
@@ -450,7 +450,8 @@ class CollationSQLRegexpSuite
         },
         condition = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE",
         parameters = Map(
-          "sqlExpr" -> "\"regexp_replace(collate(ABCDE, UNICODE_CI), .c., FFF, 
1)\"",
+          "sqlExpr" ->
+            """"regexp_replace(collate(ABCDE, UNICODE_CI), .c., 'FFF' collate 
UNICODE_CI, 1)"""",
           "paramIndex" -> "first",
           "inputSql" -> "\"collate(ABCDE, UNICODE_CI)\"",
           "inputType" -> "\"STRING COLLATE UNICODE_CI\"",
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/collation/CollationTypePrecedenceSuite.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/collation/CollationTypePrecedenceSuite.scala
index bb6fce1fb1b6..23d0d4ad8c21 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/collation/CollationTypePrecedenceSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/collation/CollationTypePrecedenceSuite.scala
@@ -21,6 +21,7 @@ import org.apache.spark.SparkThrowable
 import org.apache.spark.sql.{DataFrame, QueryTest, Row}
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SharedSparkSession
+import org.apache.spark.sql.types._
 
 class CollationTypePrecedenceSuite extends QueryTest with SharedSparkSession {
 
@@ -43,6 +44,11 @@ class CollationTypePrecedenceSuite extends QueryTest with 
SharedSparkSession {
   private def assertImplicitMismatch(df: => DataFrame): Unit =
     assertThrowsError(df, "COLLATION_MISMATCH.IMPLICIT")
 
+  private def assertQuerySchema(df: => DataFrame, expectedSchema: DataType): 
Unit = {
+    val querySchema = df.schema.fields.head.dataType
+    assert(DataType.equalsIgnoreNullability(querySchema, expectedSchema))
+  }
+
   test("explicit collation propagates up") {
     checkAnswer(
       sql(s"SELECT COLLATION('a' collate unicode)"),
@@ -382,13 +388,157 @@ class CollationTypePrecedenceSuite extends QueryTest 
with SharedSparkSession {
         s"'name2' collate utf8_lcase, 'value2' collate unicode)"),
       Row(Row("value1", "value2")))
 
-    assertExplicitMismatch(
+    checkAnswer(
       sql(s"SELECT named_struct" +
-        s"('name1' collate unicode, 'value1', 'name2' collate utf8_lcase, 
'value2')"))
+        s"('name1' collate unicode, 'value1', 'name2' collate utf8_lcase, 
'value2')"),
+      Row(Row("value1", "value2")))
 
-    assertExplicitMismatch(
+    checkAnswer(
       sql(s"SELECT named_struct" +
-        s"('name1', 'value1' collate unicode, 'name2', 'value2' collate 
utf8_lcase)"))
+        s"('name1', 'value1' collate unicode, 'name2', 'value2' collate 
utf8_lcase)"),
+      Row(Row("value1", "value2")))
+  }
+
+  test("coercing structs") {
+    assertQuerySchema(
+      sql(s"SELECT array(struct(1, 'a'), struct(2, 'b' collate utf8_lcase))"),
+      ArrayType(
+        StructType(
+          Seq(StructField("col1", IntegerType), StructField("col2", 
StringType("UTF8_LCASE"))))))
+
+    assertQuerySchema(
+      sql(s"SELECT array(struct(1, 'a' collate utf8_lcase), struct(2, 'b' 
collate utf8_lcase))"),
+      ArrayType(
+        StructType(
+          Seq(StructField("col1", IntegerType), StructField("col2", 
StringType("UTF8_LCASE"))))))
+
+    assertExplicitMismatch(
+      sql(s"SELECT array(struct(1, 'a' collate utf8_lcase), struct(2, 'b' 
collate unicode))"))
+
+    assertImplicitMismatch(sql(s"""
+           |SELECT array(struct(1, c1), struct(2, c2))
+           |FROM VALUES ('a' collate unicode, 'b' collate utf8_lcase) AS t(c1, 
c2)
+           |""".stripMargin))
+  }
+
+  test("coercing maps") {
+    assertQuerySchema(
+      sql(s"SELECT map('key1', 'val1', 'key2', 'val2')"),
+      MapType(StringType, StringType))
+
+    assertQuerySchema(
+      sql(s"SELECT map('key1' collate utf8_lcase, 'val1', 'key2', 'val2' 
collate unicode)"),
+      MapType(StringType("UTF8_LCASE"), StringType("UNICODE")))
+
+    assertQuerySchema(
+      sql(s"SELECT ARRAY(map('key1', 'val1'), map('key2' collate UNICODE, 
'val2'))"),
+      ArrayType(MapType(StringType("UNICODE"), StringType)))
+
+    assertExplicitMismatch(
+      sql(s"SELECT map('key1', 'val1' collate utf8_lcase, 'key2', 'val2' 
collate unicode)"))
+  }
+
+  test("maps of structs") {
+    assertQuerySchema(
+      sql(s"SELECT map('key1', struct(1, 'a' collate unicode), 'key2', 
struct(2, 'b'))"),
+      MapType(
+        StringType,
+        StructType(
+          Seq(StructField("col1", IntegerType), StructField("col2", 
StringType("UNICODE"))))))
+
+    checkAnswer(
+      sql(
+        s"SELECT map('key1', struct(1, 'a' collate unicode_ci)," +
+          s"'key2', struct(2, 'b'))['key1'].col2 = 'A'"),
+      Seq(Row(true)))
+  }
+
+  test("coercing arrays") {
+    assertQuerySchema(sql(s"SELECT array('a', 'b')"), ArrayType(StringType))
+
+    assertQuerySchema(
+      sql(s"SELECT array('a' collate utf8_lcase, 'b')"),
+      ArrayType(StringType("UTF8_LCASE")))
+
+    assertQuerySchema(
+      sql(s"SELECT array('a' collate utf8_lcase, 'b' collate utf8_lcase)"),
+      ArrayType(StringType("UTF8_LCASE")))
+
+    assertExplicitMismatch(sql(s"SELECT array('a' collate utf8_lcase, 'b' 
collate unicode)"))
+
+    assertQuerySchema(
+      sql(s"SELECT array(array('a', 'b'), array('c' collate utf8_lcase, 
'd'))"),
+      ArrayType(ArrayType(StringType("UTF8_LCASE"))))
+
+    checkAnswer(
+      sql(s"SELECT array('a', 'b') = array('A' collate utf8_lcase, 'B')"),
+      Seq(Row(true)))
+
+    checkAnswer(
+      sql(s"SELECT array('a', 'b')[0] = array('A' collate utf8_lcase, 
'B')[1]"),
+      Seq(Row(false)))
+
+    assertExplicitMismatch(
+      sql(s"SELECT array('a', 'b' collate unicode) = array('A' collate 
utf8_lcase, 'B')"))
+  }
+
+  test("array of structs") {
+    assertQuerySchema(
+      sql(s"SELECT array(struct(1, 'a' collate unicode), struct(2, 'b'))[0]"),
+      StructType(
+        Seq(StructField("col1", IntegerType), StructField("col2", 
StringType("UNICODE")))))
+
+    checkAnswer(
+      sql(s"SELECT array(struct(1, 'a' collate unicode_ci), struct(2, 
'b'))[0].col2 = 'A'"),
+      Seq(Row(true)))
+  }
+
+  test("coercing deeply nested complex types") {
+    assertQuerySchema(
+      sql(s"""
+           |SELECT struct(
+           |  struct(1, 'nested' collate unicode),
+           |  array(
+           |    struct(1, 'a' collate utf8_lcase),
+           |    struct(2, 'b' collate utf8_lcase)
+           |  )
+           |)
+           |""".stripMargin),
+      StructType(
+        Seq(
+          StructField(
+            "col1",
+            StructType(
+              Seq(StructField("col1", IntegerType), StructField("col2", 
StringType("UNICODE"))))),
+          StructField(
+            "col2",
+            ArrayType(
+              StructType(Seq(
+                StructField("col1", IntegerType),
+                StructField("col2", StringType("UTF8_LCASE")))))))))
+
+    assertQuerySchema(
+      sql(s"""
+           |SELECT struct(
+           |  struct(
+           |    array(
+           |      map('key1' collate utf8_lcase, 'val1',
+           |          'key2', 'val2'),
+           |      map('key3', 'val3' collate unicode)
+           |    )
+           |  ),
+           |  42
+           |)
+           |""".stripMargin),
+      StructType(
+        Seq(
+          StructField(
+            "col1",
+            StructType(
+              Seq(StructField(
+                "col1",
+                ArrayType(MapType(StringType("UTF8_LCASE"), 
StringType("UNICODE"))))))),
+          StructField("col2", IntegerType))))
   }
 
   test("access collated map via literal") {


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to