This is an automated email from the ASF dual-hosted git repository.
maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 5fc6b713874a [SPARK-50405][SQL] Handle collation type coercion of
complex data types properly
5fc6b713874a is described below
commit 5fc6b713874a0617cc8ff4149f01e17c40623528
Author: Stefan Kandic <[email protected]>
AuthorDate: Wed Dec 4 09:29:40 2024 +0100
[SPARK-50405][SQL] Handle collation type coercion of complex data types
properly
### What changes were proposed in this pull request?
This pull request generalizes collation type coercion to support not just
casting all children to a single string type, but also handling complex data
types such as structs, maps, and arrays (arrays were already partially supported).
The core idea is to recursively analyze the entire data type of an
expression, annotating each StringType within it with the highest-priority
collation and its strength. This annotation propagates upward through the
expression tree. Once the root of the expression is reached, the annotations
are removed, and the expression is cast to the desired data type.
For the root expression `e`, the collation data type context is computed by
first calculating the context for all its children and then merging those
results into the data type of `e`.
### Why are the changes needed?
In #48663, a new approach to calculating collation precedence was
introduced. This approach recursively examines the children of an expression
and propagates the collation with the highest priority upward.
However, the current implementation of collation coercion is limited to
determining the StringType that all children should be cast to. That strategy
falls short when dealing with complex types like structs, maps, and arrays,
which can also contain collations. To address this limitation, we need a more
general mechanism that allows coercion of any data type, not just simple
strings.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
With new unit tests.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #48936 from stefankandic/collationCoercionComplex.
Lead-authored-by: Stefan Kandic <[email protected]>
Co-authored-by: Stefan Kandic
<[email protected]>
Signed-off-by: Max Gekk <[email protected]>
---
.../catalyst/analysis/CollationTypeCoercion.scala | 350 +++++++++++++--------
.../sql-tests/analyzer-results/collations.sql.out | 78 ++---
.../resources/sql-tests/results/collations.sql.out | 28 +-
.../apache/spark/sql/CollationSQLRegexpSuite.scala | 3 +-
.../collation/CollationTypePrecedenceSuite.scala | 158 +++++++++-
5 files changed, 433 insertions(+), 184 deletions(-)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCoercion.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCoercion.scala
index cca1d21df3a7..02640aba2d28 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCoercion.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCoercion.scala
@@ -17,14 +17,12 @@
package org.apache.spark.sql.catalyst.analysis
-import scala.annotation.tailrec
-
import org.apache.spark.sql.catalyst.analysis.CollationStrength.{Default,
Explicit, Implicit}
-import org.apache.spark.sql.catalyst.analysis.TypeCoercion.{hasStringType,
haveSameType}
+import org.apache.spark.sql.catalyst.analysis.TypeCoercion.haveSameType
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.trees.TreeNodeTag
import org.apache.spark.sql.errors.QueryCompilationErrors
-import org.apache.spark.sql.types.{ArrayType, DataType, StringType}
+import org.apache.spark.sql.types.{ArrayType, DataType, MapType, StringType,
StructType}
import org.apache.spark.sql.util.SchemaUtils
/**
@@ -32,7 +30,7 @@ import org.apache.spark.sql.util.SchemaUtils
*/
object CollationTypeCoercion {
- private val COLLATION_CONTEXT_TAG = new
TreeNodeTag[CollationContext]("collationContext")
+ private val COLLATION_CONTEXT_TAG = new
TreeNodeTag[DataType]("collationContext")
private def hasCollationContextTag(expr: Expression): Boolean = {
expr.getTagValue(COLLATION_CONTEXT_TAG).isDefined
@@ -53,10 +51,10 @@ object CollationTypeCoercion {
outputStringType match {
case Some(st) =>
val newBranches = caseWhenExpr.branches.map { case (condition,
value) =>
- (condition, castStringType(value, st))
+ (condition, changeType(value, st))
}
val newElseValue =
- caseWhenExpr.elseValue.map(e => castStringType(e, st))
+ caseWhenExpr.elseValue.map(e => changeType(e, st))
CaseWhen(newBranches, newElseValue)
case _ =>
@@ -105,11 +103,9 @@ object CollationTypeCoercion {
val newValues = collateToSingleType(mapCreate.values)
mapCreate.withNewChildren(newKeys.zip(newValues).flatMap(pair =>
Seq(pair._1, pair._2)))
- case namedStruct: CreateNamedStruct if namedStruct.children.size % 2 == 0
=>
- val newNames = collateToSingleType(namedStruct.nameExprs)
- val newValues = collateToSingleType(namedStruct.valExprs)
- val interleaved = newNames.zip(newValues).flatMap(pair => Seq(pair._1,
pair._2))
- namedStruct.withNewChildren(interleaved)
+ case namedStruct: CreateNamedStruct =>
+ // since each child is separate we should not coerce them at all
+ namedStruct
case splitPart: SplitPart =>
val Seq(str, delimiter, partNum) = splitPart.children
@@ -164,31 +160,89 @@ object CollationTypeCoercion {
}
/**
- * Extracts StringTypes from filtered hasStringType
+ * Changes the data type of the expression to the given `newType`.
*/
- @tailrec
- private def extractStringType(dt: DataType): Option[StringType] = dt match {
- case st: StringType => Some(st)
- case ArrayType(et, _) => extractStringType(et)
- case _ => None
+ private def changeType(expr: Expression, newType: DataType): Expression = {
+ mergeTypes(expr.dataType, newType) match {
+ case Some(newDataType) if newDataType != expr.dataType =>
+
assert(!newDataType.existsRecursively(_.isInstanceOf[StringTypeWithContext]))
+
+ val exprWithNewType = expr match {
+ case lit: Literal => lit.copy(dataType = newDataType)
+ case cast: Cast => cast.copy(dataType = newDataType)
+ case _ => Cast(expr, newDataType)
+ }
+
+ // also copy the collation context tag
+ if (hasCollationContextTag(expr)) {
+ exprWithNewType.setTagValue(
+ COLLATION_CONTEXT_TAG, expr.getTagValue(COLLATION_CONTEXT_TAG).get)
+ }
+ exprWithNewType
+
+ case _ =>
+ expr
+ }
}
/**
- * Casts given expression to collated StringType with id equal to
collationId only
- * if expression has StringType in the first place.
+ * If possible, returns the new data type from `inType` by applying
+ * the collation of `castType`.
*/
- def castStringType(expr: Expression, st: StringType): Expression = {
- castStringType(expr.dataType, st)
- .map(dt => Cast(expr, dt))
- .getOrElse(expr)
+ private def mergeTypes(inType: DataType, castType: DataType):
Option[DataType] = {
+ val outType = mergeStructurally(inType, castType) {
+ case (_: StringType, right: StringTypeWithContext) =>
+ right.stringType
+ }
+
+ outType
}
- private def castStringType(inType: DataType, castType: StringType):
Option[DataType] = {
- inType match {
- case st: StringType if st.collationId != castType.collationId =>
- Some(castType)
- case ArrayType(arrType, nullable) =>
- castStringType(arrType, castType).map(ArrayType(_, nullable))
+ /**
+ * Merges two data types structurally according to the given base case.
+ */
+ private def mergeStructurally(
+ leftType: DataType,
+ rightType: DataType)
+ (baseCase: PartialFunction[(DataType, DataType), DataType]):
Option[DataType] = {
+ (leftType, rightType) match {
+
+ // handle the base cases first
+ case _ if baseCase.isDefinedAt((leftType, rightType)) =>
+ Option(baseCase(leftType, rightType))
+
+ case _ if leftType == rightType =>
+ Some(leftType)
+
+ case (ArrayType(leftElemType, nullable), ArrayType(rightElemType, _)) =>
+ mergeStructurally(leftElemType,
rightElemType)(baseCase).map(ArrayType(_, nullable))
+
+ case (MapType(leftKey, leftValue, nullable), MapType(rightKey,
rightValue, _)) =>
+ for {
+ newKeyType <- mergeStructurally(leftKey, rightKey)(baseCase)
+ newValueType <- mergeStructurally(leftValue, rightValue)(baseCase)
+ } yield MapType(newKeyType, newValueType, nullable)
+
+ case (ArrayType(elementType, nullable), right) =>
+ mergeStructurally(elementType, right)(baseCase).map(ArrayType(_,
nullable))
+
+ case (left, ArrayType(elementType, _)) =>
+ mergeStructurally(left, elementType)(baseCase)
+
+ case (StructType(leftFields), StructType(rightFields)) =>
+ if (leftFields.length != rightFields.length) {
+ return None
+ }
+ val newFields = leftFields.zip(rightFields).map {
+ case (leftField, rightField) =>
+ val newType = mergeStructurally(leftField.dataType,
rightField.dataType)(baseCase)
+ if (newType.isEmpty) {
+ return None
+ }
+ leftField.copy(dataType = newType.get)
+ }
+ Some(StructType(newFields))
+
case _ => None
}
}
@@ -201,7 +255,7 @@ object CollationTypeCoercion {
lctOpt match {
case Some(lct) =>
- expressions.map(e => castStringType(e, lct))
+ expressions.map(e => changeType(e, lct))
case _ =>
expressions
}
@@ -210,7 +264,7 @@ object CollationTypeCoercion {
/**
* Tries to find the least common StringType among the given expressions.
*/
- private def findLeastCommonStringType(expressions: Seq[Expression]):
Option[StringType] = {
+ private def findLeastCommonStringType(expressions: Seq[Expression]):
Option[DataType] = {
if (!expressions.exists(e =>
SchemaUtils.hasNonUTF8BinaryCollation(e.dataType))) {
// if there are no collated types we don't need to do anything
return None
@@ -223,62 +277,70 @@ object CollationTypeCoercion {
val collationContextWinner =
expressions.foldLeft(findCollationContext(expressions.head)) {
case (Some(left), right) =>
findCollationContext(right).flatMap { ctx =>
- collationPrecedenceWinner(left, ctx)
+ mergeWinner(left, ctx)
}
- case (None, _) => return None
- }
-
- collationContextWinner.flatMap { cc =>
- extractStringType(cc.dataType)
+ case (None, _) => None
}
+ collationContextWinner
}
/**
- * Tries to find the collation context for the given expression.
+ * Tries to find the data type with the collation context for the given
expression.
* If found, it will also set the [[COLLATION_CONTEXT_TAG]] on the
expression,
* so that the context can be reused later.
*/
- private def findCollationContext(expr: Expression): Option[CollationContext]
= {
+ private def findCollationContext(expr: Expression): Option[DataType] = {
val contextOpt = expr match {
- case _ if hasCollationContextTag(expr) =>
- Some(expr.getTagValue(COLLATION_CONTEXT_TAG).get)
-
- // if `expr` doesn't have a string in its dataType then it doesn't
- // have the collation context either
- case _ if !expr.dataType.existsRecursively(_.isInstanceOf[StringType]) =>
- None
- case collate: Collate =>
- Some(CollationContext(collate.dataType, Explicit))
+ case _ if collationStrengthBaseCases.isDefinedAt(expr) =>
+ collationStrengthBaseCases(expr)
- case _: Alias | _: SubqueryExpression | _: AttributeReference | _:
VariableReference =>
- Some(CollationContext(expr.dataType, Implicit))
+ case getStruct: GetStructField =>
+ val childContext = findCollationContext(getStruct.child)
+ childContext match {
+ case Some(struct: StructType) =>
+ val field = struct.fields(getStruct.ordinal)
+ Some(field.dataType)
+ case _ => None
+ }
- case _: Literal =>
- Some(CollationContext(expr.dataType, Default))
+ case getMapValue: GetMapValue =>
+ findCollationContext(getMapValue.child) match {
+ case Some(MapType(_, valueType, _)) =>
+ mergeWinner(getMapValue.dataType, valueType)
+ case _ =>
+ None
+ }
- // if it does have a string type but none of its children do
- // then the collation context strength is default
- case _ if
!expr.children.exists(_.dataType.existsRecursively(_.isInstanceOf[StringType]))
=>
- Some(CollationContext(expr.dataType, Default))
+ case struct: CreateNamedStruct =>
+ val childrenContexts = struct.valExprs.map(findCollationContext)
+ if (childrenContexts.isEmpty) {
+ return None
+ }
+ val newFields = struct.dataType.fields.zip(childrenContexts).map {
+ case (field, Some(context)) =>
+ field.copy(dataType = context)
+ case (field, None) => field
+ }
+ Some(StructType(newFields))
- case _ =>
- val contextWinnerOpt = getContextRelevantChildren(expr)
- .flatMap(findCollationContext)
- .foldLeft(Option.empty[CollationContext]) {
- case (Some(left), right) =>
- collationPrecedenceWinner(left, right)
- case (None, right) =>
- Some(right)
- }
+ case map: CreateMap =>
+ val keyContexts = map.keys.flatMap(findCollationContext)
+ val valueContexts = map.values.flatMap(findCollationContext)
+ if (keyContexts.length + valueContexts.length != map.children.length) {
+ return None
+ }
- contextWinnerOpt.map { context =>
- if (hasStringType(expr.dataType)) {
- CollationContext(expr.dataType, context.strength)
- } else {
- context
- }
+ val keyContextWinner = mergeWinners(map.dataType.keyType, keyContexts)
+ val valueContextWinner = mergeWinners(map.dataType.valueType,
valueContexts)
+ if (keyContextWinner.isEmpty || valueContextWinner.isEmpty) {
+ return None
}
+ Some(MapType(keyContextWinner.get, valueContextWinner.get))
+
+ case _ =>
+ val childContexts = expr.children.flatMap(findCollationContext)
+ mergeWinners(expr.dataType, childContexts)
}
contextOpt.foreach(expr.setTagValue(COLLATION_CONTEXT_TAG, _))
@@ -286,69 +348,91 @@ object CollationTypeCoercion {
}
/**
- * Returns the children of the given expression that should be used for
calculating the
- * winning collation context.
+ * Base cases for determining the strength of the collation.
*/
- private def getContextRelevantChildren(expression: Expression):
Seq[Expression] = {
- expression match {
- // collation context for named struct should be calculated based on its
values only
- case createStruct: CreateNamedStruct =>
- createStruct.valExprs
+ private def collationStrengthBaseCases: PartialFunction[Expression,
Option[DataType]] = {
+ case expr if hasCollationContextTag(expr) =>
+ Some(expr.getTagValue(COLLATION_CONTEXT_TAG).get)
- // collation context does not depend on the key for extracting the value
- case extract: ExtractValue =>
- Seq(extract.child)
+ // if `expr` doesn't have a string in its dataType then it doesn't
+ // have the collation context either
+ case expr if !expr.dataType.existsRecursively(_.isInstanceOf[StringType])
=>
+ None
- // we currently don't support collation precedence for maps,
- // as this would involve calculating them for keys and values separately
- case _: CreateMap =>
- Seq.empty
+ case collate: Collate =>
+ Some(addContextToStringType(collate.dataType, Explicit))
- case _ =>
- expression.children
+ case expr @ (_: Alias | _: SubqueryExpression | _: AttributeReference | _:
VariableReference) =>
+ Some(addContextToStringType(expr.dataType, Implicit))
+
+ case lit: Literal =>
+ Some(addContextToStringType(lit.dataType, Default))
+
+ // if it does have a string type but none of its children do
+ // then the collation context strength is default
+ case expr if
!expr.children.exists(_.dataType.existsRecursively(_.isInstanceOf[StringType]))
=>
+ Some(addContextToStringType(expr.dataType, Default))
+ }
+
+ /**
+ * Adds collation context to the given string type so we can know its
strength.
+ */
+ private def addContextToStringType(dt: DataType, strength:
CollationStrength): DataType = {
+ dt.transformRecursively {
+ case st: StringType => StringTypeWithContext(st, strength)
}
}
/**
- * Returns the collation context that wins in precedence between left and
right.
+ * Merges multiple data types structurally according to strength of the
collations into the
+ * data type of the `start`.
+ *
+ * If any of the data types cannot be merged, it returns None.
*/
- private def collationPrecedenceWinner(
- left: CollationContext,
- right: CollationContext): Option[CollationContext] = {
-
- val (leftStringType, rightStringType) =
- (extractStringType(left.dataType), extractStringType(right.dataType))
match {
- case (Some(l), Some(r)) =>
- (l, r)
- case (None, None) =>
- return None
- case (Some(_), None) =>
- return Some(left)
- case (None, Some(_)) =>
- return Some(right)
- }
+ private def mergeWinners(start: DataType, rest: Seq[DataType]):
Option[DataType] = {
+ rest.foldLeft(Option(start)) {
+ case (Some(acc), childContext) =>
+ mergeWinner(acc, childContext)
+ case (None, _) =>
+ None
+ }
+ }
- (left.strength, right.strength) match {
- case (Explicit, Explicit) if leftStringType != rightStringType =>
- throw QueryCompilationErrors.explicitCollationMismatchError(
- Seq(leftStringType, rightStringType))
+ /**
+ * Merges two data types structurally according to strength of the
collations.
+ */
+ private def mergeWinner(left: DataType, right: DataType): Option[DataType] =
{
+ mergeStructurally(left, right) {
+ case (left: StringTypeWithContext, right: StringTypeWithContext) =>
+ getWinningStringType(left, right)
- case (Explicit, _) => Some(left)
- case (_, Explicit) => Some(right)
+ case (_: StringType, right: StringTypeWithContext) =>
+ right
+ }
+ }
- case (Implicit, Implicit) if leftStringType != rightStringType =>
+ /** Determines the winning StringTypeWithContext based on the strength of
the collation. */
+ private def getWinningStringType(
+ left: StringTypeWithContext,
+ right: StringTypeWithContext): StringTypeWithContext = {
+ def handleMismatch(): Nothing = {
+ if (left.strength == Explicit) {
+ throw QueryCompilationErrors.explicitCollationMismatchError(
+ Seq(left.stringType, right.stringType))
+ } else {
throw QueryCompilationErrors.implicitCollationMismatchError(
- Seq(leftStringType, rightStringType))
-
- case (Implicit, _) => Some(left)
- case (_, Implicit) => Some(right)
+ Seq(left.stringType, right.stringType))
+ }
+ }
- case (Default, Default) if leftStringType != rightStringType =>
- throw QueryCompilationErrors.implicitCollationMismatchError(
- Seq(leftStringType, rightStringType))
+ (left.strength.priority, right.strength.priority) match {
+ case (leftPriority, rightPriority) if leftPriority == rightPriority =>
+ if (left.sameType(right)) left
+ else handleMismatch()
- case _ =>
- Some(left)
+ case (leftPriority, rightPriority) =>
+ if (leftPriority < rightPriority) left
+ else right
}
}
}
@@ -356,18 +440,32 @@ object CollationTypeCoercion {
/**
* Represents the strength of collation used for determining precedence in
collation resolution.
*/
-private sealed trait CollationStrength {}
+private sealed trait CollationStrength {
+ val priority: Int
+}
private object CollationStrength {
- case object Explicit extends CollationStrength {}
- case object Implicit extends CollationStrength {}
- case object Default extends CollationStrength {}
+ case object Explicit extends CollationStrength {
+ override val priority: Int = 0
+ }
+ case object Implicit extends CollationStrength {
+ override val priority: Int = 1
+ }
+ case object Default extends CollationStrength {
+ override val priority: Int = 2
+ }
}
/**
* Encapsulates the context for collation, including data type and strength.
*
- * @param dataType The data type associated with this collation context.
+ * @param stringType StringType.
* @param strength The strength level of the collation, which determines its
precedence.
*/
-private case class CollationContext(dataType: DataType, strength:
CollationStrength) {}
+private case class StringTypeWithContext(stringType: StringType, strength:
CollationStrength)
+ extends DataType {
+
+ override def defaultSize: Int = stringType.defaultSize
+
+ override private[spark] def asNullable: DataType = this
+}
diff --git
a/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out
b/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out
index 45ab1cdcff79..0d5c414416d4 100644
--- a/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out
+++ b/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out
@@ -442,77 +442,77 @@ Project [array_except(array(collate(aaa, utf8_lcase)),
array(collate(AAA, utf8_l
-- !query
select 'a' collate unicode < 'A'
-- !query analysis
-Project [(collate(a, unicode) < cast(A as string collate UNICODE)) AS
(collate(a, unicode) < A)#x]
+Project [(collate(a, unicode) < A) AS (collate(a, unicode) < 'A' collate
UNICODE)#x]
+- OneRowRelation
-- !query
select 'a' collate unicode_ci = 'A'
-- !query analysis
-Project [(collate(a, unicode_ci) = cast(A as string collate UNICODE_CI)) AS
(collate(a, unicode_ci) = A)#x]
+Project [(collate(a, unicode_ci) = A) AS (collate(a, unicode_ci) = 'A' collate
UNICODE_CI)#x]
+- OneRowRelation
-- !query
select 'a' collate unicode_ai = 'å'
-- !query analysis
-Project [(collate(a, unicode_ai) = cast(å as string collate UNICODE_AI)) AS
(collate(a, unicode_ai) = å)#x]
+Project [(collate(a, unicode_ai) = å) AS (collate(a, unicode_ai) = 'å' collate
UNICODE_AI)#x]
+- OneRowRelation
-- !query
select 'a' collate unicode_ci_ai = 'Å'
-- !query analysis
-Project [(collate(a, unicode_ci_ai) = cast(Å as string collate UNICODE_CI_AI))
AS (collate(a, unicode_ci_ai) = Å)#x]
+Project [(collate(a, unicode_ci_ai) = Å) AS (collate(a, unicode_ci_ai) = 'Å'
collate UNICODE_CI_AI)#x]
+- OneRowRelation
-- !query
select 'a' collate en < 'A'
-- !query analysis
-Project [(collate(a, en) < cast(A as string collate en)) AS (collate(a, en) <
A)#x]
+Project [(collate(a, en) < A) AS (collate(a, en) < 'A' collate en)#x]
+- OneRowRelation
-- !query
select 'a' collate en_ci = 'A'
-- !query analysis
-Project [(collate(a, en_ci) = cast(A as string collate en_CI)) AS (collate(a,
en_ci) = A)#x]
+Project [(collate(a, en_ci) = A) AS (collate(a, en_ci) = 'A' collate en_CI)#x]
+- OneRowRelation
-- !query
select 'a' collate en_ai = 'å'
-- !query analysis
-Project [(collate(a, en_ai) = cast(å as string collate en_AI)) AS (collate(a,
en_ai) = å)#x]
+Project [(collate(a, en_ai) = å) AS (collate(a, en_ai) = 'å' collate en_AI)#x]
+- OneRowRelation
-- !query
select 'a' collate en_ci_ai = 'Å'
-- !query analysis
-Project [(collate(a, en_ci_ai) = cast(Å as string collate en_CI_AI)) AS
(collate(a, en_ci_ai) = Å)#x]
+Project [(collate(a, en_ci_ai) = Å) AS (collate(a, en_ci_ai) = 'Å' collate
en_CI_AI)#x]
+- OneRowRelation
-- !query
select 'Kypper' collate sv < 'Köpfe'
-- !query analysis
-Project [(collate(Kypper, sv) < cast(Köpfe as string collate sv)) AS
(collate(Kypper, sv) < Köpfe)#x]
+Project [(collate(Kypper, sv) < Köpfe) AS (collate(Kypper, sv) < 'Köpfe'
collate sv)#x]
+- OneRowRelation
-- !query
select 'Kypper' collate de > 'Köpfe'
-- !query analysis
-Project [(collate(Kypper, de) > cast(Köpfe as string collate de)) AS
(collate(Kypper, de) > Köpfe)#x]
+Project [(collate(Kypper, de) > Köpfe) AS (collate(Kypper, de) > 'Köpfe'
collate de)#x]
+- OneRowRelation
-- !query
select 'I' collate tr_ci = 'ı'
-- !query analysis
-Project [(collate(I, tr_ci) = cast(ı as string collate tr_CI)) AS (collate(I,
tr_ci) = ı)#x]
+Project [(collate(I, tr_ci) = ı) AS (collate(I, tr_ci) = 'ı' collate tr_CI)#x]
+- OneRowRelation
@@ -826,7 +826,7 @@ InsertIntoHadoopFsRelationCommand file:[not included in
comparison]/{warehouse_d
-- !query
select concat_ws(' ', utf8_lcase, utf8_lcase) from t5
-- !query analysis
-Project [concat_ws(cast( as string collate UTF8_LCASE), utf8_lcase#x,
utf8_lcase#x) AS concat_ws( , utf8_lcase, utf8_lcase)#x]
+Project [concat_ws( , utf8_lcase#x, utf8_lcase#x) AS concat_ws( , utf8_lcase,
utf8_lcase)#x]
+- SubqueryAlias spark_catalog.default.t5
+- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
@@ -868,7 +868,7 @@ Project [concat_ws(collate( , utf8_lcase),
cast(utf8_binary#x as string collate
-- !query
select concat_ws(',', utf8_lcase, 'word'), concat_ws(',', utf8_binary, 'word')
from t5
-- !query analysis
-Project [concat_ws(cast(, as string collate UTF8_LCASE), utf8_lcase#x,
cast(word as string collate UTF8_LCASE)) AS concat_ws(,, utf8_lcase, word)#x,
concat_ws(,, utf8_binary#x, word) AS concat_ws(,, utf8_binary, word)#x]
+Project [concat_ws(,, utf8_lcase#x, word) AS concat_ws(,, utf8_lcase, word)#x,
concat_ws(,, utf8_binary#x, word) AS concat_ws(,, utf8_binary, word)#x]
+- SubqueryAlias spark_catalog.default.t5
+- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
@@ -876,7 +876,7 @@ Project [concat_ws(cast(, as string collate UTF8_LCASE),
utf8_lcase#x, cast(word
-- !query
select concat_ws(',', utf8_lcase, 'word' collate utf8_binary), concat_ws(',',
utf8_binary, 'word' collate utf8_lcase) from t5
-- !query analysis
-Project [concat_ws(,, cast(utf8_lcase#x as string), collate(word,
utf8_binary)) AS concat_ws(,, utf8_lcase, collate(word, utf8_binary))#x,
concat_ws(cast(, as string collate UTF8_LCASE), cast(utf8_binary#x as string
collate UTF8_LCASE), collate(word, utf8_lcase)) AS concat_ws(,, utf8_binary,
collate(word, utf8_lcase))#x]
+Project [concat_ws(,, cast(utf8_lcase#x as string), collate(word,
utf8_binary)) AS concat_ws(,, utf8_lcase, collate(word, utf8_binary))#x,
concat_ws(,, cast(utf8_binary#x as string collate UTF8_LCASE), collate(word,
utf8_lcase)) AS concat_ws(,, utf8_binary, collate(word, utf8_lcase))#x]
+- SubqueryAlias spark_catalog.default.t5
+- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
@@ -934,7 +934,7 @@ Project [elt(1, collate(utf8_binary#x, utf8_binary),
cast(utf8_lcase#x as string
-- !query
select elt(1, utf8_binary, 'word'), elt(1, utf8_lcase, 'word') from t5
-- !query analysis
-Project [elt(1, utf8_binary#x, word, true) AS elt(1, utf8_binary, word)#x,
elt(1, utf8_lcase#x, cast(word as string collate UTF8_LCASE), true) AS elt(1,
utf8_lcase, word)#x]
+Project [elt(1, utf8_binary#x, word, true) AS elt(1, utf8_binary, word)#x,
elt(1, utf8_lcase#x, word, true) AS elt(1, utf8_lcase, 'word' collate
UTF8_LCASE)#x]
+- SubqueryAlias spark_catalog.default.t5
+- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
@@ -1024,7 +1024,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
-- !query
select split_part(utf8_binary, 'a', 3), split_part(utf8_lcase, 'a', 3) from t5
-- !query analysis
-Project [split_part(utf8_binary#x, a, 3) AS split_part(utf8_binary, a, 3)#x,
split_part(utf8_lcase#x, cast(a as string collate UTF8_LCASE), 3) AS
split_part(utf8_lcase, a, 3)#x]
+Project [split_part(utf8_binary#x, a, 3) AS split_part(utf8_binary, a, 3)#x,
split_part(utf8_lcase#x, a, 3) AS split_part(utf8_lcase, a, 3)#x]
+- SubqueryAlias spark_catalog.default.t5
+- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
@@ -1122,7 +1122,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
-- !query
select contains(utf8_binary, 'a'), contains(utf8_lcase, 'a') from t5
-- !query analysis
-Project [Contains(utf8_binary#x, a) AS contains(utf8_binary, a)#x,
Contains(utf8_lcase#x, cast(a as string collate UTF8_LCASE)) AS
contains(utf8_lcase, a)#x]
+Project [Contains(utf8_binary#x, a) AS contains(utf8_binary, a)#x,
Contains(utf8_lcase#x, a) AS contains(utf8_lcase, a)#x]
+- SubqueryAlias spark_catalog.default.t5
+- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
@@ -1220,7 +1220,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
-- !query
select substring_index(utf8_binary, 'a', 2), substring_index(utf8_lcase, 'a',
2) from t5
-- !query analysis
-Project [substring_index(utf8_binary#x, a, 2) AS substring_index(utf8_binary,
a, 2)#x, substring_index(utf8_lcase#x, cast(a as string collate UTF8_LCASE), 2)
AS substring_index(utf8_lcase, a, 2)#x]
+Project [substring_index(utf8_binary#x, a, 2) AS substring_index(utf8_binary,
a, 2)#x, substring_index(utf8_lcase#x, a, 2) AS substring_index(utf8_lcase, a,
2)#x]
+- SubqueryAlias spark_catalog.default.t5
+- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
@@ -1318,7 +1318,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
-- !query
select instr(utf8_binary, 'a'), instr(utf8_lcase, 'a') from t5
-- !query analysis
-Project [instr(utf8_binary#x, a) AS instr(utf8_binary, a)#x,
instr(utf8_lcase#x, cast(a as string collate UTF8_LCASE)) AS instr(utf8_lcase,
a)#x]
+Project [instr(utf8_binary#x, a) AS instr(utf8_binary, a)#x,
instr(utf8_lcase#x, a) AS instr(utf8_lcase, a)#x]
+- SubqueryAlias spark_catalog.default.t5
+- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
@@ -1384,7 +1384,7 @@ Project [find_in_set(collate(utf8_binary#x, utf8_lcase),
collate(utf8_lcase#x, u
-- !query
select find_in_set(utf8_binary, 'aaAaaAaA,i̇o'), find_in_set(utf8_lcase,
'aaAaaAaA,i̇o') from t5
-- !query analysis
-Project [find_in_set(utf8_binary#x, aaAaaAaA,i̇o) AS find_in_set(utf8_binary,
aaAaaAaA,i̇o)#x, find_in_set(utf8_lcase#x, cast(aaAaaAaA,i̇o as string collate
UTF8_LCASE)) AS find_in_set(utf8_lcase, aaAaaAaA,i̇o)#x]
+Project [find_in_set(utf8_binary#x, aaAaaAaA,i̇o) AS find_in_set(utf8_binary,
aaAaaAaA,i̇o)#x, find_in_set(utf8_lcase#x, aaAaaAaA,i̇o) AS
find_in_set(utf8_lcase, aaAaaAaA,i̇o)#x]
+- SubqueryAlias spark_catalog.default.t5
+- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
@@ -1482,7 +1482,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
-- !query
select startswith(utf8_binary, 'aaAaaAaA'), startswith(utf8_lcase, 'aaAaaAaA')
from t5
-- !query analysis
-Project [StartsWith(utf8_binary#x, aaAaaAaA) AS startswith(utf8_binary,
aaAaaAaA)#x, StartsWith(utf8_lcase#x, cast(aaAaaAaA as string collate
UTF8_LCASE)) AS startswith(utf8_lcase, aaAaaAaA)#x]
+Project [StartsWith(utf8_binary#x, aaAaaAaA) AS startswith(utf8_binary,
aaAaaAaA)#x, StartsWith(utf8_lcase#x, aaAaaAaA) AS startswith(utf8_lcase,
aaAaaAaA)#x]
+- SubqueryAlias spark_catalog.default.t5
+- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
@@ -1506,7 +1506,7 @@ Project [StartsWith(cast(utf8_binary#x as string collate
UTF8_LCASE_RTRIM), coll
-- !query
select translate(utf8_lcase, utf8_lcase, '12345') from t5
-- !query analysis
-Project [translate(utf8_lcase#x, utf8_lcase#x, cast(12345 as string collate
UTF8_LCASE)) AS translate(utf8_lcase, utf8_lcase, 12345)#x]
+Project [translate(utf8_lcase#x, utf8_lcase#x, 12345) AS translate(utf8_lcase,
utf8_lcase, 12345)#x]
+- SubqueryAlias spark_catalog.default.t5
+- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
@@ -1572,7 +1572,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
-- !query
select translate(utf8_lcase, 'aaAaaAaA', '12345'), translate(utf8_binary,
'aaAaaAaA', '12345') from t5
-- !query analysis
-Project [translate(utf8_lcase#x, cast(aaAaaAaA as string collate UTF8_LCASE),
cast(12345 as string collate UTF8_LCASE)) AS translate(utf8_lcase, aaAaaAaA,
12345)#x, translate(utf8_binary#x, aaAaaAaA, 12345) AS translate(utf8_binary,
aaAaaAaA, 12345)#x]
+Project [translate(utf8_lcase#x, aaAaaAaA, 12345) AS translate(utf8_lcase,
aaAaaAaA, 12345)#x, translate(utf8_binary#x, aaAaaAaA, 12345) AS
translate(utf8_binary, aaAaaAaA, 12345)#x]
+- SubqueryAlias spark_catalog.default.t5
+- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
@@ -1580,7 +1580,7 @@ Project [translate(utf8_lcase#x, cast(aaAaaAaA as string
collate UTF8_LCASE), ca
-- !query
select translate(utf8_lcase, 'aBc' collate utf8_binary, '12345'),
translate(utf8_binary, 'aBc' collate utf8_lcase, '12345') from t5
-- !query analysis
-Project [translate(cast(utf8_lcase#x as string), collate(aBc, utf8_binary),
12345) AS translate(utf8_lcase, collate(aBc, utf8_binary), 12345)#x,
translate(cast(utf8_binary#x as string collate UTF8_LCASE), collate(aBc,
utf8_lcase), cast(12345 as string collate UTF8_LCASE)) AS
translate(utf8_binary, collate(aBc, utf8_lcase), 12345)#x]
+Project [translate(cast(utf8_lcase#x as string), collate(aBc, utf8_binary),
12345) AS translate(utf8_lcase, collate(aBc, utf8_binary), 12345)#x,
translate(cast(utf8_binary#x as string collate UTF8_LCASE), collate(aBc,
utf8_lcase), 12345) AS translate(utf8_binary, collate(aBc, utf8_lcase),
12345)#x]
+- SubqueryAlias spark_catalog.default.t5
+- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
@@ -1588,7 +1588,7 @@ Project [translate(cast(utf8_lcase#x as string),
collate(aBc, utf8_binary), 1234
-- !query
select translate(utf8_lcase, 'aBc ' collate utf8_binary_rtrim, '12345'),
translate(utf8_binary, 'aBc' collate utf8_lcase, '12345') from t5
-- !query analysis
-Project [translate(cast(utf8_lcase#x as string collate UTF8_BINARY_RTRIM),
collate(aBc , utf8_binary_rtrim), cast(12345 as string collate
UTF8_BINARY_RTRIM)) AS translate(utf8_lcase, collate(aBc , utf8_binary_rtrim),
12345)#x, translate(cast(utf8_binary#x as string collate UTF8_LCASE),
collate(aBc, utf8_lcase), cast(12345 as string collate UTF8_LCASE)) AS
translate(utf8_binary, collate(aBc, utf8_lcase), 12345)#x]
+Project [translate(cast(utf8_lcase#x as string collate UTF8_BINARY_RTRIM),
collate(aBc , utf8_binary_rtrim), 12345) AS translate(utf8_lcase, collate(aBc ,
utf8_binary_rtrim), 12345)#x, translate(cast(utf8_binary#x as string collate
UTF8_LCASE), collate(aBc, utf8_lcase), 12345) AS translate(utf8_binary,
collate(aBc, utf8_lcase), 12345)#x]
+- SubqueryAlias spark_catalog.default.t5
+- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
@@ -1638,7 +1638,7 @@ Project [replace(utf8_binary#x, collate(utf8_lcase#x,
utf8_binary), abc) AS repl
-- !query
select replace(utf8_binary collate utf8_lcase, utf8_lcase collate utf8_lcase,
'abc') from t5
-- !query analysis
-Project [replace(collate(utf8_binary#x, utf8_lcase), collate(utf8_lcase#x,
utf8_lcase), cast(abc as string collate UTF8_LCASE)) AS
replace(collate(utf8_binary, utf8_lcase), collate(utf8_lcase, utf8_lcase),
abc)#x]
+Project [replace(collate(utf8_binary#x, utf8_lcase), collate(utf8_lcase#x,
utf8_lcase), abc) AS replace(collate(utf8_binary, utf8_lcase),
collate(utf8_lcase, utf8_lcase), abc)#x]
+- SubqueryAlias spark_catalog.default.t5
+- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
@@ -1655,7 +1655,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
"inputType" : "\"STRING COLLATE UNICODE_AI\"",
"paramIndex" : "first",
"requiredType" : "\"STRING\"",
- "sqlExpr" : "\"replace(collate(utf8_binary, unicode_ai),
collate(utf8_lcase, unicode_ai), abc)\""
+ "sqlExpr" : "\"replace(collate(utf8_binary, unicode_ai),
collate(utf8_lcase, unicode_ai), 'abc' collate UNICODE_AI)\""
},
"queryContext" : [ {
"objectType" : "",
@@ -1670,7 +1670,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
-- !query
select replace(utf8_binary, 'aaAaaAaA', 'abc'), replace(utf8_lcase,
'aaAaaAaA', 'abc') from t5
-- !query analysis
-Project [replace(utf8_binary#x, aaAaaAaA, abc) AS replace(utf8_binary,
aaAaaAaA, abc)#x, replace(utf8_lcase#x, cast(aaAaaAaA as string collate
UTF8_LCASE), cast(abc as string collate UTF8_LCASE)) AS replace(utf8_lcase,
aaAaaAaA, abc)#x]
+Project [replace(utf8_binary#x, aaAaaAaA, abc) AS replace(utf8_binary,
aaAaaAaA, abc)#x, replace(utf8_lcase#x, aaAaaAaA, abc) AS replace(utf8_lcase,
aaAaaAaA, abc)#x]
+- SubqueryAlias spark_catalog.default.t5
+- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
@@ -1678,7 +1678,7 @@ Project [replace(utf8_binary#x, aaAaaAaA, abc) AS
replace(utf8_binary, aaAaaAaA,
-- !query
select replace(utf8_binary, 'aaAaaAaA' collate utf8_lcase, 'abc'),
replace(utf8_lcase, 'aaAaaAaA' collate utf8_binary, 'abc') from t5
-- !query analysis
-Project [replace(cast(utf8_binary#x as string collate UTF8_LCASE),
collate(aaAaaAaA, utf8_lcase), cast(abc as string collate UTF8_LCASE)) AS
replace(utf8_binary, collate(aaAaaAaA, utf8_lcase), abc)#x,
replace(cast(utf8_lcase#x as string), collate(aaAaaAaA, utf8_binary), abc) AS
replace(utf8_lcase, collate(aaAaaAaA, utf8_binary), abc)#x]
+Project [replace(cast(utf8_binary#x as string collate UTF8_LCASE),
collate(aaAaaAaA, utf8_lcase), abc) AS replace(utf8_binary, collate(aaAaaAaA,
utf8_lcase), abc)#x, replace(cast(utf8_lcase#x as string), collate(aaAaaAaA,
utf8_binary), abc) AS replace(utf8_lcase, collate(aaAaaAaA, utf8_binary),
abc)#x]
+- SubqueryAlias spark_catalog.default.t5
+- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
@@ -1686,7 +1686,7 @@ Project [replace(cast(utf8_binary#x as string collate
UTF8_LCASE), collate(aaAaa
-- !query
select replace(utf8_binary, 'aaAaaAaA ' collate utf8_lcase_rtrim, 'abc'),
replace(utf8_lcase, 'aaAaaAaA' collate utf8_binary, 'abc') from t5
-- !query analysis
-Project [replace(cast(utf8_binary#x as string collate UTF8_LCASE_RTRIM),
collate(aaAaaAaA , utf8_lcase_rtrim), cast(abc as string collate
UTF8_LCASE_RTRIM)) AS replace(utf8_binary, collate(aaAaaAaA ,
utf8_lcase_rtrim), abc)#x, replace(cast(utf8_lcase#x as string),
collate(aaAaaAaA, utf8_binary), abc) AS replace(utf8_lcase, collate(aaAaaAaA,
utf8_binary), abc)#x]
+Project [replace(cast(utf8_binary#x as string collate UTF8_LCASE_RTRIM),
collate(aaAaaAaA , utf8_lcase_rtrim), abc) AS replace(utf8_binary,
collate(aaAaaAaA , utf8_lcase_rtrim), abc)#x, replace(cast(utf8_lcase#x as
string), collate(aaAaaAaA, utf8_binary), abc) AS replace(utf8_lcase,
collate(aaAaaAaA, utf8_binary), abc)#x]
+- SubqueryAlias spark_catalog.default.t5
+- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
@@ -1768,7 +1768,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
-- !query
select endswith(utf8_binary, 'aaAaaAaA'), endswith(utf8_lcase, 'aaAaaAaA')
from t5
-- !query analysis
-Project [EndsWith(utf8_binary#x, aaAaaAaA) AS endswith(utf8_binary,
aaAaaAaA)#x, EndsWith(utf8_lcase#x, cast(aaAaaAaA as string collate
UTF8_LCASE)) AS endswith(utf8_lcase, aaAaaAaA)#x]
+Project [EndsWith(utf8_binary#x, aaAaaAaA) AS endswith(utf8_binary,
aaAaaAaA)#x, EndsWith(utf8_lcase#x, aaAaaAaA) AS endswith(utf8_lcase,
aaAaaAaA)#x]
+- SubqueryAlias spark_catalog.default.t5
+- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
@@ -2042,7 +2042,7 @@ Project [overlay(collate(utf8_binary#x, utf8_lcase),
collate(utf8_lcase#x, utf8_
-- !query
select overlay(utf8_binary, 'a', 2), overlay(utf8_lcase, 'a', 2) from t5
-- !query analysis
-Project [overlay(utf8_binary#x, a, 2, -1) AS overlay(utf8_binary, a, 2, -1)#x,
overlay(utf8_lcase#x, cast(a as string collate UTF8_LCASE), 2, -1) AS
overlay(utf8_lcase, a, 2, -1)#x]
+Project [overlay(utf8_binary#x, a, 2, -1) AS overlay(utf8_binary, a, 2, -1)#x,
overlay(utf8_lcase#x, a, 2, -1) AS overlay(utf8_lcase, 'a' collate UTF8_LCASE,
2, -1)#x]
+- SubqueryAlias spark_catalog.default.t5
+- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
@@ -2204,7 +2204,7 @@ Project [levenshtein(collate(utf8_binary#x, utf8_lcase),
collate(utf8_lcase#x, u
-- !query
select levenshtein(utf8_binary, 'a'), levenshtein(utf8_lcase, 'a') from t5
-- !query analysis
-Project [levenshtein(utf8_binary#x, a, None) AS levenshtein(utf8_binary, a)#x,
levenshtein(utf8_lcase#x, cast(a as string collate UTF8_LCASE), None) AS
levenshtein(utf8_lcase, a)#x]
+Project [levenshtein(utf8_binary#x, a, None) AS levenshtein(utf8_binary, a)#x,
levenshtein(utf8_lcase#x, a, None) AS levenshtein(utf8_lcase, a)#x]
+- SubqueryAlias spark_catalog.default.t5
+- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
@@ -2390,7 +2390,7 @@ Project [lpad(collate(utf8_binary#x, utf8_binary_rtrim),
8, collate(utf8_lcase#x
-- !query
select rpad(utf8_binary, 8, 'a'), rpad(utf8_lcase, 8, 'a') from t5
-- !query analysis
-Project [rpad(utf8_binary#x, 8, a) AS rpad(utf8_binary, 8, a)#x,
rpad(utf8_lcase#x, 8, cast(a as string collate UTF8_LCASE)) AS rpad(utf8_lcase,
8, a)#x]
+Project [rpad(utf8_binary#x, 8, a) AS rpad(utf8_binary, 8, a)#x,
rpad(utf8_lcase#x, 8, a) AS rpad(utf8_lcase, 8, a)#x]
+- SubqueryAlias spark_catalog.default.t5
+- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
@@ -2464,7 +2464,7 @@ Project [lpad(collate(utf8_binary#x, utf8_binary_rtrim),
8, collate(utf8_lcase#x
-- !query
select lpad(utf8_binary, 8, 'a'), lpad(utf8_lcase, 8, 'a') from t5
-- !query analysis
-Project [lpad(utf8_binary#x, 8, a) AS lpad(utf8_binary, 8, a)#x,
lpad(utf8_lcase#x, 8, cast(a as string collate UTF8_LCASE)) AS lpad(utf8_lcase,
8, a)#x]
+Project [lpad(utf8_binary#x, 8, a) AS lpad(utf8_binary, 8, a)#x,
lpad(utf8_lcase#x, 8, a) AS lpad(utf8_lcase, 8, a)#x]
+- SubqueryAlias spark_catalog.default.t5
+- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
@@ -2554,7 +2554,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
-- !query
select locate(utf8_binary, 'a'), locate(utf8_lcase, 'a') from t5
-- !query analysis
-Project [locate(utf8_binary#x, a, 1) AS locate(utf8_binary, a, 1)#x,
locate(utf8_lcase#x, cast(a as string collate UTF8_LCASE), 1) AS
locate(utf8_lcase, a, 1)#x]
+Project [locate(utf8_binary#x, a, 1) AS locate(utf8_binary, a, 1)#x,
locate(utf8_lcase#x, a, 1) AS locate(utf8_lcase, a, 1)#x]
+- SubqueryAlias spark_catalog.default.t5
+- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
@@ -2660,7 +2660,7 @@ Project [trim(collate(utf8_lcase#x, utf8_binary_rtrim),
Some(collate(utf8_binary
-- !query
select TRIM('ABc', utf8_binary), TRIM('ABc', utf8_lcase) from t5
-- !query analysis
-Project [trim(utf8_binary#x, Some(ABc)) AS TRIM(BOTH ABc FROM utf8_binary)#x,
trim(utf8_lcase#x, Some(cast(ABc as string collate UTF8_LCASE))) AS TRIM(BOTH
ABc FROM utf8_lcase)#x]
+Project [trim(utf8_binary#x, Some(ABc)) AS TRIM(BOTH ABc FROM utf8_binary)#x,
trim(utf8_lcase#x, Some(ABc)) AS TRIM(BOTH ABc FROM utf8_lcase)#x]
+- SubqueryAlias spark_catalog.default.t5
+- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
@@ -2856,7 +2856,7 @@ Project [ltrim(collate(utf8_lcase#x, utf8_binary_rtrim),
Some(collate(utf8_binar
-- !query
select LTRIM('ABc', utf8_binary), LTRIM('ABc', utf8_lcase) from t5
-- !query analysis
-Project [ltrim(utf8_binary#x, Some(ABc)) AS TRIM(LEADING ABc FROM
utf8_binary)#x, ltrim(utf8_lcase#x, Some(cast(ABc as string collate
UTF8_LCASE))) AS TRIM(LEADING ABc FROM utf8_lcase)#x]
+Project [ltrim(utf8_binary#x, Some(ABc)) AS TRIM(LEADING ABc FROM
utf8_binary)#x, ltrim(utf8_lcase#x, Some(ABc)) AS TRIM(LEADING ABc FROM
utf8_lcase)#x]
+- SubqueryAlias spark_catalog.default.t5
+- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
@@ -2954,7 +2954,7 @@ Project [rtrim(collate(utf8_lcase#x, utf8_binary_rtrim),
Some(collate(utf8_binar
-- !query
select RTRIM('ABc', utf8_binary), RTRIM('ABc', utf8_lcase) from t5
-- !query analysis
-Project [rtrim(utf8_binary#x, Some(ABc)) AS TRIM(TRAILING ABc FROM
utf8_binary)#x, rtrim(utf8_lcase#x, Some(cast(ABc as string collate
UTF8_LCASE))) AS TRIM(TRAILING ABc FROM utf8_lcase)#x]
+Project [rtrim(utf8_binary#x, Some(ABc)) AS TRIM(TRAILING ABc FROM
utf8_binary)#x, rtrim(utf8_lcase#x, Some(ABc)) AS TRIM(TRAILING ABc FROM
utf8_lcase)#x]
+- SubqueryAlias spark_catalog.default.t5
+- Relation spark_catalog.default.t5[s#x,utf8_binary#x,utf8_lcase#x] parquet
diff --git a/sql/core/src/test/resources/sql-tests/results/collations.sql.out
b/sql/core/src/test/resources/sql-tests/results/collations.sql.out
index f92fc5de8c3f..e96549f00d6e 100644
--- a/sql/core/src/test/resources/sql-tests/results/collations.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/collations.sql.out
@@ -479,7 +479,7 @@ struct<array_except(array(collate(aaa, utf8_lcase)),
array(collate(AAA, utf8_lca
-- !query
select 'a' collate unicode < 'A'
-- !query schema
-struct<(collate(a, unicode) < A):boolean>
+struct<(collate(a, unicode) < 'A' collate UNICODE):boolean>
-- !query output
true
@@ -487,7 +487,7 @@ true
-- !query
select 'a' collate unicode_ci = 'A'
-- !query schema
-struct<(collate(a, unicode_ci) = A):boolean>
+struct<(collate(a, unicode_ci) = 'A' collate UNICODE_CI):boolean>
-- !query output
true
@@ -495,7 +495,7 @@ true
-- !query
select 'a' collate unicode_ai = 'å'
-- !query schema
-struct<(collate(a, unicode_ai) = å):boolean>
+struct<(collate(a, unicode_ai) = 'å' collate UNICODE_AI):boolean>
-- !query output
true
@@ -503,7 +503,7 @@ true
-- !query
select 'a' collate unicode_ci_ai = 'Å'
-- !query schema
-struct<(collate(a, unicode_ci_ai) = Å):boolean>
+struct<(collate(a, unicode_ci_ai) = 'Å' collate UNICODE_CI_AI):boolean>
-- !query output
true
@@ -511,7 +511,7 @@ true
-- !query
select 'a' collate en < 'A'
-- !query schema
-struct<(collate(a, en) < A):boolean>
+struct<(collate(a, en) < 'A' collate en):boolean>
-- !query output
true
@@ -519,7 +519,7 @@ true
-- !query
select 'a' collate en_ci = 'A'
-- !query schema
-struct<(collate(a, en_ci) = A):boolean>
+struct<(collate(a, en_ci) = 'A' collate en_CI):boolean>
-- !query output
true
@@ -527,7 +527,7 @@ true
-- !query
select 'a' collate en_ai = 'å'
-- !query schema
-struct<(collate(a, en_ai) = å):boolean>
+struct<(collate(a, en_ai) = 'å' collate en_AI):boolean>
-- !query output
true
@@ -535,7 +535,7 @@ true
-- !query
select 'a' collate en_ci_ai = 'Å'
-- !query schema
-struct<(collate(a, en_ci_ai) = Å):boolean>
+struct<(collate(a, en_ci_ai) = 'Å' collate en_CI_AI):boolean>
-- !query output
true
@@ -543,7 +543,7 @@ true
-- !query
select 'Kypper' collate sv < 'Köpfe'
-- !query schema
-struct<(collate(Kypper, sv) < Köpfe):boolean>
+struct<(collate(Kypper, sv) < 'Köpfe' collate sv):boolean>
-- !query output
true
@@ -551,7 +551,7 @@ true
-- !query
select 'Kypper' collate de > 'Köpfe'
-- !query schema
-struct<(collate(Kypper, de) > Köpfe):boolean>
+struct<(collate(Kypper, de) > 'Köpfe' collate de):boolean>
-- !query output
true
@@ -559,7 +559,7 @@ true
-- !query
select 'I' collate tr_ci = 'ı'
-- !query schema
-struct<(collate(I, tr_ci) = ı):boolean>
+struct<(collate(I, tr_ci) = 'ı' collate tr_CI):boolean>
-- !query output
true
@@ -1109,7 +1109,7 @@ kitten
-- !query
select elt(1, utf8_binary, 'word'), elt(1, utf8_lcase, 'word') from t5
-- !query schema
-struct<elt(1, utf8_binary, word):string,elt(1, utf8_lcase, word):string
collate UTF8_LCASE>
+struct<elt(1, utf8_binary, word):string,elt(1, utf8_lcase, 'word' collate
UTF8_LCASE):string collate UTF8_LCASE>
-- !query output
Hello, world! Nice day. Hello, world! Nice day.
Something else. Nothing here. Something else. Nothing here.
@@ -2492,7 +2492,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
"inputType" : "\"STRING COLLATE UNICODE_AI\"",
"paramIndex" : "first",
"requiredType" : "\"STRING\"",
- "sqlExpr" : "\"replace(collate(utf8_binary, unicode_ai),
collate(utf8_lcase, unicode_ai), abc)\""
+ "sqlExpr" : "\"replace(collate(utf8_binary, unicode_ai),
collate(utf8_lcase, unicode_ai), 'abc' collate UNICODE_AI)\""
},
"queryContext" : [ {
"objectType" : "",
@@ -3342,7 +3342,7 @@ ksitTing
-- !query
select overlay(utf8_binary, 'a', 2), overlay(utf8_lcase, 'a', 2) from t5
-- !query schema
-struct<overlay(utf8_binary, a, 2, -1):string,overlay(utf8_lcase, a, 2,
-1):string collate UTF8_LCASE>
+struct<overlay(utf8_binary, a, 2, -1):string,overlay(utf8_lcase, 'a' collate
UTF8_LCASE, 2, -1):string collate UTF8_LCASE>
-- !query output
Hallo, world! Nice day. Hallo, world! Nice day.
Saark SaL
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala
index 7cafb999ffcf..8d831e4ca166 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala
@@ -450,7 +450,8 @@ class CollationSQLRegexpSuite
},
condition = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE",
parameters = Map(
- "sqlExpr" -> "\"regexp_replace(collate(ABCDE, UNICODE_CI), .c., FFF,
1)\"",
+ "sqlExpr" ->
+ """"regexp_replace(collate(ABCDE, UNICODE_CI), .c., 'FFF' collate
UNICODE_CI, 1)"""",
"paramIndex" -> "first",
"inputSql" -> "\"collate(ABCDE, UNICODE_CI)\"",
"inputType" -> "\"STRING COLLATE UNICODE_CI\"",
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/collation/CollationTypePrecedenceSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/collation/CollationTypePrecedenceSuite.scala
index bb6fce1fb1b6..23d0d4ad8c21 100644
---
a/sql/core/src/test/scala/org/apache/spark/sql/collation/CollationTypePrecedenceSuite.scala
+++
b/sql/core/src/test/scala/org/apache/spark/sql/collation/CollationTypePrecedenceSuite.scala
@@ -21,6 +21,7 @@ import org.apache.spark.SparkThrowable
import org.apache.spark.sql.{DataFrame, QueryTest, Row}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.test.SharedSparkSession
+import org.apache.spark.sql.types._
class CollationTypePrecedenceSuite extends QueryTest with SharedSparkSession {
@@ -43,6 +44,11 @@ class CollationTypePrecedenceSuite extends QueryTest with
SharedSparkSession {
private def assertImplicitMismatch(df: => DataFrame): Unit =
assertThrowsError(df, "COLLATION_MISMATCH.IMPLICIT")
+ private def assertQuerySchema(df: => DataFrame, expectedSchema: DataType):
Unit = {
+ val querySchema = df.schema.fields.head.dataType
+ assert(DataType.equalsIgnoreNullability(querySchema, expectedSchema))
+ }
+
test("explicit collation propagates up") {
checkAnswer(
sql(s"SELECT COLLATION('a' collate unicode)"),
@@ -382,13 +388,157 @@ class CollationTypePrecedenceSuite extends QueryTest
with SharedSparkSession {
s"'name2' collate utf8_lcase, 'value2' collate unicode)"),
Row(Row("value1", "value2")))
- assertExplicitMismatch(
+ checkAnswer(
sql(s"SELECT named_struct" +
- s"('name1' collate unicode, 'value1', 'name2' collate utf8_lcase,
'value2')"))
+ s"('name1' collate unicode, 'value1', 'name2' collate utf8_lcase,
'value2')"),
+ Row(Row("value1", "value2")))
- assertExplicitMismatch(
+ checkAnswer(
sql(s"SELECT named_struct" +
- s"('name1', 'value1' collate unicode, 'name2', 'value2' collate
utf8_lcase)"))
+ s"('name1', 'value1' collate unicode, 'name2', 'value2' collate
utf8_lcase)"),
+ Row(Row("value1", "value2")))
+ }
+
+ test("coercing structs") {
+ assertQuerySchema(
+ sql(s"SELECT array(struct(1, 'a'), struct(2, 'b' collate utf8_lcase))"),
+ ArrayType(
+ StructType(
+ Seq(StructField("col1", IntegerType), StructField("col2",
StringType("UTF8_LCASE"))))))
+
+ assertQuerySchema(
+ sql(s"SELECT array(struct(1, 'a' collate utf8_lcase), struct(2, 'b'
collate utf8_lcase))"),
+ ArrayType(
+ StructType(
+ Seq(StructField("col1", IntegerType), StructField("col2",
StringType("UTF8_LCASE"))))))
+
+ assertExplicitMismatch(
+ sql(s"SELECT array(struct(1, 'a' collate utf8_lcase), struct(2, 'b'
collate unicode))"))
+
+ assertImplicitMismatch(sql(s"""
+ |SELECT array(struct(1, c1), struct(2, c2))
+ |FROM VALUES ('a' collate unicode, 'b' collate utf8_lcase) AS t(c1,
c2)
+ |""".stripMargin))
+ }
+
+ test("coercing maps") {
+ assertQuerySchema(
+ sql(s"SELECT map('key1', 'val1', 'key2', 'val2')"),
+ MapType(StringType, StringType))
+
+ assertQuerySchema(
+ sql(s"SELECT map('key1' collate utf8_lcase, 'val1', 'key2', 'val2'
collate unicode)"),
+ MapType(StringType("UTF8_LCASE"), StringType("UNICODE")))
+
+ assertQuerySchema(
+ sql(s"SELECT ARRAY(map('key1', 'val1'), map('key2' collate UNICODE,
'val2'))"),
+ ArrayType(MapType(StringType("UNICODE"), StringType)))
+
+ assertExplicitMismatch(
+ sql(s"SELECT map('key1', 'val1' collate utf8_lcase, 'key2', 'val2'
collate unicode)"))
+ }
+
+ test("maps of structs") {
+ assertQuerySchema(
+ sql(s"SELECT map('key1', struct(1, 'a' collate unicode), 'key2',
struct(2, 'b'))"),
+ MapType(
+ StringType,
+ StructType(
+ Seq(StructField("col1", IntegerType), StructField("col2",
StringType("UNICODE"))))))
+
+ checkAnswer(
+ sql(
+ s"SELECT map('key1', struct(1, 'a' collate unicode_ci)," +
+ s"'key2', struct(2, 'b'))['key1'].col2 = 'A'"),
+ Seq(Row(true)))
+ }
+
+ test("coercing arrays") {
+ assertQuerySchema(sql(s"SELECT array('a', 'b')"), ArrayType(StringType))
+
+ assertQuerySchema(
+ sql(s"SELECT array('a' collate utf8_lcase, 'b')"),
+ ArrayType(StringType("UTF8_LCASE")))
+
+ assertQuerySchema(
+ sql(s"SELECT array('a' collate utf8_lcase, 'b' collate utf8_lcase)"),
+ ArrayType(StringType("UTF8_LCASE")))
+
+ assertExplicitMismatch(sql(s"SELECT array('a' collate utf8_lcase, 'b'
collate unicode)"))
+
+ assertQuerySchema(
+ sql(s"SELECT array(array('a', 'b'), array('c' collate utf8_lcase,
'd'))"),
+ ArrayType(ArrayType(StringType("UTF8_LCASE"))))
+
+ checkAnswer(
+ sql(s"SELECT array('a', 'b') = array('A' collate utf8_lcase, 'B')"),
+ Seq(Row(true)))
+
+ checkAnswer(
+ sql(s"SELECT array('a', 'b')[0] = array('A' collate utf8_lcase,
'B')[1]"),
+ Seq(Row(false)))
+
+ assertExplicitMismatch(
+ sql(s"SELECT array('a', 'b' collate unicode) = array('A' collate
utf8_lcase, 'B')"))
+ }
+
+ test("array of structs") {
+ assertQuerySchema(
+ sql(s"SELECT array(struct(1, 'a' collate unicode), struct(2, 'b'))[0]"),
+ StructType(
+ Seq(StructField("col1", IntegerType), StructField("col2",
StringType("UNICODE")))))
+
+ checkAnswer(
+ sql(s"SELECT array(struct(1, 'a' collate unicode_ci), struct(2,
'b'))[0].col2 = 'A'"),
+ Seq(Row(true)))
+ }
+
+ test("coercing deeply nested complex types") {
+ assertQuerySchema(
+ sql(s"""
+ |SELECT struct(
+ | struct(1, 'nested' collate unicode),
+ | array(
+ | struct(1, 'a' collate utf8_lcase),
+ | struct(2, 'b' collate utf8_lcase)
+ | )
+ |)
+ |""".stripMargin),
+ StructType(
+ Seq(
+ StructField(
+ "col1",
+ StructType(
+ Seq(StructField("col1", IntegerType), StructField("col2",
StringType("UNICODE"))))),
+ StructField(
+ "col2",
+ ArrayType(
+ StructType(Seq(
+ StructField("col1", IntegerType),
+ StructField("col2", StringType("UTF8_LCASE")))))))))
+
+ assertQuerySchema(
+ sql(s"""
+ |SELECT struct(
+ | struct(
+ | array(
+ | map('key1' collate utf8_lcase, 'val1',
+ | 'key2', 'val2'),
+ | map('key3', 'val3' collate unicode)
+ | )
+ | ),
+ | 42
+ |)
+ |""".stripMargin),
+ StructType(
+ Seq(
+ StructField(
+ "col1",
+ StructType(
+ Seq(StructField(
+ "col1",
+ ArrayType(MapType(StringType("UTF8_LCASE"),
StringType("UNICODE"))))))),
+ StructField("col2", IntegerType))))
}
test("access collated map via literal") {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]