This is an automated email from the ASF dual-hosted git repository.
yao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new fb5697d6873c [SPARK-48658][SQL] Encode/Decode functions report coding
errors instead of mojibake for unmappable characters
fb5697d6873c is described below
commit fb5697d6873ce8a08a2f5b6e4dd7539e557a69a2
Author: Kent Yao <[email protected]>
AuthorDate: Mon Jun 24 20:41:55 2024 +0800
[SPARK-48658][SQL] Encode/Decode functions report coding errors instead of
mojibake for unmappable characters
### What changes were proposed in this pull request?
This PR makes encode/decode functions report coding errors instead of
producing mojibake for unmappable characters. Take
`select encode('渭城朝雨浥轻尘', 'US-ASCII')` as an example.
Before this PR,
```sql
???????
```
After this PR,
```json
org.apache.spark.SparkRuntimeException
{
"errorClass" : "MALFORMED_CHARACTER_CODING",
"sqlState" : "22000",
"messageParameters" : {
"charset" : "US-ASCII",
"function" : "`encode`"
}
}
```
### Why are the changes needed?
Improve data quality.
### Does this PR introduce _any_ user-facing change?
Yes.
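By default, encode/decode functions now raise a MALFORMED_CHARACTER_CODING
error for unmappable characters. When spark.sql.legacy.codingErrorAction is
set to true, they restore the previous behavior and replace unmappable
characters with mojibake instead of reporting coding errors.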
### How was this patch tested?
new unit tests
### Was this patch authored or co-authored using generative AI tooling?
no
Closes #47017 from yaooqinn/SPARK-48658.
Authored-by: Kent Yao <[email protected]>
Signed-off-by: Kent Yao <[email protected]>
---
.../src/main/resources/error/error-conditions.json | 6 +
.../explain-results/function_decode.explain | 2 +-
.../explain-results/function_encode.explain | 2 +-
.../function_to_binary_with_format.explain | 2 +-
.../catalyst/expressions/stringExpressions.scala | 198 ++++++++++++---------
.../spark/sql/errors/QueryExecutionErrors.scala | 8 +
.../org/apache/spark/sql/internal/SQLConf.scala | 10 ++
.../catalyst/expressions/CodeGenerationSuite.scala | 2 +-
.../expressions/ExpressionEvalHelper.scala | 7 +-
.../expressions/StringExpressionsSuite.scala | 4 +-
.../analyzer-results/ansi/string-functions.sql.out | 106 +++++++++--
.../analyzer-results/string-functions.sql.out | 106 +++++++++--
.../typeCoercion/native/concat.sql.out | 18 +-
.../typeCoercion/native/elt.sql.out | 8 +-
.../sql-tests/inputs/string-functions.sql | 12 ++
.../results/ansi/string-functions.sql.out | 128 +++++++++++++
.../sql-tests/results/string-functions.sql.out | 128 +++++++++++++
.../scala/org/apache/spark/sql/ExplainSuite.scala | 15 +-
18 files changed, 628 insertions(+), 134 deletions(-)
diff --git a/common/utils/src/main/resources/error/error-conditions.json
b/common/utils/src/main/resources/error/error-conditions.json
index 2e012d67d58b..975536c076dd 100644
--- a/common/utils/src/main/resources/error/error-conditions.json
+++ b/common/utils/src/main/resources/error/error-conditions.json
@@ -3006,6 +3006,12 @@
],
"sqlState" : "42710"
},
+ "MALFORMED_CHARACTER_CODING" : {
+ "message" : [
+ "Invalid value found when performing <function> with <charset>"
+ ],
+ "sqlState" : "22000"
+ },
"MALFORMED_CSV_RECORD" : {
"message" : [
"Malformed CSV record: <badRecord>"
diff --git
a/connector/connect/common/src/test/resources/query-tests/explain-results/function_decode.explain
b/connector/connect/common/src/test/resources/query-tests/explain-results/function_decode.explain
index 165be9b9e12f..e1a445120c13 100644
---
a/connector/connect/common/src/test/resources/query-tests/explain-results/function_decode.explain
+++
b/connector/connect/common/src/test/resources/query-tests/explain-results/function_decode.explain
@@ -1,2 +1,2 @@
-Project [decode(cast(g#0 as binary), UTF-8, false) AS decode(g, UTF-8)#0]
+Project [staticinvoke(class
org.apache.spark.sql.catalyst.expressions.StringDecode, StringType, decode,
cast(g#0 as binary), UTF-8, false, false, BinaryType, StringTypeAnyCollation,
BooleanType, BooleanType, true, true, true) AS decode(g, UTF-8)#0]
+- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
diff --git
a/connector/connect/common/src/test/resources/query-tests/explain-results/function_encode.explain
b/connector/connect/common/src/test/resources/query-tests/explain-results/function_encode.explain
index 2f6543605923..7ce8776d754d 100644
---
a/connector/connect/common/src/test/resources/query-tests/explain-results/function_encode.explain
+++
b/connector/connect/common/src/test/resources/query-tests/explain-results/function_encode.explain
@@ -1,2 +1,2 @@
-Project [encode(g#0, UTF-8, false) AS encode(g, UTF-8)#0]
+Project [staticinvoke(class org.apache.spark.sql.catalyst.expressions.Encode,
BinaryType, encode, g#0, UTF-8, false, false, StringTypeAnyCollation,
StringTypeAnyCollation, BooleanType, BooleanType, true, true, true) AS
encode(g, UTF-8)#0]
+- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
diff --git
a/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_binary_with_format.explain
b/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_binary_with_format.explain
index b62ccccc0c15..d999697a4c9e 100644
---
a/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_binary_with_format.explain
+++
b/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_binary_with_format.explain
@@ -1,2 +1,2 @@
-Project [encode(g#0, UTF-8, false) AS to_binary(g, utf-8)#0]
+Project [staticinvoke(class org.apache.spark.sql.catalyst.expressions.Encode,
BinaryType, encode, g#0, UTF-8, false, false, StringTypeAnyCollation,
StringTypeAnyCollation, BooleanType, BooleanType, true, true, true) AS
to_binary(g, utf-8)#0]
+- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
index e0a9d6f77edd..055ef074d621 100755
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -17,7 +17,8 @@
package org.apache.spark.sql.catalyst.expressions
-import java.io.UnsupportedEncodingException
+import java.nio.{ByteBuffer, CharBuffer}
+import java.nio.charset.{CharacterCodingException, Charset, CodingErrorAction,
IllegalCharsetNameException, UnsupportedCharsetException}
import java.text.{BreakIterator, DecimalFormat, DecimalFormatSymbols}
import java.util.{Base64 => JBase64}
import java.util.{HashMap, Locale, Map => JMap}
@@ -25,6 +26,7 @@ import java.util.{HashMap, Locale, Map => JMap}
import scala.collection.mutable.ArrayBuffer
import org.apache.spark.QueryContext
+import org.apache.spark.network.util.JavaUtils
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.analysis.{ExpressionBuilder,
FunctionRegistry, TypeCheckResult}
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.DataTypeMismatch
@@ -2716,62 +2718,69 @@ case class Decode(params: Seq[Expression], replacement:
Expression)
since = "1.5.0",
group = "string_funcs")
// scalastyle:on line.size.limit
-case class StringDecode(bin: Expression, charset: Expression, legacyCharsets:
Boolean)
- extends BinaryExpression with ImplicitCastInputTypes with NullIntolerant {
+case class StringDecode(
+ bin: Expression,
+ charset: Expression,
+ legacyCharsets: Boolean,
+ legacyErrorAction: Boolean)
+ extends RuntimeReplaceable with ImplicitCastInputTypes {
def this(bin: Expression, charset: Expression) =
- this(bin, charset, SQLConf.get.legacyJavaCharsets)
+ this(bin, charset, SQLConf.get.legacyJavaCharsets,
SQLConf.get.legacyCodingErrorAction)
- override def left: Expression = bin
- override def right: Expression = charset
override def dataType: DataType = SQLConf.get.defaultStringType
override def inputTypes: Seq[AbstractDataType] = Seq(BinaryType,
StringTypeAnyCollation)
+ override def prettyName: String = "decode"
+ override def toString: String = s"$prettyName($bin, $charset)"
- private val supportedCharsets = Set(
- "US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", "UTF-16LE", "UTF-16",
"UTF-32")
-
- protected override def nullSafeEval(input1: Any, input2: Any): Any = {
- val fromCharset = input2.asInstanceOf[UTF8String].toString
- try {
- if (legacyCharsets ||
supportedCharsets.contains(fromCharset.toUpperCase(Locale.ROOT))) {
- UTF8String.fromString(new String(input1.asInstanceOf[Array[Byte]],
fromCharset))
- } else throw new UnsupportedEncodingException
- } catch {
- case _: UnsupportedEncodingException =>
- throw QueryExecutionErrors.invalidCharsetError(prettyName, fromCharset)
- }
- }
-
- override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
- nullSafeCodeGen(ctx, ev, (bytes, charset) => {
- val fromCharset = ctx.freshName("fromCharset")
- val sc = JavaCode.global(
- ctx.addReferenceObj("supportedCharsets", supportedCharsets),
- supportedCharsets.getClass)
- s"""
- String $fromCharset = $charset.toString();
- try {
- if ($legacyCharsets ||
$sc.contains($fromCharset.toUpperCase(java.util.Locale.ROOT))) {
- ${ev.value} = UTF8String.fromString(new String($bytes,
$fromCharset));
- } else {
- throw new java.io.UnsupportedEncodingException();
- }
- } catch (java.io.UnsupportedEncodingException e) {
- throw QueryExecutionErrors.invalidCharsetError("$prettyName",
$fromCharset);
- }
- """
- })
- }
-
- override protected def withNewChildrenInternal(
- newLeft: Expression, newRight: Expression): StringDecode =
- copy(bin = newLeft, charset = newRight)
+ override def replacement: Expression = StaticInvoke(
+ classOf[StringDecode],
+ SQLConf.get.defaultStringType,
+ "decode",
+ Seq(bin, charset, Literal(legacyCharsets), Literal(legacyErrorAction)),
+ Seq(BinaryType, StringTypeAnyCollation, BooleanType, BooleanType))
- override def prettyName: String = "decode"
+ override def children: Seq[Expression] = Seq(bin, charset)
+ override protected def withNewChildrenInternal(newChildren:
IndexedSeq[Expression]): Expression =
+ copy(bin = newChildren(0), charset = newChildren(1))
}
object StringDecode {
def apply(bin: Expression, charset: Expression): StringDecode = new
StringDecode(bin, charset)
+ def decode(
+ input: Array[Byte],
+ charset: UTF8String,
+ legacyCharsets: Boolean,
+ legacyErrorAction: Boolean): UTF8String = {
+ val fromCharset = charset.toString
+ if (legacyCharsets ||
Encode.VALID_CHARSETS.contains(fromCharset.toUpperCase(Locale.ROOT))) {
+ val decoder = try {
+ val codingErrorAction = if (legacyErrorAction) {
+ CodingErrorAction.REPLACE
+ } else {
+ CodingErrorAction.REPORT
+ }
+ Charset.forName(fromCharset)
+ .newDecoder()
+ .onMalformedInput(codingErrorAction)
+ .onUnmappableCharacter(codingErrorAction)
+ } catch {
+ case _: IllegalCharsetNameException |
+ _: UnsupportedCharsetException |
+ _: IllegalArgumentException =>
+ throw QueryExecutionErrors.invalidCharsetError("decode", fromCharset)
+ }
+ try {
+ val cb = decoder.decode(ByteBuffer.wrap(input))
+ UTF8String.fromString(cb.toString)
+ } catch {
+ case _: CharacterCodingException =>
+ throw QueryExecutionErrors.malformedCharacterCoding("decode",
fromCharset)
+ }
+ } else {
+ throw QueryExecutionErrors.invalidCharsetError("decode", fromCharset)
+ }
+ }
}
/**
@@ -2793,59 +2802,76 @@ object StringDecode {
since = "1.5.0",
group = "string_funcs")
// scalastyle:on line.size.limit
-case class Encode(str: Expression, charset: Expression, legacyCharsets:
Boolean)
- extends BinaryExpression with ImplicitCastInputTypes with NullIntolerant {
+case class Encode(
+ str: Expression,
+ charset: Expression,
+ legacyCharsets: Boolean,
+ legacyErrorAction: Boolean)
+ extends RuntimeReplaceable with ImplicitCastInputTypes {
def this(value: Expression, charset: Expression) =
- this(value, charset, SQLConf.get.legacyJavaCharsets)
+ this(value, charset, SQLConf.get.legacyJavaCharsets,
SQLConf.get.legacyCodingErrorAction)
- override def left: Expression = str
- override def right: Expression = charset
override def dataType: DataType = BinaryType
override def inputTypes: Seq[AbstractDataType] =
Seq(StringTypeAnyCollation, StringTypeAnyCollation)
- private val supportedCharsets = Set(
- "US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", "UTF-16LE", "UTF-16",
"UTF-32")
-
- protected override def nullSafeEval(input1: Any, input2: Any): Any = {
- val toCharset = input2.asInstanceOf[UTF8String].toString
- try {
- if (legacyCharsets ||
supportedCharsets.contains(toCharset.toUpperCase(Locale.ROOT))) {
- input1.asInstanceOf[UTF8String].toString.getBytes(toCharset)
- } else throw new UnsupportedEncodingException
- } catch {
- case _: UnsupportedEncodingException =>
- throw QueryExecutionErrors.invalidCharsetError(prettyName, toCharset)
- }
- }
+ override val replacement: Expression = StaticInvoke(
+ classOf[Encode],
+ BinaryType,
+ "encode",
+ Seq(
+ str, charset, Literal(legacyCharsets, BooleanType),
Literal(legacyErrorAction, BooleanType)),
+ Seq(StringTypeAnyCollation, StringTypeAnyCollation, BooleanType,
BooleanType))
- override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
- nullSafeCodeGen(ctx, ev, (string, charset) => {
- val toCharset = ctx.freshName("toCharset")
- val sc = JavaCode.global(
- ctx.addReferenceObj("supportedCharsets", supportedCharsets),
- supportedCharsets.getClass)
- s"""
- String $toCharset = $charset.toString();
- try {
- if ($legacyCharsets ||
$sc.contains($toCharset.toUpperCase(java.util.Locale.ROOT))) {
- ${ev.value} = $string.toString().getBytes($toCharset);
- } else {
- throw new java.io.UnsupportedEncodingException();
- }
- } catch (java.io.UnsupportedEncodingException e) {
- throw QueryExecutionErrors.invalidCharsetError("$prettyName",
$toCharset);
- }"""
- })
- }
+ override def toString: String = s"$prettyName($str, $charset)"
- override protected def withNewChildrenInternal(
- newLeft: Expression, newRight: Expression): Encode = copy(str = newLeft,
charset = newRight)
+ override def children: Seq[Expression] = Seq(str, charset)
+
+ override protected def withNewChildrenInternal(newChildren:
IndexedSeq[Expression]): Expression =
+ copy(str = newChildren.head, charset = newChildren(1))
}
object Encode {
def apply(value: Expression, charset: Expression): Encode = new
Encode(value, charset)
+
+ private[expressions] final lazy val VALID_CHARSETS =
+ Set("US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", "UTF-16LE", "UTF-16",
"UTF-32")
+
+ def encode(
+ input: UTF8String,
+ charset: UTF8String,
+ legacyCharsets: Boolean,
+ legacyErrorAction: Boolean): Array[Byte] = {
+ val toCharset = charset.toString
+ if (legacyCharsets ||
VALID_CHARSETS.contains(toCharset.toUpperCase(Locale.ROOT))) {
+ val encoder = try {
+ val codingErrorAction = if (legacyErrorAction) {
+ CodingErrorAction.REPLACE
+ } else {
+ CodingErrorAction.REPORT
+ }
+ Charset.forName(toCharset)
+ .newEncoder()
+ .onMalformedInput(codingErrorAction)
+ .onUnmappableCharacter(codingErrorAction)
+ } catch {
+ case _: IllegalCharsetNameException |
+ _: UnsupportedCharsetException |
+ _: IllegalArgumentException =>
+ throw QueryExecutionErrors.invalidCharsetError("encode", toCharset)
+ }
+ try {
+ val bb = encoder.encode(CharBuffer.wrap(input.toString))
+ JavaUtils.bufferToArray(bb)
+ } catch {
+ case _: CharacterCodingException =>
+ throw QueryExecutionErrors.malformedCharacterCoding("encode",
toCharset)
+ }
+ } else {
+ throw QueryExecutionErrors.invalidCharsetError("encode", toCharset)
+ }
+ }
}
/**
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala
index 30e53f146982..8af931976b2e 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala
@@ -2741,6 +2741,14 @@ private[sql] object QueryExecutionErrors extends
QueryErrorsBase with ExecutionE
"charset" -> charset))
}
+ def malformedCharacterCoding(functionName: String, charset: String):
RuntimeException = {
+ new SparkRuntimeException(
+ errorClass = "MALFORMED_CHARACTER_CODING",
+ messageParameters = Map(
+ "function" -> toSQLId(functionName),
+ "charset" -> charset))
+ }
+
def invalidWriterCommitMessageError(details: String): Throwable = {
new SparkRuntimeException(
errorClass = "INVALID_WRITER_COMMIT_MESSAGE",
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 799e54aaecea..d0dc75017fa6 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -5018,6 +5018,14 @@ object SQLConf {
.booleanConf
.createWithDefault(false)
+ val LEGACY_CODING_ERROR_ACTION =
buildConf("spark.sql.legacy.codingErrorAction")
+ .internal()
+ .doc("When set to true, encode/decode functions replace unmappable
characters with mojibake " +
+ "instead of reporting coding errors.")
+ .version("4.0.0")
+ .booleanConf
+ .createWithDefault(false)
+
val LEGACY_EVAL_CURRENT_TIME =
buildConf("spark.sql.legacy.earlyEvalCurrentTime")
.internal()
.doc("When set to true, evaluation and constant folding will happen for
now() and " +
@@ -5994,6 +6002,8 @@ class SQLConf extends Serializable with Logging with
SqlApiConf {
def legacyJavaCharsets: Boolean = getConf(SQLConf.LEGACY_JAVA_CHARSETS)
+ def legacyCodingErrorAction: Boolean =
getConf(SQLConf.LEGACY_CODING_ERROR_ACTION)
+
def legacyEvalCurrentTime: Boolean =
getConf(SQLConf.LEGACY_EVAL_CURRENT_TIME)
/** ********************** SQLConf functionality methods ************ */
diff --git
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala
index 4df8d87074fc..4c045f9fda73 100644
---
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala
+++
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala
@@ -104,7 +104,7 @@ class CodeGenerationSuite extends SparkFunSuite with
ExpressionEvalHelper {
test("SPARK-22543: split large if expressions into blocks due to JVM code
size limit") {
var strExpr: Expression = Literal("abc")
for (_ <- 1 to 150) {
- strExpr = StringDecode(Encode(strExpr, "utf-8"), "utf-8")
+ strExpr = StringTrimRight(StringTrimLeft(strExpr))
}
val expressions = Seq(If(EqualTo(strExpr, strExpr), strExpr, strExpr))
diff --git
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
index 21e6b8692911..a063e53486ad 100644
---
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
+++
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
@@ -71,10 +71,15 @@ trait ExpressionEvalHelper extends
ScalaCheckDrivenPropertyChecks with PlanTestB
new ArrayBasedMapData(keyArray, valueArray)
}
+ protected def replace(expr: Expression): Expression = expr match {
+ case r: RuntimeReplaceable => replace(r.replacement)
+ case _ => expr.mapChildren(replace)
+ }
+
private def prepareEvaluation(expression: Expression): Expression = {
val serializer = new JavaSerializer(new SparkConf()).newInstance()
val resolver = ResolveTimeZone
- val expr = resolver.resolveTimeZones(expression)
+ val expr = resolver.resolveTimeZones(replace(expression))
assert(expr.resolved)
serializer.deserialize(serializer.serialize(expr))
}
diff --git
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
index 51de44d8dfd9..ebd724543481 100644
---
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
+++
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
@@ -505,8 +505,8 @@ class StringExpressionsSuite extends SparkFunSuite with
ExpressionEvalHelper {
checkEvaluation(StringDecode(b, Literal.create(null, StringType)), null,
create_row(null))
// Test escaping of charset
- GenerateUnsafeProjection.generate(Encode(a, Literal("\"quote")) :: Nil)
- GenerateUnsafeProjection.generate(StringDecode(b, Literal("\"quote")) ::
Nil)
+ GenerateUnsafeProjection.generate(Encode(a,
Literal("\"quote")).replacement :: Nil)
+ GenerateUnsafeProjection.generate(StringDecode(b,
Literal("\"quote")).replacement :: Nil)
}
test("initcap unit test") {
diff --git
a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out
b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out
index c9b451187356..c7675b16384f 100644
---
a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out
+++
b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out
@@ -384,21 +384,21 @@ Project [btrim(xyxtrimyyx, xy) AS btrim(xyxtrimyyx, xy)#x]
-- !query
SELECT btrim(encode(" xyz ", 'utf-8'))
-- !query analysis
-Project [btrim(encode( xyz , utf-8, false)) AS btrim(encode( xyz , utf-8))#x]
+Project [btrim(encode( xyz , utf-8)) AS btrim(encode( xyz , utf-8))#x]
+- OneRowRelation
-- !query
SELECT btrim(encode('yxTomxx', 'utf-8'), encode('xyz', 'utf-8'))
-- !query analysis
-Project [btrim(encode(yxTomxx, utf-8, false), encode(xyz, utf-8, false)) AS
btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8))#x]
+Project [btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8)) AS
btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8))#x]
+- OneRowRelation
-- !query
SELECT btrim(encode('xxxbarxxx', 'utf-8'), encode('x', 'utf-8'))
-- !query analysis
-Project [btrim(encode(xxxbarxxx, utf-8, false), encode(x, utf-8, false)) AS
btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8))#x]
+Project [btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8)) AS
btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8))#x]
+- OneRowRelation
@@ -649,14 +649,14 @@ SetCommand (spark.sql.legacy.javaCharsets,Some(true))
-- !query
select encode('hello', 'WINDOWS-1252')
-- !query analysis
-Project [encode(hello, WINDOWS-1252, true) AS encode(hello, WINDOWS-1252)#x]
+Project [encode(hello, WINDOWS-1252) AS encode(hello, WINDOWS-1252)#x]
+- OneRowRelation
-- !query
select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol)
-- !query analysis
-Project [encode(scol#x, ecol#x, true) AS encode(scol, ecol)#x]
+Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x]
+- SubqueryAlias t
+- LocalRelation [scol#x, ecol#x]
@@ -670,14 +670,14 @@ SetCommand (spark.sql.legacy.javaCharsets,Some(false))
-- !query
select encode('hello', 'WINDOWS-1252')
-- !query analysis
-Project [encode(hello, WINDOWS-1252, false) AS encode(hello, WINDOWS-1252)#x]
+Project [encode(hello, WINDOWS-1252) AS encode(hello, WINDOWS-1252)#x]
+- OneRowRelation
-- !query
select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol)
-- !query analysis
-Project [encode(scol#x, ecol#x, false) AS encode(scol, ecol)#x]
+Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x]
+- SubqueryAlias t
+- LocalRelation [scol#x, ecol#x]
@@ -685,14 +685,56 @@ Project [encode(scol#x, ecol#x, false) AS encode(scol,
ecol)#x]
-- !query
select encode('hello', 'Windows-xxx')
-- !query analysis
-Project [encode(hello, Windows-xxx, false) AS encode(hello, Windows-xxx)#x]
+Project [encode(hello, Windows-xxx) AS encode(hello, Windows-xxx)#x]
+- OneRowRelation
-- !query
select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol)
-- !query analysis
-Project [encode(scol#x, ecol#x, false) AS encode(scol, ecol)#x]
+Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x]
++- SubqueryAlias t
+ +- LocalRelation [scol#x, ecol#x]
+
+
+-- !query
+set spark.sql.legacy.codingErrorAction=true
+-- !query analysis
+SetCommand (spark.sql.legacy.codingErrorAction,Some(true))
+
+
+-- !query
+select encode('渭城朝雨浥轻尘', 'US-ASCII')
+-- !query analysis
+Project [encode(渭城朝雨浥轻尘, US-ASCII) AS encode(渭城朝雨浥轻尘, US-ASCII)#x]
++- OneRowRelation
+
+
+-- !query
+select encode(scol, ecol) from values('渭城朝雨浥轻尘', 'US-ASCII') as t(scol, ecol)
+-- !query analysis
+Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x]
++- SubqueryAlias t
+ +- LocalRelation [scol#x, ecol#x]
+
+
+-- !query
+set spark.sql.legacy.codingErrorAction=false
+-- !query analysis
+SetCommand (spark.sql.legacy.codingErrorAction,Some(false))
+
+
+-- !query
+select encode('客舍青青柳色新', 'US-ASCII')
+-- !query analysis
+Project [encode(客舍青青柳色新, US-ASCII) AS encode(客舍青青柳色新, US-ASCII)#x]
++- OneRowRelation
+
+
+-- !query
+select encode(scol, ecol) from values('客舍青青柳色新', 'US-ASCII') as t(scol, ecol)
+-- !query analysis
+Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x]
+- SubqueryAlias t
+- LocalRelation [scol#x, ecol#x]
@@ -746,14 +788,14 @@ org.apache.spark.sql.AnalysisException
-- !query
select decode(encode('abc', 'utf-8'), 'utf-8')
-- !query analysis
-Project [decode(encode(abc, utf-8, false), utf-8) AS decode(encode(abc,
utf-8), utf-8)#x]
+Project [decode(encode(abc, utf-8), utf-8) AS decode(encode(abc, utf-8),
utf-8)#x]
+- OneRowRelation
-- !query
select decode(encode('大千世界', 'utf-32'), 'utf-32')
-- !query analysis
-Project [decode(encode(大千世界, utf-32, false), utf-32) AS decode(encode(大千世界,
utf-32), utf-32)#x]
+Project [decode(encode(大千世界, utf-32), utf-32) AS decode(encode(大千世界, utf-32),
utf-32)#x]
+- OneRowRelation
@@ -863,6 +905,48 @@ Project [decode(scol#x, ecol#x) AS decode(scol, ecol)#x]
+- LocalRelation [scol#x, ecol#x]
+-- !query
+set spark.sql.legacy.codingErrorAction=true
+-- !query analysis
+SetCommand (spark.sql.legacy.codingErrorAction,Some(true))
+
+
+-- !query
+select decode(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII')
+-- !query analysis
+Project [decode(0xE58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592, US-ASCII) AS
decode(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', US-ASCII)#x]
++- OneRowRelation
+
+
+-- !query
+select decode(scol, ecol) from
values(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') as t(scol,
ecol)
+-- !query analysis
+Project [decode(scol#x, ecol#x) AS decode(scol, ecol)#x]
++- SubqueryAlias t
+ +- LocalRelation [scol#x, ecol#x]
+
+
+-- !query
+set spark.sql.legacy.codingErrorAction=false
+-- !query analysis
+SetCommand (spark.sql.legacy.codingErrorAction,Some(false))
+
+
+-- !query
+select decode(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII')
+-- !query analysis
+Project [decode(0xE8A5BFE587BAE998B3E585B3E697A0E69585E4BABA, US-ASCII) AS
decode(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', US-ASCII)#x]
++- OneRowRelation
+
+
+-- !query
+select decode(scol, ecol) from
values(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') as t(scol,
ecol)
+-- !query analysis
+Project [decode(scol#x, ecol#x) AS decode(scol, ecol)#x]
++- SubqueryAlias t
+ +- LocalRelation [scol#x, ecol#x]
+
+
-- !query
SELECT CONTAINS(null, 'Spark')
-- !query analysis
diff --git
a/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out
b/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out
index c9b451187356..c7675b16384f 100644
---
a/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out
+++
b/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out
@@ -384,21 +384,21 @@ Project [btrim(xyxtrimyyx, xy) AS btrim(xyxtrimyyx, xy)#x]
-- !query
SELECT btrim(encode(" xyz ", 'utf-8'))
-- !query analysis
-Project [btrim(encode( xyz , utf-8, false)) AS btrim(encode( xyz , utf-8))#x]
+Project [btrim(encode( xyz , utf-8)) AS btrim(encode( xyz , utf-8))#x]
+- OneRowRelation
-- !query
SELECT btrim(encode('yxTomxx', 'utf-8'), encode('xyz', 'utf-8'))
-- !query analysis
-Project [btrim(encode(yxTomxx, utf-8, false), encode(xyz, utf-8, false)) AS
btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8))#x]
+Project [btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8)) AS
btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8))#x]
+- OneRowRelation
-- !query
SELECT btrim(encode('xxxbarxxx', 'utf-8'), encode('x', 'utf-8'))
-- !query analysis
-Project [btrim(encode(xxxbarxxx, utf-8, false), encode(x, utf-8, false)) AS
btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8))#x]
+Project [btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8)) AS
btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8))#x]
+- OneRowRelation
@@ -649,14 +649,14 @@ SetCommand (spark.sql.legacy.javaCharsets,Some(true))
-- !query
select encode('hello', 'WINDOWS-1252')
-- !query analysis
-Project [encode(hello, WINDOWS-1252, true) AS encode(hello, WINDOWS-1252)#x]
+Project [encode(hello, WINDOWS-1252) AS encode(hello, WINDOWS-1252)#x]
+- OneRowRelation
-- !query
select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol)
-- !query analysis
-Project [encode(scol#x, ecol#x, true) AS encode(scol, ecol)#x]
+Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x]
+- SubqueryAlias t
+- LocalRelation [scol#x, ecol#x]
@@ -670,14 +670,14 @@ SetCommand (spark.sql.legacy.javaCharsets,Some(false))
-- !query
select encode('hello', 'WINDOWS-1252')
-- !query analysis
-Project [encode(hello, WINDOWS-1252, false) AS encode(hello, WINDOWS-1252)#x]
+Project [encode(hello, WINDOWS-1252) AS encode(hello, WINDOWS-1252)#x]
+- OneRowRelation
-- !query
select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol)
-- !query analysis
-Project [encode(scol#x, ecol#x, false) AS encode(scol, ecol)#x]
+Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x]
+- SubqueryAlias t
+- LocalRelation [scol#x, ecol#x]
@@ -685,14 +685,56 @@ Project [encode(scol#x, ecol#x, false) AS encode(scol,
ecol)#x]
-- !query
select encode('hello', 'Windows-xxx')
-- !query analysis
-Project [encode(hello, Windows-xxx, false) AS encode(hello, Windows-xxx)#x]
+Project [encode(hello, Windows-xxx) AS encode(hello, Windows-xxx)#x]
+- OneRowRelation
-- !query
select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol)
-- !query analysis
-Project [encode(scol#x, ecol#x, false) AS encode(scol, ecol)#x]
+Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x]
++- SubqueryAlias t
+ +- LocalRelation [scol#x, ecol#x]
+
+
+-- !query
+set spark.sql.legacy.codingErrorAction=true
+-- !query analysis
+SetCommand (spark.sql.legacy.codingErrorAction,Some(true))
+
+
+-- !query
+select encode('渭城朝雨浥轻尘', 'US-ASCII')
+-- !query analysis
+Project [encode(渭城朝雨浥轻尘, US-ASCII) AS encode(渭城朝雨浥轻尘, US-ASCII)#x]
++- OneRowRelation
+
+
+-- !query
+select encode(scol, ecol) from values('渭城朝雨浥轻尘', 'US-ASCII') as t(scol, ecol)
+-- !query analysis
+Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x]
++- SubqueryAlias t
+ +- LocalRelation [scol#x, ecol#x]
+
+
+-- !query
+set spark.sql.legacy.codingErrorAction=false
+-- !query analysis
+SetCommand (spark.sql.legacy.codingErrorAction,Some(false))
+
+
+-- !query
+select encode('客舍青青柳色新', 'US-ASCII')
+-- !query analysis
+Project [encode(客舍青青柳色新, US-ASCII) AS encode(客舍青青柳色新, US-ASCII)#x]
++- OneRowRelation
+
+
+-- !query
+select encode(scol, ecol) from values('客舍青青柳色新', 'US-ASCII') as t(scol, ecol)
+-- !query analysis
+Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x]
+- SubqueryAlias t
+- LocalRelation [scol#x, ecol#x]
@@ -746,14 +788,14 @@ org.apache.spark.sql.AnalysisException
-- !query
select decode(encode('abc', 'utf-8'), 'utf-8')
-- !query analysis
-Project [decode(encode(abc, utf-8, false), utf-8) AS decode(encode(abc,
utf-8), utf-8)#x]
+Project [decode(encode(abc, utf-8), utf-8) AS decode(encode(abc, utf-8),
utf-8)#x]
+- OneRowRelation
-- !query
select decode(encode('大千世界', 'utf-32'), 'utf-32')
-- !query analysis
-Project [decode(encode(大千世界, utf-32, false), utf-32) AS decode(encode(大千世界,
utf-32), utf-32)#x]
+Project [decode(encode(大千世界, utf-32), utf-32) AS decode(encode(大千世界, utf-32),
utf-32)#x]
+- OneRowRelation
@@ -863,6 +905,48 @@ Project [decode(scol#x, ecol#x) AS decode(scol, ecol)#x]
+- LocalRelation [scol#x, ecol#x]
+-- !query
+set spark.sql.legacy.codingErrorAction=true
+-- !query analysis
+SetCommand (spark.sql.legacy.codingErrorAction,Some(true))
+
+
+-- !query
+select decode(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII')
+-- !query analysis
+Project [decode(0xE58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592, US-ASCII) AS
decode(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', US-ASCII)#x]
++- OneRowRelation
+
+
+-- !query
+select decode(scol, ecol) from
values(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') as t(scol,
ecol)
+-- !query analysis
+Project [decode(scol#x, ecol#x) AS decode(scol, ecol)#x]
++- SubqueryAlias t
+ +- LocalRelation [scol#x, ecol#x]
+
+
+-- !query
+set spark.sql.legacy.codingErrorAction=false
+-- !query analysis
+SetCommand (spark.sql.legacy.codingErrorAction,Some(false))
+
+
+-- !query
+select decode(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII')
+-- !query analysis
+Project [decode(0xE8A5BFE587BAE998B3E585B3E697A0E69585E4BABA, US-ASCII) AS
decode(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', US-ASCII)#x]
++- OneRowRelation
+
+
+-- !query
+select decode(scol, ecol) from
values(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') as t(scol,
ecol)
+-- !query analysis
+Project [decode(scol#x, ecol#x) AS decode(scol, ecol)#x]
++- SubqueryAlias t
+ +- LocalRelation [scol#x, ecol#x]
+
+
-- !query
SELECT CONTAINS(null, 'Spark')
-- !query analysis
diff --git
a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/concat.sql.out
b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/concat.sql.out
index b3c5034656e2..62e3a8747326 100644
---
a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/concat.sql.out
+++
b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/concat.sql.out
@@ -11,7 +11,7 @@ FROM (
-- !query analysis
Project [concat(concat(cast(col1#xL as string), col2#x), cast(col3#x as
string)) AS col#x]
+- SubqueryAlias __auto_generated_subquery_name
- +- Project [id#xL AS col1#xL, cast((id#xL + cast(1 as bigint)) as string)
AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS
col3#x]
+ +- Project [id#xL AS col1#xL, cast((id#xL + cast(1 as bigint)) as string)
AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x]
+- Range (0, 10, step=1)
@@ -29,7 +29,7 @@ FROM (
-- !query analysis
Project [concat(concat(concat(col1#x, cast(col2#xL as string)), concat(col3#x,
cast(col4#x as string))), cast(col5#x as string)) AS col#x]
+- SubqueryAlias __auto_generated_subquery_name
- +- Project [prefix_ AS col1#x, id#xL AS col2#xL, cast((id#xL + cast(1 as
bigint)) as string) AS col3#x, encode(cast((id#xL + cast(2 as bigint)) as
string), utf-8, false) AS col4#x, cast(id#xL as double) AS col5#x]
+ +- Project [prefix_ AS col1#x, id#xL AS col2#xL, cast((id#xL + cast(1 as
bigint)) as string) AS col3#x, encode(cast((id#xL + cast(2 as bigint)) as
string), utf-8) AS col4#x, cast(id#xL as double) AS col5#x]
+- Range (0, 10, step=1)
@@ -46,7 +46,7 @@ FROM (
-- !query analysis
Project [concat(concat(col1#x, col2#x), cast(concat(col3#x, col4#x) as
string)) AS col#x]
+- SubqueryAlias __auto_generated_subquery_name
- +- Project [cast(id#xL as string) AS col1#x, cast((id#xL + cast(1 as
bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as
string), utf-8, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as
string), utf-8, false) AS col4#x]
+ +- Project [cast(id#xL as string) AS col1#x, cast((id#xL + cast(1 as
bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as
string), utf-8) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string),
utf-8) AS col4#x]
+- Range (0, 10, step=1)
@@ -67,7 +67,7 @@ FROM (
-- !query analysis
Project [concat(cast(col1#x as string), cast(col2#x as string)) AS col#x]
+- SubqueryAlias __auto_generated_subquery_name
- +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x,
encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x]
+ +- Project [encode(cast(id#xL as string), utf-8) AS col1#x,
encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x]
+- Range (0, 10, step=1)
@@ -84,7 +84,7 @@ FROM (
-- !query analysis
Project [concat(concat(concat(cast(col1#x as string), cast(col2#x as string)),
cast(col3#x as string)), cast(col4#x as string)) AS col#x]
+- SubqueryAlias __auto_generated_subquery_name
- +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x,
encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x,
encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x,
encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false) AS col4#x]
+ +- Project [encode(cast(id#xL as string), utf-8) AS col1#x,
encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x,
encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x,
encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x]
+- Range (0, 10, step=1)
@@ -101,7 +101,7 @@ FROM (
-- !query analysis
Project [concat(concat(cast(col1#x as string), cast(col2#x as string)),
concat(cast(col3#x as string), cast(col4#x as string))) AS col#x]
+- SubqueryAlias __auto_generated_subquery_name
- +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x,
encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x,
encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x,
encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false) AS col4#x]
+ +- Project [encode(cast(id#xL as string), utf-8) AS col1#x,
encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x,
encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x,
encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x]
+- Range (0, 10, step=1)
@@ -122,7 +122,7 @@ FROM (
-- !query analysis
Project [concat(col1#x, col2#x) AS col#x]
+- SubqueryAlias __auto_generated_subquery_name
- +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x,
encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x]
+ +- Project [encode(cast(id#xL as string), utf-8) AS col1#x,
encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x]
+- Range (0, 10, step=1)
@@ -139,7 +139,7 @@ FROM (
-- !query analysis
Project [concat(concat(concat(col1#x, col2#x), col3#x), col4#x) AS col#x]
+- SubqueryAlias __auto_generated_subquery_name
- +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x,
encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x,
encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x,
encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false) AS col4#x]
+ +- Project [encode(cast(id#xL as string), utf-8) AS col1#x,
encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x,
encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x,
encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x]
+- Range (0, 10, step=1)
@@ -156,7 +156,7 @@ FROM (
-- !query analysis
Project [concat(concat(col1#x, col2#x), concat(col3#x, col4#x)) AS col#x]
+- SubqueryAlias __auto_generated_subquery_name
- +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x,
encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x,
encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x,
encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false) AS col4#x]
+ +- Project [encode(cast(id#xL as string), utf-8) AS col1#x,
encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x,
encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x,
encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x]
+- Range (0, 10, step=1)
diff --git
a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/elt.sql.out
b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/elt.sql.out
index 60b7fa711791..f4902012f0f9 100644
---
a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/elt.sql.out
+++
b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/elt.sql.out
@@ -13,7 +13,7 @@ FROM (
-- !query analysis
Project [elt(2, col1#x, cast(col2#xL as string), col3#x, cast(col4#x as
string), cast(col5#x as string), false) AS col#x]
+- SubqueryAlias __auto_generated_subquery_name
- +- Project [prefix_ AS col1#x, id#xL AS col2#xL, cast((id#xL + cast(1 as
bigint)) as string) AS col3#x, encode(cast((id#xL + cast(2 as bigint)) as
string), utf-8, false) AS col4#x, cast(id#xL as double) AS col5#x]
+ +- Project [prefix_ AS col1#x, id#xL AS col2#xL, cast((id#xL + cast(1 as
bigint)) as string) AS col3#x, encode(cast((id#xL + cast(2 as bigint)) as
string), utf-8) AS col4#x, cast(id#xL as double) AS col5#x]
+- Range (0, 10, step=1)
@@ -30,7 +30,7 @@ FROM (
-- !query analysis
Project [elt(3, col1#x, col2#x, cast(col3#x as string), cast(col4#x as
string), false) AS col#x]
+- SubqueryAlias __auto_generated_subquery_name
- +- Project [cast(id#xL as string) AS col1#x, cast((id#xL + cast(1 as
bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as
string), utf-8, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as
string), utf-8, false) AS col4#x]
+ +- Project [cast(id#xL as string) AS col1#x, cast((id#xL + cast(1 as
bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as
string), utf-8) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string),
utf-8) AS col4#x]
+- Range (0, 10, step=1)
@@ -51,7 +51,7 @@ FROM (
-- !query analysis
Project [elt(1, cast(col1#x as string), cast(col2#x as string), false) AS
col#x]
+- SubqueryAlias __auto_generated_subquery_name
- +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x,
encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x]
+ +- Project [encode(cast(id#xL as string), utf-8) AS col1#x,
encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x]
+- Range (0, 10, step=1)
@@ -72,5 +72,5 @@ FROM (
-- !query analysis
Project [elt(2, col1#x, col2#x, false) AS col#x]
+- SubqueryAlias __auto_generated_subquery_name
- +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x,
encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x]
+ +- Project [encode(cast(id#xL as string), utf-8) AS col1#x,
encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x]
+- Range (0, 10, step=1)
diff --git a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql
b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql
index 733720a7e21b..0d9c0f3a6a14 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql
@@ -126,6 +126,12 @@ select encode('hello', 'WINDOWS-1252');
select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol,
ecol);
select encode('hello', 'Windows-xxx');
select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol);
+set spark.sql.legacy.codingErrorAction=true;
+select encode('渭城朝雨浥轻尘', 'US-ASCII');
+select encode(scol, ecol) from values('渭城朝雨浥轻尘', 'US-ASCII') as t(scol, ecol);
+set spark.sql.legacy.codingErrorAction=false;
+select encode('客舍青青柳色新', 'US-ASCII');
+select encode(scol, ecol) from values('客舍青青柳色新', 'US-ASCII') as t(scol, ecol);
-- decode
select decode();
@@ -147,6 +153,12 @@ select decode(scol, ecol) from values(X'68656c6c6f',
'WINDOWS-1252') as t(scol,
set spark.sql.legacy.javaCharsets=false;
select decode(X'68656c6c6f', 'WINDOWS-1252');
select decode(scol, ecol) from values(X'68656c6c6f', 'WINDOWS-1252') as
t(scol, ecol);
+set spark.sql.legacy.codingErrorAction=true;
+select decode(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII');
+select decode(scol, ecol) from
values(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') as t(scol,
ecol);
+set spark.sql.legacy.codingErrorAction=false;
+select decode(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII');
+select decode(scol, ecol) from
values(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') as t(scol,
ecol);
-- contains
SELECT CONTAINS(null, 'Spark');
diff --git
a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out
b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out
index 09d4f8892fa4..9f72e215ea54 100644
---
a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out
+++
b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out
@@ -903,6 +903,70 @@ org.apache.spark.SparkIllegalArgumentException
}
+-- !query
+set spark.sql.legacy.codingErrorAction=true
+-- !query schema
+struct<key:string,value:string>
+-- !query output
+spark.sql.legacy.codingErrorAction true
+
+
+-- !query
+select encode('渭城朝雨浥轻尘', 'US-ASCII')
+-- !query schema
+struct<encode(渭城朝雨浥轻尘, US-ASCII):binary>
+-- !query output
+???????
+
+
+-- !query
+select encode(scol, ecol) from values('渭城朝雨浥轻尘', 'US-ASCII') as t(scol, ecol)
+-- !query schema
+struct<encode(scol, ecol):binary>
+-- !query output
+???????
+
+
+-- !query
+set spark.sql.legacy.codingErrorAction=false
+-- !query schema
+struct<key:string,value:string>
+-- !query output
+spark.sql.legacy.codingErrorAction false
+
+
+-- !query
+select encode('客舍青青柳色新', 'US-ASCII')
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.SparkRuntimeException
+{
+ "errorClass" : "MALFORMED_CHARACTER_CODING",
+ "sqlState" : "22000",
+ "messageParameters" : {
+ "charset" : "US-ASCII",
+ "function" : "`encode`"
+ }
+}
+
+
+-- !query
+select encode(scol, ecol) from values('客舍青青柳色新', 'US-ASCII') as t(scol, ecol)
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.SparkRuntimeException
+{
+ "errorClass" : "MALFORMED_CHARACTER_CODING",
+ "sqlState" : "22000",
+ "messageParameters" : {
+ "charset" : "US-ASCII",
+ "function" : "`encode`"
+ }
+}
+
+
-- !query
select decode()
-- !query schema
@@ -1125,6 +1189,70 @@ org.apache.spark.SparkIllegalArgumentException
}
+-- !query
+set spark.sql.legacy.codingErrorAction=true
+-- !query schema
+struct<key:string,value:string>
+-- !query output
+spark.sql.legacy.codingErrorAction true
+
+
+-- !query
+select decode(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII')
+-- !query schema
+struct<decode(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', US-ASCII):string>
+-- !query output
+���������������������
+
+
+-- !query
+select decode(scol, ecol) from
values(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') as t(scol,
ecol)
+-- !query schema
+struct<decode(scol, ecol):string>
+-- !query output
+���������������������
+
+
+-- !query
+set spark.sql.legacy.codingErrorAction=false
+-- !query schema
+struct<key:string,value:string>
+-- !query output
+spark.sql.legacy.codingErrorAction false
+
+
+-- !query
+select decode(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII')
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.SparkRuntimeException
+{
+ "errorClass" : "MALFORMED_CHARACTER_CODING",
+ "sqlState" : "22000",
+ "messageParameters" : {
+ "charset" : "US-ASCII",
+ "function" : "`decode`"
+ }
+}
+
+
+-- !query
+select decode(scol, ecol) from
values(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') as t(scol,
ecol)
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.SparkRuntimeException
+{
+ "errorClass" : "MALFORMED_CHARACTER_CODING",
+ "sqlState" : "22000",
+ "messageParameters" : {
+ "charset" : "US-ASCII",
+ "function" : "`decode`"
+ }
+}
+
+
-- !query
SELECT CONTAINS(null, 'Spark')
-- !query schema
diff --git
a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out
b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out
index 506524840f10..e6778cb539bd 100644
--- a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out
@@ -835,6 +835,70 @@ org.apache.spark.SparkIllegalArgumentException
}
+-- !query
+set spark.sql.legacy.codingErrorAction=true
+-- !query schema
+struct<key:string,value:string>
+-- !query output
+spark.sql.legacy.codingErrorAction true
+
+
+-- !query
+select encode('渭城朝雨浥轻尘', 'US-ASCII')
+-- !query schema
+struct<encode(渭城朝雨浥轻尘, US-ASCII):binary>
+-- !query output
+???????
+
+
+-- !query
+select encode(scol, ecol) from values('渭城朝雨浥轻尘', 'US-ASCII') as t(scol, ecol)
+-- !query schema
+struct<encode(scol, ecol):binary>
+-- !query output
+???????
+
+
+-- !query
+set spark.sql.legacy.codingErrorAction=false
+-- !query schema
+struct<key:string,value:string>
+-- !query output
+spark.sql.legacy.codingErrorAction false
+
+
+-- !query
+select encode('客舍青青柳色新', 'US-ASCII')
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.SparkRuntimeException
+{
+ "errorClass" : "MALFORMED_CHARACTER_CODING",
+ "sqlState" : "22000",
+ "messageParameters" : {
+ "charset" : "US-ASCII",
+ "function" : "`encode`"
+ }
+}
+
+
+-- !query
+select encode(scol, ecol) from values('客舍青青柳色新', 'US-ASCII') as t(scol, ecol)
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.SparkRuntimeException
+{
+ "errorClass" : "MALFORMED_CHARACTER_CODING",
+ "sqlState" : "22000",
+ "messageParameters" : {
+ "charset" : "US-ASCII",
+ "function" : "`encode`"
+ }
+}
+
+
-- !query
select decode()
-- !query schema
@@ -1057,6 +1121,70 @@ org.apache.spark.SparkIllegalArgumentException
}
+-- !query
+set spark.sql.legacy.codingErrorAction=true
+-- !query schema
+struct<key:string,value:string>
+-- !query output
+spark.sql.legacy.codingErrorAction true
+
+
+-- !query
+select decode(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII')
+-- !query schema
+struct<decode(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', US-ASCII):string>
+-- !query output
+���������������������
+
+
+-- !query
+select decode(scol, ecol) from
values(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') as t(scol,
ecol)
+-- !query schema
+struct<decode(scol, ecol):string>
+-- !query output
+���������������������
+
+
+-- !query
+set spark.sql.legacy.codingErrorAction=false
+-- !query schema
+struct<key:string,value:string>
+-- !query output
+spark.sql.legacy.codingErrorAction false
+
+
+-- !query
+select decode(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII')
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.SparkRuntimeException
+{
+ "errorClass" : "MALFORMED_CHARACTER_CODING",
+ "sqlState" : "22000",
+ "messageParameters" : {
+ "charset" : "US-ASCII",
+ "function" : "`decode`"
+ }
+}
+
+
+-- !query
+select decode(scol, ecol) from
values(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') as t(scol,
ecol)
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.SparkRuntimeException
+{
+ "errorClass" : "MALFORMED_CHARACTER_CODING",
+ "sqlState" : "22000",
+ "messageParameters" : {
+ "charset" : "US-ASCII",
+ "function" : "`decode`"
+ }
+}
+
+
-- !query
SELECT CONTAINS(null, 'Spark')
-- !query schema
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala
index b2aaaceb26ab..22fdd96ce6ba 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala
@@ -192,9 +192,11 @@ class ExplainSuite extends ExplainSuiteHelper with
DisableAdaptiveExecutionSuite
|)
""".stripMargin)
checkKeywordsExistsInExplain(df2,
- "Project [concat(cast(id#xL as string), cast((id#xL + 1) as string), "
+
- "cast(encode(cast((id#xL + 2) as string), utf-8, false) as string),
" +
- "cast(encode(cast((id#xL + 3) as string), utf-8, false) as string))
AS col#x]")
+ "Project [concat(concat(col1#x, col2#x), cast(concat(col3#x, col4#x)
as string)) AS col#x]",
+ "Project [cast(id#xL as string) AS col1#x, " +
+ "cast((id#xL + cast(1 as bigint)) as string) AS col2#x, " +
+ "encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS
col3#x, " +
+ "encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS
col4#x]")
val df3 = sql(
"""
@@ -208,9 +210,10 @@ class ExplainSuite extends ExplainSuiteHelper with
DisableAdaptiveExecutionSuite
|)
""".stripMargin)
checkKeywordsExistsInExplain(df3,
- "Project [concat(cast(id#xL as string), " +
- "cast(encode(cast((id#xL + 2) as string), utf-8, false) as string),
" +
- "cast(encode(cast((id#xL + 3) as string), utf-8, false) as string))
AS col#x]")
+ "Project [concat(col1#x, cast(concat(col3#x, col4#x) as string)) AS
col#x]",
+ "Project [cast(id#xL as string) AS col1#x, " +
+ "encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS
col3#x, " +
+ "encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS
col4#x]")
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]