This is an automated email from the ASF dual-hosted git repository.
maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new e470e7442f9 [SPARK-45887][SQL] Align codegen and non-codegen
implementation of `Encode`
e470e7442f9 is described below
commit e470e7442f90f7189346993d76733cafe7469ada
Author: Max Gekk <[email protected]>
AuthorDate: Thu Nov 23 18:33:13 2023 +0300
[SPARK-45887][SQL] Align codegen and non-codegen implementation of `Encode`
### What changes were proposed in this pull request?
In the PR, I propose to change the implementation of interpretation mode,
and make it consistent to codegen. Both implementation raise the same error
with new error class `INVALID_PARAMETER_VALUE.CHARSET`.
### Why are the changes needed?
To make codegen and non-codegen of the `Encode` expression consistent. So,
users will observe the same behaviour in both modes.
### Does this PR introduce _any_ user-facing change?
Yes, if user code depends on error from `encode()`.
### How was this patch tested?
By running the following test suites:
```
$ PYSPARK_PYTHON=python3 build/sbt "sql/testOnly
org.apache.spark.sql.SQLQueryTestSuite -- -z string-functions.sql"
$ build/sbt "core/testOnly *SparkThrowableSuite"
$ build/sbt "test:testOnly *.StringFunctionsSuite"
```
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #43759 from MaxGekk/restrict-charsets-in-encode.
Authored-by: Max Gekk <[email protected]>
Signed-off-by: Max Gekk <[email protected]>
---
.../src/main/resources/error/error-classes.json | 5 ++++
...nditions-invalid-parameter-value-error-class.md | 4 +++
.../catalyst/expressions/stringExpressions.scala | 19 ++++++++----
.../spark/sql/errors/QueryExecutionErrors.scala | 9 ++++++
.../analyzer-results/ansi/string-functions.sql.out | 15 ++++++++++
.../analyzer-results/string-functions.sql.out | 15 ++++++++++
.../sql-tests/inputs/string-functions.sql | 4 +++
.../results/ansi/string-functions.sql.out | 34 ++++++++++++++++++++++
.../sql-tests/results/string-functions.sql.out | 34 ++++++++++++++++++++++
9 files changed, 134 insertions(+), 5 deletions(-)
diff --git a/common/utils/src/main/resources/error/error-classes.json
b/common/utils/src/main/resources/error/error-classes.json
index b0621c44532..19b70307a1c 100644
--- a/common/utils/src/main/resources/error/error-classes.json
+++ b/common/utils/src/main/resources/error/error-classes.json
@@ -2042,6 +2042,11 @@
"expects an integer value in [0, <upper>), but got <invalidValue>."
]
},
+ "CHARSET" : {
+ "message" : [
+ "expects one of the charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8',
'UTF-16BE', 'UTF-16LE', 'UTF-16', but got <charset>."
+ ]
+ },
"DATETIME_UNIT" : {
"message" : [
"expects one of the units without quotes YEAR, QUARTER, MONTH, WEEK,
DAY, DAYOFYEAR, HOUR, MINUTE, SECOND, MILLISECOND, MICROSECOND, but got the
string literal <invalidValue>."
diff --git a/docs/sql-error-conditions-invalid-parameter-value-error-class.md
b/docs/sql-error-conditions-invalid-parameter-value-error-class.md
index d58d4fc2f59..8547d8b31f0 100644
--- a/docs/sql-error-conditions-invalid-parameter-value-error-class.md
+++ b/docs/sql-error-conditions-invalid-parameter-value-error-class.md
@@ -45,6 +45,10 @@ expects one of binary formats 'base64', 'hex', 'utf-8', but
got `<invalidFormat>
expects an integer value in [0, `<upper>`), but got `<invalidValue>`.
+## CHARSET
+
+expects one of the charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE',
'UTF-16LE', 'UTF-16', but got `<charset>`.
+
## DATETIME_UNIT
expects one of the units without quotes YEAR, QUARTER, MONTH, WEEK, DAY,
DAYOFYEAR, HOUR, MINUTE, SECOND, MILLISECOND, MICROSECOND, but got the string
literal `<invalidValue>`.
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
index 811d6e013ab..0d3239423b2 100755
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -17,6 +17,7 @@
package org.apache.spark.sql.catalyst.expressions
+import java.io.UnsupportedEncodingException
import java.text.{BreakIterator, DecimalFormat, DecimalFormatSymbols}
import java.util.{Base64 => JBase64}
import java.util.{HashMap, Locale, Map => JMap}
@@ -2694,17 +2695,25 @@ case class Encode(value: Expression, charset:
Expression)
protected override def nullSafeEval(input1: Any, input2: Any): Any = {
val toCharset = input2.asInstanceOf[UTF8String].toString
- input1.asInstanceOf[UTF8String].toString.getBytes(toCharset)
+ try {
+ input1.asInstanceOf[UTF8String].toString.getBytes(toCharset)
+ } catch {
+ case _: UnsupportedEncodingException =>
+ throw QueryExecutionErrors.invalidCharsetError(prettyName, toCharset)
+ }
}
override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
- nullSafeCodeGen(ctx, ev, (string, charset) =>
+ nullSafeCodeGen(ctx, ev, (string, charset) => {
+ val toCharset = ctx.freshName("toCharset")
s"""
+ String $toCharset = $charset.toString();
try {
- ${ev.value} = $string.toString().getBytes($charset.toString());
+ ${ev.value} = $string.toString().getBytes($toCharset);
} catch (java.io.UnsupportedEncodingException e) {
- org.apache.spark.unsafe.Platform.throwException(e);
- }""")
+ throw QueryExecutionErrors.invalidCharsetError("$prettyName",
$toCharset);
+ }"""
+ })
}
override protected def withNewChildrenInternal(
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala
index 0abb202a10f..1aa25a51fa9 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala
@@ -2758,4 +2758,13 @@ private[sql] object QueryExecutionErrors extends
QueryErrorsBase with ExecutionE
"upper" -> size.toString,
"invalidValue" -> pos.toString))
}
+
+ def invalidCharsetError(functionName: String, charset: String):
RuntimeException = {
+ new SparkIllegalArgumentException(
+ errorClass = "INVALID_PARAMETER_VALUE.CHARSET",
+ messageParameters = Map(
+ "functionName" -> toSQLId(functionName),
+ "parameter" -> toSQLId("charset"),
+ "charset" -> charset))
+ }
}
diff --git
a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out
b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out
index baed5a62772..9c210a713de 100644
---
a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out
+++
b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out
@@ -640,6 +640,21 @@ Project [rpad(cast(0x57 as string), 5, abc) AS rpad(X'57',
5, abc)#x]
+- OneRowRelation
+-- !query
+select encode('hello', 'Windows-xxx')
+-- !query analysis
+Project [encode(hello, Windows-xxx) AS encode(hello, Windows-xxx)#x]
++- OneRowRelation
+
+
+-- !query
+select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol)
+-- !query analysis
+Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x]
++- SubqueryAlias t
+ +- LocalRelation [scol#x, ecol#x]
+
+
-- !query
select decode()
-- !query analysis
diff --git
a/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out
b/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out
index baed5a62772..9c210a713de 100644
---
a/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out
+++
b/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out
@@ -640,6 +640,21 @@ Project [rpad(cast(0x57 as string), 5, abc) AS rpad(X'57',
5, abc)#x]
+- OneRowRelation
+-- !query
+select encode('hello', 'Windows-xxx')
+-- !query analysis
+Project [encode(hello, Windows-xxx) AS encode(hello, Windows-xxx)#x]
++- OneRowRelation
+
+
+-- !query
+select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol)
+-- !query analysis
+Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x]
++- SubqueryAlias t
+ +- LocalRelation [scol#x, ecol#x]
+
+
-- !query
select decode()
-- !query analysis
diff --git a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql
b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql
index 3ef0d0f0cfb..0fbf211ec5c 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql
@@ -117,6 +117,10 @@ SELECT lpad(x'57', 5, 'abc');
SELECT rpad('abc', 5, x'57');
SELECT rpad(x'57', 5, 'abc');
+-- encode
+select encode('hello', 'Windows-xxx');
+select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol);
+
-- decode
select decode();
select decode(encode('abc', 'utf-8'));
diff --git
a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out
b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out
index 61cea27d290..082ff03efac 100644
---
a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out
+++
b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out
@@ -803,6 +803,40 @@ struct<rpad(X'57', 5, abc):string>
Wabca
+-- !query
+select encode('hello', 'Windows-xxx')
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.SparkIllegalArgumentException
+{
+ "errorClass" : "INVALID_PARAMETER_VALUE.CHARSET",
+ "sqlState" : "22023",
+ "messageParameters" : {
+ "charset" : "Windows-xxx",
+ "functionName" : "`encode`",
+ "parameter" : "`charset`"
+ }
+}
+
+
+-- !query
+select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol)
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.SparkIllegalArgumentException
+{
+ "errorClass" : "INVALID_PARAMETER_VALUE.CHARSET",
+ "sqlState" : "22023",
+ "messageParameters" : {
+ "charset" : "Windows-xxx",
+ "functionName" : "`encode`",
+ "parameter" : "`charset`"
+ }
+}
+
+
-- !query
select decode()
-- !query schema
diff --git
a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out
b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out
index d4d61ac9ef1..79140920378 100644
--- a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out
@@ -735,6 +735,40 @@ struct<rpad(X'57', 5, abc):string>
Wabca
+-- !query
+select encode('hello', 'Windows-xxx')
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.SparkIllegalArgumentException
+{
+ "errorClass" : "INVALID_PARAMETER_VALUE.CHARSET",
+ "sqlState" : "22023",
+ "messageParameters" : {
+ "charset" : "Windows-xxx",
+ "functionName" : "`encode`",
+ "parameter" : "`charset`"
+ }
+}
+
+
+-- !query
+select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol)
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.SparkIllegalArgumentException
+{
+ "errorClass" : "INVALID_PARAMETER_VALUE.CHARSET",
+ "sqlState" : "22023",
+ "messageParameters" : {
+ "charset" : "Windows-xxx",
+ "functionName" : "`encode`",
+ "parameter" : "`charset`"
+ }
+}
+
+
-- !query
select decode()
-- !query schema
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]