This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new e93bff6fc0bc [SPARK-46187][SQL] Align codegen and non-codegen implementation of `StringDecode` e93bff6fc0bc is described below commit e93bff6fc0bc3a549de02958ffc17b1bca3d50b8 Author: Max Gekk <max.g...@gmail.com> AuthorDate: Fri Dec 1 10:42:31 2023 +0100 [SPARK-46187][SQL] Align codegen and non-codegen implementation of `StringDecode` ### What changes were proposed in this pull request? In the PR, I propose to change the implementation of the interpreted mode of `StringDecode`, and consequently of the `decode` function, to make it consistent with codegen. Both implementations raise the same error with the error class `INVALID_PARAMETER_VALUE.CHARSET`. ### Why are the changes needed? To make codegen and non-codegen of the `StringDecode` expression consistent. So, users will observe the same behaviour in both modes. ### Does this PR introduce _any_ user-facing change? Yes, if user code depends on the error from `decode()`. ### How was this patch tested? By running the following test suites: ``` $ PYSPARK_PYTHON=python3 build/sbt "sql/testOnly org.apache.spark.sql.SQLQueryTestSuite -- -z string-functions.sql" $ build/sbt "core/testOnly *SparkThrowableSuite" $ build/sbt "test:testOnly *.StringFunctionsSuite" ``` ### Was this patch authored or co-authored using generative AI tooling? No. Closes #44094 from MaxGekk/align-codegen-stringdecode. 
Authored-by: Max Gekk <max.g...@gmail.com> Signed-off-by: Max Gekk <max.g...@gmail.com> --- .../catalyst/expressions/stringExpressions.scala | 18 ++++++++---- .../analyzer-results/ansi/string-functions.sql.out | 15 ++++++++++ .../analyzer-results/string-functions.sql.out | 15 ++++++++++ .../sql-tests/inputs/string-functions.sql | 2 ++ .../results/ansi/string-functions.sql.out | 34 ++++++++++++++++++++++ .../sql-tests/results/string-functions.sql.out | 34 ++++++++++++++++++++++ 6 files changed, 113 insertions(+), 5 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 412422f4da4e..84a5eebd70ec 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -2648,18 +2648,26 @@ case class StringDecode(bin: Expression, charset: Expression) protected override def nullSafeEval(input1: Any, input2: Any): Any = { val fromCharset = input2.asInstanceOf[UTF8String].toString - UTF8String.fromString(new String(input1.asInstanceOf[Array[Byte]], fromCharset)) + try { + UTF8String.fromString(new String(input1.asInstanceOf[Array[Byte]], fromCharset)) + } catch { + case _: UnsupportedEncodingException => + throw QueryExecutionErrors.invalidCharsetError(prettyName, fromCharset) + } } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - nullSafeCodeGen(ctx, ev, (bytes, charset) => + nullSafeCodeGen(ctx, ev, (bytes, charset) => { + val fromCharset = ctx.freshName("fromCharset") s""" + String $fromCharset = $charset.toString(); try { - ${ev.value} = UTF8String.fromString(new String($bytes, $charset.toString())); + ${ev.value} = UTF8String.fromString(new String($bytes, $fromCharset)); } catch (java.io.UnsupportedEncodingException e) { - 
org.apache.spark.unsafe.Platform.throwException(e); + throw QueryExecutionErrors.invalidCharsetError("$prettyName", $fromCharset); } - """) + """ + }) } override protected def withNewChildrenInternal( diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out index 9d8705e3e862..7ace300001d5 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out @@ -799,6 +799,21 @@ Project [decode(null, 6, Spark, null, SQL, 4, rocks, null, .) AS decode(NULL, 6, +- OneRowRelation +-- !query +select decode(X'68656c6c6f', 'Windows-xxx') +-- !query analysis +Project [decode(0x68656C6C6F, Windows-xxx) AS decode(X'68656C6C6F', Windows-xxx)#x] ++- OneRowRelation + + +-- !query +select decode(scol, ecol) from values(X'68656c6c6f', 'Windows-xxx') as t(scol, ecol) +-- !query analysis +Project [decode(scol#x, ecol#x) AS decode(scol, ecol)#x] ++- SubqueryAlias t + +- LocalRelation [scol#x, ecol#x] + + -- !query SELECT CONTAINS(null, 'Spark') -- !query analysis diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out index 9d8705e3e862..7ace300001d5 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out @@ -799,6 +799,21 @@ Project [decode(null, 6, Spark, null, SQL, 4, rocks, null, .) 
AS decode(NULL, 6, +- OneRowRelation +-- !query +select decode(X'68656c6c6f', 'Windows-xxx') +-- !query analysis +Project [decode(0x68656C6C6F, Windows-xxx) AS decode(X'68656C6C6F', Windows-xxx)#x] ++- OneRowRelation + + +-- !query +select decode(scol, ecol) from values(X'68656c6c6f', 'Windows-xxx') as t(scol, ecol) +-- !query analysis +Project [decode(scol#x, ecol#x) AS decode(scol, ecol)#x] ++- SubqueryAlias t + +- LocalRelation [scol#x, ecol#x] + + -- !query SELECT CONTAINS(null, 'Spark') -- !query analysis diff --git a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql index 645f6bcb8327..a90557eba7c7 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql @@ -138,6 +138,8 @@ select decode(6, 1, 'Southlake', 2, 'San Francisco', 3, 'New Jersey', 4, 'Seattl select decode(6, 1, 'Southlake', 2, 'San Francisco', 3, 'New Jersey', 4, 'Seattle'); select decode(null, 6, 'Spark', NULL, 'SQL', 4, 'rocks'); select decode(null, 6, 'Spark', NULL, 'SQL', 4, 'rocks', NULL, '.'); +select decode(X'68656c6c6f', 'Windows-xxx'); +select decode(scol, ecol) from values(X'68656c6c6f', 'Windows-xxx') as t(scol, ecol); -- contains SELECT CONTAINS(null, 'Spark'); diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out index 89bb20fc1bff..813200b8bb0e 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out @@ -1017,6 +1017,40 @@ struct<decode(NULL, 6, Spark, NULL, SQL, 4, rocks, NULL, .):string> SQL +-- !query +select decode(X'68656c6c6f', 'Windows-xxx') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkIllegalArgumentException +{ + "errorClass" : 
"INVALID_PARAMETER_VALUE.CHARSET", + "sqlState" : "22023", + "messageParameters" : { + "charset" : "Windows-xxx", + "functionName" : "`decode`", + "parameter" : "`charset`" + } +} + + +-- !query +select decode(scol, ecol) from values(X'68656c6c6f', 'Windows-xxx') as t(scol, ecol) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkIllegalArgumentException +{ + "errorClass" : "INVALID_PARAMETER_VALUE.CHARSET", + "sqlState" : "22023", + "messageParameters" : { + "charset" : "Windows-xxx", + "functionName" : "`decode`", + "parameter" : "`charset`" + } +} + + -- !query SELECT CONTAINS(null, 'Spark') -- !query schema diff --git a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out index 6d90a5091578..56782b9a7534 100644 --- a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out @@ -949,6 +949,40 @@ struct<decode(NULL, 6, Spark, NULL, SQL, 4, rocks, NULL, .):string> SQL +-- !query +select decode(X'68656c6c6f', 'Windows-xxx') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkIllegalArgumentException +{ + "errorClass" : "INVALID_PARAMETER_VALUE.CHARSET", + "sqlState" : "22023", + "messageParameters" : { + "charset" : "Windows-xxx", + "functionName" : "`decode`", + "parameter" : "`charset`" + } +} + + +-- !query +select decode(scol, ecol) from values(X'68656c6c6f', 'Windows-xxx') as t(scol, ecol) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkIllegalArgumentException +{ + "errorClass" : "INVALID_PARAMETER_VALUE.CHARSET", + "sqlState" : "22023", + "messageParameters" : { + "charset" : "Windows-xxx", + "functionName" : "`decode`", + "parameter" : "`charset`" + } +} + + -- !query SELECT CONTAINS(null, 'Spark') -- !query schema --------------------------------------------------------------------- To unsubscribe, e-mail: 
commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org