This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 52b94a6f6a9 [SPARK-46220][SQL] Restrict charsets in `decode()` 52b94a6f6a9 is described below commit 52b94a6f6a9335ebfaee1456f78d1943c24694d7 Author: Max Gekk <max.g...@gmail.com> AuthorDate: Mon Dec 4 10:42:40 2023 +0900 [SPARK-46220][SQL] Restrict charsets in `decode()` ### What changes were proposed in this pull request? In the PR, I propose to restrict the supported charsets in the `decode()` function to the list from [the doc](https://spark.apache.org/docs/latest/api/sql/#decode): ``` 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16' ``` and use the existing SQL config `spark.sql.legacy.javaCharsets` for restoring the previous behaviour. ### Why are the changes needed? Currently, the list of supported charsets in `decode()` is not stable and depends entirely on the JDK version in use. So, sometimes user code might stop working because a DevOps engineer changed the Java version in a Spark cluster. ### Does this PR introduce _any_ user-facing change? Yes. `decode()` now fails with the `INVALID_PARAMETER_VALUE.CHARSET` error for charsets outside the list above unless `spark.sql.legacy.javaCharsets` is set to `true`. ### How was this patch tested? By running new checks: ``` $ PYSPARK_PYTHON=python3 build/sbt "sql/testOnly org.apache.spark.sql.SQLQueryTestSuite -- -z string-functions.sql" ``` ### Was this patch authored or co-authored using generative AI tooling? No. Closes #44131 from MaxGekk/restrict-charsets-in-stringdecode. 
Authored-by: Max Gekk <max.g...@gmail.com> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- .../explain-results/function_decode.explain | 2 +- docs/sql-migration-guide.md | 2 +- .../catalyst/expressions/stringExpressions.scala | 25 +++++++- .../analyzer-results/ansi/string-functions.sql.out | 42 ++++++++++++++ .../analyzer-results/string-functions.sql.out | 42 ++++++++++++++ .../sql-tests/inputs/string-functions.sql | 6 ++ .../results/ansi/string-functions.sql.out | 66 ++++++++++++++++++++++ .../sql-tests/results/string-functions.sql.out | 66 ++++++++++++++++++++++ 8 files changed, 246 insertions(+), 5 deletions(-) diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_decode.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_decode.explain index 3b8e1eea576..165be9b9e12 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_decode.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_decode.explain @@ -1,2 +1,2 @@ -Project [decode(cast(g#0 as binary), UTF-8) AS decode(g, UTF-8)#0] +Project [decode(cast(g#0 as binary), UTF-8, false) AS decode(g, UTF-8)#0] +- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index cb4f59323c3..9f9c15521c6 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -29,7 +29,7 @@ license: | - Since Spark 4.0, `spark.sql.hive.metastore` drops the support of Hive prior to 2.0.0 as they require JDK 8 that Spark does not support anymore. Users should migrate to higher versions. - Since Spark 4.0, `spark.sql.parquet.compression.codec` drops the support of codec name `lz4raw`, please use `lz4_raw` instead. - Since Spark 4.0, when overflowing during casting timestamp to byte/short/int under non-ansi mode, Spark will return null instead a wrapping value. 
-- Since Spark 4.0, the `encode()` function supports only the following charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'. To restore the previous behavior when the function accepts charsets of the current JDK used by Spark, set `spark.sql.legacy.javaCharsets` to `true`. +- Since Spark 4.0, the `encode()` and `decode()` functions support only the following charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'. To restore the previous behavior when the function accepts charsets of the current JDK used by Spark, set `spark.sql.legacy.javaCharsets` to `true`. ## Upgrading from Spark SQL 3.4 to 3.5 diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 84a5eebd70e..7c5d65d2b95 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -2638,18 +2638,26 @@ case class Decode(params: Seq[Expression], replacement: Expression) since = "1.5.0", group = "string_funcs") // scalastyle:on line.size.limit -case class StringDecode(bin: Expression, charset: Expression) +case class StringDecode(bin: Expression, charset: Expression, legacyCharsets: Boolean) extends BinaryExpression with ImplicitCastInputTypes with NullIntolerant { + def this(bin: Expression, charset: Expression) = + this(bin, charset, SQLConf.get.legacyJavaCharsets) + override def left: Expression = bin override def right: Expression = charset override def dataType: DataType = StringType override def inputTypes: Seq[DataType] = Seq(BinaryType, StringType) + private val supportedCharsets = Set( + "US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", "UTF-16LE", "UTF-16") + protected override def nullSafeEval(input1: Any, input2: Any): Any = { val fromCharset = 
input2.asInstanceOf[UTF8String].toString try { - UTF8String.fromString(new String(input1.asInstanceOf[Array[Byte]], fromCharset)) + if (legacyCharsets || supportedCharsets.contains(fromCharset.toUpperCase(Locale.ROOT))) { + UTF8String.fromString(new String(input1.asInstanceOf[Array[Byte]], fromCharset)) + } else throw new UnsupportedEncodingException } catch { case _: UnsupportedEncodingException => throw QueryExecutionErrors.invalidCharsetError(prettyName, fromCharset) @@ -2659,10 +2667,17 @@ case class StringDecode(bin: Expression, charset: Expression) override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { nullSafeCodeGen(ctx, ev, (bytes, charset) => { val fromCharset = ctx.freshName("fromCharset") + val sc = JavaCode.global( + ctx.addReferenceObj("supportedCharsets", supportedCharsets), + supportedCharsets.getClass) s""" String $fromCharset = $charset.toString(); try { - ${ev.value} = UTF8String.fromString(new String($bytes, $fromCharset)); + if ($legacyCharsets || $sc.contains($fromCharset.toUpperCase(java.util.Locale.ROOT))) { + ${ev.value} = UTF8String.fromString(new String($bytes, $fromCharset)); + } else { + throw new java.io.UnsupportedEncodingException(); + } } catch (java.io.UnsupportedEncodingException e) { throw QueryExecutionErrors.invalidCharsetError("$prettyName", $fromCharset); } @@ -2677,6 +2692,10 @@ case class StringDecode(bin: Expression, charset: Expression) override def prettyName: String = "decode" } +object StringDecode { + def apply(bin: Expression, charset: Expression): StringDecode = new StringDecode(bin, charset) +} + /** * Encode the given string to a binary using the provided charset. 
*/ diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out index 7ace300001d..7ffd3cbd8ba 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out @@ -814,6 +814,48 @@ Project [decode(scol#x, ecol#x) AS decode(scol, ecol)#x] +- LocalRelation [scol#x, ecol#x] +-- !query +set spark.sql.legacy.javaCharsets=true +-- !query analysis +SetCommand (spark.sql.legacy.javaCharsets,Some(true)) + + +-- !query +select decode(X'68656c6c6f', 'WINDOWS-1252') +-- !query analysis +Project [decode(0x68656C6C6F, WINDOWS-1252) AS decode(X'68656C6C6F', WINDOWS-1252)#x] ++- OneRowRelation + + +-- !query +select decode(scol, ecol) from values(X'68656c6c6f', 'WINDOWS-1252') as t(scol, ecol) +-- !query analysis +Project [decode(scol#x, ecol#x) AS decode(scol, ecol)#x] ++- SubqueryAlias t + +- LocalRelation [scol#x, ecol#x] + + +-- !query +set spark.sql.legacy.javaCharsets=false +-- !query analysis +SetCommand (spark.sql.legacy.javaCharsets,Some(false)) + + +-- !query +select decode(X'68656c6c6f', 'WINDOWS-1252') +-- !query analysis +Project [decode(0x68656C6C6F, WINDOWS-1252) AS decode(X'68656C6C6F', WINDOWS-1252)#x] ++- OneRowRelation + + +-- !query +select decode(scol, ecol) from values(X'68656c6c6f', 'WINDOWS-1252') as t(scol, ecol) +-- !query analysis +Project [decode(scol#x, ecol#x) AS decode(scol, ecol)#x] ++- SubqueryAlias t + +- LocalRelation [scol#x, ecol#x] + + -- !query SELECT CONTAINS(null, 'Spark') -- !query analysis diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out index 7ace300001d..7ffd3cbd8ba 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out +++ 
b/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out @@ -814,6 +814,48 @@ Project [decode(scol#x, ecol#x) AS decode(scol, ecol)#x] +- LocalRelation [scol#x, ecol#x] +-- !query +set spark.sql.legacy.javaCharsets=true +-- !query analysis +SetCommand (spark.sql.legacy.javaCharsets,Some(true)) + + +-- !query +select decode(X'68656c6c6f', 'WINDOWS-1252') +-- !query analysis +Project [decode(0x68656C6C6F, WINDOWS-1252) AS decode(X'68656C6C6F', WINDOWS-1252)#x] ++- OneRowRelation + + +-- !query +select decode(scol, ecol) from values(X'68656c6c6f', 'WINDOWS-1252') as t(scol, ecol) +-- !query analysis +Project [decode(scol#x, ecol#x) AS decode(scol, ecol)#x] ++- SubqueryAlias t + +- LocalRelation [scol#x, ecol#x] + + +-- !query +set spark.sql.legacy.javaCharsets=false +-- !query analysis +SetCommand (spark.sql.legacy.javaCharsets,Some(false)) + + +-- !query +select decode(X'68656c6c6f', 'WINDOWS-1252') +-- !query analysis +Project [decode(0x68656C6C6F, WINDOWS-1252) AS decode(X'68656C6C6F', WINDOWS-1252)#x] ++- OneRowRelation + + +-- !query +select decode(scol, ecol) from values(X'68656c6c6f', 'WINDOWS-1252') as t(scol, ecol) +-- !query analysis +Project [decode(scol#x, ecol#x) AS decode(scol, ecol)#x] ++- SubqueryAlias t + +- LocalRelation [scol#x, ecol#x] + + -- !query SELECT CONTAINS(null, 'Spark') -- !query analysis diff --git a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql index a90557eba7c..64ea6e655d0 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql @@ -140,6 +140,12 @@ select decode(null, 6, 'Spark', NULL, 'SQL', 4, 'rocks'); select decode(null, 6, 'Spark', NULL, 'SQL', 4, 'rocks', NULL, '.'); select decode(X'68656c6c6f', 'Windows-xxx'); select decode(scol, ecol) from values(X'68656c6c6f', 'Windows-xxx') as t(scol, ecol); +set 
spark.sql.legacy.javaCharsets=true; +select decode(X'68656c6c6f', 'WINDOWS-1252'); +select decode(scol, ecol) from values(X'68656c6c6f', 'WINDOWS-1252') as t(scol, ecol); +set spark.sql.legacy.javaCharsets=false; +select decode(X'68656c6c6f', 'WINDOWS-1252'); +select decode(scol, ecol) from values(X'68656c6c6f', 'WINDOWS-1252') as t(scol, ecol); -- contains SELECT CONTAINS(null, 'Spark'); diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out index 813200b8bb0..8096cef266e 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out @@ -1051,6 +1051,72 @@ org.apache.spark.SparkIllegalArgumentException } +-- !query +set spark.sql.legacy.javaCharsets=true +-- !query schema +struct<key:string,value:string> +-- !query output +spark.sql.legacy.javaCharsets true + + +-- !query +select decode(X'68656c6c6f', 'WINDOWS-1252') +-- !query schema +struct<decode(X'68656C6C6F', WINDOWS-1252):string> +-- !query output +hello + + +-- !query +select decode(scol, ecol) from values(X'68656c6c6f', 'WINDOWS-1252') as t(scol, ecol) +-- !query schema +struct<decode(scol, ecol):string> +-- !query output +hello + + +-- !query +set spark.sql.legacy.javaCharsets=false +-- !query schema +struct<key:string,value:string> +-- !query output +spark.sql.legacy.javaCharsets false + + +-- !query +select decode(X'68656c6c6f', 'WINDOWS-1252') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkIllegalArgumentException +{ + "errorClass" : "INVALID_PARAMETER_VALUE.CHARSET", + "sqlState" : "22023", + "messageParameters" : { + "charset" : "WINDOWS-1252", + "functionName" : "`decode`", + "parameter" : "`charset`" + } +} + + +-- !query +select decode(scol, ecol) from values(X'68656c6c6f', 'WINDOWS-1252') as t(scol, ecol) +-- !query schema +struct<> +-- !query output 
+org.apache.spark.SparkIllegalArgumentException +{ + "errorClass" : "INVALID_PARAMETER_VALUE.CHARSET", + "sqlState" : "22023", + "messageParameters" : { + "charset" : "WINDOWS-1252", + "functionName" : "`decode`", + "parameter" : "`charset`" + } +} + + -- !query SELECT CONTAINS(null, 'Spark') -- !query schema diff --git a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out index 56782b9a753..91ad830dd3d 100644 --- a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out @@ -983,6 +983,72 @@ org.apache.spark.SparkIllegalArgumentException } +-- !query +set spark.sql.legacy.javaCharsets=true +-- !query schema +struct<key:string,value:string> +-- !query output +spark.sql.legacy.javaCharsets true + + +-- !query +select decode(X'68656c6c6f', 'WINDOWS-1252') +-- !query schema +struct<decode(X'68656C6C6F', WINDOWS-1252):string> +-- !query output +hello + + +-- !query +select decode(scol, ecol) from values(X'68656c6c6f', 'WINDOWS-1252') as t(scol, ecol) +-- !query schema +struct<decode(scol, ecol):string> +-- !query output +hello + + +-- !query +set spark.sql.legacy.javaCharsets=false +-- !query schema +struct<key:string,value:string> +-- !query output +spark.sql.legacy.javaCharsets false + + +-- !query +select decode(X'68656c6c6f', 'WINDOWS-1252') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkIllegalArgumentException +{ + "errorClass" : "INVALID_PARAMETER_VALUE.CHARSET", + "sqlState" : "22023", + "messageParameters" : { + "charset" : "WINDOWS-1252", + "functionName" : "`decode`", + "parameter" : "`charset`" + } +} + + +-- !query +select decode(scol, ecol) from values(X'68656c6c6f', 'WINDOWS-1252') as t(scol, ecol) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkIllegalArgumentException +{ + "errorClass" : "INVALID_PARAMETER_VALUE.CHARSET", + 
"sqlState" : "22023", + "messageParameters" : { + "charset" : "WINDOWS-1252", + "functionName" : "`decode`", + "parameter" : "`charset`" + } +} + + -- !query SELECT CONTAINS(null, 'Spark') -- !query schema --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org