This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 4946d025b62 [SPARK-44983][SQL] Convert binary to string by `to_char` for the formats: `hex`, `base64`, `utf-8` 4946d025b62 is described below commit 4946d025b6200ad90dfdfbb1f24526016f810523 Author: Max Gekk <max.g...@gmail.com> AuthorDate: Mon Aug 28 16:55:35 2023 +0300 [SPARK-44983][SQL] Convert binary to string by `to_char` for the formats: `hex`, `base64`, `utf-8` ### What changes were proposed in this pull request? In the PR, I propose to re-use the `Hex`, `Base64` and `Decode` expressions in the `ToCharacter` (the `to_char`/`to_varchar` functions) when the `format` parameter is one of `hex`, `base64` and `utf-8`. ### Why are the changes needed? To make the migration to Spark SQL easier from systems like: - Snowflake: https://docs.snowflake.com/en/sql-reference/functions/to_char - SAP SQL Anywhere: https://help.sap.com/docs/SAP_SQL_Anywhere/93079d4ba8e44920ae63ffb4def91f5b/81fe51196ce21014b9c6cf43b2666698.html - Oracle: https://docs.oracle.com/en/database/oracle/oracle-database/19/sqlrf/TO_CHAR-number.html#GUID-00DA076D-2468-41AB-A3AC-CC78DBA0D9CB - Vertica: https://www.vertica.com/docs/9.3.x/HTML/Content/Authoring/SQLReferenceManual/Functions/Formatting/TO_CHAR.htm ### Does this PR introduce _any_ user-facing change? No. This PR extends existing API. It might be considered a user-facing change only if user's code depends on errors in the case of wrong formats. ### How was this patch tested? By running new examples: ``` $ build/sbt "sql/test:testOnly org.apache.spark.sql.expressions.ExpressionInfoSuite" ``` and new tests: ``` $ build/sbt "test:testOnly *.StringFunctionsSuite" ``` ### Was this patch authored or co-authored using generative AI tooling? No. Closes #42632 from MaxGekk/to_char-binary-2. 
Authored-by: Max Gekk <max.g...@gmail.com> Signed-off-by: Max Gekk <max.g...@gmail.com> --- .../src/main/resources/error/error-classes.json | 5 ++ ...nditions-invalid-parameter-value-error-class.md | 4 ++ .../expressions/numberFormatExpressions.scala | 28 +++++++-- .../spark/sql/errors/QueryCompilationErrors.scala | 9 +++ .../apache/spark/sql/StringFunctionsSuite.scala | 69 +++++++++++++++------- 5 files changed, 89 insertions(+), 26 deletions(-) diff --git a/common/utils/src/main/resources/error/error-classes.json b/common/utils/src/main/resources/error/error-classes.json index 632c449b992..53c596c00fc 100644 --- a/common/utils/src/main/resources/error/error-classes.json +++ b/common/utils/src/main/resources/error/error-classes.json @@ -1788,6 +1788,11 @@ "expects a binary value with 16, 24 or 32 bytes, but got <actualLength> bytes." ] }, + "BINARY_FORMAT" : { + "message" : [ + "expects one of binary formats 'base64', 'hex', 'utf-8', but got <invalidFormat>." + ] + }, "DATETIME_UNIT" : { "message" : [ "expects one of the units without quotes YEAR, QUARTER, MONTH, WEEK, DAY, DAYOFYEAR, HOUR, MINUTE, SECOND, MILLISECOND, MICROSECOND, but got the string literal <invalidValue>." diff --git a/docs/sql-error-conditions-invalid-parameter-value-error-class.md b/docs/sql-error-conditions-invalid-parameter-value-error-class.md index 370e6da3362..96829e564aa 100644 --- a/docs/sql-error-conditions-invalid-parameter-value-error-class.md +++ b/docs/sql-error-conditions-invalid-parameter-value-error-class.md @@ -37,6 +37,10 @@ supports 16-byte CBC IVs and 12-byte GCM IVs, but got `<actualLength>` bytes for expects a binary value with 16, 24 or 32 bytes, but got `<actualLength>` bytes. +## BINARY_FORMAT + +expects one of binary formats 'base64', 'hex', 'utf-8', but got `<invalidFormat>`. + ## DATETIME_UNIT expects one of the units without quotes YEAR, QUARTER, MONTH, WEEK, DAY, DAYOFYEAR, HOUR, MINUTE, SECOND, MILLISECOND, MICROSECOND, but got the string literal `<invalidValue>`. 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/numberFormatExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/numberFormatExpressions.scala index 3a424ac21c5..7875ed8fe20 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/numberFormatExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/numberFormatExpressions.scala @@ -26,7 +26,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGe import org.apache.spark.sql.catalyst.expressions.codegen.Block.BlockHelper import org.apache.spark.sql.catalyst.util.ToNumberParser import org.apache.spark.sql.errors.QueryCompilationErrors -import org.apache.spark.sql.types.{AbstractDataType, DataType, DatetimeType, Decimal, DecimalType, StringType} +import org.apache.spark.sql.types.{AbstractDataType, BinaryType, DataType, DatetimeType, Decimal, DecimalType, StringType} import org.apache.spark.unsafe.types.UTF8String abstract class ToNumberBase(left: Expression, right: Expression, errorOnFail: Boolean) @@ -209,6 +209,10 @@ case class TryToNumber(left: Expression, right: Expression) wrapped by angle brackets if the input value is negative. ('<1>'). If `expr` is a datetime, `format` shall be a valid datetime pattern, see <a href="https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html">Datetime Patterns</a>. + If `expr` is a binary, it is converted to a string in one of the formats: + 'base64': a base 64 string. + 'hex': a string in the hexadecimal format. + 'utf-8': the input binary is decoded to UTF-8 string. 
""", examples = """ Examples: @@ -224,6 +228,12 @@ case class TryToNumber(left: Expression, right: Expression) 12,454.8- > SELECT _FUNC_(date'2016-04-08', 'y'); 2016 + > SELECT _FUNC_(x'537061726b2053514c', 'base64'); + U3BhcmsgU1FM + > SELECT _FUNC_(x'537061726b2053514c', 'hex'); + 537061726B2053514C + > SELECT _FUNC_(encode('abc', 'utf-8'), 'utf-8'); + abc """, since = "3.4.0", group = "string_funcs") @@ -232,10 +242,20 @@ object ToCharacterBuilder extends ExpressionBuilder { override def build(funcName: String, expressions: Seq[Expression]): Expression = { val numArgs = expressions.length if (expressions.length == 2) { - val inputExpr = expressions.head + val (inputExpr, format) = (expressions(0), expressions(1)) inputExpr.dataType match { - case _: DatetimeType => DateFormatClass(inputExpr, expressions(1)) - case _ => ToCharacter(inputExpr, expressions(1)) + case _: DatetimeType => DateFormatClass(inputExpr, format) + case _: BinaryType => + if (!(format.dataType == StringType && format.foldable)) { + throw QueryCompilationErrors.requireLiteralParameter(funcName, "format", "string") + } + format.eval().asInstanceOf[UTF8String].toString.toLowerCase(Locale.ROOT).trim match { + case "base64" => Base64(inputExpr) + case "hex" => Hex(inputExpr) + case "utf-8" => new Decode(Seq(inputExpr, format)) + case invalid => throw QueryCompilationErrors.binaryFormatError(funcName, invalid) + } + case _ => ToCharacter(inputExpr, format) } } else { throw QueryCompilationErrors.wrongNumArgsError(funcName, Seq(2), numArgs) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala index 3d2c59cebd8..e579e5cf565 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala @@ -157,6 +157,15 @@ private[sql] object 
QueryCompilationErrors extends QueryErrorsBase with Compilat "functionName" -> toSQLId("format_string"))) } + def binaryFormatError(funcName: String, invalidFormat: String): Throwable = { + new AnalysisException( + errorClass = "INVALID_PARAMETER_VALUE.BINARY_FORMAT", + messageParameters = Map( + "parameter" -> toSQLId("format"), + "functionName" -> toSQLId(funcName), + "invalidFormat" -> toSQLValue(invalidFormat, StringType))) + } + def unorderablePivotColError(pivotCol: Expression): Throwable = { new AnalysisException( errorClass = "INCOMPARABLE_PIVOT_COLUMN", diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala index 88c9e15570e..12881f4a22a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala @@ -854,28 +854,53 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession { ) } - test("to_char") { - val df = Seq(78.12).toDF("a") - checkAnswer( - df.selectExpr("to_char(a, '$99.99')"), - Seq(Row("$78.12")) - ) - checkAnswer( - df.select(to_char(col("a"), lit("$99.99"))), - Seq(Row("$78.12")) - ) - } - - test("to_varchar") { - val df = Seq(78.12).toDF("a") - checkAnswer( - df.selectExpr("to_varchar(a, '$99.99')"), - Seq(Row("$78.12")) - ) - checkAnswer( - df.select(to_varchar(col("a"), lit("$99.99"))), - Seq(Row("$78.12")) - ) + test("to_char/to_varchar") { + Seq( + "to_char" -> ((e: Column, fmt: Column) => to_char(e, fmt)), + "to_varchar" -> ((e: Column, fmt: Column) => to_varchar(e, fmt)) + ).foreach { case (funcName, func) => + val df = Seq(78.12).toDF("a") + checkAnswer(df.selectExpr(s"$funcName(a, '$$99.99')"), Seq(Row("$78.12"))) + checkAnswer(df.select(func(col("a"), lit("$99.99"))), Seq(Row("$78.12"))) + + val df2 = Seq((Array(100.toByte), "base64")).toDF("input", "format") + checkAnswer(df2.selectExpr(s"$funcName(input, 'hex')"), 
Seq(Row("64"))) + checkAnswer(df2.select(func(col("input"), lit("hex"))), Seq(Row("64"))) + checkAnswer(df2.selectExpr(s"$funcName(input, 'base64')"), Seq(Row("ZA=="))) + checkAnswer(df2.select(func(col("input"), lit("base64"))), Seq(Row("ZA=="))) + checkAnswer(df2.selectExpr(s"$funcName(input, 'utf-8')"), Seq(Row("d"))) + checkAnswer(df2.select(func(col("input"), lit("utf-8"))), Seq(Row("d"))) + + checkError( + exception = intercept[AnalysisException] { + df2.select(func(col("input"), col("format"))).collect() + }, + errorClass = "_LEGACY_ERROR_TEMP_1100", + parameters = Map( + "argName" -> "format", + "funcName" -> "to_char", + "requiredType" -> "string")) + checkError( + exception = intercept[AnalysisException] { + df2.select(func(col("input"), lit("invalid_format"))).collect() + }, + errorClass = "INVALID_PARAMETER_VALUE.BINARY_FORMAT", + parameters = Map( + "parameter" -> "`format`", + "functionName" -> "`to_char`", + "invalidFormat" -> "'invalid_format'")) + checkError( + exception = intercept[AnalysisException] { + sql(s"select $funcName('a', 'b', 'c')") + }, + errorClass = "WRONG_NUM_ARGS.WITHOUT_SUGGESTION", + parameters = Map( + "functionName" -> s"`$funcName`", + "expectedNum" -> "2", + "actualNum" -> "3", + "docroot" -> SPARK_DOC_ROOT), + context = ExpectedContext("", "", 7, 21 + funcName.length, s"$funcName('a', 'b', 'c')")) + } } test("to_number") { --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org