This is an automated email from the ASF dual-hosted git repository.
maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 4946d025b62 [SPARK-44983][SQL] Convert binary to string by `to_char` for the formats: `hex`, `base64`, `utf-8`
4946d025b62 is described below
commit 4946d025b6200ad90dfdfbb1f24526016f810523
Author: Max Gekk <[email protected]>
AuthorDate: Mon Aug 28 16:55:35 2023 +0300
[SPARK-44983][SQL] Convert binary to string by `to_char` for the formats: `hex`, `base64`, `utf-8`
### What changes were proposed in this pull request?
In the PR, I propose to re-use the `Hex`, `Base64` and `Decode` expressions
in `ToCharacter` (the `to_char`/`to_varchar` functions) when the `format`
parameter is one of `hex`, `base64` or `utf-8`.
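For example, the following queries now convert binary values directly (a sketch of the intended behavior; output values are taken from the built-in examples added in this PR):
```
SELECT to_char(x'537061726b2053514c', 'base64');  -- U3BhcmsgU1FM
SELECT to_char(x'537061726b2053514c', 'hex');     -- 537061726B2053514C
SELECT to_char(encode('abc', 'utf-8'), 'utf-8');  -- abc
```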
### Why are the changes needed?
To ease migration to Spark SQL from systems such as:
- Snowflake: https://docs.snowflake.com/en/sql-reference/functions/to_char
- SAP SQL Anywhere: https://help.sap.com/docs/SAP_SQL_Anywhere/93079d4ba8e44920ae63ffb4def91f5b/81fe51196ce21014b9c6cf43b2666698.html
- Oracle: https://docs.oracle.com/en/database/oracle/oracle-database/19/sqlrf/TO_CHAR-number.html#GUID-00DA076D-2468-41AB-A3AC-CC78DBA0D9CB
- Vertica: https://www.vertica.com/docs/9.3.x/HTML/Content/Authoring/SQLReferenceManual/Functions/Formatting/TO_CHAR.htm
### Does this PR introduce _any_ user-facing change?
No. This PR extends an existing API. It might be considered a user-facing
change only if user code depends on the errors raised for wrong formats.
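For instance, an unsupported format for binary input now fails with the new error condition (a sketch; the exact message rendering may differ):
```
SELECT to_char(x'61', 'invalid_format');
-- AnalysisException: [INVALID_PARAMETER_VALUE.BINARY_FORMAT]
-- ... expects one of binary formats 'base64', 'hex', 'utf-8', but got 'invalid_format'.
```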
### How was this patch tested?
By running the new examples:
```
$ build/sbt "sql/test:testOnly
org.apache.spark.sql.expressions.ExpressionInfoSuite"
```
and the new tests:
```
$ build/sbt "test:testOnly *.StringFunctionsSuite"
```
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #42632 from MaxGekk/to_char-binary-2.
Authored-by: Max Gekk <[email protected]>
Signed-off-by: Max Gekk <[email protected]>
---
.../src/main/resources/error/error-classes.json | 5 ++
...nditions-invalid-parameter-value-error-class.md | 4 ++
.../expressions/numberFormatExpressions.scala | 28 +++++++--
.../spark/sql/errors/QueryCompilationErrors.scala | 9 +++
.../apache/spark/sql/StringFunctionsSuite.scala | 69 +++++++++++++++-------
5 files changed, 89 insertions(+), 26 deletions(-)
diff --git a/common/utils/src/main/resources/error/error-classes.json b/common/utils/src/main/resources/error/error-classes.json
index 632c449b992..53c596c00fc 100644
--- a/common/utils/src/main/resources/error/error-classes.json
+++ b/common/utils/src/main/resources/error/error-classes.json
@@ -1788,6 +1788,11 @@
"expects a binary value with 16, 24 or 32 bytes, but got
<actualLength> bytes."
]
},
+ "BINARY_FORMAT" : {
+ "message" : [
+ "expects one of binary formats 'base64', 'hex', 'utf-8', but got
<invalidFormat>."
+ ]
+ },
"DATETIME_UNIT" : {
"message" : [
"expects one of the units without quotes YEAR, QUARTER, MONTH, WEEK,
DAY, DAYOFYEAR, HOUR, MINUTE, SECOND, MILLISECOND, MICROSECOND, but got the
string literal <invalidValue>."
diff --git a/docs/sql-error-conditions-invalid-parameter-value-error-class.md b/docs/sql-error-conditions-invalid-parameter-value-error-class.md
index 370e6da3362..96829e564aa 100644
--- a/docs/sql-error-conditions-invalid-parameter-value-error-class.md
+++ b/docs/sql-error-conditions-invalid-parameter-value-error-class.md
@@ -37,6 +37,10 @@ supports 16-byte CBC IVs and 12-byte GCM IVs, but got `<actualLength>` bytes for
 expects a binary value with 16, 24 or 32 bytes, but got `<actualLength>` bytes.
 
+## BINARY_FORMAT
+
+expects one of binary formats 'base64', 'hex', 'utf-8', but got `<invalidFormat>`.
+
 ## DATETIME_UNIT
 
 expects one of the units without quotes YEAR, QUARTER, MONTH, WEEK, DAY, DAYOFYEAR, HOUR, MINUTE, SECOND, MILLISECOND, MICROSECOND, but got the string literal `<invalidValue>`.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/numberFormatExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/numberFormatExpressions.scala
index 3a424ac21c5..7875ed8fe20 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/numberFormatExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/numberFormatExpressions.scala
@@ -26,7 +26,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGe
 import org.apache.spark.sql.catalyst.expressions.codegen.Block.BlockHelper
 import org.apache.spark.sql.catalyst.util.ToNumberParser
 import org.apache.spark.sql.errors.QueryCompilationErrors
-import org.apache.spark.sql.types.{AbstractDataType, DataType, DatetimeType, Decimal, DecimalType, StringType}
+import org.apache.spark.sql.types.{AbstractDataType, BinaryType, DataType, DatetimeType, Decimal, DecimalType, StringType}
 import org.apache.spark.unsafe.types.UTF8String
 
 abstract class ToNumberBase(left: Expression, right: Expression, errorOnFail: Boolean)
@@ -209,6 +209,10 @@ case class TryToNumber(left: Expression, right: Expression)
       wrapped by angle brackets if the input value is negative.
         ('<1>').
       If `expr` is a datetime, `format` shall be a valid datetime pattern, see <a href="https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html">Datetime Patterns</a>.
+      If `expr` is a binary, it is converted to a string in one of the formats:
+        'base64': a base 64 string.
+        'hex': a string in the hexadecimal format.
+        'utf-8': the input binary is decoded to UTF-8 string.
   """,
   examples = """
     Examples:
@@ -224,6 +228,12 @@ case class TryToNumber(left: Expression, right: Expression)
        12,454.8-
       > SELECT _FUNC_(date'2016-04-08', 'y');
        2016
+      > SELECT _FUNC_(x'537061726b2053514c', 'base64');
+       U3BhcmsgU1FM
+      > SELECT _FUNC_(x'537061726b2053514c', 'hex');
+       537061726B2053514C
+      > SELECT _FUNC_(encode('abc', 'utf-8'), 'utf-8');
+       abc
   """,
   since = "3.4.0",
   group = "string_funcs")
@@ -232,10 +242,20 @@ object ToCharacterBuilder extends ExpressionBuilder {
   override def build(funcName: String, expressions: Seq[Expression]): Expression = {
     val numArgs = expressions.length
     if (expressions.length == 2) {
-      val inputExpr = expressions.head
+      val (inputExpr, format) = (expressions(0), expressions(1))
       inputExpr.dataType match {
-        case _: DatetimeType => DateFormatClass(inputExpr, expressions(1))
-        case _ => ToCharacter(inputExpr, expressions(1))
+        case _: DatetimeType => DateFormatClass(inputExpr, format)
+        case _: BinaryType =>
+          if (!(format.dataType == StringType && format.foldable)) {
+            throw QueryCompilationErrors.requireLiteralParameter(funcName, "format", "string")
+          }
+          format.eval().asInstanceOf[UTF8String].toString.toLowerCase(Locale.ROOT).trim match {
+            case "base64" => Base64(inputExpr)
+            case "hex" => Hex(inputExpr)
+            case "utf-8" => new Decode(Seq(inputExpr, format))
+            case invalid => throw QueryCompilationErrors.binaryFormatError(funcName, invalid)
+          }
+        case _ => ToCharacter(inputExpr, format)
       }
     } else {
       throw QueryCompilationErrors.wrongNumArgsError(funcName, Seq(2), numArgs)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala
index 3d2c59cebd8..e579e5cf565 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala
@@ -157,6 +157,15 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase with Compilat
         "functionName" -> toSQLId("format_string")))
   }
 
+  def binaryFormatError(funcName: String, invalidFormat: String): Throwable = {
+    new AnalysisException(
+      errorClass = "INVALID_PARAMETER_VALUE.BINARY_FORMAT",
+      messageParameters = Map(
+        "parameter" -> toSQLId("format"),
+        "functionName" -> toSQLId(funcName),
+        "invalidFormat" -> toSQLValue(invalidFormat, StringType)))
+  }
+
   def unorderablePivotColError(pivotCol: Expression): Throwable = {
     new AnalysisException(
       errorClass = "INCOMPARABLE_PIVOT_COLUMN",
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
index 88c9e15570e..12881f4a22a 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -854,28 +854,53 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession {
     )
   }
 
-  test("to_char") {
-    val df = Seq(78.12).toDF("a")
-    checkAnswer(
-      df.selectExpr("to_char(a, '$99.99')"),
-      Seq(Row("$78.12"))
-    )
-    checkAnswer(
-      df.select(to_char(col("a"), lit("$99.99"))),
-      Seq(Row("$78.12"))
-    )
-  }
-
-  test("to_varchar") {
-    val df = Seq(78.12).toDF("a")
-    checkAnswer(
-      df.selectExpr("to_varchar(a, '$99.99')"),
-      Seq(Row("$78.12"))
-    )
-    checkAnswer(
-      df.select(to_varchar(col("a"), lit("$99.99"))),
-      Seq(Row("$78.12"))
-    )
+  test("to_char/to_varchar") {
+    Seq(
+      "to_char" -> ((e: Column, fmt: Column) => to_char(e, fmt)),
+      "to_varchar" -> ((e: Column, fmt: Column) => to_varchar(e, fmt))
+    ).foreach { case (funcName, func) =>
+      val df = Seq(78.12).toDF("a")
+      checkAnswer(df.selectExpr(s"$funcName(a, '$$99.99')"), Seq(Row("$78.12")))
+      checkAnswer(df.select(func(col("a"), lit("$99.99"))), Seq(Row("$78.12")))
+
+      val df2 = Seq((Array(100.toByte), "base64")).toDF("input", "format")
+      checkAnswer(df2.selectExpr(s"$funcName(input, 'hex')"), Seq(Row("64")))
+      checkAnswer(df2.select(func(col("input"), lit("hex"))), Seq(Row("64")))
+      checkAnswer(df2.selectExpr(s"$funcName(input, 'base64')"), Seq(Row("ZA==")))
+      checkAnswer(df2.select(func(col("input"), lit("base64"))), Seq(Row("ZA==")))
+      checkAnswer(df2.selectExpr(s"$funcName(input, 'utf-8')"), Seq(Row("d")))
+      checkAnswer(df2.select(func(col("input"), lit("utf-8"))), Seq(Row("d")))
+
+      checkError(
+        exception = intercept[AnalysisException] {
+          df2.select(func(col("input"), col("format"))).collect()
+        },
+        errorClass = "_LEGACY_ERROR_TEMP_1100",
+        parameters = Map(
+          "argName" -> "format",
+          "funcName" -> "to_char",
+          "requiredType" -> "string"))
+      checkError(
+        exception = intercept[AnalysisException] {
+          df2.select(func(col("input"), lit("invalid_format"))).collect()
+        },
+        errorClass = "INVALID_PARAMETER_VALUE.BINARY_FORMAT",
+        parameters = Map(
+          "parameter" -> "`format`",
+          "functionName" -> "`to_char`",
+          "invalidFormat" -> "'invalid_format'"))
+      checkError(
+        exception = intercept[AnalysisException] {
+          sql(s"select $funcName('a', 'b', 'c')")
+        },
+        errorClass = "WRONG_NUM_ARGS.WITHOUT_SUGGESTION",
+        parameters = Map(
+          "functionName" -> s"`$funcName`",
+          "expectedNum" -> "2",
+          "actualNum" -> "3",
+          "docroot" -> SPARK_DOC_ROOT),
+        context = ExpectedContext("", "", 7, 21 + funcName.length, s"$funcName('a', 'b', 'c')"))
+    }
   }
 
   test("to_number") {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]