This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch branch-3.5
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.5 by this push:
new e049a6c7f63e [SPARK-47646][SQL] Make try_to_number return NULL for
malformed input
e049a6c7f63e is described below
commit e049a6c7f63e2c03525a01c4c2e3ce54cc9fb617
Author: Hyukjin Kwon <[email protected]>
AuthorDate: Fri Mar 29 17:38:10 2024 +0900
[SPARK-47646][SQL] Make try_to_number return NULL for malformed input
### What changes were proposed in this pull request?
This PR proposes to add a NULL check after parsing the number, so that the
`try_to_number` expression can safely return NULL.
```scala
import org.apache.spark.sql.functions._
val df = spark.createDataset(spark.sparkContext.parallelize(Seq("11")))
df.select(try_to_number($"value", lit("$99.99"))).show()
```
```
java.lang.NullPointerException: Cannot invoke "org.apache.spark.sql.types.Decimal.toPlainString()" because "<local7>" is null
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.serializefromobject_doConsume_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:50)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:894)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:894)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:368)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:332)
```
### Why are the changes needed?
To fix the bug, and let `try_to_number` return `NULL` for malformed input
as designed.
### Does this PR introduce _any_ user-facing change?
Yes, it fixes a bug. Previously, `try_to_number` failed with an NPE on malformed input instead of returning `NULL`.
### How was this patch tested?
A unit test was added.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #45771 from HyukjinKwon/SPARK-47646.
Authored-by: Hyukjin Kwon <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
(cherry picked from commit d709e20066becf15adf5aa35e1bdd8eecf500b4b)
Signed-off-by: Hyukjin Kwon <[email protected]>
---
.../spark/sql/catalyst/expressions/numberFormatExpressions.scala | 1 +
.../src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala | 5 +++++
2 files changed, 6 insertions(+)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/numberFormatExpressions.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/numberFormatExpressions.scala
index 2d4f0438db76..9dcca65efe5a 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/numberFormatExpressions.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/numberFormatExpressions.scala
@@ -86,6 +86,7 @@ abstract class ToNumberBase(left: Expression, right:
Expression, errorOnFail: Bo
|${CodeGenerator.javaType(dataType)} ${ev.value} =
${CodeGenerator.defaultValue(dataType)};
|if (!${ev.isNull}) {
| ${ev.value} = $builder.parse(${eval.value});
+ | ${ev.isNull} = ${ev.isNull} || (${ev.value} == null);
|}
""".stripMargin)
}
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
index 88c9e15570e3..4709c2b5e192 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -1173,6 +1173,11 @@ class StringFunctionsSuite extends QueryTest with
SharedSparkSession {
checkAnswer(df.select(try_to_number(col("a"), lit("$99.99"))),
Seq(Row(78.12)))
}
+ test("SPARK-47646: try_to_number should return NULL for malformed input") {
+ val df = spark.createDataset(spark.sparkContext.parallelize(Seq("11")))
+ checkAnswer(df.select(try_to_number($"value", lit("$99.99"))),
Seq(Row(null)))
+ }
+
test("SPARK-44905: stateful lastRegex causes NullPointerException on eval
for regexp_replace") {
val df = sql("select regexp_replace('', '[a\\\\d]{0, 2}', 'x')")
intercept[SparkRuntimeException](df.queryExecution.optimizedPlan)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]