This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 24b29adcf53 [SPARK-45079][SQL] Fix an internal error from 
`percentile_approx()` on `NULL` accuracy
24b29adcf53 is described below

commit 24b29adcf53616067a9fa2ca201e3f4d2f54436b
Author: Max Gekk <max.g...@gmail.com>
AuthorDate: Wed Sep 6 10:32:37 2023 +0300

    [SPARK-45079][SQL] Fix an internal error from `percentile_approx()` on 
`NULL` accuracy
    
    ### What changes were proposed in this pull request?
    In the PR, I propose to check that the `accuracy` argument is not NULL in 
`ApproximatePercentile`. And if it is, throw an `AnalysisException` with the new 
error class `DATATYPE_MISMATCH.UNEXPECTED_NULL`.
    
    ### Why are the changes needed?
    To fix the issue demonstrated by the example:
    ```sql
    $ spark-sql (default)> SELECT percentile_approx(col, array(0.5, 0.4, 0.1), 
NULL) FROM VALUES (0), (1), (2), (10) AS tab(col);
    [INTERNAL_ERROR] The Spark SQL phase analysis failed with an internal 
error. You hit a bug in Spark or the Spark plugins you use. Please, report this 
bug to the corresponding communities or vendors, and provide the full stack 
trace.
    ```
    
    ### Does this PR introduce _any_ user-facing change?
    No.
    
    ### How was this patch tested?
    By running new test:
    ```
    $ build/sbt "test:testOnly *.ApproximatePercentileQuerySuite"
    ```
    
    ### Was this patch authored or co-authored using generative AI tooling?
    No.
    
    Closes #42817 from MaxGekk/fix-internal-error-in-percentile_approx.
    
    Authored-by: Max Gekk <max.g...@gmail.com>
    Signed-off-by: Max Gekk <max.g...@gmail.com>
---
 .../aggregate/ApproximatePercentile.scala          |  7 ++++-
 .../sql/ApproximatePercentileQuerySuite.scala      | 31 ++++++++++++++++++++++
 2 files changed, 37 insertions(+), 1 deletion(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala
index 3c3afc1c7e7..5b44c3fa31b 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala
@@ -97,7 +97,8 @@ case class ApproximatePercentile(
   }
 
   // Mark as lazy so that accuracyExpression is not evaluated during tree 
transformation.
-  private lazy val accuracy: Long = 
accuracyExpression.eval().asInstanceOf[Number].longValue
+  private lazy val accuracyNum = accuracyExpression.eval().asInstanceOf[Number]
+  private lazy val accuracy: Long = accuracyNum.longValue
 
   override def inputTypes: Seq[AbstractDataType] = {
     // Support NumericType, DateType, TimestampType and TimestampNTZType since 
their internal types
@@ -138,6 +139,10 @@ case class ApproximatePercentile(
           "inputExpr" -> toSQLExpr(accuracyExpression)
         )
       )
+    } else if (accuracyNum == null) {
+      DataTypeMismatch(
+        errorSubClass = "UNEXPECTED_NULL",
+        messageParameters = Map("exprName" -> "accuracy"))
     } else if (accuracy <= 0 || accuracy > Int.MaxValue) {
       DataTypeMismatch(
         errorSubClass = "VALUE_OUT_OF_RANGE",
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala
index 18e8dd6249b..273e8e08fd7 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala
@@ -339,4 +339,35 @@ class ApproximatePercentileQuerySuite extends QueryTest 
with SharedSparkSession
           Row(Period.ofMonths(200).normalized(), null, 
Duration.ofSeconds(200L)))
     }
   }
+
+  test("SPARK-45079: NULL arguments of percentile_approx") {
+    checkError(
+      exception = intercept[AnalysisException] {
+        sql(
+          """
+            |SELECT percentile_approx(col, array(0.5, 0.4, 0.1), NULL)
+            |FROM VALUES (0), (1), (2), (10) AS tab(col);
+            |""".stripMargin).collect()
+      },
+      errorClass = "DATATYPE_MISMATCH.UNEXPECTED_NULL",
+      parameters = Map(
+        "exprName" -> "accuracy",
+        "sqlExpr" -> "\"percentile_approx(col, array(0.5, 0.4, 0.1), NULL)\""),
+      context = ExpectedContext(
+        "", "", 8, 57, "percentile_approx(col, array(0.5, 0.4, 0.1), NULL)"))
+    checkError(
+      exception = intercept[AnalysisException] {
+        sql(
+          """
+            |SELECT percentile_approx(col, NULL, 100)
+            |FROM VALUES (0), (1), (2), (10) AS tab(col);
+            |""".stripMargin).collect()
+      },
+      errorClass = "DATATYPE_MISMATCH.UNEXPECTED_NULL",
+      parameters = Map(
+        "exprName" -> "percentage",
+        "sqlExpr" -> "\"percentile_approx(col, NULL, 100)\""),
+      context = ExpectedContext(
+        "", "", 8, 40, "percentile_approx(col, NULL, 100)"))
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to