This is an automated email from the ASF dual-hosted git repository.

dtenedor pushed a commit to branch branch-4.1
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-4.1 by this push:
     new 14489233a35e [SPARK-43986][SQL][FOLLOWUP] Catch 
ArrayIndexOutOfBoundsException in Datasketches HLL functions
14489233a35e is described below

commit 14489233a35ec07ce4e05092aba7e2e3a4922870
Author: Daniel Tenedorio <[email protected]>
AuthorDate: Fri Jan 16 10:20:15 2026 -0800

    [SPARK-43986][SQL][FOLLOWUP] Catch ArrayIndexOutOfBoundsException in 
Datasketches HLL functions
    
    ### What changes were proposed in this pull request?
    
    This PR fixes error handling in `HllUnionAgg`, `HllSketchEstimate`, and
    `HllUnion` to properly catch `ArrayIndexOutOfBoundsException` and convert
    it to the user-friendly `HLL_INVALID_INPUT_SKETCH_BUFFER` error.
    
    When an invalid HLL sketch binary is passed to `HllUnionAgg`, the 
DataSketches library may throw an `ArrayIndexOutOfBoundsException` during 
parsing. For example, if the `curMode` ordinal in the HLL preamble is set to an 
invalid value (e.g., 3 when only 0, 1, 2 are valid), `CurMode.fromOrdinal()` 
throws this exception. Previously, this exception would propagate to the user 
as an unhelpful `ArrayIndexOutOfBoundsException`.
    
    The fix adds `ArrayIndexOutOfBoundsException` to the existing catch blocks
    (one in `HllUnionAgg`, one in `HllSketchEstimate`, and two in `HllUnion`)
    that already handle `SketchesArgumentException` and `java.lang.Error`,
    ensuring all invalid sketch buffer errors are converted to the clear
    `HLL_INVALID_INPUT_SKETCH_BUFFER` error message.
    
    ### Why are the changes needed?
    
    Without this fix, users see a confusing `ArrayIndexOutOfBoundsException`
    when passing malformed binary data to the HLL sketch functions (union,
    union aggregate, and estimate). This is a poor user experience because the
    exception does not indicate what went wrong or how to fix it. The
    `HLL_INVALID_INPUT_SKETCH_BUFFER` error clearly indicates that the input
    binary is not a valid HLL sketch.
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes. Users who pass invalid binary data to `hll_union_agg`, `hll_union`,
    or `hll_sketch_estimate` will now see a clear error message
    (`HLL_INVALID_INPUT_SKETCH_BUFFER`) instead of an
    `ArrayIndexOutOfBoundsException`.
    
    ### How was this patch tested?
    
    Added a new unit test `"HllUnionAgg throws proper error for invalid binary 
input causing ArrayIndexOutOfBounds"` in `DatasketchesHllSketchSuite` that:
    1. Crafts a byte array with a valid HLL preamble structure but an invalid 
`curMode` ordinal (3 instead of 0, 1, or 2)
    2. Verifies that the exception is NOT an `ArrayIndexOutOfBoundsException`
    3. Verifies that the error message contains 
`HLL_INVALID_INPUT_SKETCH_BUFFER`
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    Yes, `claude-4.5-opus-high` plus manual review and editing.
    
    Closes #53823 from dtenedor/fix-kll-quantiles-errors.
    
    Authored-by: Daniel Tenedorio <[email protected]>
    Signed-off-by: Daniel Tenedorio <[email protected]>
    (cherry picked from commit 6939fcd4a313717e0ac425a3bfed7a784908adca)
    Signed-off-by: Daniel Tenedorio <[email protected]>
---
 .../aggregate/datasketchesAggregates.scala         |  3 +-
 .../expressions/datasketchesExpressions.scala      |  9 +++--
 .../aggregate/DatasketchesHllSketchSuite.scala     | 45 ++++++++++++++++++++++
 3 files changed, 53 insertions(+), 4 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/datasketchesAggregates.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/datasketchesAggregates.scala
index 8ae9b8fddde7..8fb1bf51319c 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/datasketchesAggregates.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/datasketchesAggregates.scala
@@ -338,7 +338,8 @@ case class HllUnionAgg(
             union.update(sketch)
             Some(union)
           } catch {
-            case _: SketchesArgumentException | _: java.lang.Error =>
+            case _: SketchesArgumentException | _: java.lang.Error
+                 | _: ArrayIndexOutOfBoundsException =>
               throw 
QueryExecutionErrors.hllInvalidInputSketchBuffer(prettyName)
           }
         case _ =>
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datasketchesExpressions.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datasketchesExpressions.scala
index a4ac0bdbb11d..1880d71e7d54 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datasketchesExpressions.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datasketchesExpressions.scala
@@ -56,7 +56,8 @@ case class HllSketchEstimate(child: Expression)
     try {
       Math.round(HllSketch.heapify(Memory.wrap(buffer)).getEstimate)
     } catch {
-      case _: SketchesArgumentException | _: java.lang.Error =>
+      case _: SketchesArgumentException | _: java.lang.Error
+           | _: ArrayIndexOutOfBoundsException =>
         throw QueryExecutionErrors.hllInvalidInputSketchBuffer(prettyName)
     }
   }
@@ -108,13 +109,15 @@ case class HllUnion(first: Expression, second: 
Expression, third: Expression)
     val sketch1 = try {
       HllSketch.heapify(Memory.wrap(value1.asInstanceOf[Array[Byte]]))
     } catch {
-      case _: SketchesArgumentException | _: java.lang.Error =>
+      case _: SketchesArgumentException | _: java.lang.Error
+           | _: ArrayIndexOutOfBoundsException =>
         throw QueryExecutionErrors.hllInvalidInputSketchBuffer(prettyName)
     }
     val sketch2 = try {
       HllSketch.heapify(Memory.wrap(value2.asInstanceOf[Array[Byte]]))
     } catch {
-      case _: SketchesArgumentException | _: java.lang.Error =>
+      case _: SketchesArgumentException | _: java.lang.Error
+           | _: ArrayIndexOutOfBoundsException =>
         throw QueryExecutionErrors.hllInvalidInputSketchBuffer(prettyName)
     }
     val allowDifferentLgConfigK = value3.asInstanceOf[Boolean]
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/DatasketchesHllSketchSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/DatasketchesHllSketchSuite.scala
index 0841702cc518..0f7f5ca54be0 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/DatasketchesHllSketchSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/DatasketchesHllSketchSuite.scala
@@ -108,4 +108,49 @@ class DatasketchesHllSketchSuite extends SparkFunSuite {
 
     
assert(HllSketch.heapify(Memory.wrap(binary3.asInstanceOf[Array[Byte]])).getLgConfigK
 == 12)
   }
+
+  test("HllUnionAgg throws proper error for invalid binary input causing 
ArrayIndexOutOfBounds") {
+    val aggFunc = new HllUnionAgg(BoundReference(0, BinaryType, nullable = 
true), true)
+    val union = aggFunc.createAggregationBuffer()
+
+    // Craft a byte array that passes initial size checks but has an invalid 
CurMode ordinal.
+    // HLL preamble layout:
+    //   Byte 0: preInts (preamble size in ints)
+    //   Byte 1: serVer (must be 1)
+    //   Byte 2: famId (must be 7 for HLL)
+    //   Byte 3: lgK (4-21)
+    //   Byte 5: flags
+    //   Byte 7: modeByte - bits 0-1 contain curMode ordinal (0=LIST, 1=SET, 
2=HLL)
+    //
+    // Setting bits 0-1 of byte 7 to 0b11 (=3) causes CurMode.fromOrdinal(3) 
to throw
+    // ArrayIndexOutOfBoundsException since CurMode only has ordinals 0, 1, 2.
+    // This happens in PreambleUtil.extractCurMode() before other validations 
run.
+    val invalidBinary = Array[Byte](
+      2,    // byte 0: preInts = 2 (LIST_PREINTS, passes check)
+      1,    // byte 1: serVer = 1 (valid)
+      7,    // byte 2: famId = 7 (HLL family)
+      12,   // byte 3: lgK = 12 (valid range 4-21)
+      0,    // byte 4: unused
+      0,    // byte 5: flags = 0
+      0,    // byte 6: unused
+      3     // byte 7: modeByte with bits 0-1 = 0b11 = 3 (INVALID curMode 
ordinal!)
+    )
+
+    val exception = intercept[Exception] {
+      aggFunc.update(union, InternalRow(invalidBinary))
+    }
+
+    // Verify that ArrayIndexOutOfBoundsException is properly caught and 
converted
+    // to the user-friendly HLL_INVALID_INPUT_SKETCH_BUFFER error
+    assert(
+      !exception.isInstanceOf[ArrayIndexOutOfBoundsException],
+      s"ArrayIndexOutOfBoundsException should be caught and converted to " +
+        s"HLL_INVALID_INPUT_SKETCH_BUFFER error, but got: 
${exception.getClass.getName}"
+    )
+    assert(
+      exception.getMessage.contains("HLL_INVALID_INPUT_SKETCH_BUFFER"),
+      s"Expected HLL_INVALID_INPUT_SKETCH_BUFFER error, " +
+        s"but got: ${exception.getClass.getName}: ${exception.getMessage}"
+    )
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to