Re: [PR] [SPARK-54698][SQL] Support hashing for all data types for array set like operations [spark]

via GitHub Wed, 17 Dec 2025 07:35:38 -0800


qlong commented on code in PR #53468:
URL: https://github.com/apache/spark/pull/53468#discussion_r2627462373



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala:
##########
@@ -4201,62 +4207,32 @@ case class ArrayDistinct(child: Expression)
     }
   }
 
-  override def nullSafeEval(array: Any): Any = {
-    val data = array.asInstanceOf[ArrayData]
-    doEvaluation(data)
-  }
-
-  @transient private lazy val doEvaluation = if (TypeUtils.typeWithProperEquals(elementType)) {
-    (array: ArrayData) =>
-      val arrayBuffer = new scala.collection.mutable.ArrayBuffer[Any]
-      val hs = new SQLOpenHashSet[Any]()
-      val withNaNCheckFunc = SQLOpenHashSet.withNaNCheckFunc(elementType, hs,
-        (value: Any) =>
-          if (!hs.contains(value)) {
-            if (arrayBuffer.size > ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH) {
-              throw QueryExecutionErrors.arrayFunctionWithElementsExceedLimitError(
-                prettyName, arrayBuffer.size)
-            }
-            arrayBuffer += value
-            hs.add(value)
-          },
-        (valueNaN: Any) => arrayBuffer += valueNaN)
-      val withNullCheckFunc = SQLOpenHashSet.withNullCheckFunc(elementType, hs,
-        (value: Any) => withNaNCheckFunc(value),
-        () => arrayBuffer += null)
-      var i = 0
-      while (i < array.numElements()) {
-        withNullCheckFunc(array, i)
-        i += 1
-      }
-      new GenericArrayData(arrayBuffer)
-  } else {
-    (data: ArrayData) => {
-      val array = data.toArray[AnyRef](elementType)
-      val arrayBuffer = new scala.collection.mutable.ArrayBuffer[AnyRef]
-      var alreadyStoredNull = false
-      for (i <- array.indices) {
-        if (array(i) != null) {
-          var found = false
-          var j = 0
-          while (!found && j < arrayBuffer.size) {
-            val va = arrayBuffer(j)
-            found = (va != null) && ordering.equiv(va, array(i))
-            j += 1
-          }
-          if (!found) {
-            arrayBuffer += array(i)
-          }
-        } else {
-          // De-duplicate the null values.
-          if (!alreadyStoredNull) {
-            arrayBuffer += array(i)
-            alreadyStoredNull = true
+  override def nullSafeEval(input: Any): Any = {
+    val array = input.asInstanceOf[ArrayData]
+    val arrayBuffer = new scala.collection.mutable.ArrayBuffer[Any]
+    val hs = new SQLOpenHashSet[Any]()
+    val withNaNCheckFunc = SQLOpenHashSet.withNaNCheckFunc(elementType, hs,
+      (value: Any) => {
+        val key = keyGenerator(value)
+        if (!hs.contains(key)) {
+          if (arrayBuffer.size > ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH) {
+            throw QueryExecutionErrors.arrayFunctionWithElementsExceedLimitError(
+              prettyName, arrayBuffer.size)
           }
+          arrayBuffer += value
+          hs.add(key)
         }
-      }
-      new GenericArrayData(arrayBuffer)
+      },
+      (valueNaN: Any) => arrayBuffer += valueNaN)
+    val withNullCheckFunc = SQLOpenHashSet.withNullCheckFunc(elementType, hs,
+      (value: Any) => withNaNCheckFunc(value),
+      () => arrayBuffer += null)
+    var i = 0
+    while (i < array.numElements()) {
+      withNullCheckFunc(array, i)
+      i += 1
     }
+    new GenericArrayData(arrayBuffer)
   }
 
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {

Review Comment:
   Now I see that the codegen just delegates to eval for non-primitive types.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] [SPARK-54698][SQL] Support hashing for all data types for array set like operations [spark]

Reply via email to