Kimahriman commented on code in PR #53468:
URL: https://github.com/apache/spark/pull/53468#discussion_r2625497953


##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala:
##########
@@ -4201,62 +4207,32 @@ case class ArrayDistinct(child: Expression)
     }
   }
 
-  override def nullSafeEval(array: Any): Any = {
-    val data = array.asInstanceOf[ArrayData]
-    doEvaluation(data)
-  }
-
-  @transient private lazy val doEvaluation = if (TypeUtils.typeWithProperEquals(elementType)) {
-    (array: ArrayData) =>
-      val arrayBuffer = new scala.collection.mutable.ArrayBuffer[Any]
-      val hs = new SQLOpenHashSet[Any]()
-      val withNaNCheckFunc = SQLOpenHashSet.withNaNCheckFunc(elementType, hs,
-        (value: Any) =>
-          if (!hs.contains(value)) {
-            if (arrayBuffer.size > ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH) {
-              throw QueryExecutionErrors.arrayFunctionWithElementsExceedLimitError(
-                prettyName, arrayBuffer.size)
-            }
-            arrayBuffer += value
-            hs.add(value)
-          },
-        (valueNaN: Any) => arrayBuffer += valueNaN)
-      val withNullCheckFunc = SQLOpenHashSet.withNullCheckFunc(elementType, hs,
-        (value: Any) => withNaNCheckFunc(value),
-        () => arrayBuffer += null)
-      var i = 0
-      while (i < array.numElements()) {
-        withNullCheckFunc(array, i)
-        i += 1
-      }
-      new GenericArrayData(arrayBuffer)
-  } else {
-    (data: ArrayData) => {
-      val array = data.toArray[AnyRef](elementType)
-      val arrayBuffer = new scala.collection.mutable.ArrayBuffer[AnyRef]
-      var alreadyStoredNull = false
-      for (i <- array.indices) {
-        if (array(i) != null) {
-          var found = false
-          var j = 0
-          while (!found && j < arrayBuffer.size) {
-            val va = arrayBuffer(j)
-            found = (va != null) && ordering.equiv(va, array(i))
-            j += 1
-          }
-          if (!found) {
-            arrayBuffer += array(i)
-          }
-        } else {
-          // De-duplicate the null values.
-          if (!alreadyStoredNull) {
-            arrayBuffer += array(i)
-            alreadyStoredNull = true
+  override def nullSafeEval(input: Any): Any = {
+    val array = input.asInstanceOf[ArrayData]
+    val arrayBuffer = new scala.collection.mutable.ArrayBuffer[Any]
+    val hs = new SQLOpenHashSet[Any]()
+    val withNaNCheckFunc = SQLOpenHashSet.withNaNCheckFunc(elementType, hs,
+      (value: Any) => {
+        val key = keyGenerator(value)
+        if (!hs.contains(key)) {
+          if (arrayBuffer.size > ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH) {
+            throw QueryExecutionErrors.arrayFunctionWithElementsExceedLimitError(
+              prettyName, arrayBuffer.size)
           }
+          arrayBuffer += value
+          hs.add(key)
         }
-      }
-      new GenericArrayData(arrayBuffer)
+      },
+      (valueNaN: Any) => arrayBuffer += valueNaN)
+    val withNullCheckFunc = SQLOpenHashSet.withNullCheckFunc(elementType, hs,
+      (value: Any) => withNaNCheckFunc(value),
+      () => arrayBuffer += null)
+    var i = 0
+    while (i < array.numElements()) {
+      withNullCheckFunc(array, i)
+      i += 1
     }
+    new GenericArrayData(arrayBuffer)
   }
 
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {

Review Comment:
   The codegen path already just calls the eval function for non-simple types. 
I didn't look too closely at how much work it would take to also implement 
this directly in codegen. The types supported in codegen are even stricter 
than those in the existing hashing eval case (strings fall back to 
interpreted evaluation). I assumed that might be for a reason.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to