Github user ueshin commented on a diff in the pull request:
https://github.com/apache/spark/pull/21102#discussion_r205342201
--- Diff:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
---
@@ -3805,3 +3801,339 @@ object ArrayUnion {
new GenericArrayData(arrayBuffer)
}
}
+
+/**
+ * Returns an array of the elements in the intersection of x and y, without
duplicates
+ */
+@ExpressionDescription(
+ usage = """
+ _FUNC_(array1, array2) - Returns an array of the elements in the
intersection of array1 and
+ array2, without duplicates.
+ """,
+ examples = """
+ Examples:Fun
+ > SELECT _FUNC_(array(1, 2, 3), array(1, 3, 5));
+ array(1, 3)
+ """,
+ since = "2.4.0")
+case class ArrayIntersect(left: Expression, right: Expression) extends
ArraySetLike {
+ override def dataType: DataType = ArrayType(elementType,
+ left.dataType.asInstanceOf[ArrayType].containsNull &&
+ right.dataType.asInstanceOf[ArrayType].containsNull)
+
+ var hsInt: OpenHashSet[Int] = _
+ var hsResultInt: OpenHashSet[Int] = _
+ var hsLong: OpenHashSet[Long] = _
+ var hsResultLong: OpenHashSet[Long] = _
+
+ def assignInt(array: ArrayData, idx: Int, resultArray: ArrayData, pos:
Int): Boolean = {
+ val elem = array.getInt(idx)
+ if (hsInt.contains(elem) && !hsResultInt.contains(elem)) {
+ if (resultArray != null) {
+ resultArray.setInt(pos, elem)
+ }
+ hsResultInt.add(elem)
+ true
+ } else {
+ false
+ }
+ }
+
+ def assignLong(array: ArrayData, idx: Int, resultArray: ArrayData, pos:
Int): Boolean = {
+ val elem = array.getLong(idx)
+ if (hsLong.contains(elem) && !hsResultLong.contains(elem)) {
+ if (resultArray != null) {
+ resultArray.setLong(pos, elem)
+ }
+ hsResultLong.add(elem)
+ true
+ } else {
+ false
+ }
+ }
+
+ def evalIntLongPrimitiveType(
+ array1: ArrayData,
+ array2: ArrayData,
+ resultArray: ArrayData,
+ initFoundNullElement: Boolean,
+ isLongType: Boolean): (Int, Boolean) = {
+ // store elements into resultArray
+ var i = 0
+ var foundNullElement = initFoundNullElement
+ if (resultArray == null) {
+ // hsInt or hsLong is updated only once since it is not changed
--- End diff --
I might be missing something, but can we do the same thing for `array_except`? It
would be good if we could skip traversing the right array. This is not urgent;
maybe we can do it in a follow-up PR to the `array_except` PR.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]