Github user kiszk commented on a diff in the pull request:
https://github.com/apache/spark/pull/21061#discussion_r192490355
--- Diff:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
---
@@ -1882,3 +1882,311 @@ case class ArrayRepeat(left: Expression, right:
Expression)
}
}
+
+object ArraySetLike {
+ val kindUnion = 1
+
+ private val MAX_ARRAY_LENGTH: Int =
ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH
+
+ def toArrayDataInt(hs: OpenHashSet[Int]): ArrayData = {
+ val array = new Array[Int](hs.size)
+ var pos = hs.nextPos(0)
+ var i = 0
+ while (pos != OpenHashSet.INVALID_POS) {
+ array(i) = hs.getValue(pos)
+ pos = hs.nextPos(pos + 1)
+ i += 1
+ }
+
+ val numBytes = 4L * array.length
+ val unsafeArraySizeInBytes =
UnsafeArrayData.calculateHeaderPortionInBytes(array.length) +
+
org.apache.spark.unsafe.array.ByteArrayMethods.roundNumberOfBytesToNearestWord(numBytes)
+ // Since UnsafeArrayData.fromPrimitiveArray() uses long[], max
elements * 8 bytes can be used
+ if (unsafeArraySizeInBytes <= Integer.MAX_VALUE * 8) {
+ UnsafeArrayData.fromPrimitiveArray(array)
+ } else {
+ new GenericArrayData(array)
+ }
+ }
+
+ def toArrayDataLong(hs: OpenHashSet[Long]): ArrayData = {
+ val array = new Array[Long](hs.size)
+ var pos = hs.nextPos(0)
+ var i = 0
+ while (pos != OpenHashSet.INVALID_POS) {
+ array(i) = hs.getValue(pos)
+ pos = hs.nextPos(pos + 1)
+ i += 1
+ }
+
+ val numBytes = 8L * array.length
+ val unsafeArraySizeInBytes =
UnsafeArrayData.calculateHeaderPortionInBytes(array.length) +
+
org.apache.spark.unsafe.array.ByteArrayMethods.roundNumberOfBytesToNearestWord(numBytes)
+ // Since UnsafeArrayData.fromPrimitiveArray() uses long[], max
elements * 8 bytes can be used
+ if (unsafeArraySizeInBytes <= Integer.MAX_VALUE * 8) {
--- End diff --
Ah, I misunderstood. Accepting `Integer.MAX_VALUE * 8` looks like a future plan.
Anyway, I will use the same calculation as in
`UnsafeArrayData.fromPrimitiveArray()`.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]