Github user mn-mikke commented on a diff in the pull request:
https://github.com/apache/spark/pull/21050#discussion_r184471365
--- Diff:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
---
@@ -1059,3 +1059,78 @@ case class Flatten(child: Expression) extends
UnaryExpression {
override def prettyName: String = "flatten"
}
+
+/**
+ * Removes duplicate values from the array.
+ */
+@ExpressionDescription(
+ usage = "_FUNC_(array) - Removes duplicate values from the array.",
+ examples = """
+ Examples:
+ > SELECT _FUNC_(array(1, 2, 3, null, 3));
+ [1,2,3,null]
+ """, since = "2.4.0")
+case class ArrayDistinct(child: Expression)
+ extends UnaryExpression with ExpectsInputTypes {
+
+ override def inputTypes: Seq[AbstractDataType] = Seq(ArrayType)
+
+ override def dataType: DataType = child.dataType
+
+ override def nullSafeEval(array: Any): Any = {
+ val elementType = child.dataType.asInstanceOf[ArrayType].elementType
+ val data =
array.asInstanceOf[ArrayData].toArray[AnyRef](elementType).distinct
+ new GenericArrayData(data.asInstanceOf[Array[Any]])
+ }
+
+ override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+ val elementType = dataType.asInstanceOf[ArrayType].elementType
+ nullSafeCodeGen(ctx, ev, (array) => {
+ val arrayClass = classOf[GenericArrayData].getName
+ val tempArray = ctx.freshName("tempArray")
+ val distinctArray = ctx.freshName("distinctArray")
+ val i = ctx.freshName("i")
+ val j = ctx.freshName("j")
+ val pos = ctx.freshName("arrayPosition")
+ val getValue1 = CodeGenerator.getValue(array, elementType, i)
+ val getValue2 = CodeGenerator.getValue(array, elementType, j)
+ s"""
+ |int $pos = 0;
+ |Object[] $tempArray = new Object[$array.numElements()];
--- End diff --
Just wondering about cases where big arrays contain lots of duplicated values.
In such cases, `$tempArray` is unnecessarily big. What about performing the
filtering in two passes? The first pass would calculate the result array size, and
the second would copy items from the source to the result.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]