Github user mn-mikke commented on a diff in the pull request:
https://github.com/apache/spark/pull/21050#discussion_r184466157
--- Diff:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
---
@@ -1059,3 +1059,78 @@ case class Flatten(child: Expression) extends UnaryExpression {
override def prettyName: String = "flatten"
}
+
+/**
+ * Removes duplicate values from the array.
+ */
+@ExpressionDescription(
+ usage = "_FUNC_(array) - Removes duplicate values from the array.",
+ examples = """
+ Examples:
+ > SELECT _FUNC_(array(1, 2, 3, null, 3));
+ [1,2,3,null]
+ """, since = "2.4.0")
+case class ArrayDistinct(child: Expression)
+ extends UnaryExpression with ExpectsInputTypes {
+
+ override def inputTypes: Seq[AbstractDataType] = Seq(ArrayType)
+
+ override def dataType: DataType = child.dataType
+
+ override def nullSafeEval(array: Any): Any = {
+ val elementType = child.dataType.asInstanceOf[ArrayType].elementType
+ val data = array.asInstanceOf[ArrayData].toArray[AnyRef](elementType).distinct
+ new GenericArrayData(data.asInstanceOf[Array[Any]])
+ }
+
+ override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+ val elementType = dataType.asInstanceOf[ArrayType].elementType
+ nullSafeCodeGen(ctx, ev, (array) => {
+ val arrayClass = classOf[GenericArrayData].getName
+ val tempArray = ctx.freshName("tempArray")
+ val distinctArray = ctx.freshName("distinctArray")
+ val i = ctx.freshName("i")
+ val j = ctx.freshName("j")
+ val pos = ctx.freshName("arrayPosition")
+ val getValue1 = CodeGenerator.getValue(array, elementType, i)
+ val getValue2 = CodeGenerator.getValue(array, elementType, j)
+ s"""
+ |int $pos = 0;
+ |Object[] $tempArray = new Object[$array.numElements()];
+ |for (int $i = 0; $i < $array.numElements(); $i ++) {
+ | if ($array.isNullAt($i)) {
+ | int $j;
+ | for ($j = 0; $j < $i; $j ++) {
+ | if ($array.isNullAt($j))
+ | break;
+ | }
+ | if ($i == $j) {
+ | $tempArray[$pos] = null;
+ | $pos = $pos + 1;
+ | }
+ | }
+ | else {
+ | int $j;
+ | for ($j = 0; $j < $i; $j ++) {
+ | if (${ctx.genEqual(elementType, getValue1, getValue2)})
--- End diff --
Shouldn't you check `$array.isNullAt($j)` in this loop as well, especially
when `elementType` is primitive?
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]