yaooqinn commented on a change in pull request #26279: [SPARK-28382][SQL] Add array function - unnest
URL: https://github.com/apache/spark/pull/26279#discussion_r346732410
 
 

 ##########
 File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala
 ##########
 @@ -343,6 +344,87 @@ abstract class ExplodeBase extends UnaryExpression with CollectionGenerator with
   }
 }
 
+@ExpressionDescription(
+  usage = "_FUNC_(expr) - Separates the elements of array `expr` into multiple rows recursively.",
+  examples = """
+    Examples:
+      > SELECT _FUNC_(array(10, 20));
+       10
+       20
+      > SELECT _FUNC_(a) FROM VALUES (array(1,2)), (array(3,4)) AS v1(a);
+       1
+       2
+       3
+       4
+      > SELECT _FUNC_(a) FROM VALUES (array(array(1,2), array(3,4))) AS v1(a);
+       1
+       2
+       3
+       4
+  """,
+  since = "3.0.0")
+case class UnNest(child: Expression) extends UnaryExpression with Generator with CodegenFallback {
+
+  override def prettyName: String = "unnest"
+
+  override def elementSchema: StructType = {
+    new StructType().add(prettyName, getLeafDataType(child.dataType), nullable = true)
+  }
+
+  // TODO: multidimensional arrays must have array expressions with matching dimensions
+  override def checkInputDataTypes(): TypeCheckResult = child.dataType match {
+    case ArrayType(_: NullType, _) => TypeCheckResult.TypeCheckSuccess
+    case _: ArrayType =>
+      var currentType = child.dataType
+      while (currentType.isInstanceOf[ArrayType]) {
+        val arrayType = currentType.asInstanceOf[ArrayType]
+        if (arrayType.containsNull && !arrayType.elementType.isInstanceOf[AtomicType]) {
+          return TypeCheckResult.TypeCheckFailure("multidimensional arrays must not contain nulls")
+        }
+        currentType = getElementTypeOrItself(currentType)
+      }
+      TypeUtils.checkForSameTypeInputExpr(child.children.map(_.dataType), s"function $prettyName")
+
+    case _ =>
+      TypeCheckResult.TypeCheckFailure(
+        s"input to function unnest should be an array type, not ${child.dataType.catalogString}")
+  }
+
+  @scala.annotation.tailrec
+  private def getLeafDataType(typ: DataType): DataType = typ match {
+    case ArrayType(et, _) => getLeafDataType(et)
+    case at => at
+  }
+
+  private def getElementTypeOrItself(typ: DataType): DataType = typ match {
+    case ArrayType(et, _) => et
+    case _ => typ
+  }
+
+  override def eval(input: InternalRow): TraversableOnce[InternalRow] = {
+    var inputArray = child.eval(input).asInstanceOf[ArrayData]
+    if (inputArray == null) {
+      Nil
+    } else {
+      val rows = ArrayBuffer[InternalRow]()
+      var currentType = child.dataType
+      while (currentType.isInstanceOf[ArrayType]) {
+        val currentElementType = getElementTypeOrItself(currentType)
+        if (!currentElementType.isInstanceOf[ArrayType]) {
+          inputArray.foreach(currentElementType, (_, e) => rows += InternalRow(e))
 
 Review comment:
   Is this a Scala `foreach`? I guess it is Spark `ArrayData`'s own implementation.
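
   For context, a sketch of what that call resolves to: `ArrayData` (in `org.apache.spark.sql.catalyst.util`) defines its own type-directed `foreach`, distinct from the Scala collection method. Roughly paraphrased from the Spark 3.x source, not a verbatim copy:

   ```scala
   // Paraphrased sketch of org.apache.spark.sql.catalyst.util.ArrayData#foreach.
   // It walks the array by index and hands each slot to the callback, reading
   // non-null elements with the supplied Catalyst element type.
   def foreach(elementType: DataType, f: (Int, Any) => Unit): Unit = {
     val size = numElements()
     var i = 0
     while (i < size) {
       if (isNullAt(i)) {
         f(i, null)                 // null slots are passed through as null
       } else {
         f(i, get(i, elementType))  // element decoded using its Catalyst type
       }
       i += 1
     }
   }
   ```

   So in `inputArray.foreach(currentElementType, (_, e) => rows += InternalRow(e))`, the ignored first argument is the element index and `e` is the element value (possibly null).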
