Github user DylanGuedes commented on a diff in the pull request:
https://github.com/apache/spark/pull/21045#discussion_r189316316
--- Diff:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
---
@@ -126,6 +126,134 @@ case class MapKeys(child: Expression)
override def prettyName: String = "map_keys"
}
+@ExpressionDescription(
+ usage = """_FUNC_(a1, a2, ...) - Returns a merged array containing in
the N-th position the
+ N-th value of each array given.""",
+ examples = """
+ Examples:
+ > SELECT _FUNC_(array(1, 2, 3), array(2, 3, 4));
+ [[1, 2], [2, 3], [3, 4]]
+ > SELECT _FUNC_(array(1, 2), array(2, 3), array(3, 4));
+ [[1, 2, 3], [2, 3, 4]]
+ """,
+ since = "2.4.0")
+case class Zip(children: Seq[Expression]) extends Expression with
ExpectsInputTypes {
+
+ override def inputTypes: Seq[AbstractDataType] =
Seq.fill(children.length)(ArrayType)
+
+ override def dataType: DataType = ArrayType(mountSchema)
+
+ override def prettyName: String = "zip"
+
+ override def nullable: Boolean = children.forall(_.nullable)
+
+ lazy val numberOfArrays: Int = children.length
+
+ def mountSchema: StructType = {
+ val arrayAT = children.map(_.dataType.asInstanceOf[ArrayType])
+ val fields = arrayAT.zipWithIndex.foldRight(List[StructField]()) {
case ((arr, idx), list) =>
+ StructField(s"_$idx", arr.elementType, children(idx).nullable ||
arr.containsNull) :: list
+ }
+ StructType(fields)
+ }
+
+ override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+ val genericArrayData = classOf[GenericArrayData].getName
+ val genericInternalRow = classOf[GenericInternalRow].getName
+
+ val evals = children.map(_.genCode(ctx))
+
+ val arrayTypes =
children.map(_.dataType.asInstanceOf[ArrayType].elementType)
+
+ val arrVals = ctx.freshName("arrVals")
+ val arrCardinality = ctx.freshName("arrCardinality")
+ val biggestCardinality = ctx.freshName("biggestCardinality")
+
+ val inputs = evals.zipWithIndex.map { case (eval, index) =>
+ s"""
+ |${eval.code}
+ |if (!${eval.isNull}) {
+ | $arrVals[$index] = ${eval.value};
+ | $arrCardinality[$index] = ${eval.value}.numElements();
+ |} else {
+ | $arrVals[$index] = null;
+ | $arrCardinality[$index] = 0;
+ |}
+ |$biggestCardinality = Math.max($biggestCardinality,
$arrCardinality[$index]);
+ """.stripMargin
+ }.mkString("\n")
+
+ val myobject = ctx.freshName("myobject")
+ val j = ctx.freshName("j")
+ val i = ctx.freshName("i")
+ val args = ctx.freshName("args")
+
+ val fillValue = evals.zipWithIndex.map { case (eval, index) =>
+ s"""
+ | if ($j == ${index}) {
+ | $myobject[$j] = ${CodeGenerator.getValue(s"$arrVals[$j]",
arrayTypes(index), i)};
+ | }
+ """.stripMargin
+ }
+
+ val fillValueSplitted = ctx.splitExpressions(
+ expressions = fillValue,
+ funcName = "fillValue",
+ arguments =
+ ("int", j) ::
+ ("Array[] Object", myobject) :: Nil)
+
+ ev.copy(s"""
+ |ArrayData[] $arrVals = new ArrayData[$numberOfArrays];
+ |int[] $arrCardinality = new int[$numberOfArrays];
+ |int $biggestCardinality = 0;
+ |$inputs
+ |Object[] $args = new Object[$biggestCardinality];
+ |for (int $i = 0; $i < $biggestCardinality; $i ++) {
+ | Object[] $myobject = new Object[$numberOfArrays];
+ | for (int $j = 0; $j < $numberOfArrays; $j ++) {
+ | if ($arrVals[$j] != null && $arrCardinality[$j] > $i &&
!$arrVals[$j].isNullAt($i)) {
+ | $fillValueSplitted
--- End diff --
I changed to the following solution: I generate one `if` for each distinct
element type, instead of an `if` for every expression. This way, in the worst
case I generate as many branches as there are distinct array element types (I
think there are fewer than 20). What do you think? The whole problem is that I'm
not able to evaluate `j` inside the `getValue` call, which would otherwise allow
me to use `getValue(s"$arrVals[$j]", arrayElementTypes(eval(j)), i)`.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]