[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user asfgit closed the pull request at: https://github.com/apache/spark/pull/21028
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r188840260 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -18,15 +18,53 @@ package org.apache.spark.sql.catalyst.expressions import java.util.Comparator +import scala.collection.mutable + import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.analysis.TypeCheckResult +import org.apache.spark.sql.catalyst.analysis.{TypeCheckResult, TypeCoercion} import org.apache.spark.sql.catalyst.expressions.ArraySortLike.NullOrder import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData, MapData, TypeUtils} +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.unsafe.array.ByteArrayMethods import org.apache.spark.unsafe.types.{ByteArray, UTF8String} +/** + * Base trait for [[BinaryExpression]]s with two arrays of the same element type and implicit + * casting. + */ +trait BinaryArrayExpressionWithImplicitCast extends BinaryExpression + with ImplicitCastInputTypes { + + private val caseSensitive = SQLConf.get.caseSensitiveAnalysis + + @transient protected lazy val elementType: DataType = +inputTypes.head.asInstanceOf[ArrayType].elementType + + override def inputTypes: Seq[AbstractDataType] = { +(left.dataType, right.dataType) match { + case (ArrayType(e1, hasNull1), ArrayType(e2, hasNull2)) => +TypeCoercion.findTightestCommonType(e1, e2, caseSensitive) match { --- End diff -- this should fail the build now --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
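The trait quoted above derives a single common element type for the two array arguments and asks the analyzer to cast both inputs to arrays of that type. A toy, self-contained sketch of that idea follows; the `ToyType` ADT and `tightestCommon` function are invented purely for illustration, and Spark's real `TypeCoercion` rules are far richer than this.

```scala
// Toy sketch of the implicit-cast idea in the quoted trait: find the "tightest"
// common type of the two element types, then treat both inputs as arrays of it.
// All names here are invented for the example; this is not Spark code.
sealed trait ToyType
case object IntT extends ToyType
case object LongT extends ToyType
case object StringT extends ToyType

def tightestCommon(t1: ToyType, t2: ToyType): Option[ToyType] = (t1, t2) match {
  case (a, b) if a == b              => Some(a)
  case (IntT, LongT) | (LongT, IntT) => Some(LongT)
  case _                             => None // no implicit cast possible
}

// e.g. tightestCommon(IntT, LongT) == Some(LongT):
// both arguments would then be required as array<long>.
```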
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user mgaido91 commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r188583588 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -529,6 +567,235 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least a non-null element present also in a2. If the arrays have no common element and they are both non-empty and either of them contains a null element null is returned, false otherwise.", + examples = """ +Examples: + > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); + true + """, since = "2.4.0") +// scalastyle:off line.size.limit +case class ArraysOverlap(left: Expression, right: Expression) + extends BinaryArrayExpressionWithImplicitCast { + + override def checkInputDataTypes(): TypeCheckResult = super.checkInputDataTypes() match { +case TypeCheckResult.TypeCheckSuccess => + if (RowOrdering.isOrderable(elementType)) { +TypeCheckResult.TypeCheckSuccess + } else { +TypeCheckResult.TypeCheckFailure(s"${elementType.simpleString} cannot be used in comparison.") + } +case failure => failure + } + + @transient private lazy val ordering: Ordering[Any] = +TypeUtils.getInterpretedOrdering(elementType) + + @transient private lazy val elementTypeSupportEquals = elementType match { +case BinaryType => false +case _: AtomicType => true +case _ => false + } + + @transient private lazy val doEvaluation = if (elementTypeSupportEquals) { +fastEval _ + } else { +bruteForceEval _ + } + + override def dataType: DataType = BooleanType + + override def nullable: Boolean = { +left.nullable || right.nullable || left.dataType.asInstanceOf[ArrayType].containsNull || + right.dataType.asInstanceOf[ArrayType].containsNull + } + + override def nullSafeEval(a1: Any, a2: Any): Any = { +doEvaluation(a1.asInstanceOf[ArrayData], a2.asInstanceOf[ArrayData]) + } + + /** + * A fast implementation which puts all the elements from the smaller array in a set + * and then performs a lookup on it for each element of the bigger one. + * This eval mode works only for data types which implements properly the equals method. + */ + private def fastEval(arr1: ArrayData, arr2: ArrayData): Any = { +var hasNull = false +val (bigger, smaller) = if (arr1.numElements() > arr2.numElements()) { + (arr1, arr2) +} else { + (arr2, arr1) +} +if (smaller.numElements() > 0) { + val smallestSet = new mutable.HashSet[Any] + smaller.foreach(elementType, (_, v) => +if (v == null) { + hasNull = true +} else { + smallestSet += v +}) + bigger.foreach(elementType, (_, v1) => +if (v1 == null) { + hasNull = true +} else if (smallestSet.contains(v1)) { + return true +} + ) +} +if (hasNull) { + null +} else { + false +} + } + + /** + * A slower evaluation which performs a nested loop and supports all the data types. + */ + private def bruteForceEval(arr1: ArrayData, arr2: ArrayData): Any = { +var hasNull = false +if (arr1.numElements() > 0) { --- End diff -- thanks, I added also some test cases for this --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
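For readers following this thread, here is a minimal, self-contained sketch of the slower nested-loop strategy under discussion, including the empty-array guard the review asks about: a null element only produces a NULL result when both inputs are non-empty. Plain Scala arrays, `==` equality, and `Option` stand in for Spark's `ArrayData`, interpreted ordering, and SQL NULL; the name `bruteForceOverlap` is invented here and this is not the PR's actual code.

```scala
// Sketch of the brute-force overlap check with the empty-array guard discussed above.
// None models SQL NULL; plain equality replaces ordering.equiv for brevity.
def bruteForceOverlap(arr1: Array[AnyRef], arr2: Array[AnyRef]): Option[Boolean] = {
  var hasNull = false
  if (arr1.nonEmpty && arr2.nonEmpty) {
    for (v1 <- arr1) {
      if (v1 == null) hasNull = true
      else for (v2 <- arr2) {
        if (v2 == null) hasNull = true
        else if (v1 == v2) return Some(true) // first common element found
      }
    }
  }
  if (hasNull) None else Some(false)
}
```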
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user ueshin commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r188570497 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -529,6 +567,235 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least a non-null element present also in a2. If the arrays have no common element and they are both non-empty and either of them contains a null element null is returned, false otherwise.", + examples = """ +Examples: + > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); + true + """, since = "2.4.0") +// scalastyle:off line.size.limit +case class ArraysOverlap(left: Expression, right: Expression) + extends BinaryArrayExpressionWithImplicitCast { + + override def checkInputDataTypes(): TypeCheckResult = super.checkInputDataTypes() match { +case TypeCheckResult.TypeCheckSuccess => + if (RowOrdering.isOrderable(elementType)) { +TypeCheckResult.TypeCheckSuccess + } else { +TypeCheckResult.TypeCheckFailure(s"${elementType.simpleString} cannot be used in comparison.") + } +case failure => failure + } + + @transient private lazy val ordering: Ordering[Any] = +TypeUtils.getInterpretedOrdering(elementType) + + @transient private lazy val elementTypeSupportEquals = elementType match { +case BinaryType => false +case _: AtomicType => true +case _ => false + } + + @transient private lazy val doEvaluation = if (elementTypeSupportEquals) { +fastEval _ + } else { +bruteForceEval _ + } + + override def dataType: DataType = BooleanType + + override def nullable: Boolean = { +left.nullable || right.nullable || left.dataType.asInstanceOf[ArrayType].containsNull || + right.dataType.asInstanceOf[ArrayType].containsNull + } + + override def nullSafeEval(a1: Any, a2: Any): Any = { +doEvaluation(a1.asInstanceOf[ArrayData], a2.asInstanceOf[ArrayData]) + } + + /** + * A fast implementation which puts all the elements from the smaller array in a set + * and then performs a lookup on it for each element of the bigger one. + * This eval mode works only for data types which implements properly the equals method. + */ + private def fastEval(arr1: ArrayData, arr2: ArrayData): Any = { +var hasNull = false +val (bigger, smaller) = if (arr1.numElements() > arr2.numElements()) { + (arr1, arr2) +} else { + (arr2, arr1) +} +if (smaller.numElements() > 0) { + val smallestSet = new mutable.HashSet[Any] + smaller.foreach(elementType, (_, v) => +if (v == null) { + hasNull = true +} else { + smallestSet += v +}) + bigger.foreach(elementType, (_, v1) => +if (v1 == null) { + hasNull = true +} else if (smallestSet.contains(v1)) { + return true +} + ) +} +if (hasNull) { + null +} else { + false +} + } + + /** + * A slower evaluation which performs a nested loop and supports all the data types. + */ + private def bruteForceEval(arr1: ArrayData, arr2: ArrayData): Any = { +var hasNull = false +if (arr1.numElements() > 0) { --- End diff -- We should also compare the `numElements` here and check the array is empty for the smaller one? Otherwise the result is different if the `arr1` is not empty and contains `null` and `arr2` is empty? --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
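The case raised here (a non-empty array containing a null versus an empty array) is easiest to see against the documented semantics. Below is a tiny reference implementation of those semantics in plain Scala, where `None` models SQL NULL; the name `arraysOverlapRef` and the `Seq[Option[T]]` encoding are for illustration only.

```scala
// Reference semantics from the usage string: true if there is a non-null common
// element; NULL only when there is no match, both arrays are non-empty, and either
// contains a null; false otherwise.
def arraysOverlapRef[T](a1: Seq[Option[T]], a2: Seq[Option[T]]): Option[Boolean] = {
  val common = a1.flatten.toSet.intersect(a2.flatten.toSet)
  if (common.nonEmpty) Some(true)
  else if (a1.nonEmpty && a2.nonEmpty && (a1.contains(None) || a2.contains(None))) None
  else Some(false)
}

// arraysOverlapRef(Seq(Some(1), Some(2), Some(3)), Seq(Some(3), Some(4)))  => Some(true)
// arraysOverlapRef(Seq(Some(1), None), Seq(Some(4)))                       => None
// arraysOverlapRef(Seq(Some(1), None), Seq.empty)                          => Some(false)  (the case raised above)
```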
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user mgaido91 commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r188200606 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -529,6 +567,239 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least a non-null element present also in a2. If the arrays have no common element and they are both non-empty and either of them contains a null element null is returned, false otherwise.", + examples = """ +Examples: + > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); + true + """, since = "2.4.0") +// scalastyle:off line.size.limit +case class ArraysOverlap(left: Expression, right: Expression) + extends BinaryArrayExpressionWithImplicitCast { + + override def checkInputDataTypes(): TypeCheckResult = super.checkInputDataTypes() match { +case TypeCheckResult.TypeCheckSuccess => + if (RowOrdering.isOrderable(elementType)) { +TypeCheckResult.TypeCheckSuccess + } else { +TypeCheckResult.TypeCheckFailure(s"${elementType.simpleString} cannot be used in comparison.") + } +case failure => failure + } + + @transient private lazy val ordering: Ordering[Any] = +TypeUtils.getInterpretedOrdering(elementType) + + @transient private lazy val elementTypeSupportEquals = elementType match { +case BinaryType => false +case _: AtomicType => true +case _ => false + } + + @transient private lazy val doEvaluation = if (elementTypeSupportEquals) { +fastEval _ + } else { +bruteForceEval _ + } + + override def dataType: DataType = BooleanType + + override def nullable: Boolean = { +left.nullable || right.nullable || left.dataType.asInstanceOf[ArrayType].containsNull || + right.dataType.asInstanceOf[ArrayType].containsNull + } + + override def nullSafeEval(a1: Any, a2: Any): Any = { +doEvaluation(a1.asInstanceOf[ArrayData], a2.asInstanceOf[ArrayData]) + } + + /** + * A fast implementation which puts all the elements from the smaller array in a set + * and then performs a lookup on it for each element of the bigger one. + * This eval mode works only for data types which implements properly the equals method. + */ + private def fastEval(arr1: ArrayData, arr2: ArrayData): Any = { +var hasNull = false +val (bigger, smaller) = if (arr1.numElements() > arr2.numElements()) { + (arr1, arr2) +} else { + (arr2, arr1) +} +if (smaller.numElements() > 0) { + val smallestSet = new mutable.HashSet[Any] + smaller.foreach(elementType, (_, v) => +if (v == null) { + hasNull = true +} else { + smallestSet += v +}) + bigger.foreach(elementType, (_, v1) => +if (v1 == null) { + hasNull = true +} else if (smallestSet.contains(v1)) { + return true +} + ) +} +if (hasNull) { + null +} else { + false +} + } + + /** + * A slower evaluation which performs a nested loop and supports all the data types. 
+ */ + private def bruteForceEval(arr1: ArrayData, arr2: ArrayData): Any = { +var hasNull = false +if (arr1.numElements() > 0) { + arr1.foreach(elementType, (_, v1) => +if (v1 == null) { + hasNull = true +} else { + arr2.foreach(elementType, (_, v2) => +if (v1 == null) { + hasNull = true +} else if (ordering.equiv(v1, v2)) { + return true +} + ) +}) +} +if (hasNull) { + null +} else { + false +} + } + + override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { +nullSafeCodeGen(ctx, ev, (a1, a2) => { + val smaller = ctx.freshName("smallerArray") + val bigger = ctx.freshName("biggerArray") + val comparisonCode = if (elementTypeSupportEquals) { +fastCodegen(ctx, ev, smaller, bigger) + } else { +bruteForceCodegen(ctx, ev, smaller, bigger) + } + s""" + |ArrayData $smaller; + |ArrayData $bigger; + |if ($a1.numElements() > $a2.numElements()) { + |
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r188143390 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -529,6 +567,239 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least a non-null element present also in a2. If the arrays have no common element and they are both non-empty and either of them contains a null element null is returned, false otherwise.", + examples = """ +Examples: + > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); + true + """, since = "2.4.0") +// scalastyle:off line.size.limit +case class ArraysOverlap(left: Expression, right: Expression) + extends BinaryArrayExpressionWithImplicitCast { + + override def checkInputDataTypes(): TypeCheckResult = super.checkInputDataTypes() match { +case TypeCheckResult.TypeCheckSuccess => + if (RowOrdering.isOrderable(elementType)) { +TypeCheckResult.TypeCheckSuccess + } else { +TypeCheckResult.TypeCheckFailure(s"${elementType.simpleString} cannot be used in comparison.") + } +case failure => failure + } + + @transient private lazy val ordering: Ordering[Any] = +TypeUtils.getInterpretedOrdering(elementType) + + @transient private lazy val elementTypeSupportEquals = elementType match { +case BinaryType => false +case _: AtomicType => true +case _ => false + } + + @transient private lazy val doEvaluation = if (elementTypeSupportEquals) { +fastEval _ + } else { +bruteForceEval _ + } + + override def dataType: DataType = BooleanType + + override def nullable: Boolean = { +left.nullable || right.nullable || left.dataType.asInstanceOf[ArrayType].containsNull || + right.dataType.asInstanceOf[ArrayType].containsNull + } + + override def nullSafeEval(a1: Any, a2: Any): Any = { +doEvaluation(a1.asInstanceOf[ArrayData], a2.asInstanceOf[ArrayData]) + } + + /** + * A fast implementation which puts all the elements from the smaller array in a set + * and then performs a lookup on it for each element of the bigger one. + * This eval mode works only for data types which implements properly the equals method. + */ + private def fastEval(arr1: ArrayData, arr2: ArrayData): Any = { +var hasNull = false +val (bigger, smaller) = if (arr1.numElements() > arr2.numElements()) { + (arr1, arr2) +} else { + (arr2, arr1) +} +if (smaller.numElements() > 0) { + val smallestSet = new mutable.HashSet[Any] + smaller.foreach(elementType, (_, v) => +if (v == null) { + hasNull = true +} else { + smallestSet += v +}) + bigger.foreach(elementType, (_, v1) => +if (v1 == null) { + hasNull = true +} else if (smallestSet.contains(v1)) { + return true +} + ) +} +if (hasNull) { + null +} else { + false +} + } + + /** + * A slower evaluation which performs a nested loop and supports all the data types. 
+ */ + private def bruteForceEval(arr1: ArrayData, arr2: ArrayData): Any = { +var hasNull = false +if (arr1.numElements() > 0) { + arr1.foreach(elementType, (_, v1) => +if (v1 == null) { + hasNull = true +} else { + arr2.foreach(elementType, (_, v2) => +if (v1 == null) { + hasNull = true +} else if (ordering.equiv(v1, v2)) { + return true +} + ) +}) +} +if (hasNull) { + null +} else { + false +} + } + + override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { +nullSafeCodeGen(ctx, ev, (a1, a2) => { + val smaller = ctx.freshName("smallerArray") + val bigger = ctx.freshName("biggerArray") + val comparisonCode = if (elementTypeSupportEquals) { +fastCodegen(ctx, ev, smaller, bigger) + } else { +bruteForceCodegen(ctx, ev, smaller, bigger) + } + s""" + |ArrayData $smaller; + |ArrayData $bigger; + |if ($a1.numElements() > $a2.numElements()) { +
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user mgaido91 commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r187994904 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -529,6 +567,239 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least a non-null element present also in a2. If the arrays have no common element and they are both non-empty and either of them contains a null element null is returned, false otherwise.", + examples = """ +Examples: + > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); + true + """, since = "2.4.0") +// scalastyle:off line.size.limit +case class ArraysOverlap(left: Expression, right: Expression) + extends BinaryArrayExpressionWithImplicitCast { + + override def checkInputDataTypes(): TypeCheckResult = super.checkInputDataTypes() match { +case TypeCheckResult.TypeCheckSuccess => + if (RowOrdering.isOrderable(elementType)) { +TypeCheckResult.TypeCheckSuccess + } else { +TypeCheckResult.TypeCheckFailure(s"${elementType.simpleString} cannot be used in comparison.") + } +case failure => failure + } + + @transient private lazy val ordering: Ordering[Any] = +TypeUtils.getInterpretedOrdering(elementType) + + @transient private lazy val elementTypeSupportEquals = elementType match { +case BinaryType => false +case _: AtomicType => true +case _ => false + } + + @transient private lazy val doEvaluation = if (elementTypeSupportEquals) { +fastEval _ + } else { +bruteForceEval _ + } + + override def dataType: DataType = BooleanType + + override def nullable: Boolean = { +left.nullable || right.nullable || left.dataType.asInstanceOf[ArrayType].containsNull || + right.dataType.asInstanceOf[ArrayType].containsNull + } + + override def nullSafeEval(a1: Any, a2: Any): Any = { +doEvaluation(a1.asInstanceOf[ArrayData], a2.asInstanceOf[ArrayData]) + } + + /** + * A fast implementation which puts all the elements from the smaller array in a set + * and then performs a lookup on it for each element of the bigger one. + * This eval mode works only for data types which implements properly the equals method. + */ + private def fastEval(arr1: ArrayData, arr2: ArrayData): Any = { +var hasNull = false +val (bigger, smaller) = if (arr1.numElements() > arr2.numElements()) { + (arr1, arr2) +} else { + (arr2, arr1) +} +if (smaller.numElements() > 0) { + val smallestSet = new mutable.HashSet[Any] + smaller.foreach(elementType, (_, v) => +if (v == null) { + hasNull = true +} else { + smallestSet += v +}) + bigger.foreach(elementType, (_, v1) => +if (v1 == null) { + hasNull = true +} else if (smallestSet.contains(v1)) { + return true +} + ) +} +if (hasNull) { + null +} else { + false +} + } + + /** + * A slower evaluation which performs a nested loop and supports all the data types. 
+ */ + private def bruteForceEval(arr1: ArrayData, arr2: ArrayData): Any = { +var hasNull = false +if (arr1.numElements() > 0) { + arr1.foreach(elementType, (_, v1) => +if (v1 == null) { + hasNull = true +} else { + arr2.foreach(elementType, (_, v2) => +if (v1 == null) { + hasNull = true +} else if (ordering.equiv(v1, v2)) { + return true +} + ) +}) +} +if (hasNull) { + null +} else { + false +} + } + + override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { +nullSafeCodeGen(ctx, ev, (a1, a2) => { + val smaller = ctx.freshName("smallerArray") + val bigger = ctx.freshName("biggerArray") + val comparisonCode = if (elementTypeSupportEquals) { +fastCodegen(ctx, ev, smaller, bigger) + } else { +bruteForceCodegen(ctx, ev, smaller, bigger) + } + s""" + |ArrayData $smaller; + |ArrayData $bigger; + |if ($a1.numElements() > $a2.numElements()) { + |
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r187993318 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -529,6 +567,239 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least a non-null element present also in a2. If the arrays have no common element and they are both non-empty and either of them contains a null element null is returned, false otherwise.", + examples = """ +Examples: + > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); + true + """, since = "2.4.0") +// scalastyle:off line.size.limit +case class ArraysOverlap(left: Expression, right: Expression) + extends BinaryArrayExpressionWithImplicitCast { + + override def checkInputDataTypes(): TypeCheckResult = super.checkInputDataTypes() match { +case TypeCheckResult.TypeCheckSuccess => + if (RowOrdering.isOrderable(elementType)) { +TypeCheckResult.TypeCheckSuccess + } else { +TypeCheckResult.TypeCheckFailure(s"${elementType.simpleString} cannot be used in comparison.") + } +case failure => failure + } + + @transient private lazy val ordering: Ordering[Any] = +TypeUtils.getInterpretedOrdering(elementType) + + @transient private lazy val elementTypeSupportEquals = elementType match { +case BinaryType => false +case _: AtomicType => true +case _ => false + } + + @transient private lazy val doEvaluation = if (elementTypeSupportEquals) { +fastEval _ + } else { +bruteForceEval _ + } + + override def dataType: DataType = BooleanType + + override def nullable: Boolean = { +left.nullable || right.nullable || left.dataType.asInstanceOf[ArrayType].containsNull || + right.dataType.asInstanceOf[ArrayType].containsNull + } + + override def nullSafeEval(a1: Any, a2: Any): Any = { +doEvaluation(a1.asInstanceOf[ArrayData], a2.asInstanceOf[ArrayData]) + } + + /** + * A fast implementation which puts all the elements from the smaller array in a set + * and then performs a lookup on it for each element of the bigger one. + * This eval mode works only for data types which implements properly the equals method. + */ + private def fastEval(arr1: ArrayData, arr2: ArrayData): Any = { +var hasNull = false +val (bigger, smaller) = if (arr1.numElements() > arr2.numElements()) { + (arr1, arr2) +} else { + (arr2, arr1) +} +if (smaller.numElements() > 0) { + val smallestSet = new mutable.HashSet[Any] + smaller.foreach(elementType, (_, v) => +if (v == null) { + hasNull = true +} else { + smallestSet += v +}) + bigger.foreach(elementType, (_, v1) => +if (v1 == null) { + hasNull = true +} else if (smallestSet.contains(v1)) { + return true +} + ) +} +if (hasNull) { + null +} else { + false +} + } + + /** + * A slower evaluation which performs a nested loop and supports all the data types. 
+ */ + private def bruteForceEval(arr1: ArrayData, arr2: ArrayData): Any = { +var hasNull = false +if (arr1.numElements() > 0) { + arr1.foreach(elementType, (_, v1) => +if (v1 == null) { + hasNull = true +} else { + arr2.foreach(elementType, (_, v2) => +if (v1 == null) { + hasNull = true +} else if (ordering.equiv(v1, v2)) { + return true +} + ) +}) +} +if (hasNull) { + null +} else { + false +} + } + + override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { +nullSafeCodeGen(ctx, ev, (a1, a2) => { + val smaller = ctx.freshName("smallerArray") + val bigger = ctx.freshName("biggerArray") + val comparisonCode = if (elementTypeSupportEquals) { +fastCodegen(ctx, ev, smaller, bigger) + } else { +bruteForceCodegen(ctx, ev, smaller, bigger) + } + s""" + |ArrayData $smaller; + |ArrayData $bigger; + |if ($a1.numElements() > $a2.numElements()) { +
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r187990948 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -529,6 +567,239 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least a non-null element present also in a2. If the arrays have no common element and they are both non-empty and either of them contains a null element null is returned, false otherwise.", + examples = """ +Examples: + > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); + true + """, since = "2.4.0") +// scalastyle:off line.size.limit +case class ArraysOverlap(left: Expression, right: Expression) + extends BinaryArrayExpressionWithImplicitCast { + + override def checkInputDataTypes(): TypeCheckResult = super.checkInputDataTypes() match { +case TypeCheckResult.TypeCheckSuccess => + if (RowOrdering.isOrderable(elementType)) { +TypeCheckResult.TypeCheckSuccess + } else { +TypeCheckResult.TypeCheckFailure(s"${elementType.simpleString} cannot be used in comparison.") + } +case failure => failure + } + + @transient private lazy val ordering: Ordering[Any] = +TypeUtils.getInterpretedOrdering(elementType) + + @transient private lazy val elementTypeSupportEquals = elementType match { +case BinaryType => false +case _: AtomicType => true +case _ => false + } + + @transient private lazy val doEvaluation = if (elementTypeSupportEquals) { +fastEval _ + } else { +bruteForceEval _ + } + + override def dataType: DataType = BooleanType + + override def nullable: Boolean = { +left.nullable || right.nullable || left.dataType.asInstanceOf[ArrayType].containsNull || + right.dataType.asInstanceOf[ArrayType].containsNull + } + + override def nullSafeEval(a1: Any, a2: Any): Any = { +doEvaluation(a1.asInstanceOf[ArrayData], a2.asInstanceOf[ArrayData]) + } + + /** + * A fast implementation which puts all the elements from the smaller array in a set + * and then performs a lookup on it for each element of the bigger one. + * This eval mode works only for data types which implements properly the equals method. + */ + private def fastEval(arr1: ArrayData, arr2: ArrayData): Any = { +var hasNull = false +val (bigger, smaller) = if (arr1.numElements() > arr2.numElements()) { + (arr1, arr2) +} else { + (arr2, arr1) +} +if (smaller.numElements() > 0) { + val smallestSet = new mutable.HashSet[Any] + smaller.foreach(elementType, (_, v) => +if (v == null) { + hasNull = true +} else { + smallestSet += v +}) + bigger.foreach(elementType, (_, v1) => +if (v1 == null) { + hasNull = true +} else if (smallestSet.contains(v1)) { + return true +} + ) +} +if (hasNull) { + null +} else { + false +} + } + + /** + * A slower evaluation which performs a nested loop and supports all the data types. 
+ */ + private def bruteForceEval(arr1: ArrayData, arr2: ArrayData): Any = { +var hasNull = false +if (arr1.numElements() > 0) { + arr1.foreach(elementType, (_, v1) => +if (v1 == null) { + hasNull = true +} else { + arr2.foreach(elementType, (_, v2) => +if (v1 == null) { + hasNull = true +} else if (ordering.equiv(v1, v2)) { + return true +} + ) +}) +} +if (hasNull) { + null +} else { + false +} + } + + override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { +nullSafeCodeGen(ctx, ev, (a1, a2) => { + val smaller = ctx.freshName("smallerArray") + val bigger = ctx.freshName("biggerArray") + val comparisonCode = if (elementTypeSupportEquals) { +fastCodegen(ctx, ev, smaller, bigger) + } else { +bruteForceCodegen(ctx, ev, smaller, bigger) + } + s""" + |ArrayData $smaller; + |ArrayData $bigger; + |if ($a1.numElements() > $a2.numElements()) { +
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r187988455 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -529,6 +567,239 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least a non-null element present also in a2. If the arrays have no common element and they are both non-empty and either of them contains a null element null is returned, false otherwise.", + examples = """ +Examples: + > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); + true + """, since = "2.4.0") +// scalastyle:off line.size.limit +case class ArraysOverlap(left: Expression, right: Expression) + extends BinaryArrayExpressionWithImplicitCast { + + override def checkInputDataTypes(): TypeCheckResult = super.checkInputDataTypes() match { +case TypeCheckResult.TypeCheckSuccess => + if (RowOrdering.isOrderable(elementType)) { +TypeCheckResult.TypeCheckSuccess + } else { +TypeCheckResult.TypeCheckFailure(s"${elementType.simpleString} cannot be used in comparison.") + } +case failure => failure + } + + @transient private lazy val ordering: Ordering[Any] = +TypeUtils.getInterpretedOrdering(elementType) + + @transient private lazy val elementTypeSupportEquals = elementType match { +case BinaryType => false +case _: AtomicType => true +case _ => false + } + + @transient private lazy val doEvaluation = if (elementTypeSupportEquals) { +fastEval _ + } else { +bruteForceEval _ + } + + override def dataType: DataType = BooleanType + + override def nullable: Boolean = { +left.nullable || right.nullable || left.dataType.asInstanceOf[ArrayType].containsNull || + right.dataType.asInstanceOf[ArrayType].containsNull + } + + override def nullSafeEval(a1: Any, a2: Any): Any = { +doEvaluation(a1.asInstanceOf[ArrayData], a2.asInstanceOf[ArrayData]) + } + + /** + * A fast implementation which puts all the elements from the smaller array in a set + * and then performs a lookup on it for each element of the bigger one. + * This eval mode works only for data types which implements properly the equals method. + */ + private def fastEval(arr1: ArrayData, arr2: ArrayData): Any = { +var hasNull = false +val (bigger, smaller, biggerDt) = if (arr1.numElements() > arr2.numElements()) { + (arr1, arr2, left.dataType.asInstanceOf[ArrayType]) +} else { + (arr2, arr1, right.dataType.asInstanceOf[ArrayType]) +} +if (smaller.numElements() > 0) { + val smallestSet = new mutable.HashSet[Any] + smaller.foreach(elementType, (_, v) => +if (v == null) { + hasNull = true +} else { + smallestSet += v +}) + bigger.foreach(elementType, (_, v1) => +if (v1 == null) { + hasNull = true +} else if (smallestSet.contains(v1)) { + return true +} + ) +} +if (hasNull) { + null +} else { + false +} + } + + /** + * A slower evaluation which performs a nested loop and supports all the data types. 
+ */ + private def bruteForceEval(arr1: ArrayData, arr2: ArrayData): Any = { +var hasNull = false +if (arr1.numElements() > 0) { + arr1.foreach(elementType, (_, v1) => +if (v1 == null) { + hasNull = true +} else { + arr2.foreach(elementType, (_, v2) => +if (v1 == null) { + hasNull = true +} else if (ordering.equiv(v1, v2)) { + return true +} + ) +}) +} +if (hasNull) { + null +} else { + false +} + } + + override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { +nullSafeCodeGen(ctx, ev, (a1, a2) => { + val smaller = ctx.freshName("smallerArray") + val bigger = ctx.freshName("biggerArray") + val comparisonCode = if (elementTypeSupportEquals) { +fastCodegen(ctx, ev, smaller, bigger) + } else { +bruteForceCodegen(ctx, ev, smaller, bigger) + } + s""" + |ArrayData $smaller; + |Arr
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user mgaido91 commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r187950642 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -529,6 +567,239 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least a non-null element present also in a2. If the arrays have no common element and they are both non-empty and either of them contains a null element null is returned, false otherwise.", + examples = """ +Examples: + > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); + true + """, since = "2.4.0") +// scalastyle:off line.size.limit +case class ArraysOverlap(left: Expression, right: Expression) + extends BinaryArrayExpressionWithImplicitCast { + + override def checkInputDataTypes(): TypeCheckResult = super.checkInputDataTypes() match { +case TypeCheckResult.TypeCheckSuccess => + if (RowOrdering.isOrderable(elementType)) { +TypeCheckResult.TypeCheckSuccess + } else { +TypeCheckResult.TypeCheckFailure(s"${elementType.simpleString} cannot be used in comparison.") + } +case failure => failure + } + + @transient private lazy val ordering: Ordering[Any] = +TypeUtils.getInterpretedOrdering(elementType) + + @transient private lazy val elementTypeSupportEquals = elementType match { +case BinaryType => false +case _: AtomicType => true +case _ => false + } + + @transient private lazy val doEvaluation = if (elementTypeSupportEquals) { +fastEval _ + } else { +bruteForceEval _ + } + + override def dataType: DataType = BooleanType + + override def nullable: Boolean = { +left.nullable || right.nullable || left.dataType.asInstanceOf[ArrayType].containsNull || + right.dataType.asInstanceOf[ArrayType].containsNull + } + + override def nullSafeEval(a1: Any, a2: Any): Any = { +doEvaluation(a1.asInstanceOf[ArrayData], a2.asInstanceOf[ArrayData]) + } + + /** + * A fast implementation which puts all the elements from the smaller array in a set + * and then performs a lookup on it for each element of the bigger one. + * This eval mode works only for data types which implements properly the equals method. + */ + private def fastEval(arr1: ArrayData, arr2: ArrayData): Any = { +var hasNull = false +val (bigger, smaller, biggerDt) = if (arr1.numElements() > arr2.numElements()) { + (arr1, arr2, left.dataType.asInstanceOf[ArrayType]) +} else { + (arr2, arr1, right.dataType.asInstanceOf[ArrayType]) +} +if (smaller.numElements() > 0) { + val smallestSet = new mutable.HashSet[Any] + smaller.foreach(elementType, (_, v) => +if (v == null) { + hasNull = true +} else { + smallestSet += v +}) + bigger.foreach(elementType, (_, v1) => +if (v1 == null) { + hasNull = true +} else if (smallestSet.contains(v1)) { + return true +} + ) +} +if (hasNull) { + null +} else { + false +} + } + + /** + * A slower evaluation which performs a nested loop and supports all the data types. 
+ */ + private def bruteForceEval(arr1: ArrayData, arr2: ArrayData): Any = { +var hasNull = false +if (arr1.numElements() > 0) { + arr1.foreach(elementType, (_, v1) => +if (v1 == null) { + hasNull = true +} else { + arr2.foreach(elementType, (_, v2) => +if (v1 == null) { + hasNull = true +} else if (ordering.equiv(v1, v2)) { + return true +} + ) +}) +} +if (hasNull) { + null +} else { + false +} + } + + override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { +nullSafeCodeGen(ctx, ev, (a1, a2) => { + val smaller = ctx.freshName("smallerArray") + val bigger = ctx.freshName("biggerArray") + val comparisonCode = if (elementTypeSupportEquals) { +fastCodegen(ctx, ev, smaller, bigger) + } else { +bruteForceCodegen(ctx, ev, smaller, bigger) + } + s""" + |ArrayData $smaller; + |Arra
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user mgaido91 commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r187950523 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -529,6 +567,239 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least a non-null element present also in a2. If the arrays have no common element and they are both non-empty and either of them contains a null element null is returned, false otherwise.", + examples = """ +Examples: + > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); + true + """, since = "2.4.0") +// scalastyle:off line.size.limit +case class ArraysOverlap(left: Expression, right: Expression) + extends BinaryArrayExpressionWithImplicitCast { + + override def checkInputDataTypes(): TypeCheckResult = super.checkInputDataTypes() match { +case TypeCheckResult.TypeCheckSuccess => + if (RowOrdering.isOrderable(elementType)) { +TypeCheckResult.TypeCheckSuccess + } else { +TypeCheckResult.TypeCheckFailure(s"${elementType.simpleString} cannot be used in comparison.") + } +case failure => failure + } + + @transient private lazy val ordering: Ordering[Any] = +TypeUtils.getInterpretedOrdering(elementType) + + @transient private lazy val elementTypeSupportEquals = elementType match { +case BinaryType => false +case _: AtomicType => true +case _ => false + } + + @transient private lazy val doEvaluation = if (elementTypeSupportEquals) { +fastEval _ + } else { +bruteForceEval _ + } + + override def dataType: DataType = BooleanType + + override def nullable: Boolean = { +left.nullable || right.nullable || left.dataType.asInstanceOf[ArrayType].containsNull || + right.dataType.asInstanceOf[ArrayType].containsNull + } + + override def nullSafeEval(a1: Any, a2: Any): Any = { +doEvaluation(a1.asInstanceOf[ArrayData], a2.asInstanceOf[ArrayData]) + } + + /** + * A fast implementation which puts all the elements from the smaller array in a set + * and then performs a lookup on it for each element of the bigger one. + * This eval mode works only for data types which implements properly the equals method. + */ + private def fastEval(arr1: ArrayData, arr2: ArrayData): Any = { +var hasNull = false +val (bigger, smaller, biggerDt) = if (arr1.numElements() > arr2.numElements()) { + (arr1, arr2, left.dataType.asInstanceOf[ArrayType]) +} else { + (arr2, arr1, right.dataType.asInstanceOf[ArrayType]) +} +if (smaller.numElements() > 0) { + val smallestSet = new mutable.HashSet[Any] + smaller.foreach(elementType, (_, v) => +if (v == null) { + hasNull = true +} else { + smallestSet += v +}) + bigger.foreach(elementType, (_, v1) => +if (v1 == null) { + hasNull = true +} else if (smallestSet.contains(v1)) { + return true +} + ) +} +if (hasNull) { + null +} else { + false +} + } + + /** + * A slower evaluation which performs a nested loop and supports all the data types. 
+ */ + private def bruteForceEval(arr1: ArrayData, arr2: ArrayData): Any = { +var hasNull = false +if (arr1.numElements() > 0) { + arr1.foreach(elementType, (_, v1) => +if (v1 == null) { + hasNull = true +} else { + arr2.foreach(elementType, (_, v2) => +if (v1 == null) { + hasNull = true +} else if (ordering.equiv(v1, v2)) { + return true +} + ) +}) +} +if (hasNull) { + null +} else { + false +} + } + + override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { +nullSafeCodeGen(ctx, ev, (a1, a2) => { + val smaller = ctx.freshName("smallerArray") + val bigger = ctx.freshName("biggerArray") + val comparisonCode = if (elementTypeSupportEquals) { +fastCodegen(ctx, ev, smaller, bigger) + } else { +bruteForceCodegen(ctx, ev, smaller, bigger) + } + s""" + |ArrayData $smaller; + |Arra
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user mgaido91 commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r187946945 --- Diff: sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala --- @@ -136,6 +136,59 @@ class CollectionExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper checkEvaluation(ArrayContains(a3, Literal.create(null, StringType)), null) } + test("ArraysOverlap") { +val a0 = Literal.create(Seq(1, 2, 3), ArrayType(IntegerType)) +val a1 = Literal.create(Seq(4, 5, 3), ArrayType(IntegerType)) +val a2 = Literal.create(Seq(null, 5, 6), ArrayType(IntegerType)) +val a3 = Literal.create(Seq(7, 8), ArrayType(IntegerType)) +val a4 = Literal.create(Seq.empty[Int], ArrayType(IntegerType)) + +val a5 = Literal.create(Seq[String](null, ""), ArrayType(StringType)) +val a6 = Literal.create(Seq[String]("", "abc"), ArrayType(StringType)) +val a7 = Literal.create(Seq[String]("def", "ghi"), ArrayType(StringType)) + +checkEvaluation(ArraysOverlap(a0, a1), true) +checkEvaluation(ArraysOverlap(a0, a2), null) +checkEvaluation(ArraysOverlap(a1, a2), true) +checkEvaluation(ArraysOverlap(a1, a3), false) +checkEvaluation(ArraysOverlap(a0, a4), false) +checkEvaluation(ArraysOverlap(a2, a4), null) +checkEvaluation(ArraysOverlap(a4, a2), null) + +checkEvaluation(ArraysOverlap(a5, a6), true) +checkEvaluation(ArraysOverlap(a5, a7), null) +checkEvaluation(ArraysOverlap(a6, a7), false) + +// null handling +checkEvaluation(ArraysOverlap(Literal.create(null, ArrayType(IntegerType)), a0), null) +checkEvaluation(ArraysOverlap(a0, Literal.create(null, ArrayType(IntegerType))), null) +checkEvaluation(ArraysOverlap( + Literal.create(Seq(null), ArrayType(IntegerType)), + Literal.create(Seq(null), ArrayType(IntegerType))), null) --- End diff -- This case is covered by https://github.com/apache/spark/pull/21028/files#diff-d31eca9f1c4c33104dc2cb8950486910R163 for instance. Anyway, I am adding another on which is exactly this one. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
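As a rough end-to-end illustration of the behaviours these tests exercise, the following usage sketch assumes a Spark build that already includes this PR and an active `SparkSession` bound to `spark` (for example in `spark-shell`); expected results are shown as comments.

```scala
// Hedged usage sketch of the documented true / null / false outcomes.
spark.sql("SELECT arrays_overlap(array(1, 2, 3), array(3, 4, 5))").show()  // true
spark.sql("SELECT arrays_overlap(array(1, 2), array(4, 5, null))").show()  // null: no match, a null is present
spark.sql("SELECT arrays_overlap(array(1, 2), array(4, 5))").show()        // false
```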
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r187935251 --- Diff: sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala --- @@ -136,6 +136,59 @@ class CollectionExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper checkEvaluation(ArrayContains(a3, Literal.create(null, StringType)), null) } + test("ArraysOverlap") { +val a0 = Literal.create(Seq(1, 2, 3), ArrayType(IntegerType)) +val a1 = Literal.create(Seq(4, 5, 3), ArrayType(IntegerType)) +val a2 = Literal.create(Seq(null, 5, 6), ArrayType(IntegerType)) +val a3 = Literal.create(Seq(7, 8), ArrayType(IntegerType)) +val a4 = Literal.create(Seq.empty[Int], ArrayType(IntegerType)) + +val a5 = Literal.create(Seq[String](null, ""), ArrayType(StringType)) +val a6 = Literal.create(Seq[String]("", "abc"), ArrayType(StringType)) +val a7 = Literal.create(Seq[String]("def", "ghi"), ArrayType(StringType)) + +checkEvaluation(ArraysOverlap(a0, a1), true) +checkEvaluation(ArraysOverlap(a0, a2), null) +checkEvaluation(ArraysOverlap(a1, a2), true) +checkEvaluation(ArraysOverlap(a1, a3), false) +checkEvaluation(ArraysOverlap(a0, a4), false) +checkEvaluation(ArraysOverlap(a2, a4), null) +checkEvaluation(ArraysOverlap(a4, a2), null) + +checkEvaluation(ArraysOverlap(a5, a6), true) +checkEvaluation(ArraysOverlap(a5, a7), null) +checkEvaluation(ArraysOverlap(a6, a7), false) + +// null handling +checkEvaluation(ArraysOverlap(Literal.create(null, ArrayType(IntegerType)), a0), null) +checkEvaluation(ArraysOverlap(a0, Literal.create(null, ArrayType(IntegerType))), null) +checkEvaluation(ArraysOverlap( + Literal.create(Seq(null), ArrayType(IntegerType)), + Literal.create(Seq(null), ArrayType(IntegerType))), null) --- End diff -- do we have a test case for it? --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r187934895 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -529,6 +567,239 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least a non-null element present also in a2. If the arrays have no common element and they are both non-empty and either of them contains a null element null is returned, false otherwise.", + examples = """ +Examples: + > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); + true + """, since = "2.4.0") +// scalastyle:off line.size.limit +case class ArraysOverlap(left: Expression, right: Expression) + extends BinaryArrayExpressionWithImplicitCast { + + override def checkInputDataTypes(): TypeCheckResult = super.checkInputDataTypes() match { +case TypeCheckResult.TypeCheckSuccess => + if (RowOrdering.isOrderable(elementType)) { +TypeCheckResult.TypeCheckSuccess + } else { +TypeCheckResult.TypeCheckFailure(s"${elementType.simpleString} cannot be used in comparison.") + } +case failure => failure + } + + @transient private lazy val ordering: Ordering[Any] = +TypeUtils.getInterpretedOrdering(elementType) + + @transient private lazy val elementTypeSupportEquals = elementType match { +case BinaryType => false +case _: AtomicType => true +case _ => false + } + + @transient private lazy val doEvaluation = if (elementTypeSupportEquals) { +fastEval _ + } else { +bruteForceEval _ + } + + override def dataType: DataType = BooleanType + + override def nullable: Boolean = { +left.nullable || right.nullable || left.dataType.asInstanceOf[ArrayType].containsNull || + right.dataType.asInstanceOf[ArrayType].containsNull + } + + override def nullSafeEval(a1: Any, a2: Any): Any = { +doEvaluation(a1.asInstanceOf[ArrayData], a2.asInstanceOf[ArrayData]) + } + + /** + * A fast implementation which puts all the elements from the smaller array in a set + * and then performs a lookup on it for each element of the bigger one. + * This eval mode works only for data types which implements properly the equals method. + */ + private def fastEval(arr1: ArrayData, arr2: ArrayData): Any = { +var hasNull = false +val (bigger, smaller, biggerDt) = if (arr1.numElements() > arr2.numElements()) { + (arr1, arr2, left.dataType.asInstanceOf[ArrayType]) +} else { + (arr2, arr1, right.dataType.asInstanceOf[ArrayType]) +} +if (smaller.numElements() > 0) { + val smallestSet = new mutable.HashSet[Any] + smaller.foreach(elementType, (_, v) => +if (v == null) { + hasNull = true +} else { + smallestSet += v +}) + bigger.foreach(elementType, (_, v1) => +if (v1 == null) { + hasNull = true +} else if (smallestSet.contains(v1)) { + return true +} + ) +} +if (hasNull) { + null +} else { + false +} + } + + /** + * A slower evaluation which performs a nested loop and supports all the data types. 
+ */ + private def bruteForceEval(arr1: ArrayData, arr2: ArrayData): Any = { +var hasNull = false +if (arr1.numElements() > 0) { + arr1.foreach(elementType, (_, v1) => +if (v1 == null) { + hasNull = true +} else { + arr2.foreach(elementType, (_, v2) => +if (v1 == null) { + hasNull = true +} else if (ordering.equiv(v1, v2)) { + return true +} + ) +}) +} +if (hasNull) { + null +} else { + false +} + } + + override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { +nullSafeCodeGen(ctx, ev, (a1, a2) => { + val smaller = ctx.freshName("smallerArray") + val bigger = ctx.freshName("biggerArray") + val comparisonCode = if (elementTypeSupportEquals) { +fastCodegen(ctx, ev, smaller, bigger) + } else { +bruteForceCodegen(ctx, ev, smaller, bigger) + } + s""" + |ArrayData $smaller; + |Arr
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r187934593 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -529,6 +567,239 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least a non-null element present also in a2. If the arrays have no common element and they are both non-empty and either of them contains a null element null is returned, false otherwise.", + examples = """ +Examples: + > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); + true + """, since = "2.4.0") +// scalastyle:off line.size.limit +case class ArraysOverlap(left: Expression, right: Expression) + extends BinaryArrayExpressionWithImplicitCast { + + override def checkInputDataTypes(): TypeCheckResult = super.checkInputDataTypes() match { +case TypeCheckResult.TypeCheckSuccess => + if (RowOrdering.isOrderable(elementType)) { +TypeCheckResult.TypeCheckSuccess + } else { +TypeCheckResult.TypeCheckFailure(s"${elementType.simpleString} cannot be used in comparison.") + } +case failure => failure + } + + @transient private lazy val ordering: Ordering[Any] = +TypeUtils.getInterpretedOrdering(elementType) + + @transient private lazy val elementTypeSupportEquals = elementType match { +case BinaryType => false +case _: AtomicType => true +case _ => false + } + + @transient private lazy val doEvaluation = if (elementTypeSupportEquals) { +fastEval _ + } else { +bruteForceEval _ + } + + override def dataType: DataType = BooleanType + + override def nullable: Boolean = { +left.nullable || right.nullable || left.dataType.asInstanceOf[ArrayType].containsNull || + right.dataType.asInstanceOf[ArrayType].containsNull + } + + override def nullSafeEval(a1: Any, a2: Any): Any = { +doEvaluation(a1.asInstanceOf[ArrayData], a2.asInstanceOf[ArrayData]) + } + + /** + * A fast implementation which puts all the elements from the smaller array in a set + * and then performs a lookup on it for each element of the bigger one. + * This eval mode works only for data types which implements properly the equals method. + */ + private def fastEval(arr1: ArrayData, arr2: ArrayData): Any = { +var hasNull = false +val (bigger, smaller, biggerDt) = if (arr1.numElements() > arr2.numElements()) { + (arr1, arr2, left.dataType.asInstanceOf[ArrayType]) +} else { + (arr2, arr1, right.dataType.asInstanceOf[ArrayType]) +} +if (smaller.numElements() > 0) { + val smallestSet = new mutable.HashSet[Any] + smaller.foreach(elementType, (_, v) => +if (v == null) { + hasNull = true +} else { + smallestSet += v +}) + bigger.foreach(elementType, (_, v1) => +if (v1 == null) { + hasNull = true +} else if (smallestSet.contains(v1)) { + return true +} + ) +} +if (hasNull) { + null +} else { + false +} + } + + /** + * A slower evaluation which performs a nested loop and supports all the data types. 
+ */ + private def bruteForceEval(arr1: ArrayData, arr2: ArrayData): Any = { +var hasNull = false +if (arr1.numElements() > 0) { + arr1.foreach(elementType, (_, v1) => +if (v1 == null) { + hasNull = true +} else { + arr2.foreach(elementType, (_, v2) => +if (v1 == null) { + hasNull = true +} else if (ordering.equiv(v1, v2)) { + return true +} + ) +}) +} +if (hasNull) { + null +} else { + false +} + } + + override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { +nullSafeCodeGen(ctx, ev, (a1, a2) => { + val smaller = ctx.freshName("smallerArray") + val bigger = ctx.freshName("biggerArray") + val comparisonCode = if (elementTypeSupportEquals) { +fastCodegen(ctx, ev, smaller, bigger) + } else { +bruteForceCodegen(ctx, ev, smaller, bigger) + } + s""" + |ArrayData $smaller; + |Arr
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r187932792 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -529,6 +567,239 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least a non-null element present also in a2. If the arrays have no common element and they are both non-empty and either of them contains a null element null is returned, false otherwise.", + examples = """ +Examples: + > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); + true + """, since = "2.4.0") +// scalastyle:off line.size.limit +case class ArraysOverlap(left: Expression, right: Expression) + extends BinaryArrayExpressionWithImplicitCast { + + override def checkInputDataTypes(): TypeCheckResult = super.checkInputDataTypes() match { +case TypeCheckResult.TypeCheckSuccess => + if (RowOrdering.isOrderable(elementType)) { +TypeCheckResult.TypeCheckSuccess + } else { +TypeCheckResult.TypeCheckFailure(s"${elementType.simpleString} cannot be used in comparison.") + } +case failure => failure + } + + @transient private lazy val ordering: Ordering[Any] = +TypeUtils.getInterpretedOrdering(elementType) + + @transient private lazy val elementTypeSupportEquals = elementType match { +case BinaryType => false +case _: AtomicType => true +case _ => false + } + + @transient private lazy val doEvaluation = if (elementTypeSupportEquals) { +fastEval _ + } else { +bruteForceEval _ + } + + override def dataType: DataType = BooleanType + + override def nullable: Boolean = { +left.nullable || right.nullable || left.dataType.asInstanceOf[ArrayType].containsNull || + right.dataType.asInstanceOf[ArrayType].containsNull + } + + override def nullSafeEval(a1: Any, a2: Any): Any = { +doEvaluation(a1.asInstanceOf[ArrayData], a2.asInstanceOf[ArrayData]) + } + + /** + * A fast implementation which puts all the elements from the smaller array in a set + * and then performs a lookup on it for each element of the bigger one. + * This eval mode works only for data types which implements properly the equals method. + */ + private def fastEval(arr1: ArrayData, arr2: ArrayData): Any = { +var hasNull = false +val (bigger, smaller, biggerDt) = if (arr1.numElements() > arr2.numElements()) { + (arr1, arr2, left.dataType.asInstanceOf[ArrayType]) +} else { + (arr2, arr1, right.dataType.asInstanceOf[ArrayType]) +} +if (smaller.numElements() > 0) { + val smallestSet = new mutable.HashSet[Any] + smaller.foreach(elementType, (_, v) => +if (v == null) { + hasNull = true +} else { + smallestSet += v +}) + bigger.foreach(elementType, (_, v1) => +if (v1 == null) { + hasNull = true +} else if (smallestSet.contains(v1)) { + return true +} + ) +} +if (hasNull) { + null +} else { + false +} + } + + /** + * A slower evaluation which performs a nested loop and supports all the data types. 
+ */ + private def bruteForceEval(arr1: ArrayData, arr2: ArrayData): Any = { +var hasNull = false +if (arr1.numElements() > 0) { + arr1.foreach(elementType, (_, v1) => +if (v1 == null) { + hasNull = true +} else { + arr2.foreach(elementType, (_, v2) => +if (v1 == null) { + hasNull = true +} else if (ordering.equiv(v1, v2)) { + return true +} + ) +}) +} +if (hasNull) { + null +} else { + false +} + } + + override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { +nullSafeCodeGen(ctx, ev, (a1, a2) => { + val smaller = ctx.freshName("smallerArray") + val bigger = ctx.freshName("biggerArray") + val comparisonCode = if (elementTypeSupportEquals) { +fastCodegen(ctx, ev, smaller, bigger) + } else { +bruteForceCodegen(ctx, ev, smaller, bigger) + } + s""" + |ArrayData $smaller; + |Arr
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r187931865 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -529,6 +567,239 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least a non-null element present also in a2. If the arrays have no common element and they are both non-empty and either of them contains a null element null is returned, false otherwise.", + examples = """ +Examples: + > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); + true + """, since = "2.4.0") +// scalastyle:off line.size.limit +case class ArraysOverlap(left: Expression, right: Expression) + extends BinaryArrayExpressionWithImplicitCast { + + override def checkInputDataTypes(): TypeCheckResult = super.checkInputDataTypes() match { +case TypeCheckResult.TypeCheckSuccess => + if (RowOrdering.isOrderable(elementType)) { +TypeCheckResult.TypeCheckSuccess + } else { +TypeCheckResult.TypeCheckFailure(s"${elementType.simpleString} cannot be used in comparison.") + } +case failure => failure + } + + @transient private lazy val ordering: Ordering[Any] = +TypeUtils.getInterpretedOrdering(elementType) + + @transient private lazy val elementTypeSupportEquals = elementType match { +case BinaryType => false +case _: AtomicType => true +case _ => false + } + + @transient private lazy val doEvaluation = if (elementTypeSupportEquals) { +fastEval _ + } else { +bruteForceEval _ + } + + override def dataType: DataType = BooleanType + + override def nullable: Boolean = { +left.nullable || right.nullable || left.dataType.asInstanceOf[ArrayType].containsNull || + right.dataType.asInstanceOf[ArrayType].containsNull + } + + override def nullSafeEval(a1: Any, a2: Any): Any = { +doEvaluation(a1.asInstanceOf[ArrayData], a2.asInstanceOf[ArrayData]) + } + + /** + * A fast implementation which puts all the elements from the smaller array in a set + * and then performs a lookup on it for each element of the bigger one. + * This eval mode works only for data types which implements properly the equals method. + */ + private def fastEval(arr1: ArrayData, arr2: ArrayData): Any = { +var hasNull = false +val (bigger, smaller, biggerDt) = if (arr1.numElements() > arr2.numElements()) { --- End diff -- the `biggerDt` is not used --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user mgaido91 commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r187606811 --- Diff: sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala --- @@ -136,6 +136,59 @@ class CollectionExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper checkEvaluation(ArrayContains(a3, Literal.create(null, StringType)), null) } + test("ArraysOverlap") { +val a0 = Literal.create(Seq(1, 2, 3), ArrayType(IntegerType)) +val a1 = Literal.create(Seq(4, 5, 3), ArrayType(IntegerType)) +val a2 = Literal.create(Seq(null, 5, 6), ArrayType(IntegerType)) +val a3 = Literal.create(Seq(7, 8), ArrayType(IntegerType)) +val a4 = Literal.create(Seq.empty[Int], ArrayType(IntegerType)) + +val a5 = Literal.create(Seq[String](null, ""), ArrayType(StringType)) +val a6 = Literal.create(Seq[String]("", "abc"), ArrayType(StringType)) +val a7 = Literal.create(Seq[String]("def", "ghi"), ArrayType(StringType)) + +checkEvaluation(ArraysOverlap(a0, a1), true) +checkEvaluation(ArraysOverlap(a0, a2), null) +checkEvaluation(ArraysOverlap(a1, a2), true) +checkEvaluation(ArraysOverlap(a1, a3), false) +checkEvaluation(ArraysOverlap(a0, a4), false) +checkEvaluation(ArraysOverlap(a2, a4), null) +checkEvaluation(ArraysOverlap(a4, a2), null) + +checkEvaluation(ArraysOverlap(a5, a6), true) +checkEvaluation(ArraysOverlap(a5, a7), null) +checkEvaluation(ArraysOverlap(a6, a7), false) + +// null handling +checkEvaluation(ArraysOverlap(Literal.create(null, ArrayType(IntegerType)), a0), null) +checkEvaluation(ArraysOverlap(a0, Literal.create(null, ArrayType(IntegerType))), null) +checkEvaluation(ArraysOverlap( + Literal.create(Seq(null), ArrayType(IntegerType)), + Literal.create(Seq(null), ArrayType(IntegerType))), null) --- End diff -- I am returning `null` for it. This is interesting. I checked Presto's implementation and it returns `false` if any of the input arrays is empty. I am copying Presto's behavior but this is quite against what the docs say: > Returns null if there are no non-null elements in common but either array contains null. I will add a sentence to clarify the behavior in our docs. Thanks for this nice catch! --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
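For readers following the null-handling discussion, here is a small standalone sketch (plain Scala, invented helper name, not Spark code) of the behavior the comment above settles on, as described by the updated usage string quoted in the later revisions: true when the arrays share a non-null element, SQL NULL only when there is no match, both arrays are non-empty, and either side contains a null, and false otherwise, including the Presto-style empty-array case.

```scala
// Option[Boolean] stands in for SQL's three-valued result; inputs are assumed non-null.
def arraysOverlapSemantics[T](a1: Seq[Option[T]], a2: Seq[Option[T]]): Option[Boolean] = {
  val common = a1.flatten.toSet intersect a2.flatten.toSet
  if (common.nonEmpty) Some(true)
  else if (a1.nonEmpty && a2.nonEmpty && (a1.contains(None) || a2.contains(None))) None
  else Some(false)
}

arraysOverlapSemantics(Seq(Some(1), Some(2), Some(3)), Seq(Some(3), Some(4)))  // Some(true)
arraysOverlapSemantics(Seq(Some(1)), Seq(None, Some(5)))                       // None, i.e. SQL NULL
arraysOverlapSemantics(Seq.empty[Option[Int]], Seq[Option[Int]](None))         // Some(false), Presto-style
```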
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user mgaido91 commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r187603861 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -529,6 +564,157 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least an element present also in a2. If the arrays have no common element and either of them contains a null element null is returned, false otherwise.", + examples = """ +Examples: + > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); + true + """, since = "2.4.0") +// scalastyle:off line.size.limit +case class ArraysOverlap(left: Expression, right: Expression) + extends BinaryArrayExpressionWithImplicitCast { + + override def dataType: DataType = BooleanType + + override def nullable: Boolean = { +left.nullable || right.nullable || left.dataType.asInstanceOf[ArrayType].containsNull || + right.dataType.asInstanceOf[ArrayType].containsNull + } + + override def nullSafeEval(a1: Any, a2: Any): Any = { +var hasNull = false +val arr1 = a1.asInstanceOf[ArrayData] +val arr2 = a2.asInstanceOf[ArrayData] +val (bigger, smaller, biggerDt) = if (arr1.numElements() > arr2.numElements()) { + (arr1, arr2, left.dataType.asInstanceOf[ArrayType]) +} else { + (arr2, arr1, right.dataType.asInstanceOf[ArrayType]) +} +if (smaller.numElements() > 0) { + val smallestSet = new mutable.HashSet[Any] + smaller.foreach(elementType, (_, v) => +if (v == null) { + hasNull = true +} else { + smallestSet += v +}) + bigger.foreach(elementType, (_, v1) => +if (v1 == null) { + hasNull = true +} else if (smallestSet.contains(v1)) { --- End diff -- actually it was not working also with `ArrayType`, so I addressed the problem in a more general way which supports both these cases. Thanks. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user kiszk commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r187294411 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -529,6 +564,272 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least an element present also in a2. If the arrays have no common element and either of them contains a null element null is returned, false otherwise.", + examples = """ +Examples: + > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); + true + """, since = "2.4.0") +// scalastyle:off line.size.limit +case class ArraysOverlap(left: Expression, right: Expression) + extends BinaryArrayExpressionWithImplicitCast { + + override def checkInputDataTypes(): TypeCheckResult = super.checkInputDataTypes() match { +case TypeCheckResult.TypeCheckSuccess => + if (RowOrdering.isOrderable(elementType)) { +TypeCheckResult.TypeCheckSuccess + } else { +TypeCheckResult.TypeCheckFailure(s"${elementType.simpleString} cannot be used in comparison.") + } +case failure => failure + } + + @transient private lazy val ordering: Ordering[Any] = +TypeUtils.getInterpretedOrdering(elementType) + + @transient private lazy val elementTypeSupportEquals = elementType match { +case BinaryType => false +case _: AtomicType => true +case _ => false + } + + @transient private lazy val doEvaluation = if (elementTypeSupportEquals) { + fastEval _ +} else { + bruteForceEval _ +} + + override def dataType: DataType = BooleanType + + override def nullable: Boolean = { +left.nullable || right.nullable || left.dataType.asInstanceOf[ArrayType].containsNull || + right.dataType.asInstanceOf[ArrayType].containsNull + } + + override def nullSafeEval(a1: Any, a2: Any): Any = { +doEvaluation(a1.asInstanceOf[ArrayData], a2.asInstanceOf[ArrayData]) + } + + /** + * A fast implementation which puts all the elements from the smaller array in a set + * and then performs a lookup on it for each element of the bigger one. + * This eval mode works only for data types which implements properly the equals method. + */ + private def fastEval(arr1: ArrayData, arr2: ArrayData): Any = { +var hasNull = false +val (bigger, smaller, biggerDt) = if (arr1.numElements() > arr2.numElements()) { + (arr1, arr2, left.dataType.asInstanceOf[ArrayType]) +} else { + (arr2, arr1, right.dataType.asInstanceOf[ArrayType]) +} +if (smaller.numElements() > 0) { + val smallestSet = new mutable.HashSet[Any] + smaller.foreach(elementType, (_, v) => +if (v == null) { + hasNull = true +} else { + smallestSet += v +}) + bigger.foreach(elementType, (_, v1) => +if (v1 == null) { + hasNull = true +} else if (smallestSet.contains(v1)) { + return true +} + ) +} else if (containsNull(bigger, biggerDt)) { + hasNull = true +} +if (hasNull) { + null +} else { + false +} + } + + /** + * A slower evaluation which performs a nested loop and supports all the data types. 
+ */ + private def bruteForceEval(arr1: ArrayData, arr2: ArrayData): Any = { +var hasNull = false +if (arr1.numElements() > 0) { + arr1.foreach(elementType, (_, v1) => +if (v1 == null) { + hasNull = true +} else { + arr2.foreach(elementType, (_, v2) => +if (v1 == null) { + hasNull = true +} else if (ordering.equiv(v1, v2)) { + return true +} + ) +}) +} else if (containsNull(arr2, right.dataType.asInstanceOf[ArrayType])) { + hasNull = true +} +if (hasNull) { + null +} else { + false +} + } + + def containsNull(arr: ArrayData, dt: ArrayType): Boolean = { +if (dt.containsNull) { + var i = 0 + var hasNull = false + while (i < arr.numElements && !hasNull) { +hasNull = arr.isNullAt(i) +i += 1 + } + hasNull +} else { + false +} + }
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user ueshin commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r187233292 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -529,6 +564,272 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least an element present also in a2. If the arrays have no common element and either of them contains a null element null is returned, false otherwise.", + examples = """ +Examples: + > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); + true + """, since = "2.4.0") +// scalastyle:off line.size.limit +case class ArraysOverlap(left: Expression, right: Expression) + extends BinaryArrayExpressionWithImplicitCast { + + override def checkInputDataTypes(): TypeCheckResult = super.checkInputDataTypes() match { +case TypeCheckResult.TypeCheckSuccess => + if (RowOrdering.isOrderable(elementType)) { +TypeCheckResult.TypeCheckSuccess + } else { +TypeCheckResult.TypeCheckFailure(s"${elementType.simpleString} cannot be used in comparison.") + } +case failure => failure + } + + @transient private lazy val ordering: Ordering[Any] = +TypeUtils.getInterpretedOrdering(elementType) + + @transient private lazy val elementTypeSupportEquals = elementType match { +case BinaryType => false +case _: AtomicType => true +case _ => false + } + + @transient private lazy val doEvaluation = if (elementTypeSupportEquals) { + fastEval _ +} else { + bruteForceEval _ +} --- End diff -- nit: indent --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user ueshin commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r187234226 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -529,6 +564,272 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least an element present also in a2. If the arrays have no common element and either of them contains a null element null is returned, false otherwise.", + examples = """ +Examples: + > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); + true + """, since = "2.4.0") +// scalastyle:off line.size.limit +case class ArraysOverlap(left: Expression, right: Expression) + extends BinaryArrayExpressionWithImplicitCast { + + override def checkInputDataTypes(): TypeCheckResult = super.checkInputDataTypes() match { +case TypeCheckResult.TypeCheckSuccess => + if (RowOrdering.isOrderable(elementType)) { +TypeCheckResult.TypeCheckSuccess + } else { +TypeCheckResult.TypeCheckFailure(s"${elementType.simpleString} cannot be used in comparison.") + } +case failure => failure + } + + @transient private lazy val ordering: Ordering[Any] = +TypeUtils.getInterpretedOrdering(elementType) + + @transient private lazy val elementTypeSupportEquals = elementType match { +case BinaryType => false +case _: AtomicType => true +case _ => false + } + + @transient private lazy val doEvaluation = if (elementTypeSupportEquals) { + fastEval _ +} else { + bruteForceEval _ +} + + override def dataType: DataType = BooleanType + + override def nullable: Boolean = { +left.nullable || right.nullable || left.dataType.asInstanceOf[ArrayType].containsNull || + right.dataType.asInstanceOf[ArrayType].containsNull + } + + override def nullSafeEval(a1: Any, a2: Any): Any = { +doEvaluation(a1.asInstanceOf[ArrayData], a2.asInstanceOf[ArrayData]) + } + + /** + * A fast implementation which puts all the elements from the smaller array in a set + * and then performs a lookup on it for each element of the bigger one. + * This eval mode works only for data types which implements properly the equals method. + */ + private def fastEval(arr1: ArrayData, arr2: ArrayData): Any = { +var hasNull = false +val (bigger, smaller, biggerDt) = if (arr1.numElements() > arr2.numElements()) { + (arr1, arr2, left.dataType.asInstanceOf[ArrayType]) +} else { + (arr2, arr1, right.dataType.asInstanceOf[ArrayType]) +} +if (smaller.numElements() > 0) { + val smallestSet = new mutable.HashSet[Any] + smaller.foreach(elementType, (_, v) => +if (v == null) { + hasNull = true +} else { + smallestSet += v +}) + bigger.foreach(elementType, (_, v1) => +if (v1 == null) { + hasNull = true +} else if (smallestSet.contains(v1)) { + return true +} + ) +} else if (containsNull(bigger, biggerDt)) { + hasNull = true +} +if (hasNull) { + null +} else { + false +} + } + + /** + * A slower evaluation which performs a nested loop and supports all the data types. 
+ */ + private def bruteForceEval(arr1: ArrayData, arr2: ArrayData): Any = { +var hasNull = false +if (arr1.numElements() > 0) { + arr1.foreach(elementType, (_, v1) => +if (v1 == null) { + hasNull = true +} else { + arr2.foreach(elementType, (_, v2) => +if (v1 == null) { + hasNull = true +} else if (ordering.equiv(v1, v2)) { + return true +} + ) +}) +} else if (containsNull(arr2, right.dataType.asInstanceOf[ArrayType])) { + hasNull = true +} +if (hasNull) { + null +} else { + false +} + } + + def containsNull(arr: ArrayData, dt: ArrayType): Boolean = { +if (dt.containsNull) { + var i = 0 + var hasNull = false + while (i < arr.numElements && !hasNull) { +hasNull = arr.isNullAt(i) +i += 1 + } + hasNull +} else { + false +} + }
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user ueshin commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r187236142 --- Diff: sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala --- @@ -136,6 +136,59 @@ class CollectionExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper checkEvaluation(ArrayContains(a3, Literal.create(null, StringType)), null) } + test("ArraysOverlap") { +val a0 = Literal.create(Seq(1, 2, 3), ArrayType(IntegerType)) +val a1 = Literal.create(Seq(4, 5, 3), ArrayType(IntegerType)) +val a2 = Literal.create(Seq(null, 5, 6), ArrayType(IntegerType)) +val a3 = Literal.create(Seq(7, 8), ArrayType(IntegerType)) +val a4 = Literal.create(Seq.empty[Int], ArrayType(IntegerType)) + +val a5 = Literal.create(Seq[String](null, ""), ArrayType(StringType)) +val a6 = Literal.create(Seq[String]("", "abc"), ArrayType(StringType)) +val a7 = Literal.create(Seq[String]("def", "ghi"), ArrayType(StringType)) + +checkEvaluation(ArraysOverlap(a0, a1), true) +checkEvaluation(ArraysOverlap(a0, a2), null) +checkEvaluation(ArraysOverlap(a1, a2), true) +checkEvaluation(ArraysOverlap(a1, a3), false) +checkEvaluation(ArraysOverlap(a0, a4), false) +checkEvaluation(ArraysOverlap(a2, a4), null) +checkEvaluation(ArraysOverlap(a4, a2), null) + +checkEvaluation(ArraysOverlap(a5, a6), true) +checkEvaluation(ArraysOverlap(a5, a7), null) +checkEvaluation(ArraysOverlap(a6, a7), false) + +// null handling +checkEvaluation(ArraysOverlap(Literal.create(null, ArrayType(IntegerType)), a0), null) +checkEvaluation(ArraysOverlap(a0, Literal.create(null, ArrayType(IntegerType))), null) +checkEvaluation(ArraysOverlap( + Literal.create(Seq(null), ArrayType(IntegerType)), + Literal.create(Seq(null), ArrayType(IntegerType))), null) --- End diff -- What if `arrays_overlap(array(), array(null))`? Seems like Presto returns `false` for the case. [TestArrayOperators.java#L1041](https://github.com/prestodb/presto/blob/master/presto-main/src/test/java/com/facebook/presto/type/TestArrayOperators.java#L1041) Also can you add the test case? --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r187066619 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -529,6 +564,157 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least an element present also in a2. If the arrays have no common element and either of them contains a null element null is returned, false otherwise.", + examples = """ +Examples: + > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); + true + """, since = "2.4.0") +// scalastyle:off line.size.limit +case class ArraysOverlap(left: Expression, right: Expression) + extends BinaryArrayExpressionWithImplicitCast { + + override def dataType: DataType = BooleanType + + override def nullable: Boolean = { +left.nullable || right.nullable || left.dataType.asInstanceOf[ArrayType].containsNull || + right.dataType.asInstanceOf[ArrayType].containsNull + } + + override def nullSafeEval(a1: Any, a2: Any): Any = { +var hasNull = false +val arr1 = a1.asInstanceOf[ArrayData] +val arr2 = a2.asInstanceOf[ArrayData] +val (bigger, smaller, biggerDt) = if (arr1.numElements() > arr2.numElements()) { + (arr1, arr2, left.dataType.asInstanceOf[ArrayType]) +} else { + (arr2, arr1, right.dataType.asInstanceOf[ArrayType]) +} +if (smaller.numElements() > 0) { + val smallestSet = new mutable.HashSet[Any] + smaller.foreach(elementType, (_, v) => +if (v == null) { + hasNull = true +} else { + smallestSet += v +}) + bigger.foreach(elementType, (_, v1) => +if (v1 == null) { + hasNull = true +} else if (smallestSet.contains(v1)) { --- End diff -- this doesn't work with `BinaryType`(the data is byte[]). We may need to wrap values with `ByteBuffer` first. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
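A small standalone illustration of the issue raised here: Java/Scala byte arrays use reference equality and identity hash codes, so a plain `HashSet` lookup misses equal contents, whereas wrapping them in `java.nio.ByteBuffer` (as the comment suggests) gives value-based `equals`/`hashCode`. This is only a sketch of the problem, not the PR's eventual fix; in the later revisions quoted above, `BinaryType` is simply routed to the brute-force path instead.

```scala
import java.nio.ByteBuffer
import scala.collection.mutable

val raw = new mutable.HashSet[Any]
raw += Array[Byte](1, 2, 3)
raw.contains(Array[Byte](1, 2, 3))                      // false: byte[] compares by reference

val wrapped = new mutable.HashSet[ByteBuffer]
wrapped += ByteBuffer.wrap(Array[Byte](1, 2, 3))
wrapped.contains(ByteBuffer.wrap(Array[Byte](1, 2, 3))) // true: ByteBuffer compares contents
```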
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r187060515 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -18,15 +18,50 @@ package org.apache.spark.sql.catalyst.expressions import java.util.Comparator +import scala.collection.mutable + import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.analysis.TypeCheckResult +import org.apache.spark.sql.catalyst.analysis.{TypeCheckResult, TypeCoercion} import org.apache.spark.sql.catalyst.expressions.ArraySortLike.NullOrder import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData, MapData, TypeUtils} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.array.ByteArrayMethods import org.apache.spark.unsafe.types.{ByteArray, UTF8String} +/** + * Base trait for [[BinaryExpression]]s with two arrays of the same element type and implicit + * casting. + */ +trait BinaryArrayExpressionWithImplicitCast extends BinaryExpression + with ImplicitCastInputTypes { + + @transient protected lazy val elementType: DataType = +inputTypes.head.asInstanceOf[ArrayType].elementType + + override def inputTypes: Seq[AbstractDataType] = { +(left.dataType, right.dataType) match { + case (ArrayType(e1, hasNull1), ArrayType(e2, hasNull2)) => +TypeCoercion.findTightestCommonType(e1, e2) match { --- End diff -- makes sense --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user mgaido91 commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r186953487 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -18,15 +18,50 @@ package org.apache.spark.sql.catalyst.expressions import java.util.Comparator +import scala.collection.mutable + import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.analysis.TypeCheckResult +import org.apache.spark.sql.catalyst.analysis.{TypeCheckResult, TypeCoercion} import org.apache.spark.sql.catalyst.expressions.ArraySortLike.NullOrder import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData, MapData, TypeUtils} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.array.ByteArrayMethods import org.apache.spark.unsafe.types.{ByteArray, UTF8String} +/** + * Base trait for [[BinaryExpression]]s with two arrays of the same element type and implicit + * casting. + */ +trait BinaryArrayExpressionWithImplicitCast extends BinaryExpression + with ImplicitCastInputTypes { + + @transient protected lazy val elementType: DataType = +inputTypes.head.asInstanceOf[ArrayType].elementType + + override def inputTypes: Seq[AbstractDataType] = { +(left.dataType, right.dataType) match { + case (ArrayType(e1, hasNull1), ArrayType(e2, hasNull2)) => +TypeCoercion.findTightestCommonType(e1, e2) match { --- End diff -- yes, I think so. What we are not supporting here is nested arrays with different element types, which is consistent with the rest of Spark's casting model. The other option is to make `findTightestCommonType` less strict about complex data type comparison. But I think that is a more thorough change, since it would (slightly) alter the whole Spark casting model, and I am not sure this is the best place to do it, since the main goal here is introducing a new function. If we decide to do something like that, I would propose a separate PR (I am happy to create it if we agree this should be done). What do you think? --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
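To make the nested-array question concrete, here is a toy model (not Spark's actual `TypeCoercion` code) of why a tightest-common-type helper needs an explicit recursive case before an expression like this one can accept two nested arrays whose leaf types differ only by a safe promotion.

```scala
sealed trait Tpe
case object IntT extends Tpe
case object LongT extends Tpe
case class ArrT(element: Tpe, containsNull: Boolean) extends Tpe

// Without the ArrT case, ArrT(IntT, _) vs ArrT(LongT, _) would yield None even though
// the element types have an obvious common type.
def tightest(a: Tpe, b: Tpe): Option[Tpe] = (a, b) match {
  case (x, y) if x == y              => Some(x)
  case (IntT, LongT) | (LongT, IntT) => Some(LongT)
  case (ArrT(e1, n1), ArrT(e2, n2))  => tightest(e1, e2).map(ArrT(_, n1 || n2))
  case _                             => None
}

tightest(ArrT(IntT, containsNull = false), ArrT(LongT, containsNull = true))
// Some(ArrT(LongT, true))
```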
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r186929981 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -18,15 +18,50 @@ package org.apache.spark.sql.catalyst.expressions import java.util.Comparator +import scala.collection.mutable + import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.analysis.TypeCheckResult +import org.apache.spark.sql.catalyst.analysis.{TypeCheckResult, TypeCoercion} import org.apache.spark.sql.catalyst.expressions.ArraySortLike.NullOrder import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData, MapData, TypeUtils} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.array.ByteArrayMethods import org.apache.spark.unsafe.types.{ByteArray, UTF8String} +/** + * Base trait for [[BinaryExpression]]s with two arrays of the same element type and implicit + * casting. + */ +trait BinaryArrayExpressionWithImplicitCast extends BinaryExpression + with ImplicitCastInputTypes { + + @transient protected lazy val elementType: DataType = +inputTypes.head.asInstanceOf[ArrayType].elementType + + override def inputTypes: Seq[AbstractDataType] = { +(left.dataType, right.dataType) match { + case (ArrayType(e1, hasNull1), ArrayType(e2, hasNull2)) => +TypeCoercion.findTightestCommonType(e1, e2) match { --- End diff -- shall we support nested array? --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user HyukjinKwon commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r186728795 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -28,6 +30,34 @@ import org.apache.spark.unsafe.Platform import org.apache.spark.unsafe.array.ByteArrayMethods import org.apache.spark.unsafe.types.{ByteArray, UTF8String} +/** + * Base trait for [[BinaryExpression]]s with two arrays of the same element type and implicit + * casting. + */ +trait BinaryArrayExpressionWithImplicitCast extends BinaryExpression + with ImplicitCastInputTypes { + + protected lazy val elementType: DataType = inputTypes.head.asInstanceOf[ArrayType].elementType --- End diff -- (or `@transient lazy val` too optionally) --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user mgaido91 commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r186710034 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -28,6 +30,34 @@ import org.apache.spark.unsafe.Platform import org.apache.spark.unsafe.array.ByteArrayMethods import org.apache.spark.unsafe.types.{ByteArray, UTF8String} +/** + * Base trait for [[BinaryExpression]]s with two arrays of the same element type and implicit + * casting. + */ +trait BinaryArrayExpressionWithImplicitCast extends BinaryExpression + with ImplicitCastInputTypes { + + protected lazy val elementType: DataType = inputTypes.head.asInstanceOf[ArrayType].elementType + + override def inputTypes: Seq[AbstractDataType] = { +TypeCoercion.findWiderTypeForTwo(left.dataType, right.dataType) match { --- End diff -- yes, I'll go for it. Thanks. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r186706829 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -28,6 +30,34 @@ import org.apache.spark.unsafe.Platform import org.apache.spark.unsafe.array.ByteArrayMethods import org.apache.spark.unsafe.types.{ByteArray, UTF8String} +/** + * Base trait for [[BinaryExpression]]s with two arrays of the same element type and implicit + * casting. + */ +trait BinaryArrayExpressionWithImplicitCast extends BinaryExpression + with ImplicitCastInputTypes { + + protected lazy val elementType: DataType = inputTypes.head.asInstanceOf[ArrayType].elementType + + override def inputTypes: Seq[AbstractDataType] = { +TypeCoercion.findWiderTypeForTwo(left.dataType, right.dataType) match { --- End diff -- ok then `findTightestCommonType` is a better choice? --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user mgaido91 commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r186690973 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -28,6 +30,34 @@ import org.apache.spark.unsafe.Platform import org.apache.spark.unsafe.array.ByteArrayMethods import org.apache.spark.unsafe.types.{ByteArray, UTF8String} +/** + * Base trait for [[BinaryExpression]]s with two arrays of the same element type and implicit + * casting. + */ +trait BinaryArrayExpressionWithImplicitCast extends BinaryExpression + with ImplicitCastInputTypes { + + protected lazy val elementType: DataType = inputTypes.head.asInstanceOf[ArrayType].elementType + + override def inputTypes: Seq[AbstractDataType] = { +TypeCoercion.findWiderTypeForTwo(left.dataType, right.dataType) match { --- End diff -- good question. Checking the way we are doing it I would say no. Since we are bounding in a quite strange way at the moment (causing loss of int digits instead of decimals) I would say no, since this could lead to have many `NULL`s. What do you think? --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r186606368 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -28,6 +30,34 @@ import org.apache.spark.unsafe.Platform import org.apache.spark.unsafe.array.ByteArrayMethods import org.apache.spark.unsafe.types.{ByteArray, UTF8String} +/** + * Base trait for [[BinaryExpression]]s with two arrays of the same element type and implicit + * casting. + */ +trait BinaryArrayExpressionWithImplicitCast extends BinaryExpression + with ImplicitCastInputTypes { + + protected lazy val elementType: DataType = inputTypes.head.asInstanceOf[ArrayType].elementType + + override def inputTypes: Seq[AbstractDataType] = { +TypeCoercion.findWiderTypeForTwo(left.dataType, right.dataType) match { --- End diff -- now the question is, shall we allow precision lose for array functions? --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user mgaido91 commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r186458781 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -28,6 +30,34 @@ import org.apache.spark.unsafe.Platform import org.apache.spark.unsafe.array.ByteArrayMethods import org.apache.spark.unsafe.types.{ByteArray, UTF8String} +/** + * Base trait for [[BinaryExpression]]s with two arrays of the same element type and implicit + * casting. + */ +trait BinaryArrayExpressionWithImplicitCast extends BinaryExpression + with ImplicitCastInputTypes { + + protected lazy val elementType: DataType = inputTypes.head.asInstanceOf[ArrayType].elementType + + override def inputTypes: Seq[AbstractDataType] = { +TypeCoercion.findWiderTypeForTwo(left.dataType, right.dataType) match { --- End diff -- what about `findWiderTypeWithoutStringPromotionForTwo`? --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user mgaido91 commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r186455988 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -28,6 +30,34 @@ import org.apache.spark.unsafe.Platform import org.apache.spark.unsafe.array.ByteArrayMethods import org.apache.spark.unsafe.types.{ByteArray, UTF8String} +/** + * Base trait for [[BinaryExpression]]s with two arrays of the same element type and implicit + * casting. + */ +trait BinaryArrayExpressionWithImplicitCast extends BinaryExpression + with ImplicitCastInputTypes { + + protected lazy val elementType: DataType = inputTypes.head.asInstanceOf[ArrayType].elementType --- End diff -- thanks for your explanation! --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r186445741 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -28,6 +30,34 @@ import org.apache.spark.unsafe.Platform import org.apache.spark.unsafe.array.ByteArrayMethods import org.apache.spark.unsafe.types.{ByteArray, UTF8String} +/** + * Base trait for [[BinaryExpression]]s with two arrays of the same element type and implicit + * casting. + */ +trait BinaryArrayExpressionWithImplicitCast extends BinaryExpression + with ImplicitCastInputTypes { + + protected lazy val elementType: DataType = inputTypes.head.asInstanceOf[ArrayType].elementType + + override def inputTypes: Seq[AbstractDataType] = { +TypeCoercion.findWiderTypeForTwo(left.dataType, right.dataType) match { --- End diff -- Then we probably need to call `TypeCoercion.findTightestCommonType` here, and fix `findTightestCommonType` for array/struct/map types. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r186445302 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -28,6 +30,34 @@ import org.apache.spark.unsafe.Platform import org.apache.spark.unsafe.array.ByteArrayMethods import org.apache.spark.unsafe.types.{ByteArray, UTF8String} +/** + * Base trait for [[BinaryExpression]]s with two arrays of the same element type and implicit + * casting. + */ +trait BinaryArrayExpressionWithImplicitCast extends BinaryExpression + with ImplicitCastInputTypes { + + protected lazy val elementType: DataType = inputTypes.head.asInstanceOf[ArrayType].elementType --- End diff -- `lazy val` will be serialized. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
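A standalone sketch of the point being made (class names invented for illustration): a plain `lazy val` is backed by a field that Java serialization writes out once the value has been forced, while a `@transient lazy val` is skipped and recomputed on the deserialized copy, which is why the trait in the later revisions ends up with `@transient protected lazy val elementType`.

```scala
import java.io.{ByteArrayOutputStream, ObjectOutputStream}

class WithLazy extends Serializable {
  lazy val payload: Array[Int] = Array.fill(10000)(1)            // backing field is serialized once forced
}

class WithTransientLazy extends Serializable {
  @transient lazy val payload: Array[Int] = Array.fill(10000)(1) // skipped, recomputed after deserialization
}

object LazySerializationDemo extends App {
  def serializedSize(o: AnyRef): Int = {
    val bytes = new ByteArrayOutputStream()
    val out = new ObjectOutputStream(bytes)
    out.writeObject(o)
    out.close()
    bytes.size()
  }

  val a = new WithLazy; a.payload          // force initialization
  val b = new WithTransientLazy; b.payload
  println(serializedSize(a))               // tens of kilobytes: the array travels with the object
  println(serializedSize(b))               // a few hundred bytes: the field is not written
}
```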
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user mgaido91 commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r186417334 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -28,6 +30,34 @@ import org.apache.spark.unsafe.Platform import org.apache.spark.unsafe.array.ByteArrayMethods import org.apache.spark.unsafe.types.{ByteArray, UTF8String} +/** + * Base trait for [[BinaryExpression]]s with two arrays of the same element type and implicit + * casting. + */ +trait BinaryArrayExpressionWithImplicitCast extends BinaryExpression + with ImplicitCastInputTypes { + + protected lazy val elementType: DataType = inputTypes.head.asInstanceOf[ArrayType].elementType --- End diff -- why is it better a `def` than a `lazy val`? --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user mgaido91 commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r186417229 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -28,6 +30,34 @@ import org.apache.spark.unsafe.Platform import org.apache.spark.unsafe.array.ByteArrayMethods import org.apache.spark.unsafe.types.{ByteArray, UTF8String} +/** + * Base trait for [[BinaryExpression]]s with two arrays of the same element type and implicit + * casting. + */ +trait BinaryArrayExpressionWithImplicitCast extends BinaryExpression + with ImplicitCastInputTypes { + + protected lazy val elementType: DataType = inputTypes.head.asInstanceOf[ArrayType].elementType + + override def inputTypes: Seq[AbstractDataType] = { +TypeCoercion.findWiderTypeForTwo(left.dataType, right.dataType) match { --- End diff -- it does not for int and string, but it does for decimal and int. What shall we do? --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user ueshin commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r186355622 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/functions.scala --- @@ -3039,6 +3039,16 @@ object functions { ArrayContains(column.expr, Literal(value)) } + /** + * Returns `true` if `a1` and `a2` have at least one non-null element in common. If not and + * any of the arrays contains a `null`, it returns `null`. It returns `false` otherwise. + * @group collection_funcs + * @since 2.4.0 + */ + def arrays_overlap(a1: Column, a2: Column): Column = withExpr { +ArraysOverlap(a1.expr, a2.expr) + } --- End diff -- nit: indent --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r186355288 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -530,6 +560,155 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least an element present also in a2. If the arrays have no common element and either of them contains a null element null is returned, false otherwise.", + examples = """ +Examples: + > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); + true + """, since = "2.4.0") +// scalastyle:off line.size.limit +case class ArraysOverlap(left: Expression, right: Expression) + extends BinaryArrayExpressionWithImplicitCast { + + override def dataType: DataType = BooleanType + + override def nullable: Boolean = { +left.nullable || right.nullable || left.dataType.asInstanceOf[ArrayType].containsNull || + right.dataType.asInstanceOf[ArrayType].containsNull + } + + override def nullSafeEval(a1: Any, a2: Any): Any = { +var hasNull = false +val arr1 = a1.asInstanceOf[ArrayData] +val arr2 = a2.asInstanceOf[ArrayData] +val (biggestArr, smallestArr) = if (arr1.numElements() > arr2.numElements()) { --- End diff -- it's just 2 arrays, `smaller` and `bigger` should be better --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r186355096 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -530,6 +560,155 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least an element present also in a2. If the arrays have no common element and either of them contains a null element null is returned, false otherwise.", + examples = """ +Examples: + > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); + true + """, since = "2.4.0") +// scalastyle:off line.size.limit +case class ArraysOverlap(left: Expression, right: Expression) + extends BinaryArrayExpressionWithImplicitCast { + + override def dataType: DataType = BooleanType + + override def nullable: Boolean = { +left.nullable || right.nullable || left.dataType.asInstanceOf[ArrayType].containsNull || + right.dataType.asInstanceOf[ArrayType].containsNull + } + + override def nullSafeEval(a1: Any, a2: Any): Any = { +var hasNull = false +val arr1 = a1.asInstanceOf[ArrayData] +val arr2 = a2.asInstanceOf[ArrayData] +val (biggestArr, smallestArr) = if (arr1.numElements() > arr2.numElements()) { + (arr1, arr2) +} else { + (arr2, arr1) +} +if (smallestArr.numElements() > 0) { + val smallestSet = new mutable.HashSet[Any] + smallestArr.foreach(elementType, (_, v) => +if (v == null) { + hasNull = true +} else { + smallestSet += v +}) + biggestArr.foreach(elementType, (_, v1) => +if (v1 == null) { + hasNull = true +} else if (smallestSet.contains(v1)) { + return true +} + ) +} else if (containsNull(biggestArr, right.dataType.asInstanceOf[ArrayType])) { --- End diff -- `right.dataType.asInstanceOf[ArrayType]` may not match the `biggerArr` --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
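A tiny self-contained sketch of the fix this comment is asking for, with simplified stand-in types: whichever side is chosen as the bigger array, its own nullability flag has to travel with it instead of unconditionally reading `right.dataType`.

```scala
// Simplified stand-in for ArrayData bundled with its ArrayType.containsNull flag.
case class ArrayInput(values: Seq[Int], containsNull: Boolean)

// Bundling data and metadata means the swap cannot pair one side's values with the
// other side's nullability flag.
def pickBigger(a1: ArrayInput, a2: ArrayInput): (ArrayInput, ArrayInput) =
  if (a1.values.length > a2.values.length) (a1, a2) else (a2, a1)

val (bigger, smaller) = pickBigger(
  ArrayInput(Seq(1, 2, 3), containsNull = true),
  ArrayInput(Seq(7), containsNull = false))
val mayHaveNull = bigger.containsNull   // correct flag even when the left input is the bigger one
```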
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r186354007 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -530,6 +560,155 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least an element present also in a2. If the arrays have no common element and either of them contains a null element null is returned, false otherwise.", + examples = """ +Examples: + > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); + true + """, since = "2.4.0") +// scalastyle:off line.size.limit +case class ArraysOverlap(left: Expression, right: Expression) + extends BinaryArrayExpressionWithImplicitCast { + + override def dataType: DataType = BooleanType + + override def nullable: Boolean = { +left.nullable || right.nullable || left.dataType.asInstanceOf[ArrayType].containsNull || + right.dataType.asInstanceOf[ArrayType].containsNull + } + + override def nullSafeEval(a1: Any, a2: Any): Any = { +var hasNull = false +val arr1 = a1.asInstanceOf[ArrayData] +val arr2 = a2.asInstanceOf[ArrayData] +val (biggestArr, smallestArr) = if (arr1.numElements() > arr2.numElements()) { + (arr1, arr2) +} else { + (arr2, arr1) +} +if (smallestArr.numElements() > 0) { + val smallestSet = new mutable.HashSet[Any] + smallestArr.foreach(elementType, (_, v) => +if (v == null) { + hasNull = true +} else { + smallestSet += v +}) + biggestArr.foreach(elementType, (_, v1) => +if (v1 == null) { + hasNull = true +} else if (smallestSet.contains(v1)) { + return true +} + ) +} else if (containsNull(biggestArr, right.dataType.asInstanceOf[ArrayType])) { + hasNull = true +} +if (hasNull) { + null +} else { + false +} + } + + def containsNull(arr: ArrayData, dt: ArrayType): Boolean = { +if (dt.containsNull) { + arr.foreach(elementType, (_, v) => --- End diff -- ``` var i = 0 var hasNull = false while (i < arr.numElements && !hasNull) { hasNull = arr.isNullAt(i) i += 1 } hasNull ``` --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r186353058 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -28,6 +30,34 @@ import org.apache.spark.unsafe.Platform import org.apache.spark.unsafe.array.ByteArrayMethods import org.apache.spark.unsafe.types.{ByteArray, UTF8String} +/** + * Base trait for [[BinaryExpression]]s with two arrays of the same element type and implicit + * casting. + */ +trait BinaryArrayExpressionWithImplicitCast extends BinaryExpression + with ImplicitCastInputTypes { + + protected lazy val elementType: DataType = inputTypes.head.asInstanceOf[ArrayType].elementType --- End diff -- this can be a `def` --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r186353005 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -28,6 +30,34 @@ import org.apache.spark.unsafe.Platform import org.apache.spark.unsafe.array.ByteArrayMethods import org.apache.spark.unsafe.types.{ByteArray, UTF8String} +/** + * Base trait for [[BinaryExpression]]s with two arrays of the same element type and implicit + * casting. + */ +trait BinaryArrayExpressionWithImplicitCast extends BinaryExpression + with ImplicitCastInputTypes { + + protected lazy val elementType: DataType = inputTypes.head.asInstanceOf[ArrayType].elementType + + override def inputTypes: Seq[AbstractDataType] = { +TypeCoercion.findWiderTypeForTwo(left.dataType, right.dataType) match { --- End diff -- does presto allow implicitly casting to string for these collection functions? e.g. can `ArraysOverlap` work for array of int and array of string? --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user kiszk commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r186246741 --- Diff: sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala --- @@ -413,6 +413,25 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSQLContext { ) } + test("arrays_overlap function") { +val df = Seq( + (Seq[Option[Int]](Some(1), Some(2)), Seq[Option[Int]](Some(-1), Some(10))), + (Seq.empty[Option[Int]], Seq[Option[Int]](Some(-1), None)), + (Seq[Option[Int]](Some(3), Some(2)), Seq[Option[Int]](Some(1), Some(2))) +).toDF("a", "b") + +val answer = Seq(Row(false), Row(null), Row(true)) + +checkAnswer(df.select(arrays_overlap(df("a"), df("b"))), answer) +checkAnswer(df.selectExpr("arrays_overlap(a, b)"), answer) + +checkAnswer(sql("select arrays_overlap(array(1, 2, 3), array('a', 'b', 'c'))"), Row(false)) + --- End diff -- Thank you. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user mgaido91 commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r186182632 --- Diff: sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala --- @@ -413,6 +413,25 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSQLContext { ) } + test("arrays_overlap function") { +val df = Seq( + (Seq[Option[Int]](Some(1), Some(2)), Seq[Option[Int]](Some(-1), Some(10))), + (Seq.empty[Option[Int]], Seq[Option[Int]](Some(-1), None)), + (Seq[Option[Int]](Some(3), Some(2)), Seq[Option[Int]](Some(1), Some(2))) +).toDF("a", "b") + +val answer = Seq(Row(false), Row(null), Row(true)) + +checkAnswer(df.select(arrays_overlap(df("a"), df("b"))), answer) +checkAnswer(df.selectExpr("arrays_overlap(a, b)"), answer) + +checkAnswer(sql("select arrays_overlap(array(1, 2, 3), array('a', 'b', 'c'))"), Row(false)) + --- End diff -- Honestly I don't see its utility, but I also see no harm in introducing it, so if you think it adds value, I am fine with adding it. I just added it, thanks. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user kiszk commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r186153677 --- Diff: sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala --- @@ -413,6 +413,25 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSQLContext { ) } + test("arrays_overlap function") { +val df = Seq( + (Seq[Option[Int]](Some(1), Some(2)), Seq[Option[Int]](Some(-1), Some(10))), + (Seq.empty[Option[Int]], Seq[Option[Int]](Some(-1), None)), + (Seq[Option[Int]](Some(3), Some(2)), Seq[Option[Int]](Some(1), Some(2))) +).toDF("a", "b") + +val answer = Seq(Row(false), Row(null), Row(true)) + +checkAnswer(df.select(arrays_overlap(df("a"), df("b"))), answer) +checkAnswer(df.selectExpr("arrays_overlap(a, b)"), answer) + +checkAnswer(sql("select arrays_overlap(array(1, 2, 3), array('a', 'b', 'c'))"), Row(false)) + --- End diff -- I think adding this makes sense, to explicitly ensure that `any other type (NullType included) throws an AnalysisException`. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user mgaido91 commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r186151824 --- Diff: sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala --- @@ -413,6 +413,25 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSQLContext { ) } + test("arrays_overlap function") { +val df = Seq( + (Seq[Option[Int]](Some(1), Some(2)), Seq[Option[Int]](Some(-1), Some(10))), + (Seq.empty[Option[Int]], Seq[Option[Int]](Some(-1), None)), + (Seq[Option[Int]](Some(3), Some(2)), Seq[Option[Int]](Some(1), Some(2))) +).toDF("a", "b") + +val answer = Seq(Row(false), Row(null), Row(true)) + +checkAnswer(df.select(arrays_overlap(df("a"), df("b"))), answer) +checkAnswer(df.selectExpr("arrays_overlap(a, b)"), answer) + +checkAnswer(sql("select arrays_overlap(array(1, 2, 3), array('a', 'b', 'c'))"), Row(false)) + --- End diff -- Oh, now I see what you mean. I can add it, but it seems unnecessary to me. This function accepts only `Array`s, so any other type (`NullType` included) throws an `AnalysisException`. `array_contains` is different: its second argument can be anything, so it makes sense to check the behavior of `NullType`, which is handled differently from the others. Do you agree? --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user kiszk commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r186149012 --- Diff: sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala --- @@ -413,6 +413,25 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSQLContext { ) } + test("arrays_overlap function") { +val df = Seq( + (Seq[Option[Int]](Some(1), Some(2)), Seq[Option[Int]](Some(-1), Some(10))), + (Seq.empty[Option[Int]], Seq[Option[Int]](Some(-1), None)), + (Seq[Option[Int]](Some(3), Some(2)), Seq[Option[Int]](Some(1), Some(2))) +).toDF("a", "b") + +val answer = Seq(Row(false), Row(null), Row(true)) + +checkAnswer(df.select(arrays_overlap(df("a"), df("b"))), answer) +checkAnswer(df.selectExpr("arrays_overlap(a, b)"), answer) + +checkAnswer(sql("select arrays_overlap(array(1, 2, 3), array('a', 'b', 'c'))"), Row(false)) + --- End diff -- IIUC, I think not. My comment is about `null` handling in the DataFrame API. [Other operations](https://github.com/apache/spark/pull/21028/files#diff-8e1a34391fdefa4a3a0349d7d454d86fR396) also perform tests with `null` through the DataFrame API and with primitives. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user mgaido91 commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r186140513 --- Diff: sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala --- @@ -413,6 +413,25 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSQLContext { ) } + test("arrays_overlap function") { +val df = Seq( + (Seq[Option[Int]](Some(1), Some(2)), Seq[Option[Int]](Some(-1), Some(10))), + (Seq.empty[Option[Int]], Seq[Option[Int]](Some(-1), None)), + (Seq[Option[Int]](Some(3), Some(2)), Seq[Option[Int]](Some(1), Some(2))) +).toDF("a", "b") + +val answer = Seq(Row(false), Row(null), Row(true)) + +checkAnswer(df.select(arrays_overlap(df("a"), df("b"))), answer) +checkAnswer(df.selectExpr("arrays_overlap(a, b)"), answer) + +checkAnswer(sql("select arrays_overlap(array(1, 2, 3), array('a', 'b', 'c'))"), Row(false)) + --- End diff -- this case is already covered here: https://github.com/apache/spark/pull/21028/files#diff-d31eca9f1c4c33104dc2cb8950486910R136, am I right? --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user kiszk commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r186126547 --- Diff: sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala --- @@ -413,6 +413,25 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSQLContext { ) } + test("arrays_overlap function") { +val df = Seq( + (Seq[Option[Int]](Some(1), Some(2)), Seq[Option[Int]](Some(-1), Some(10))), + (Seq.empty[Option[Int]], Seq[Option[Int]](Some(-1), None)), + (Seq[Option[Int]](Some(3), Some(2)), Seq[Option[Int]](Some(1), Some(2))) +).toDF("a", "b") + +val answer = Seq(Row(false), Row(null), Row(true)) + +checkAnswer(df.select(arrays_overlap(df("a"), df("b"))), answer) +checkAnswer(df.selectExpr("arrays_overlap(a, b)"), answer) + +checkAnswer(sql("select arrays_overlap(array(1, 2, 3), array('a', 'b', 'c'))"), Row(false)) + --- End diff -- Should we add a test like this? ``` val df = Seq((null, null)).toDF("a", "b") val ans = ... checkAnswer(df.select(arrays_overlap($"a", $"b")), ans) checkAnswer(df.selectExpr("arrays_overlap(a, b)"), ans) ``` --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r186052495 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -378,6 +408,135 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least an element present also in a2. If the arrays have no common element and either of them contains a null element null is returned, false otherwise.", + examples = """ +Examples: + > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); + true + """, since = "2.4.0") +// scalastyle:off line.size.limit +case class ArraysOverlap(left: Expression, right: Expression) + extends BinaryArrayExpressionWithImplicitCast { + + override def dataType: DataType = BooleanType + + override def nullable: Boolean = { +left.nullable || right.nullable || left.dataType.asInstanceOf[ArrayType].containsNull || + right.dataType.asInstanceOf[ArrayType].containsNull + } + + override def nullSafeEval(a1: Any, a2: Any): Any = { +var hasNull = false +val arr1 = a1.asInstanceOf[ArrayData] +val arr2 = a2.asInstanceOf[ArrayData] +if (arr1.numElements() > 0) { + val set2 = new mutable.HashSet[Any] + arr2.foreach(elementType, (_, v) => +if (v == null) { + hasNull = true +} else { + set2 += v +}) + arr1.foreach(elementType, (_, v1) => +if (v1 == null) { + hasNull = true +} else if (set2.contains(v1)) { + return true +} + ) +} else if (containsNull(arr2, right.dataType.asInstanceOf[ArrayType])) { + hasNull = true +} +if (hasNull) { + null +} else { + false +} + } + + def containsNull(arr: ArrayData, dt: ArrayType): Boolean = { +if (dt.containsNull) { + arr.foreach(elementType, (_, v) => +if (v == null) { + return true +} + ) +} +false + } + + override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { +nullSafeCodeGen(ctx, ev, (a1, a2) => { + val i1 = ctx.freshName("i") + val i2 = ctx.freshName("i") + val getValue1 = CodeGenerator.getValue(a1, elementType, i1) + val getValue2 = CodeGenerator.getValue(a2, elementType, i2) + val leftEmptyCode = if (right.dataType.asInstanceOf[ArrayType].containsNull) { +s""" + |else { + | for (int $i2 = 0; $i2 < $a2.numElements(); $i2 ++) { + |if ($a2.isNullAt($i2)) { + | ${ev.isNull} = true; + | break; + |} + | } + |} + """.stripMargin + } else { +"" + } + val javaElementClass = CodeGenerator.boxedType(elementType) + val javaSet = classOf[java.util.HashSet[_]].getName + val set2 = ctx.freshName("set") + s""" + |if ($a1.numElements() > 0) { + | $javaSet<$javaElementClass> $set2 = new $javaSet<$javaElementClass>(); + | for (int $i2 = 0; $i2 < $a2.numElements(); $i2 ++) { + | ${nullSafeElementCodegen(right.dataType.asInstanceOf[ArrayType], a2, i2, + s"$set2.add($getValue2);", s"${ev.isNull} = true;")} + | } + | for (int $i1 = 0; $i1 < $a1.numElements(); $i1 ++) { + | ${nullSafeElementCodegen(left.dataType.asInstanceOf[ArrayType], a1, i1, --- End diff -- ditto --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r186052476 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -378,6 +408,135 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least an element present also in a2. If the arrays have no common element and either of them contains a null element null is returned, false otherwise.", + examples = """ +Examples: + > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); + true + """, since = "2.4.0") +// scalastyle:off line.size.limit +case class ArraysOverlap(left: Expression, right: Expression) + extends BinaryArrayExpressionWithImplicitCast { + + override def dataType: DataType = BooleanType + + override def nullable: Boolean = { +left.nullable || right.nullable || left.dataType.asInstanceOf[ArrayType].containsNull || + right.dataType.asInstanceOf[ArrayType].containsNull + } + + override def nullSafeEval(a1: Any, a2: Any): Any = { +var hasNull = false +val arr1 = a1.asInstanceOf[ArrayData] +val arr2 = a2.asInstanceOf[ArrayData] +if (arr1.numElements() > 0) { + val set2 = new mutable.HashSet[Any] + arr2.foreach(elementType, (_, v) => +if (v == null) { + hasNull = true +} else { + set2 += v +}) + arr1.foreach(elementType, (_, v1) => +if (v1 == null) { + hasNull = true +} else if (set2.contains(v1)) { + return true +} + ) +} else if (containsNull(arr2, right.dataType.asInstanceOf[ArrayType])) { + hasNull = true +} +if (hasNull) { + null +} else { + false +} + } + + def containsNull(arr: ArrayData, dt: ArrayType): Boolean = { +if (dt.containsNull) { + arr.foreach(elementType, (_, v) => +if (v == null) { + return true +} + ) +} +false + } + + override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { +nullSafeCodeGen(ctx, ev, (a1, a2) => { + val i1 = ctx.freshName("i") + val i2 = ctx.freshName("i") + val getValue1 = CodeGenerator.getValue(a1, elementType, i1) + val getValue2 = CodeGenerator.getValue(a2, elementType, i2) + val leftEmptyCode = if (right.dataType.asInstanceOf[ArrayType].containsNull) { +s""" + |else { + | for (int $i2 = 0; $i2 < $a2.numElements(); $i2 ++) { + |if ($a2.isNullAt($i2)) { + | ${ev.isNull} = true; + | break; + |} + | } + |} + """.stripMargin + } else { +"" + } + val javaElementClass = CodeGenerator.boxedType(elementType) + val javaSet = classOf[java.util.HashSet[_]].getName + val set2 = ctx.freshName("set") + s""" + |if ($a1.numElements() > 0) { + | $javaSet<$javaElementClass> $set2 = new $javaSet<$javaElementClass>(); + | for (int $i2 = 0; $i2 < $a2.numElements(); $i2 ++) { + | ${nullSafeElementCodegen(right.dataType.asInstanceOf[ArrayType], a2, i2, --- End diff -- code style nit: we should create a variable to hold the long string. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r186052197 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -378,6 +408,135 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least an element present also in a2. If the arrays have no common element and either of them contains a null element null is returned, false otherwise.", + examples = """ +Examples: + > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); + true + """, since = "2.4.0") +// scalastyle:off line.size.limit +case class ArraysOverlap(left: Expression, right: Expression) + extends BinaryArrayExpressionWithImplicitCast { + + override def dataType: DataType = BooleanType + + override def nullable: Boolean = { +left.nullable || right.nullable || left.dataType.asInstanceOf[ArrayType].containsNull || + right.dataType.asInstanceOf[ArrayType].containsNull + } + + override def nullSafeEval(a1: Any, a2: Any): Any = { +var hasNull = false +val arr1 = a1.asInstanceOf[ArrayData] +val arr2 = a2.asInstanceOf[ArrayData] +if (arr1.numElements() > 0) { + val set2 = new mutable.HashSet[Any] --- End diff -- we should build set for the smaller array. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user mgaido91 commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r186027879 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -19,14 +19,41 @@ package org.apache.spark.sql.catalyst.expressions import java.util.Comparator import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.analysis.TypeCheckResult +import org.apache.spark.sql.catalyst.analysis.{TypeCheckResult, TypeCoercion} import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData, MapData, TypeUtils} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.Platform import org.apache.spark.unsafe.array.ByteArrayMethods import org.apache.spark.unsafe.types.{ByteArray, UTF8String} +/** + * Base trait for [[BinaryExpression]]s with two arrays of the same element type and implicit + * casting. + */ +trait BinaryArrayExpressionWithImplicitCast extends BinaryExpression + with ImplicitCastInputTypes { + + protected lazy val elementType: DataType = inputTypes.head.asInstanceOf[ArrayType].elementType + + override def inputTypes: Seq[AbstractDataType] = { +TypeCoercion.findWiderTypeForTwo(left.dataType, right.dataType) match { + case Some(arrayType) => Seq(arrayType, arrayType) + case _ => Seq.empty +} + } + + override def checkInputDataTypes(): TypeCheckResult = { +TypeCoercion.findWiderTypeForTwo(left.dataType, right.dataType) match { --- End diff -- sorry, my bad, I got confused. I am fixing this, thanks. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r185841399 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -19,14 +19,41 @@ package org.apache.spark.sql.catalyst.expressions import java.util.Comparator import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.analysis.TypeCheckResult +import org.apache.spark.sql.catalyst.analysis.{TypeCheckResult, TypeCoercion} import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData, MapData, TypeUtils} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.Platform import org.apache.spark.unsafe.array.ByteArrayMethods import org.apache.spark.unsafe.types.{ByteArray, UTF8String} +/** + * Base trait for [[BinaryExpression]]s with two arrays of the same element type and implicit + * casting. + */ +trait BinaryArrayExpressionWithImplicitCast extends BinaryExpression + with ImplicitCastInputTypes { + + protected lazy val elementType: DataType = inputTypes.head.asInstanceOf[ArrayType].elementType + + override def inputTypes: Seq[AbstractDataType] = { +TypeCoercion.findWiderTypeForTwo(left.dataType, right.dataType) match { + case Some(arrayType) => Seq(arrayType, arrayType) + case _ => Seq.empty +} + } + + override def checkInputDataTypes(): TypeCheckResult = { +TypeCoercion.findWiderTypeForTwo(left.dataType, right.dataType) match { --- End diff -- > we would throw an exception No, the type coercion rule will be run again and again until this expression is resolved, or hit fixed point and fail. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user kiszk commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r185840044 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -19,14 +19,41 @@ package org.apache.spark.sql.catalyst.expressions import java.util.Comparator import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.analysis.TypeCheckResult +import org.apache.spark.sql.catalyst.analysis.{TypeCheckResult, TypeCoercion} import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData, MapData, TypeUtils} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.Platform import org.apache.spark.unsafe.array.ByteArrayMethods import org.apache.spark.unsafe.types.{ByteArray, UTF8String} +/** + * Base trait for [[BinaryExpression]]s with two arrays of the same element type and implicit + * casting. + */ +trait BinaryArrayExpressionWithImplicitCast extends BinaryExpression --- End diff -- I see, I would like to hear other opinions --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user mgaido91 commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r185783290 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -19,14 +19,41 @@ package org.apache.spark.sql.catalyst.expressions import java.util.Comparator import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.analysis.TypeCheckResult +import org.apache.spark.sql.catalyst.analysis.{TypeCheckResult, TypeCoercion} import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData, MapData, TypeUtils} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.Platform import org.apache.spark.unsafe.array.ByteArrayMethods import org.apache.spark.unsafe.types.{ByteArray, UTF8String} +/** + * Base trait for [[BinaryExpression]]s with two arrays of the same element type and implicit + * casting. + */ +trait BinaryArrayExpressionWithImplicitCast extends BinaryExpression + with ImplicitCastInputTypes { + + protected lazy val elementType: DataType = inputTypes.head.asInstanceOf[ArrayType].elementType + + override def inputTypes: Seq[AbstractDataType] = { +TypeCoercion.findWiderTypeForTwo(left.dataType, right.dataType) match { + case Some(arrayType) => Seq(arrayType, arrayType) + case _ => Seq.empty +} + } + + override def checkInputDataTypes(): TypeCheckResult = { +TypeCoercion.findWiderTypeForTwo(left.dataType, right.dataType) match { --- End diff -- with your suggestion, if we have two arrays with different element types (but one can be cast to the other), we would throw an exception. Instead, with this approach that use case is valid and we perform an implicit cast to the wider type. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
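To illustrate the intended behavior of the implicit widening (a sketch, assuming a SparkSession with `spark.implicits._` in scope; not code from the PR):
```
// Comparing array<int> with array<double> should widen both sides to array<double>
// instead of failing analysis.
val df = Seq((Seq(1, 2, 3), Seq(2.0, 5.0))).toDF("ints", "doubles")
df.selectExpr("arrays_overlap(ints, doubles)").show()
// expected result: true, because 2 widens to 2.0, which is present in both arrays
```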
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r185781375 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -378,6 +405,127 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least an element present also in a2. If the arrays have no common element and either of them contains a null element null is returned, false otherwise.", + examples = """ +Examples: + > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); + true + """, since = "2.4.0") +// scalastyle:off line.size.limit +case class ArraysOverlap(left: Expression, right: Expression) + extends BinaryArrayExpressionWithImplicitCast { + + override def dataType: DataType = BooleanType + + override def nullable: Boolean = { +left.nullable || right.nullable || left.dataType.asInstanceOf[ArrayType].containsNull || + right.dataType.asInstanceOf[ArrayType].containsNull + } + + override def nullSafeEval(a1: Any, a2: Any): Any = { +var hasNull = false +val arr1 = a1.asInstanceOf[ArrayData] +val arr2 = a2.asInstanceOf[ArrayData] +if (arr1.numElements() > 0) { + arr1.foreach(elementType, (_, v1) => +if (v1 == null) { + hasNull = true +} else { + arr2.foreach(elementType, (_, v2) => --- End diff -- shall we build a set instead of doing a nested loop? --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r185780243 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -19,14 +19,41 @@ package org.apache.spark.sql.catalyst.expressions import java.util.Comparator import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.analysis.TypeCheckResult +import org.apache.spark.sql.catalyst.analysis.{TypeCheckResult, TypeCoercion} import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData, MapData, TypeUtils} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.Platform import org.apache.spark.unsafe.array.ByteArrayMethods import org.apache.spark.unsafe.types.{ByteArray, UTF8String} +/** + * Base trait for [[BinaryExpression]]s with two arrays of the same element type and implicit + * casting. + */ +trait BinaryArrayExpressionWithImplicitCast extends BinaryExpression + with ImplicitCastInputTypes { + + protected lazy val elementType: DataType = inputTypes.head.asInstanceOf[ArrayType].elementType + + override def inputTypes: Seq[AbstractDataType] = { +TypeCoercion.findWiderTypeForTwo(left.dataType, right.dataType) match { + case Some(arrayType) => Seq(arrayType, arrayType) + case _ => Seq.empty +} + } + + override def checkInputDataTypes(): TypeCheckResult = { +TypeCoercion.findWiderTypeForTwo(left.dataType, right.dataType) match { --- End diff -- We should not call `findWiderTypeForTwo` here. Instead we should directly check if the 2 inputs are array type and have the same element type, so that it's safe to say this expression is resolved. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user mgaido91 commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r185753083 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -378,6 +405,125 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least an element present also in a2. If the arrays have no common element and either of them contains a null element null is returned, false otherwise.", + examples = """ +Examples: + > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); + true + """, since = "2.4.0") +// scalastyle:off line.size.limit +case class ArraysOverlap(left: Expression, right: Expression) --- End diff -- done, thanks! --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user mn-mikke commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r185577745 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -378,6 +405,125 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least an element present also in a2. If the arrays have no common element and either of them contains a null element null is returned, false otherwise.", + examples = """ +Examples: + > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); + true + """, since = "2.4.0") +// scalastyle:off line.size.limit +case class ArraysOverlap(left: Expression, right: Expression) --- End diff -- Shouldn't you override `prettyName` to a value following the conventions? ``` override def prettyName: String = "arrays_overlap" ``` --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user mgaido91 commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r184844407 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -19,14 +19,41 @@ package org.apache.spark.sql.catalyst.expressions import java.util.Comparator import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.analysis.TypeCheckResult +import org.apache.spark.sql.catalyst.analysis.{TypeCheckResult, TypeCoercion} import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData, MapData, TypeUtils} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.Platform import org.apache.spark.unsafe.array.ByteArrayMethods import org.apache.spark.unsafe.types.{ByteArray, UTF8String} +/** + * Base trait for [[BinaryExpression]]s with two arrays of the same element type and implicit + * casting. + */ +trait BinaryArrayExpressionWithImplicitCast extends BinaryExpression --- End diff -- @kiszk you are not wrong, but `Concat` is a very specific case, since it also supports `String`s and `Binary`s, so it would require a specific implementation anyway. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user kiszk commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r184837304 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -19,14 +19,41 @@ package org.apache.spark.sql.catalyst.expressions import java.util.Comparator import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.analysis.TypeCheckResult +import org.apache.spark.sql.catalyst.analysis.{TypeCheckResult, TypeCoercion} import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData, MapData, TypeUtils} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.Platform import org.apache.spark.unsafe.array.ByteArrayMethods import org.apache.spark.unsafe.types.{ByteArray, UTF8String} +/** + * Base trait for [[BinaryExpression]]s with two arrays of the same element type and implicit + * casting. + */ +trait BinaryArrayExpressionWithImplicitCast extends BinaryExpression --- End diff -- As @ueshin pointed out [here](https://github.com/apache/spark/pull/21028#discussion_r184266872), `concat` is also a use case that has a different number of children. Am I wrong? --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user mgaido91 commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r184741553 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -19,14 +19,41 @@ package org.apache.spark.sql.catalyst.expressions import java.util.Comparator import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.analysis.TypeCheckResult +import org.apache.spark.sql.catalyst.analysis.{TypeCheckResult, TypeCoercion} import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData, MapData, TypeUtils} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.Platform import org.apache.spark.unsafe.array.ByteArrayMethods import org.apache.spark.unsafe.types.{ByteArray, UTF8String} +/** + * Base trait for [[BinaryExpression]]s with two arrays of the same element type and implicit + * casting. + */ +trait BinaryArrayExpressionWithImplicitCast extends BinaryExpression --- End diff -- It's possible indeed. Though, as far as I know there is no use case for a function with a different number of children, so I am not sure if it makes sense to generalize it. @cloud-fan @kiszk @ueshin WDYT? --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user mn-mikke commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r184730604 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -19,14 +19,41 @@ package org.apache.spark.sql.catalyst.expressions import java.util.Comparator import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.analysis.TypeCheckResult +import org.apache.spark.sql.catalyst.analysis.{TypeCheckResult, TypeCoercion} import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData, MapData, TypeUtils} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.Platform import org.apache.spark.unsafe.array.ByteArrayMethods import org.apache.spark.unsafe.types.{ByteArray, UTF8String} +/** + * Base trait for [[BinaryExpression]]s with two arrays of the same element type and implicit + * casting. + */ +trait BinaryArrayExpressionWithImplicitCast extends BinaryExpression --- End diff -- The `ImplicitCastInputTypes` trait is able to work with any number of children. Would it be possible to implement this trait to behave in the same way? --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user mgaido91 commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r184706765 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -288,6 +288,114 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least an element present also in a2.", + examples = """ +Examples: + > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); + true + """, since = "2.4.0") +case class ArraysOverlap(left: Expression, right: Expression) + extends BinaryExpression with ImplicitCastInputTypes { + + private lazy val elementType = inputTypes.head.asInstanceOf[ArrayType].elementType + + override def dataType: DataType = BooleanType + + override def inputTypes: Seq[AbstractDataType] = left.dataType match { --- End diff -- implicit type cast is allowed in Presto. I am pushing a proposal for the trait here, let me know what you think about it. Thanks. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user mn-mikke commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r184700686 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -288,6 +288,114 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least an element present also in a2.", + examples = """ +Examples: + > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); + true + """, since = "2.4.0") +case class ArraysOverlap(left: Expression, right: Expression) + extends BinaryExpression with ImplicitCastInputTypes { + + private lazy val elementType = inputTypes.head.asInstanceOf[ArrayType].elementType + + override def dataType: DataType = BooleanType + + override def inputTypes: Seq[AbstractDataType] = left.dataType match { --- End diff -- @mgaido91 Sorry, I should have been more explicit. I've been referring to the below case that I added into `FunctionArgumentConversion` due to enabling type coercion of array types. ``` case c @ Concat(children) if children.forall(c => ArrayType.acceptsType(c.dataType)) && !haveSameType(children) => val types = children.map(_.dataType) findWiderCommonType(types) match { case Some(finalDataType) => Concat(children.map(Cast(_, finalDataType))) case None => c } ``` --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user mgaido91 commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r184683126 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -288,6 +288,114 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least an element present also in a2.", + examples = """ +Examples: + > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); + true + """, since = "2.4.0") +case class ArraysOverlap(left: Expression, right: Expression) + extends BinaryExpression with ImplicitCastInputTypes { + + private lazy val elementType = inputTypes.head.asInstanceOf[ArrayType].elementType + + override def dataType: DataType = BooleanType + + override def inputTypes: Seq[AbstractDataType] = left.dataType match { --- End diff -- @mn-mikke I am not sure, since it is quite a strange case: it also allows string and byte. I am not sure we can do this with implicit type cast. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user mn-mikke commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r184379580 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -288,6 +288,114 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least an element present also in a2.", + examples = """ +Examples: + > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); + true + """, since = "2.4.0") +case class ArraysOverlap(left: Expression, right: Expression) + extends BinaryExpression with ImplicitCastInputTypes { + + private lazy val elementType = inputTypes.head.asInstanceOf[ArrayType].elementType + + override def dataType: DataType = BooleanType + + override def inputTypes: Seq[AbstractDataType] = left.dataType match { --- End diff -- Just a quick note, there is a [dedicated type coercion rule](https://github.com/apache/spark/pull/20858/files#diff-383a8cdd0a9c58cae68e0a79295520a3) for `concat` functions. So if there was the trait you described, we could remove the rule. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r184351841 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -288,6 +288,114 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least an element present also in a2.", + examples = """ +Examples: + > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); + true + """, since = "2.4.0") +case class ArraysOverlap(left: Expression, right: Expression) + extends BinaryExpression with ImplicitCastInputTypes { + + private lazy val elementType = inputTypes.head.asInstanceOf[ArrayType].elementType + + override def dataType: DataType = BooleanType + + override def inputTypes: Seq[AbstractDataType] = left.dataType match { --- End diff -- yea if implicit type cast is not allowed for these functions. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user ueshin commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r184310311 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -288,6 +288,114 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least an element present also in a2.", + examples = """ +Examples: + > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); + true + """, since = "2.4.0") +case class ArraysOverlap(left: Expression, right: Expression) + extends BinaryExpression with ImplicitCastInputTypes { + + private lazy val elementType = inputTypes.head.asInstanceOf[ArrayType].elementType + + override def dataType: DataType = BooleanType + + override def inputTypes: Seq[AbstractDataType] = left.dataType match { --- End diff -- In that case, sounds like `concat` implementation is a good example. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r184305294 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -288,6 +288,114 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least an element present also in a2.", + examples = """ +Examples: + > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); + true + """, since = "2.4.0") +case class ArraysOverlap(left: Expression, right: Expression) + extends BinaryExpression with ImplicitCastInputTypes { + + private lazy val elementType = inputTypes.head.asInstanceOf[ArrayType].elementType + + override def dataType: DataType = BooleanType + + override def inputTypes: Seq[AbstractDataType] = left.dataType match { --- End diff -- We can create a new trait which first makes sure all its children are of array type, and then makes sure all its children have the same type after implicit type cast (make sure other databases also do implicit type cast for these functions). Then update the `TypeCoercion` rules to handle this trait. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
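A rough sketch of how such a `TypeCoercion` case could look, modeled on the `Concat` coercion rule quoted elsewhere in this thread (names and placement are illustrative only, not the rule actually added by the PR):
```
// Hypothetical coercion case for a trait over two array-typed children.
case e: BinaryArrayExpressionWithImplicitCast
    if e.children.forall(c => ArrayType.acceptsType(c.dataType)) &&
      !e.left.dataType.sameType(e.right.dataType) =>
  findWiderTypeForTwo(e.left.dataType, e.right.dataType) match {
    // Cast both children to the wider array type when one exists...
    case Some(widerArrayType) => e.withNewChildren(e.children.map(Cast(_, widerArrayType)))
    // ...otherwise leave the expression as-is so the type check fails later.
    case None => e
  }
```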
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user kiszk commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r184292558 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -288,6 +288,114 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least an element present also in a2.", + examples = """ +Examples: + > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); + true + """, since = "2.4.0") +case class ArraysOverlap(left: Expression, right: Expression) + extends BinaryExpression with ImplicitCastInputTypes { + + private lazy val elementType = inputTypes.head.asInstanceOf[ArrayType].elementType + + override def dataType: DataType = BooleanType + + override def inputTypes: Seq[AbstractDataType] = left.dataType match { --- End diff -- If the common functions for a method that accepts two arrays with the same type are provided as a trait, it would be fine. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user ueshin commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r184266872 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -288,6 +288,114 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least an element present also in a2.", + examples = """ +Examples: + > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); + true + """, since = "2.4.0") +case class ArraysOverlap(left: Expression, right: Expression) + extends BinaryExpression with ImplicitCastInputTypes { + + private lazy val elementType = inputTypes.head.asInstanceOf[ArrayType].elementType + + override def dataType: DataType = BooleanType + + override def inputTypes: Seq[AbstractDataType] = left.dataType match { --- End diff -- There are similar functions, such as `array_union`(#21061), `array_intersect`(#21102), `array_except`(#21103), and maybe `concat`(#20858) which is slightly different though, to handle two (or more) arrays with the same element type. I think we should use the same way to specify and check input types. I'd like to discuss the best way for it here or somewhere else. cc @kiszk @mn-mikke Do you have any suggestions? Also cc @gatorsmile @cloud-fan --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user mgaido91 commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r183085374 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -288,6 +288,114 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least an element present also in a2.", + examples = """ +Examples: + > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); + true + """, since = "2.4.0") +case class ArraysOverlap(left: Expression, right: Expression) + extends BinaryExpression with ImplicitCastInputTypes { + + private lazy val elementType = inputTypes.head.asInstanceOf[ArrayType].elementType + + override def dataType: DataType = BooleanType + + override def inputTypes: Seq[AbstractDataType] = left.dataType match { --- End diff -- no, because in that way we would lose the information about the `elementType` --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user ueshin commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r182682158 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -288,6 +288,114 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least an element present also in a2.", + examples = """ +Examples: + > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); + true + """, since = "2.4.0") +case class ArraysOverlap(left: Expression, right: Expression) + extends BinaryExpression with ImplicitCastInputTypes { + + private lazy val elementType = inputTypes.head.asInstanceOf[ArrayType].elementType + + override def dataType: DataType = BooleanType + + override def inputTypes: Seq[AbstractDataType] = left.dataType match { --- End diff -- `Seq(ArrayType, ArrayType)`? --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user ueshin commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r182688229 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -288,6 +288,114 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least an element present also in a2.", + examples = """ +Examples: + > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); + true + """, since = "2.4.0") +case class ArraysOverlap(left: Expression, right: Expression) + extends BinaryExpression with ImplicitCastInputTypes { + + private lazy val elementType = inputTypes.head.asInstanceOf[ArrayType].elementType + + override def dataType: DataType = BooleanType + + override def inputTypes: Seq[AbstractDataType] = left.dataType match { +case la: ArrayType if la.sameType(right.dataType) => + Seq(la, la) +case _ => Seq.empty + } + + override def checkInputDataTypes(): TypeCheckResult = { +if (!left.dataType.isInstanceOf[ArrayType] || !right.dataType.isInstanceOf[ArrayType] || +!left.dataType.sameType(right.dataType)) { + TypeCheckResult.TypeCheckFailure("Arguments must be arrays with the same element type.") +} else { + TypeCheckResult.TypeCheckSuccess +} + } + + override def nullable: Boolean = { +left.nullable || right.nullable || left.dataType.asInstanceOf[ArrayType].containsNull || + right.dataType.asInstanceOf[ArrayType].containsNull + } + + override def nullSafeEval(a1: Any, a2: Any): Any = { +var hasNull = false +val arr1 = a1.asInstanceOf[ArrayData] +val arr2 = a2.asInstanceOf[ArrayData] +if (arr1.numElements() > 0) { + arr1.foreach(elementType, (_, v1) => +if (v1 == null) { + hasNull = true +} else { + arr2.foreach(elementType, (_, v2) => +if (v2 == null) { + hasNull = true +} else if (v1 == v2) { + return true +} + ) +} + ) +} else { --- End diff -- We can skip if the right is `containsNull == false`? --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user ueshin commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r182689303 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -288,6 +288,114 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least an element present also in a2.", + examples = """ +Examples: + > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); + true + """, since = "2.4.0") +case class ArraysOverlap(left: Expression, right: Expression) + extends BinaryExpression with ImplicitCastInputTypes { + + private lazy val elementType = inputTypes.head.asInstanceOf[ArrayType].elementType + + override def dataType: DataType = BooleanType + + override def inputTypes: Seq[AbstractDataType] = left.dataType match { +case la: ArrayType if la.sameType(right.dataType) => + Seq(la, la) +case _ => Seq.empty + } + + override def checkInputDataTypes(): TypeCheckResult = { +if (!left.dataType.isInstanceOf[ArrayType] || !right.dataType.isInstanceOf[ArrayType] || +!left.dataType.sameType(right.dataType)) { + TypeCheckResult.TypeCheckFailure("Arguments must be arrays with the same element type.") +} else { + TypeCheckResult.TypeCheckSuccess +} + } + + override def nullable: Boolean = { +left.nullable || right.nullable || left.dataType.asInstanceOf[ArrayType].containsNull || + right.dataType.asInstanceOf[ArrayType].containsNull + } + + override def nullSafeEval(a1: Any, a2: Any): Any = { +var hasNull = false +val arr1 = a1.asInstanceOf[ArrayData] +val arr2 = a2.asInstanceOf[ArrayData] +if (arr1.numElements() > 0) { + arr1.foreach(elementType, (_, v1) => +if (v1 == null) { + hasNull = true +} else { + arr2.foreach(elementType, (_, v2) => +if (v2 == null) { + hasNull = true +} else if (v1 == v2) { + return true +} + ) +} + ) +} else { + arr2.foreach(elementType, (_, v) => +if (v == null) { + return null +} + ) +} +if (hasNull) { + null +} else { + false +} + } + + override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { +nullSafeCodeGen(ctx, ev, (a1, a2) => { + val i1 = ctx.freshName("i") + val i2 = ctx.freshName("i") + val getValue1 = CodeGenerator.getValue(a1, elementType, i1) + val getValue2 = CodeGenerator.getValue(a2, elementType, i2) + s""" + |if ($a1.numElements() > 0) { + | for (int $i1 = 0; $i1 < $a1.numElements(); $i1 ++) { + |if ($a1.isNullAt($i1)) { + | ${ev.isNull} = true; + |} else { + | for (int $i2 = 0; $i2 < $a2.numElements(); $i2 ++) { + |if ($a2.isNullAt($i2)) { + | ${ev.isNull} = true; + |} else if (${ctx.genEqual(elementType, getValue1, getValue2)}) { + | ${ev.isNull} = false; + | ${ev.value} = true; + | break; + |} + | } + | if (${ev.value}) { + |break; + | } + |} + | } + |} else { --- End diff -- ditto. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user ueshin commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r182686266 --- Diff: sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala --- @@ -106,6 +106,30 @@ class CollectionExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper checkEvaluation(ArrayContains(a3, Literal.create(null, StringType)), null) } + test("ArraysOverlap") { +val a0 = Literal.create(Seq(1, 2, 3), ArrayType(IntegerType)) +val a1 = Literal.create(Seq(4, 5, 3), ArrayType(IntegerType)) +val a2 = Literal.create(Seq(null, 5, 6), ArrayType(IntegerType)) +val a3 = Literal.create(Seq(7, 8), ArrayType(IntegerType)) +val a4 = Literal.create(Seq.empty[Int], ArrayType(IntegerType)) + +val a5 = Literal.create(Seq[String](null, ""), ArrayType(StringType)) +val a6 = Literal.create(Seq[String]("", "abc"), ArrayType(StringType)) +val a7 = Literal.create(Seq[String]("def", "ghi"), ArrayType(StringType)) + +checkEvaluation(ArraysOverlap(a0, a1), true) +checkEvaluation(ArraysOverlap(a0, a2), null) +checkEvaluation(ArraysOverlap(a1, a2), true) +checkEvaluation(ArraysOverlap(a1, a3), false) +checkEvaluation(ArraysOverlap(a0, a4), false) +checkEvaluation(ArraysOverlap(a2, a4), null) +checkEvaluation(ArraysOverlap(a4, a2), null) + +checkEvaluation(ArraysOverlap(a5, a6), true) +checkEvaluation(ArraysOverlap(a5, a7), null) +checkEvaluation(ArraysOverlap(a6, a7), false) + } --- End diff -- Can you add cases where one of the two arguments is `null`, and a case like `ArraysOverlap(Seq(null), Seq(null))`? --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
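Something along these lines could cover the requested cases (a sketch reusing the literals from the test above; the expected values follow the null semantics described in this PR, and the `java.lang.Integer` trick for a null element is an assumption of the sketch):
```
// One of the two arguments is null => the whole result is null.
val nullArray = Literal.create(null, ArrayType(IntegerType))
checkEvaluation(ArraysOverlap(a0, nullArray), null)
checkEvaluation(ArraysOverlap(nullArray, a1), null)

// Both arrays contain only null elements => no non-null common element, so null.
val aOnlyNull = Literal.create(Seq[Integer](null), ArrayType(IntegerType))
checkEvaluation(ArraysOverlap(aOnlyNull, aOnlyNull), null)
```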
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
Github user ueshin commented on a diff in the pull request: https://github.com/apache/spark/pull/21028#discussion_r182680902 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala --- @@ -288,6 +288,114 @@ case class ArrayContains(left: Expression, right: Expression) override def prettyName: String = "array_contains" } +/** + * Checks if the two arrays contain at least one common element. + */ +@ExpressionDescription( + usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least an element present also in a2.", --- End diff -- Can you add a note for null handling? --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21028: [SPARK-23922][SQL] Add arrays_overlap function
GitHub user mgaido91 opened a pull request: https://github.com/apache/spark/pull/21028 [SPARK-23922][SQL] Add arrays_overlap function ## What changes were proposed in this pull request? The PR adds the function `arrays_overlap`. This function returns `true` if the input arrays contain a non-null common element; if not, it returns `null` if any of the arrays contains a `null` element, `false` otherwise. ## How was this patch tested? added UTs You can merge this pull request into a Git repository by running: $ git pull https://github.com/mgaido91/spark SPARK-23922 Alternatively you can review and apply these changes as the patch at: https://github.com/apache/spark/pull/21028.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #21028 commit e5ebdad41645c0058f1cd2788f6cc1d4158ff2e9 Author: Marco Gaido Date: 2018-04-10T13:49:53Z [SPARK-23922][SQL] Add arrays_overlap function --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
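To make the described null handling concrete, a few illustrative calls (assuming a SparkSession named `spark`; the expected outputs follow the semantics stated above):
```
spark.sql("SELECT arrays_overlap(array(1, 2), array(2, 3))").show()    // true: 2 is a common non-null element
spark.sql("SELECT arrays_overlap(array(1, 2), array(3, 4))").show()    // false: no common element and no nulls
spark.sql("SELECT arrays_overlap(array(1, 2), array(3, null))").show() // null: no common element, but a null is present
```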