[
https://issues.apache.org/jira/browse/SPARK-40413?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Hyukjin Kwon resolved SPARK-40413.
----------------------------------
Resolution: Invalid
> Column.isin produces non-boolean results
> ----------------------------------------
>
> Key: SPARK-40413
> URL: https://issues.apache.org/jira/browse/SPARK-40413
> Project: Spark
> Issue Type: Bug
> Components: SQL
> Affects Versions: 3.3.0
> Reporter: Andreas Franz
> Priority: Major
>
> I observed an inconsistent behaviour using the Column.isin function. The
> [documentation|https://spark.apache.org/docs/latest/api/scala/org/apache/spark/sql/Column.html#isin(list:Any*):org.apache.spark.sql.Column]
> states that an "up-cast" takes place when different data types are
> involved. When working with _null_ values the results are confusing to me.
> I prepared a small example demonstrating the issue
> {code:java}
> package example
> import org.apache.spark.sql.{Row, SparkSession}
> import org.apache.spark.sql.types.{StringType, StructField, StructType}
> import org.apache.spark.sql.functions._
> object Test {
> def main(args: Array[String]): Unit = {
> val spark = SparkSession.builder()
> .appName("App")
> .master("local[*]")
> .config("spark.driver.host", "localhost")
> .config("spark.ui.enabled", "false")
> .getOrCreate()
> val schema = StructType(
> Array(
> StructField("name", StringType, nullable = true)
> )
> )
> val data = Seq(
> Row("a"),
> Row("b"),
> Row("c"),
> Row(""),
> Row(null)
> ).toList
> val list1 = Array("a", "d", "")
> val list2 = Array("a", "d", "", null)
> val dataFrame =
> spark.createDataFrame(spark.sparkContext.parallelize(data), schema)
> dataFrame
> .withColumn("name_is_in_list_1", col("name").isin(list1: _*))
> .show(10, truncate = false)
> /*
> +----+-----------------+
> |name|name_is_in_list_1|
> +----+-----------------+
> |a |true |
> |b |false |
> |c |false |
> | |true |
> |null|null | // check value null is not contained in
> list1, why is null returned here? Expected result: false
> +----+-----------------+
> */
> dataFrame
> .withColumn("name_is_in_list_2", col("name").isin(list2: _*))
> .show(10, truncate = false)
> /*
> +----+-----------------+
> |name|name_is_in_list_2|
> +----+-----------------+
> |a |true |
> |b |null | // check value "b" is not contained in
> list2, why is null returned here? Expected result: false
> |c |null | // check value "c" is not contained in
> list2, why is null returned here? Expected result: false
> | |true |
> |null|null | // check value null is in list2, why is
> null returned here? Expected result: true
> +----+-----------------+
> */
> val data2 = Seq(
> Row("a"),
> Row("b"),
> Row("c"),
> Row(""),
> ).toList
> val dataFrame2 =
> spark.createDataFrame(spark.sparkContext.parallelize(data2), schema)
> dataFrame2
> .withColumn("name_is_in_list_2", col("name").isin(list2: _*))
> .show(10, truncate = false)
>
> /*
> +----+-----------------+
> |name|name_is_in_list_2|
> +----+-----------------+
> |a |true |
> |b |null | // check value "b" is not contained in
> list2, why is null returned here? Expected result: false
> |c |null | // check value "c" is not contained in
> list2, why is null returned here? Expected result: false
> | |true |
> +----+-----------------+
> */
> }
> }{code}
>
--
This message was sent by Atlassian Jira
(v8.20.10#820010)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]