[ 
https://issues.apache.org/jira/browse/SPARK-40413?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Andreas Franz updated SPARK-40413:
----------------------------------
    Description: 
I observed an inconsistent behaviour using the Column.isin function. The 
[documentation|https://spark.apache.org/docs/latest/api/scala/org/apache/spark/sql/Column.html#isin(list:Any*):org.apache.spark.sql.Column]
 states that an "up-cast" takes places when different data types are involved. 
When working with _null_ values the results are confusing to me.

I prepared a small example demonstrating the issue
{code:java}
package example

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.functions._

object Test {


    def main(args: Array[String]): Unit = {

        val spark = SparkSession.builder()
            .appName("App")
            .master("local[*]")
            .config("spark.driver.host", "localhost")
            .config("spark.ui.enabled", "false")
            .getOrCreate()

        val schema = StructType(
            Array(
                StructField("name", StringType, nullable = true)
            )
        )

        val data = Seq(
            Row("a"),
            Row("b"),
            Row("c"),
            Row(""),
            Row(null)
        ).toList

        val list1 = Array("a", "d", "")
        val list2 = Array("a", "d", "", null)

        val dataFrame = 
spark.createDataFrame(spark.sparkContext.parallelize(data), schema)

        dataFrame
            .withColumn("name_is_in_list_1", col("name").isin(list1: _*))
            .show(10, truncate = false)

        /*
            +----+-----------------+
            |name|name_is_in_list_1|
            +----+-----------------+
            |a   |true             |
            |b   |false            |
            |c   |false            |
            |    |true             |
            |null|null             | // check value null is not contained in 
list1, why is null returned here? Expected result: false
            +----+-----------------+
         */

        dataFrame
            .withColumn("name_is_in_list_2", col("name").isin(list2: _*))
            .show(10, truncate = false)

        /*
            +----+-----------------+
            |name|name_is_in_list_2|
            +----+-----------------+
            |a   |true             |
            |b   |null             | // check value "b" is not contained in 
list1, why is null returned here? Expected result: false
            |c   |null             | // check value "c" is not contained in 
list1, why is null returned here? Expected result: false
            |    |true             |
            |null|null             | // check value null is in list1, why is 
null returned here? Expected result: true
            +----+-----------------+
         */


        val data2 = Seq(
            Row("a"),
            Row("b"),
            Row("c"),
            Row(""),
        ).toList

        val dataFrame2 = 
spark.createDataFrame(spark.sparkContext.parallelize(data2), schema)

        dataFrame2
            .withColumn("name_is_in_list_2", col("name").isin(list2: _*))
            .show(10, truncate = false)
        
        /*
            +----+-----------------+
            |name|name_is_in_list_2|
            +----+-----------------+
            |a   |true             |
            |b   |null             | // check value "b" is not contained in 
list2, why is null returned here? Expected result: false
            |c   |null             | // check value "b" is not contained in 
list2, why is null returned here? Expected result: false
            |    |true             |
            +----+-----------------+
         */
    }
}{code}
 

  was:
I observed an inconsistent behaviour using the Column.isin function. The 
[documentation|https://spark.apache.org/docs/latest/api/scala/org/apache/spark/sql/Column.html#isin(list:Any*):org.apache.spark.sql.Column]
 states that an "up-cast" takes places when different data types are involved. 
When working with _null_ values the results are confusing to me.

I prepared a small example demonstrating the issue
{code:java}
package example

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.functions._

object Test {


    def main(args: Array[String]): Unit = {

        val spark = SparkSession.builder()
            .appName("App")
            .master("local[*]")
            .config("spark.driver.host", "localhost")
            .config("spark.ui.enabled", "false")
            .getOrCreate()

        val schema = StructType(
            Array(
                StructField("name", StringType, nullable = true)
            )
        )

        val data = Seq(
            Row("a"),
            Row("b"),
            Row("c"),
            Row(""),
            Row(null)
        ).toList

        val list1 = Array("a", "d", "")
        val list2 = Array("a", "d", "", null)

        val dataFrame = 
spark.createDataFrame(spark.sparkContext.parallelize(data), schema)

        dataFrame
            .withColumn("name_is_in_list_1", col("name").isin(list1: _*))
            .show(10, truncate = false)

        /*
            +----+-----------------+
            |name|name_is_in_list_1|
            +----+-----------------+
            |a   |true             |
            |b   |false            |
            |c   |false            |
            |    |true             |
            |null|null             | // check value null is not contained in 
list1, why is null returned here? Expected result: false
            +----+-----------------+
         */

        dataFrame
            .withColumn("name_is_in_list_2", col("name").isin(list2: _*))
            .show(10, truncate = false)

        /*
            +----+-----------------+
            |name|name_is_in_list_2|
            +----+-----------------+
            |a   |true             |
            |b   |null             | // check value "b" is not contained in 
list1, why is null returned here? Expected result: false
            |c   |null             | // check value "c" is not contained in 
list1, why is null returned here? Expected result: false
            |    |true             |
            |null|null             | // check value null is in list1, why is 
null returned here? Expected result: true
            +----+-----------------+
         */


        val data2 = Seq(
            Row("a"),
            Row("b"),
            Row("c"),
            Row(""),
        ).toList

        val dataFrame2 = 
spark.createDataFrame(spark.sparkContext.parallelize(data2), schema)

        dataFrame2
            .withColumn("name_is_in_list_2", col("name").isin(list2: _*))
            .show(10, truncate = false)
        
        /*
            +----+-----------------+
            |name|name_is_in_list_2|
            +----+-----------------+
            |a   |true             |
            |b   |null             | // check value "b" is not contained in 
list1, why is null returned here? Expected result: false
            |c   |null             | // check value "b" is not contained in 
list1, why is null returned here? Expected result: false
            |    |true             |
            +----+-----------------+
         */
    }
}{code}
 


> Column.isin produces non-boolean results
> ----------------------------------------
>
>                 Key: SPARK-40413
>                 URL: https://issues.apache.org/jira/browse/SPARK-40413
>             Project: Spark
>          Issue Type: Bug
>          Components: SQL
>    Affects Versions: 3.3.0
>            Reporter: Andreas Franz
>            Priority: Major
>
> I observed an inconsistent behaviour using the Column.isin function. The 
> [documentation|https://spark.apache.org/docs/latest/api/scala/org/apache/spark/sql/Column.html#isin(list:Any*):org.apache.spark.sql.Column]
>  states that an "up-cast" takes places when different data types are 
> involved. When working with _null_ values the results are confusing to me.
> I prepared a small example demonstrating the issue
> {code:java}
> package example
> import org.apache.spark.sql.{Row, SparkSession}
> import org.apache.spark.sql.types.{StringType, StructField, StructType}
> import org.apache.spark.sql.functions._
> object Test {
>     def main(args: Array[String]): Unit = {
>         val spark = SparkSession.builder()
>             .appName("App")
>             .master("local[*]")
>             .config("spark.driver.host", "localhost")
>             .config("spark.ui.enabled", "false")
>             .getOrCreate()
>         val schema = StructType(
>             Array(
>                 StructField("name", StringType, nullable = true)
>             )
>         )
>         val data = Seq(
>             Row("a"),
>             Row("b"),
>             Row("c"),
>             Row(""),
>             Row(null)
>         ).toList
>         val list1 = Array("a", "d", "")
>         val list2 = Array("a", "d", "", null)
>         val dataFrame = 
> spark.createDataFrame(spark.sparkContext.parallelize(data), schema)
>         dataFrame
>             .withColumn("name_is_in_list_1", col("name").isin(list1: _*))
>             .show(10, truncate = false)
>         /*
>             +----+-----------------+
>             |name|name_is_in_list_1|
>             +----+-----------------+
>             |a   |true             |
>             |b   |false            |
>             |c   |false            |
>             |    |true             |
>             |null|null             | // check value null is not contained in 
> list1, why is null returned here? Expected result: false
>             +----+-----------------+
>          */
>         dataFrame
>             .withColumn("name_is_in_list_2", col("name").isin(list2: _*))
>             .show(10, truncate = false)
>         /*
>             +----+-----------------+
>             |name|name_is_in_list_2|
>             +----+-----------------+
>             |a   |true             |
>             |b   |null             | // check value "b" is not contained in 
> list1, why is null returned here? Expected result: false
>             |c   |null             | // check value "c" is not contained in 
> list1, why is null returned here? Expected result: false
>             |    |true             |
>             |null|null             | // check value null is in list1, why is 
> null returned here? Expected result: true
>             +----+-----------------+
>          */
>         val data2 = Seq(
>             Row("a"),
>             Row("b"),
>             Row("c"),
>             Row(""),
>         ).toList
>         val dataFrame2 = 
> spark.createDataFrame(spark.sparkContext.parallelize(data2), schema)
>         dataFrame2
>             .withColumn("name_is_in_list_2", col("name").isin(list2: _*))
>             .show(10, truncate = false)
>         
>         /*
>             +----+-----------------+
>             |name|name_is_in_list_2|
>             +----+-----------------+
>             |a   |true             |
>             |b   |null             | // check value "b" is not contained in 
> list2, why is null returned here? Expected result: false
>             |c   |null             | // check value "b" is not contained in 
> list2, why is null returned here? Expected result: false
>             |    |true             |
>             +----+-----------------+
>          */
>     }
> }{code}
>  



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org
For additional commands, e-mail: issues-h...@spark.apache.org

Reply via email to