[ 
https://issues.apache.org/jira/browse/SPARK-27685?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Huon Wilson updated SPARK-27685:
--------------------------------
    Description: 
When doing a {{union}} of two dataframes, a column that is nullable in one of 
the dataframes will be nullable in the union, promoting the non-nullable one to 
be nullable. 

This doesn't happen properly for columns nested as subcolumns of a {{struct}}. 
It seems to just take the nullability of the first dataframe in the union, 
meaning a nullable column will become non-nullable, resulting in invalid values.

{code:scala}
case class X(x: Option[Long])
case class Nested(nested: X)

// First, just work with normal columns
val df1 = Seq(1L, 2L).toDF("x")
val df2 = Seq(Some(3L), None).toDF("x")

df1.printSchema
// root
//  |-- x: long (nullable = false)

df2.printSchema
// root
//  |-- x: long (nullable = true)

(df1 union df2).printSchema
// root
//  |-- x: long (nullable = true)

(df1 union df2).as[X].collect
// res19: Array[X] = Array(X(Some(1)), X(Some(2)), X(Some(3)), X(None))

(df1 union df2).select("*").show
// +----+
// |   x|
// +----+
// |   1|
// |   2|
// |   3|
// |null|
// +----+

// Now, the same with the 'x' column within a struct:

val struct1 = df1.select(struct('x) as "nested")
val struct2 = df2.select(struct('x) as "nested")

struct1.printSchema
// root
//  |-- nested: struct (nullable = false)
//  |    |-- x: long (nullable = false)

struct2.printSchema
// root
//  |-- nested: struct (nullable = false)
//  |    |-- x: long (nullable = true)

// BAD: the x column is not nullable
(struct1 union struct2).printSchema
// root
//  |-- nested: struct (nullable = false)
//  |    |-- x: long (nullable = false)

// BAD: the last x value became "Some(0)", instead of "None"
(struct1 union struct2).as[Nested].collect
// res23: Array[Nested] = Array(Nested(X(Some(1))), Nested(X(Some(2))), Nested(X(Some(3))), Nested(X(Some(0))))

// BAD: showing just the nested columns throws an NPE
(struct1 union struct2).select("nested.*").show
// java.lang.NullPointerException
//  at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificSafeProjection.apply(Unknown Source)
//  at org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$collectFromPlan$1.apply(Dataset.scala:3387)
// ...
//  at org.apache.spark.sql.Dataset.show(Dataset.scala:714)
//  ... 49 elided


// Flipping the order makes x nullable as desired
(struct2 union struct1).printSchema
// root
//  |-- nested: struct (nullable = false)
//  |    |-- x: long (nullable = true)
(struct2 union struct1).as[Nested].collect
// res26: Array[Nested] = Array(Nested(X(Some(3))), Nested(X(None)), Nested(X(Some(1))), Nested(X(Some(2))))

(struct2 union struct1).select("nested.*").show
// +----+
// |   x|
// +----+
// |   3|
// |null|
// |   1|
// |   2|
// +----+
{code}

Note the three {{BAD}} lines, where the union of structs became non-nullable 
and resulted in invalid values and exceptions.

  was:
When doing a {{union}} of two dataframes, a column that is nullable in one of 
the dataframes will be nullable in the union, promoting the non-nullable one to 
be nullable. 

This doesn't happen properly for columns nested as subcolumns of a {{struct}}. 
It seems to just take the nullability of the first dataframe in the union, 
meaning a nullable column will become non-nullable, resulting in invalid values.

{code:scala}
case class X(x: Option[Long])
case class Nested(nested: X)

// First, just work with normal columns
val df1 = Seq(1L, 2L).toDF("x")
val df2 = Seq(Some(3L), None).toDF("x")

df1.printSchema
// root
//  |-- x: long (nullable = false)

df2.printSchema
// root
//  |-- x: long (nullable = true)

(df1 union df2).printSchema
// root
//  |-- x: long (nullable = true)

(df1 union df2).as[X].collect
// res19: Array[X] = Array(X(Some(1)), X(Some(2)), X(Some(3)), X(None))

// Now, the same with the 'x' column within a struct:

val struct1 = df1.select(struct('x) as "nested")
val struct2 = df2.select(struct('x) as "nested")

struct1.printSchema
// root
//  |-- nested: struct (nullable = false)
//  |    |-- x: long (nullable = false)

struct2.printSchema
// root
//  |-- nested: struct (nullable = false)
//  |    |-- x: long (nullable = true)

// BAD: the x column is not nullable
(struct1 union struct2).printSchema
// root
//  |-- nested: struct (nullable = false)
//  |    |-- x: long (nullable = false)

// BAD: the last x value became "Some(0)", instead of "None"
(struct1 union struct2).as[Nested].collect
// res23: Array[Nested] = Array(Nested(X(Some(1))), Nested(X(Some(2))), 
Nested(X(Some(3))), Nested(X(Some(0))))

// Flipping the order makes x nullable as desired
(struct2 union struct1).printSchema
// root
//  |-- nested: struct (nullable = false)
//  |    |-- x: long (nullable = true)
(struct2 union struct1).as[Y].collect
// res26: Array[Y] = Array(Y(X(Some(3))), Y(X(None)), Y(X(Some(1))), 
Y(X(Some(2))))
{code}

Note the two {{BAD}} lines, where the union of structs became non-nullable and 
resulted in invalid values.


> `union` doesn't promote non-nullable columns of struct to nullable
> ------------------------------------------------------------------
>
>                 Key: SPARK-27685
>                 URL: https://issues.apache.org/jira/browse/SPARK-27685
>             Project: Spark
>          Issue Type: Bug
>          Components: SQL
>    Affects Versions: 2.4.0
>            Reporter: Huon Wilson
>            Priority: Major
>
> When doing a {{union}} of two dataframes, a column that is nullable in one of 
> the dataframes will be nullable in the union, promoting the non-nullable one 
> to be nullable. 
> This doesn't happen properly for columns nested as subcolumns of a 
> {{struct}}. It seems to just take the nullability of the first dataframe in 
> the union, meaning a nullable column will become non-nullable, resulting in 
> invalid values.
> {code:scala}
> case class X(x: Option[Long])
> case class Nested(nested: X)
> // First, just work with normal columns
> val df1 = Seq(1L, 2L).toDF("x")
> val df2 = Seq(Some(3L), None).toDF("x")
> df1.printSchema
> // root
> //  |-- x: long (nullable = false)
> df2.printSchema
> // root
> //  |-- x: long (nullable = true)
> (df1 union df2).printSchema
> // root
> //  |-- x: long (nullable = true)
> (df1 union df2).as[X].collect
> // res19: Array[X] = Array(X(Some(1)), X(Some(2)), X(Some(3)), X(None))
> (df1 union df2).select("*").show
> // +----+
> // |   x|
> // +----+
> // |   1|
> // |   2|
> // |   3|
> // |null|
> // +----+
> // Now, the same with the 'x' column within a struct:
> val struct1 = df1.select(struct('x) as "nested")
> val struct2 = df2.select(struct('x) as "nested")
> struct1.printSchema
> // root
> //  |-- nested: struct (nullable = false)
> //  |    |-- x: long (nullable = false)
> struct2.printSchema
> // root
> //  |-- nested: struct (nullable = false)
> //  |    |-- x: long (nullable = true)
> // BAD: the x column is not nullable
> (struct1 union struct2).printSchema
> // root
> //  |-- nested: struct (nullable = false)
> //  |    |-- x: long (nullable = false)
> // BAD: the last x value became "Some(0)", instead of "None"
> (struct1 union struct2).as[Nested].collect
> // res23: Array[Nested] = Array(Nested(X(Some(1))), Nested(X(Some(2))), Nested(X(Some(3))), Nested(X(Some(0))))
> // BAD: showing just the nested columns throws an NPE
> (struct1 union struct2).select("nested.*").show
> // java.lang.NullPointerException
> //  at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificSafeProjection.apply(Unknown Source)
> //  at org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$collectFromPlan$1.apply(Dataset.scala:3387)
> // ...
> //  at org.apache.spark.sql.Dataset.show(Dataset.scala:714)
> //  ... 49 elided
> // Flipping the order makes x nullable as desired
> (struct2 union struct1).printSchema
> // root
> //  |-- nested: struct (nullable = false)
> //  |    |-- x: long (nullable = true)
> (struct2 union struct1).as[Nested].collect
> // res26: Array[Nested] = Array(Nested(X(Some(3))), Nested(X(None)), Nested(X(Some(1))), Nested(X(Some(2))))
> (struct2 union struct1).select("nested.*").show
> // +----+
> // |   x|
> // +----+
> // |   3|
> // |null|
> // |   1|
> // |   2|
> // +----+
> {code}
> Note the three {{BAD}} lines, where the union of structs became non-nullable 
> and resulted in invalid values and exceptions.



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org
For additional commands, e-mail: issues-h...@spark.apache.org

Reply via email to