This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 0219eb5984f [SPARK-45661][SQL][PYTHON] Add toNullable in StructType, MapType and ArrayType 0219eb5984f is described below commit 0219eb5984f0f4a7209deb091b713ded10aebba3 Author: Hyukjin Kwon <gurwls...@apache.org> AuthorDate: Thu Oct 26 09:30:59 2023 +0900 [SPARK-45661][SQL][PYTHON] Add toNullable in StructType, MapType and ArrayType ### What changes were proposed in this pull request? This PR proposes to add: - `StructType.toNullable` - `MapType.toNullable` - `ArrayType.toNullable` each of which returns a copy of the schema with all nullability fields set to true. ### Why are the changes needed? See https://stackoverflow.com/questions/33193958/change-nullable-property-of-column-in-spark-dataframe as an example of this need. ### Does this PR introduce _any_ user-facing change? Yes, it adds new APIs in both Scala and Python: - `StructType.toNullable` - `MapType.toNullable` - `ArrayType.toNullable` ### How was this patch tested? For Scala, it just adds an alias. For the Python side, doctests were added. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #43523 from HyukjinKwon/SPARK-45661. 
Authored-by: Hyukjin Kwon <gurwls...@apache.org> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- python/pyspark/sql/types.py | 124 +++++++++++++++++++++ .../org/apache/spark/sql/types/ArrayType.scala | 8 ++ .../scala/org/apache/spark/sql/types/MapType.scala | 8 ++ .../org/apache/spark/sql/types/StructType.scala | 8 ++ 4 files changed, 148 insertions(+) diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index 01db75b2500..d6862d7178a 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -139,6 +139,9 @@ class DataType: """ return obj + def _as_nullable(self) -> "DataType": + return self + @classmethod def fromDDL(cls, ddl: str) -> "DataType": """ @@ -593,6 +596,41 @@ class ArrayType(DataType): def simpleString(self) -> str: return "array<%s>" % self.elementType.simpleString() + def _as_nullable(self) -> "ArrayType": + return ArrayType(self.elementType._as_nullable(), containsNull=True) + + def toNullable(self) -> "ArrayType": + """ + Returns the same data type but set all nullability fields are true + (`StructField.nullable`, `ArrayType.containsNull`, and `MapType.valueContainsNull`). + + .. versionadded:: 4.0.0 + + Returns + ------- + :class:`ArrayType` + + Examples + -------- + Example 1: Simple nullability conversion + + >>> ArrayType(IntegerType(), containsNull=False).toNullable() + ArrayType(IntegerType(), True) + + Example 2: Nested nullability conversion + + >>> ArrayType( + ... StructType([ + ... StructField("b", IntegerType(), nullable=False), + ... StructField("c", ArrayType(IntegerType(), containsNull=False)) + ... ]), + ... containsNull=False + ... 
).toNullable() + ArrayType(StructType([StructField('b', IntegerType(), True), + StructField('c', ArrayType(IntegerType(), True), True)]), True) + """ + return self._as_nullable() + def __repr__(self) -> str: return "ArrayType(%s, %s)" % (self.elementType, str(self.containsNull)) @@ -671,6 +709,44 @@ class MapType(DataType): def simpleString(self) -> str: return "map<%s,%s>" % (self.keyType.simpleString(), self.valueType.simpleString()) + def _as_nullable(self) -> "MapType": + return MapType( + self.keyType._as_nullable(), self.valueType._as_nullable(), valueContainsNull=True + ) + + def toNullable(self) -> "MapType": + """ + Returns the same data type but set all nullability fields are true + (`StructField.nullable`, `ArrayType.containsNull`, and `MapType.valueContainsNull`). + + .. versionadded:: 4.0.0 + + Returns + ------- + :class:`MapType` + + Examples + -------- + Example 1: Simple nullability conversion + + >>> MapType(IntegerType(), StringType(), valueContainsNull=False).toNullable() + MapType(IntegerType(), StringType(), True) + + Example 2: Nested nullability conversion + + >>> MapType( + ... StringType(), + ... MapType( + ... IntegerType(), + ... ArrayType(IntegerType(), containsNull=False), + ... valueContainsNull=False + ... ), + ... valueContainsNull=False + ... 
).toNullable() + MapType(StringType(), MapType(IntegerType(), ArrayType(IntegerType(), True), True), True) + """ + return self._as_nullable() + def __repr__(self) -> str: return "MapType(%s, %s, %s)" % (self.keyType, self.valueType, str(self.valueContainsNull)) @@ -978,6 +1054,54 @@ class StructType(DataType): def simpleString(self) -> str: return "struct<%s>" % (",".join(f.simpleString() for f in self)) + def _as_nullable(self) -> "StructType": + fields = [] + for field in self.fields: + fields.append( + StructField( + field.name, + field.dataType._as_nullable(), + nullable=True, + metadata=field.metadata, + ) + ) + return StructType(fields) + + def toNullable(self) -> "StructType": + """ + Returns the same data type but set all nullability fields are true + (`StructField.nullable`, `ArrayType.containsNull`, and `MapType.valueContainsNull`). + + .. versionadded:: 4.0.0 + + Returns + ------- + :class:`StructType` + + Examples + -------- + Example 1: Simple nullability conversion + + >>> StructType([StructField("a", IntegerType(), nullable=False)]).toNullable() + StructType([StructField('a', IntegerType(), True)]) + + Example 2: Nested nullability conversion + + >>> StructType([ + ... StructField("a", + ... StructType([ + ... StructField("b", IntegerType(), nullable=False), + ... StructField("c", StructType([ + ... StructField("d", IntegerType(), nullable=False) + ... ])) + ... ]), + ... nullable=False) + ... 
]).toNullable() + StructType([StructField('a', StructType([StructField('b', IntegerType(), True), + StructField('c', StructType([StructField('d', IntegerType(), True)]), True)]), True)]) + """ + return self._as_nullable() + def __repr__(self) -> str: return "StructType([%s])" % ", ".join(str(field) for field in self) diff --git a/sql/api/src/main/scala/org/apache/spark/sql/types/ArrayType.scala b/sql/api/src/main/scala/org/apache/spark/sql/types/ArrayType.scala index a5226870097..e5af472d90e 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/types/ArrayType.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/types/ArrayType.scala @@ -96,6 +96,14 @@ case class ArrayType(elementType: DataType, containsNull: Boolean) extends DataT override private[spark] def asNullable: ArrayType = ArrayType(elementType.asNullable, containsNull = true) + /** + * Returns the same data type but set all nullability fields are true + * (`StructField.nullable`, `ArrayType.containsNull`, and `MapType.valueContainsNull`). + * + * @since 4.0.0 + */ + def toNullable: ArrayType = asNullable + override private[spark] def existsRecursively(f: (DataType) => Boolean): Boolean = { f(this) || elementType.existsRecursively(f) } diff --git a/sql/api/src/main/scala/org/apache/spark/sql/types/MapType.scala b/sql/api/src/main/scala/org/apache/spark/sql/types/MapType.scala index ce0c76dbe4f..dba870466fc 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/types/MapType.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/types/MapType.scala @@ -76,6 +76,14 @@ case class MapType( override private[spark] def asNullable: MapType = MapType(keyType.asNullable, valueType.asNullable, valueContainsNull = true) + /** + * Returns the same data type but set all nullability fields are true + * (`StructField.nullable`, `ArrayType.containsNull`, and `MapType.valueContainsNull`). 
+ * + * @since 4.0.0 + */ + def toNullable: MapType = asNullable + override private[spark] def existsRecursively(f: (DataType) => Boolean): Boolean = { f(this) || keyType.existsRecursively(f) || valueType.existsRecursively(f) } diff --git a/sql/api/src/main/scala/org/apache/spark/sql/types/StructType.scala b/sql/api/src/main/scala/org/apache/spark/sql/types/StructType.scala index f1771d933bb..5fe6b0a5f00 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/types/StructType.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/types/StructType.scala @@ -490,6 +490,14 @@ case class StructType(fields: Array[StructField]) extends DataType with Seq[Stru StructType(newFields) } + /** + * Returns the same data type but set all nullability fields are true + * (`StructField.nullable`, `ArrayType.containsNull`, and `MapType.valueContainsNull`). + * + * @since 4.0.0 + */ + def toNullable: StructType = asNullable + override private[spark] def existsRecursively(f: (DataType) => Boolean): Boolean = { f(this) || fields.exists(field => field.dataType.existsRecursively(f)) } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org