This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 0219eb5984f [SPARK-45661][SQL][PYTHON] Add toNullable in StructType,
MapType and ArrayType
0219eb5984f is described below
commit 0219eb5984f0f4a7209deb091b713ded10aebba3
Author: Hyukjin Kwon <[email protected]>
AuthorDate: Thu Oct 26 09:30:59 2023 +0900
[SPARK-45661][SQL][PYTHON] Add toNullable in StructType, MapType and
ArrayType
### What changes were proposed in this pull request?
This PR proposes to add:
- `StructType.toNullable`
- `MapType.toNullable`
- `ArrayType.toNullable`
each of which returns the same schema with all nullability fields set to true.
### Why are the changes needed?
See
https://stackoverflow.com/questions/33193958/change-nullable-property-of-column-in-spark-dataframe
as an example.
### Does this PR introduce _any_ user-facing change?
Yes, it adds new APIs in both Scala and Python:
- `StructType.toNullable`
- `MapType.toNullable`
- `ArrayType.toNullable`
### How was this patch tested?
For Scala, it just adds a public alias (`toNullable`) for the existing `asNullable`.
For the Python side, doctests were added.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #43523 from HyukjinKwon/SPARK-45661.
Authored-by: Hyukjin Kwon <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
python/pyspark/sql/types.py | 124 +++++++++++++++++++++
.../org/apache/spark/sql/types/ArrayType.scala | 8 ++
.../scala/org/apache/spark/sql/types/MapType.scala | 8 ++
.../org/apache/spark/sql/types/StructType.scala | 8 ++
4 files changed, 148 insertions(+)
diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
index 01db75b2500..d6862d7178a 100644
--- a/python/pyspark/sql/types.py
+++ b/python/pyspark/sql/types.py
@@ -139,6 +139,9 @@ class DataType:
"""
return obj
+ def _as_nullable(self) -> "DataType":
+ return self
+
@classmethod
def fromDDL(cls, ddl: str) -> "DataType":
"""
@@ -593,6 +596,41 @@ class ArrayType(DataType):
def simpleString(self) -> str:
return "array<%s>" % self.elementType.simpleString()
+ def _as_nullable(self) -> "ArrayType":
+ return ArrayType(self.elementType._as_nullable(), containsNull=True)
+
+ def toNullable(self) -> "ArrayType":
+ """
+ Returns the same data type but set all nullability fields are true
+ (`StructField.nullable`, `ArrayType.containsNull`, and
`MapType.valueContainsNull`).
+
+ .. versionadded:: 4.0.0
+
+ Returns
+ -------
+ :class:`ArrayType`
+
+ Examples
+ --------
+ Example 1: Simple nullability conversion
+
+ >>> ArrayType(IntegerType(), containsNull=False).toNullable()
+ ArrayType(IntegerType(), True)
+
+ Example 2: Nested nullability conversion
+
+ >>> ArrayType(
+ ... StructType([
+ ... StructField("b", IntegerType(), nullable=False),
+ ... StructField("c", ArrayType(IntegerType(),
containsNull=False))
+ ... ]),
+ ... containsNull=False
+ ... ).toNullable()
+ ArrayType(StructType([StructField('b', IntegerType(), True),
+ StructField('c', ArrayType(IntegerType(), True), True)]), True)
+ """
+ return self._as_nullable()
+
def __repr__(self) -> str:
return "ArrayType(%s, %s)" % (self.elementType, str(self.containsNull))
@@ -671,6 +709,44 @@ class MapType(DataType):
def simpleString(self) -> str:
return "map<%s,%s>" % (self.keyType.simpleString(),
self.valueType.simpleString())
+ def _as_nullable(self) -> "MapType":
+ return MapType(
+ self.keyType._as_nullable(), self.valueType._as_nullable(),
valueContainsNull=True
+ )
+
+ def toNullable(self) -> "MapType":
+ """
+ Returns the same data type but set all nullability fields are true
+ (`StructField.nullable`, `ArrayType.containsNull`, and
`MapType.valueContainsNull`).
+
+ .. versionadded:: 4.0.0
+
+ Returns
+ -------
+ :class:`MapType`
+
+ Examples
+ --------
+ Example 1: Simple nullability conversion
+
+ >>> MapType(IntegerType(), StringType(),
valueContainsNull=False).toNullable()
+ MapType(IntegerType(), StringType(), True)
+
+ Example 2: Nested nullability conversion
+
+ >>> MapType(
+ ... StringType(),
+ ... MapType(
+ ... IntegerType(),
+ ... ArrayType(IntegerType(), containsNull=False),
+ ... valueContainsNull=False
+ ... ),
+ ... valueContainsNull=False
+ ... ).toNullable()
+ MapType(StringType(), MapType(IntegerType(), ArrayType(IntegerType(),
True), True), True)
+ """
+ return self._as_nullable()
+
def __repr__(self) -> str:
return "MapType(%s, %s, %s)" % (self.keyType, self.valueType,
str(self.valueContainsNull))
@@ -978,6 +1054,54 @@ class StructType(DataType):
def simpleString(self) -> str:
return "struct<%s>" % (",".join(f.simpleString() for f in self))
+ def _as_nullable(self) -> "StructType":
+ fields = []
+ for field in self.fields:
+ fields.append(
+ StructField(
+ field.name,
+ field.dataType._as_nullable(),
+ nullable=True,
+ metadata=field.metadata,
+ )
+ )
+ return StructType(fields)
+
+ def toNullable(self) -> "StructType":
+ """
+ Returns the same data type but set all nullability fields are true
+ (`StructField.nullable`, `ArrayType.containsNull`, and
`MapType.valueContainsNull`).
+
+ .. versionadded:: 4.0.0
+
+ Returns
+ -------
+ :class:`StructType`
+
+ Examples
+ --------
+ Example 1: Simple nullability conversion
+
+ >>> StructType([StructField("a", IntegerType(),
nullable=False)]).toNullable()
+ StructType([StructField('a', IntegerType(), True)])
+
+ Example 2: Nested nullability conversion
+
+ >>> StructType([
+ ... StructField("a",
+ ... StructType([
+ ... StructField("b", IntegerType(), nullable=False),
+ ... StructField("c", StructType([
+ ... StructField("d", IntegerType(), nullable=False)
+ ... ]))
+ ... ]),
+ ... nullable=False)
+ ... ]).toNullable()
+ StructType([StructField('a', StructType([StructField('b',
IntegerType(), True),
+ StructField('c', StructType([StructField('d', IntegerType(), True)]),
True)]), True)])
+ """
+ return self._as_nullable()
+
def __repr__(self) -> str:
return "StructType([%s])" % ", ".join(str(field) for field in self)
diff --git a/sql/api/src/main/scala/org/apache/spark/sql/types/ArrayType.scala
b/sql/api/src/main/scala/org/apache/spark/sql/types/ArrayType.scala
index a5226870097..e5af472d90e 100644
--- a/sql/api/src/main/scala/org/apache/spark/sql/types/ArrayType.scala
+++ b/sql/api/src/main/scala/org/apache/spark/sql/types/ArrayType.scala
@@ -96,6 +96,14 @@ case class ArrayType(elementType: DataType, containsNull:
Boolean) extends DataT
override private[spark] def asNullable: ArrayType =
ArrayType(elementType.asNullable, containsNull = true)
+ /**
+ * Returns the same data type but set all nullability fields are true
+ * (`StructField.nullable`, `ArrayType.containsNull`, and
`MapType.valueContainsNull`).
+ *
+ * @since 4.0.0
+ */
+ def toNullable: ArrayType = asNullable
+
override private[spark] def existsRecursively(f: (DataType) => Boolean):
Boolean = {
f(this) || elementType.existsRecursively(f)
}
diff --git a/sql/api/src/main/scala/org/apache/spark/sql/types/MapType.scala
b/sql/api/src/main/scala/org/apache/spark/sql/types/MapType.scala
index ce0c76dbe4f..dba870466fc 100644
--- a/sql/api/src/main/scala/org/apache/spark/sql/types/MapType.scala
+++ b/sql/api/src/main/scala/org/apache/spark/sql/types/MapType.scala
@@ -76,6 +76,14 @@ case class MapType(
override private[spark] def asNullable: MapType =
MapType(keyType.asNullable, valueType.asNullable, valueContainsNull = true)
+ /**
+ * Returns the same data type but set all nullability fields are true
+ * (`StructField.nullable`, `ArrayType.containsNull`, and
`MapType.valueContainsNull`).
+ *
+ * @since 4.0.0
+ */
+ def toNullable: MapType = asNullable
+
override private[spark] def existsRecursively(f: (DataType) => Boolean):
Boolean = {
f(this) || keyType.existsRecursively(f) || valueType.existsRecursively(f)
}
diff --git a/sql/api/src/main/scala/org/apache/spark/sql/types/StructType.scala
b/sql/api/src/main/scala/org/apache/spark/sql/types/StructType.scala
index f1771d933bb..5fe6b0a5f00 100644
--- a/sql/api/src/main/scala/org/apache/spark/sql/types/StructType.scala
+++ b/sql/api/src/main/scala/org/apache/spark/sql/types/StructType.scala
@@ -490,6 +490,14 @@ case class StructType(fields: Array[StructField]) extends
DataType with Seq[Stru
StructType(newFields)
}
+ /**
+ * Returns the same data type but set all nullability fields are true
+ * (`StructField.nullable`, `ArrayType.containsNull`, and
`MapType.valueContainsNull`).
+ *
+ * @since 4.0.0
+ */
+ def toNullable: StructType = asNullable
+
override private[spark] def existsRecursively(f: (DataType) => Boolean):
Boolean = {
f(this) || fields.exists(field => field.dataType.existsRecursively(f))
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]