This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 7984b1a4b2d [SPARK-45420][SQL][PYTHON][CONNECT] Add DataType.fromDDL into PySpark
7984b1a4b2d is described below
commit 7984b1a4b2da6586358397e90d1c28ae73aca6ce
Author: Hyukjin Kwon <[email protected]>
AuthorDate: Thu Oct 5 18:07:21 2023 +0900
[SPARK-45420][SQL][PYTHON][CONNECT] Add DataType.fromDDL into PySpark
### What changes were proposed in this pull request?
This PR implements `DataType.fromDDL` in PySpark, matching the Scala API:
https://github.com/apache/spark/blob/350b8d8388c9ad15303d39f22b249b8c73785695/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala#L121-L126
One difference is that the Python API also supports the legacy format inside
`struct<...>`, e.g., `a: int, b: int`.
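A minimal usage sketch mirroring the added tests (it assumes an active `SparkSession`, which this API requires because it parses the DDL through `SparkSession.active()`):

```python
from pyspark.sql import SparkSession
from pyspark.sql.types import DataType, LongType

# fromDDL needs an active session (classic or Spark Connect).
spark = SparkSession.builder.getOrCreate()

assert DataType.fromDDL("long") == LongType()
assert DataType.fromDDL("a int, b string").simpleString() == "struct<a:int,b:string>"
# Legacy format, accepted by the Python API only.
assert DataType.fromDDL("a: int, b: string").simpleString() == "struct<a:int,b:string>"
```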
### Why are the changes needed?
So that end users can easily parse DDL-formatted types.
### Does this PR introduce _any_ user-facing change?
Yes, this PR adds a new user-facing API: `DataType.fromDDL`.
### How was this patch tested?
Unit tests were added, and the change was also tested manually.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #43226 from HyukjinKwon/addfromDDL.
Authored-by: Hyukjin Kwon <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
python/pyspark/sql/tests/test_types.py | 12 +++++++++
python/pyspark/sql/types.py | 48 ++++++++++++++++++++++++++++++++++
2 files changed, 60 insertions(+)
diff --git a/python/pyspark/sql/tests/test_types.py b/python/pyspark/sql/tests/test_types.py
index fb752b93a33..c6e70da1f8d 100644
--- a/python/pyspark/sql/tests/test_types.py
+++ b/python/pyspark/sql/tests/test_types.py
@@ -28,6 +28,7 @@ from pyspark.sql import Row
 from pyspark.sql import functions as F
 from pyspark.errors import AnalysisException, PySparkTypeError, PySparkValueError
 from pyspark.sql.types import (
+    DataType,
     ByteType,
     ShortType,
     IntegerType,
@@ -1288,6 +1289,17 @@ class TypesTestsMixin:
         schema1 = self.spark.range(1).select(F.make_interval(F.lit(1))).schema
         self.assertEqual(schema1.fields[0].dataType, CalendarIntervalType())

+    def test_from_ddl(self):
+        self.assertEqual(DataType.fromDDL("long"), LongType())
+        self.assertEqual(
+            DataType.fromDDL("a: int, b: string"),
+            StructType([StructField("a", IntegerType()), StructField("b", StringType())]),
+        )
+        self.assertEqual(
+            DataType.fromDDL("a int, b string"),
+            StructType([StructField("a", IntegerType()), StructField("b", StringType())]),
+        )
+

 class DataTypeTests(unittest.TestCase):
     # regression test for SPARK-6055
diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
index bf63bea69ad..01db75b2500 100644
--- a/python/pyspark/sql/types.py
+++ b/python/pyspark/sql/types.py
@@ -139,6 +139,54 @@ class DataType:
"""
return obj
+ @classmethod
+ def fromDDL(cls, ddl: str) -> "DataType":
+ """
+ Creates :class:`DataType` for a given DDL-formatted string.
+
+ .. versionadded:: 4.0.0
+
+ Parameters
+ ----------
+ ddl : str
+ DDL-formatted string representation of types, e.g.
+ :class:`pyspark.sql.types.DataType.simpleString`, except that top
level struct
+ type can omit the ``struct<>`` for the compatibility reason with
+ ``spark.createDataFrame`` and Python UDFs.
+
+ Returns
+ -------
+ :class:`DataType`
+
+ Examples
+ --------
+ Create a StructType by the corresponding DDL formatted string.
+
+ >>> from pyspark.sql.types import DataType
+ >>> DataType.fromDDL("b string, a int")
+ StructType([StructField('b', StringType(), True), StructField('a',
IntegerType(), True)])
+
+ Create a single DataType by the corresponding DDL formatted string.
+
+ >>> DataType.fromDDL("decimal(10,10)")
+ DecimalType(10,10)
+
+ Create a StructType by the legacy string format.
+
+ >>> DataType.fromDDL("b: string, a: int")
+ StructType([StructField('b', StringType(), True), StructField('a',
IntegerType(), True)])
+ """
+ from pyspark.sql import SparkSession
+ from pyspark.sql.functions import udf
+
+ # Intentionally uses SparkSession so one implementation can be shared
with/without
+ # Spark Connect.
+ schema = (
+ SparkSession.active().range(0).select(udf(lambda x: x,
returnType=ddl)("id")).schema
+ )
+ assert len(schema) == 1
+ return schema[0].dataType
+
# This singleton pattern does not work with pickle, you will get
# another object after pickle and unpickle
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]