This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new 9d84369ade6 [SPARK-45018][PYTHON][CONNECT] Add CalendarIntervalType to Python Client
9d84369ade6 is described below
commit 9d84369ade670737a4ccda166e452e5208eb8253
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Thu Aug 31 15:27:41 2023 +0800
[SPARK-45018][PYTHON][CONNECT] Add CalendarIntervalType to Python Client
### What changes were proposed in this pull request?
Add CalendarIntervalType to the Python Spark Connect client, by mapping it to and from the protobuf DataType.CalendarInterval message in python/pyspark/sql/connect/types.py.
### Why are the changes needed?
For feature parity: the classic (non-Connect) PySpark client already supports CalendarIntervalType.
### Does this PR introduce _any_ user-facing change?
Yes.
Before this PR:
```
In [1]: from pyspark.sql import functions as sf
In [2]: spark.range(1).select(sf.make_interval(sf.lit(1))).schema
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
Cell In[2], line 1
----> 1 spark.range(1).select(sf.make_interval(sf.lit(1))).schema
File ~/Dev/spark/python/pyspark/sql/connect/dataframe.py:1687, in DataFrame.schema(self)
   1685             if self._session is None:
   1686                 raise Exception("Cannot analyze without SparkSession.")
-> 1687             return self._session.client.schema(query)
   1688         else:
   1689             raise Exception("Empty plan.")
...
Exception: Unsupported data type calendar_interval
```
After this PR:
```
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/ '_/
   /__ / .__/\_,_/_/ /_/\_\   version 4.0.0.dev0
      /_/
Using Python version 3.10.11 (main, May 17 2023 14:30:36)
Client connected to the Spark Connect server at localhost
SparkSession available as 'spark'.
In [1]: from pyspark.sql import functions as sf
In [2]: spark.range(1).select(sf.make_interval(sf.lit(1))).schema
Out[2]: StructType([StructField('make_interval(1, 0, 0, 0, 0, 0, 0)', CalendarIntervalType(), True)])
```
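Note on the column name: make_interval takes seven arguments (years, months, weeks, days, hours, mins, secs), and any left unspecified default to 0, which is why the single-argument call above produces a column named make_interval(1, 0, 0, 0, 0, 0, 0).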
### How was this patch tested?
Added a unit test (test_calendar_interval_type_with_sf in python/pyspark/sql/tests/test_types.py).
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #42743 from zhengruifeng/py_connect_cal_interval.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
 python/pyspark/sql/connect/types.py                   | 5 +++++
 python/pyspark/sql/tests/connect/test_parity_types.py | 2 +-
 python/pyspark/sql/tests/test_types.py                | 4 ++++
 3 files changed, 10 insertions(+), 1 deletion(-)
diff --git a/python/pyspark/sql/connect/types.py b/python/pyspark/sql/connect/types.py
index 0db2833d2c1..cd2311e614e 100644
--- a/python/pyspark/sql/connect/types.py
+++ b/python/pyspark/sql/connect/types.py
@@ -33,6 +33,7 @@ from pyspark.sql.types import (
     TimestampNTZType,
     DayTimeIntervalType,
     YearMonthIntervalType,
+    CalendarIntervalType,
     MapType,
     StringType,
     CharType,
@@ -169,6 +170,8 @@ def pyspark_types_to_proto_types(data_type: DataType) -> pb2.DataType:
     elif isinstance(data_type, YearMonthIntervalType):
         ret.year_month_interval.start_field = data_type.startField
         ret.year_month_interval.end_field = data_type.endField
+    elif isinstance(data_type, CalendarIntervalType):
+        ret.calendar_interval.CopyFrom(pb2.DataType.CalendarInterval())
     elif isinstance(data_type, StructType):
         struct = pb2.DataType.Struct()
         for field in data_type.fields:
@@ -265,6 +268,8 @@ def proto_schema_to_pyspark_data_type(schema: pb2.DataType) -> DataType:
             else None
         )
         return YearMonthIntervalType(startField=start, endField=end)
+    elif schema.HasField("calendar_interval"):
+        return CalendarIntervalType()
     elif schema.HasField("array"):
         return ArrayType(
             proto_schema_to_pyspark_data_type(schema.array.element_type),
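For context, these two converters are the serialization boundary between PySpark types and the Spark Connect protobuf schema, and the new branches let CalendarIntervalType survive a full round trip. A minimal sketch of that round trip (not part of the commit), assuming a development checkout where pyspark.sql.connect.types is importable:

```
from pyspark.sql.types import CalendarIntervalType
from pyspark.sql.connect.types import (
    pyspark_types_to_proto_types,
    proto_schema_to_pyspark_data_type,
)

# Python type -> protobuf: the new elif sets the calendar_interval field.
proto = pyspark_types_to_proto_types(CalendarIntervalType())
assert proto.HasField("calendar_interval")

# protobuf -> Python type: the new HasField branch restores the type.
# CalendarIntervalType carries no parameters, so plain equality holds.
assert proto_schema_to_pyspark_data_type(proto) == CalendarIntervalType()
```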
diff --git a/python/pyspark/sql/tests/connect/test_parity_types.py b/python/pyspark/sql/tests/connect/test_parity_types.py
index 533506c7d27..44171fd61a3 100644
--- a/python/pyspark/sql/tests/connect/test_parity_types.py
+++ b/python/pyspark/sql/tests/connect/test_parity_types.py
@@ -86,7 +86,7 @@ class TypesParityTests(TypesTestsMixin, ReusedConnectTestCase):
     def test_udt(self):
         super().test_udt()

-    @unittest.skip("SPARK-45018: should support CalendarIntervalType")
+    @unittest.skip("SPARK-45026: spark.sql should support datatypes not compatible with arrow")
     def test_calendar_interval_type(self):
         super().test_calendar_interval_type()
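Worth noting: the skip is retargeted rather than removed. With this commit the type itself round-trips, so the SPARK-45018 reason no longer applies, but the parity test drives spark.sql(...), and on Connect that path still breaks for datatypes with no Arrow encoding, which is what SPARK-45026 tracks. A hedged sketch of the two paths, assuming a Spark Connect session bound to spark:

```
from pyspark.sql import functions as sf

# DataFrame-API path: works after this commit, as shown in the PR description.
spark.range(1).select(sf.make_interval(sf.lit(1))).schema

# spark.sql() path used by the still-skipped parity test: expected to keep
# failing on Connect until SPARK-45026, since Arrow has no calendar-interval
# representation for the result transfer.
spark.sql("SELECT make_interval(100, 11, 1, 1, 12, 30, 01.001001)").schema
```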
diff --git a/python/pyspark/sql/tests/test_types.py b/python/pyspark/sql/tests/test_types.py
index d45c4d7e808..fb752b93a33 100644
--- a/python/pyspark/sql/tests/test_types.py
+++ b/python/pyspark/sql/tests/test_types.py
@@ -1284,6 +1284,10 @@ class TypesTestsMixin:
         schema1 = self.spark.sql("SELECT make_interval(100, 11, 1, 1, 12, 30, 01.001001)").schema
         self.assertEqual(schema1.fields[0].dataType, CalendarIntervalType())

+    def test_calendar_interval_type_with_sf(self):
+        schema1 = self.spark.range(1).select(F.make_interval(F.lit(1))).schema
+        self.assertEqual(schema1.fields[0].dataType, CalendarIntervalType())
+

 class DataTypeTests(unittest.TestCase):
     # regression test for SPARK-6055
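The added test also doubles as a usage example; a minimal interactive check, assuming a running session bound to spark (F is the test module's alias for pyspark.sql.functions):

```
from pyspark.sql import functions as F
from pyspark.sql.types import CalendarIntervalType

# Mirrors the new unit test: the analyzed schema should report the
# calendar-interval type for the make_interval(...) column.
schema = spark.range(1).select(F.make_interval(F.lit(1))).schema
assert schema.fields[0].dataType == CalendarIntervalType()
```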
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]