This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 03dedbbaeb1b [SPARK-55164][PYTHON][TESTS] Refactor tests for python udf input type coercion
03dedbbaeb1b is described below
commit 03dedbbaeb1b75c5a207d65830ee53d454becf9a
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Mon Jan 26 08:10:16 2026 +0900
[SPARK-55164][PYTHON][TESTS] Refactor tests for python udf input type coercion
### What changes were proposed in this pull request?
Refactor tests for python udf input type coercion
### Why are the changes needed?
to save/load the golden files with pandas (tab-separated CSV, plus an optional Markdown rendering), replacing the hand-rolled text tables
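
As an illustration, here is a minimal sketch of the pandas round trip the refactored test relies on; the file name `golden_example.csv` and the single sample row are hypothetical, while the `to_csv`/`read_csv` options mirror the test code in this diff:

```python
import pandas as pd

columns = ["Test Case", "Spark Type", "Spark Value", "Python Type", "Python Value"]
rows = [
    # One illustrative row; the real test generates one per test case.
    ["byte_values", "tinyint", "[-128, 127, 0]", "['int', 'int', 'int']", "['-128', '127', '0']"],
]

# Save: a tab-separated file keeps the comma-laden cell values readable.
golden = pd.DataFrame(rows, columns=columns)
golden.to_csv("golden_example.csv", sep="\t", header=True, index=True)

# Load: read every cell back as a plain string for exact comparison.
loaded = pd.read_csv(
    "golden_example.csv", sep="\t", index_col=0, dtype="str", na_filter=False, engine="python"
)
assert list(loaded.iloc[0]) == rows[0]
```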
### Does this PR introduce _any_ user-facing change?
no, test-only
### How was this patch tested?
CI
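
Besides CI, the suite can be exercised locally. The commands below are a sketch based on the regeneration comment in the test file in this diff; `SPARK_GENERATE_GOLDEN_FILES=1` switches the test from comparing against the golden CSVs to regenerating them (and the Markdown copies, if `tabulate` is installed):

```bash
# Run the refactored input-type coercion tests
python/run-tests --testnames 'pyspark.sql.tests.coercion.test_python_udf_input_type'

# Regenerate the golden files instead of comparing against them
SPARK_GENERATE_GOLDEN_FILES=1 python/run-tests \
  --testnames 'pyspark.sql.tests.coercion.test_python_udf_input_type'
```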
### Was this patch authored or co-authored using generative AI tooling?
no
Closes #53947 from zhengruifeng/refactor_py_udf_input.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
dev/sparktestsupport/modules.py | 1 +
...lden_pandas_udf_input_type_coercion_vanilla.csv | 40 +++
...olden_pandas_udf_input_type_coercion_vanilla.md | 41 +++
...n_pandas_udf_input_type_coercion_with_arrow.csv | 40 +++
...en_pandas_udf_input_type_coercion_with_arrow.md | 41 +++
...f_input_type_coercion_with_arrow_and_pandas.csv | 40 +++
...df_input_type_coercion_with_arrow_and_pandas.md | 41 +++
.../test_python_udf_input_type.py} | 360 ++++++++++-----------
.../golden_udf_input_types_arrow_disabled.txt | 43 ---
.../golden_udf_input_types_arrow_enabled.txt | 43 ---
.../golden_udf_input_types_arrow_legacy_pandas.txt | 43 ---
.../tests/udf_type_tests/test_udf_input_types.py | 98 +-----
12 files changed, 410 insertions(+), 421 deletions(-)
diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index 6ba2e619f703..fb8e4697fcb3 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -595,6 +595,7 @@ pyspark_sql = Module(
"pyspark.sql.tests.plot.test_frame_plot_plotly",
"pyspark.sql.tests.test_connect_compatibility",
"pyspark.sql.tests.udf_type_tests.test_udf_input_types",
+ "pyspark.sql.tests.coercion.test_python_udf_input_type",
"pyspark.sql.tests.coercion.test_pandas_udf_return_type",
"pyspark.sql.tests.coercion.test_python_udf_return_type",
],
diff --git a/python/pyspark/sql/tests/coercion/golden_pandas_udf_input_type_coercion_vanilla.csv b/python/pyspark/sql/tests/coercion/golden_pandas_udf_input_type_coercion_vanilla.csv
new file mode 100644
index 000000000000..d29167ee2553
--- /dev/null
+++ b/python/pyspark/sql/tests/coercion/golden_pandas_udf_input_type_coercion_vanilla.csv
@@ -0,0 +1,40 @@
+    Test Case    Spark Type    Spark Value    Python Type    Python Value
+0    byte_values    tinyint    [-128, 127, 0]    ['int', 'int', 'int']    ['-128', '127', '0']
+1    byte_null    tinyint    [None, 42]    ['NoneType', 'int']    ['None', '42']
+2    short_values    smallint    [-32768, 32767, 0]    ['int', 'int', 'int']    ['-32768', '32767', '0']
+3    short_null    smallint    [None, 123]    ['NoneType', 'int']    ['None', '123']
+4    int_values    int    [-2147483648, 2147483647, 0]    ['int', 'int', 'int']    ['-2147483648', '2147483647', '0']
+5    int_null    int    [None, 456]    ['NoneType', 'int']    ['None', '456']
+6    long_values    bigint    [-9223372036854775808, 9223372036854775807, 0]    ['int', 'int', 'int']    ['-9223372036854775808', '9223372036854775807', '0']
+7    long_null    bigint    [None, 789]    ['NoneType', 'int']    ['None', '789']
+8    float_values    float    [0.0, 1.0, 3.140000104904175]    ['float', 'float', 'float']    ['0.0', '1.0', '3.140000104904175']
+9    float_null    float    [None, 3.140000104904175]    ['NoneType', 'float']    ['None', '3.140000104904175']
+10    double_values    double    [0.0, 1.0, 0.3333333333333333]    ['float', 'float', 'float']    ['0.0', '1.0', '0.3333333333333333']
+11    double_null    double    [None, 2.71]    ['NoneType', 'float']    ['None', '2.71']
+12    decimal_values    decimal(3,2)    [Decimal('5.35'), Decimal('1.23')]    ['Decimal', 'Decimal']    ['5.35', '1.23']
+13    decimal_null    decimal(3,2)    [None, Decimal('9.99')]    ['NoneType', 'Decimal']    ['None', '9.99']
+14    string_values    string    ['abc', '', 'hello']    ['str', 'str', 'str']    ['abc', '', 'hello']
+15    string_null    string    [None, 'test']    ['NoneType', 'str']    ['None', 'test']
+16    binary_values    binary    [b'abc', b'', b'ABC']    ['bytes', 'bytes', 'bytes']    "[""b'abc'"", ""b''"", ""b'ABC'""]"
+17    binary_null    binary    [None, b'test']    ['NoneType', 'bytes']    "['None', ""b'test'""]"
+18    boolean_values    boolean    [True, False]    ['bool', 'bool']    ['True', 'False']
+19    boolean_null    boolean    [None, True]    ['NoneType', 'bool']    ['None', 'True']
+20    date_values    date    [datetime.date(2020, 2, 2), datetime.date(1970, 1, 1)]    ['date', 'date']    ['2020-02-02', '1970-01-01']
+21    date_null    date    [None, datetime.date(2023, 1, 1)]    ['NoneType', 'date']    ['None', '2023-01-01']
+22    timestamp_values    timestamp    [datetime.datetime(2020, 2, 2, 12, 15, 16, 123000)]    ['datetime']    ['2020-02-02 12:15:16.123000']
+23    timestamp_null    timestamp    [None, datetime.datetime(2023, 1, 1, 12, 0)]    ['NoneType', 'datetime']    ['None', '2023-01-01 12:00:00']
+24    array_int_values    array<int>    [[1, 2, 3], [], [1, None, 3]]    ['list', 'list', 'list']    ['[1, 2, 3]', '[]', '[1, None, 3]']
+25    array_int_null    array<int>    [None, [4, 5, 6]]    ['NoneType', 'list']    ['None', '[4, 5, 6]']
+26    map_str_int_values    map<string,int>    [{'world': 2, 'hello': 1}, {}]    ['dict', 'dict']    "[""{'world': 2, 'hello': 1}"", '{}']"
+27    map_str_int_null    map<string,int>    [None, {'test': 123}]    ['NoneType', 'dict']    "['None', ""{'test': 123}""]"
+28    struct_int_str_values    struct<a1:int,a2:string>    [Row(a1=1, a2='hello'), Row(a1=2, a2='world')]    ['Row', 'Row']    "[""Row(a1=1, a2='hello')"", ""Row(a1=2, a2='world')""]"
+29    struct_int_str_null    struct<a1:int,a2:string>    [None, Row(a1=99, a2='test')]    ['NoneType', 'Row']    "['None', ""Row(a1=99, a2='test')""]"
+30    array_array_int    array<array<int>>    [[[1, 2, 3]], [[1], [2, 3]]]    ['list', 'list']    ['[[1, 2, 3]]', '[[1], [2, 3]]']
+31    array_map_str_int    array<map<string,int>>    [[{'world': 2, 'hello': 1}], [{'a': 1}, {'b': 2}]]    ['list', 'list']    "[""[{'world': 2, 'hello': 1}]"", ""[{'a': 1}, {'b': 2}]""]"
+32    array_struct_int_str    array<struct<a1:int,a2:string>>    [[Row(a1=1, a2='hello')], [Row(a1=1, a2='hello'), Row(a1=2, a2='world')]]    ['list', 'list']    "[""[Row(a1=1, a2='hello')]"", ""[Row(a1=1, a2='hello'), Row(a1=2, a2='world')]""]"
+33    map_int_array_int    map<int,array<int>>    [{1: [1, 2, 3]}, {1: [1], 2: [2, 3]}]    ['dict', 'dict']    ['{1: [1, 2, 3]}', '{1: [1], 2: [2, 3]}']
+34    map_int_map_str_int    map<int,map<string,int>>    [{1: {'world': 2, 'hello': 1}}]    ['dict']    "[""{1: {'world': 2, 'hello': 1}}""]"
+35    map_int_struct_int_str    map<int,struct<a1:int,a2:string>>    [{1: Row(a1=1, a2='hello')}]    ['dict']    "[""{1: Row(a1=1, a2='hello')}""]"
+36    struct_int_array_int    struct<a:int,b:array<int>>    [Row(a=1, b=[1, 2, 3])]    ['Row']    ['Row(a=1, b=[1, 2, 3])']
+37    struct_int_map_str_int    struct<a:int,b:map<string,int>>    [Row(a=1, b={'world': 2, 'hello': 1})]    ['Row']    "[""Row(a=1, b={'world': 2, 'hello': 1})""]"
+38    struct_int_struct_int_str    struct<a:int,b:struct<a1:int,a2:string>>    [Row(a=1, b=Row(a1=1, a2='hello'))]    ['Row']    "[""Row(a=1, b=Row(a1=1, a2='hello'))""]"
diff --git a/python/pyspark/sql/tests/coercion/golden_pandas_udf_input_type_coercion_vanilla.md b/python/pyspark/sql/tests/coercion/golden_pandas_udf_input_type_coercion_vanilla.md
new file mode 100644
index 000000000000..c622e115e00a
--- /dev/null
+++ b/python/pyspark/sql/tests/coercion/golden_pandas_udf_input_type_coercion_vanilla.md
@@ -0,0 +1,41 @@
+|    | Test Case | Spark Type | Spark Value | Python Type | Python Value |
+|----|-----------|------------|-------------|-------------|--------------|
+| 0 | byte_values | tinyint | [-128, 127, 0] | ['int', 'int', 'int'] | ['-128', '127', '0'] |
+| 1 | byte_null | tinyint | [None, 42] | ['NoneType', 'int'] | ['None', '42'] |
+| 2 | short_values | smallint | [-32768, 32767, 0] | ['int', 'int', 'int'] | ['-32768', '32767', '0'] |
+| 3 | short_null | smallint | [None, 123] | ['NoneType', 'int'] | ['None', '123'] |
+| 4 | int_values | int | [-2147483648, 2147483647, 0] | ['int', 'int', 'int'] | ['-2147483648', '2147483647', '0'] |
+| 5 | int_null | int | [None, 456] | ['NoneType', 'int'] | ['None', '456'] |
+| 6 | long_values | bigint | [-9223372036854775808, 9223372036854775807, 0] | ['int', 'int', 'int'] | ['-9223372036854775808', '9223372036854775807', '0'] |
+| 7 | long_null | bigint | [None, 789] | ['NoneType', 'int'] | ['None', '789'] |
+| 8 | float_values | float | [0.0, 1.0, 3.140000104904175] | ['float', 'float', 'float'] | ['0.0', '1.0', '3.140000104904175'] |
+| 9 | float_null | float | [None, 3.140000104904175] | ['NoneType', 'float'] | ['None', '3.140000104904175'] |
+| 10 | double_values | double | [0.0, 1.0, 0.3333333333333333] | ['float', 'float', 'float'] | ['0.0', '1.0', '0.3333333333333333'] |
+| 11 | double_null | double | [None, 2.71] | ['NoneType', 'float'] | ['None', '2.71'] |
+| 12 | decimal_values | decimal(3,2) | [Decimal('5.35'), Decimal('1.23')] | ['Decimal', 'Decimal'] | ['5.35', '1.23'] |
+| 13 | decimal_null | decimal(3,2) | [None, Decimal('9.99')] | ['NoneType', 'Decimal'] | ['None', '9.99'] |
+| 14 | string_values | string | ['abc', '', 'hello'] | ['str', 'str', 'str'] | ['abc', '', 'hello'] |
+| 15 | string_null | string | [None, 'test'] | ['NoneType', 'str'] | ['None', 'test'] |
+| 16 | binary_values | binary | [b'abc', b'', b'ABC'] | ['bytes', 'bytes', 'bytes'] | ["b'abc'", "b''", "b'ABC'"] |
+| 17 | binary_null | binary | [None, b'test'] | ['NoneType', 'bytes'] | ['None', "b'test'"] |
+| 18 | boolean_values | boolean | [True, False] | ['bool', 'bool'] | ['True', 'False'] |
+| 19 | boolean_null | boolean | [None, True] | ['NoneType', 'bool'] | ['None', 'True'] |
+| 20 | date_values | date | [datetime.date(2020, 2, 2), datetime.date(1970, 1, 1)] | ['date', 'date'] | ['2020-02-02', '1970-01-01'] |
+| 21 | date_null | date | [None, datetime.date(2023, 1, 1)] | ['NoneType', 'date'] | ['None', '2023-01-01'] |
+| 22 | timestamp_values | timestamp | [datetime.datetime(2020, 2, 2, 12, 15, 16, 123000)] | ['datetime'] | ['2020-02-02 12:15:16.123000'] |
+| 23 | timestamp_null | timestamp | [None, datetime.datetime(2023, 1, 1, 12, 0)] | ['NoneType', 'datetime'] | ['None', '2023-01-01 12:00:00'] |
+| 24 | array_int_values | array<int> | [[1, 2, 3], [], [1, None, 3]] | ['list', 'list', 'list'] | ['[1, 2, 3]', '[]', '[1, None, 3]'] |
+| 25 | array_int_null | array<int> | [None, [4, 5, 6]] | ['NoneType', 'list'] | ['None', '[4, 5, 6]'] |
+| 26 | map_str_int_values | map<string,int> | [{'world': 2, 'hello': 1}, {}] | ['dict', 'dict'] | ["{'world': 2, 'hello': 1}", '{}'] |
+| 27 | map_str_int_null | map<string,int> | [None, {'test': 123}] | ['NoneType', 'dict'] | ['None', "{'test': 123}"] |
+| 28 | struct_int_str_values | struct<a1:int,a2:string> | [Row(a1=1, a2='hello'), Row(a1=2, a2='world')] | ['Row', 'Row'] | ["Row(a1=1, a2='hello')", "Row(a1=2, a2='world')"] |
+| 29 | struct_int_str_null | struct<a1:int,a2:string> | [None, Row(a1=99, a2='test')] | ['NoneType', 'Row'] | ['None', "Row(a1=99, a2='test')"] |
+| 30 | array_array_int | array<array<int>> | [[[1, 2, 3]], [[1], [2, 3]]] | ['list', 'list'] | ['[[1, 2, 3]]', '[[1], [2, 3]]'] |
+| 31 | array_map_str_int | array<map<string,int>> | [[{'world': 2, 'hello': 1}], [{'a': 1}, {'b': 2}]] | ['list', 'list'] | ["[{'world': 2, 'hello': 1}]", "[{'a': 1}, {'b': 2}]"] |
+| 32 | array_struct_int_str | array<struct<a1:int,a2:string>> | [[Row(a1=1, a2='hello')], [Row(a1=1, a2='hello'), Row(a1=2, a2='world')]] | ['list', 'list'] | ["[Row(a1=1, a2='hello')]", "[Row(a1=1, a2='hello'), Row(a1=2, a2='world')]"] |
+| 33 | map_int_array_int | map<int,array<int>> | [{1: [1, 2, 3]}, {1: [1], 2: [2, 3]}] | ['dict', 'dict'] | ['{1: [1, 2, 3]}', '{1: [1], 2: [2, 3]}'] |
+| 34 | map_int_map_str_int | map<int,map<string,int>> | [{1: {'world': 2, 'hello': 1}}] | ['dict'] | ["{1: {'world': 2, 'hello': 1}}"] |
+| 35 | map_int_struct_int_str | map<int,struct<a1:int,a2:string>> | [{1: Row(a1=1, a2='hello')}] | ['dict'] | ["{1: Row(a1=1, a2='hello')}"] |
+| 36 | struct_int_array_int | struct<a:int,b:array<int>> | [Row(a=1, b=[1, 2, 3])] | ['Row'] | ['Row(a=1, b=[1, 2, 3])'] |
+| 37 | struct_int_map_str_int | struct<a:int,b:map<string,int>> | [Row(a=1, b={'world': 2, 'hello': 1})] | ['Row'] | ["Row(a=1, b={'world': 2, 'hello': 1})"] |
+| 38 | struct_int_struct_int_str | struct<a:int,b:struct<a1:int,a2:string>> | [Row(a=1, b=Row(a1=1, a2='hello'))] | ['Row'] | ["Row(a=1, b=Row(a1=1, a2='hello'))"] |
\ No newline at end of file
diff --git a/python/pyspark/sql/tests/coercion/golden_pandas_udf_input_type_coercion_with_arrow.csv b/python/pyspark/sql/tests/coercion/golden_pandas_udf_input_type_coercion_with_arrow.csv
new file mode 100644
index 000000000000..d29167ee2553
--- /dev/null
+++ b/python/pyspark/sql/tests/coercion/golden_pandas_udf_input_type_coercion_with_arrow.csv
@@ -0,0 +1,40 @@
+    Test Case    Spark Type    Spark Value    Python Type    Python Value
+0    byte_values    tinyint    [-128, 127, 0]    ['int', 'int', 'int']    ['-128', '127', '0']
+1    byte_null    tinyint    [None, 42]    ['NoneType', 'int']    ['None', '42']
+2    short_values    smallint    [-32768, 32767, 0]    ['int', 'int', 'int']    ['-32768', '32767', '0']
+3    short_null    smallint    [None, 123]    ['NoneType', 'int']    ['None', '123']
+4    int_values    int    [-2147483648, 2147483647, 0]    ['int', 'int', 'int']    ['-2147483648', '2147483647', '0']
+5    int_null    int    [None, 456]    ['NoneType', 'int']    ['None', '456']
+6    long_values    bigint    [-9223372036854775808, 9223372036854775807, 0]    ['int', 'int', 'int']    ['-9223372036854775808', '9223372036854775807', '0']
+7    long_null    bigint    [None, 789]    ['NoneType', 'int']    ['None', '789']
+8    float_values    float    [0.0, 1.0, 3.140000104904175]    ['float', 'float', 'float']    ['0.0', '1.0', '3.140000104904175']
+9    float_null    float    [None, 3.140000104904175]    ['NoneType', 'float']    ['None', '3.140000104904175']
+10    double_values    double    [0.0, 1.0, 0.3333333333333333]    ['float', 'float', 'float']    ['0.0', '1.0', '0.3333333333333333']
+11    double_null    double    [None, 2.71]    ['NoneType', 'float']    ['None', '2.71']
+12    decimal_values    decimal(3,2)    [Decimal('5.35'), Decimal('1.23')]    ['Decimal', 'Decimal']    ['5.35', '1.23']
+13    decimal_null    decimal(3,2)    [None, Decimal('9.99')]    ['NoneType', 'Decimal']    ['None', '9.99']
+14    string_values    string    ['abc', '', 'hello']    ['str', 'str', 'str']    ['abc', '', 'hello']
+15    string_null    string    [None, 'test']    ['NoneType', 'str']    ['None', 'test']
+16    binary_values    binary    [b'abc', b'', b'ABC']    ['bytes', 'bytes', 'bytes']    "[""b'abc'"", ""b''"", ""b'ABC'""]"
+17    binary_null    binary    [None, b'test']    ['NoneType', 'bytes']    "['None', ""b'test'""]"
+18    boolean_values    boolean    [True, False]    ['bool', 'bool']    ['True', 'False']
+19    boolean_null    boolean    [None, True]    ['NoneType', 'bool']    ['None', 'True']
+20    date_values    date    [datetime.date(2020, 2, 2), datetime.date(1970, 1, 1)]    ['date', 'date']    ['2020-02-02', '1970-01-01']
+21    date_null    date    [None, datetime.date(2023, 1, 1)]    ['NoneType', 'date']    ['None', '2023-01-01']
+22    timestamp_values    timestamp    [datetime.datetime(2020, 2, 2, 12, 15, 16, 123000)]    ['datetime']    ['2020-02-02 12:15:16.123000']
+23    timestamp_null    timestamp    [None, datetime.datetime(2023, 1, 1, 12, 0)]    ['NoneType', 'datetime']    ['None', '2023-01-01 12:00:00']
+24    array_int_values    array<int>    [[1, 2, 3], [], [1, None, 3]]    ['list', 'list', 'list']    ['[1, 2, 3]', '[]', '[1, None, 3]']
+25    array_int_null    array<int>    [None, [4, 5, 6]]    ['NoneType', 'list']    ['None', '[4, 5, 6]']
+26    map_str_int_values    map<string,int>    [{'world': 2, 'hello': 1}, {}]    ['dict', 'dict']    "[""{'world': 2, 'hello': 1}"", '{}']"
+27    map_str_int_null    map<string,int>    [None, {'test': 123}]    ['NoneType', 'dict']    "['None', ""{'test': 123}""]"
+28    struct_int_str_values    struct<a1:int,a2:string>    [Row(a1=1, a2='hello'), Row(a1=2, a2='world')]    ['Row', 'Row']    "[""Row(a1=1, a2='hello')"", ""Row(a1=2, a2='world')""]"
+29    struct_int_str_null    struct<a1:int,a2:string>    [None, Row(a1=99, a2='test')]    ['NoneType', 'Row']    "['None', ""Row(a1=99, a2='test')""]"
+30    array_array_int    array<array<int>>    [[[1, 2, 3]], [[1], [2, 3]]]    ['list', 'list']    ['[[1, 2, 3]]', '[[1], [2, 3]]']
+31    array_map_str_int    array<map<string,int>>    [[{'world': 2, 'hello': 1}], [{'a': 1}, {'b': 2}]]    ['list', 'list']    "[""[{'world': 2, 'hello': 1}]"", ""[{'a': 1}, {'b': 2}]""]"
+32    array_struct_int_str    array<struct<a1:int,a2:string>>    [[Row(a1=1, a2='hello')], [Row(a1=1, a2='hello'), Row(a1=2, a2='world')]]    ['list', 'list']    "[""[Row(a1=1, a2='hello')]"", ""[Row(a1=1, a2='hello'), Row(a1=2, a2='world')]""]"
+33    map_int_array_int    map<int,array<int>>    [{1: [1, 2, 3]}, {1: [1], 2: [2, 3]}]    ['dict', 'dict']    ['{1: [1, 2, 3]}', '{1: [1], 2: [2, 3]}']
+34    map_int_map_str_int    map<int,map<string,int>>    [{1: {'world': 2, 'hello': 1}}]    ['dict']    "[""{1: {'world': 2, 'hello': 1}}""]"
+35    map_int_struct_int_str    map<int,struct<a1:int,a2:string>>    [{1: Row(a1=1, a2='hello')}]    ['dict']    "[""{1: Row(a1=1, a2='hello')}""]"
+36    struct_int_array_int    struct<a:int,b:array<int>>    [Row(a=1, b=[1, 2, 3])]    ['Row']    ['Row(a=1, b=[1, 2, 3])']
+37    struct_int_map_str_int    struct<a:int,b:map<string,int>>    [Row(a=1, b={'world': 2, 'hello': 1})]    ['Row']    "[""Row(a=1, b={'world': 2, 'hello': 1})""]"
+38    struct_int_struct_int_str    struct<a:int,b:struct<a1:int,a2:string>>    [Row(a=1, b=Row(a1=1, a2='hello'))]    ['Row']    "[""Row(a=1, b=Row(a1=1, a2='hello'))""]"
diff --git a/python/pyspark/sql/tests/coercion/golden_pandas_udf_input_type_coercion_with_arrow.md b/python/pyspark/sql/tests/coercion/golden_pandas_udf_input_type_coercion_with_arrow.md
new file mode 100644
index 000000000000..c622e115e00a
--- /dev/null
+++ b/python/pyspark/sql/tests/coercion/golden_pandas_udf_input_type_coercion_with_arrow.md
@@ -0,0 +1,41 @@
+|    | Test Case | Spark Type | Spark Value | Python Type | Python Value |
+|----|-----------|------------|-------------|-------------|--------------|
+| 0 | byte_values | tinyint | [-128, 127, 0] | ['int', 'int', 'int'] | ['-128', '127', '0'] |
+| 1 | byte_null | tinyint | [None, 42] | ['NoneType', 'int'] | ['None', '42'] |
+| 2 | short_values | smallint | [-32768, 32767, 0] | ['int', 'int', 'int'] | ['-32768', '32767', '0'] |
+| 3 | short_null | smallint | [None, 123] | ['NoneType', 'int'] | ['None', '123'] |
+| 4 | int_values | int | [-2147483648, 2147483647, 0] | ['int', 'int', 'int'] | ['-2147483648', '2147483647', '0'] |
+| 5 | int_null | int | [None, 456] | ['NoneType', 'int'] | ['None', '456'] |
+| 6 | long_values | bigint | [-9223372036854775808, 9223372036854775807, 0] | ['int', 'int', 'int'] | ['-9223372036854775808', '9223372036854775807', '0'] |
+| 7 | long_null | bigint | [None, 789] | ['NoneType', 'int'] | ['None', '789'] |
+| 8 | float_values | float | [0.0, 1.0, 3.140000104904175] | ['float', 'float', 'float'] | ['0.0', '1.0', '3.140000104904175'] |
+| 9 | float_null | float | [None, 3.140000104904175] | ['NoneType', 'float'] | ['None', '3.140000104904175'] |
+| 10 | double_values | double | [0.0, 1.0, 0.3333333333333333] | ['float', 'float', 'float'] | ['0.0', '1.0', '0.3333333333333333'] |
+| 11 | double_null | double | [None, 2.71] | ['NoneType', 'float'] | ['None', '2.71'] |
+| 12 | decimal_values | decimal(3,2) | [Decimal('5.35'), Decimal('1.23')] | ['Decimal', 'Decimal'] | ['5.35', '1.23'] |
+| 13 | decimal_null | decimal(3,2) | [None, Decimal('9.99')] | ['NoneType', 'Decimal'] | ['None', '9.99'] |
+| 14 | string_values | string | ['abc', '', 'hello'] | ['str', 'str', 'str'] | ['abc', '', 'hello'] |
+| 15 | string_null | string | [None, 'test'] | ['NoneType', 'str'] | ['None', 'test'] |
+| 16 | binary_values | binary | [b'abc', b'', b'ABC'] | ['bytes', 'bytes', 'bytes'] | ["b'abc'", "b''", "b'ABC'"] |
+| 17 | binary_null | binary | [None, b'test'] | ['NoneType', 'bytes'] | ['None', "b'test'"] |
+| 18 | boolean_values | boolean | [True, False] | ['bool', 'bool'] | ['True', 'False'] |
+| 19 | boolean_null | boolean | [None, True] | ['NoneType', 'bool'] | ['None', 'True'] |
+| 20 | date_values | date | [datetime.date(2020, 2, 2), datetime.date(1970, 1, 1)] | ['date', 'date'] | ['2020-02-02', '1970-01-01'] |
+| 21 | date_null | date | [None, datetime.date(2023, 1, 1)] | ['NoneType', 'date'] | ['None', '2023-01-01'] |
+| 22 | timestamp_values | timestamp | [datetime.datetime(2020, 2, 2, 12, 15, 16, 123000)] | ['datetime'] | ['2020-02-02 12:15:16.123000'] |
+| 23 | timestamp_null | timestamp | [None, datetime.datetime(2023, 1, 1, 12, 0)] | ['NoneType', 'datetime'] | ['None', '2023-01-01 12:00:00'] |
+| 24 | array_int_values | array<int> | [[1, 2, 3], [], [1, None, 3]] | ['list', 'list', 'list'] | ['[1, 2, 3]', '[]', '[1, None, 3]'] |
+| 25 | array_int_null | array<int> | [None, [4, 5, 6]] | ['NoneType', 'list'] | ['None', '[4, 5, 6]'] |
+| 26 | map_str_int_values | map<string,int> | [{'world': 2, 'hello': 1}, {}] | ['dict', 'dict'] | ["{'world': 2, 'hello': 1}", '{}'] |
+| 27 | map_str_int_null | map<string,int> | [None, {'test': 123}] | ['NoneType', 'dict'] | ['None', "{'test': 123}"] |
+| 28 | struct_int_str_values | struct<a1:int,a2:string> | [Row(a1=1, a2='hello'), Row(a1=2, a2='world')] | ['Row', 'Row'] | ["Row(a1=1, a2='hello')", "Row(a1=2, a2='world')"] |
+| 29 | struct_int_str_null | struct<a1:int,a2:string> | [None, Row(a1=99, a2='test')] | ['NoneType', 'Row'] | ['None', "Row(a1=99, a2='test')"] |
+| 30 | array_array_int | array<array<int>> | [[[1, 2, 3]], [[1], [2, 3]]] | ['list', 'list'] | ['[[1, 2, 3]]', '[[1], [2, 3]]'] |
+| 31 | array_map_str_int | array<map<string,int>> | [[{'world': 2, 'hello': 1}], [{'a': 1}, {'b': 2}]] | ['list', 'list'] | ["[{'world': 2, 'hello': 1}]", "[{'a': 1}, {'b': 2}]"] |
+| 32 | array_struct_int_str | array<struct<a1:int,a2:string>> | [[Row(a1=1, a2='hello')], [Row(a1=1, a2='hello'), Row(a1=2, a2='world')]] | ['list', 'list'] | ["[Row(a1=1, a2='hello')]", "[Row(a1=1, a2='hello'), Row(a1=2, a2='world')]"] |
+| 33 | map_int_array_int | map<int,array<int>> | [{1: [1, 2, 3]}, {1: [1], 2: [2, 3]}] | ['dict', 'dict'] | ['{1: [1, 2, 3]}', '{1: [1], 2: [2, 3]}'] |
+| 34 | map_int_map_str_int | map<int,map<string,int>> | [{1: {'world': 2, 'hello': 1}}] | ['dict'] | ["{1: {'world': 2, 'hello': 1}}"] |
+| 35 | map_int_struct_int_str | map<int,struct<a1:int,a2:string>> | [{1: Row(a1=1, a2='hello')}] | ['dict'] | ["{1: Row(a1=1, a2='hello')}"] |
+| 36 | struct_int_array_int | struct<a:int,b:array<int>> | [Row(a=1, b=[1, 2, 3])] | ['Row'] | ['Row(a=1, b=[1, 2, 3])'] |
+| 37 | struct_int_map_str_int | struct<a:int,b:map<string,int>> | [Row(a=1, b={'world': 2, 'hello': 1})] | ['Row'] | ["Row(a=1, b={'world': 2, 'hello': 1})"] |
+| 38 | struct_int_struct_int_str | struct<a:int,b:struct<a1:int,a2:string>> | [Row(a=1, b=Row(a1=1, a2='hello'))] | ['Row'] | ["Row(a=1, b=Row(a1=1, a2='hello'))"] |
\ No newline at end of file
diff --git a/python/pyspark/sql/tests/coercion/golden_pandas_udf_input_type_coercion_with_arrow_and_pandas.csv b/python/pyspark/sql/tests/coercion/golden_pandas_udf_input_type_coercion_with_arrow_and_pandas.csv
new file mode 100644
index 000000000000..9ed7dcba95c8
--- /dev/null
+++ b/python/pyspark/sql/tests/coercion/golden_pandas_udf_input_type_coercion_with_arrow_and_pandas.csv
@@ -0,0 +1,40 @@
+    Test Case    Spark Type    Spark Value    Python Type    Python Value
+0    byte_values    tinyint    [-128, 127, 0]    ['int', 'int', 'int']    ['-128', '127', '0']
+1    byte_null    tinyint    [None, 42]    ['NAType', 'int8']    ['<NA>', '42']
+2    short_values    smallint    [-32768, 32767, 0]    ['int', 'int', 'int']    ['-32768', '32767', '0']
+3    short_null    smallint    [None, 123]    ['NAType', 'int16']    ['<NA>', '123']
+4    int_values    int    [-2147483648, 2147483647, 0]    ['int', 'int', 'int']    ['-2147483648', '2147483647', '0']
+5    int_null    int    [None, 456]    ['NAType', 'int32']    ['<NA>', '456']
+6    long_values    bigint    [-9223372036854775808, 9223372036854775807, 0]    ['int', 'int', 'int']    ['-9223372036854775808', '9223372036854775807', '0']
+7    long_null    bigint    [None, 789]    ['NAType', 'int64']    ['<NA>', '789']
+8    float_values    float    [0.0, 1.0, 3.140000104904175]    ['float', 'float', 'float']    ['0.0', '1.0', '3.140000104904175']
+9    float_null    float    [None, 3.140000104904175]    ['float', 'float']    ['nan', '3.140000104904175']
+10    double_values    double    [0.0, 1.0, 0.3333333333333333]    ['float', 'float', 'float']    ['0.0', '1.0', '0.3333333333333333']
+11    double_null    double    [None, 2.71]    ['float', 'float']    ['nan', '2.71']
+12    decimal_values    decimal(3,2)    [Decimal('5.35'), Decimal('1.23')]    ['Decimal', 'Decimal']    ['5.35', '1.23']
+13    decimal_null    decimal(3,2)    [None, Decimal('9.99')]    ['NoneType', 'Decimal']    ['None', '9.99']
+14    string_values    string    ['abc', '', 'hello']    ['str', 'str', 'str']    ['abc', '', 'hello']
+15    string_null    string    [None, 'test']    ['NoneType', 'str']    ['None', 'test']
+16    binary_values    binary    [b'abc', b'', b'ABC']    ['bytes', 'bytes', 'bytes']    "[""b'abc'"", ""b''"", ""b'ABC'""]"
+17    binary_null    binary    [None, b'test']    ['NoneType', 'bytes']    "['None', ""b'test'""]"
+18    boolean_values    boolean    [True, False]    ['bool', 'bool']    ['True', 'False']
+19    boolean_null    boolean    [None, True]    ['NoneType', 'bool']    ['None', 'True']
+20    date_values    date    [datetime.date(2020, 2, 2), datetime.date(1970, 1, 1)]    ['date', 'date']    ['2020-02-02', '1970-01-01']
+21    date_null    date    [None, datetime.date(2023, 1, 1)]    ['NoneType', 'date']    ['None', '2023-01-01']
+22    timestamp_values    timestamp    [datetime.datetime(2020, 2, 2, 12, 15, 16, 123000)]    ['Timestamp']    ['2020-02-02 12:15:16.123000']
+23    timestamp_null    timestamp    [None, datetime.datetime(2023, 1, 1, 12, 0)]    ['NaTType', 'Timestamp']    ['NaT', '2023-01-01 12:00:00']
+24    array_int_values    array<int>    [[1, 2, 3], [], [1, None, 3]]    ['list', 'list', 'list']    ['[1, 2, 3]', '[]', '[1, None, 3]']
+25    array_int_null    array<int>    [None, [4, 5, 6]]    ['NoneType', 'list']    ['None', '[np.int32(4), np.int32(5), np.int32(6)]']
+26    map_str_int_values    map<string,int>    [{'world': 2, 'hello': 1}, {}]    ['dict', 'dict']    "[""{'world': 2, 'hello': 1}"", '{}']"
+27    map_str_int_null    map<string,int>    [None, {'test': 123}]    ['NoneType', 'dict']    "['None', ""{'test': 123}""]"
+28    struct_int_str_values    struct<a1:int,a2:string>    [Row(a1=1, a2='hello'), Row(a1=2, a2='world')]    ['Row', 'Row']    "[""Row(a1=1, a2='hello')"", ""Row(a1=2, a2='world')""]"
+29    struct_int_str_null    struct<a1:int,a2:string>    [None, Row(a1=99, a2='test')]    ['NoneType', 'Row']    "['None', ""Row(a1=99, a2='test')""]"
+30    array_array_int    array<array<int>>    [[[1, 2, 3]], [[1], [2, 3]]]    ['list', 'list']    ['[[np.int32(1), np.int32(2), np.int32(3)]]', '[[np.int32(1)], [np.int32(2), np.int32(3)]]']
+31    array_map_str_int    array<map<string,int>>    [[{'world': 2, 'hello': 1}], [{'a': 1}, {'b': 2}]]    ['list', 'list']    "[""[{'world': 2, 'hello': 1}]"", ""[{'a': 1}, {'b': 2}]""]"
+32    array_struct_int_str    array<struct<a1:int,a2:string>>    [[Row(a1=1, a2='hello')], [Row(a1=1, a2='hello'), Row(a1=2, a2='world')]]    ['list', 'list']    "[""[Row(a1=1, a2='hello')]"", ""[Row(a1=1, a2='hello'), Row(a1=2, a2='world')]""]"
+33    map_int_array_int    map<int,array<int>>    [{1: [1, 2, 3]}, {1: [1], 2: [2, 3]}]    ['dict', 'dict']    ['{1: [np.int32(1), np.int32(2), np.int32(3)]}', '{1: [np.int32(1)], 2: [np.int32(2), np.int32(3)]}']
+34    map_int_map_str_int    map<int,map<string,int>>    [{1: {'world': 2, 'hello': 1}}]    ['dict']    "[""{1: {'world': 2, 'hello': 1}}""]"
+35    map_int_struct_int_str    map<int,struct<a1:int,a2:string>>    [{1: Row(a1=1, a2='hello')}]    ['dict']    "[""{1: Row(a1=1, a2='hello')}""]"
+36    struct_int_array_int    struct<a:int,b:array<int>>    [Row(a=1, b=[1, 2, 3])]    ['Row']    ['Row(a=1, b=[np.int32(1), np.int32(2), np.int32(3)])']
+37    struct_int_map_str_int    struct<a:int,b:map<string,int>>    [Row(a=1, b={'world': 2, 'hello': 1})]    ['Row']    "[""Row(a=1, b={'world': 2, 'hello': 1})""]"
+38    struct_int_struct_int_str    struct<a:int,b:struct<a1:int,a2:string>>    [Row(a=1, b=Row(a1=1, a2='hello'))]    ['Row']    "[""Row(a=1, b=Row(a1=1, a2='hello'))""]"
diff --git a/python/pyspark/sql/tests/coercion/golden_pandas_udf_input_type_coercion_with_arrow_and_pandas.md b/python/pyspark/sql/tests/coercion/golden_pandas_udf_input_type_coercion_with_arrow_and_pandas.md
new file mode 100644
index 000000000000..9b70503e679a
--- /dev/null
+++ b/python/pyspark/sql/tests/coercion/golden_pandas_udf_input_type_coercion_with_arrow_and_pandas.md
@@ -0,0 +1,41 @@
+|    | Test Case | Spark Type | Spark Value | Python Type | Python Value |
+|----|-----------|------------|-------------|-------------|--------------|
+| 0 | byte_values | tinyint | [-128, 127, 0] | ['int', 'int', 'int'] | ['-128', '127', '0'] |
+| 1 | byte_null | tinyint | [None, 42] | ['NAType', 'int8'] | ['<NA>', '42'] |
+| 2 | short_values | smallint | [-32768, 32767, 0] | ['int', 'int', 'int'] | ['-32768', '32767', '0'] |
+| 3 | short_null | smallint | [None, 123] | ['NAType', 'int16'] | ['<NA>', '123'] |
+| 4 | int_values | int | [-2147483648, 2147483647, 0] | ['int', 'int', 'int'] | ['-2147483648', '2147483647', '0'] |
+| 5 | int_null | int | [None, 456] | ['NAType', 'int32'] | ['<NA>', '456'] |
+| 6 | long_values | bigint | [-9223372036854775808, 9223372036854775807, 0] | ['int', 'int', 'int'] | ['-9223372036854775808', '9223372036854775807', '0'] |
+| 7 | long_null | bigint | [None, 789] | ['NAType', 'int64'] | ['<NA>', '789'] |
+| 8 | float_values | float | [0.0, 1.0, 3.140000104904175] | ['float', 'float', 'float'] | ['0.0', '1.0', '3.140000104904175'] |
+| 9 | float_null | float | [None, 3.140000104904175] | ['float', 'float'] | ['nan', '3.140000104904175'] |
+| 10 | double_values | double | [0.0, 1.0, 0.3333333333333333] | ['float', 'float', 'float'] | ['0.0', '1.0', '0.3333333333333333'] |
+| 11 | double_null | double | [None, 2.71] | ['float', 'float'] | ['nan', '2.71'] |
+| 12 | decimal_values | decimal(3,2) | [Decimal('5.35'), Decimal('1.23')] | ['Decimal', 'Decimal'] | ['5.35', '1.23'] |
+| 13 | decimal_null | decimal(3,2) | [None, Decimal('9.99')] | ['NoneType', 'Decimal'] | ['None', '9.99'] |
+| 14 | string_values | string | ['abc', '', 'hello'] | ['str', 'str', 'str'] | ['abc', '', 'hello'] |
+| 15 | string_null | string | [None, 'test'] | ['NoneType', 'str'] | ['None', 'test'] |
+| 16 | binary_values | binary | [b'abc', b'', b'ABC'] | ['bytes', 'bytes', 'bytes'] | ["b'abc'", "b''", "b'ABC'"] |
+| 17 | binary_null | binary | [None, b'test'] | ['NoneType', 'bytes'] | ['None', "b'test'"] |
+| 18 | boolean_values | boolean | [True, False] | ['bool', 'bool'] | ['True', 'False'] |
+| 19 | boolean_null | boolean | [None, True] | ['NoneType', 'bool'] | ['None', 'True'] |
+| 20 | date_values | date | [datetime.date(2020, 2, 2), datetime.date(1970, 1, 1)] | ['date', 'date'] | ['2020-02-02', '1970-01-01'] |
+| 21 | date_null | date | [None, datetime.date(2023, 1, 1)] | ['NoneType', 'date'] | ['None', '2023-01-01'] |
+| 22 | timestamp_values | timestamp | [datetime.datetime(2020, 2, 2, 12, 15, 16, 123000)] | ['Timestamp'] | ['2020-02-02 12:15:16.123000'] |
+| 23 | timestamp_null | timestamp | [None, datetime.datetime(2023, 1, 1, 12, 0)] | ['NaTType', 'Timestamp'] | ['NaT', '2023-01-01 12:00:00'] |
+| 24 | array_int_values | array<int> | [[1, 2, 3], [], [1, None, 3]] | ['list', 'list', 'list'] | ['[1, 2, 3]', '[]', '[1, None, 3]'] |
+| 25 | array_int_null | array<int> | [None, [4, 5, 6]] | ['NoneType', 'list'] | ['None', '[np.int32(4), np.int32(5), np.int32(6)]'] |
+| 26 | map_str_int_values | map<string,int> | [{'world': 2, 'hello': 1}, {}] | ['dict', 'dict'] | ["{'world': 2, 'hello': 1}", '{}'] |
+| 27 | map_str_int_null | map<string,int> | [None, {'test': 123}] | ['NoneType', 'dict'] | ['None', "{'test': 123}"] |
+| 28 | struct_int_str_values | struct<a1:int,a2:string> | [Row(a1=1, a2='hello'), Row(a1=2, a2='world')] | ['Row', 'Row'] | ["Row(a1=1, a2='hello')", "Row(a1=2, a2='world')"] |
+| 29 | struct_int_str_null | struct<a1:int,a2:string> | [None, Row(a1=99, a2='test')] | ['NoneType', 'Row'] | ['None', "Row(a1=99, a2='test')"] |
+| 30 | array_array_int | array<array<int>> | [[[1, 2, 3]], [[1], [2, 3]]] | ['list', 'list'] | ['[[np.int32(1), np.int32(2), np.int32(3)]]', '[[np.int32(1)], [np.int32(2), np.int32(3)]]'] |
+| 31 | array_map_str_int | array<map<string,int>> | [[{'world': 2, 'hello': 1}], [{'a': 1}, {'b': 2}]] | ['list', 'list'] | ["[{'world': 2, 'hello': 1}]", "[{'a': 1}, {'b': 2}]"] |
+| 32 | array_struct_int_str | array<struct<a1:int,a2:string>> | [[Row(a1=1, a2='hello')], [Row(a1=1, a2='hello'), Row(a1=2, a2='world')]] | ['list', 'list'] | ["[Row(a1=1, a2='hello')]", "[Row(a1=1, a2='hello'), Row(a1=2, a2='world')]"] |
+| 33 | map_int_array_int | map<int,array<int>> | [{1: [1, 2, 3]}, {1: [1], 2: [2, 3]}] | ['dict', 'dict'] | ['{1: [np.int32(1), np.int32(2), np.int32(3)]}', '{1: [np.int32(1)], 2: [np.int32(2), np.int32(3)]}'] |
+| 34 | map_int_map_str_int | map<int,map<string,int>> | [{1: {'world': 2, 'hello': 1}}] | ['dict'] | ["{1: {'world': 2, 'hello': 1}}"] |
+| 35 | map_int_struct_int_str | map<int,struct<a1:int,a2:string>> | [{1: Row(a1=1, a2='hello')}] | ['dict'] | ["{1: Row(a1=1, a2='hello')}"] |
+| 36 | struct_int_array_int | struct<a:int,b:array<int>> | [Row(a=1, b=[1, 2, 3])] | ['Row'] | ['Row(a=1, b=[np.int32(1), np.int32(2), np.int32(3)])'] |
+| 37 | struct_int_map_str_int | struct<a:int,b:map<string,int>> | [Row(a=1, b={'world': 2, 'hello': 1})] | ['Row'] | ["Row(a=1, b={'world': 2, 'hello': 1})"] |
+| 38 | struct_int_struct_int_str | struct<a:int,b:struct<a1:int,a2:string>> | [Row(a=1, b=Row(a1=1, a2='hello'))] | ['Row'] | ["Row(a=1, b=Row(a1=1, a2='hello'))"] |
\ No newline at end of file
diff --git a/python/pyspark/sql/tests/udf_type_tests/test_udf_input_types.py b/python/pyspark/sql/tests/coercion/test_python_udf_input_type.py
similarity index 60%
copy from python/pyspark/sql/tests/udf_type_tests/test_udf_input_types.py
copy to python/pyspark/sql/tests/coercion/test_python_udf_input_type.py
index d8cf8466dcf3..5e88b546b0c6 100644
--- a/python/pyspark/sql/tests/udf_type_tests/test_udf_input_types.py
+++ b/python/pyspark/sql/tests/coercion/test_python_udf_input_type.py
@@ -15,14 +15,15 @@
# limitations under the License.
#
+from decimal import Decimal
+import datetime
import os
-import platform
+import time
import unittest
-import pandas as pd
-from pyspark.sql import Row
-from pyspark.sql.functions import udf, pandas_udf
+from pyspark.sql.functions import udf
from pyspark.sql.types import (
+ Row,
ArrayType,
BinaryType,
BooleanType,
@@ -50,216 +51,57 @@ from pyspark.testing.utils import (
numpy_requirement_message,
)
from pyspark.testing.sqlutils import ReusedSQLTestCase
-from .type_table_utils import generate_table_diff, format_type_table
if have_numpy:
import numpy as np
+if have_pandas:
+ import pandas as pd
+
+# If you need to re-generate the golden files, you need to set the
+# SPARK_GENERATE_GOLDEN_FILES=1 environment variable before running this test,
+# e.g.:
+# SPARK_GENERATE_GOLDEN_FILES=1 python/run-tests \
+#   --testnames 'pyspark.sql.tests.coercion.test_python_udf_input_type'
+# If package tabulate https://pypi.org/project/tabulate/ is installed,
+# it will also re-generate the Markdown files.
@unittest.skipIf(
not have_pandas
or not have_pyarrow
or not have_numpy
- or LooseVersion(np.__version__) < LooseVersion("2.0.0")
- or platform.system() == "Darwin",
- pandas_requirement_message
- or pyarrow_requirement_message
- or numpy_requirement_message
- or "float128 not supported on macos",
+ or LooseVersion(np.__version__) < LooseVersion("2.0.0"),
+    pandas_requirement_message or pyarrow_requirement_message or numpy_requirement_message,
)
-class UDFInputTypeTests(ReusedSQLTestCase):
+class PandasUDFInputTypeTests(ReusedSQLTestCase):
@classmethod
def setUpClass(cls):
super().setUpClass()
- def setUp(self):
- super().setUp()
-
- def test_udf_input_types_arrow_disabled(self):
- golden_file = os.path.join(
-            os.path.dirname(__file__), "golden_udf_input_types_arrow_disabled.txt"
- )
- self._run_udf_input_type_coercion_test(
- config={},
- use_arrow=False,
- golden_file=golden_file,
- test_name="UDF input types - Arrow disabled",
- )
-
- def test_udf_input_types_arrow_legacy_pandas(self):
- golden_file = os.path.join(
-            os.path.dirname(__file__), "golden_udf_input_types_arrow_legacy_pandas.txt"
- )
- self._run_udf_input_type_coercion_test(
-            config={"spark.sql.legacy.execution.pythonUDF.pandas.conversion.enabled": "true"},
- use_arrow=True,
- golden_file=golden_file,
- test_name="UDF input types - Arrow with legacy pandas",
- )
-
- def test_udf_input_types_arrow_enabled(self):
- golden_file = os.path.join(
-            os.path.dirname(__file__), "golden_udf_input_types_arrow_enabled.txt"
- )
- self._run_udf_input_type_coercion_test(
-            config={"spark.sql.legacy.execution.pythonUDF.pandas.conversion.enabled": "false"},
- use_arrow=True,
- golden_file=golden_file,
- test_name="UDF input types - Arrow enabled",
- )
-
-    def _run_udf_input_type_coercion_test(self, config, use_arrow, golden_file, test_name):
- with self.sql_conf(config):
- results = self._generate_udf_input_type_coercion_results(use_arrow)
- actual_output = format_type_table(
- results,
-                ["Test Case", "Spark Type", "Spark Value", "Python Type", "Python Value"],
- column_width=85,
- )
-            self._compare_or_create_golden_file(actual_output, golden_file, test_name)
-
- def _generate_udf_input_type_coercion_results(self, use_arrow):
- results = []
- test_cases = self._get_input_type_test_cases()
-
- for test_name, spark_type, data_func in test_cases:
- input_df = data_func(spark_type).repartition(1)
- input_data = [row["value"] for row in input_df.collect()]
-            result_row = [test_name, spark_type.simpleString(), str(input_data)]
-
- try:
-
- def type_udf(x):
- if x is None:
- return "NoneType"
- else:
- return type(x).__name__
-
- def value_udf(x):
- return x
-
- def value_str(x):
- return str(x)
-
-                type_test_udf = udf(type_udf, returnType=StringType(), useArrow=use_arrow)
-                value_test_udf = udf(value_udf, returnType=spark_type, useArrow=use_arrow)
-                value_str_udf = udf(value_str, returnType=StringType(), useArrow=use_arrow)
-
- result_df = input_df.select(
- value_test_udf("value").alias("python_value"),
- type_test_udf("value").alias("python_type"),
- value_str_udf("value").alias("python_value_str"),
- )
- results_data = result_df.collect()
- values = [row["python_value"] for row in results_data]
- types = [row["python_type"] for row in results_data]
- values_str = [row["python_value_str"] for row in results_data]
-
- # Assert that the UDF output values match the input values
-                assert values == input_data, f"Input {values} != output {input_data}"
-
- result_row.append(str(types))
- result_row.append(str(values_str).replace("\n", " "))
-
- except Exception as e:
- print("error_msg", e)
-                # Clean up exception message to remove newlines and extra whitespace
- error_msg = str(e).replace("\n", " ").replace("\r", " ")
- result_row.append(f"✗ {error_msg}")
-
- results.append(result_row)
-
- return results
-
- def test_pandas_udf_input(self):
-        golden_file = os.path.join(os.path.dirname(__file__), "golden_pandas_udf_input_types.txt")
- results = self._generate_pandas_udf_input_type_coercion_results()
- actual_output = format_type_table(
- results,
-            ["Test Case", "Spark Type", "Spark Value", "Python Type", "Python Value"],
- column_width=85,
- )
-        self._compare_or_create_golden_file(actual_output, golden_file, "Pandas UDF input types")
-
- def _generate_pandas_udf_input_type_coercion_results(self):
- results = []
- test_cases = self._get_input_type_test_cases()
-
- for test_name, spark_type, data_func in test_cases:
- input_df = data_func(spark_type).repartition(1)
- input_data = [row["value"] for row in input_df.collect()]
-            result_row = [test_name, spark_type.simpleString(), str(input_data)]
-
- try:
-
- def type_pandas_udf(data):
- if hasattr(data, "dtype"):
- # Series case
- return pd.Series([str(data.dtype)] * len(data))
- else:
- # DataFrame case (for struct types)
-                        return pd.Series([str(type(data).__name__)] * len(data))
-
- def value_pandas_udf(series):
- return series
-
-                type_test_pandas_udf = pandas_udf(type_pandas_udf, returnType=StringType())
-                value_test_pandas_udf = pandas_udf(value_pandas_udf, returnType=spark_type)
-
- result_df = input_df.select(
- value_test_pandas_udf("value").alias("python_value"),
- type_test_pandas_udf("value").alias("python_type"),
- )
- results_data = result_df.collect()
- values = [row["python_value"] for row in results_data]
- types = [row["python_type"] for row in results_data]
+ # Synchronize default timezone between Python and Java
+ cls.tz_prev = os.environ.get("TZ", None) # save current tz if set
+ tz = "America/Los_Angeles"
+ os.environ["TZ"] = tz
+ time.tzset()
- result_row.append(str(types))
- result_row.append(str(values).replace("\n", " "))
+ cls.sc.environment["TZ"] = tz
+ cls.spark.conf.set("spark.sql.session.timeZone", tz)
- except Exception as e:
- print("error_msg", e)
- error_msg = str(e).replace("\n", " ").replace("\r", " ")
- result_row.append(f"✗ {error_msg}")
-
- results.append(result_row)
-
- return results
-
-    def _compare_or_create_golden_file(self, actual_output, golden_file, test_name):
-        """Compare actual output with golden file or create golden file if it doesn't exist.
-
- Args:
- actual_output: The actual output to compare
- golden_file: Path to the golden file
- test_name: Name of the test for error messages
- """
- if os.path.exists(golden_file):
- with open(golden_file, "r") as f:
- expected_output = f.read()
-
- if actual_output != expected_output:
-                diff_output = generate_table_diff(actual_output, expected_output, cell_width=85)
- self.fail(
- f"""
- Results don't match golden file for :{test_name}.\n
- Diff:\n{diff_output}
- """
- )
- else:
- with open(golden_file, "w") as f:
- f.write(actual_output)
-            self.fail(f"Golden file created for {test_name}. Please review and re-run the test.")
+ @classmethod
+ def tearDownClass(cls):
+ del os.environ["TZ"]
+ if cls.tz_prev is not None:
+ os.environ["TZ"] = cls.tz_prev
+ time.tzset()
- def _create_value_schema(self, data_type):
-        """Helper to create a StructType schema with a single 'value' column of the given type."""
- return StructType([StructField("value", data_type, True)])
+ super().tearDownClass()
- def _get_input_type_test_cases(self):
- from pyspark.sql.types import StructType, StructField
- import datetime
- from decimal import Decimal
+ @property
+ def prefix(self):
+ return "golden_pandas_udf_input_type_coercion"
+ @property
+ def test_cases(self):
def df(args):
def create_df(data_type):
# For StructType where the data contains Row objects (not
wrapped in tuples)
@@ -413,6 +255,134 @@ class UDFInputTypeTests(ReusedSQLTestCase):
),
]
+ def test_python_input_type_coercion_vanilla(self):
+ self._run_udf_input_type_coercion(
+ use_arrow=False,
+ legacy_pandas=False,
+ golden_file=f"{self.prefix}_vanilla",
+ test_name="Vanilla Python UDF",
+ )
+
+ def test_python_input_type_coercion_with_arrow(self):
+ self._run_udf_input_type_coercion(
+ use_arrow=True,
+ legacy_pandas=False,
+ golden_file=f"{self.prefix}_with_arrow",
+ test_name="Arrow Optimized Python UDF",
+ )
+
+ def test_python_input_type_coercion_with_arrow_and_pandas(self):
+ self._run_udf_input_type_coercion(
+ use_arrow=True,
+ legacy_pandas=True,
+ golden_file=f"{self.prefix}_with_arrow_and_pandas",
+            test_name="Arrow Optimized Python UDF with Legacy Pandas Conversion",
+ )
+
+    def _run_udf_input_type_coercion(self, use_arrow, legacy_pandas, golden_file, test_name):
+ with self.sql_conf(
+ {
+ "spark.sql.execution.pythonUDF.arrow.enabled": use_arrow,
+                "spark.sql.legacy.execution.pythonUDF.pandas.conversion.enabled": legacy_pandas,
+ }
+ ):
+ self._compare_or_generate_golden(golden_file, test_name)
+
+ def _compare_or_generate_golden(self, golden_file, test_name):
+ testing = os.environ.get("SPARK_GENERATE_GOLDEN_FILES", "?") != "1"
+
+        golden_csv = os.path.join(os.path.dirname(__file__), f"{golden_file}.csv")
+        golden_md = os.path.join(os.path.dirname(__file__), f"{golden_file}.md")
+
+ golden = None
+ if testing:
+ golden = pd.read_csv(
+ golden_csv,
+ sep="\t",
+ index_col=0,
+ dtype="str",
+ na_filter=False,
+ engine="python",
+ )
+
+ results = []
+        for idx, (test_name, spark_type, data_func) in enumerate(self.test_cases):
+ input_df = data_func(spark_type).repartition(1)
+ input_data = [row["value"] for row in input_df.collect()]
+ result = [test_name, spark_type.simpleString(), str(input_data)]
+
+ try:
+
+ def type_udf(x):
+ if x is None:
+ return "NoneType"
+ else:
+ return type(x).__name__
+
+ def value_udf(x):
+ return x
+
+ def value_str(x):
+ return str(x)
+
+ type_test_udf = udf(type_udf, returnType=StringType())
+ value_test_udf = udf(value_udf, returnType=spark_type)
+ value_str_udf = udf(value_str, returnType=StringType())
+
+ result_df = input_df.select(
+ value_test_udf("value").alias("python_value"),
+ type_test_udf("value").alias("python_type"),
+ value_str_udf("value").alias("python_value_str"),
+ )
+ results_data = result_df.collect()
+ values = [row["python_value"] for row in results_data]
+ types = [row["python_type"] for row in results_data]
+ values_str = [row["python_value_str"] for row in results_data]
+
+ # Assert that the UDF output values match the input values
+                assert values == input_data, f"Input {values} != output {input_data}"
+
+ result.append(str(types))
+ result.append(str(values_str).replace("\n", " "))
+
+ except Exception as e:
+ print("error_msg", e)
+                # Clean up exception message to remove newlines and extra whitespace
+                e = str(e).replace("\n", " ").replace("\r", " ").replace("\t", " ")
+ result.append(f"✗ {e}")
+
+ error_msg = None
+ if testing and result != list(golden.iloc[idx]):
+                error_msg = f"line mismatch: expects {list(golden.iloc[idx])} but got {result}"
+
+ results.append((result, error_msg))
+
+ if testing:
+ errs = []
+ for _, err in results:
+ if err is not None:
+ errs.append(err)
+ self.assertTrue(len(errs) == 0, "\n" + "\n".join(errs) + "\n")
+
+ else:
+ new_golden = pd.DataFrame(
+ [res for res, _ in results],
+                columns=["Test Case", "Spark Type", "Spark Value", "Python Type", "Python Value"],
+ )
+
+ # generating the CSV file as the golden file
+ new_golden.to_csv(golden_csv, sep="\t", header=True, index=True)
+
+ try:
+ # generating the GitHub flavored Markdown file
+ # package tabulate is required
+                new_golden.to_markdown(golden_md, index=True, tablefmt="github")
+ except Exception as e:
+ print(
+                    f"{test_name} input type coercion: "
+                    f"failed to write the markdown file due to {e}!"
+ )
+
if __name__ == "__main__":
from pyspark.testing import main
diff --git a/python/pyspark/sql/tests/udf_type_tests/golden_udf_input_types_arrow_disabled.txt b/python/pyspark/sql/tests/udf_type_tests/golden_udf_input_types_arrow_disabled.txt
deleted file mode 100644
index a3727dfd5d6b..000000000000
--- a/python/pyspark/sql/tests/udf_type_tests/golden_udf_input_types_arrow_disabled.txt
+++ /dev/null
@@ -1,43 +0,0 @@
-+-----------+-----------+-----------+-----------+-----------+
-|Test Case |Spark Type |Spark Value |Python Type |Python Value |
-+-----------+-----------+-----------+-----------+-----------+
-|byte_values |tinyint |[-128, 127, 0] |['int', 'int', 'int'] |['-128', '127', '0'] |
-|byte_null |tinyint |[None, 42] |['NoneType', 'int'] |['None', '42'] |
-|short_values |smallint |[-32768, 32767, 0] |['int', 'int', 'int'] |['-32768', '32767', '0'] |
-|short_null |smallint |[None, 123] |['NoneType', 'int'] |['None', '123'] |
-|int_values |int |[-2147483648, 2147483647, 0] |['int', 'int', 'int'] |['-2147483648', '2147483647', '0'] |
-|int_null |int |[None, 456] |['NoneType', 'int'] |['None', '456'] |
-|long_values |bigint |[-9223372036854775808, 9223372036854775807, 0] |['int', 'int', 'int'] |['-9223372036854775808', '9223372036854775807', '0'] |
-|long_null |bigint |[None, 789] |['NoneType', 'int'] |['None', '789'] |
-|float_values |float |[0.0, 1.0, 3.140000104904175] |['float', 'float', 'float'] |['0.0', '1.0', '3.140000104904175'] |
-|float_null |float |[None, 3.140000104904175] |['NoneType', 'float'] |['None', '3.140000104904175'] |
-|double_values |double |[0.0, 1.0, 0.3333333333333333] |['float', 'float', 'float'] |['0.0', '1.0', '0.3333333333333333'] |
-|double_null |double |[None, 2.71] |['NoneType', 'float'] |['None', '2.71'] |
-|decimal_values |decimal(3,2) |[Decimal('5.35'), Decimal('1.23')] |['Decimal', 'Decimal'] |['5.35', '1.23'] |
-|decimal_null |decimal(3,2) |[None, Decimal('9.99')] |['NoneType', 'Decimal'] |['None', '9.99'] |
-|string_values |string |['abc', '', 'hello'] |['str', 'str', 'str'] |['abc', '', 'hello'] |
-|string_null |string |[None, 'test'] |['NoneType', 'str'] |['None', 'test'] |
-|binary_values |binary |[b'abc', b'', b'ABC'] |['bytes', 'bytes', 'bytes'] |["b'abc'", "b''", "b'ABC'"] |
-|binary_null |binary |[None, b'test'] |['NoneType', 'bytes'] |['None', "b'test'"] |
-|boolean_values |boolean |[True, False] |['bool', 'bool'] |['True', 'False'] |
-|boolean_null |boolean |[None, True] |['NoneType', 'bool'] |['None', 'True'] |
-|date_values |date |[datetime.date(2020, 2, 2), datetime.date(1970, 1, 1)] |['date', 'date'] |['2020-02-02', '1970-01-01'] |
-|date_null |date |[None, datetime.date(2023, 1, 1)] |['NoneType', 'date'] |['None', '2023-01-01'] |
-|timestamp_values |timestamp |[datetime.datetime(2020, 2, 2, 12, 15, 16, 123000)] |['datetime'] |['2020-02-02 12:15:16.123000'] |
-|timestamp_null |timestamp |[None, datetime.datetime(2023, 1, 1, 12, 0)] |['NoneType', 'datetime'] |['None', '2023-01-01 12:00:00'] |
-|array_int_values |array<int> |[[1, 2, 3], [], [1, None, 3]] |['list', 'list', 'list'] |['[1, 2, 3]', '[]', '[1, None, 3]'] |
-|array_int_null |array<int> |[None, [4, 5, 6]] |['NoneType', 'list'] |['None', '[4, 5, 6]'] |
-|map_str_int_values |map<string,int> |[{'world': 2, 'hello': 1}, {}] |['dict', 'dict'] |["{'world': 2, 'hello': 1}", '{}'] |
-|map_str_int_null |map<string,int> |[None, {'test': 123}] |['NoneType', 'dict'] |['None', "{'test': 123}"] |
-|struct_int_str_values |struct<a1:int,a2:string> |[Row(a1=1, a2='hello'), Row(a1=2, a2='world')] |['Row', 'Row'] |["Row(a1=1, a2='hello')", "Row(a1=2, a2='world')"] |
-|struct_int_str_null |struct<a1:int,a2:string> |[None, Row(a1=99, a2='test')] |['NoneType', 'Row'] |['None', "Row(a1=99, a2='test')"] |
-|array_array_int |array<array<int>> |[[[1, 2, 3]], [[1], [2, 3]]] |['list', 'list'] |['[[1, 2, 3]]', '[[1], [2, 3]]'] |
-|array_map_str_int |array<map<string,int>> |[[{'world': 2, 'hello': 1}], [{'a': 1}, {'b': 2}]] |['list', 'list'] |["[{'world': 2, 'hello': 1}]", "[{'a': 1}, {'b': 2}]"] |
-|array_struct_int_str |array<struct<a1:int,a2:string>> |[[Row(a1=1, a2='hello')], [Row(a1=1, a2='hello'), Row(a1=2, a2='world')]] |['list', 'list'] |["[Row(a1=1, a2='hello')]", "[Row(a1=1, a2='hello'), Row(a1=2, a2='world')]"] |
-|map_int_array_int |map<int,array<int>> |[{1: [1, 2, 3]}, {1: [1], 2: [2, 3]}] |['dict', 'dict'] |['{1: [1, 2, 3]}', '{1: [1], 2: [2, 3]}'] |
-|map_int_map_str_int |map<int,map<string,int>> |[{1: {'world': 2, 'hello': 1}}] |['dict'] |["{1: {'world': 2, 'hello': 1}}"] |
-|map_int_struct_int_str |map<int,struct<a1:int,a2:string>> |[{1: Row(a1=1, a2='hello')}] |['dict'] |["{1: Row(a1=1, a2='hello')}"] |
-|struct_int_array_int |struct<a:int,b:array<int>> |[Row(a=1, b=[1, 2, 3])] |['Row'] |['Row(a=1, b=[1, 2, 3])'] |
-|struct_int_map_str_int |struct<a:int,b:map<string,int>> |[Row(a=1, b={'world': 2, 'hello': 1})] |['Row'] |["Row(a=1, b={'world': 2, 'hello': 1})"] |
-|struct_int_struct_int_str |struct<a:int,b:struct<a1:int,a2:string>> |[Row(a=1, b=Row(a1=1, a2='hello'))] |['Row'] |["Row(a=1, b=Row(a1=1, a2='hello'))"] |
-+-----------+-----------+-----------+-----------+-----------+
\ No newline at end of file
diff --git a/python/pyspark/sql/tests/udf_type_tests/golden_udf_input_types_arrow_enabled.txt b/python/pyspark/sql/tests/udf_type_tests/golden_udf_input_types_arrow_enabled.txt
deleted file mode 100644
index a3727dfd5d6b..000000000000
--- a/python/pyspark/sql/tests/udf_type_tests/golden_udf_input_types_arrow_enabled.txt
+++ /dev/null
@@ -1,43 +0,0 @@
-+-----------+-----------+-----------+-----------+-----------+
-|Test Case |Spark Type |Spark Value |Python Type |Python Value |
-+-----------+-----------+-----------+-----------+-----------+
-|byte_values |tinyint |[-128, 127, 0] |['int', 'int', 'int'] |['-128', '127', '0'] |
-|byte_null |tinyint |[None, 42] |['NoneType', 'int'] |['None', '42'] |
-|short_values |smallint |[-32768, 32767, 0] |['int', 'int', 'int'] |['-32768', '32767', '0'] |
-|short_null |smallint |[None, 123] |['NoneType', 'int'] |['None', '123'] |
-|int_values |int |[-2147483648, 2147483647, 0] |['int', 'int', 'int'] |['-2147483648', '2147483647', '0'] |
-|int_null |int |[None, 456] |['NoneType', 'int'] |['None', '456'] |
-|long_values |bigint |[-9223372036854775808, 9223372036854775807, 0] |['int', 'int', 'int'] |['-9223372036854775808', '9223372036854775807', '0'] |
-|long_null |bigint |[None, 789] |['NoneType', 'int'] |['None', '789'] |
-|float_values |float |[0.0, 1.0, 3.140000104904175] |['float', 'float', 'float'] |['0.0', '1.0', '3.140000104904175'] |
-|float_null |float |[None, 3.140000104904175] |['NoneType', 'float'] |['None', '3.140000104904175'] |
-|double_values |double |[0.0, 1.0, 0.3333333333333333] |['float', 'float', 'float'] |['0.0', '1.0', '0.3333333333333333'] |
-|double_null |double |[None, 2.71] |['NoneType', 'float'] |['None', '2.71'] |
-|decimal_values |decimal(3,2) |[Decimal('5.35'), Decimal('1.23')] |['Decimal', 'Decimal'] |['5.35', '1.23'] |
-|decimal_null |decimal(3,2) |[None, Decimal('9.99')] |['NoneType', 'Decimal'] |['None', '9.99'] |
-|string_values |string |['abc', '', 'hello'] |['str', 'str', 'str'] |['abc', '', 'hello'] |
-|string_null |string |[None, 'test'] |['NoneType', 'str'] |['None', 'test'] |
-|binary_values |binary |[b'abc', b'', b'ABC'] |['bytes', 'bytes', 'bytes'] |["b'abc'", "b''", "b'ABC'"] |
-|binary_null |binary |[None, b'test'] |['NoneType', 'bytes'] |['None', "b'test'"] |
-|boolean_values |boolean |[True, False] |['bool', 'bool'] |['True', 'False'] |
-|boolean_null |boolean |[None, True] |['NoneType', 'bool'] |['None', 'True'] |
-|date_values |date |[datetime.date(2020, 2, 2), datetime.date(1970, 1, 1)] |['date', 'date'] |['2020-02-02', '1970-01-01'] |
-|date_null |date |[None, datetime.date(2023, 1, 1)] |['NoneType', 'date'] |['None', '2023-01-01'] |
-|timestamp_values |timestamp |[datetime.datetime(2020, 2, 2, 12, 15, 16, 123000)] |['datetime'] |['2020-02-02 12:15:16.123000'] |
-|timestamp_null |timestamp |[None, datetime.datetime(2023, 1, 1, 12, 0)] |['NoneType', 'datetime'] |['None', '2023-01-01 12:00:00'] |
-|array_int_values |array<int> |[[1, 2, 3], [], [1, None, 3]] |['list', 'list', 'list'] |['[1, 2, 3]', '[]', '[1, None, 3]'] |
-|array_int_null |array<int> |[None, [4, 5, 6]] |['NoneType', 'list'] |['None', '[4, 5, 6]'] |
-|map_str_int_values |map<string,int> |[{'world': 2, 'hello': 1}, {}] |['dict', 'dict'] |["{'world': 2, 'hello': 1}", '{}'] |
-|map_str_int_null |map<string,int> |[None, {'test': 123}] |['NoneType', 'dict'] |['None', "{'test': 123}"] |
-|struct_int_str_values |struct<a1:int,a2:string> |[Row(a1=1, a2='hello'), Row(a1=2, a2='world')] |['Row', 'Row'] |["Row(a1=1, a2='hello')", "Row(a1=2, a2='world')"] |
-|struct_int_str_null |struct<a1:int,a2:string> |[None, Row(a1=99, a2='test')] |['NoneType', 'Row'] |['None', "Row(a1=99, a2='test')"] |
-|array_array_int |array<array<int>> |[[[1, 2, 3]], [[1], [2, 3]]] |['list', 'list'] |['[[1, 2, 3]]', '[[1], [2, 3]]'] |
-|array_map_str_int |array<map<string,int>> |[[{'world': 2, 'hello': 1}], [{'a': 1}, {'b': 2}]] |['list', 'list'] |["[{'world': 2, 'hello': 1}]", "[{'a': 1}, {'b': 2}]"] |
-|array_struct_int_str |array<struct<a1:int,a2:string>> |[[Row(a1=1, a2='hello')], [Row(a1=1, a2='hello'), Row(a1=2, a2='world')]] |['list', 'list'] |["[Row(a1=1, a2='hello')]", "[Row(a1=1, a2='hello'), Row(a1=2, a2='world')]"] |
-|map_int_array_int |map<int,array<int>> |[{1: [1, 2, 3]}, {1: [1], 2: [2, 3]}] |['dict', 'dict'] |['{1: [1, 2, 3]}', '{1: [1], 2: [2, 3]}'] |
-|map_int_map_str_int |map<int,map<string,int>> |[{1: {'world': 2, 'hello': 1}}] |['dict'] |["{1: {'world': 2, 'hello': 1}}"] |
-|map_int_struct_int_str |map<int,struct<a1:int,a2:string>> |[{1: Row(a1=1, a2='hello')}] |['dict'] |["{1: Row(a1=1, a2='hello')}"] |
-|struct_int_array_int |struct<a:int,b:array<int>> |[Row(a=1, b=[1, 2, 3])] |['Row'] |['Row(a=1, b=[1, 2, 3])'] |
-|struct_int_map_str_int |struct<a:int,b:map<string,int>> |[Row(a=1, b={'world': 2, 'hello': 1})] |['Row'] |["Row(a=1, b={'world': 2, 'hello': 1})"] |
-|struct_int_struct_int_str |struct<a:int,b:struct<a1:int,a2:string>> |[Row(a=1, b=Row(a1=1, a2='hello'))] |['Row'] |["Row(a=1, b=Row(a1=1, a2='hello'))"] |
-+-----------+-----------+-----------+-----------+-----------+
\ No newline at end of file
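The deleted golden table above records what a row-at-a-time Python UDF sees for each Spark type when Arrow is enabled. A minimal sketch of how one such row is produced (illustrative only, assuming a running SparkSession named `spark`; the suite's real helpers live in test_udf_input_types.py below):

    from pyspark.sql.functions import udf
    from pyspark.sql.types import StringType

    # Report the Python type name each input value arrives as ('NoneType' for NULL).
    type_name = udf(lambda x: type(x).__name__, StringType(), useArrow=True)

    df = spark.createDataFrame([(None,), (42,)], "value tinyint")
    df.select(type_name("value")).show()
    # Matches the byte_null row above: NoneType, int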
diff --git a/python/pyspark/sql/tests/udf_type_tests/golden_udf_input_types_arrow_legacy_pandas.txt b/python/pyspark/sql/tests/udf_type_tests/golden_udf_input_types_arrow_legacy_pandas.txt
deleted file mode 100644
index 5a896aa40916..000000000000
--- a/python/pyspark/sql/tests/udf_type_tests/golden_udf_input_types_arrow_legacy_pandas.txt
+++ /dev/null
@@ -1,43 +0,0 @@
-+--------------------+--------------------+--------------------+--------------------+--------------------+
-|Test Case|Spark Type|Spark Value|Python Type|Python Value|
-+--------------------+--------------------+--------------------+--------------------+--------------------+
-|byte_values|tinyint|[-128, 127, 0]|['int', 'int', 'int']|['-128', '127', '0']|
-|byte_null|tinyint|[None, 42]|['NAType', 'int8']|['<NA>', '42']|
-|short_values|smallint|[-32768, 32767, 0]|['int', 'int', 'int']|['-32768', '32767', '0']|
-|short_null|smallint|[None, 123]|['NAType', 'int16']|['<NA>', '123']|
-|int_values|int|[-2147483648, 2147483647, 0]|['int', 'int', 'int']|['-2147483648', '2147483647', '0']|
-|int_null|int|[None, 456]|['NAType', 'int32']|['<NA>', '456']|
-|long_values|bigint|[-9223372036854775808, 9223372036854775807, 0]|['int', 'int', 'int']|['-9223372036854775808', '9223372036854775807', '0']|
-|long_null|bigint|[None, 789]|['NAType', 'int64']|['<NA>', '789']|
-|float_values|float|[0.0, 1.0, 3.140000104904175]|['float', 'float', 'float']|['0.0', '1.0', '3.140000104904175']|
-|float_null|float|[None, 3.140000104904175]|['float', 'float']|['nan', '3.140000104904175']|
-|double_values|double|[0.0, 1.0, 0.3333333333333333]|['float', 'float', 'float']|['0.0', '1.0', '0.3333333333333333']|
-|double_null|double|[None, 2.71]|['float', 'float']|['nan', '2.71']|
-|decimal_values|decimal(3,2)|[Decimal('5.35'), Decimal('1.23')]|['Decimal', 'Decimal']|['5.35', '1.23']|
-|decimal_null|decimal(3,2)|[None, Decimal('9.99')]|['NoneType', 'Decimal']|['None', '9.99']|
-|string_values|string|['abc', '', 'hello']|['str', 'str', 'str']|['abc', '', 'hello']|
-|string_null|string|[None, 'test']|['NoneType', 'str']|['None', 'test']|
-|binary_values|binary|[b'abc', b'', b'ABC']|['bytes', 'bytes', 'bytes']|["b'abc'", "b''", "b'ABC'"]|
-|binary_null|binary|[None, b'test']|['NoneType', 'bytes']|['None', "b'test'"]|
-|boolean_values|boolean|[True, False]|['bool', 'bool']|['True', 'False']|
-|boolean_null|boolean|[None, True]|['NoneType', 'bool']|['None', 'True']|
-|date_values|date|[datetime.date(2020, 2, 2), datetime.date(1970, 1, 1)]|['date', 'date']|['2020-02-02', '1970-01-01']|
-|date_null|date|[None, datetime.date(2023, 1, 1)]|['NoneType', 'date']|['None', '2023-01-01']|
-|timestamp_values|timestamp|[datetime.datetime(2020, 2, 2, 12, 15, 16, 123000)]|['Timestamp']|['2020-02-02 12:15:16.123000']|
-|timestamp_null|timestamp|[None, datetime.datetime(2023, 1, 1, 12, 0)]|['NaTType', 'Timestamp']|['NaT', '2023-01-01 12:00:00']|
-|array_int_values|array<int>|[[1, 2, 3], [], [1, None, 3]]|['list', 'list', 'list']|['[1, 2, 3]', '[]', '[1, None, 3]']|
-|array_int_null|array<int>|[None, [4, 5, 6]]|['NoneType', 'list']|['None', '[np.int32(4), np.int32(5), np.int32(6)]']|
-|map_str_int_values|map<string,int>|[{'world': 2, 'hello': 1}, {}]|['dict', 'dict']|["{'world': 2, 'hello': 1}", '{}']|
-|map_str_int_null|map<string,int>|[None, {'test': 123}]|['NoneType', 'dict']|['None', "{'test': 123}"]|
-|struct_int_str_values|struct<a1:int,a2:string>|[Row(a1=1, a2='hello'), Row(a1=2, a2='world')]|['Row', 'Row']|["Row(a1=1, a2='hello')", "Row(a1=2, a2='world')"]|
-|struct_int_str_null|struct<a1:int,a2:string>|[None, Row(a1=99, a2='test')]|['NoneType', 'Row']|['None', "Row(a1=99, a2='test')"]|
-|array_array_int|array<array<int>>|[[[1, 2, 3]], [[1], [2, 3]]]|['list', 'list']|['[[np.int32(1), np.int32(2), np.int32(3)]]', '[[np.int32(1)], [np.int32(2), np.int32|
-|array_map_str_int|array<map<string,int>>|[[{'world': 2, 'hello': 1}], [{'a': 1}, {'b': 2}]]|['list', 'list']|["[{'world': 2, 'hello': 1}]", "[{'a': 1}, {'b': 2}]"]|
-|array_struct_int_str|array<struct<a1:int,a2:string>>|[[Row(a1=1, a2='hello')], [Row(a1=1, a2='hello'), Row(a1=2, a2='world')]]|['list', 'list']|["[Row(a1=1, a2='hello')]", "[Row(a1=1, a2='hello'), Row(a1=2, a2='world')]"]|
-|map_int_array_int|map<int,array<int>>|[{1: [1, 2, 3]}, {1: [1], 2: [2, 3]}]|['dict', 'dict']|['{1: [np.int32(1), np.int32(2), np.int32(3)]}', '{1: [np.int32(1)], 2: [np.int32(2),|
-|map_int_map_str_int|map<int,map<string,int>>|[{1: {'world': 2, 'hello': 1}}]|['dict']|["{1: {'world': 2, 'hello': 1}}"]|
-|map_int_struct_int_str|map<int,struct<a1:int,a2:string>>|[{1: Row(a1=1, a2='hello')}]|['dict']|["{1: Row(a1=1, a2='hello')}"]|
-|struct_int_array_int|struct<a:int,b:array<int>>|[Row(a=1, b=[1, 2, 3])]|['Row']|['Row(a=1, b=[np.int32(1), np.int32(2), np.int32(3)])']|
-|struct_int_map_str_int|struct<a:int,b:map<string,int>>|[Row(a=1, b={'world': 2, 'hello': 1})]|['Row']|["Row(a=1, b={'world': 2, 'hello': 1})"]|
-|struct_int_struct_int_str|struct<a:int,b:struct<a1:int,a2:string>>|[Row(a=1, b=Row(a1=1, a2='hello'))]|['Row']|["Row(a=1, b=Row(a1=1, a2='hello'))"]|
-+--------------------+--------------------+--------------------+--------------------+--------------------+
\ No newline at end of file
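Relative to the arrow-enabled golden file, the legacy-pandas variant above differs mostly in how NULLs surface: integer columns reach the UDF as pandas nullable scalars, so a missing value prints as '<NA>' with type 'NAType' and present values come back as numpy scalars ('int8', 'int16', ...). A standalone pandas illustration of that coercion (the Int8 dtype here is an assumption, inferred from the NAType/int8 entries above):

    import pandas as pd

    # A tinyint column with a NULL, modeled as pandas' nullable Int8 dtype.
    s = pd.Series([None, 42], dtype="Int8")

    print([type(v).__name__ for v in s])  # ['NAType', 'int8']
    print([str(v) for v in s])            # ['<NA>', '42']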
diff --git a/python/pyspark/sql/tests/udf_type_tests/test_udf_input_types.py b/python/pyspark/sql/tests/udf_type_tests/test_udf_input_types.py
index d8cf8466dcf3..e0d144892128 100644
--- a/python/pyspark/sql/tests/udf_type_tests/test_udf_input_types.py
+++ b/python/pyspark/sql/tests/udf_type_tests/test_udf_input_types.py
@@ -21,7 +21,7 @@ import unittest
import pandas as pd
from pyspark.sql import Row
-from pyspark.sql.functions import udf, pandas_udf
+from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import (
    ArrayType,
    BinaryType,
@@ -75,102 +75,6 @@ class UDFInputTypeTests(ReusedSQLTestCase):
    def setUp(self):
        super().setUp()
-    def test_udf_input_types_arrow_disabled(self):
-        golden_file = os.path.join(
-            os.path.dirname(__file__), "golden_udf_input_types_arrow_disabled.txt"
-        )
-        self._run_udf_input_type_coercion_test(
-            config={},
-            use_arrow=False,
-            golden_file=golden_file,
-            test_name="UDF input types - Arrow disabled",
-        )
-
-    def test_udf_input_types_arrow_legacy_pandas(self):
-        golden_file = os.path.join(
-            os.path.dirname(__file__), "golden_udf_input_types_arrow_legacy_pandas.txt"
-        )
-        self._run_udf_input_type_coercion_test(
-            config={"spark.sql.legacy.execution.pythonUDF.pandas.conversion.enabled": "true"},
-            use_arrow=True,
-            golden_file=golden_file,
-            test_name="UDF input types - Arrow with legacy pandas",
-        )
-
-    def test_udf_input_types_arrow_enabled(self):
-        golden_file = os.path.join(
-            os.path.dirname(__file__), "golden_udf_input_types_arrow_enabled.txt"
-        )
-        self._run_udf_input_type_coercion_test(
-            config={"spark.sql.legacy.execution.pythonUDF.pandas.conversion.enabled": "false"},
-            use_arrow=True,
-            golden_file=golden_file,
-            test_name="UDF input types - Arrow enabled",
-        )
-
-    def _run_udf_input_type_coercion_test(self, config, use_arrow, golden_file, test_name):
-        with self.sql_conf(config):
-            results = self._generate_udf_input_type_coercion_results(use_arrow)
-            actual_output = format_type_table(
-                results,
-                ["Test Case", "Spark Type", "Spark Value", "Python Type", "Python Value"],
-                column_width=85,
-            )
-            self._compare_or_create_golden_file(actual_output, golden_file, test_name)
-
-    def _generate_udf_input_type_coercion_results(self, use_arrow):
-        results = []
-        test_cases = self._get_input_type_test_cases()
-
-        for test_name, spark_type, data_func in test_cases:
-            input_df = data_func(spark_type).repartition(1)
-            input_data = [row["value"] for row in input_df.collect()]
-            result_row = [test_name, spark_type.simpleString(), str(input_data)]
-
-            try:
-
-                def type_udf(x):
-                    if x is None:
-                        return "NoneType"
-                    else:
-                        return type(x).__name__
-
-                def value_udf(x):
-                    return x
-
-                def value_str(x):
-                    return str(x)
-
-                type_test_udf = udf(type_udf, returnType=StringType(), useArrow=use_arrow)
-                value_test_udf = udf(value_udf, returnType=spark_type, useArrow=use_arrow)
-                value_str_udf = udf(value_str, returnType=StringType(), useArrow=use_arrow)
-
-                result_df = input_df.select(
-                    value_test_udf("value").alias("python_value"),
-                    type_test_udf("value").alias("python_type"),
-                    value_str_udf("value").alias("python_value_str"),
-                )
-                results_data = result_df.collect()
-                values = [row["python_value"] for row in results_data]
-                types = [row["python_type"] for row in results_data]
-                values_str = [row["python_value_str"] for row in results_data]
-
-                # Assert that the UDF output values match the input values
-                assert values == input_data, f"Input {input_data} != output {values}"
-
-                result_row.append(str(types))
-                result_row.append(str(values_str).replace("\n", " "))
-
-            except Exception as e:
-                print("error_msg", e)
-                # Clean up exception message to remove newlines and extra whitespace
-                error_msg = str(e).replace("\n", " ").replace("\r", " ")
-                result_row.append(f"✗ {error_msg}")
-
-            results.append(result_row)
-
-        return results
-
    def test_pandas_udf_input(self):
        golden_file = os.path.join(os.path.dirname(__file__), "golden_pandas_udf_input_types.txt")
        results = self._generate_pandas_udf_input_type_coercion_results()
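The retained test_pandas_udf_input path exercises pandas_udf, which hands the function whole pandas Series rather than individual values, so input coercion there is observed per batch. A hedged sketch of that probing style (names are illustrative, not the suite's actual helpers; assumes a running SparkSession `spark` with PyArrow available):

    import pandas as pd
    from pyspark.sql.functions import pandas_udf

    @pandas_udf("string")
    def dtype_probe(s: pd.Series) -> pd.Series:
        # Emit the batch's pandas dtype once per input row.
        return pd.Series([str(s.dtype)] * len(s))

    df = spark.createDataFrame([(None,), (42,)], "value tinyint")
    df.select(dtype_probe("value")).show()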
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]