zhengruifeng commented on code in PR #54084:
URL: https://github.com/apache/spark/pull/54084#discussion_r2762006571
##########
python/pyspark/sql/tests/coercion/golden_pandas_udf_input_type_coercion_base.csv:
##########
@@ -1,40 +1,40 @@
- Test Case Spark Type Spark Value Python Type Python
Value
-0 byte_values tinyint [-128, 127, 0] ['int8', 'int8', 'int8']
[-128, 127, 0]
-1 byte_null tinyint [None, 42] ['Int8', 'Int8'] [None,
42]
-2 short_values smallint [-32768, 32767, 0] ['int16',
'int16', 'int16'] [-32768, 32767, 0]
-3 short_null smallint [None, 123] ['Int16', 'Int16']
[None, 123]
-4 int_values int [-2147483648, 2147483647, 0] ['int32',
'int32', 'int32'] [-2147483648, 2147483647, 0]
-5 int_null int [None, 456] ['Int32', 'Int32'] [None,
456]
-6 long_values bigint [-9223372036854775808, 9223372036854775807, 0]
['int64', 'int64', 'int64'] [-9223372036854775808, 9223372036854775807, 0]
-7 long_null bigint [None, 789] ['Int64', 'Int64'] [None,
789]
-8 float_values float [0.0, 1.0, 3.140000104904175] ['float32',
'float32', 'float32'] [0.0, 1.0, 3.140000104904175]
-9 float_null float [None, 3.140000104904175] ['float32',
'float32'] [None, 3.140000104904175]
-10 double_values double [0.0, 1.0, 0.3333333333333333] ['float64',
'float64', 'float64'] [0.0, 1.0, 0.3333333333333333]
-11 double_null double [None, 2.71] ['float64', 'float64'] [None,
2.71]
-12 decimal_values decimal(3,2) [Decimal('5.35'), Decimal('1.23')]
['object', 'object'] [Decimal('5.35'), Decimal('1.23')]
-13 decimal_null decimal(3,2) [None, Decimal('9.99')] ['object',
'object'] [None, Decimal('9.99')]
-14 string_values string ['abc', '', 'hello'] ['object', 'object',
'object'] ['abc', '', 'hello']
-15 string_null string [None, 'test'] ['object', 'object'] [None,
'test']
-16 binary_values binary [b'abc', b'', b'ABC'] ['object', 'object',
'object'] [b'abc', b'', b'ABC']
-17 binary_null binary [None, b'test'] ['object', 'object'] [None,
b'test']
-18 boolean_values boolean [True, False] ['bool', 'bool'] [True,
False]
-19 boolean_null boolean [None, True] ['object', 'object'] [None,
True]
-20 date_values date [datetime.date(2020, 2, 2), datetime.date(1970,
1, 1)] ['object', 'object'] [datetime.date(2020, 2, 2), datetime.date(1970,
1, 1)]
-21 date_null date [None, datetime.date(2023, 1, 1)]
['object', 'object'] [None, datetime.date(2023, 1, 1)]
-22 timestamp_values timestamp [datetime.datetime(2020, 2, 2,
12, 15, 16, 123000)] ['datetime64[ns]'] [datetime.datetime(2020, 2, 2,
12, 15, 16, 123000)]
-23 timestamp_null timestamp [None, datetime.datetime(2023, 1, 1,
12, 0)] ['datetime64[ns]', 'datetime64[ns]'] [None,
datetime.datetime(2023, 1, 1, 12, 0)]
-24 array_int_values array<int> [[1, 2, 3], [], [1, None, 3]]
['object', 'object', 'object'] [[1, 2, 3], [], [1, None, 3]]
-25 array_int_null array<int> [None, [4, 5, 6]] ['object',
'object'] [None, [4, 5, 6]]
-26 map_str_int_values map<string,int> [{'world': 2, 'hello': 1}, {}]
['object', 'object'] [{'world': 2, 'hello': 1}, {}]
-27 map_str_int_null map<string,int> [None, {'test': 123}]
['object', 'object'] [None, {'test': 123}]
-28 struct_int_str_values struct<a1:int,a2:string> [Row(a1=1,
a2='hello'), Row(a1=2, a2='world')] ['DataFrame', 'DataFrame'] [Row(a1=1,
a2='hello'), Row(a1=2, a2='world')]
-29 struct_int_str_null struct<a1:int,a2:string> [None,
Row(a1=99, a2='test')] ['DataFrame', 'DataFrame'] [Row(a1=None,
a2=None), Row(a1=99, a2='test')]
-30 array_array_int array<array<int>> [[[1, 2, 3]], [[1], [2, 3]]]
['object', 'object'] [[[1, 2, 3]], [[1], [2, 3]]]
-31 array_map_str_int array<map<string,int>> [[{'world': 2, 'hello':
1}], [{'a': 1}, {'b': 2}]] ['object', 'object'] [[{'world': 2, 'hello':
1}], [{'a': 1}, {'b': 2}]]
-32 array_struct_int_str array<struct<a1:int,a2:string>> [[Row(a1=1,
a2='hello')], [Row(a1=1, a2='hello'), Row(a1=2, a2='world')]] ['object',
'object'] [[Row(a1=1, a2='hello')], [Row(a1=1, a2='hello'), Row(a1=2,
a2='world')]]
-33 map_int_array_int map<int,array<int>> [{1: [1, 2, 3]}, {1:
[1], 2: [2, 3]}] ['object', 'object'] [{1: [1, 2, 3]}, {1: [1], 2: [2, 3]}]
-34 map_int_map_str_int map<int,map<string,int>> [{1: {'world':
2, 'hello': 1}}] ['object'] [{1: {'world': 2, 'hello': 1}}]
-35 map_int_struct_int_str map<int,struct<a1:int,a2:string>> [{1:
Row(a1=1, a2='hello')}] ['object'] [{1: Row(a1=1, a2='hello')}]
-36 struct_int_array_int struct<a:int,b:array<int>> [Row(a=1, b=[1,
2, 3])] ['DataFrame'] [Row(a=1, b=[1, 2, 3])]
-37 struct_int_map_str_int struct<a:int,b:map<string,int>> [Row(a=1,
b={'world': 2, 'hello': 1})] ['DataFrame'] [Row(a=1, b={'world': 2, 'hello':
1})]
-38 struct_int_struct_int_str
struct<a:int,b:struct<a1:int,a2:string>> [Row(a=1, b=Row(a1=1,
a2='hello'))] ['DataFrame'] [Row(a=1, b=Row(a1=1, a2='hello'))]
+Source Value \ Target Type Spark Type Spark Value Python Type
Python Value
+byte_values tinyint [-128, 127, 0] int8 [-128, 127, 0]
+byte_null tinyint [None, 42] Int8 [None, 42]
+short_values smallint [-32768, 32767, 0] int16 [-32768, 32767,
0]
+short_null smallint [None, 123] Int16 [None, 123]
+int_values int [-2147483648, 2147483647, 0] int32 [-2147483648,
2147483647, 0]
+int_null int [None, 456] Int32 [None, 456]
+long_values bigint [-9223372036854775808, 9223372036854775807, 0] int64
[-9223372036854775808, 9223372036854775807, 0]
+long_null bigint [None, 789] Int64 [None, 789]
+float_values float [0.0, 1.0, 3.140000104904175] float32 [0.0, 1.0,
3.140000104904175]
+float_null float [None, 3.140000104904175] float32 [None,
3.140000104904175]
+double_values double [0.0, 1.0, 0.3333333333333333] float64 [0.0, 1.0,
0.3333333333333333]
+double_null double [None, 2.71] float64 [None, 2.71]
+decimal_values decimal(3,2) [Decimal('5.35'), Decimal('1.23')] Decimal
[Decimal('5.35'), Decimal('1.23')]
Review Comment:
I think this is a topic other than `Extract`; if we want to do this, we
should do it in a separate PR.
The test class should be able to override the default string expr.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]