Yicong-Huang commented on code in PR #54084:
URL: https://github.com/apache/spark/pull/54084#discussion_r2760787071


##########
python/pyspark/sql/tests/coercion/golden_pandas_udf_input_type_coercion_base.csv:
##########
@@ -1,40 +1,40 @@
-       Test Case       Spark Type      Spark Value     Python Type     Python 
Value
-0      byte_values     tinyint [-128, 127, 0]  ['int8', 'int8', 'int8']        
[-128, 127, 0]
-1      byte_null       tinyint [None, 42]      ['Int8', 'Int8']        [None, 
42]
-2      short_values    smallint        [-32768, 32767, 0]      ['int16', 
'int16', 'int16']     [-32768, 32767, 0]
-3      short_null      smallint        [None, 123]     ['Int16', 'Int16']      
[None, 123]
-4      int_values      int     [-2147483648, 2147483647, 0]    ['int32', 
'int32', 'int32']     [-2147483648, 2147483647, 0]
-5      int_null        int     [None, 456]     ['Int32', 'Int32']      [None, 
456]
-6      long_values     bigint  [-9223372036854775808, 9223372036854775807, 0]  
['int64', 'int64', 'int64']     [-9223372036854775808, 9223372036854775807, 0]
-7      long_null       bigint  [None, 789]     ['Int64', 'Int64']      [None, 
789]
-8      float_values    float   [0.0, 1.0, 3.140000104904175]   ['float32', 
'float32', 'float32']       [0.0, 1.0, 3.140000104904175]
-9      float_null      float   [None, 3.140000104904175]       ['float32', 
'float32']  [None, 3.140000104904175]
-10     double_values   double  [0.0, 1.0, 0.3333333333333333]  ['float64', 
'float64', 'float64']       [0.0, 1.0, 0.3333333333333333]
-11     double_null     double  [None, 2.71]    ['float64', 'float64']  [None, 
2.71]
-12     decimal_values  decimal(3,2)    [Decimal('5.35'), Decimal('1.23')]      
['object', 'object']    [Decimal('5.35'), Decimal('1.23')]
-13     decimal_null    decimal(3,2)    [None, Decimal('9.99')] ['object', 
'object']    [None, Decimal('9.99')]
-14     string_values   string  ['abc', '', 'hello']    ['object', 'object', 
'object']  ['abc', '', 'hello']
-15     string_null     string  [None, 'test']  ['object', 'object']    [None, 
'test']
-16     binary_values   binary  [b'abc', b'', b'ABC']   ['object', 'object', 
'object']  [b'abc', b'', b'ABC']
-17     binary_null     binary  [None, b'test'] ['object', 'object']    [None, 
b'test']
-18     boolean_values  boolean [True, False]   ['bool', 'bool']        [True, 
False]
-19     boolean_null    boolean [None, True]    ['object', 'object']    [None, 
True]
-20     date_values     date    [datetime.date(2020, 2, 2), datetime.date(1970, 
1, 1)]  ['object', 'object']    [datetime.date(2020, 2, 2), datetime.date(1970, 
1, 1)]
-21     date_null       date    [None, datetime.date(2023, 1, 1)]       
['object', 'object']    [None, datetime.date(2023, 1, 1)]
-22     timestamp_values        timestamp       [datetime.datetime(2020, 2, 2, 
12, 15, 16, 123000)]     ['datetime64[ns]']      [datetime.datetime(2020, 2, 2, 
12, 15, 16, 123000)]
-23     timestamp_null  timestamp       [None, datetime.datetime(2023, 1, 1, 
12, 0)]    ['datetime64[ns]', 'datetime64[ns]']    [None, 
datetime.datetime(2023, 1, 1, 12, 0)]
-24     array_int_values        array<int>      [[1, 2, 3], [], [1, None, 3]]   
['object', 'object', 'object']  [[1, 2, 3], [], [1, None, 3]]
-25     array_int_null  array<int>      [None, [4, 5, 6]]       ['object', 
'object']    [None, [4, 5, 6]]
-26     map_str_int_values      map<string,int> [{'world': 2, 'hello': 1}, {}]  
['object', 'object']    [{'world': 2, 'hello': 1}, {}]
-27     map_str_int_null        map<string,int> [None, {'test': 123}]   
['object', 'object']    [None, {'test': 123}]
-28     struct_int_str_values   struct<a1:int,a2:string>        [Row(a1=1, 
a2='hello'), Row(a1=2, a2='world')]  ['DataFrame', 'DataFrame']      [Row(a1=1, 
a2='hello'), Row(a1=2, a2='world')]
-29     struct_int_str_null     struct<a1:int,a2:string>        [None, 
Row(a1=99, a2='test')]   ['DataFrame', 'DataFrame']      [Row(a1=None, 
a2=None), Row(a1=99, a2='test')]
-30     array_array_int array<array<int>>       [[[1, 2, 3]], [[1], [2, 3]]]    
['object', 'object']    [[[1, 2, 3]], [[1], [2, 3]]]
-31     array_map_str_int       array<map<string,int>>  [[{'world': 2, 'hello': 
1}], [{'a': 1}, {'b': 2}]]      ['object', 'object']    [[{'world': 2, 'hello': 
1}], [{'a': 1}, {'b': 2}]]
-32     array_struct_int_str    array<struct<a1:int,a2:string>> [[Row(a1=1, 
a2='hello')], [Row(a1=1, a2='hello'), Row(a1=2, a2='world')]]       ['object', 
'object']    [[Row(a1=1, a2='hello')], [Row(a1=1, a2='hello'), Row(a1=2, 
a2='world')]]
-33     map_int_array_int       map<int,array<int>>     [{1: [1, 2, 3]}, {1: 
[1], 2: [2, 3]}]   ['object', 'object']    [{1: [1, 2, 3]}, {1: [1], 2: [2, 3]}]
-34     map_int_map_str_int     map<int,map<string,int>>        [{1: {'world': 
2, 'hello': 1}}] ['object']      [{1: {'world': 2, 'hello': 1}}]
-35     map_int_struct_int_str  map<int,struct<a1:int,a2:string>>       [{1: 
Row(a1=1, a2='hello')}]    ['object']      [{1: Row(a1=1, a2='hello')}]
-36     struct_int_array_int    struct<a:int,b:array<int>>      [Row(a=1, b=[1, 
2, 3])] ['DataFrame']   [Row(a=1, b=[1, 2, 3])]
-37     struct_int_map_str_int  struct<a:int,b:map<string,int>> [Row(a=1, 
b={'world': 2, 'hello': 1})]  ['DataFrame']   [Row(a=1, b={'world': 2, 'hello': 
1})]
-38     struct_int_struct_int_str       
struct<a:int,b:struct<a1:int,a2:string>>        [Row(a=1, b=Row(a1=1, 
a2='hello'))]     ['DataFrame']   [Row(a=1, b=Row(a1=1, a2='hello'))]
+Source Value \ Target Type     Spark Type      Spark Value     Python Type     
Python Value
+byte_values    tinyint [-128, 127, 0]  int8    [-128, 127, 0]
+byte_null      tinyint [None, 42]      Int8    [None, 42]
+short_values   smallint        [-32768, 32767, 0]      int16   [-32768, 32767, 
0]
+short_null     smallint        [None, 123]     Int16   [None, 123]
+int_values     int     [-2147483648, 2147483647, 0]    int32   [-2147483648, 
2147483647, 0]
+int_null       int     [None, 456]     Int32   [None, 456]
+long_values    bigint  [-9223372036854775808, 9223372036854775807, 0]  int64   
[-9223372036854775808, 9223372036854775807, 0]
+long_null      bigint  [None, 789]     Int64   [None, 789]
+float_values   float   [0.0, 1.0, 3.140000104904175]   float32 [0.0, 1.0, 
3.140000104904175]
+float_null     float   [None, 3.140000104904175]       float32 [None, 
3.140000104904175]
+double_values  double  [0.0, 1.0, 0.3333333333333333]  float64 [0.0, 1.0, 
0.3333333333333333]
+double_null    double  [None, 2.71]    float64 [None, 2.71]
+decimal_values decimal(3,2)    [Decimal('5.35'), Decimal('1.23')]      Decimal 
[Decimal('5.35'), Decimal('1.23')]

Review Comment:
   It is expected! I find `object` to be too little information. The `object` dtype 
is pandas' fallback for anything that doesn't have native array support — it 
stores an array of Python object pointers. And if we use this in the golden file 
test, we won't be able to notice that the actual type has changed. For example,
   
   ```
   # Correct UDF result: Decimal values
   result1 = DataFrame({'value': [Decimal('1.23')], 'name': ['a']})
   
   # Buggy UDF result: str instead of Decimal  
   result2 = DataFrame({'value': ['1.23'], 'name': ['a']})
   
   # pandas dtype for both: [object, object] — identical!
   # Python element types: [Decimal, str] vs [str, str] — different!
   ```
   
   So in this PR I went ahead and updated `repr_type` to print out the actual 
Python object type when it detects a general `object` pandas dtype. And you can 
see it prints out `Decimal` for this case:
   
   <img width="769" height="193" alt="Image" 
src="https://github.com/user-attachments/assets/ef0e554e-3563-46ea-8082-67681b098190";
 />
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to