shujingyang-db commented on code in PR #52140:
URL: https://github.com/apache/spark/pull/52140#discussion_r2308487335


##########
python/pyspark/sql/tests/arrow/test_arrow_udtf.py:
##########
@@ -341,22 +345,74 @@ def eval(self) -> Iterator["pa.Table"]:
                 # Return string values that cannot be coerced to int
                 result_table = pa.table(
                     {
-                        "id": pa.array(["abc", "def", "xyz"], type=pa.string()),
+                        "id": pa.array(["1", "2", "xyz"], type=pa.string()),
                     }
                 )
                 yield result_table
 
-        with self.assertRaisesRegex(PythonException, "Schema at index 0 was different"):
+        # Should fail with an Arrow cast exception since the string cannot be cast to int
+        with self.assertRaisesRegex(
+            PythonException,
+            r"pyarrow\.lib\.ArrowInvalid: Failed to parse string: 'xyz' as a scalar of type int32"
+        ):
             result_df = StringToIntUDTF()
             result_df.collect()
 
+    def test_return_type_coercion_success(self):
+        @arrow_udtf(returnType="value int")
+        class CoercionSuccessUDTF:
+            def eval(self) -> Iterator["pa.Table"]:
+                result_table = pa.table(
+                    {
+                        "value": pa.array([10, 20, 30], type=pa.int64()),  # long -> int coercion
+                    }
+                )
+                yield result_table
+
+        result_df = CoercionSuccessUDTF()
+        expected_df = self.spark.createDataFrame([(10,), (20,), (30,)], "value int")
+        assertDataFrameEqual(result_df, expected_df)
+
+    def test_return_type_coercion_overflow(self):
+        @arrow_udtf(returnType="value int")
+        class CoercionOverflowUDTF:
+            def eval(self) -> Iterator["pa.Table"]:
+                # Return values that will cause overflow when casting long to int
+                result_table = pa.table(
+                    {
+                        "value": pa.array([2147483647 + 1], type=pa.int64()),  # int32 max + 1
+                    }
+                )
+                yield result_table
+
+        # Should fail with PyArrow overflow exception
+        with self.assertRaises(Exception):
+            result_df = CoercionOverflowUDTF()
+            result_df.collect()
+
+    def test_return_type_coercion_multiple_columns(self):
+        @arrow_udtf(returnType="id int, price float")
+        class MultipleColumnCoercionUDTF:
+            def eval(self) -> Iterator["pa.Table"]:
+                result_table = pa.table(
+                    {
+                        "id": pa.array([1, 2, 3], type=pa.int64()),         # long -> int coercion
+                        "price": pa.array([10.5, 20.7, 30.9], type=pa.float64()),  # double -> float coercion
+                    }
+                )
+                yield result_table
+
+        result_df = MultipleColumnCoercionUDTF()
+        expected_df = self.spark.createDataFrame([(1, 10.5), (2, 20.7), (3, 30.9)], "id int, price float")
+        assertDataFrameEqual(result_df, expected_df)
+
     def test_arrow_udtf_with_empty_column_result(self):
         @arrow_udtf(returnType=StructType())
         class EmptyResultUDTF:
             def eval(self) -> Iterator["pa.Table"]:
                 yield pa.Table.from_struct_array(pa.array([{}] * 3))
 
-        assertDataFrameEqual(EmptyResultUDTF(), [Row(), Row(), Row()])
+        assertDataFrameEqual(EmptyResultUDTF(), [None, None, None])

Review Comment:
   Good catch! I have reverted it and now create an empty batch with the number of rows set.
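
   For reference, a minimal sketch (not the exact patch) of what "an empty batch with the number of rows set" means in PyArrow, mirroring the empty-struct-array pattern the test above already uses:

   ```python
   import pyarrow as pa

   # A zero-column table can still carry a row count when built from an
   # empty-struct array: the struct array's length becomes num_rows.
   tbl = pa.Table.from_struct_array(pa.array([{}] * 3))
   assert tbl.num_columns == 0
   assert tbl.num_rows == 3
   ```

   The coercion tests also line up with PyArrow's default safe-cast semantics (assuming the return path delegates to something like `Array.cast`; the snippet below only illustrates that behavior, it is not the implementation):

   ```python
   import pyarrow as pa

   # Narrowing succeeds when every value fits in the target type...
   pa.array([10, 20, 30], type=pa.int64()).cast(pa.int32())

   # ...while overflow and unparsable strings raise ArrowInvalid.
   try:
       pa.array([2147483647 + 1], type=pa.int64()).cast(pa.int32())
   except pa.ArrowInvalid as e:
       print(e)  # Integer value 2147483648 not in range: ...

   try:
       pa.array(["1", "2", "xyz"], type=pa.string()).cast(pa.int32())
   except pa.ArrowInvalid as e:
       print(e)  # Failed to parse string: 'xyz' as a scalar of type int32
   ```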


