itholic commented on code in PR #39952:
URL: https://github.com/apache/spark/pull/39952#discussion_r1246937082
##########
python/pyspark/worker.py:
##########
@@ -133,65 +134,103 @@ def verify_result_length(result, length):
         )
 
 
-def wrap_batch_iter_udf(f, return_type):
+def wrap_batch_iter_udf(f, return_type, is_arrow_iter=False):
     arrow_return_type = to_arrow_type(return_type)
+    iter_type_label = (
+        "pyarrow.RecordBatch"
+        if is_arrow_iter
+        else ("pandas.DataFrame" if type(return_type) == StructType else "pandas.Series")
+    )
 
-    def verify_result_type(result):
-        if not hasattr(result, "__len__"):
-            pd_type = "Pandas.DataFrame" if type(return_type) == StructType else "Pandas.Series"
+    def verify_result(result):
+        if not isinstance(result, Iterator) and not hasattr(result, "__iter__"):
             raise TypeError(
                 "Return type of the user-defined function should be "
-                "{}, but is {}".format(pd_type, type(result))
+                "iterator of {}, but is {}".format(iter_type_label, type(result))
             )
         return result
 
+    def verify_element(elem):
+        if is_arrow_iter:
+            import pyarrow as pa
+
+            if not isinstance(elem, pa.RecordBatch):
+                raise TypeError(
+                    "Return type of the user-defined function should be "
+                    "iterator of {}, but is iterator of {}".format(iter_type_label, type(elem))
+                )
+        else:
+            import pandas as pd
+
+            if not isinstance(elem, pd.DataFrame if type(return_type) == StructType else pd.Series):
+                raise TypeError(
Review Comment:
   Ditto, and in the other places as well?
   Basically, we should use the PySpark-specific errors instead of the Python built-in exceptions.
   See https://github.com/apache/spark/blob/master/python/pyspark/errors/__init__.py for more details about the PySpark-specific errors.
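   For `verify_result` above, that could look roughly like the sketch below. Note that the `UDF_RETURN_TYPE` error class name and its message parameters are only illustrative here; whatever class we use has to actually be defined in `python/pyspark/errors/error_classes.py`. I also passed `iter_type_label` as a parameter just to keep the sketch self-contained; in the patch it comes from the enclosing `wrap_batch_iter_udf` scope:

   ```python
   from typing import Iterator

   from pyspark.errors import PySparkTypeError


   def verify_result(result, iter_type_label="pandas.Series"):
       if not isinstance(result, Iterator) and not hasattr(result, "__iter__"):
           # Raise the PySpark-specific error instead of the built-in TypeError.
           # "UDF_RETURN_TYPE" is an illustrative error class name; use whichever
           # class is (or gets) defined in pyspark/errors/error_classes.py.
           raise PySparkTypeError(
               error_class="UDF_RETURN_TYPE",
               message_parameters={
                   "expected": "iterator of {}".format(iter_type_label),
                   "actual": type(result).__name__,
               },
           )
       return result
   ```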
##########
python/pyspark/worker.py:
##########
@@ -133,65 +134,103 @@ def verify_result_length(result, length):
         )
 
 
-def wrap_batch_iter_udf(f, return_type):
+def wrap_batch_iter_udf(f, return_type, is_arrow_iter=False):
     arrow_return_type = to_arrow_type(return_type)
+    iter_type_label = (
+        "pyarrow.RecordBatch"
+        if is_arrow_iter
+        else ("pandas.DataFrame" if type(return_type) == StructType else "pandas.Series")
+    )
 
-    def verify_result_type(result):
-        if not hasattr(result, "__len__"):
-            pd_type = "Pandas.DataFrame" if type(return_type) == StructType else "Pandas.Series"
+    def verify_result(result):
+        if not isinstance(result, Iterator) and not hasattr(result, "__iter__"):
             raise TypeError(
                 "Return type of the user-defined function should be "
-                "{}, but is {}".format(pd_type, type(result))
+                "iterator of {}, but is {}".format(iter_type_label, type(result))
             )
         return result
 
+    def verify_element(elem):
+        if is_arrow_iter:
+            import pyarrow as pa
+
+            if not isinstance(elem, pa.RecordBatch):
+                raise TypeError(
+                    "Return type of the user-defined function should be "
+                    "iterator of {}, but is iterator of {}".format(iter_type_label, type(elem))
+                )
+        else:
+            import pandas as pd
+
+            if not isinstance(elem, pd.DataFrame if type(return_type) == StructType else pd.Series):
+                raise TypeError(
+                    "Return type of the user-defined function should be "
+                    "iterator of {}, but is iterator of {}".format(iter_type_label, type(elem))
+                )
+
+            verify_pandas_result(elem, return_type, True, True)
+
+        return elem
+
     return lambda *iterator: map(
-        lambda res: (res, arrow_return_type), map(verify_result_type, f(*iterator))
+        lambda res: (res, arrow_return_type), map(verify_element, verify_result(f(*iterator)))
     )
 
 
-def verify_pandas_result(result, return_type, assign_cols_by_name):
+def verify_pandas_result(result, return_type, assign_cols_by_name, truncate_return_schema):
     import pandas as pd
 
-    if not isinstance(result, pd.DataFrame):
-        raise TypeError(
-            "Return type of the user-defined function should be "
-            "pandas.DataFrame, but is {}".format(type(result))
-        )
-
-    # check the schema of the result only if it is not empty or has columns
-    if not result.empty or len(result.columns) != 0:
-        # if any column name of the result is a string
-        # the column names of the result have to match the return type
-        # see create_array in pyspark.sql.pandas.serializers.ArrowStreamPandasSerializer
-        field_names = set([field.name for field in return_type.fields])
-        column_names = set(result.columns)
-        if (
-            assign_cols_by_name
-            and any(isinstance(name, str) for name in result.columns)
-            and column_names != field_names
-        ):
-            missing = sorted(list(field_names.difference(column_names)))
-            missing = f" Missing: {', '.join(missing)}." if missing else ""
-
-            extra = sorted(list(column_names.difference(field_names)))
-            extra = f" Unexpected: {', '.join(extra)}." if extra else ""
+    if type(return_type) == StructType:
+        if not isinstance(result, pd.DataFrame):
+            raise TypeError(
Review Comment:
Can we raise `PySparkTypeError` instead of `TypeError`?
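   E.g., roughly like this sketch, with the same caveat as in the comment above: the `UDF_RETURN_TYPE` error class name is only illustrative and would need to exist in `python/pyspark/errors/error_classes.py`:

   ```python
   import pandas as pd

   from pyspark.errors import PySparkTypeError
   from pyspark.sql.types import StructType


   def verify_pandas_result(result, return_type, assign_cols_by_name, truncate_return_schema):
       if type(return_type) == StructType:
           if not isinstance(result, pd.DataFrame):
               # PySpark-specific error in place of the built-in TypeError;
               # "UDF_RETURN_TYPE" is an illustrative error class name.
               raise PySparkTypeError(
                   error_class="UDF_RETURN_TYPE",
                   message_parameters={
                       "expected": "pandas.DataFrame",
                       "actual": type(result).__name__,
                   },
               )
           # ... the remaining schema checks stay as in the patch ...
   ```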
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]