viirya commented on a change in pull request #27165: [SPARK-28264][PYTHON][SQL] Support type hints in pandas UDF and rename/move inconsistent pandas UDF types
URL: https://github.com/apache/spark/pull/27165#discussion_r368839839
 
 

 ##########
 File path: python/pyspark/sql/pandas/functions.py
 ##########
 @@ -490,31 +405,80 @@ def pandas_udf(f=None, returnType=None, 
functionType=None):
             eval_type = returnType
         else:
             # @pandas_udf(dataType) or @pandas_udf(returnType=dataType)
-            eval_type = PythonEvalType.SQL_SCALAR_PANDAS_UDF
+            eval_type = None
     else:
         return_type = returnType
 
         if functionType is not None:
             eval_type = functionType
         else:
-            eval_type = PythonEvalType.SQL_SCALAR_PANDAS_UDF
+            eval_type = None
 
     if return_type is None:
-        raise ValueError("Invalid returnType: returnType can not be None")
+        raise ValueError("Invalid return type: returnType can not be None")
 
     if eval_type not in [PythonEvalType.SQL_SCALAR_PANDAS_UDF,
                          PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF,
                          PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
                          PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF,
                          PythonEvalType.SQL_MAP_PANDAS_ITER_UDF,
-                         PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF]:
-        raise ValueError("Invalid functionType: "
+                         PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF,
+                         None]:  # None means it should infer the type from 
type hints.
+
+        raise ValueError("Invalid function type: "
                          "functionType must be one the values from 
PandasUDFType")
 
     if is_decorator:
-        return functools.partial(_create_udf, returnType=return_type, 
evalType=eval_type)
+        return functools.partial(_create_pandas_udf, returnType=return_type, 
evalType=eval_type)
     else:
-        return _create_udf(f=f, returnType=return_type, evalType=eval_type)
+        return _create_pandas_udf(f=f, returnType=return_type, 
evalType=eval_type)
+
+
+def _create_pandas_udf(f, returnType, evalType):
+    argspec = _get_argspec(f)
+
+    # pandas UDF by type hints.
+    if sys.version_info >= (3, 6):
+        from inspect import signature
+
+        if evalType in [PythonEvalType.SQL_SCALAR_PANDAS_UDF,
+                        PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF,
+                        PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF]:
+            warnings.warn(
+                "In Python 3.6+ and Spark 3.0+, it is preferred to specify 
type hints for "
+                "pandas UDF instead of specifying pandas UDF type which will 
be deprecated "
+                "in the future releases. See SPARK-28264 for more details.", 
UserWarning)
+        elif len(argspec.annotations) > 0:
+            evalType = infer_eval_type(signature(f))
 
 Review comment:
   Shall we check whether `evalType` is `None` here?

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to