viirya commented on a change in pull request #27165: [SPARK-28264][PYTHON][SQL]
Support type hints in pandas UDF and rename/move inconsistent pandas UDF types
URL: https://github.com/apache/spark/pull/27165#discussion_r368839839
##########
File path: python/pyspark/sql/pandas/functions.py
##########
@@ -490,31 +405,80 @@ def pandas_udf(f=None, returnType=None,
functionType=None):
eval_type = returnType
else:
# @pandas_udf(dataType) or @pandas_udf(returnType=dataType)
- eval_type = PythonEvalType.SQL_SCALAR_PANDAS_UDF
+ eval_type = None
else:
return_type = returnType
if functionType is not None:
eval_type = functionType
else:
- eval_type = PythonEvalType.SQL_SCALAR_PANDAS_UDF
+ eval_type = None
if return_type is None:
- raise ValueError("Invalid returnType: returnType can not be None")
+ raise ValueError("Invalid return type: returnType can not be None")
if eval_type not in [PythonEvalType.SQL_SCALAR_PANDAS_UDF,
PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF,
PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF,
PythonEvalType.SQL_MAP_PANDAS_ITER_UDF,
- PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF]:
- raise ValueError("Invalid functionType: "
+ PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF,
+ None]: # None means it should infer the type from
type hints.
+
+ raise ValueError("Invalid function type: "
"functionType must be one the values from
PandasUDFType")
if is_decorator:
- return functools.partial(_create_udf, returnType=return_type,
evalType=eval_type)
+ return functools.partial(_create_pandas_udf, returnType=return_type,
evalType=eval_type)
else:
- return _create_udf(f=f, returnType=return_type, evalType=eval_type)
+ return _create_pandas_udf(f=f, returnType=return_type,
evalType=eval_type)
+
+
+def _create_pandas_udf(f, returnType, evalType):
+ argspec = _get_argspec(f)
+
+ # pandas UDF by type hints.
+ if sys.version_info >= (3, 6):
+ from inspect import signature
+
+ if evalType in [PythonEvalType.SQL_SCALAR_PANDAS_UDF,
+ PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF,
+ PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF]:
+ warnings.warn(
+ "In Python 3.6+ and Spark 3.0+, it is preferred to specify
type hints for "
+ "pandas UDF instead of specifying pandas UDF type which will
be deprecated "
+ "in the future releases. See SPARK-28264 for more details.",
UserWarning)
+ elif len(argspec.annotations) > 0:
+ evalType = infer_eval_type(signature(f))
Review comment:
Shall we check that `evalType` is `None` here?
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
With regards,
Apache Git Services
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org