allisonwang-db commented on code in PR #41867:
URL: https://github.com/apache/spark/pull/41867#discussion_r1258988779
##########
python/pyspark/sql/udtf.py:
##########
@@ -39,15 +42,98 @@ def _create_udtf(
     cls: Type,
     returnType: Union[StructType, str],
     name: Optional[str] = None,
+    evalType: int = PythonEvalType.SQL_TABLE_UDF,
     deterministic: bool = True,
 ) -> "UserDefinedTableFunction":
-    """Create a Python UDTF."""
+    """Create a Python UDTF with the given eval type."""
     udtf_obj = UserDefinedTableFunction(
-        cls, returnType=returnType, name=name, deterministic=deterministic
+        cls, returnType=returnType, name=name, evalType=evalType, deterministic=deterministic
     )
+
     return udtf_obj
+
+
+def _create_py_udtf(
+    cls: Type,
+    returnType: Union[StructType, str],
+    name: Optional[str] = None,
+    deterministic: bool = True,
+    useArrow: Optional[bool] = None,
+) -> "UserDefinedTableFunction":
+    """Create a regular or an Arrow-optimized Python UDTF."""
+    # Determine whether to create Arrow-optimized UDTFs.
+    if useArrow is not None:
+        arrow_enabled = useArrow
+    else:
+        from pyspark.sql import SparkSession
+
+        session = SparkSession._instantiatedSession
+        arrow_enabled = (
+            session.conf.get("spark.sql.execution.pythonUDTF.arrow.enabled") == "true"
+            if session is not None
+            else True
+        )
+
+    # Create a regular Python UDTF and check for invalid handler class.
+    regular_udtf = _create_udtf(cls, returnType, name, PythonEvalType.SQL_TABLE_UDF, deterministic)
+
+    if arrow_enabled:
+        try:
+            require_minimum_pandas_version()
+            require_minimum_pyarrow_version()
+        except ImportError as e:
+            warnings.warn(
+                f"Arrow optimization for Python UDTFs cannot be enabled: {str(e)}. "
+                f"Falling back to using regular Python UDTFs.",
+                UserWarning,
+            )
+            return regular_udtf
+        return _create_arrow_udtf(regular_udtf)
+    else:
+        return regular_udtf
+
+
+def _create_arrow_udtf(regular_udtf: "UserDefinedTableFunction") -> "UserDefinedTableFunction":
+    """Create an Arrow-optimized Python UDTF."""
+    import pandas as pd
+
+    cls = regular_udtf.func
+
+    class VectorizedUDTF:
+        def __init__(self) -> None:
+            self.func = cls()
+
+        def eval(self, *args: pd.Series) -> Iterator[list[pd.Series]]:
+            if len(args) == 0:
+                yield pd.DataFrame(self.func.eval())
+            else:
+                # Create tuples from the input pandas Series, each tuple
+                # represents a row across all Series.
+                row_tuples = zip(*args)
+                for row in row_tuples:
+                    yield pd.DataFrame(self.func.eval(*row))
+
+        def terminate(self) -> Iterator[pd.DataFrame]:
+            if hasattr(cls, "terminate"):
+                yield pd.DataFrame(self.func.terminate())
+
+    vectorized_udtf = VectorizedUDTF
+    vectorized_udtf.__name__ = cls.__name__
+    vectorized_udtf.__module__ = cls.__module__
+    vectorized_udtf.__doc__ = cls.__doc__
+    vectorized_udtf.eval.__doc__ = getattr(cls, "eval").__doc__
+    if hasattr(cls, "terminate"):
Review Comment:
`terminate` is optional for UDTFs, but `VectorizedUDTF.terminate` is always defined. We only want to copy the docstring onto `VectorizedUDTF.terminate` when the original UDTF handler class actually defines a `terminate` method.
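
For illustration only (not part of the diff): a minimal, standalone sketch of the guarded docstring copy described above. The `MyUDTF` and `VectorizedWrapper` names are hypothetical; only the guarding pattern mirrors the change under review.

```python
# Hypothetical handler class: `terminate` is optional for UDTFs.
class MyUDTF:
    def eval(self, x: int):
        """Yield one row per input value."""
        yield (x,)

    def terminate(self):
        """Yield a final row once all input rows are consumed."""
        yield (0,)


# Hypothetical vectorized wrapper: it defines `terminate` unconditionally,
# even when the wrapped handler does not.
class VectorizedWrapper:
    def eval(self, *args):
        ...

    def terminate(self):
        ...


# `eval` is mandatory for every UDTF, so its docstring can be copied unconditionally.
VectorizedWrapper.eval.__doc__ = MyUDTF.eval.__doc__

# Copy the `terminate` docstring only when the handler defines `terminate`;
# an unguarded `getattr(MyUDTF, "terminate")` would raise AttributeError otherwise.
if hasattr(MyUDTF, "terminate"):
    VectorizedWrapper.terminate.__doc__ = MyUDTF.terminate.__doc__
```
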
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]