HyukjinKwon commented on code in PR #41867:
URL: https://github.com/apache/spark/pull/41867#discussion_r1257917416
##########
python/pyspark/sql/udtf.py:
##########
@@ -39,15 +42,98 @@ def _create_udtf(
cls: Type,
returnType: Union[StructType, str],
name: Optional[str] = None,
+ evalType: int = PythonEvalType.SQL_TABLE_UDF,
deterministic: bool = True,
) -> "UserDefinedTableFunction":
- """Create a Python UDTF."""
+ """Create a Python UDTF with the given eval type."""
udtf_obj = UserDefinedTableFunction(
- cls, returnType=returnType, name=name, deterministic=deterministic
+ cls, returnType=returnType, name=name, evalType=evalType,
deterministic=deterministic
)
+
return udtf_obj
+def _create_py_udtf(
+ cls: Type,
+ returnType: Union[StructType, str],
+ name: Optional[str] = None,
+ deterministic: bool = True,
+ useArrow: Optional[bool] = None,
+) -> "UserDefinedTableFunction":
+ """Create a regular or an Arrow-optimized Python UDTF."""
+ # Determine whether to create Arrow-optimized UDTFs.
+ if useArrow is not None:
+ arrow_enabled = useArrow
+ else:
+ from pyspark.sql import SparkSession
+
+ session = SparkSession._instantiatedSession
Review Comment:
@xinrong-meng we should move the import to the `else` branch in `_create_py_udt`
too, in order to reduce the breaking change as far as possible. For example,
once you have this, you can't define a pandas UDF without a Spark session
(e.g., 1. providing the pandas UDF as a library, or 2. defining a pandas UDF in
a plain Python interpreter before starting a Spark session).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]