Github user icexelloss commented on a diff in the pull request:
https://github.com/apache/spark/pull/21650#discussion_r198664314
--- Diff:
sql/core/src/main/scala/org/apache/spark/sql/execution/python/ExtractPythonUDFs.scala
---
@@ -94,36 +95,59 @@ object ExtractPythonUDFFromAggregate extends
Rule[LogicalPlan] {
*/
object ExtractPythonUDFs extends Rule[SparkPlan] with PredicateHelper {
- private def hasPythonUDF(e: Expression): Boolean = {
+ private def hasScalarPythonUDF(e: Expression): Boolean = {
e.find(PythonUDF.isScalarPythonUDF).isDefined
}
- private def canEvaluateInPython(e: PythonUDF): Boolean = {
- e.children match {
- // single PythonUDF child could be chained and evaluated in Python
- case Seq(u: PythonUDF) => canEvaluateInPython(u)
- // Python UDF can't be evaluated directly in JVM
- case children => !children.exists(hasPythonUDF)
+ private def canEvaluateInPython(e: PythonUDF, evalType: Int): Boolean = {
+ if (e.evalType != evalType) {
+ false
+ } else {
+ e.children match {
+ // single PythonUDF child could be chained and evaluated in Python
+ case Seq(u: PythonUDF) => canEvaluateInPython(u, evalType)
+ // Python UDF can't be evaluated directly in JVM
+ case children => !children.exists(hasScalarPythonUDF)
+ }
}
}
- private def collectEvaluatableUDF(expr: Expression): Seq[PythonUDF] =
expr match {
- case udf: PythonUDF if PythonUDF.isScalarPythonUDF(udf) &&
canEvaluateInPython(udf) => Seq(udf)
- case e => e.children.flatMap(collectEvaluatableUDF)
+ private def collectEvaluableUDF(expr: Expression, evalType: Int):
Seq[PythonUDF] = expr match {
+ case udf: PythonUDF if PythonUDF.isScalarPythonUDF(udf) &&
canEvaluateInPython(udf, evalType) =>
+ Seq(udf)
+ case e => e.children.flatMap(collectEvaluableUDF(_, evalType))
+ }
+
+ /**
+ * Collect evaluable UDFs from the current node.
+ *
+ * This function collects Python UDFs or Scalar Python UDFs from
expressions of the input node,
+ * and returns a list of UDFs of the same eval type.
+ *
+ * If expressions contain both UDF eval types, this function will only
return Python UDFs.
+ *
+ * The caller should call this function multiple times until all
evaluable UDFs are collected.
+ */
+ private def collectEvaluableUDFs(plan: SparkPlan): Seq[PythonUDF] = {
+ val pythonUDFs =
+ plan.expressions.flatMap(collectEvaluableUDF(_,
PythonEvalType.SQL_BATCHED_UDF))
+
+ if (pythonUDFs.isEmpty) {
+ plan.expressions.flatMap(collectEvaluableUDF(_,
PythonEvalType.SQL_SCALAR_PANDAS_UDF))
+ } else {
+ pythonUDFs
+ }
}
def apply(plan: SparkPlan): SparkPlan = plan transformUp {
- // AggregateInPandasExec and FlatMapGroupsInPandas can be evaluated
directly in python worker
- // Therefore we don't need to extract the UDFs
- case plan: FlatMapGroupsInPandasExec => plan
--- End diff --
This is no longer needed because this rule now only extracts Python UDFs and
Scalar Pandas UDFs, ignoring other types of UDFs
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]