HyukjinKwon commented on a change in pull request #25130:
[SPARK-28359][test-maven][SQL][PYTHON][TESTS] Make integrated UDF tests robust
by making UDFs (virtually) no-op
URL: https://github.com/apache/spark/pull/25130#discussion_r303227812
##########
File path:
sql/core/src/test/scala/org/apache/spark/sql/IntegratedUDFTestUtils.scala
##########
@@ -238,23 +274,56 @@ object IntegratedUDFTestUtils extends SQLHelper {
accumulator = null),
dataType = StringType,
pythonEvalType = PythonEvalType.SQL_SCALAR_PANDAS_UDF,
- udfDeterministic = true)
+ udfDeterministic = true) {
+
+ override def builder(e: Seq[Expression]): Expression = {
+ assert(e.length == 1, "Defined UDF only has one column")
+ val expr = e.head
+    assert(expr.resolved, "column should be resolved to use the same type " +
+      "as input. Try df(name) or df.col(name)")
+ Cast(super.builder(Cast(expr, StringType) :: Nil), expr.dataType)
+ }
+ }
def apply(exprs: Column*): Column = udf(exprs: _*)
val prettyName: String = "Scalar Pandas UDF"
}
/**
- * A Scala UDF that takes one column and returns a string column.
- * Equivalent to `udf((input: Any) => String.valueOf(input)`.
+ * A Scala UDF that takes one column, casts into string, executes the
+ * Scala native function, and casts back to the type of input column.
+ *
+ * Virtually equivalent to:
+ *
+ * {{{
+ * import org.apache.spark.sql.functions.udf
+ *
+ * val df = spark.range(3).toDF("col")
+ * val scala_udf = udf((input: Any) => input.toString)
+ * val casted_col = scala_udf(df.col("col").cast("string"))
+ * casted_col.cast(df.schema("col").dataType)
+ * }}}
*/
case class TestScalaUDF(name: String) extends TestUDF {
- private[IntegratedUDFTestUtils] lazy val udf = SparkUserDefinedFunction(
- (input: Any) => String.valueOf(input),
+  private[IntegratedUDFTestUtils] lazy val udf = new SparkUserDefinedFunction(
+ (input: Any) => if (input == null) {
+ null
+ } else {
+ input.toString
+ },
StringType,
inputSchemas = Seq.fill(1)(None),
- name = Some(name))
+ name = Some(name)) {
+
+ override def apply(exprs: Column*): Column = {
+ assert(exprs.length == 1, "Defined UDF only has one column")
+ val expr = exprs.head.expr
+      assert(expr.resolved, "column should be resolved to use the same type " +
+        "as input. Try df(name) or df.col(name)")
+      Column(Cast(createScalaUDF(Cast(expr, StringType) :: Nil), expr.dataType))
Review comment:
Looks like I can even try to (de)serialize to/from JSON string for complex
types to work around.
But I think it's overkill for now. So far, there do not seem to be many tests
that use complex types for UDF test cases. I will add it later if we see some
value in that.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]