Re: [PR] [SPARK-56883][SQL] DESCRIBE FUNCTION for SQL UDFs [spark]

via GitHub Mon, 18 May 2026 03:26:19 -0700


cloud-fan commented on code in PR #55915:
URL: https://github.com/apache/spark/pull/55915#discussion_r3258117743



##########
sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala:
##########
@@ -101,38 +105,115 @@ case class DescribeFunctionCommand(
     toAttributes(schema)
   }
 
-  override def run(sparkSession: SparkSession): Seq[Row] = {
-    val identifier = if (info.getDb != null) {
-      sparkSession.sessionState.catalog.qualifyIdentifier(
-        FunctionIdentifier(info.getName, Some(info.getDb)))
+  private def append(buffer: ArrayBuffer[(String, String)], key: String, 
value: String): Unit = {
+    buffer += (key -> value)
+  }
+
+  /**
+   * Pad all input strings into the same length using the max string length 
among all inputs.
+   */
+  private def tabulate(inputs: Seq[String]): Seq[String] = {
+    val maxLen = inputs.map(_.length).max
+    inputs.map { input => input.padTo(maxLen, " ").mkString }
+  }
+
+  private def formatParameters(params: StructType): Seq[String] = {
+    val names = tabulate(params.map(_.name))
+    val dataTypes = tabulate(params.map(_.dataType.sql))
+    // Only show parameter comments in extended mode.
+    val comments = params.map { p =>
+      if (isExtended) p.getComment().map(c => s" '$c'").getOrElse("") else ""
+    }
+    val defaults = params.map { p =>
+      if (isExtended) p.getDefault().map(d => s" DEFAULT $d").getOrElse("") 
else ""
+    }
+    names zip dataTypes zip defaults zip comments map {
+      case (((name, dataType), default), comment) => s"$name 
$dataType$default$comment"
+    }
+  }
+
+  private def describeSQLFunction(info: ExpressionInfo, parser: 
ParserInterface): Seq[Row] = {
+    val buffer = new ArrayBuffer[(String, String)]
+    val f = SQLFunction.fromExpressionInfo(info, parser)
+    append(buffer, "Function:", f.name.toString)

Review Comment:
   Behavior change worth confirming: the legacy DESCRIBE path (still used for 
non-SQL-UDF functions at L197-203) calls `qualifyIdentifier` and renders 
`spark_catalog.default.area`. This SQL-UDF path constructs a 
`FunctionIdentifier` from `info.getName` + `info.getDb` only (no catalog), so 
the output becomes `default.area`. Two paths in the same command now render the 
same field with different qualification depth. If the new 2-part form is the 
intended target, consider aligning the Hive path too; otherwise, qualifying 
here would keep DESCRIBE output consistent.



##########
sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala:
##########
@@ -101,38 +105,115 @@ case class DescribeFunctionCommand(
     toAttributes(schema)
   }
 
-  override def run(sparkSession: SparkSession): Seq[Row] = {
-    val identifier = if (info.getDb != null) {
-      sparkSession.sessionState.catalog.qualifyIdentifier(
-        FunctionIdentifier(info.getName, Some(info.getDb)))
+  private def append(buffer: ArrayBuffer[(String, String)], key: String, 
value: String): Unit = {
+    buffer += (key -> value)
+  }
+
+  /**
+   * Pad all input strings into the same length using the max string length 
among all inputs.
+   */
+  private def tabulate(inputs: Seq[String]): Seq[String] = {
+    val maxLen = inputs.map(_.length).max
+    inputs.map { input => input.padTo(maxLen, " ").mkString }
+  }
+
+  private def formatParameters(params: StructType): Seq[String] = {
+    val names = tabulate(params.map(_.name))
+    val dataTypes = tabulate(params.map(_.dataType.sql))
+    // Only show parameter comments in extended mode.
+    val comments = params.map { p =>
+      if (isExtended) p.getComment().map(c => s" '$c'").getOrElse("") else ""
+    }
+    val defaults = params.map { p =>
+      if (isExtended) p.getDefault().map(d => s" DEFAULT $d").getOrElse("") 
else ""
+    }
+    names zip dataTypes zip defaults zip comments map {
+      case (((name, dataType), default), comment) => s"$name 
$dataType$default$comment"
+    }
+  }
+
+  private def describeSQLFunction(info: ExpressionInfo, parser: 
ParserInterface): Seq[Row] = {
+    val buffer = new ArrayBuffer[(String, String)]
+    val f = SQLFunction.fromExpressionInfo(info, parser)
+    append(buffer, "Function:", f.name.toString)
+    append(buffer, "Type:", if (f.isTableFunc) SQLFunction.TABLE else 
SQLFunction.SCALAR)
+    // Function input
+    val input = f.inputParam
+    if (input.nonEmpty) {
+      val params = formatParameters(input.get)
+      assert(params.nonEmpty)
+      append(buffer, "Input:", params.head)
+      params.tail.foreach(s => append(buffer, "", s))
     } else {
-      FunctionIdentifier(info.getName)
+      append(buffer, "Input:", "()")
     }
-    val name = identifier.unquotedString
-    val result = if (info.getClassName != null) {
-      Row(s"Function: $name") ::
-        Row(s"Class: ${info.getClassName}") ::
-        Row(s"Usage: ${info.getUsage}") :: Nil
+    // Function returns
+    if (f.isTableFunc) {
+      val returnParams = formatParameters(f.getTableFuncReturnCols)
+      assert(returnParams.nonEmpty)
+      append(buffer, "Returns:", returnParams.head)
+      returnParams.tail.foreach(s => append(buffer, "", s))
     } else {
-      Row(s"Function: $name") :: Row(s"Usage: ${info.getUsage}") :: Nil
+      f.getScalarFuncReturnType match {
+        case _: NullType =>
+        case other => append(buffer, "Returns:", other.sql)
+      }
+    }
+    if (isExtended) {
+      f.comment.foreach(c => append(buffer, "Comment:", c))
+      f.collation.foreach(c => append(buffer, "Collation:", c))
+      f.deterministic.foreach(d => append(buffer, "Deterministic:", 
d.toString))
+      f.containsSQL.foreach { c =>
+        val dataAccess = if (c) "CONTAINS SQL" else "READS SQL DATA"
+        append(buffer, "Data Access:", dataAccess)
+      }
+      val configs = f.getSQLConfigs
+      if (configs.nonEmpty) {
+        val sorted = configs.toSeq.sortBy(_._1).map { case (key, value) => 
s"$key=$value" }
+        append(buffer, "Configs:", sorted.head)
+        sorted.tail.foreach(s => append(buffer, "", s))
+      }
+      f.owner.foreach(o => append(buffer, "Owner:", o))
+      append(buffer, "Create Time:", new 
java.util.Date(f.createTimeMs).toString)

Review Comment:
   `Create Time:` (and the just-above `Owner:`) will be misleading for 
persistent SQL UDFs. `SQLFunction.toCatalogFunction` (SQLFunction.scala 
L124-130) builds the catalog blob from `sqlFunctionToProps ++ properties` and 
omits `functionMetadataToProps` — so `OWNER` and `CREATE_TIME` are never 
persisted to the metastore. When the function is later loaded via 
`fromCatalogFunction`, `owner` defaults to `None` and `createTimeMs` defaults 
to `System.currentTimeMillis()`. After the SessionCatalog change, that 
fresh-now timestamp gets written into the cached `ExpressionInfo` via 
`toExpressionInfo`, so `Create Time:` ends up showing *when the registry cached 
the function*, and `Owner:` is never rendered.
   
   Two options: include `functionMetadataToProps` in `toCatalogFunction` so the 
values actually round-trip, or suppress these rows when the function was loaded 
from catalog. Either way the current behavior is user-visible and misleading — 
and the test only asserts the keyword `"Create Time:"` exists, so it won't 
catch this.



##########
sql/core/src/test/scala/org/apache/spark/sql/execution/SQLFunctionSuite.scala:
##########
@@ -113,6 +113,107 @@ class SQLFunctionSuite extends SharedSparkSession {
     }
   }
 
+  test("describe SQL scalar functions") {
+    withUserDefinedFunction("foo" -> true, "bar" -> true, "area" -> false) {
+      // Temporary function
+      sql(
+        """
+          |CREATE TEMPORARY FUNCTION foo() RETURNS int
+          |COMMENT 'function foo' RETURN 1
+          |""".stripMargin)
+      checkKeywordsExist(sql("describe function foo"),
+        "Function:", "foo",
+        "Type:", "SCALAR",
+        "Input:", "()",
+        "Returns:", "INT")
+      checkKeywordsExist(sql("describe function extended foo"),
+        "Deterministic: true",
+        "Data Access:", "CONTAINS SQL",
+        "Comment:", "function foo",
+        "Create Time:",
+        "Body:", "1")
+      sql(
+        """
+          |CREATE TEMPORARY FUNCTION bar(x int default 8,
+          |y int default substr('8hello', 1, 1) comment 'var_y')
+          |RETURNS int COMMENT 'function bar' RETURN x + y
+          |""".stripMargin)
+      checkKeywordsExist(sql("describe function bar"),
+        "Function:", "bar",
+        "Input:", "x INT", "y INT",
+        "Returns:", "INT")
+      checkKeywordsExist(sql("describe function extended bar"),
+        "Input:", "x INT DEFAULT 8", "y INT DEFAULT substr('8hello', 1, 1) 
'var_y'",
+        "Comment:", "function bar",
+        "Deterministic: true",
+        "Data Access:", "CONTAINS SQL",
+        "Body:", "x + y")
+      // Permanent function
+      sql(
+        """
+          |CREATE FUNCTION area(width double comment 'width', height double 
comment 'height')
+          |RETURNS double
+          |COMMENT 'compute area'
+          |DETERMINISTIC
+          |RETURN width * height
+          |""".stripMargin)
+      checkKeywordsExist(sql("describe function area"),
+        "Function:", "default.area",
+        "Type:", "SCALAR",
+        "Input:", "width  DOUBLE", "height DOUBLE",
+        "Returns:", "DOUBLE")
+      checkKeywordsExist(sql("describe function extended area"),
+        "Input:", "width  DOUBLE 'width'", "height DOUBLE 'height'",
+        "Comment:", "compute area",
+        "Deterministic: true",
+        "Data Access:", "CONTAINS SQL",
+        "Create Time:",
+        "Body:", "width * height")
+    }
+  }
+
+  test("describe SQL table functions") {
+    withUserDefinedFunction("foo" -> false) {
+      sql(
+        """
+          |CREATE FUNCTION foo(x INT) RETURNS TABLE (a INT, b STRING)
+          |COMMENT 'table function foo' RETURN SELECT x, x
+          |""".stripMargin)
+      checkKeywordsExist(sql("describe function foo"),
+        "Function:", "foo",
+        "Type:", "TABLE",
+        "Input:", "x INT",
+        "Returns:", "a INT", "b STRING")
+      checkKeywordsExist(sql("describe function extended foo"),
+        "Comment:", "table function foo",
+        "Deterministic: true",
+        "Data Access:", "CONTAINS SQL",
+        "Create Time:",
+        "Body:", "SELECT x, x")
+    }
+  }
+
+  test("describe SQL functions with derived routine characteristics") {
+    withUserDefinedFunction("foo" -> false, "bar" -> false, "baz" -> false) {
+      withTable("tbl_for_describe") {
+        sql("CREATE TABLE tbl_for_describe AS SELECT 1 AS x")
+        sql("CREATE FUNCTION foo() RETURNS TABLE(x INT) RETURN SELECT * FROM 
tbl_for_describe")
+        sql("CREATE FUNCTION bar() RETURNS DOUBLE RETURN SELECT SUM(x) + 
rand() FROM foo()")
+        sql("CREATE FUNCTION baz() RETURNS INT NOT DETERMINISTIC READS SQL 
DATA RETURN 1")
+        checkKeywordsExist(sql("DESCRIBE FUNCTION EXTENDED foo"),
+          "Deterministic: true",
+          "Data Access:", "READS SQL DATA")
+        checkKeywordsExist(sql("DESCRIBE FUNCTION EXTENDED bar"),
+          "Deterministic: false",
+          "Data Access:", "READS SQL DATA")
+        // Do not overwrite user specified routine characteristics.

Review Comment:
   ```suggestion
           // Do not overwrite user-specified routine characteristics.
   ```



##########
sql/core/src/test/scala/org/apache/spark/sql/execution/SQLFunctionSuite.scala:
##########
@@ -113,6 +113,107 @@ class SQLFunctionSuite extends SharedSparkSession {
     }
   }
 
+  test("describe SQL scalar functions") {
+    withUserDefinedFunction("foo" -> true, "bar" -> true, "area" -> false) {
+      // Temporary function
+      sql(
+        """
+          |CREATE TEMPORARY FUNCTION foo() RETURNS int
+          |COMMENT 'function foo' RETURN 1
+          |""".stripMargin)
+      checkKeywordsExist(sql("describe function foo"),
+        "Function:", "foo",
+        "Type:", "SCALAR",
+        "Input:", "()",
+        "Returns:", "INT")
+      checkKeywordsExist(sql("describe function extended foo"),
+        "Deterministic: true",
+        "Data Access:", "CONTAINS SQL",
+        "Comment:", "function foo",
+        "Create Time:",
+        "Body:", "1")
+      sql(
+        """
+          |CREATE TEMPORARY FUNCTION bar(x int default 8,
+          |y int default substr('8hello', 1, 1) comment 'var_y')
+          |RETURNS int COMMENT 'function bar' RETURN x + y
+          |""".stripMargin)
+      checkKeywordsExist(sql("describe function bar"),
+        "Function:", "bar",
+        "Input:", "x INT", "y INT",
+        "Returns:", "INT")
+      checkKeywordsExist(sql("describe function extended bar"),
+        "Input:", "x INT DEFAULT 8", "y INT DEFAULT substr('8hello', 1, 1) 
'var_y'",
+        "Comment:", "function bar",
+        "Deterministic: true",
+        "Data Access:", "CONTAINS SQL",
+        "Body:", "x + y")
+      // Permanent function
+      sql(
+        """
+          |CREATE FUNCTION area(width double comment 'width', height double 
comment 'height')
+          |RETURNS double
+          |COMMENT 'compute area'
+          |DETERMINISTIC
+          |RETURN width * height
+          |""".stripMargin)
+      checkKeywordsExist(sql("describe function area"),
+        "Function:", "default.area",
+        "Type:", "SCALAR",
+        "Input:", "width  DOUBLE", "height DOUBLE",
+        "Returns:", "DOUBLE")
+      checkKeywordsExist(sql("describe function extended area"),
+        "Input:", "width  DOUBLE 'width'", "height DOUBLE 'height'",
+        "Comment:", "compute area",
+        "Deterministic: true",
+        "Data Access:", "CONTAINS SQL",
+        "Create Time:",

Review Comment:
   `checkKeywordsExist` only checks that the literal `"Create Time:"` substring 
appears anywhere in the output — it doesn't verify the rendered timestamp is 
meaningful. For `area`, the displayed time is currently 
`System.currentTimeMillis()` at the cache-build moment (see the comment on 
functions.scala L177), but the test passes either way. If the persistence issue 
is addressed, consider asserting the row's value falls within a small window of 
the test's start time.



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SQLFunction.scala:
##########
@@ -253,6 +256,40 @@ object SQLFunction {
     }
   }
 
+  /**
+   * Convert an [[ExpressionInfo]] into a SQL function.
+   */
+  def fromExpressionInfo(info: ExpressionInfo, parser: ParserInterface): 
SQLFunction = {

Review Comment:
   Optional refactor: this method is largely a copy of `fromCatalogFunction` 
(L224-257) — the differences are the source of the JSON blob, how 
`FunctionIdentifier` is built, and the extra `owner`/`createTimeMs` reads. A 
small `fromProps(props: Map[String, String], identifier: FunctionIdentifier): 
SQLFunction` helper would let both call into one place and stay in sync when 
new properties are added.



##########
sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala:
##########
@@ -101,38 +105,115 @@ case class DescribeFunctionCommand(
     toAttributes(schema)
   }
 
-  override def run(sparkSession: SparkSession): Seq[Row] = {
-    val identifier = if (info.getDb != null) {
-      sparkSession.sessionState.catalog.qualifyIdentifier(
-        FunctionIdentifier(info.getName, Some(info.getDb)))
+  private def append(buffer: ArrayBuffer[(String, String)], key: String, 
value: String): Unit = {
+    buffer += (key -> value)
+  }
+
+  /**
+   * Pad all input strings into the same length using the max string length 
among all inputs.

Review Comment:
   ```suggestion
      * Pad all input strings to the same length using the max string length 
among all inputs.
   ```



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala:
##########
@@ -2146,7 +2146,15 @@ class SessionCatalog(
     if (registry.functionExists(identToRegister) && !overrideIfExists) {
       throw QueryCompilationErrors.functionAlreadyExistsError(func)
     }
-    val info = makeExprInfoForHiveFunction(funcDefinition)
+    val info = if (funcDefinition.isUserDefinedFunction) {

Review Comment:
   The persistent branch of `registerUserDefinedFunction` (L2289-2294) already 
has the `UserDefinedFunction` in hand; it calls 
`registerFunction(function.toCatalogFunction, ...)`, which this new branch then 
reverses via `UserDefinedFunction.fromCatalogFunction(funcDefinition, 
parser).toExpressionInfo`. The deserialization step does no useful work — and, 
because `toCatalogFunction` doesn't carry `OWNER`/`CREATE_TIME`, it actively 
loses fields that `function.toExpressionInfo` would have preserved (see the 
`Create Time:` / `Owner:` comment on functions.scala).
   
   Consider routing the persistent path to pass `function.toExpressionInfo` 
directly to the registry — that would let this branch go away and would also 
fix the create-time / owner regression in one shot.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] [SPARK-56883][SQL] DESCRIBE FUNCTION for SQL UDFs [spark]

Reply via email to