cloud-fan commented on code in PR #55915:
URL: https://github.com/apache/spark/pull/55915#discussion_r3258117743
##########
sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala:
##########
@@ -101,38 +105,115 @@ case class DescribeFunctionCommand(
toAttributes(schema)
}
- override def run(sparkSession: SparkSession): Seq[Row] = {
- val identifier = if (info.getDb != null) {
- sparkSession.sessionState.catalog.qualifyIdentifier(
- FunctionIdentifier(info.getName, Some(info.getDb)))
+ private def append(buffer: ArrayBuffer[(String, String)], key: String,
value: String): Unit = {
+ buffer += (key -> value)
+ }
+
+ /**
+ * Pad all input strings into the same length using the max string length
among all inputs.
+ */
+ private def tabulate(inputs: Seq[String]): Seq[String] = {
+ val maxLen = inputs.map(_.length).max
+ inputs.map { input => input.padTo(maxLen, " ").mkString }
+ }
+
+ private def formatParameters(params: StructType): Seq[String] = {
+ val names = tabulate(params.map(_.name))
+ val dataTypes = tabulate(params.map(_.dataType.sql))
+ // Only show parameter comments in extended mode.
+ val comments = params.map { p =>
+ if (isExtended) p.getComment().map(c => s" '$c'").getOrElse("") else ""
+ }
+ val defaults = params.map { p =>
+ if (isExtended) p.getDefault().map(d => s" DEFAULT $d").getOrElse("")
else ""
+ }
+ names zip dataTypes zip defaults zip comments map {
+ case (((name, dataType), default), comment) => s"$name
$dataType$default$comment"
+ }
+ }
+
+ private def describeSQLFunction(info: ExpressionInfo, parser:
ParserInterface): Seq[Row] = {
+ val buffer = new ArrayBuffer[(String, String)]
+ val f = SQLFunction.fromExpressionInfo(info, parser)
+ append(buffer, "Function:", f.name.toString)
Review Comment:
Behavior change worth confirming: the legacy DESCRIBE path (still used for
non-SQL-UDF functions at L197-203) calls `qualifyIdentifier` and renders
`spark_catalog.default.area`. This SQL-UDF path constructs a
`FunctionIdentifier` from `info.getName` + `info.getDb` only (no catalog), so
the output becomes `default.area`. Two paths in the same command now render the
same field with different qualification depth. If the new 2-part form is the
intended target, consider aligning the Hive path too; otherwise, qualifying
here would keep DESCRIBE output consistent.
##########
sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala:
##########
@@ -101,38 +105,115 @@ case class DescribeFunctionCommand(
toAttributes(schema)
}
- override def run(sparkSession: SparkSession): Seq[Row] = {
- val identifier = if (info.getDb != null) {
- sparkSession.sessionState.catalog.qualifyIdentifier(
- FunctionIdentifier(info.getName, Some(info.getDb)))
+ private def append(buffer: ArrayBuffer[(String, String)], key: String,
value: String): Unit = {
+ buffer += (key -> value)
+ }
+
+ /**
+ * Pad all input strings into the same length using the max string length
among all inputs.
+ */
+ private def tabulate(inputs: Seq[String]): Seq[String] = {
+ val maxLen = inputs.map(_.length).max
+ inputs.map { input => input.padTo(maxLen, " ").mkString }
+ }
+
+ private def formatParameters(params: StructType): Seq[String] = {
+ val names = tabulate(params.map(_.name))
+ val dataTypes = tabulate(params.map(_.dataType.sql))
+ // Only show parameter comments in extended mode.
+ val comments = params.map { p =>
+ if (isExtended) p.getComment().map(c => s" '$c'").getOrElse("") else ""
+ }
+ val defaults = params.map { p =>
+ if (isExtended) p.getDefault().map(d => s" DEFAULT $d").getOrElse("")
else ""
+ }
+ names zip dataTypes zip defaults zip comments map {
+ case (((name, dataType), default), comment) => s"$name
$dataType$default$comment"
+ }
+ }
+
+ private def describeSQLFunction(info: ExpressionInfo, parser:
ParserInterface): Seq[Row] = {
+ val buffer = new ArrayBuffer[(String, String)]
+ val f = SQLFunction.fromExpressionInfo(info, parser)
+ append(buffer, "Function:", f.name.toString)
+ append(buffer, "Type:", if (f.isTableFunc) SQLFunction.TABLE else
SQLFunction.SCALAR)
+ // Function input
+ val input = f.inputParam
+ if (input.nonEmpty) {
+ val params = formatParameters(input.get)
+ assert(params.nonEmpty)
+ append(buffer, "Input:", params.head)
+ params.tail.foreach(s => append(buffer, "", s))
} else {
- FunctionIdentifier(info.getName)
+ append(buffer, "Input:", "()")
}
- val name = identifier.unquotedString
- val result = if (info.getClassName != null) {
- Row(s"Function: $name") ::
- Row(s"Class: ${info.getClassName}") ::
- Row(s"Usage: ${info.getUsage}") :: Nil
+ // Function returns
+ if (f.isTableFunc) {
+ val returnParams = formatParameters(f.getTableFuncReturnCols)
+ assert(returnParams.nonEmpty)
+ append(buffer, "Returns:", returnParams.head)
+ returnParams.tail.foreach(s => append(buffer, "", s))
} else {
- Row(s"Function: $name") :: Row(s"Usage: ${info.getUsage}") :: Nil
+ f.getScalarFuncReturnType match {
+ case _: NullType =>
+ case other => append(buffer, "Returns:", other.sql)
+ }
+ }
+ if (isExtended) {
+ f.comment.foreach(c => append(buffer, "Comment:", c))
+ f.collation.foreach(c => append(buffer, "Collation:", c))
+ f.deterministic.foreach(d => append(buffer, "Deterministic:",
d.toString))
+ f.containsSQL.foreach { c =>
+ val dataAccess = if (c) "CONTAINS SQL" else "READS SQL DATA"
+ append(buffer, "Data Access:", dataAccess)
+ }
+ val configs = f.getSQLConfigs
+ if (configs.nonEmpty) {
+ val sorted = configs.toSeq.sortBy(_._1).map { case (key, value) =>
s"$key=$value" }
+ append(buffer, "Configs:", sorted.head)
+ sorted.tail.foreach(s => append(buffer, "", s))
+ }
+ f.owner.foreach(o => append(buffer, "Owner:", o))
+ append(buffer, "Create Time:", new
java.util.Date(f.createTimeMs).toString)
Review Comment:
`Create Time:` (and the just-above `Owner:`) will be misleading for
persistent SQL UDFs. `SQLFunction.toCatalogFunction` (SQLFunction.scala
L124-130) builds the catalog blob from `sqlFunctionToProps ++ properties` and
omits `functionMetadataToProps` — so `OWNER` and `CREATE_TIME` are never
persisted to the metastore. When the function is later loaded via
`fromCatalogFunction`, `owner` defaults to `None` and `createTimeMs` defaults
to `System.currentTimeMillis()`. After the SessionCatalog change, that
fresh-now timestamp gets written into the cached `ExpressionInfo` via
`toExpressionInfo`, so `Create Time:` ends up showing *when the registry cached
the function*, and `Owner:` is never rendered.
Two options: include `functionMetadataToProps` in `toCatalogFunction` so the
values actually round-trip, or suppress these rows when the function was loaded
from catalog. Either way the current behavior is user-visible and misleading —
and the test only asserts the keyword `"Create Time:"` exists, so it won't
catch this.
##########
sql/core/src/test/scala/org/apache/spark/sql/execution/SQLFunctionSuite.scala:
##########
@@ -113,6 +113,107 @@ class SQLFunctionSuite extends SharedSparkSession {
}
}
+ test("describe SQL scalar functions") {
+ withUserDefinedFunction("foo" -> true, "bar" -> true, "area" -> false) {
+ // Temporary function
+ sql(
+ """
+ |CREATE TEMPORARY FUNCTION foo() RETURNS int
+ |COMMENT 'function foo' RETURN 1
+ |""".stripMargin)
+ checkKeywordsExist(sql("describe function foo"),
+ "Function:", "foo",
+ "Type:", "SCALAR",
+ "Input:", "()",
+ "Returns:", "INT")
+ checkKeywordsExist(sql("describe function extended foo"),
+ "Deterministic: true",
+ "Data Access:", "CONTAINS SQL",
+ "Comment:", "function foo",
+ "Create Time:",
+ "Body:", "1")
+ sql(
+ """
+ |CREATE TEMPORARY FUNCTION bar(x int default 8,
+ |y int default substr('8hello', 1, 1) comment 'var_y')
+ |RETURNS int COMMENT 'function bar' RETURN x + y
+ |""".stripMargin)
+ checkKeywordsExist(sql("describe function bar"),
+ "Function:", "bar",
+ "Input:", "x INT", "y INT",
+ "Returns:", "INT")
+ checkKeywordsExist(sql("describe function extended bar"),
+ "Input:", "x INT DEFAULT 8", "y INT DEFAULT substr('8hello', 1, 1)
'var_y'",
+ "Comment:", "function bar",
+ "Deterministic: true",
+ "Data Access:", "CONTAINS SQL",
+ "Body:", "x + y")
+ // Permanent function
+ sql(
+ """
+ |CREATE FUNCTION area(width double comment 'width', height double
comment 'height')
+ |RETURNS double
+ |COMMENT 'compute area'
+ |DETERMINISTIC
+ |RETURN width * height
+ |""".stripMargin)
+ checkKeywordsExist(sql("describe function area"),
+ "Function:", "default.area",
+ "Type:", "SCALAR",
+ "Input:", "width DOUBLE", "height DOUBLE",
+ "Returns:", "DOUBLE")
+ checkKeywordsExist(sql("describe function extended area"),
+ "Input:", "width DOUBLE 'width'", "height DOUBLE 'height'",
+ "Comment:", "compute area",
+ "Deterministic: true",
+ "Data Access:", "CONTAINS SQL",
+ "Create Time:",
+ "Body:", "width * height")
+ }
+ }
+
+ test("describe SQL table functions") {
+ withUserDefinedFunction("foo" -> false) {
+ sql(
+ """
+ |CREATE FUNCTION foo(x INT) RETURNS TABLE (a INT, b STRING)
+ |COMMENT 'table function foo' RETURN SELECT x, x
+ |""".stripMargin)
+ checkKeywordsExist(sql("describe function foo"),
+ "Function:", "foo",
+ "Type:", "TABLE",
+ "Input:", "x INT",
+ "Returns:", "a INT", "b STRING")
+ checkKeywordsExist(sql("describe function extended foo"),
+ "Comment:", "table function foo",
+ "Deterministic: true",
+ "Data Access:", "CONTAINS SQL",
+ "Create Time:",
+ "Body:", "SELECT x, x")
+ }
+ }
+
+ test("describe SQL functions with derived routine characteristics") {
+ withUserDefinedFunction("foo" -> false, "bar" -> false, "baz" -> false) {
+ withTable("tbl_for_describe") {
+ sql("CREATE TABLE tbl_for_describe AS SELECT 1 AS x")
+ sql("CREATE FUNCTION foo() RETURNS TABLE(x INT) RETURN SELECT * FROM
tbl_for_describe")
+ sql("CREATE FUNCTION bar() RETURNS DOUBLE RETURN SELECT SUM(x) +
rand() FROM foo()")
+ sql("CREATE FUNCTION baz() RETURNS INT NOT DETERMINISTIC READS SQL
DATA RETURN 1")
+ checkKeywordsExist(sql("DESCRIBE FUNCTION EXTENDED foo"),
+ "Deterministic: true",
+ "Data Access:", "READS SQL DATA")
+ checkKeywordsExist(sql("DESCRIBE FUNCTION EXTENDED bar"),
+ "Deterministic: false",
+ "Data Access:", "READS SQL DATA")
+ // Do not overwrite user specified routine characteristics.
Review Comment:
```suggestion
// Do not overwrite user-specified routine characteristics.
```
##########
sql/core/src/test/scala/org/apache/spark/sql/execution/SQLFunctionSuite.scala:
##########
@@ -113,6 +113,107 @@ class SQLFunctionSuite extends SharedSparkSession {
}
}
+ test("describe SQL scalar functions") {
+ withUserDefinedFunction("foo" -> true, "bar" -> true, "area" -> false) {
+ // Temporary function
+ sql(
+ """
+ |CREATE TEMPORARY FUNCTION foo() RETURNS int
+ |COMMENT 'function foo' RETURN 1
+ |""".stripMargin)
+ checkKeywordsExist(sql("describe function foo"),
+ "Function:", "foo",
+ "Type:", "SCALAR",
+ "Input:", "()",
+ "Returns:", "INT")
+ checkKeywordsExist(sql("describe function extended foo"),
+ "Deterministic: true",
+ "Data Access:", "CONTAINS SQL",
+ "Comment:", "function foo",
+ "Create Time:",
+ "Body:", "1")
+ sql(
+ """
+ |CREATE TEMPORARY FUNCTION bar(x int default 8,
+ |y int default substr('8hello', 1, 1) comment 'var_y')
+ |RETURNS int COMMENT 'function bar' RETURN x + y
+ |""".stripMargin)
+ checkKeywordsExist(sql("describe function bar"),
+ "Function:", "bar",
+ "Input:", "x INT", "y INT",
+ "Returns:", "INT")
+ checkKeywordsExist(sql("describe function extended bar"),
+ "Input:", "x INT DEFAULT 8", "y INT DEFAULT substr('8hello', 1, 1)
'var_y'",
+ "Comment:", "function bar",
+ "Deterministic: true",
+ "Data Access:", "CONTAINS SQL",
+ "Body:", "x + y")
+ // Permanent function
+ sql(
+ """
+ |CREATE FUNCTION area(width double comment 'width', height double
comment 'height')
+ |RETURNS double
+ |COMMENT 'compute area'
+ |DETERMINISTIC
+ |RETURN width * height
+ |""".stripMargin)
+ checkKeywordsExist(sql("describe function area"),
+ "Function:", "default.area",
+ "Type:", "SCALAR",
+ "Input:", "width DOUBLE", "height DOUBLE",
+ "Returns:", "DOUBLE")
+ checkKeywordsExist(sql("describe function extended area"),
+ "Input:", "width DOUBLE 'width'", "height DOUBLE 'height'",
+ "Comment:", "compute area",
+ "Deterministic: true",
+ "Data Access:", "CONTAINS SQL",
+ "Create Time:",
Review Comment:
`checkKeywordsExist` only checks that the literal `"Create Time:"` substring
appears anywhere in the output — it doesn't verify the rendered timestamp is
meaningful. For `area`, the displayed time is currently
`System.currentTimeMillis()` at the cache-build moment (see the comment on
functions.scala L177), but the test passes either way. If the persistence issue
is addressed, consider asserting the row's value falls within a small window of
the test's start time.
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SQLFunction.scala:
##########
@@ -253,6 +256,40 @@ object SQLFunction {
}
}
+ /**
+ * Convert an [[ExpressionInfo]] into a SQL function.
+ */
+ def fromExpressionInfo(info: ExpressionInfo, parser: ParserInterface):
SQLFunction = {
Review Comment:
Optional refactor: this method is largely a copy of `fromCatalogFunction`
(L224-257) — the differences are the source of the JSON blob, how
`FunctionIdentifier` is built, and the extra `owner`/`createTimeMs` reads. A
small `fromProps(props: Map[String, String], identifier: FunctionIdentifier):
SQLFunction` helper would let both call into one place and stay in sync when
new properties are added.
##########
sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala:
##########
@@ -101,38 +105,115 @@ case class DescribeFunctionCommand(
toAttributes(schema)
}
- override def run(sparkSession: SparkSession): Seq[Row] = {
- val identifier = if (info.getDb != null) {
- sparkSession.sessionState.catalog.qualifyIdentifier(
- FunctionIdentifier(info.getName, Some(info.getDb)))
+ private def append(buffer: ArrayBuffer[(String, String)], key: String,
value: String): Unit = {
+ buffer += (key -> value)
+ }
+
+ /**
+ * Pad all input strings into the same length using the max string length
among all inputs.
Review Comment:
```suggestion
* Pad all input strings to the same length using the max string length
among all inputs.
```
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala:
##########
@@ -2146,7 +2146,15 @@ class SessionCatalog(
if (registry.functionExists(identToRegister) && !overrideIfExists) {
throw QueryCompilationErrors.functionAlreadyExistsError(func)
}
- val info = makeExprInfoForHiveFunction(funcDefinition)
+ val info = if (funcDefinition.isUserDefinedFunction) {
Review Comment:
The persistent branch of `registerUserDefinedFunction` (L2289-2294) already
has the `UserDefinedFunction` in hand; it calls
`registerFunction(function.toCatalogFunction, ...)`, which this new branch then
reverses via `UserDefinedFunction.fromCatalogFunction(funcDefinition,
parser).toExpressionInfo`. The deserialization step does no useful work — and,
because `toCatalogFunction` doesn't carry `OWNER`/`CREATE_TIME`, it actively
loses fields that `function.toExpressionInfo` would have preserved (see the
`Create Time:` / `Owner:` comment on functions.scala).
Consider routing the persistent path to pass `function.toExpressionInfo`
directly to the registry — that would let this branch go away and would also
fix the create-time / owner regression in one shot.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]