This is an automated email from the ASF dual-hosted git repository.
ggal pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-livy.git
The following commit(s) were added to refs/heads/master by this push:
new 370a7156 [LIVY-1018] Extract field comments from Spark schema metadata and set in DataTypeUtils
370a7156 is described below
commit 370a71561dcf60a860d3b97ef20810b95bf90980
Author: Arnav Balyan <[email protected]>
AuthorDate: Tue Oct 7 20:00:17 2025 +0530
[LIVY-1018] Extract field comments from Spark schema metadata and set in DataTypeUtils
## What changes were proposed in this pull request?
* During schema fetch, extract and persist column comments instead of dropping them.
* Column comments are extracted from Spark metadata in both complexToDataType and schemaFromSparkJson (see the illustrative sketch after this list).
* Closes #483: Extract comment metadata for complexToDataType and schemaFromSparkJson
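For illustration only (not part of this patch), a minimal Spark sketch of where such a comment lives in the schema JSON that the Livy thriftserver parses; the field name and comment text are made up:

```scala
import org.apache.spark.sql.types._

// Attach a "comment" entry to the field metadata, the same key the
// patch reads back via DataTypeUtils.extractComment.
val schema = StructType(Seq(
  StructField(
    "id",
    IntegerType,
    nullable = true,
    new MetadataBuilder().putString("comment", "User identifier").build())))

// schema.json yields roughly:
// {"type":"struct","fields":[{"name":"id","type":"integer","nullable":true,
//   "metadata":{"comment":"User identifier"}}]}
println(schema.json)
```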
## How was this patch tested?
CI
---
.../livy/thriftserver/types/DataTypeUtils.scala | 23 ++++++--
.../livy/thriftserver/ThriftServerSuites.scala | 65 ++++++++++++++++++++++
2 files changed, 84 insertions(+), 4 deletions(-)
diff --git a/thriftserver/server/src/main/scala/org/apache/livy/thriftserver/types/DataTypeUtils.scala b/thriftserver/server/src/main/scala/org/apache/livy/thriftserver/types/DataTypeUtils.scala
index 90799f7a..f8f0f190 100644
--- a/thriftserver/server/src/main/scala/org/apache/livy/thriftserver/types/DataTypeUtils.scala
+++ b/thriftserver/server/src/main/scala/org/apache/livy/thriftserver/types/DataTypeUtils.scala
@@ -42,8 +42,8 @@ object DataTypeUtils {
case "array" => ArrayType(toFieldType(sparkType \ "elementType"))
case "struct" =>
val fields = (sparkType \ "fields").children.map { f =>
- // TODO: get comment from metadata
- Field((f \ "name").extract[String], toFieldType(f \ "type"), "")
+ val comment = extractComment(f \ "metadata")
+ Field((f \ "name").extract[String], toFieldType(f \ "type"), comment)
}
StructType(fields.toArray)
case "map" =>
@@ -52,6 +52,21 @@ object DataTypeUtils {
}
}
+ /**
+ * Extracts comment from Spark field metadata JSON.
+ * Spark stores comments in metadata under the "comment" key.
+ */
+ private def extractComment(metadata: JValue): String = {
+ metadata match {
+ case JObject(fields) =>
+ fields.find(_._1 == "comment") match {
+ case Some((_, JString(comment))) => comment
+ case _ => ""
+ }
+ case _ => ""
+ }
+ }
+
/**
* Converts a JSON representing the Spark schema (the one returned by `df.schema.json`) into
@@ -65,8 +80,8 @@ object DataTypeUtils {
val fields = schema.children.map { field =>
val name = (field \ "name").extract[String]
val hiveType = toFieldType(field \ "type")
- // TODO: retrieve comment from metadata
- Field(name, hiveType, "")
+ val comment = extractComment(field \ "metadata")
+ Field(name, hiveType, comment)
}
Schema(fields.toArray)
}
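As a standalone illustration of the lookup added above (again, not part of the patch), the "comment" key can be pulled out of a parsed metadata object with json4s like this; the sample JSON is invented:

```scala
import org.json4s._
import org.json4s.jackson.JsonMethods.parse

// Field metadata as it appears inside df.schema.json (illustrative sample).
val metadata: JValue = parse("""{"comment":"User identifier"}""")

// Same idea as extractComment: look for a "comment" key, default to "".
val comment = metadata match {
  case JObject(kvs) => kvs.collectFirst { case ("comment", JString(c)) => c }.getOrElse("")
  case _ => ""
}

println(comment) // User identifier
```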
diff --git a/thriftserver/server/src/test/scala/org/apache/livy/thriftserver/ThriftServerSuites.scala b/thriftserver/server/src/test/scala/org/apache/livy/thriftserver/ThriftServerSuites.scala
index c0f4a36d..cc28b02e 100644
--- a/thriftserver/server/src/test/scala/org/apache/livy/thriftserver/ThriftServerSuites.scala
+++ b/thriftserver/server/src/test/scala/org/apache/livy/thriftserver/ThriftServerSuites.scala
@@ -310,6 +310,59 @@ trait CommonThriftTests {
}
+ def getColumnsWithCommentsTest(connection: Connection): Unit = {
+ val metadata = connection.getMetaData
+ val statement = connection.createStatement()
+ try {
+ statement.execute(
+ "CREATE TABLE test_column_comments (" +
+ "id integer COMMENT 'User identifier', " +
+ "name string COMMENT 'User full name', " +
+ "age integer COMMENT 'User age in years') USING json")
+
+ val columnsResultSet = metadata.getColumns("", "default", "test_column_comments", "%")
+
+ columnsResultSet.next()
+ assert(columnsResultSet.getString("COLUMN_NAME") == "id")
+ assert(columnsResultSet.getString("REMARKS") == "User identifier",
+ "Column 'id' should have comment 'User identifier'")
+
+ columnsResultSet.next()
+ assert(columnsResultSet.getString("COLUMN_NAME") == "name")
+ assert(columnsResultSet.getString("REMARKS") == "User full name",
+ "Column 'name' should have comment 'User full name'")
+
+ columnsResultSet.next()
+ assert(columnsResultSet.getString("COLUMN_NAME") == "age")
+ assert(columnsResultSet.getString("REMARKS") == "User age in years",
+ "Column 'age' should have comment 'User age in years'")
+
+ assert(!columnsResultSet.next())
+ columnsResultSet.close()
+
+ statement.execute(
+ "CREATE TABLE test_struct_comments (" +
+ "person struct<id:int COMMENT 'Person ID', " +
+ "name:string COMMENT 'Person name'>, " +
+ "address struct<street:string COMMENT 'Street address', " +
+ "city:string COMMENT 'City name'>) " +
+ "USING json")
+
+ val rs = statement.executeQuery("SELECT person, address FROM test_struct_comments")
+ val rsMetaData = rs.getMetaData()
+
+ assert(rsMetaData.getColumnCount == 2)
+ assert(rsMetaData.getColumnName(1) == "person")
+ assert(rsMetaData.getColumnName(2) == "address")
+
+ rs.close()
+ } finally {
+ statement.execute("DROP TABLE IF EXISTS test_column_comments")
+ statement.execute("DROP TABLE IF EXISTS test_struct_comments")
+ statement.close()
+ }
+ }
+
def operationLogRetrievalTest(statement: Statement): Unit = {
statement.execute("select 1")
val logIterator = statement.asInstanceOf[HiveStatement].getQueryLog().iterator()
@@ -562,6 +615,12 @@ class BinaryThriftServerSuite extends ThriftServerBaseTest with CommonThriftTest
}
}
+ test("fetch columns with comments") {
+ withJdbcConnection { connection =>
+ getColumnsWithCommentsTest(connection)
+ }
+ }
+
test("operation log retrieval test") {
withJdbcStatement { statement =>
operationLogRetrievalTest(statement)
@@ -639,6 +698,12 @@ class HttpThriftServerSuite extends ThriftServerBaseTest with CommonThriftTests
}
}
+ test("fetch columns with comments") {
+ withJdbcConnection { connection =>
+ getColumnsWithCommentsTest(connection)
+ }
+ }
+
test("operation log retrieval test") {
withJdbcStatement { statement =>
operationLogRetrievalTest(statement)