This is an automated email from the ASF dual-hosted git repository.
yao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new be154a371df0 [SPARK-48622][SQL] get SQLConf once when resolving column names
be154a371df0 is described below
commit be154a371df0401163deb221efc3b54fa089f49c
Author: Andrew Xue <[email protected]>
AuthorDate: Fri Jun 14 10:04:24 2024 +0800
[SPARK-48622][SQL] get SQLConf once when resolving column names
### What changes were proposed in this pull request?
`SQLConf.caseSensitiveAnalysis` is currently retrieved for every column
when resolving column names, which is expensive when a table has many
columns. We can instead retrieve it once, before the loop, and reuse the
result.
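
The fix is a standard hoisting pattern. A minimal, self-contained sketch of
the idea (the object name and the stand-in `resolver` accessor below are
illustrative, not Spark's actual implementation):

```scala
object HoistResolverExample {
  // Spark's Resolver is just a name-comparison function.
  type Resolver = (String, String) => Boolean

  // Stand-in for SQLConf.resolver: in Spark, each access re-reads
  // SQLConf.caseSensitiveAnalysis before picking a comparison function.
  def resolver: Resolver = (a, b) => a.equalsIgnoreCase(b)

  def main(args: Array[String]): Unit = {
    val schemaNames = (1 to 10000).map(i => s"col$i")
    val oldNames = schemaNames

    // Slow: the `resolver` accessor (a config read) runs on every
    // name comparison inside the loop.
    val slow = oldNames.filterNot(n => schemaNames.exists(resolver(_, n)))

    // Fast: read the config once, outside the loop, and reuse the value.
    val r = resolver
    val fast = oldNames.filterNot(n => schemaNames.exists(r(_, n)))

    assert(slow == fast) // same result, far fewer config reads
  }
}
```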
### Why are the changes needed?
Performance improvement.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
Profiles of adding 1 column to an empty 10k-column table (hms-parquet,
i.e. a Parquet table backed by the Hive Metastore):
Before (55s):
<img width="1728" alt="Screenshot 2024-06-10 at 12 17 49 PM"
src="https://github.com/databricks/runtime/assets/169104436/58de6a56-943e-465a-9005-ae98f960779e">
After (13s):
<img width="1726" alt="Screenshot 2024-06-10 at 12 17 04 PM"
src="https://github.com/databricks/runtime/assets/169104436/e9bdabc4-6e29-4012-bb01-103fa0b640fc">
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #46979 from andrewxue-db/andrewxue-db/spark-48622.
Authored-by: Andrew Xue <[email protected]>
Signed-off-by: Kent Yao <[email protected]>
---
.../org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala
index 7a19f276b513..0e0852d0a550 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala
@@ -480,8 +480,9 @@ class SessionCatalog(
     val catalogTable = externalCatalog.getTable(db, table)
     val oldDataSchema = catalogTable.dataSchema
     // not supporting dropping columns yet
+    val resolver = conf.resolver
     val nonExistentColumnNames =
-      oldDataSchema.map(_.name).filterNot(columnNameResolved(newDataSchema, _))
+      oldDataSchema.map(_.name).filterNot(columnNameResolved(resolver, newDataSchema, _))
     if (nonExistentColumnNames.nonEmpty) {
       throw QueryCompilationErrors.dropNonExistentColumnsNotSupportedError(nonExistentColumnNames)
     }
@@ -489,8 +490,11 @@ class SessionCatalog(
     externalCatalog.alterTableDataSchema(db, table, newDataSchema)
   }

-  private def columnNameResolved(schema: StructType, colName: String): Boolean = {
-    schema.fields.map(_.name).exists(conf.resolver(_, colName))
+  private def columnNameResolved(
+      resolver: Resolver,
+      schema: StructType,
+      colName: String): Boolean = {
+    schema.fields.exists(f => resolver(f.name, colName))
   }

   /**
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]