sunchao commented on a change in pull request #30825:
URL: https://github.com/apache/spark/pull/30825#discussion_r545623161
##########
File path:
sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala
##########
@@ -863,6 +864,23 @@ class DataSourceV2SQLSuite
}
}
+ test("SPARK-33829: Renaming a table should recreate a cache while retaining
the old cache info") {
+ withTable("testcat.ns.old", "testcat.ns.new") {
+ def getStorageLevel(tableName: String): StorageLevel = {
+ val table = spark.table(tableName)
+ val cachedData =
spark.sharedState.cacheManager.lookupCachedData(table).get
Review comment:
nit: consider adding an assertion here to check that the cached data is non-empty.
##########
File path:
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala
##########
@@ -52,21 +52,36 @@ class DataSourceV2Strategy(session: SparkSession) extends
Strategy with Predicat
}
}
+ private def cache(relation: DataSourceV2Relation, cacheInfo: CacheInfo):
Unit = {
+ session.sharedState.cacheManager.cacheQuery(
+ session,
+ relation,
+ cacheInfo.tableName,
+ cacheInfo.storageLevel)
+ }
+
private def refreshCache(r: DataSourceV2Relation)(): Unit = {
session.sharedState.cacheManager.recacheByPlan(session, r)
}
- private def invalidateCache(r: ResolvedTable, recacheTable: Boolean =
false)(): Unit = {
+ // Invalidates the cache associated with the given table. If there exists a
cache with the given
+ // table, the cache's info (table name and storage level) is returned.
+ private def invalidateCache(
+ r: ResolvedTable,
+ recacheTable: Boolean = false)(): Option[CacheInfo] = {
val v2Relation = DataSourceV2Relation.create(r.table, Some(r.catalog),
Some(r.identifier))
val cache = session.sharedState.cacheManager.lookupCachedData(v2Relation)
session.sharedState.cacheManager.uncacheQuery(session, v2Relation, cascade
= true)
- if (recacheTable && cache.isDefined) {
- // save the cache name and cache level for recreation
+ if (cache.isDefined) {
val cacheName = cache.get.cachedRepresentation.cacheBuilder.tableName
val cacheLevel = cache.get.cachedRepresentation.cacheBuilder.storageLevel
Review comment:
I think they are also needed for the returned `CacheInfo`.
##########
File path:
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala
##########
@@ -52,21 +52,36 @@ class DataSourceV2Strategy(session: SparkSession) extends
Strategy with Predicat
}
}
+ private def cache(relation: DataSourceV2Relation, cacheInfo: CacheInfo):
Unit = {
+ session.sharedState.cacheManager.cacheQuery(
+ session,
+ relation,
+ cacheInfo.tableName,
+ cacheInfo.storageLevel)
+ }
+
private def refreshCache(r: DataSourceV2Relation)(): Unit = {
session.sharedState.cacheManager.recacheByPlan(session, r)
}
- private def invalidateCache(r: ResolvedTable, recacheTable: Boolean =
false)(): Unit = {
+ // Invalidates the cache associated with the given table. If there exists a
cache with the given
+ // table, the cache's info (table name and storage level) is returned.
+ private def invalidateCache(
+ r: ResolvedTable,
+ recacheTable: Boolean = false)(): Option[CacheInfo] = {
Review comment:
Not sure if the `CacheInfo` class is really needed, since only the returned
storage level is used here. `Option[StorageLevel]` should work too.
##########
File path:
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/RenameTableExec.scala
##########
@@ -27,14 +27,26 @@ import org.apache.spark.sql.connector.catalog.{Identifier,
TableCatalog}
case class RenameTableExec(
catalog: TableCatalog,
oldIdent: Identifier,
- newIdent: Identifier) extends V2CommandExec {
+ newIdent: Identifier,
+ invalidateCache: () => Option[CacheInfo],
+ cacheTable: (DataSourceV2Relation, CacheInfo) => Unit)
+ extends V2CommandExec {
override def output: Seq[Attribute] = Seq.empty
override protected def run(): Seq[InternalRow] = {
+ import
org.apache.spark.sql.connector.catalog.CatalogV2Implicits.IdentifierHelper
+
+ val optOldCacheInfo = invalidateCache()
catalog.invalidateTable(oldIdent)
+
catalog.renameTable(oldIdent, newIdent)
+ optOldCacheInfo.foreach { cacheInfo =>
+ val tbl = catalog.loadTable(newIdent)
+ val newRelation = DataSourceV2Relation.create(tbl, Some(catalog),
Some(newIdent))
+ cacheTable(newRelation, CacheInfo(Some(newIdent.quoted),
cacheInfo.storageLevel))
Review comment:
It's unfortunate that we have to recache the table just for renaming,
but I don't know if there is a way to reuse the old in-memory relation.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]