[GitHub] [spark] sunchao commented on a change in pull request #30825: [SPARK-33829][SQL] Renaming v2 tables should recreate the cache

GitBox Thu, 17 Dec 2020 23:42:07 -0800


sunchao commented on a change in pull request #30825:
URL: https://github.com/apache/spark/pull/30825#discussion_r545623161




##########
File path: 
sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala
##########
@@ -863,6 +864,23 @@ class DataSourceV2SQLSuite
     }
   }
 
+  test("SPARK-33829: Renaming a table should recreate a cache while retaining 
the old cache info") {
+    withTable("testcat.ns.old", "testcat.ns.new") {
+      def getStorageLevel(tableName: String): StorageLevel = {
+        val table = spark.table(tableName)
+        val cachedData = 
spark.sharedState.cacheManager.lookupCachedData(table).get

Review comment:
       nit: consider adding an assertion here to check that the cached data is non-empty.

##########
File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala
##########
@@ -52,21 +52,36 @@ class DataSourceV2Strategy(session: SparkSession) extends 
Strategy with Predicat
     }
   }
 
+  private def cache(relation: DataSourceV2Relation, cacheInfo: CacheInfo): 
Unit = {
+    session.sharedState.cacheManager.cacheQuery(
+      session,
+      relation,
+      cacheInfo.tableName,
+      cacheInfo.storageLevel)
+  }
+
   private def refreshCache(r: DataSourceV2Relation)(): Unit = {
     session.sharedState.cacheManager.recacheByPlan(session, r)
   }
 
-  private def invalidateCache(r: ResolvedTable, recacheTable: Boolean = 
false)(): Unit = {
+  // Invalidates the cache associated with the given table. If there exists a 
cache with the given
+  // table, the cache's info (table name and storage level) is returned.
+  private def invalidateCache(
+      r: ResolvedTable,
+      recacheTable: Boolean = false)(): Option[CacheInfo] = {
     val v2Relation = DataSourceV2Relation.create(r.table, Some(r.catalog), 
Some(r.identifier))
     val cache = session.sharedState.cacheManager.lookupCachedData(v2Relation)
     session.sharedState.cacheManager.uncacheQuery(session, v2Relation, cascade 
= true)
-    if (recacheTable && cache.isDefined) {
-      // save the cache name and cache level for recreation
+    if (cache.isDefined) {
       val cacheName = cache.get.cachedRepresentation.cacheBuilder.tableName
       val cacheLevel = cache.get.cachedRepresentation.cacheBuilder.storageLevel

Review comment:
       I think they are also needed for the returned `CacheInfo`.

##########
File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala
##########
@@ -52,21 +52,36 @@ class DataSourceV2Strategy(session: SparkSession) extends 
Strategy with Predicat
     }
   }
 
+  private def cache(relation: DataSourceV2Relation, cacheInfo: CacheInfo): 
Unit = {
+    session.sharedState.cacheManager.cacheQuery(
+      session,
+      relation,
+      cacheInfo.tableName,
+      cacheInfo.storageLevel)
+  }
+
   private def refreshCache(r: DataSourceV2Relation)(): Unit = {
     session.sharedState.cacheManager.recacheByPlan(session, r)
   }
 
-  private def invalidateCache(r: ResolvedTable, recacheTable: Boolean = 
false)(): Unit = {
+  // Invalidates the cache associated with the given table. If there exists a 
cache with the given
+  // table, the cache's info (table name and storage level) is returned.
+  private def invalidateCache(
+      r: ResolvedTable,
+      recacheTable: Boolean = false)(): Option[CacheInfo] = {

Review comment:
       Not sure if the `CacheInfo` class is really needed, since only the 
returned storage level is used here. `Option[StorageLevel]` should work too.

##########
File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/RenameTableExec.scala
##########
@@ -27,14 +27,26 @@ import org.apache.spark.sql.connector.catalog.{Identifier, 
TableCatalog}
 case class RenameTableExec(
     catalog: TableCatalog,
     oldIdent: Identifier,
-    newIdent: Identifier) extends V2CommandExec {
+    newIdent: Identifier,
+    invalidateCache: () => Option[CacheInfo],
+    cacheTable: (DataSourceV2Relation, CacheInfo) => Unit)
+  extends V2CommandExec {
 
   override def output: Seq[Attribute] = Seq.empty
 
   override protected def run(): Seq[InternalRow] = {
+    import 
org.apache.spark.sql.connector.catalog.CatalogV2Implicits.IdentifierHelper
+
+    val optOldCacheInfo = invalidateCache()
     catalog.invalidateTable(oldIdent)
+
     catalog.renameTable(oldIdent, newIdent)
 
+    optOldCacheInfo.foreach { cacheInfo =>
+      val tbl = catalog.loadTable(newIdent)
+      val newRelation = DataSourceV2Relation.create(tbl, Some(catalog), 
Some(newIdent))
+      cacheTable(newRelation, CacheInfo(Some(newIdent.quoted), 
cacheInfo.storageLevel))

Review comment:
       It's unfortunate that we have to recache the table just for renaming, 
but I don't know if there is a way to reuse the old in-memory relation.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [spark] sunchao commented on a change in pull request #30825: [SPARK-33829][SQL] Renaming v2 tables should recreate the cache

Reply via email to