Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/19864#discussion_r157141961 --- Diff: sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala --- @@ -479,4 +485,35 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { } } } + + test("SPARK-22673: InMemoryRelation should utilize existing stats of the plan to be cached") { + withSQLConf("spark.sql.cbo.enabled" -> "true") { + val workDir = s"${Utils.createTempDir()}/table1" + val data = Seq(100, 200, 300, 400).toDF("count") + data.write.parquet(workDir) + val dfFromFile = spark.read.parquet(workDir).cache() + val inMemoryRelation = dfFromFile.queryExecution.optimizedPlan.collect { + case plan: InMemoryRelation => plan + }.head + // InMemoryRelation's stats is file size before the underlying RDD is materialized + assert(inMemoryRelation.computeStats().sizeInBytes === 740) + + // InMemoryRelation's stats is updated after materializing RDD + dfFromFile.collect() + assert(inMemoryRelation.computeStats().sizeInBytes === 16) + + // test of catalog table + val dfFromTable = spark.catalog.createTable("table1", workDir).cache() + val inMemoryRelation2 = dfFromTable.queryExecution.optimizedPlan. + collect { case plan: InMemoryRelation => plan }.head + + // Even CBO enabled, InMemoryRelation's stats keeps as the file size before table's stats + // is calculated + assert(inMemoryRelation2.computeStats().sizeInBytes === 740) + + // InMemoryRelation's stats should be updated after calculating stats of the table + spark.sql("ANALYZE TABLE table1 COMPUTE STATISTICS") + assert(inMemoryRelation2.computeStats().sizeInBytes === 16) --- End diff -- What happened here? Does `InMemoryRelation.statsOfPlanToCache` get updated automatically?
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org