Github user CodingCat commented on a diff in the pull request:
https://github.com/apache/spark/pull/19864#discussion_r157653784
--- Diff:
sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala
---
@@ -479,4 +485,43 @@ class InMemoryColumnarQuerySuite extends QueryTest
with SharedSQLContext {
}
}
}
+
+ test("SPARK-22673: InMemoryRelation should utilize existing stats of the
plan to be cached") {
+ withSQLConf("spark.sql.cbo.enabled" -> "true") {
+ withTempPath { workDir =>
+ withTable("table1") {
+ val workDirPath = workDir.getAbsolutePath
+ val data = Seq(100, 200, 300, 400).toDF("count")
+ data.write.parquet(workDirPath)
+ val dfFromFile = spark.read.parquet(workDirPath).cache()
+ val inMemoryRelation =
dfFromFile.queryExecution.optimizedPlan.collect {
+ case plan: InMemoryRelation => plan
+ }.head
+ // InMemoryRelation's stats is file size before the underlying
RDD is materialized
+ assert(inMemoryRelation.computeStats().sizeInBytes === 740)
+
+ // InMemoryRelation's stats is updated after materializing RDD
+ dfFromFile.collect()
+ assert(inMemoryRelation.computeStats().sizeInBytes === 16)
+
+ // test of catalog table
+ val dfFromTable = spark.catalog.createTable("table1",
workDirPath).cache()
+ val inMemoryRelation2 = dfFromTable.queryExecution.optimizedPlan.
+ collect { case plan: InMemoryRelation => plan }.head
+
+ // Even CBO enabled, InMemoryRelation's stats keeps as the file
size before table's stats
+ // is calculated
+ assert(inMemoryRelation2.computeStats().sizeInBytes === 740)
+
+ // InMemoryRelation's stats should be updated after calculating
stats of the table
+ // clear cache to simulate a fresh environment
+ dfFromTable.unpersist(blocking = true)
+ spark.sql("ANALYZE TABLE table1 COMPUTE STATISTICS")
+ val inMemoryRelation3 =
spark.read.table("table1").cache().queryExecution.optimizedPlan.
+ collect { case plan: InMemoryRelation => plan }.head
+ assert(inMemoryRelation3.computeStats().sizeInBytes === 48)
--- End diff --
because 16 is the `exact` in-memory size, which is obtained by reading the
accumulator's value after the RDD has been evaluated
48 is the estimate calculated by `EstimationUtils`:
https://github.com/apache/spark/blob/bdb5e55c2a67d16a36ad6baa22296d714d3525af/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala#L78
i.e. `(8 (per-row overhead) + 4 (average attribute length)) * 4 (rows) = 48`
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]