wangyum commented on a change in pull request #24715: [SPARK-25474][SQL] Data source tables support fallback to HDFS for size estimation URL: https://github.com/apache/spark/pull/24715#discussion_r298788939
########## File path: sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala ########## @@ -512,37 +512,46 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { SQLConf.DEFAULT_DATA_SOURCE_NAME.key -> "orc", SQLConf.USE_V1_SOURCE_READER_LIST.key -> useV1SourceReaderList) { withTempPath { workDir => - withTable("table1") { - val workDirPath = workDir.getAbsolutePath - val data = Seq(100, 200, 300, 400).toDF("count") - data.write.orc(workDirPath) - val dfFromFile = spark.read.orc(workDirPath).cache() - val inMemoryRelation = dfFromFile.queryExecution.optimizedPlan.collect { - case plan: InMemoryRelation => plan - }.head - // InMemoryRelation's stats is file size before the underlying RDD is materialized - assert(inMemoryRelation.computeStats().sizeInBytes === 486) - - // InMemoryRelation's stats is updated after materializing RDD - dfFromFile.collect() - assert(inMemoryRelation.computeStats().sizeInBytes === 16) - - // test of catalog table - val dfFromTable = spark.catalog.createTable("table1", workDirPath).cache() - val inMemoryRelation2 = dfFromTable.queryExecution.optimizedPlan. 
- collect { case plan: InMemoryRelation => plan }.head - - // Even CBO enabled, InMemoryRelation's stats keeps as the file size before table's - // stats is calculated - assert(inMemoryRelation2.computeStats().sizeInBytes === 486) Review comment: Hive table also has this issue: ```scala import org.apache.spark.sql.execution.columnar.InMemoryRelation val tempDir = "/tmp/spark/spark_25474" spark.range(10).write.mode("overwrite").parquet(tempDir) spark.sql(s"CREATE TABLE spark_25474 (id BIGINT) STORED AS parquet LOCATION '$tempDir'") spark.sql("DESC FORMATTED spark_25474").show(false) val cachedDf = spark.table("spark_25474").cache() val optimizedPlan = cachedDf.queryExecution.optimizedPlan val inMemoryRelation = optimizedPlan.collect { case plan: InMemoryRelation => plan }.head println(inMemoryRelation.computeStats().sizeInBytes) ``` ``` scala> println(inMemoryRelation.computeStats().sizeInBytes) 9223372036854775807 ``` ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: users@infra.apache.org With regards, Apache Git Services --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org For additional commands, e-mail: reviews-help@spark.apache.org