Github user viirya commented on a diff in the pull request:
https://github.com/apache/spark/pull/22357#discussion_r216255549
--- Diff:
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruningSuite.scala
---
@@ -155,6 +161,47 @@ class ParquetSchemaPruningSuite
Row(null) :: Row(null) :: Nil)
}
+ testSchemaPruning("select a single complex field and in where clause") {
+ val query1 = sql("select name.first from contacts where name.first =
'Jane'")
+ checkScan(query1, "struct<name:struct<first:string>>")
+ checkAnswer(query1, Row("Jane") :: Nil)
+
+ val query2 = sql("select name.first, name.last from contacts where
name.first = 'Jane'")
+ checkScan(query2, "struct<name:struct<first:string,last:string>>")
+ checkAnswer(query2, Row("Jane", "Doe") :: Nil)
+
+ val query3 = sql("select name.first from contacts " +
+ "where employer.company.name = 'abc' and p = 1")
--- End diff --
When the query contains a nested field access such as
`employer.company.name`, we don't need any fields inside
`employer.company` other than `name`.
But if there is no such access, and the where clause contains only
`employer.company is not null`, it will read the full schema of `employer.company`.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]