viirya commented on a change in pull request #23943: [SPARK-27034][SQL] Nested 
schema pruning for ORC
URL: https://github.com/apache/spark/pull/23943#discussion_r263250824
 
 

 ##########
 File path: 
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruningSuite.scala
 ##########
 @@ -103,42 +43,13 @@ class ParquetSchemaPruningSuite
       Nil)
   }
 
-  testSchemaPruning("select a single complex field array and its parent struct 
array") {
-    val query = sql("select friends.middle, friends from contacts where p=1")
-    checkScan(query,
-      "struct<friends:array<struct<first:string,middle:string,last:string>>>")
-    checkAnswer(query.orderBy("id"),
-      Row(Array("Z."), Array(Row("Susan", "Z.", "Smith"))) ::
-      Row(Array.empty[String], Array.empty[Row]) ::
-      Nil)
-  }
-
-  testSchemaPruning("select a single complex field from a map entry and its 
parent map entry") {
-    val query =
-      sql("select relatives[\"brother\"].middle, relatives[\"brother\"] from 
contacts where p=1")
-    checkScan(query,
-      
"struct<relatives:map<string,struct<first:string,middle:string,last:string>>>")
-    checkAnswer(query.orderBy("id"),
-      Row("Y.", Row("John", "Y.", "Doe")) ::
-      Row(null, null) ::
-      Nil)
-  }
-
   testSchemaPruning("select a single complex field and the partition column") {
     val query = sql("select name.middle, p from contacts")
     checkScan(query, "struct<name:struct<middle:string>>")
     checkAnswer(query.orderBy("id"),
       Row("X.", 1) :: Row("Y.", 1) :: Row(null, 2) :: Row(null, 2) :: Nil)
 
 Review comment:
   Ah, sorry, it is not due to schema merging.
   
   But the schemas inferred for ORC and for Parquet are different. We can 
verify this on the current master branch like this:
   
   ```scala
   withTempPath { dir =>
     val path = dir.getCanonicalPath
   
     makeDataSourceFile(contacts, new File(path + "/contacts/p=1"))
     makeDataSourceFile(briefContacts, new File(path + "/contacts/p=2"))
   
     spark.read.format(dataSourceName).load(path + 
"/contacts").createOrReplaceTempView("contacts")
     spark.sql("select * from contacts").printSchema()
   }
   ```
   
   When `dataSourceName` is parquet, the schema is:
   ```
   root
    |-- id: integer (nullable = true)
    |-- name: struct (nullable = true)
    |    |-- first: string (nullable = true)
    |    |-- middle: string (nullable = true)
    |    |-- last: string (nullable = true)
    |-- address: string (nullable = true)
    |-- pets: integer (nullable = true)
    |-- friends: array (nullable = true)
    |    |-- element: struct (containsNull = true)
    |    |    |-- first: string (nullable = true)
    |    |    |-- middle: string (nullable = true)
    |    |    |-- last: string (nullable = true)
    |-- relatives: map (nullable = true)
    |    |-- key: string
    |    |-- value: struct (valueContainsNull = true)
    |    |    |-- first: string (nullable = true)
    |    |    |-- middle: string (nullable = true)
    |    |    |-- last: string (nullable = true)
    |-- employer: struct (nullable = true)
    |    |-- id: integer (nullable = true)
    |    |-- company: struct (nullable = true)
    |    |    |-- name: string (nullable = true)
    |    |    |-- address: string (nullable = true)
    |-- p: integer (nullable = true)
   ```
   
   For orc, it is:
   ```
   root
    |-- id: integer (nullable = true)
    |-- name: struct (nullable = true)
    |    |-- first: string (nullable = true)
    |    |-- last: string (nullable = true)
    |-- address: string (nullable = true)
    |-- p: integer (nullable = true)
   ```

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to