[GitHub] [spark] maropu commented on a change in pull request #28556: [SPARK-31736][SQL] Nested column aliasing for RepartitionByExpression/Join

GitBox Thu, 21 May 2020 06:08:10 -0700


maropu commented on a change in pull request #28556:
URL: https://github.com/apache/spark/pull/28556#discussion_r428635965




##########
File path: 
sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasingSuite.scala
##########
@@ -144,7 +144,6 @@ class NestedColumnAliasingSuite extends SchemaPruningTest {
   test("Pushing a single nested field projection - negative") {
     val ops = Seq(
       (input: LogicalPlan) => input.distribute('name)(1),
-      (input: LogicalPlan) => input.distribute($"name.middle")(1),

Review comment:
       Ah, looks nice. This PR could support this case.

##########
File path: 
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala
##########
@@ -338,6 +349,93 @@ abstract class SchemaPruningSuite
     }
   }
 
+  testSchemaPruning("select one deep nested complex field after repartition") {
+    val query = sql("select * from contacts")
+      .repartition(100)
+      .where("employer.company.address is not null")
+      .selectExpr("employer.id as employer_id")
+    checkScan(query,
+      "struct<employer:struct<id:int,company:struct<address:string>>>")
+    checkAnswer(query, Row(0) :: Nil)
+  }
+
+  testSchemaPruning("select one deep nested complex field after repartition by 
expression") {
+    val query1 = sql("select * from contacts")
+      .repartition(100, col("id"))
+      .where("employer.company.address is not null")
+      .selectExpr("employer.id as employer_id")
+    checkScan(query1,
+      "struct<id:int,employer:struct<id:int,company:struct<address:string>>>")
+    checkAnswer(query1, Row(0) :: Nil)
+
+    val query2 = sql("select * from contacts")
+      .repartition(100, col("employer"))
+      .where("employer.company.address is not null")
+      .selectExpr("employer.id as employer_id")
+    checkScan(query2,
+      
"struct<employer:struct<id:int,company:struct<name:string,address:string>>>")
+    checkAnswer(query2, Row(0) :: Nil)
+
+    val query3 = sql("select * from contacts")
+      .repartition(100, col("employer.company"))
+      .where("employer.company.address is not null")
+      .selectExpr("employer.company as employer_company")
+    checkScan(query3,
+      "struct<employer:struct<company:struct<name:string,address:string>>>")
+    checkAnswer(query3, Row(Row("abc", "123 Business Street")) :: Nil)
+
+    val query4 = sql("select * from contacts")
+      .repartition(100, col("employer.company.address"))
+      .where("employer.company.address is not null")
+      .selectExpr("employer.company.address as employer_company_addr")
+    checkScan(query4,
+      "struct<employer:struct<company:struct<address:string>>>")
+    checkAnswer(query4, Row("123 Business Street") :: Nil)
+  }
+
+  testSchemaPruning("select one deep nested complex field after join") {
+    val query1 = sql("select contacts.name.middle from contacts, departments 
where " +
+        "contacts.id = departments.contactId")
+    checkScan(query1,
+      "struct<id:int,name:struct<middle:string>>",
+    "struct<contactId:int>")
+    checkAnswer(query1, Row("X.") :: Row("Y.") :: Nil)
+
+    val query2 = sql("select contacts.name.middle from contacts, departments 
where " +
+      "contacts.employer = departments.employer")
+    checkScan(query2,
+      "struct<name:struct<middle:string>," +
+        "employer:struct<id:int,company:struct<name:string,address:string>>>",
+      
"struct<employer:struct<id:int,company:struct<name:string,address:string>>>")
+    checkAnswer(query2, Row("X.") :: Row("Y.") :: Nil)
+
+    val query3 = sql("select contacts.employer.company.name from contacts, 
departments where " +
+      "contacts.employer = departments.employer")
+    checkScan(query3,
+      
"struct<employer:struct<id:int,company:struct<name:string,address:string>>>",
+      
"struct<employer:struct<id:int,company:struct<name:string,address:string>>>")
+    checkAnswer(query3, Row("abc") :: Row(null) :: Nil)
+  }
+
+  testSchemaPruning("select one deep nested complex field after outer join") {

Review comment:
       Thanks for adding the tests.

##########
File path: 
sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasingSuite.scala
##########
@@ -341,6 +340,90 @@ class NestedColumnAliasingSuite extends SchemaPruningTest {
       .analyze
     comparePlans(optimized, expected)
   }
+
+  test("Nested field pruning through RepartitionByExpression") {
+    val query1 = contact
+      .distribute($"id")(1)
+      .select($"name.middle")
+      .analyze
+    val optimized1 = Optimize.execute(query1)
+
+    val aliases1 = collectGeneratedAliases(optimized1)
+
+    val expected1 = contact
+      .select('id, 'name.getField("middle").as(aliases1(0)))
+      .distribute($"id")(1)
+      .select($"${aliases1(0)}".as("middle"))
+      .analyze
+    comparePlans(optimized1, expected1)
+
+

Review comment:
       nit: unnecessary line break.

##########
File path: 
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala
##########
@@ -338,6 +349,93 @@ abstract class SchemaPruningSuite
     }
   }
 
+  testSchemaPruning("select one deep nested complex field after repartition") {
+    val query = sql("select * from contacts")
+      .repartition(100)
+      .where("employer.company.address is not null")
+      .selectExpr("employer.id as employer_id")
+    checkScan(query,
+      "struct<employer:struct<id:int,company:struct<address:string>>>")
+    checkAnswer(query, Row(0) :: Nil)
+  }
+
+  testSchemaPruning("select one deep nested complex field after repartition by 
expression") {
+    val query1 = sql("select * from contacts")
+      .repartition(100, col("id"))
+      .where("employer.company.address is not null")
+      .selectExpr("employer.id as employer_id")
+    checkScan(query1,
+      "struct<id:int,employer:struct<id:int,company:struct<address:string>>>")
+    checkAnswer(query1, Row(0) :: Nil)
+
+    val query2 = sql("select * from contacts")
+      .repartition(100, col("employer"))
+      .where("employer.company.address is not null")
+      .selectExpr("employer.id as employer_id")
+    checkScan(query2,
+      
"struct<employer:struct<id:int,company:struct<name:string,address:string>>>")
+    checkAnswer(query2, Row(0) :: Nil)
+
+    val query3 = sql("select * from contacts")
+      .repartition(100, col("employer.company"))
+      .where("employer.company.address is not null")
+      .selectExpr("employer.company as employer_company")
+    checkScan(query3,
+      "struct<employer:struct<company:struct<name:string,address:string>>>")
+    checkAnswer(query3, Row(Row("abc", "123 Business Street")) :: Nil)
+
+    val query4 = sql("select * from contacts")
+      .repartition(100, col("employer.company.address"))
+      .where("employer.company.address is not null")
+      .selectExpr("employer.company.address as employer_company_addr")
+    checkScan(query4,
+      "struct<employer:struct<company:struct<address:string>>>")
+    checkAnswer(query4, Row("123 Business Street") :: Nil)
+  }
+
+  testSchemaPruning("select one deep nested complex field after join") {
+    val query1 = sql("select contacts.name.middle from contacts, departments 
where " +
+        "contacts.id = departments.contactId")
+    checkScan(query1,
+      "struct<id:int,name:struct<middle:string>>",
+    "struct<contactId:int>")
+    checkAnswer(query1, Row("X.") :: Row("Y.") :: Nil)
+
+    val query2 = sql("select contacts.name.middle from contacts, departments 
where " +

Review comment:
       nit: I think its better to use uppercases for SQL keywords where 
possible.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [spark] maropu commented on a change in pull request #28556: [SPARK-31736][SQL] Nested column aliasing for RepartitionByExpression/Join

Reply via email to