Author: hashutosh
Date: Fri Dec 19 00:30:41 2014
New Revision: 1646596

URL: http://svn.apache.org/r1646596
Log:
HIVE-9106 : improve the performance of null scan optimizer when several table 
scans share a physical path (Pengcheng Xiong via Ashutosh Chauhan)

Modified:
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/NullScanTaskDispatcher.java
    hive/trunk/ql/src/test/results/clientpositive/metadataonly1.q.out
    hive/trunk/ql/src/test/results/clientpositive/optimize_nullscan.q.out
    hive/trunk/ql/src/test/results/clientpositive/tez/metadataonly1.q.out
    hive/trunk/ql/src/test/results/clientpositive/tez/optimize_nullscan.q.out

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/NullScanTaskDispatcher.java
URL: 
http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/NullScanTaskDispatcher.java?rev=1646596&r1=1646595&r2=1646596&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/NullScanTaskDispatcher.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/NullScanTaskDispatcher.java Fri Dec 19 00:30:41 2014
@@ -102,47 +102,51 @@ public class NullScanTaskDispatcher impl
 
     return paths;
   }
-
-  private void processAlias(MapWork work, ArrayList<String> aliases, String path) {
- 
-    work.setUseOneNullRowInputFormat(true);
-    for (String alias : aliases) {
-      // Change the conf for tableScanOp
-      TableScanOperator tso = (TableScanOperator) work.getAliasToWork().get(alias);
-      tso.getConf().setIsMetadataOnly(true);
-      // Change the alias partition desc
-      PartitionDesc aliasPartn = work.getAliasToPartnInfo().get(alias);
-      changePartitionToMetadataOnly(aliasPartn);
+  
+  private void processAlias(MapWork work, String path, ArrayList<String> aliasesAffected,
+      ArrayList<String> aliases) {
+    // the aliases that are allowed to map to a null scan.
+    ArrayList<String> allowed = new ArrayList<String>();
+    for (String alias : aliasesAffected) {
+      if (aliases.contains(alias)) {
+        allowed.add(alias);
+      }
+    }
+    if (allowed.size() > 0) {
+      work.setUseOneNullRowInputFormat(true);
+      PartitionDesc partDesc = work.getPathToPartitionInfo().get(path).clone();
+      PartitionDesc newPartition = changePartitionToMetadataOnly(partDesc);
+      Path fakePath = new Path(physicalContext.getContext().getMRTmpPath()
+          + newPartition.getTableName() + encode(newPartition.getPartSpec()));
+      work.getPathToPartitionInfo().put(fakePath.getName(), newPartition);
+      work.getPathToAliases().put(fakePath.getName(), new ArrayList<String>(allowed));
+      aliasesAffected.removeAll(allowed);
+      if (aliasesAffected.isEmpty()) {
+        work.getPathToAliases().remove(path);
+        work.getPathToPartitionInfo().remove(path);
+      }
     }
-
-    PartitionDesc partDesc = work.getPathToPartitionInfo().get(path);
-    PartitionDesc newPartition = changePartitionToMetadataOnly(partDesc);
-    Path fakePath = new Path(physicalContext.getContext().getMRTmpPath()
-        + newPartition.getTableName() + encode(newPartition.getPartSpec()));
-    work.getPathToPartitionInfo().remove(path);
-    work.getPathToPartitionInfo().put(fakePath.getName(), newPartition);
-    assert(work.getPathToAliases().remove(path).equals(aliases));
-    work.getPathToAliases().put(fakePath.getName(), aliases);
   }
 
   private void processAlias(MapWork work, HashSet<TableScanOperator> tableScans) {
-    ArrayList<String> aliasList = new ArrayList<String>();
+    ArrayList<String> aliases = new ArrayList<String>();
     for (TableScanOperator tso : tableScans) {
       // use LinkedHashMap<String, Operator<? extends OperatorDesc>>
       // getAliasToWork()
       String alias = getAliasForTableScanOperator(work, tso);
-      aliasList.add(alias);
+      aliases.add(alias);
+      tso.getConf().setIsMetadataOnly(true);
     }
     // group path alias according to work
     LinkedHashMap<String, ArrayList<String>> candidates = new LinkedHashMap<String, ArrayList<String>>();
     for (String path : work.getPaths()) {
-      ArrayList<String> aliases = work.getPathToAliases().get(path);
-      if (aliases != null && aliasList.containsAll(aliases)) {
-        candidates.put(path, aliases);
+      ArrayList<String> aliasesAffected = work.getPathToAliases().get(path);
+      if (aliasesAffected != null && aliasesAffected.size() > 0) {
+        candidates.put(path, aliasesAffected);
       }
     }
     for (Entry<String, ArrayList<String>> entry : candidates.entrySet()) {
-      processAlias(work, entry.getValue(), entry.getKey());
+      processAlias(work, entry.getKey(), entry.getValue(), aliases);
     }
   }
 

Modified: hive/trunk/ql/src/test/results/clientpositive/metadataonly1.q.out
URL: 
http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/metadataonly1.q.out?rev=1646596&r1=1646595&r2=1646596&view=diff
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/metadataonly1.q.out (original)
+++ hive/trunk/ql/src/test/results/clientpositive/metadataonly1.q.out Fri Dec 19 00:30:41 2014
@@ -146,7 +146,6 @@ STAGE PLANS:
       Path -> Partition:
         -mr-10003default.test1{ds=1} 
           Partition
-            base file name: ds=1
             input format: org.apache.hadoop.hive.ql.io.OneNullRowInputFormat
             output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
             partition values:
@@ -292,7 +291,6 @@ STAGE PLANS:
       Path -> Partition:
         -mr-10003default.test1{ds=1} 
           Partition
-            base file name: ds=1
             input format: org.apache.hadoop.hive.ql.io.OneNullRowInputFormat
             output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
             partition values:
@@ -621,7 +619,6 @@ STAGE PLANS:
       Path -> Partition:
         -mr-10005default.test1{ds=1} 
           Partition
-            base file name: ds=1
             input format: org.apache.hadoop.hive.ql.io.OneNullRowInputFormat
             output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
             partition values:
@@ -661,7 +658,6 @@ STAGE PLANS:
             name: default.test1
         -mr-10006default.test1{ds=2} 
           Partition
-            base file name: ds=2
             input format: org.apache.hadoop.hive.ql.io.OneNullRowInputFormat
             output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
             partition values:
@@ -1079,7 +1075,6 @@ STAGE PLANS:
       Path -> Partition:
         -mr-10003default.test2{ds=1, hr=1} 
           Partition
-            base file name: hr=1
             input format: org.apache.hadoop.hive.ql.io.OneNullRowInputFormat
             output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
             partition values:
@@ -1120,7 +1115,6 @@ STAGE PLANS:
             name: default.test2
         -mr-10004default.test2{ds=1, hr=2} 
           Partition
-            base file name: hr=2
             input format: org.apache.hadoop.hive.ql.io.OneNullRowInputFormat
             output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
             partition values:
@@ -1161,7 +1155,6 @@ STAGE PLANS:
             name: default.test2
         -mr-10005default.test2{ds=1, hr=3} 
           Partition
-            base file name: hr=3
             input format: org.apache.hadoop.hive.ql.io.OneNullRowInputFormat
             output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
             partition values:
@@ -1558,7 +1551,6 @@ STAGE PLANS:
       Path -> Partition:
         -mr-10003default.test1{ds=1} 
           Partition
-            base file name: ds=1
             input format: org.apache.hadoop.hive.ql.io.OneNullRowInputFormat
             output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
             partition values:
@@ -1598,7 +1590,6 @@ STAGE PLANS:
             name: default.test1
         -mr-10004default.test1{ds=2} 
           Partition
-            base file name: ds=2
             input format: org.apache.hadoop.hive.ql.io.OneNullRowInputFormat
             output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
             partition values:
@@ -1809,7 +1800,6 @@ STAGE PLANS:
       Path -> Partition:
         -mr-10003default.test2{ds=01_10_10, hr=01} 
           Partition
-            base file name: hr=01
             input format: org.apache.hadoop.hive.ql.io.OneNullRowInputFormat
             output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
             partition values:
@@ -1850,7 +1840,6 @@ STAGE PLANS:
             name: default.test2
         -mr-10004default.test2{ds=01_10_20, hr=02} 
           Partition
-            base file name: hr=02
             input format: org.apache.hadoop.hive.ql.io.OneNullRowInputFormat
             output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
             partition values:
@@ -1891,7 +1880,6 @@ STAGE PLANS:
             name: default.test2
         -mr-10005default.test2{ds=1, hr=1} 
           Partition
-            base file name: hr=1
             input format: org.apache.hadoop.hive.ql.io.OneNullRowInputFormat
             output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
             partition values:
@@ -1932,7 +1920,6 @@ STAGE PLANS:
             name: default.test2
         -mr-10006default.test2{ds=1, hr=2} 
           Partition
-            base file name: hr=2
             input format: org.apache.hadoop.hive.ql.io.OneNullRowInputFormat
             output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
             partition values:
@@ -1973,7 +1960,6 @@ STAGE PLANS:
             name: default.test2
         -mr-10007default.test2{ds=1, hr=3} 
           Partition
-            base file name: hr=3
             input format: org.apache.hadoop.hive.ql.io.OneNullRowInputFormat
             output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
             partition values:

Modified: hive/trunk/ql/src/test/results/clientpositive/optimize_nullscan.q.out
URL: 
http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/optimize_nullscan.q.out?rev=1646596&r1=1646595&r2=1646596&view=diff
==============================================================================
Binary files hive/trunk/ql/src/test/results/clientpositive/optimize_nullscan.q.out (original) and hive/trunk/ql/src/test/results/clientpositive/optimize_nullscan.q.out Fri Dec 19 00:30:41 2014 differ

Modified: hive/trunk/ql/src/test/results/clientpositive/tez/metadataonly1.q.out
URL: 
http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/tez/metadataonly1.q.out?rev=1646596&r1=1646595&r2=1646596&view=diff
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/tez/metadataonly1.q.out (original)
+++ hive/trunk/ql/src/test/results/clientpositive/tez/metadataonly1.q.out Fri Dec 19 00:30:41 2014
@@ -157,7 +157,6 @@ STAGE PLANS:
             Path -> Partition:
               -mr-10003default.test1{ds=1} 
                 Partition
-                  base file name: ds=1
                   input format: 
org.apache.hadoop.hive.ql.io.OneNullRowInputFormat
                   output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                   partition values:
@@ -309,7 +308,6 @@ STAGE PLANS:
             Path -> Partition:
               -mr-10003default.test1{ds=1} 
                 Partition
-                  base file name: ds=1
                   input format: 
org.apache.hadoop.hive.ql.io.OneNullRowInputFormat
                   output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                   partition values:
@@ -1041,7 +1039,6 @@ STAGE PLANS:
             Path -> Partition:
               -mr-10003default.test2{ds=1, hr=1} 
                 Partition
-                  base file name: hr=1
                   input format: 
org.apache.hadoop.hive.ql.io.OneNullRowInputFormat
                   output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                   partition values:
@@ -1082,7 +1079,6 @@ STAGE PLANS:
                   name: default.test2
               -mr-10004default.test2{ds=1, hr=2} 
                 Partition
-                  base file name: hr=2
                   input format: 
org.apache.hadoop.hive.ql.io.OneNullRowInputFormat
                   output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                   partition values:
@@ -1123,7 +1119,6 @@ STAGE PLANS:
                   name: default.test2
               -mr-10005default.test2{ds=1, hr=3} 
                 Partition
-                  base file name: hr=3
                   input format: 
org.apache.hadoop.hive.ql.io.OneNullRowInputFormat
                   output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                   partition values:
@@ -1532,7 +1527,6 @@ STAGE PLANS:
             Path -> Partition:
               -mr-10003default.test1{ds=1} 
                 Partition
-                  base file name: ds=1
                   input format: 
org.apache.hadoop.hive.ql.io.OneNullRowInputFormat
                   output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                   partition values:
@@ -1572,7 +1566,6 @@ STAGE PLANS:
                   name: default.test1
               -mr-10004default.test1{ds=2} 
                 Partition
-                  base file name: ds=2
                   input format: 
org.apache.hadoop.hive.ql.io.OneNullRowInputFormat
                   output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                   partition values:
@@ -1789,7 +1782,6 @@ STAGE PLANS:
             Path -> Partition:
               -mr-10003default.test2{ds=01_10_10, hr=01} 
                 Partition
-                  base file name: hr=01
                   input format: 
org.apache.hadoop.hive.ql.io.OneNullRowInputFormat
                   output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                   partition values:
@@ -1830,7 +1822,6 @@ STAGE PLANS:
                   name: default.test2
               -mr-10004default.test2{ds=01_10_20, hr=02} 
                 Partition
-                  base file name: hr=02
                   input format: 
org.apache.hadoop.hive.ql.io.OneNullRowInputFormat
                   output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                   partition values:
@@ -1871,7 +1862,6 @@ STAGE PLANS:
                   name: default.test2
               -mr-10005default.test2{ds=1, hr=1} 
                 Partition
-                  base file name: hr=1
                   input format: 
org.apache.hadoop.hive.ql.io.OneNullRowInputFormat
                   output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                   partition values:
@@ -1912,7 +1902,6 @@ STAGE PLANS:
                   name: default.test2
               -mr-10006default.test2{ds=1, hr=2} 
                 Partition
-                  base file name: hr=2
                   input format: 
org.apache.hadoop.hive.ql.io.OneNullRowInputFormat
                   output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                   partition values:
@@ -1953,7 +1942,6 @@ STAGE PLANS:
                   name: default.test2
               -mr-10007default.test2{ds=1, hr=3} 
                 Partition
-                  base file name: hr=3
                   input format: 
org.apache.hadoop.hive.ql.io.OneNullRowInputFormat
                   output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                   partition values:

Modified: hive/trunk/ql/src/test/results/clientpositive/tez/optimize_nullscan.q.out
URL: 
http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/tez/optimize_nullscan.q.out?rev=1646596&r1=1646595&r2=1646596&view=diff
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/tez/optimize_nullscan.q.out (original)
+++ hive/trunk/ql/src/test/results/clientpositive/tez/optimize_nullscan.q.out Fri Dec 19 00:30:41 2014
@@ -254,7 +254,6 @@ STAGE PLANS:
             Path -> Partition:
               -mr-10002default.src{} 
                 Partition
-                  base file name: src
                   input format: 
org.apache.hadoop.hive.ql.io.OneNullRowInputFormat
                   output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                   properties:
@@ -325,7 +324,6 @@ STAGE PLANS:
             Path -> Partition:
               -mr-10003default.srcpart{ds=2008-04-08, hr=11} 
                 Partition
-                  base file name: hr=11
                   input format: 
org.apache.hadoop.hive.ql.io.OneNullRowInputFormat
                   output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                   partition values:
@@ -371,7 +369,6 @@ STAGE PLANS:
                   name: default.srcpart
               -mr-10004default.srcpart{ds=2008-04-08, hr=12} 
                 Partition
-                  base file name: hr=12
                   input format: 
org.apache.hadoop.hive.ql.io.OneNullRowInputFormat
                   output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                   partition values:
@@ -417,7 +414,6 @@ STAGE PLANS:
                   name: default.srcpart
               -mr-10005default.srcpart{ds=2008-04-09, hr=11} 
                 Partition
-                  base file name: hr=11
                   input format: 
org.apache.hadoop.hive.ql.io.OneNullRowInputFormat
                   output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                   partition values:
@@ -463,7 +459,6 @@ STAGE PLANS:
                   name: default.srcpart
               -mr-10006default.srcpart{ds=2008-04-09, hr=12} 
                 Partition
-                  base file name: hr=12
                   input format: 
org.apache.hadoop.hive.ql.io.OneNullRowInputFormat
                   output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                   partition values:
@@ -683,7 +678,6 @@ STAGE PLANS:
             Path -> Partition:
               -mr-10002default.src{} 
                 Partition
-                  base file name: src
                   input format: 
org.apache.hadoop.hive.ql.io.OneNullRowInputFormat
                   output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                   properties:
@@ -1121,7 +1115,6 @@ STAGE PLANS:
             Path -> Partition:
               -mr-10002default.src{} 
                 Partition
-                  base file name: src
                   input format: 
org.apache.hadoop.hive.ql.io.OneNullRowInputFormat
                   output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                   properties:
@@ -1192,7 +1185,6 @@ STAGE PLANS:
             Path -> Partition:
               -mr-10003default.srcpart{ds=2008-04-08, hr=11} 
                 Partition
-                  base file name: hr=11
                   input format: 
org.apache.hadoop.hive.ql.io.OneNullRowInputFormat
                   output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                   partition values:
@@ -1238,7 +1230,6 @@ STAGE PLANS:
                   name: default.srcpart
               -mr-10004default.srcpart{ds=2008-04-08, hr=12} 
                 Partition
-                  base file name: hr=12
                   input format: 
org.apache.hadoop.hive.ql.io.OneNullRowInputFormat
                   output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                   partition values:
@@ -1284,7 +1275,6 @@ STAGE PLANS:
                   name: default.srcpart
               -mr-10005default.srcpart{ds=2008-04-09, hr=11} 
                 Partition
-                  base file name: hr=11
                   input format: 
org.apache.hadoop.hive.ql.io.OneNullRowInputFormat
                   output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                   partition values:
@@ -1330,7 +1320,6 @@ STAGE PLANS:
                   name: default.srcpart
               -mr-10006default.srcpart{ds=2008-04-09, hr=12} 
                 Partition
-                  base file name: hr=12
                   input format: 
org.apache.hadoop.hive.ql.io.OneNullRowInputFormat
                   output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                   partition values:
@@ -1640,7 +1629,6 @@ STAGE PLANS:
             Path -> Partition:
               -mr-10002default.src{} 
                 Partition
-                  base file name: src
                   input format: 
org.apache.hadoop.hive.ql.io.OneNullRowInputFormat
                   output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                   properties:
@@ -1812,7 +1800,6 @@ STAGE PLANS:
             Path -> Partition:
               -mr-10002default.src{} 
                 Partition
-                  base file name: src
                   input format: 
org.apache.hadoop.hive.ql.io.OneNullRowInputFormat
                   output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                   properties:
@@ -1879,7 +1866,6 @@ STAGE PLANS:
             Path -> Partition:
               -mr-10003default.src{} 
                 Partition
-                  base file name: src
                   input format: 
org.apache.hadoop.hive.ql.io.OneNullRowInputFormat
                   output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                   properties:


Reply via email to