hive git commit: HIVE-12712: HiveInputFormat may fail to set column names to read in some cases (Prasanth Jayachandran reviewed by Sergey Shelukhin)

prasanthj Mon, 21 Dec 2015 16:02:59 -0800

Repository: hive
Updated Branches:
  refs/heads/master c65651930 -> a60fec2eb



HIVE-12712: HiveInputFormat may fail to set column names to read in some cases 
(Prasanth Jayachandran reviewed by Sergey Shelukhin)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/a60fec2e
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/a60fec2e
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/a60fec2e

Branch: refs/heads/master
Commit: a60fec2ebfad276b21f801f4c0b025cfc069f5dd
Parents: c656519
Author: Prasanth Jayachandran <[email protected]>
Authored: Mon Dec 21 18:02:30 2015 -0600
Committer: Prasanth Jayachandran <[email protected]>
Committed: Mon Dec 21 18:02:30 2015 -0600

----------------------------------------------------------------------
 .../hadoop/hive/ql/io/HiveInputFormat.java      | 49 ++++++++++++++++----
 .../hadoop/hive/ql/io/orc/OrcInputFormat.java   | 15 ++++--
 2 files changed, 50 insertions(+), 14 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/a60fec2e/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java 
b/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java
index 2607d9c..1f262d0 100755
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java
@@ -424,8 +424,7 @@ public class HiveInputFormat<K extends WritableComparable, 
V extends Writable>
       TableDesc table = part.getTableDesc();
       TableScanOperator tableScan = null;
 
-      List<String> aliases =
-          mrwork.getPathToAliases().get(dir.toUri().toString());
+      List<String> aliases = mrwork.getPathToAliases().get(dir.toString());
 
       // Make filter pushdown information available to getSplits.
       if ((aliases != null) && (aliases.size() == 1)) {
@@ -442,6 +441,10 @@ public class HiveInputFormat<K extends WritableComparable, 
V extends Writable>
           // push down filters
           pushFilters(newjob, tableScan);
         }
+      } else {
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("aliases: {} pathToAliases: {} dir: {}", aliases, 
mrwork.getPathToAliases(), dir);
+        }
       }
 
       if (!currentDirs.isEmpty() &&
@@ -453,7 +456,15 @@ public class HiveInputFormat<K extends WritableComparable, 
V extends Writable>
       }
 
       if (!currentDirs.isEmpty()) {
-        LOG.info("Generating splits");
+        if (LOG.isInfoEnabled()) {
+          LOG.info("Generating splits as currentDirs is not empty. 
currentDirs: {}", currentDirs);
+        }
+
+        // set columns to read in conf
+        if (pushDownProjection) {
+          pushProjection(newjob, readColumnsBuffer, readColumnNamesBuffer);
+        }
+
         addSplitsForGroup(currentDirs, currentTableScan, newjob,
             getInputFormatFromCache(currentInputFormatClass, job),
             currentInputFormatClass, currentDirs.size()*(numSplits / 
dirs.length),
@@ -466,16 +477,16 @@ public class HiveInputFormat<K extends 
WritableComparable, V extends Writable>
       currentTable = table;
       currentInputFormatClass = inputFormatClass;
     }
+
+    // set columns to read in conf
     if (pushDownProjection) {
-      newjob.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
-      newjob.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, 
readColumnsBuffer.toString());
-      newjob.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, 
readColumnNamesBuffer.toString());
-      LOG.info(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR + "=" + 
readColumnsBuffer.toString());
-      LOG.info(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR + "=" + 
readColumnNamesBuffer.toString());
+      pushProjection(newjob, readColumnsBuffer, readColumnNamesBuffer);
     }
 
     if (dirs.length != 0) {
-      LOG.info("Generating splits");
+      if (LOG.isInfoEnabled()) {
+        LOG.info("Generating splits for dirs: {}", dirs);
+      }
       addSplitsForGroup(currentDirs, currentTableScan, newjob,
           getInputFormatFromCache(currentInputFormatClass, job),
           currentInputFormatClass, currentDirs.size()*(numSplits / 
dirs.length),
@@ -483,11 +494,29 @@ public class HiveInputFormat<K extends 
WritableComparable, V extends Writable>
     }
 
     Utilities.clearWorkMapForConf(job);
-    LOG.info("number of splits " + result.size());
+    if (LOG.isInfoEnabled()) {
+      LOG.info("number of splits " + result.size());
+    }
     perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
     return result.toArray(new HiveInputSplit[result.size()]);
   }
 
+  private void pushProjection(final JobConf newjob, final StringBuilder 
readColumnsBuffer,
+      final StringBuilder readColumnNamesBuffer) {
+    String readColIds = readColumnsBuffer.toString();
+    String readColNames = readColumnNamesBuffer.toString();
+    boolean readAllColumns = readColIds.isEmpty() ? true : false;
+    newjob.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, readAllColumns);
+    newjob.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, readColIds);
+    newjob.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, readColNames);
+
+    if (LOG.isInfoEnabled()) {
+      LOG.info("{} = {}", ColumnProjectionUtils.READ_ALL_COLUMNS, 
readAllColumns);
+      LOG.info("{} = {}", ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, 
readColIds);
+      LOG.info("{} = {}", ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, 
readColNames);
+    }
+  }
+
   protected static PartitionDesc getPartitionDescFromPath(
       Map<String, PartitionDesc> pathToPartitionInfo, Path dir)
       throws IOException {

http://git-wip-us.apache.org/repos/asf/hive/blob/a60fec2e/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java 
b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
index b3b48d5..359cbf7 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
@@ -414,7 +414,11 @@ public class OrcInputFormat implements 
InputFormat<NullWritable, OrcStruct>,
 
   private static String[] extractNeededColNames(
       List<OrcProto.Type> types, Configuration conf, boolean[] include, 
boolean isOriginal) {
-    return extractNeededColNames(types, getNeededColumnNamesString(conf), 
include, isOriginal);
+    String colNames = getNeededColumnNamesString(conf);
+    if (colNames == null) {
+      return null;
+    }
+    return extractNeededColNames(types, colNames, include, isOriginal);
   }
 
   private static String[] extractNeededColNames(
@@ -1068,10 +1072,13 @@ public class OrcInputFormat implements 
InputFormat<NullWritable, OrcStruct>,
       // we can't eliminate stripes if there are deltas because the
       // deltas may change the rows making them match the predicate.
       if ((deltas == null || deltas.isEmpty()) && context.sarg != null) {
-        SearchArgument sarg = 
ConvertAstToSearchArg.createFromConf(context.conf);
         String[] colNames = extractNeededColNames(types, context.conf, 
includedCols, isOriginal);
-        includeStripe = pickStripes(context.sarg, colNames, writerVersion, 
isOriginal,
-            stripeStats, stripes.size(), file.getPath());
+        if (colNames == null) {
+          LOG.warn("Skipping split elimination for {} as column names is 
null", file.getPath());
+        } else {
+          includeStripe = pickStripes(context.sarg, colNames, writerVersion, 
isOriginal,
+              stripeStats, stripes.size(), file.getPath());
+        }
       }
 
       // if we didn't have predicate pushdown, read everything

hive git commit: HIVE-12712: HiveInputFormat may fail to set column names to read in some cases (Prasanth Jayachandran reviewed by Sergey Shelukhin)

Reply via email to