Repository: hive
Updated Branches:
  refs/heads/master 37fd22e6a -> 34331f3c7
HIVE-17917: VectorizedOrcAcidRowBatchReader.computeOffsetAndBucket optimization (Saurabh Seth via Eugene Koifman)

Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/34331f3c
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/34331f3c
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/34331f3c

Branch: refs/heads/master
Commit: 34331f3c7b69200a0177f5446f1f15c8ed69ee86
Parents: 37fd22e
Author: Saurabh Seth <saurabh.s...@gmail.com>
Authored: Thu Sep 27 19:14:21 2018 -0700
Committer: Eugene Koifman <ekoif...@apache.org>
Committed: Thu Sep 27 19:14:21 2018 -0700

----------------------------------------------------------------------
 .../hadoop/hive/ql/io/orc/OrcInputFormat.java   |  23 ++-
 .../apache/hadoop/hive/ql/io/orc/OrcSplit.java  |  59 +++++++-
 .../io/orc/VectorizedOrcAcidRowBatchReader.java |  69 ++++-----
 .../hive/ql/io/orc/TestInputOutputFormat.java   |   6 +-
 .../acid_vectorization_original.q               |  29 +++-
 .../llap/acid_vectorization_original.q.out      | 146 +++++++++++++++++++
 6 files changed, 285 insertions(+), 47 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/34331f3c/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
index f34f393..728bf50 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
@@ -1036,6 +1036,7 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
     private final Path dir;
     private final boolean allowSyntheticFileIds;
     private final boolean isDefaultFs;
+    private final Configuration conf;
 
     /**
      * @param dir - root of partition dir
@@ -1051,12 +1052,21 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
       this.dir = dir;
       this.allowSyntheticFileIds = allowSyntheticFileIds;
       this.isDefaultFs = isDefaultFs;
+      this.conf = context.conf;
     }
 
     @Override
    public List<OrcSplit> getSplits() throws IOException {
       List<OrcSplit> splits = Lists.newArrayList();
+      boolean isAcid = AcidUtils.isFullAcidScan(conf);
+      boolean vectorMode = Utilities.getIsVectorized(conf);
+      OrcSplit.OffsetAndBucketProperty offsetAndBucket = null;
       for (HdfsFileStatusWithId file : fileStatuses) {
+        if (isOriginal && isAcid && vectorMode) {
+          offsetAndBucket = VectorizedOrcAcidRowBatchReader.computeOffsetAndBucket(file.getFileStatus(), dir,
+              isOriginal, !deltas.isEmpty(), conf);
+        }
+
         FileStatus fileStatus = file.getFileStatus();
         long logicalLen = AcidUtils.getLogicalLength(fs, fileStatus);
         if (logicalLen != 0) {
@@ -1072,7 +1082,7 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
             }
             OrcSplit orcSplit = new OrcSplit(fileStatus.getPath(), fileKey, entry.getKey(),
                 entry.getValue().getLength(), entry.getValue().getHosts(), null, isOriginal, true,
-                deltas, -1, logicalLen, dir);
+                deltas, -1, logicalLen, dir, offsetAndBucket);
             splits.add(orcSplit);
           }
         }
@@ -1352,6 +1362,7 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
     private SchemaEvolution evolution;
     //this is the root of the partition in which the 'file' is located
     private final Path rootDir;
+    OrcSplit.OffsetAndBucketProperty offsetAndBucket = null;
 
     public SplitGenerator(SplitInfo splitInfo, UserGroupInformation ugi,
         boolean allowSyntheticFileIds, boolean isDefaultFs) throws IOException {
@@ -1480,7 +1491,7 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
         fileKey = new SyntheticFileId(file);
       }
       return new OrcSplit(file.getPath(), fileKey, offset, length, hosts,
-          orcTail, isOriginal, hasBase, deltas, scaledProjSize, fileLen, rootDir);
+          orcTail, isOriginal, hasBase, deltas, scaledProjSize, fileLen, rootDir, offsetAndBucket);
     }
 
     private static final class OffsetAndLength { // Java cruft; pair of long.
@@ -1519,6 +1530,14 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
     }
 
     private List<OrcSplit> callInternal() throws IOException {
+      boolean isAcid = AcidUtils.isFullAcidScan(context.conf);
+      boolean vectorMode = Utilities.getIsVectorized(context.conf);
+
+      if (isOriginal && isAcid && vectorMode) {
+        offsetAndBucket = VectorizedOrcAcidRowBatchReader.computeOffsetAndBucket(file, rootDir, isOriginal,
+            !deltas.isEmpty(), context.conf);
+      }
+
       // Figure out which stripes we need to read.
       if (ppdResult != null) {
         assert deltaSplits.isEmpty();
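[Editor's note] Both split strategies above now share the same shape: computeOffsetAndBucket() runs once per original file while splits are generated, and the result rides along inside every OrcSplit cut from that file, instead of being recomputed by each VectorizedOrcAcidRowBatchReader. A minimal, self-contained sketch of that pattern follows; FileMeta, Split and OffsetAndBucket are hypothetical stand-ins for Hive's real classes, not its API.

import java.util.ArrayList;
import java.util.List;

/** Illustrative only: compute per-file synthetic ROW__ID metadata once and
 *  share it across all splits of that file. Types are hypothetical stand-ins. */
final class SplitGenSketch {
  record OffsetAndBucket(long rowIdOffset, int bucketProperty, long writeId) {}
  record FileMeta(String path, long length) {}
  record Split(String path, long start, long length, OffsetAndBucket acidProps) {}

  static List<Split> generateSplits(FileMeta file, long blockSize,
      boolean isOriginal, boolean isFullAcid, boolean vectorized) {
    // Computed once per file, not once per split: every split of the same
    // original file shares the same rowIdOffset/bucketProperty/writeId.
    OffsetAndBucket props = null;
    if (isOriginal && isFullAcid && vectorized) {
      props = computeOffsetAndBucket(file); // potentially reads sibling file footers
    }
    List<Split> splits = new ArrayList<>();
    for (long off = 0; off < file.length(); off += blockSize) {
      splits.add(new Split(file.path(), off,
          Math.min(blockSize, file.length() - off), props));
    }
    return splits;
  }

  private static OffsetAndBucket computeOffsetAndBucket(FileMeta file) {
    return new OffsetAndBucket(0, 536870912, 0); // placeholder result
  }
}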

http://git-wip-us.apache.org/repos/asf/hive/blob/34331f3c/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java
index bce7977..4d55592 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java
@@ -65,6 +65,12 @@ public class OrcSplit extends FileSplit implements ColumnarSplit, LlapAwareSplit
   private transient Object fileKey;
   private long fileLen;
 
+  /**
+   * This contains the synthetic ROW__ID offset and bucket properties for original file splits in an ACID table.
+   */
+  private OffsetAndBucketProperty syntheticAcidProps;
+
+  static final int HAS_SYNTHETIC_ACID_PROPS_FLAG = 32;
   static final int HAS_SYNTHETIC_FILEID_FLAG = 16;
   static final int HAS_LONG_FILEID_FLAG = 8;
   static final int BASE_FLAG = 4;
@@ -80,7 +86,8 @@ public class OrcSplit extends FileSplit implements ColumnarSplit, LlapAwareSplit
 
   public OrcSplit(Path path, Object fileId, long offset, long length, String[] hosts,
       OrcTail orcTail, boolean isOriginal, boolean hasBase,
-      List<AcidInputFormat.DeltaMetaData> deltas, long projectedDataSize, long fileLen, Path rootDir) {
+      List<AcidInputFormat.DeltaMetaData> deltas, long projectedDataSize, long fileLen, Path rootDir,
+      OffsetAndBucketProperty syntheticAcidProps) {
     super(path, offset, length, hosts);
     // For HDFS, we could avoid serializing file ID and just replace the path with inode-based
     // path. However, that breaks bunch of stuff because Hive later looks up things by split path.
@@ -94,6 +101,7 @@ public class OrcSplit extends FileSplit implements ColumnarSplit, LlapAwareSplit
     this.projColsUncompressedSize = projectedDataSize <= 0 ? length : projectedDataSize;
     // setting file length to Long.MAX_VALUE will let orc reader read file length from file system
     this.fileLen = fileLen <= 0 ? Long.MAX_VALUE : fileLen;
+    this.syntheticAcidProps = syntheticAcidProps;
   }
 
   @Override
@@ -121,7 +129,8 @@ public class OrcSplit extends FileSplit implements ColumnarSplit, LlapAwareSplit
         (isOriginal ? ORIGINAL_FLAG : 0) |
         (hasFooter ? FOOTER_FLAG : 0) |
         (isFileIdLong ? HAS_LONG_FILEID_FLAG : 0) |
-        (isFileIdWritable ? HAS_SYNTHETIC_FILEID_FLAG : 0);
+        (isFileIdWritable ? HAS_SYNTHETIC_FILEID_FLAG : 0) |
+        (syntheticAcidProps != null? HAS_SYNTHETIC_ACID_PROPS_FLAG : 0);
     out.writeByte(flags);
     out.writeInt(deltas.size());
     for(AcidInputFormat.DeltaMetaData delta: deltas) {
@@ -141,6 +150,11 @@ public class OrcSplit extends FileSplit implements ColumnarSplit, LlapAwareSplit
     }
     out.writeLong(fileLen);
     out.writeUTF(rootDir.toString());
+    if (syntheticAcidProps != null) {
+      out.writeLong(syntheticAcidProps.rowIdOffset);
+      out.writeInt(syntheticAcidProps.bucketProperty);
+      out.writeLong(syntheticAcidProps.syntheticWriteId);
+    }
   }
 
   @Override
@@ -153,7 +167,8 @@ public class OrcSplit extends FileSplit implements ColumnarSplit, LlapAwareSplit
     isOriginal = (ORIGINAL_FLAG & flags) != 0;
     hasBase = (BASE_FLAG & flags) != 0;
     boolean hasLongFileId = (HAS_LONG_FILEID_FLAG & flags) != 0,
-        hasWritableFileId = (HAS_SYNTHETIC_FILEID_FLAG & flags) != 0;
+        hasWritableFileId = (HAS_SYNTHETIC_FILEID_FLAG & flags) != 0,
+        hasSyntheticProps = (HAS_SYNTHETIC_ACID_PROPS_FLAG & flags) != 0;
     if (hasLongFileId && hasWritableFileId) {
       throw new IOException("Invalid split - both file ID types present");
     }
@@ -181,6 +196,14 @@ public class OrcSplit extends FileSplit implements ColumnarSplit, LlapAwareSplit
     }
     fileLen = in.readLong();
     rootDir = new Path(in.readUTF());
+
+    if (hasSyntheticProps) {
+      long rowId = in.readLong();
+      int bucket = in.readInt();
+      long writeId = in.readLong();
+
+      syntheticAcidProps = new OffsetAndBucketProperty(rowId, bucket, writeId);
+    }
   }
 
   public OrcTail getOrcTail() {
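[Editor's note] The serialization above follows the usual Writable pattern for optional fields: a bit in the leading flag byte records whether the optional block was written, so a split serialized without synthetic ACID props (bit unset) still round-trips. A rough sketch of that pattern using plain java.io streams; the class and method names here are illustrative, not Hive's.

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/** Minimal sketch of the flag-byte pattern: optional fields are guarded by a
 *  bit in a leading byte, so readers of older payloads simply see the bit unset. */
final class OptionalFieldSketch {
  static final int HAS_ACID_PROPS = 32; // mirrors HAS_SYNTHETIC_ACID_PROPS_FLAG

  static void write(DataOutput out, Long rowIdOffset, Integer bucket, Long writeId)
      throws IOException {
    int flags = (rowIdOffset != null ? HAS_ACID_PROPS : 0);
    out.writeByte(flags);
    if (rowIdOffset != null) { // only written when the bit is set
      out.writeLong(rowIdOffset);
      out.writeInt(bucket);
      out.writeLong(writeId);
    }
  }

  static long[] read(DataInput in) throws IOException {
    int flags = in.readByte();
    if ((flags & HAS_ACID_PROPS) == 0) {
      return null; // optional block absent; field stays null
    }
    return new long[] { in.readLong(), in.readInt(), in.readLong() };
  }
}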
@@ -235,6 +258,10 @@ public class OrcSplit extends FileSplit implements ColumnarSplit, LlapAwareSplit
     return fileKey;
   }
 
+  public OffsetAndBucketProperty getSyntheticAcidProps() {
+    return syntheticAcidProps;
+  }
+
   @Override
   public long getColumnarProjectionSize() {
     return projColsUncompressedSize;
@@ -276,6 +303,32 @@ public class OrcSplit extends FileSplit implements ColumnarSplit, LlapAwareSplit
     return false;
   }
 
+  /**
+   * Used for generating synthetic ROW__IDs for reading "original" files.
+   */
+  static final class OffsetAndBucketProperty {
+    private final long rowIdOffset;
+    private final int bucketProperty;
+    private final long syntheticWriteId;
+    OffsetAndBucketProperty(long rowIdOffset, int bucketProperty, long syntheticWriteId) {
+      this.rowIdOffset = rowIdOffset;
+      this.bucketProperty = bucketProperty;
+      this.syntheticWriteId = syntheticWriteId;
+    }
+
+    public long getRowIdOffset() {
+      return rowIdOffset;
+    }
+
+    public int getBucketProperty() {
+      return bucketProperty;
+    }
+
+    public long getSyntheticWriteId() {
+      return syntheticWriteId;
+    }
+  }
+
   @Override
   public String toString() {
     return "OrcSplit [" + getPath() + ", start=" + getStart() + ", length=" + getLength()


http://git-wip-us.apache.org/repos/asf/hive/blob/34331f3c/ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcAcidRowBatchReader.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcAcidRowBatchReader.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcAcidRowBatchReader.java
index a6cf263..f16f9b4 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcAcidRowBatchReader.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcAcidRowBatchReader.java
@@ -27,6 +27,7 @@ import java.util.Map.Entry;
 import java.util.TreeMap;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.common.ValidReaderWriteIdList;
@@ -102,7 +103,7 @@ public class VectorizedOrcAcidRowBatchReader
   /**
    * for reading "original" files
    */
-  private final OffsetAndBucketProperty syntheticProps;
+  private final OrcSplit.OffsetAndBucketProperty syntheticProps;
   /**
    * To have access to {@link RecordReader#getRowNumber()} in the underlying file
    */
@@ -247,8 +248,7 @@ public class VectorizedOrcAcidRowBatchReader
     }
     rowIdProjected = areRowIdsProjected(rbCtx);
     rootPath = orcSplit.getRootDir();
-    //why even compute syntheticProps if !isOriginal???
-    syntheticProps = computeOffsetAndBucket(orcSplit, conf, validWriteIdList);
+    syntheticProps = orcSplit.getSyntheticAcidProps();
   }
 
   /**
@@ -500,33 +500,20 @@ public class VectorizedOrcAcidRowBatchReader
   }
 
   /**
-   * Used for generating synthetic ROW__IDs for reading "original" files
-   */
-  private static final class OffsetAndBucketProperty {
-    private final long rowIdOffset;
-    private final int bucketProperty;
-    private final long syntheticWriteId;
-    private OffsetAndBucketProperty(long rowIdOffset, int bucketProperty, long syntheticWriteId) {
-      this.rowIdOffset = rowIdOffset;
-      this.bucketProperty = bucketProperty;
-      this.syntheticWriteId = syntheticWriteId;
-    }
-  }
-  /**
    * See {@link #next(NullWritable, VectorizedRowBatch)} first and
    * {@link OrcRawRecordMerger.OriginalReaderPair}.
    * When reading a split of an "original" file and we need to decorate data with ROW__ID.
    * This requires treating multiple files that are part of the same bucket (tranche for unbucketed
    * tables) as a single logical file to number rowids consistently.
-   *
-   * todo: This logic is executed per split of every "original" file. The computed result is the
-   * same for every split form the same file so this could be optimized by moving it to
-   * before/during split computation and passing the info in the split. (HIVE-17917)
    */
-  private OffsetAndBucketProperty computeOffsetAndBucket(
-      OrcSplit split, JobConf conf, ValidWriteIdList validWriteIdList) throws IOException {
-    if (!needSyntheticRowIds(split.isOriginal(), !deleteEventRegistry.isEmpty(), rowIdProjected)) {
-      if(split.isOriginal()) {
+  static OrcSplit.OffsetAndBucketProperty computeOffsetAndBucket(
+      FileStatus file, Path rootDir, boolean isOriginal, boolean hasDeletes,
+      Configuration conf) throws IOException {
+
+    VectorizedRowBatchCtx vrbCtx = Utilities.getVectorizedRowBatchCtx(conf);
+
+    if (!needSyntheticRowIds(isOriginal, hasDeletes, areRowIdsProjected(vrbCtx))) {
+      if(isOriginal) {
         /**
          * Even if we don't need to project ROW_IDs, we still need to check the write ID that
          * created the file to see if it's committed.  See more in
         * {@link #next(NullWritable, VectorizedRowBatch)}.  (In fact for bucketed tables we could
          * filter out base/delta files but this makes fewer dependencies)
          */
         OrcRawRecordMerger.TransactionMetaData syntheticTxnInfo =
-            OrcRawRecordMerger.TransactionMetaData.findWriteIDForSynthetcRowIDs(split.getPath(),
-                split.getRootDir(), conf);
-        return new OffsetAndBucketProperty(-1,-1, syntheticTxnInfo.syntheticWriteId);
+            OrcRawRecordMerger.TransactionMetaData.findWriteIDForSynthetcRowIDs(file.getPath(),
+                rootDir, conf);
+        return new OrcSplit.OffsetAndBucketProperty(-1, -1, syntheticTxnInfo.syntheticWriteId);
       }
       return null;
     }
+
+    String txnString = conf.get(ValidWriteIdList.VALID_WRITEIDS_KEY);
+    ValidWriteIdList validWriteIdList = (txnString == null) ? new ValidReaderWriteIdList() :
+        new ValidReaderWriteIdList(txnString);
+
     long rowIdOffset = 0;
     OrcRawRecordMerger.TransactionMetaData syntheticTxnInfo =
-        OrcRawRecordMerger.TransactionMetaData.findWriteIDForSynthetcRowIDs(split.getPath(), split.getRootDir(), conf);
-    int bucketId = AcidUtils.parseBucketId(split.getPath());
+        OrcRawRecordMerger.TransactionMetaData.findWriteIDForSynthetcRowIDs(file.getPath(), rootDir, conf);
+    int bucketId = AcidUtils.parseBucketId(file.getPath());
     int bucketProperty = BucketCodec.V1.encode(new AcidOutputFormat.Options(conf)
         //statementId is from directory name (or 0 if there is none)
         .statementId(syntheticTxnInfo.statementId).bucket(bucketId));
@@ -554,7 +546,7 @@ public class VectorizedOrcAcidRowBatchReader
       if (bucketIdFromPath != bucketId) {
         continue;//HIVE-16952
       }
-      if (f.getFileStatus().getPath().equals(split.getPath())) {
+      if (f.getFileStatus().getPath().equals(file.getPath())) {
         //'f' is the file whence this split is
         break;
       }
       Reader reader = OrcFile.createReader(f.getFileStatus().getPath(),
           OrcFile.readerOptions(conf));
       rowIdOffset += reader.getNumberOfRows();
     }
-    return new OffsetAndBucketProperty(rowIdOffset, bucketProperty,
+    return new OrcSplit.OffsetAndBucketProperty(rowIdOffset, bucketProperty,
         syntheticTxnInfo.syntheticWriteId);
   }
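[Editor's note] The computation above treats all files of one bucket as a single logical file: a file's ROW__ID offset is the sum of the row counts (taken from ORC footers) of the files that sort before it, and files from other buckets are skipped per HIVE-16952. A simplified sketch under those assumptions; the OrcFileInfo record is a hypothetical stand-in for Hive's file status types.

import java.util.List;

/** Sketch of the row-id offset computation: each file's offset is the sum of
 *  the row counts of the same-bucket files that sort before it. */
final class RowIdOffsetSketch {
  record OrcFileInfo(String path, int bucketId, long numRows) {}

  static long rowIdOffset(List<OrcFileInfo> sortedSiblings, String targetPath, int targetBucket) {
    long offset = 0;
    for (OrcFileInfo f : sortedSiblings) { // sorted the same way the reader sorts them
      if (f.bucketId() != targetBucket) {
        continue; // other buckets never share ROW__IDs (HIVE-16952)
      }
      if (f.path().equals(targetPath)) {
        break; // reached the file this split came from
      }
      offset += f.numRows(); // in Hive this comes from the ORC footer
    }
    return offset;
  }
}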
   /**
@@ -759,8 +751,9 @@ public class VectorizedOrcAcidRowBatchReader
       boolean needSyntheticRowId =
           needSyntheticRowIds(true, !deleteEventRegistry.isEmpty(), rowIdProjected);
       if(needSyntheticRowId) {
-        assert syntheticProps != null && syntheticProps.rowIdOffset >= 0 : "" + syntheticProps;
-        assert syntheticProps != null && syntheticProps.bucketProperty >= 0 : "" + syntheticProps;
+        assert syntheticProps != null : "" + syntheticProps;
+        assert syntheticProps.getRowIdOffset() >= 0 : "" + syntheticProps;
+        assert syntheticProps.getBucketProperty() >= 0 : "" + syntheticProps;
         if(innerReader == null) {
           throw new IllegalStateException(getClass().getName() + " requires " +
               org.apache.orc.RecordReader.class +
@@ -771,14 +764,14 @@ public class VectorizedOrcAcidRowBatchReader
          */
         recordIdColumnVector.fields[0].noNulls = true;
         recordIdColumnVector.fields[0].isRepeating = true;
-        ((LongColumnVector)recordIdColumnVector.fields[0]).vector[0] = syntheticProps.syntheticWriteId;
+        ((LongColumnVector)recordIdColumnVector.fields[0]).vector[0] = syntheticProps.getSyntheticWriteId();
         /**
          * This is {@link RecordIdentifier#getBucketProperty()}
          * Also see {@link BucketCodec}
          */
         recordIdColumnVector.fields[1].noNulls = true;
         recordIdColumnVector.fields[1].isRepeating = true;
-        ((LongColumnVector)recordIdColumnVector.fields[1]).vector[0] = syntheticProps.bucketProperty;
+        ((LongColumnVector)recordIdColumnVector.fields[1]).vector[0] = syntheticProps.getBucketProperty();
         /**
          * {@link RecordIdentifier#getRowId()}
          */
@@ -787,7 +780,7 @@ public class VectorizedOrcAcidRowBatchReader
         long[] rowIdVector = ((LongColumnVector)recordIdColumnVector.fields[2]).vector;
         for(int i = 0; i < vectorizedRowBatchBase.size; i++) {
           //baseReader.getRowNumber() seems to point at the start of the batch todo: validate
-          rowIdVector[i] = syntheticProps.rowIdOffset + innerReader.getRowNumber() + i;
+          rowIdVector[i] = syntheticProps.getRowIdOffset() + innerReader.getRowNumber() + i;
         }
         //Now populate a structure to use to apply delete events
         innerRecordIdColumnVector = new ColumnVector[OrcRecordUpdater.FIELDS];
@@ -797,7 +790,7 @@ public class VectorizedOrcAcidRowBatchReader
         //these are insert events so (original txn == current) txn for all rows
         innerRecordIdColumnVector[OrcRecordUpdater.CURRENT_WRITEID] = recordIdColumnVector.fields[0];
       }
-      if(syntheticProps.syntheticWriteId > 0) {
+      if(syntheticProps.getSyntheticWriteId() > 0) {
         //"originals" (written before table was converted to acid) is considered written by
         // writeid:0 which is always committed so there is no need to check wrt invalid write Ids
         //But originals written by Load Data for example can be in base_x or delta_x_x so we must
@@ -811,7 +804,7 @@ public class VectorizedOrcAcidRowBatchReader
          * reader (transactions) is concerned.  Since here we are reading 'original' schema file,
          * all rows in it have been created by the same txn, namely 'syntheticProps.syntheticWriteId'
          */
-        if (!validWriteIdList.isWriteIdValid(syntheticProps.syntheticWriteId)) {
+        if (!validWriteIdList.isWriteIdValid(syntheticProps.getSyntheticWriteId())) {
           selectedBitSet.clear(0, vectorizedRowBatchBase.size);
         }
       }
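[Editor's note] At read time the precomputed properties are stamped into the ROW__ID struct per batch, as the hunks above show: writeid and bucketid are constant for the whole file, so their column vectors can be marked isRepeating, while rowid is the file-level offset plus the batch's starting row number. A small sketch using ORC's LongColumnVector; the surrounding method and parameter names are illustrative, not Hive's.

import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;

/** Sketch of populating the three synthetic ROW__ID fields for one batch. */
final class SyntheticRowIdSketch {
  static void fill(LongColumnVector writeIds, LongColumnVector buckets, LongColumnVector rowIds,
      int batchSize, long batchStartRow,
      long writeId, int bucketProperty, long rowIdOffset) {
    // writeid: identical for every row of the file, so one repeated value suffices
    writeIds.isRepeating = true;
    writeIds.noNulls = true;
    writeIds.vector[0] = writeId;
    // bucketid: BucketCodec-encoded bucket/statement id, also constant per file
    buckets.isRepeating = true;
    buckets.noNulls = true;
    buckets.vector[0] = bucketProperty;
    // rowid: distinct per row - file-level offset plus the batch's first row number
    rowIds.noNulls = true;
    for (int i = 0; i < batchSize; i++) {
      rowIds.vector[i] = rowIdOffset + batchStartRow + i;
    }
  }
}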

http://git-wip-us.apache.org/repos/asf/hive/blob/34331f3c/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
index a8ee744..9123d72 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
@@ -4049,7 +4049,7 @@ public class TestInputOutputFormat {
     conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0,2");
     OrcSplit split = new OrcSplit(testFilePath, null, 0, fileLength,
         new String[0], null, false, true,
-        new ArrayList<AcidInputFormat.DeltaMetaData>(), fileLength, fileLength, workDir);
+        new ArrayList<AcidInputFormat.DeltaMetaData>(), fileLength, fileLength, workDir, null);
     OrcInputFormat inputFormat = new OrcInputFormat();
     AcidInputFormat.RowReader<OrcStruct> reader = inputFormat.getReader(split,
         new AcidInputFormat.Options(conf));
@@ -4077,7 +4077,7 @@ public class TestInputOutputFormat {
     conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0,2,3");
     split = new OrcSplit(testFilePath, null, 0, fileLength,
         new String[0], null, false, true,
-        new ArrayList<AcidInputFormat.DeltaMetaData>(), fileLength, fileLength, workDir);
+        new ArrayList<AcidInputFormat.DeltaMetaData>(), fileLength, fileLength, workDir, null);
     inputFormat = new OrcInputFormat();
     reader = inputFormat.getReader(split, new AcidInputFormat.Options(conf));
     record = 0;
@@ -4195,7 +4195,7 @@ public class TestInputOutputFormat {
     // Specify an OrcSplit that starts beyond the offset of the last stripe.
     OrcSplit split = new OrcSplit(testFilePath, null, lastStripeOffset + 1,
         lastStripeLength, new String[0], null, false, true,
-        new ArrayList<AcidInputFormat.DeltaMetaData>(), fileLength, fileLength, workDir);
+        new ArrayList<AcidInputFormat.DeltaMetaData>(), fileLength, fileLength, workDir, null);
     OrcInputFormat inputFormat = new OrcInputFormat();
     AcidInputFormat.RowReader<OrcStruct> reader = inputFormat.getReader(split,
         new AcidInputFormat.Options(conf));


http://git-wip-us.apache.org/repos/asf/hive/blob/34331f3c/ql/src/test/queries/clientpositive/acid_vectorization_original.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/acid_vectorization_original.q b/ql/src/test/queries/clientpositive/acid_vectorization_original.q
index 5082aed..9decbec 100644
--- a/ql/src/test/queries/clientpositive/acid_vectorization_original.q
+++ b/ql/src/test/queries/clientpositive/acid_vectorization_original.q
@@ -132,4 +132,31 @@ select ROW__ID, * from over10k_orc_bucketed where ROW__ID is null;
 
 -- this test that there are no duplicate ROW__IDs so should produce no output
 -- select ROW__ID, count(*) from over10k_orc_bucketed group by ROW__ID having count(*) > 1;
--- select ROW__ID, * from over10k_orc_bucketed where ROW__ID is null;
\ No newline at end of file
+-- select ROW__ID, * from over10k_orc_bucketed where ROW__ID is null;
+
+CREATE TABLE over10k_orc STORED AS ORC as select * from over10k_n2 where t between 3 and 4;
+-- Make sure there are multiple original files
+INSERT INTO over10k_orc select * from over10k_n2 where t between 3 and 4;
+alter table over10k_orc set TBLPROPERTIES ('transactional'='true');
+
+-- row id is projected but there are no delete deltas
+set hive.exec.orc.split.strategy=ETL;
+select o1.ROW__ID r1, o1.* from over10k_orc o1 join over10k_orc o2
+on o1.ROW__ID.rowid == o2.ROW__ID.rowid and o1.ROW__ID.writeid == o2.ROW__ID.writeid and o1.ROW__ID.bucketid == o2.ROW__ID.bucketid;
+
+set hive.exec.orc.split.strategy=BI;
+select o1.ROW__ID r1, o1.* from over10k_orc o1 join over10k_orc o2
+on o1.ROW__ID.rowid == o2.ROW__ID.rowid
+and o1.ROW__ID.writeid == o2.ROW__ID.writeid
+and o1.ROW__ID.bucketid == o2.ROW__ID.bucketid;
+
+delete from over10k_orc where t = 3;
+
+-- row id not projected but has delete deltas
+set hive.exec.orc.split.strategy=ETL;
+select t, count(*) from over10k_orc
+group by t;
+
+set hive.exec.orc.split.strategy=BI;
+select t, count(*) from over10k_orc
+group by t;


http://git-wip-us.apache.org/repos/asf/hive/blob/34331f3c/ql/src/test/results/clientpositive/llap/acid_vectorization_original.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/acid_vectorization_original.q.out b/ql/src/test/results/clientpositive/llap/acid_vectorization_original.q.out
index 99c741c..1f060ca 100644
--- a/ql/src/test/results/clientpositive/llap/acid_vectorization_original.q.out
+++ b/ql/src/test/results/clientpositive/llap/acid_vectorization_original.q.out
@@ -742,3 +742,149 @@ POSTHOOK: query: select ROW__ID, * from over10k_orc_bucketed where ROW__ID is nu
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@over10k_orc_bucketed
 #### A masked pattern was here ####
+PREHOOK: query: CREATE TABLE over10k_orc STORED AS ORC as select * from over10k_n2 where t between 3 and 4
+PREHOOK: type: CREATETABLE_AS_SELECT
+PREHOOK: Input: default@over10k_n2
+PREHOOK: Output: database:default
+PREHOOK: Output: default@over10k_orc
+POSTHOOK: query: CREATE TABLE over10k_orc STORED AS ORC as select * from over10k_n2 where t between 3 and 4
+POSTHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: Input: default@over10k_n2
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@over10k_orc
+POSTHOOK: Lineage: over10k_orc.b SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:b, type:bigint, comment:null), ]
+POSTHOOK: Lineage: over10k_orc.bin SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:bin, type:binary, comment:null), ]
+POSTHOOK: Lineage: over10k_orc.bo SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:bo, type:boolean, comment:null), ]
+POSTHOOK: Lineage: over10k_orc.d SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:d, type:double, comment:null), ]
+POSTHOOK: Lineage: over10k_orc.dec SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:dec, type:decimal(4,2), comment:null), ]
+POSTHOOK: Lineage: over10k_orc.f SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:f, type:float, comment:null), ]
+POSTHOOK: Lineage: over10k_orc.i SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:i, type:int, comment:null), ]
+POSTHOOK: Lineage: over10k_orc.s SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:s, type:string, comment:null), ]
+POSTHOOK: Lineage: over10k_orc.si SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:si, type:smallint, comment:null), ]
+POSTHOOK: Lineage: over10k_orc.t SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:t, type:tinyint, comment:null), ]
+POSTHOOK: Lineage: over10k_orc.ts SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:ts, type:timestamp, comment:null), ]
+PREHOOK: query: INSERT INTO over10k_orc select * from over10k_n2 where t between 3 and 4
+PREHOOK: type: QUERY
+PREHOOK: Input: default@over10k_n2
+PREHOOK: Output: default@over10k_orc
+POSTHOOK: query: INSERT INTO over10k_orc select * from over10k_n2 where t between 3 and 4
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@over10k_n2
+POSTHOOK: Output: default@over10k_orc
+POSTHOOK: Lineage: over10k_orc.b SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:b, type:bigint, comment:null), ]
+POSTHOOK: Lineage: over10k_orc.bin SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:bin, type:binary, comment:null), ]
+POSTHOOK: Lineage: over10k_orc.bo SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:bo, type:boolean, comment:null), ]
+POSTHOOK: Lineage: over10k_orc.d SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:d, type:double, comment:null), ]
+POSTHOOK: Lineage: over10k_orc.dec SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:dec, type:decimal(4,2), comment:null), ]
+POSTHOOK: Lineage: over10k_orc.f SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:f, type:float, comment:null), ]
+POSTHOOK: Lineage: over10k_orc.i SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:i, type:int, comment:null), ]
+POSTHOOK: Lineage: over10k_orc.s SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:s, type:string, comment:null), ]
+POSTHOOK: Lineage: over10k_orc.si SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:si, type:smallint, comment:null), ]
+POSTHOOK: Lineage: over10k_orc.t SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:t, type:tinyint, comment:null), ]
+POSTHOOK: Lineage: over10k_orc.ts SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:ts, type:timestamp, comment:null), ]
+PREHOOK: query: alter table over10k_orc set TBLPROPERTIES ('transactional'='true')
+PREHOOK: type: ALTERTABLE_PROPERTIES
+PREHOOK: Input: default@over10k_orc
+PREHOOK: Output: default@over10k_orc
+POSTHOOK: query: alter table over10k_orc set TBLPROPERTIES ('transactional'='true')
+POSTHOOK: type: ALTERTABLE_PROPERTIES
+POSTHOOK: Input: default@over10k_orc
+POSTHOOK: Output: default@over10k_orc
+PREHOOK: query: select o1.ROW__ID r1, o1.* from over10k_orc o1 join over10k_orc o2
+on o1.ROW__ID.rowid == o2.ROW__ID.rowid and o1.ROW__ID.writeid == o2.ROW__ID.writeid and o1.ROW__ID.bucketid == o2.ROW__ID.bucketid
+PREHOOK: type: QUERY
+PREHOOK: Input: default@over10k_orc
+#### A masked pattern was here ####
+POSTHOOK: query: select o1.ROW__ID r1, o1.* from over10k_orc o1 join over10k_orc o2
+on o1.ROW__ID.rowid == o2.ROW__ID.rowid and o1.ROW__ID.writeid == o2.ROW__ID.writeid and o1.ROW__ID.bucketid == o2.ROW__ID.bucketid
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@over10k_orc
+#### A masked pattern was here ####
+{"writeid":0,"bucketid":536870912,"rowid":0} 3 260 65659 4294967508 91.53 43.18 false oscar ovid 2013-03-01 09:11:58.703281 99.87 chemistry
+{"writeid":0,"bucketid":536870912,"rowid":1} 4 279 65745 4294967431 83.58 31.66 true fred van buren 2013-03-01 09:11:58.703087 25.19 study skills
+{"writeid":0,"bucketid":536870912,"rowid":2} 3 469 65743 4294967428 10.66 39.84 false victor zipper 2013-03-01 09:11:58.703181 26.60 mathematics
+{"writeid":0,"bucketid":536870912,"rowid":3} 4 392 65665 4294967391 53.27 3.86 true zach miller 2013-03-01 09:11:58.703296 43.66 undecided
+{"writeid":0,"bucketid":536870912,"rowid":4} 3 395 65747 4294967313 57.25 3.17 true wendy garcia 2013-03-01 09:11:58.703074 58.47 xylophone band
+{"writeid":0,"bucketid":536870912,"rowid":5} 3 277 65788 4294967403 58.08 20.55 false xavier ovid 2013-03-01 09:11:58.703281 62.11 zync studies
+{"writeid":0,"bucketid":536870912,"rowid":6} 4 509 65776 4294967432 78.26 35.02 false mike king 2013-03-01 09:11:58.703231 18.70 undecided
+{"writeid":0,"bucketid":536870912,"rowid":7} 3 308 65757 4294967430 49.28 38.04 false nick zipper 2013-03-01 09:11:58.703132 1.86 kindergarten
+{"writeid":0,"bucketid":536870912,"rowid":8} 4 460 65625 4294967360 5.51 22.6 true oscar laertes 2013-03-01 09:11:58.703293 42.86 nap time
+{"writeid":0,"bucketid":536870912,"rowid":9} 3 322 65672 4294967508 25.55 26.28 true jessica carson 2013-03-01 09:11:58.70312 52.60 education
+{"writeid":0,"bucketid":536870912,"rowid":10} 3 464 65617 4294967424 82.3 2.92 false ethan brown 2013-03-01 09:11:58.703076 18.51 wind surfing
+{"writeid":0,"bucketid":536870912,"rowid":11} 3 260 65659 4294967508 91.53 43.18 false oscar ovid 2013-03-01 09:11:58.703281 99.87 chemistry
+{"writeid":0,"bucketid":536870912,"rowid":12} 4 279 65745 4294967431 83.58 31.66 true fred van buren 2013-03-01 09:11:58.703087 25.19 study skills
+{"writeid":0,"bucketid":536870912,"rowid":13} 3 469 65743 4294967428 10.66 39.84 false victor zipper 2013-03-01 09:11:58.703181 26.60 mathematics
+{"writeid":0,"bucketid":536870912,"rowid":14} 4 392 65665 4294967391 53.27 3.86 true zach miller 2013-03-01 09:11:58.703296 43.66 undecided
+{"writeid":0,"bucketid":536870912,"rowid":15} 3 395 65747 4294967313 57.25 3.17 true wendy garcia 2013-03-01 09:11:58.703074 58.47 xylophone band
+{"writeid":0,"bucketid":536870912,"rowid":16} 3 277 65788 4294967403 58.08 20.55 false xavier ovid 2013-03-01 09:11:58.703281 62.11 zync studies
+{"writeid":0,"bucketid":536870912,"rowid":17} 4 509 65776 4294967432 78.26 35.02 false mike king 2013-03-01 09:11:58.703231 18.70 undecided
+{"writeid":0,"bucketid":536870912,"rowid":18} 3 308 65757 4294967430 49.28 38.04 false nick zipper 2013-03-01 09:11:58.703132 1.86 kindergarten
+{"writeid":0,"bucketid":536870912,"rowid":19} 4 460 65625 4294967360 5.51 22.6 true oscar laertes 2013-03-01 09:11:58.703293 42.86 nap time
+{"writeid":0,"bucketid":536870912,"rowid":20} 3 322 65672 4294967508 25.55 26.28 true jessica carson 2013-03-01 09:11:58.70312 52.60 education
+{"writeid":0,"bucketid":536870912,"rowid":21} 3 464 65617 4294967424 82.3 2.92 false ethan brown 2013-03-01 09:11:58.703076 18.51 wind surfing
+PREHOOK: query: select o1.ROW__ID r1, o1.* from over10k_orc o1 join over10k_orc o2
+on o1.ROW__ID.rowid == o2.ROW__ID.rowid
+and o1.ROW__ID.writeid == o2.ROW__ID.writeid
+and o1.ROW__ID.bucketid == o2.ROW__ID.bucketid
+PREHOOK: type: QUERY
+PREHOOK: Input: default@over10k_orc
+#### A masked pattern was here ####
+POSTHOOK: query: select o1.ROW__ID r1, o1.* from over10k_orc o1 join over10k_orc o2
+on o1.ROW__ID.rowid == o2.ROW__ID.rowid
+and o1.ROW__ID.writeid == o2.ROW__ID.writeid
+and o1.ROW__ID.bucketid == o2.ROW__ID.bucketid
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@over10k_orc
+#### A masked pattern was here ####
+{"writeid":0,"bucketid":536870912,"rowid":0} 3 260 65659 4294967508 91.53 43.18 false oscar ovid 2013-03-01 09:11:58.703281 99.87 chemistry
+{"writeid":0,"bucketid":536870912,"rowid":1} 4 279 65745 4294967431 83.58 31.66 true fred van buren 2013-03-01 09:11:58.703087 25.19 study skills
+{"writeid":0,"bucketid":536870912,"rowid":2} 3 469 65743 4294967428 10.66 39.84 false victor zipper 2013-03-01 09:11:58.703181 26.60 mathematics
+{"writeid":0,"bucketid":536870912,"rowid":3} 4 392 65665 4294967391 53.27 3.86 true zach miller 2013-03-01 09:11:58.703296 43.66 undecided
+{"writeid":0,"bucketid":536870912,"rowid":4} 3 395 65747 4294967313 57.25 3.17 true wendy garcia 2013-03-01 09:11:58.703074 58.47 xylophone band
+{"writeid":0,"bucketid":536870912,"rowid":5} 3 277 65788 4294967403 58.08 20.55 false xavier ovid 2013-03-01 09:11:58.703281 62.11 zync studies
+{"writeid":0,"bucketid":536870912,"rowid":6} 4 509 65776 4294967432 78.26 35.02 false mike king 2013-03-01 09:11:58.703231 18.70 undecided
+{"writeid":0,"bucketid":536870912,"rowid":7} 3 308 65757 4294967430 49.28 38.04 false nick zipper 2013-03-01 09:11:58.703132 1.86 kindergarten
+{"writeid":0,"bucketid":536870912,"rowid":8} 4 460 65625 4294967360 5.51 22.6 true oscar laertes 2013-03-01 09:11:58.703293 42.86 nap time
+{"writeid":0,"bucketid":536870912,"rowid":9} 3 322 65672 4294967508 25.55 26.28 true jessica carson 2013-03-01 09:11:58.70312 52.60 education
+{"writeid":0,"bucketid":536870912,"rowid":10} 3 464 65617 4294967424 82.3 2.92 false ethan brown 2013-03-01 09:11:58.703076 18.51 wind surfing
+{"writeid":0,"bucketid":536870912,"rowid":11} 3 260 65659 4294967508 91.53 43.18 false oscar ovid 2013-03-01 09:11:58.703281 99.87 chemistry
+{"writeid":0,"bucketid":536870912,"rowid":12} 4 279 65745 4294967431 83.58 31.66 true fred van buren 2013-03-01 09:11:58.703087 25.19 study skills
+{"writeid":0,"bucketid":536870912,"rowid":13} 3 469 65743 4294967428 10.66 39.84 false victor zipper 2013-03-01 09:11:58.703181 26.60 mathematics
+{"writeid":0,"bucketid":536870912,"rowid":14} 4 392 65665 4294967391 53.27 3.86 true zach miller 2013-03-01 09:11:58.703296 43.66 undecided
+{"writeid":0,"bucketid":536870912,"rowid":15} 3 395 65747 4294967313 57.25 3.17 true wendy garcia 2013-03-01 09:11:58.703074 58.47 xylophone band
+{"writeid":0,"bucketid":536870912,"rowid":16} 3 277 65788 4294967403 58.08 20.55 false xavier ovid 2013-03-01 09:11:58.703281 62.11 zync studies
+{"writeid":0,"bucketid":536870912,"rowid":17} 4 509 65776 4294967432 78.26 35.02 false mike king 2013-03-01 09:11:58.703231 18.70 undecided
+{"writeid":0,"bucketid":536870912,"rowid":18} 3 308 65757 4294967430 49.28 38.04 false nick zipper 2013-03-01 09:11:58.703132 1.86 kindergarten
+{"writeid":0,"bucketid":536870912,"rowid":19} 4 460 65625 4294967360 5.51 22.6 true oscar laertes 2013-03-01 09:11:58.703293 42.86 nap time
+{"writeid":0,"bucketid":536870912,"rowid":20} 3 322 65672 4294967508 25.55 26.28 true jessica carson 2013-03-01 09:11:58.70312 52.60 education
+{"writeid":0,"bucketid":536870912,"rowid":21} 3 464 65617 4294967424 82.3 2.92 false ethan brown 2013-03-01 09:11:58.703076 18.51 wind surfing
+PREHOOK: query: delete from over10k_orc where t = 3
+PREHOOK: type: QUERY
+PREHOOK: Input: default@over10k_orc
+PREHOOK: Output: default@over10k_orc
+POSTHOOK: query: delete from over10k_orc where t = 3
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@over10k_orc
+POSTHOOK: Output: default@over10k_orc
+PREHOOK: query: select t, count(*) from over10k_orc
+group by t
+PREHOOK: type: QUERY
+PREHOOK: Input: default@over10k_orc
+#### A masked pattern was here ####
+POSTHOOK: query: select t, count(*) from over10k_orc
+group by t
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@over10k_orc
+#### A masked pattern was here ####
+4 8
+PREHOOK: query: select t, count(*) from over10k_orc
+group by t
+PREHOOK: type: QUERY
+PREHOOK: Input: default@over10k_orc
+#### A masked pattern was here ####
+POSTHOOK: query: select t, count(*) from over10k_orc
+group by t
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@over10k_orc
+#### A masked pattern was here ####
+4 8