Repository: hive
Updated Branches:
  refs/heads/master 37fd22e6a -> 34331f3c7
HIVE-17917: VectorizedOrcAcidRowBatchReader.computeOffsetAndBucket optimization (Saurabh Seth via Eugene Koifman)

Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/34331f3c
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/34331f3c
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/34331f3c

Branch: refs/heads/master
Commit: 34331f3c7b69200a0177f5446f1f15c8ed69ee86
Parents: 37fd22e
Author: Saurabh Seth <saurabh.s...@gmail.com>
Authored: Thu Sep 27 19:14:21 2018 -0700
Committer: Eugene Koifman <ekoif...@apache.org>
Committed: Thu Sep 27 19:14:21 2018 -0700

----------------------------------------------------------------------
 .../hadoop/hive/ql/io/orc/OrcInputFormat.java   |  23 ++-
 .../apache/hadoop/hive/ql/io/orc/OrcSplit.java  |  59 +++++++-
 .../io/orc/VectorizedOrcAcidRowBatchReader.java |  69 ++++-----
 .../hive/ql/io/orc/TestInputOutputFormat.java   |   6 +-
 .../acid_vectorization_original.q               |  29 +++-
 .../llap/acid_vectorization_original.q.out      | 146 +++++++++++++++++++
 6 files changed, 285 insertions(+), 47 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/34331f3c/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
index f34f393..728bf50 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
@@ -1036,6 +1036,7 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
     private final Path dir;
     private final boolean allowSyntheticFileIds;
     private final boolean isDefaultFs;
+    private final Configuration conf;
 
     /**
      * @param dir - root of partition dir
@@ -1051,12 +1052,21 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
       this.dir = dir;
       this.allowSyntheticFileIds = allowSyntheticFileIds;
       this.isDefaultFs = isDefaultFs;
+      this.conf = context.conf;
     }
 
     @Override
    public List<OrcSplit> getSplits() throws IOException {
       List<OrcSplit> splits = Lists.newArrayList();
+      boolean isAcid = AcidUtils.isFullAcidScan(conf);
+      boolean vectorMode = Utilities.getIsVectorized(conf);
+      OrcSplit.OffsetAndBucketProperty offsetAndBucket = null;
       for (HdfsFileStatusWithId file : fileStatuses) {
+        if (isOriginal && isAcid && vectorMode) {
+          offsetAndBucket = VectorizedOrcAcidRowBatchReader.computeOffsetAndBucket(file.getFileStatus(), dir,
+              isOriginal, !deltas.isEmpty(), conf);
+        }
+
         FileStatus fileStatus = file.getFileStatus();
         long logicalLen = AcidUtils.getLogicalLength(fs, fileStatus);
         if (logicalLen != 0) {
@@ -1072,7 +1082,7 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
             }
             OrcSplit orcSplit = new OrcSplit(fileStatus.getPath(), fileKey, entry.getKey(),
                 entry.getValue().getLength(), entry.getValue().getHosts(), null, isOriginal, true,
-                deltas, -1, logicalLen, dir);
+                deltas, -1, logicalLen, dir, offsetAndBucket);
             splits.add(orcSplit);
           }
         }
@@ -1352,6 +1362,7 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
     private SchemaEvolution evolution;
     //this is the root of the partition in which the 'file' is located
     private final Path rootDir;
+    OrcSplit.OffsetAndBucketProperty offsetAndBucket = null;
 
     public SplitGenerator(SplitInfo splitInfo, UserGroupInformation ugi,
         boolean allowSyntheticFileIds, boolean isDefaultFs) throws IOException {
@@ -1480,7 +1491,7 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
         fileKey = new SyntheticFileId(file);
       }
       return new OrcSplit(file.getPath(), fileKey, offset, length, hosts,
-          orcTail, isOriginal, hasBase, deltas, scaledProjSize, fileLen, rootDir);
+          orcTail, isOriginal, hasBase, deltas, scaledProjSize, fileLen, rootDir, offsetAndBucket);
     }
 
     private static final class OffsetAndLength { // Java cruft; pair of long.
@@ -1519,6 +1530,14 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
     }
 
     private List<OrcSplit> callInternal() throws IOException {
+      boolean isAcid = AcidUtils.isFullAcidScan(context.conf);
+      boolean vectorMode = Utilities.getIsVectorized(context.conf);
+
+      if (isOriginal && isAcid && vectorMode) {
+        offsetAndBucket = VectorizedOrcAcidRowBatchReader.computeOffsetAndBucket(file, rootDir, isOriginal,
+            !deltas.isEmpty(), context.conf);
+      }
+
       // Figure out which stripes we need to read.
       if (ppdResult != null) {
         assert deltaSplits.isEmpty();
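[Editor's note] Both split strategies above now share the same shape: computeOffsetAndBucket() runs once per original file while splits are generated, and the result rides along inside every OrcSplit cut from that file, instead of being recomputed by each VectorizedOrcAcidRowBatchReader. A minimal, self-contained sketch of that pattern follows; FileMeta, Split and OffsetAndBucket are hypothetical stand-ins for Hive's real classes, not its API.

import java.util.ArrayList;
import java.util.List;

/** Illustrative only: compute per-file synthetic ROW__ID metadata once and
 *  share it across all splits of that file. Types are hypothetical stand-ins. */
final class SplitGenSketch {
  record OffsetAndBucket(long rowIdOffset, int bucketProperty, long writeId) {}
  record FileMeta(String path, long length) {}
  record Split(String path, long start, long length, OffsetAndBucket acidProps) {}

  static List<Split> generateSplits(FileMeta file, long blockSize,
      boolean isOriginal, boolean isFullAcid, boolean vectorized) {
    // Computed once per file, not once per split: every split of the same
    // original file shares the same rowIdOffset/bucketProperty/writeId.
    OffsetAndBucket props = null;
    if (isOriginal && isFullAcid && vectorized) {
      props = computeOffsetAndBucket(file); // potentially reads sibling file footers
    }
    List<Split> splits = new ArrayList<>();
    for (long off = 0; off < file.length(); off += blockSize) {
      splits.add(new Split(file.path(), off,
          Math.min(blockSize, file.length() - off), props));
    }
    return splits;
  }

  private static OffsetAndBucket computeOffsetAndBucket(FileMeta file) {
    return new OffsetAndBucket(0, 536870912, 0); // placeholder result
  }
}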

http://git-wip-us.apache.org/repos/asf/hive/blob/34331f3c/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java
index bce7977..4d55592 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java
@@ -65,6 +65,12 @@ public class OrcSplit extends FileSplit implements ColumnarSplit, LlapAwareSplit
   private transient Object fileKey;
   private long fileLen;
 
+  /**
+   * This contains the synthetic ROW__ID offset and bucket properties for original file splits in an ACID table.
+   */
+  private OffsetAndBucketProperty syntheticAcidProps;
+
+  static final int HAS_SYNTHETIC_ACID_PROPS_FLAG = 32;
   static final int HAS_SYNTHETIC_FILEID_FLAG = 16;
   static final int HAS_LONG_FILEID_FLAG = 8;
   static final int BASE_FLAG = 4;
@@ -80,7 +86,8 @@ public class OrcSplit extends FileSplit implements ColumnarSplit, LlapAwareSplit
 
   public OrcSplit(Path path, Object fileId, long offset, long length, String[] hosts,
       OrcTail orcTail, boolean isOriginal, boolean hasBase,
-      List<AcidInputFormat.DeltaMetaData> deltas, long projectedDataSize, long fileLen, Path rootDir) {
+      List<AcidInputFormat.DeltaMetaData> deltas, long projectedDataSize, long fileLen, Path rootDir,
+      OffsetAndBucketProperty syntheticAcidProps) {
     super(path, offset, length, hosts);
     // For HDFS, we could avoid serializing file ID and just replace the path with inode-based
     // path. However, that breaks bunch of stuff because Hive later looks up things by split path.
@@ -94,6 +101,7 @@ public class OrcSplit extends FileSplit implements ColumnarSplit, LlapAwareSplit
     this.projColsUncompressedSize = projectedDataSize <= 0 ? length : projectedDataSize;
     // setting file length to Long.MAX_VALUE will let orc reader read file length from file system
     this.fileLen = fileLen <= 0 ? Long.MAX_VALUE : fileLen;
+    this.syntheticAcidProps = syntheticAcidProps;
   }
 
   @Override
@@ -121,7 +129,8 @@ public class OrcSplit extends FileSplit implements ColumnarSplit, LlapAwareSplit
         (isOriginal ? ORIGINAL_FLAG : 0) |
         (hasFooter ? FOOTER_FLAG : 0) |
         (isFileIdLong ? HAS_LONG_FILEID_FLAG : 0) |
-        (isFileIdWritable ? HAS_SYNTHETIC_FILEID_FLAG : 0);
+        (isFileIdWritable ? HAS_SYNTHETIC_FILEID_FLAG : 0) |
+        (syntheticAcidProps != null? HAS_SYNTHETIC_ACID_PROPS_FLAG : 0);
     out.writeByte(flags);
     out.writeInt(deltas.size());
     for(AcidInputFormat.DeltaMetaData delta: deltas) {
@@ -141,6 +150,11 @@ public class OrcSplit extends FileSplit implements ColumnarSplit, LlapAwareSplit
     }
     out.writeLong(fileLen);
     out.writeUTF(rootDir.toString());
+    if (syntheticAcidProps != null) {
+      out.writeLong(syntheticAcidProps.rowIdOffset);
+      out.writeInt(syntheticAcidProps.bucketProperty);
+      out.writeLong(syntheticAcidProps.syntheticWriteId);
+    }
   }
 
   @Override
@@ -153,7 +167,8 @@ public class OrcSplit extends FileSplit implements ColumnarSplit, LlapAwareSplit
     isOriginal = (ORIGINAL_FLAG & flags) != 0;
     hasBase = (BASE_FLAG & flags) != 0;
     boolean hasLongFileId = (HAS_LONG_FILEID_FLAG & flags) != 0,
-        hasWritableFileId = (HAS_SYNTHETIC_FILEID_FLAG & flags) != 0;
+        hasWritableFileId = (HAS_SYNTHETIC_FILEID_FLAG & flags) != 0,
+        hasSyntheticProps = (HAS_SYNTHETIC_ACID_PROPS_FLAG & flags) != 0;
     if (hasLongFileId && hasWritableFileId) {
       throw new IOException("Invalid split - both file ID types present");
     }
@@ -181,6 +196,14 @@ public class OrcSplit extends FileSplit implements ColumnarSplit, LlapAwareSplit
     }
     fileLen = in.readLong();
     rootDir = new Path(in.readUTF());
+
+    if (hasSyntheticProps) {
+      long rowId = in.readLong();
+      int bucket = in.readInt();
+      long writeId = in.readLong();
+
+      syntheticAcidProps = new OffsetAndBucketProperty(rowId, bucket, writeId);
+    }
   }
 
   public OrcTail getOrcTail() {
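[Editor's note] The serialization above follows the usual Writable pattern for optional fields: a bit in the leading flag byte records whether the optional block was written, so a split serialized without synthetic ACID props (bit unset) still round-trips. A rough sketch of that pattern using plain java.io streams; the class and method names here are illustrative, not Hive's.

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/** Minimal sketch of the flag-byte pattern: optional fields are guarded by a
 *  bit in a leading byte, so readers of older payloads simply see the bit unset. */
final class OptionalFieldSketch {
  static final int HAS_ACID_PROPS = 32; // mirrors HAS_SYNTHETIC_ACID_PROPS_FLAG

  static void write(DataOutput out, Long rowIdOffset, Integer bucket, Long writeId)
      throws IOException {
    int flags = (rowIdOffset != null ? HAS_ACID_PROPS : 0);
    out.writeByte(flags);
    if (rowIdOffset != null) { // only written when the bit is set
      out.writeLong(rowIdOffset);
      out.writeInt(bucket);
      out.writeLong(writeId);
    }
  }

  static long[] read(DataInput in) throws IOException {
    int flags = in.readByte();
    if ((flags & HAS_ACID_PROPS) == 0) {
      return null; // optional block absent; field stays null
    }
    return new long[] { in.readLong(), in.readInt(), in.readLong() };
  }
}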
@@ -235,6 +258,10 @@ public class OrcSplit extends FileSplit implements ColumnarSplit, LlapAwareSplit
     return fileKey;
   }
 
+  public OffsetAndBucketProperty getSyntheticAcidProps() {
+    return syntheticAcidProps;
+  }
+
   @Override
   public long getColumnarProjectionSize() {
     return projColsUncompressedSize;
@@ -276,6 +303,32 @@ public class OrcSplit extends FileSplit implements ColumnarSplit, LlapAwareSplit
     return false;
   }
 
+  /**
+   * Used for generating synthetic ROW__IDs for reading "original" files.
+   */
+  static final class OffsetAndBucketProperty {
+    private final long rowIdOffset;
+    private final int bucketProperty;
+    private final long syntheticWriteId;
+    OffsetAndBucketProperty(long rowIdOffset, int bucketProperty, long syntheticWriteId) {
+      this.rowIdOffset = rowIdOffset;
+      this.bucketProperty = bucketProperty;
+      this.syntheticWriteId = syntheticWriteId;
+    }
+
+    public long getRowIdOffset() {
+      return rowIdOffset;
+    }
+
+    public int getBucketProperty() {
+      return bucketProperty;
+    }
+
+    public long getSyntheticWriteId() {
+      return syntheticWriteId;
+    }
+  }
+
   @Override
   public String toString() {
     return "OrcSplit [" + getPath() + ", start=" + getStart() + ", length=" + getLength()


http://git-wip-us.apache.org/repos/asf/hive/blob/34331f3c/ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcAcidRowBatchReader.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcAcidRowBatchReader.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcAcidRowBatchReader.java
index a6cf263..f16f9b4 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcAcidRowBatchReader.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcAcidRowBatchReader.java
@@ -27,6 +27,7 @@ import java.util.Map.Entry;
 import java.util.TreeMap;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.common.ValidReaderWriteIdList;
@@ -102,7 +103,7 @@ public class VectorizedOrcAcidRowBatchReader
   /**
    * for reading "original" files
    */
-  private final OffsetAndBucketProperty syntheticProps;
+  private final OrcSplit.OffsetAndBucketProperty syntheticProps;
   /**
    * To have access to {@link RecordReader#getRowNumber()} in the underlying file
    */
@@ -247,8 +248,7 @@ public class VectorizedOrcAcidRowBatchReader
     }
     rowIdProjected = areRowIdsProjected(rbCtx);
     rootPath = orcSplit.getRootDir();
-    //why even compute syntheticProps if !isOriginal???
-    syntheticProps = computeOffsetAndBucket(orcSplit, conf, validWriteIdList);
+    syntheticProps = orcSplit.getSyntheticAcidProps();
   }
 
   /**
@@ -500,33 +500,20 @@ public class VectorizedOrcAcidRowBatchReader
   }
 
   /**
-   * Used for generating synthetic ROW__IDs for reading "original" files
-   */
-  private static final class OffsetAndBucketProperty {
-    private final long rowIdOffset;
-    private final int bucketProperty;
-    private final long syntheticWriteId;
-    private OffsetAndBucketProperty(long rowIdOffset, int bucketProperty, long syntheticWriteId) {
-      this.rowIdOffset = rowIdOffset;
-      this.bucketProperty = bucketProperty;
-      this.syntheticWriteId = syntheticWriteId;
-    }
-  }
-  /**
    * See {@link #next(NullWritable, VectorizedRowBatch)} first and
    * {@link OrcRawRecordMerger.OriginalReaderPair}.
    * When reading a split of an "original" file and we need to decorate data with ROW__ID.
    * This requires treating multiple files that are part of the same bucket (tranche for unbucketed
    * tables) as a single logical file to number rowids consistently.
-   *
-   * todo: This logic is executed per split of every "original" file. The computed result is the
-   * same for every split form the same file so this could be optimized by moving it to
-   * before/during split computation and passing the info in the split. (HIVE-17917)
    */
-  private OffsetAndBucketProperty computeOffsetAndBucket(
-      OrcSplit split, JobConf conf, ValidWriteIdList validWriteIdList) throws IOException {
-    if (!needSyntheticRowIds(split.isOriginal(), !deleteEventRegistry.isEmpty(), rowIdProjected)) {
-      if(split.isOriginal()) {
+  static OrcSplit.OffsetAndBucketProperty computeOffsetAndBucket(
+      FileStatus file, Path rootDir, boolean isOriginal, boolean hasDeletes,
+      Configuration conf) throws IOException {
+
+    VectorizedRowBatchCtx vrbCtx = Utilities.getVectorizedRowBatchCtx(conf);
+
+    if (!needSyntheticRowIds(isOriginal, hasDeletes, areRowIdsProjected(vrbCtx))) {
+      if(isOriginal) {
         /**
          * Even if we don't need to project ROW_IDs, we still need to check the write ID that
          * created the file to see if it's committed.  See more in
         * {@link #next(NullWritable, VectorizedRowBatch)}.  (In fact for bucketed tables we could
          * filter out base/delta files but this makes fewer dependencies)
          */
         OrcRawRecordMerger.TransactionMetaData syntheticTxnInfo =
-            OrcRawRecordMerger.TransactionMetaData.findWriteIDForSynthetcRowIDs(split.getPath(),
-                split.getRootDir(), conf);
-        return new OffsetAndBucketProperty(-1,-1, syntheticTxnInfo.syntheticWriteId);
+            OrcRawRecordMerger.TransactionMetaData.findWriteIDForSynthetcRowIDs(file.getPath(),
+                rootDir, conf);
+        return new OrcSplit.OffsetAndBucketProperty(-1, -1, syntheticTxnInfo.syntheticWriteId);
       }
       return null;
     }
+
+    String txnString = conf.get(ValidWriteIdList.VALID_WRITEIDS_KEY);
+    ValidWriteIdList validWriteIdList = (txnString == null) ? new ValidReaderWriteIdList() :
+        new ValidReaderWriteIdList(txnString);
+
     long rowIdOffset = 0;
     OrcRawRecordMerger.TransactionMetaData syntheticTxnInfo =
-        OrcRawRecordMerger.TransactionMetaData.findWriteIDForSynthetcRowIDs(split.getPath(), split.getRootDir(), conf);
-    int bucketId = AcidUtils.parseBucketId(split.getPath());
+        OrcRawRecordMerger.TransactionMetaData.findWriteIDForSynthetcRowIDs(file.getPath(), rootDir, conf);
+    int bucketId = AcidUtils.parseBucketId(file.getPath());
     int bucketProperty = BucketCodec.V1.encode(new AcidOutputFormat.Options(conf)
         //statementId is from directory name (or 0 if there is none)
         .statementId(syntheticTxnInfo.statementId).bucket(bucketId));
@@ -554,7 +546,7 @@ public class VectorizedOrcAcidRowBatchReader
       if (bucketIdFromPath != bucketId) {
         continue;//HIVE-16952
       }
-      if (f.getFileStatus().getPath().equals(split.getPath())) {
+      if (f.getFileStatus().getPath().equals(file.getPath())) {
         //'f' is the file whence this split is
         break;
       }
       Reader reader = OrcFile.createReader(f.getFileStatus().getPath(),
           OrcFile.readerOptions(conf));
       rowIdOffset += reader.getNumberOfRows();
     }
-    return new OffsetAndBucketProperty(rowIdOffset, bucketProperty,
+    return new OrcSplit.OffsetAndBucketProperty(rowIdOffset, bucketProperty,
         syntheticTxnInfo.syntheticWriteId);
   }
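[Editor's note] The computation above treats all files of one bucket as a single logical file: a file's ROW__ID offset is the sum of the row counts (taken from ORC footers) of the files that sort before it, and files from other buckets are skipped per HIVE-16952. A simplified sketch under those assumptions; the OrcFileInfo record is a hypothetical stand-in for Hive's file status types.

import java.util.List;

/** Sketch of the row-id offset computation: each file's offset is the sum of
 *  the row counts of the same-bucket files that sort before it. */
final class RowIdOffsetSketch {
  record OrcFileInfo(String path, int bucketId, long numRows) {}

  static long rowIdOffset(List<OrcFileInfo> sortedSiblings, String targetPath, int targetBucket) {
    long offset = 0;
    for (OrcFileInfo f : sortedSiblings) { // sorted the same way the reader sorts them
      if (f.bucketId() != targetBucket) {
        continue; // other buckets never share ROW__IDs (HIVE-16952)
      }
      if (f.path().equals(targetPath)) {
        break; // reached the file this split came from
      }
      offset += f.numRows(); // in Hive this comes from the ORC footer
    }
    return offset;
  }
}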
   /**
@@ -759,8 +751,9 @@ public class VectorizedOrcAcidRowBatchReader
       boolean needSyntheticRowId =
           needSyntheticRowIds(true, !deleteEventRegistry.isEmpty(), rowIdProjected);
       if(needSyntheticRowId) {
-        assert syntheticProps != null && syntheticProps.rowIdOffset >= 0 : "" + syntheticProps;
-        assert syntheticProps != null && syntheticProps.bucketProperty >= 0 : "" + syntheticProps;
+        assert syntheticProps != null : "" + syntheticProps;
+        assert syntheticProps.getRowIdOffset() >= 0 : "" + syntheticProps;
+        assert syntheticProps.getBucketProperty() >= 0 : "" + syntheticProps;
         if(innerReader == null) {
           throw new IllegalStateException(getClass().getName() + " requires " +
               org.apache.orc.RecordReader.class +
@@ -771,14 +764,14 @@ public class VectorizedOrcAcidRowBatchReader
          */
         recordIdColumnVector.fields[0].noNulls = true;
         recordIdColumnVector.fields[0].isRepeating = true;
-        ((LongColumnVector)recordIdColumnVector.fields[0]).vector[0] = syntheticProps.syntheticWriteId;
+        ((LongColumnVector)recordIdColumnVector.fields[0]).vector[0] = syntheticProps.getSyntheticWriteId();
         /**
          * This is {@link RecordIdentifier#getBucketProperty()}
          * Also see {@link BucketCodec}
          */
         recordIdColumnVector.fields[1].noNulls = true;
         recordIdColumnVector.fields[1].isRepeating = true;
-        ((LongColumnVector)recordIdColumnVector.fields[1]).vector[0] = syntheticProps.bucketProperty;
+        ((LongColumnVector)recordIdColumnVector.fields[1]).vector[0] = syntheticProps.getBucketProperty();
         /**
          * {@link RecordIdentifier#getRowId()}
          */
@@ -787,7 +780,7 @@ public class VectorizedOrcAcidRowBatchReader
         long[] rowIdVector = ((LongColumnVector)recordIdColumnVector.fields[2]).vector;
         for(int i = 0; i < vectorizedRowBatchBase.size; i++) {
           //baseReader.getRowNumber() seems to point at the start of the batch todo: validate
-          rowIdVector[i] = syntheticProps.rowIdOffset + innerReader.getRowNumber() + i;
+          rowIdVector[i] = syntheticProps.getRowIdOffset() + innerReader.getRowNumber() + i;
         }
         //Now populate a structure to use to apply delete events
         innerRecordIdColumnVector = new ColumnVector[OrcRecordUpdater.FIELDS];
@@ -797,7 +790,7 @@ public class VectorizedOrcAcidRowBatchReader
         //these are insert events so (original txn == current) txn for all rows
         innerRecordIdColumnVector[OrcRecordUpdater.CURRENT_WRITEID] = recordIdColumnVector.fields[0];
       }
-      if(syntheticProps.syntheticWriteId > 0) {
+      if(syntheticProps.getSyntheticWriteId() > 0) {
         //"originals" (written before table was converted to acid) is considered written by
         // writeid:0 which is always committed so there is no need to check wrt invalid write Ids
         //But originals written by Load Data for example can be in base_x or delta_x_x so we must
@@ -811,7 +804,7 @@ public class VectorizedOrcAcidRowBatchReader
          * reader (transactions) is concerned.  Since here we are reading 'original' schema file,
          * all rows in it have been created by the same txn, namely 'syntheticProps.syntheticWriteId'
          */
-        if (!validWriteIdList.isWriteIdValid(syntheticProps.syntheticWriteId)) {
+        if (!validWriteIdList.isWriteIdValid(syntheticProps.getSyntheticWriteId())) {
           selectedBitSet.clear(0, vectorizedRowBatchBase.size);
         }
       }
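[Editor's note] At read time the precomputed properties are stamped into the ROW__ID struct per batch, as the hunks above show: writeid and bucketid are constant for the whole file, so their column vectors can be marked isRepeating, while rowid is the file-level offset plus the batch's starting row number. A small sketch using ORC's LongColumnVector; the surrounding method and parameter names are illustrative, not Hive's.

import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;

/** Sketch of populating the three synthetic ROW__ID fields for one batch. */
final class SyntheticRowIdSketch {
  static void fill(LongColumnVector writeIds, LongColumnVector buckets, LongColumnVector rowIds,
      int batchSize, long batchStartRow,
      long writeId, int bucketProperty, long rowIdOffset) {
    // writeid: identical for every row of the file, so one repeated value suffices
    writeIds.isRepeating = true;
    writeIds.noNulls = true;
    writeIds.vector[0] = writeId;
    // bucketid: BucketCodec-encoded bucket/statement id, also constant per file
    buckets.isRepeating = true;
    buckets.noNulls = true;
    buckets.vector[0] = bucketProperty;
    // rowid: distinct per row - file-level offset plus the batch's first row number
    rowIds.noNulls = true;
    for (int i = 0; i < batchSize; i++) {
      rowIds.vector[i] = rowIdOffset + batchStartRow + i;
    }
  }
}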

http://git-wip-us.apache.org/repos/asf/hive/blob/34331f3c/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
index a8ee744..9123d72 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
@@ -4049,7 +4049,7 @@ public class TestInputOutputFormat {
     conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0,2");
     OrcSplit split = new OrcSplit(testFilePath, null, 0, fileLength,
         new String[0], null, false, true,
-        new ArrayList<AcidInputFormat.DeltaMetaData>(), fileLength, fileLength, workDir);
+        new ArrayList<AcidInputFormat.DeltaMetaData>(), fileLength, fileLength, workDir, null);
     OrcInputFormat inputFormat = new OrcInputFormat();
     AcidInputFormat.RowReader<OrcStruct> reader = inputFormat.getReader(split,
         new AcidInputFormat.Options(conf));
@@ -4077,7 +4077,7 @@ public class TestInputOutputFormat {
     conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0,2,3");
     split = new OrcSplit(testFilePath, null, 0, fileLength,
         new String[0], null, false, true,
-        new ArrayList<AcidInputFormat.DeltaMetaData>(), fileLength, fileLength, workDir);
+        new ArrayList<AcidInputFormat.DeltaMetaData>(), fileLength, fileLength, workDir, null);
     inputFormat = new OrcInputFormat();
     reader = inputFormat.getReader(split, new AcidInputFormat.Options(conf));
     record = 0;
@@ -4195,7 +4195,7 @@ public class TestInputOutputFormat {
     // Specify an OrcSplit that starts beyond the offset of the last stripe.
     OrcSplit split = new OrcSplit(testFilePath, null, lastStripeOffset + 1,
         lastStripeLength, new String[0], null, false, true,
-        new ArrayList<AcidInputFormat.DeltaMetaData>(), fileLength, fileLength, workDir);
+        new ArrayList<AcidInputFormat.DeltaMetaData>(), fileLength, fileLength, workDir, null);
     OrcInputFormat inputFormat = new OrcInputFormat();
     AcidInputFormat.RowReader<OrcStruct> reader = inputFormat.getReader(split,
         new AcidInputFormat.Options(conf));


http://git-wip-us.apache.org/repos/asf/hive/blob/34331f3c/ql/src/test/queries/clientpositive/acid_vectorization_original.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/acid_vectorization_original.q b/ql/src/test/queries/clientpositive/acid_vectorization_original.q
index 5082aed..9decbec 100644
--- a/ql/src/test/queries/clientpositive/acid_vectorization_original.q
+++ b/ql/src/test/queries/clientpositive/acid_vectorization_original.q
@@ -132,4 +132,31 @@ select ROW__ID, * from over10k_orc_bucketed where ROW__ID is null;
 
 -- this test that there are no duplicate ROW__IDs so should produce no output
 -- select ROW__ID, count(*) from over10k_orc_bucketed group by ROW__ID having count(*) > 1;
--- select ROW__ID, * from over10k_orc_bucketed where ROW__ID is null;
\ No newline at end of file
+-- select ROW__ID, * from over10k_orc_bucketed where ROW__ID is null;
+
+CREATE TABLE over10k_orc STORED AS ORC as select * from over10k_n2 where t between 3 and 4;
+-- Make sure there are multiple original files
+INSERT INTO over10k_orc select * from over10k_n2 where t between 3 and 4;
+alter table over10k_orc set TBLPROPERTIES ('transactional'='true');
+
+-- row id is projected but there are no delete deltas
+set hive.exec.orc.split.strategy=ETL;
+select o1.ROW__ID r1, o1.* from over10k_orc o1 join over10k_orc o2
+on o1.ROW__ID.rowid == o2.ROW__ID.rowid and o1.ROW__ID.writeid == o2.ROW__ID.writeid and o1.ROW__ID.bucketid == o2.ROW__ID.bucketid;
+
+set hive.exec.orc.split.strategy=BI;
+select o1.ROW__ID r1, o1.* from over10k_orc o1 join over10k_orc o2
+on o1.ROW__ID.rowid == o2.ROW__ID.rowid
+and o1.ROW__ID.writeid == o2.ROW__ID.writeid
+and o1.ROW__ID.bucketid == o2.ROW__ID.bucketid;
+
+delete from over10k_orc where t = 3;
+
+-- row id not projected but has delete deltas
+set hive.exec.orc.split.strategy=ETL;
+select t, count(*) from over10k_orc
+group by t;
+
+set hive.exec.orc.split.strategy=BI;
+select t, count(*) from over10k_orc
+group by t;


http://git-wip-us.apache.org/repos/asf/hive/blob/34331f3c/ql/src/test/results/clientpositive/llap/acid_vectorization_original.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/acid_vectorization_original.q.out b/ql/src/test/results/clientpositive/llap/acid_vectorization_original.q.out
index 99c741c..1f060ca 100644
--- a/ql/src/test/results/clientpositive/llap/acid_vectorization_original.q.out
+++ b/ql/src/test/results/clientpositive/llap/acid_vectorization_original.q.out
@@ -742,3 +742,149 @@ POSTHOOK: query: select ROW__ID, * from over10k_orc_bucketed where ROW__ID is nu
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@over10k_orc_bucketed
 #### A masked pattern was here ####
+PREHOOK: query: CREATE TABLE over10k_orc STORED AS ORC as select * from over10k_n2 where t between 3 and 4
+PREHOOK: type: CREATETABLE_AS_SELECT
+PREHOOK: Input: default@over10k_n2
+PREHOOK: Output: database:default
+PREHOOK: Output: default@over10k_orc
+POSTHOOK: query: CREATE TABLE over10k_orc STORED AS ORC as select * from over10k_n2 where t between 3 and 4
+POSTHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: Input: default@over10k_n2
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@over10k_orc
+POSTHOOK: Lineage: over10k_orc.b SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:b, type:bigint, comment:null), ]
+POSTHOOK: Lineage: over10k_orc.bin SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:bin, type:binary, comment:null), ]
+POSTHOOK: Lineage: over10k_orc.bo SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:bo, type:boolean, comment:null), ]
+POSTHOOK: Lineage: over10k_orc.d SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:d, type:double, comment:null), ]
+POSTHOOK: Lineage: over10k_orc.dec SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:dec, type:decimal(4,2), comment:null), ]
+POSTHOOK: Lineage: over10k_orc.f SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:f, type:float, comment:null), ]
+POSTHOOK: Lineage: over10k_orc.i SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:i, type:int, comment:null), ]
+POSTHOOK: Lineage: over10k_orc.s SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:s, type:string, comment:null), ]
+POSTHOOK: Lineage: over10k_orc.si SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:si, type:smallint, comment:null), ]
+POSTHOOK: Lineage: over10k_orc.t SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:t, type:tinyint, comment:null), ]
+POSTHOOK: Lineage: over10k_orc.ts SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:ts, type:timestamp, comment:null), ]
+PREHOOK: query: INSERT INTO over10k_orc select * from over10k_n2 where t between 3 and 4
+PREHOOK: type: QUERY
+PREHOOK: Input: default@over10k_n2
+PREHOOK: Output: default@over10k_orc
+POSTHOOK: query: INSERT INTO over10k_orc select * from over10k_n2 where t between 3 and 4
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@over10k_n2
+POSTHOOK: Output: default@over10k_orc
+POSTHOOK: Lineage: over10k_orc.b SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:b, type:bigint, comment:null), ]
+POSTHOOK: Lineage: over10k_orc.bin SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:bin, type:binary, comment:null), ]
+POSTHOOK: Lineage: over10k_orc.bo SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:bo, type:boolean, comment:null), ]
+POSTHOOK: Lineage: over10k_orc.d SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:d, type:double, comment:null), ]
+POSTHOOK: Lineage: over10k_orc.dec SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:dec, type:decimal(4,2), comment:null), ]
+POSTHOOK: Lineage: over10k_orc.f SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:f, type:float, comment:null), ]
+POSTHOOK: Lineage: over10k_orc.i SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:i, type:int, comment:null), ]
+POSTHOOK: Lineage: over10k_orc.s SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:s, type:string, comment:null), ]
+POSTHOOK: Lineage: over10k_orc.si SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:si, type:smallint, comment:null), ]
+POSTHOOK: Lineage: over10k_orc.t SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:t, type:tinyint, comment:null), ]
+POSTHOOK: Lineage: over10k_orc.ts SIMPLE [(over10k_n2)over10k_n2.FieldSchema(name:ts, type:timestamp, comment:null), ]
+PREHOOK: query: alter table over10k_orc set TBLPROPERTIES ('transactional'='true')
+PREHOOK: type: ALTERTABLE_PROPERTIES
+PREHOOK: Input: default@over10k_orc
+PREHOOK: Output: default@over10k_orc
+POSTHOOK: query: alter table over10k_orc set TBLPROPERTIES ('transactional'='true')
+POSTHOOK: type: ALTERTABLE_PROPERTIES
+POSTHOOK: Input: default@over10k_orc
+POSTHOOK: Output: default@over10k_orc
+PREHOOK: query: select o1.ROW__ID r1, o1.* from over10k_orc o1 join over10k_orc o2
+on o1.ROW__ID.rowid == o2.ROW__ID.rowid and o1.ROW__ID.writeid == o2.ROW__ID.writeid and o1.ROW__ID.bucketid == o2.ROW__ID.bucketid
+PREHOOK: type: QUERY
+PREHOOK: Input: default@over10k_orc
+#### A masked pattern was here ####
+POSTHOOK: query: select o1.ROW__ID r1, o1.* from over10k_orc o1 join over10k_orc o2
+on o1.ROW__ID.rowid == o2.ROW__ID.rowid and o1.ROW__ID.writeid == o2.ROW__ID.writeid and o1.ROW__ID.bucketid == o2.ROW__ID.bucketid
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@over10k_orc
+#### A masked pattern was here ####
+{"writeid":0,"bucketid":536870912,"rowid":0} 3 260 65659 4294967508 91.53 43.18 false oscar ovid 2013-03-01 09:11:58.703281 99.87 chemistry
+{"writeid":0,"bucketid":536870912,"rowid":1} 4 279 65745 4294967431 83.58 31.66 true fred van buren 2013-03-01 09:11:58.703087 25.19 study skills
+{"writeid":0,"bucketid":536870912,"rowid":2} 3 469 65743 4294967428 10.66 39.84 false victor zipper 2013-03-01 09:11:58.703181 26.60 mathematics
+{"writeid":0,"bucketid":536870912,"rowid":3} 4 392 65665 4294967391 53.27 3.86 true zach miller 2013-03-01 09:11:58.703296 43.66 undecided
+{"writeid":0,"bucketid":536870912,"rowid":4} 3 395 65747 4294967313 57.25 3.17 true wendy garcia 2013-03-01 09:11:58.703074 58.47 xylophone band
+{"writeid":0,"bucketid":536870912,"rowid":5} 3 277 65788 4294967403 58.08 20.55 false xavier ovid 2013-03-01 09:11:58.703281 62.11 zync studies
+{"writeid":0,"bucketid":536870912,"rowid":6} 4 509 65776 4294967432 78.26 35.02 false mike king 2013-03-01 09:11:58.703231 18.70 undecided
+{"writeid":0,"bucketid":536870912,"rowid":7} 3 308 65757 4294967430 49.28 38.04 false nick zipper 2013-03-01 09:11:58.703132 1.86 kindergarten
+{"writeid":0,"bucketid":536870912,"rowid":8} 4 460 65625 4294967360 5.51 22.6 true oscar laertes 2013-03-01 09:11:58.703293 42.86 nap time
+{"writeid":0,"bucketid":536870912,"rowid":9} 3 322 65672 4294967508 25.55 26.28 true jessica carson 2013-03-01 09:11:58.70312 52.60 education
+{"writeid":0,"bucketid":536870912,"rowid":10} 3 464 65617 4294967424 82.3 2.92 false ethan brown 2013-03-01 09:11:58.703076 18.51 wind surfing
+{"writeid":0,"bucketid":536870912,"rowid":11} 3 260 65659 4294967508 91.53 43.18 false oscar ovid 2013-03-01 09:11:58.703281 99.87 chemistry
+{"writeid":0,"bucketid":536870912,"rowid":12} 4 279 65745 4294967431 83.58 31.66 true fred van buren 2013-03-01 09:11:58.703087 25.19 study skills
+{"writeid":0,"bucketid":536870912,"rowid":13} 3 469 65743 4294967428 10.66 39.84 false victor zipper 2013-03-01 09:11:58.703181 26.60 mathematics
+{"writeid":0,"bucketid":536870912,"rowid":14} 4 392 65665 4294967391 53.27 3.86 true zach miller 2013-03-01 09:11:58.703296 43.66 undecided
+{"writeid":0,"bucketid":536870912,"rowid":15} 3 395 65747 4294967313 57.25 3.17 true wendy garcia 2013-03-01 09:11:58.703074 58.47 xylophone band
+{"writeid":0,"bucketid":536870912,"rowid":16} 3 277 65788 4294967403 58.08 20.55 false xavier ovid 2013-03-01 09:11:58.703281 62.11 zync studies
+{"writeid":0,"bucketid":536870912,"rowid":17} 4 509 65776 4294967432 78.26 35.02 false mike king 2013-03-01 09:11:58.703231 18.70 undecided
+{"writeid":0,"bucketid":536870912,"rowid":18} 3 308 65757 4294967430 49.28 38.04 false nick zipper 2013-03-01 09:11:58.703132 1.86 kindergarten
+{"writeid":0,"bucketid":536870912,"rowid":19} 4 460 65625 4294967360 5.51 22.6 true oscar laertes 2013-03-01 09:11:58.703293 42.86 nap time
+{"writeid":0,"bucketid":536870912,"rowid":20} 3 322 65672 4294967508 25.55 26.28 true jessica carson 2013-03-01 09:11:58.70312 52.60 education
+{"writeid":0,"bucketid":536870912,"rowid":21} 3 464 65617 4294967424 82.3 2.92 false ethan brown 2013-03-01 09:11:58.703076 18.51 wind surfing
+PREHOOK: query: select o1.ROW__ID r1, o1.* from over10k_orc o1 join over10k_orc o2
+on o1.ROW__ID.rowid == o2.ROW__ID.rowid
+and o1.ROW__ID.writeid == o2.ROW__ID.writeid
+and o1.ROW__ID.bucketid == o2.ROW__ID.bucketid
+PREHOOK: type: QUERY
+PREHOOK: Input: default@over10k_orc
+#### A masked pattern was here ####
+POSTHOOK: query: select o1.ROW__ID r1, o1.* from over10k_orc o1 join over10k_orc o2
+on o1.ROW__ID.rowid == o2.ROW__ID.rowid
+and o1.ROW__ID.writeid == o2.ROW__ID.writeid
+and o1.ROW__ID.bucketid == o2.ROW__ID.bucketid
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@over10k_orc
+#### A masked pattern was here ####
+{"writeid":0,"bucketid":536870912,"rowid":0} 3 260 65659 4294967508 91.53 43.18 false oscar ovid 2013-03-01 09:11:58.703281 99.87 chemistry
+{"writeid":0,"bucketid":536870912,"rowid":1} 4 279 65745 4294967431 83.58 31.66 true fred van buren 2013-03-01 09:11:58.703087 25.19 study skills
+{"writeid":0,"bucketid":536870912,"rowid":2} 3 469 65743 4294967428 10.66 39.84 false victor zipper 2013-03-01 09:11:58.703181 26.60 mathematics
+{"writeid":0,"bucketid":536870912,"rowid":3} 4 392 65665 4294967391 53.27 3.86 true zach miller 2013-03-01 09:11:58.703296 43.66 undecided
+{"writeid":0,"bucketid":536870912,"rowid":4} 3 395 65747 4294967313 57.25 3.17 true wendy garcia 2013-03-01 09:11:58.703074 58.47 xylophone band
+{"writeid":0,"bucketid":536870912,"rowid":5} 3 277 65788 4294967403 58.08 20.55 false xavier ovid 2013-03-01 09:11:58.703281 62.11 zync studies
+{"writeid":0,"bucketid":536870912,"rowid":6} 4 509 65776 4294967432 78.26 35.02 false mike king 2013-03-01 09:11:58.703231 18.70 undecided
+{"writeid":0,"bucketid":536870912,"rowid":7} 3 308 65757 4294967430 49.28 38.04 false nick zipper 2013-03-01 09:11:58.703132 1.86 kindergarten
+{"writeid":0,"bucketid":536870912,"rowid":8} 4 460 65625 4294967360 5.51 22.6 true oscar laertes 2013-03-01 09:11:58.703293 42.86 nap time
+{"writeid":0,"bucketid":536870912,"rowid":9} 3 322 65672 4294967508 25.55 26.28 true jessica carson 2013-03-01 09:11:58.70312 52.60 education
+{"writeid":0,"bucketid":536870912,"rowid":10} 3 464 65617 4294967424 82.3 2.92 false ethan brown 2013-03-01 09:11:58.703076 18.51 wind surfing
+{"writeid":0,"bucketid":536870912,"rowid":11} 3 260 65659 4294967508 91.53 43.18 false oscar ovid 2013-03-01 09:11:58.703281 99.87 chemistry
+{"writeid":0,"bucketid":536870912,"rowid":12} 4 279 65745 4294967431 83.58 31.66 true fred van buren 2013-03-01 09:11:58.703087 25.19 study skills
+{"writeid":0,"bucketid":536870912,"rowid":13} 3 469 65743 4294967428 10.66 39.84 false victor zipper 2013-03-01 09:11:58.703181 26.60 mathematics
+{"writeid":0,"bucketid":536870912,"rowid":14} 4 392 65665 4294967391 53.27 3.86 true zach miller 2013-03-01 09:11:58.703296 43.66 undecided
+{"writeid":0,"bucketid":536870912,"rowid":15} 3 395 65747 4294967313 57.25 3.17 true wendy garcia 2013-03-01 09:11:58.703074 58.47 xylophone band
+{"writeid":0,"bucketid":536870912,"rowid":16} 3 277 65788 4294967403 58.08 20.55 false xavier ovid 2013-03-01 09:11:58.703281 62.11 zync studies
+{"writeid":0,"bucketid":536870912,"rowid":17} 4 509 65776 4294967432 78.26 35.02 false mike king 2013-03-01 09:11:58.703231 18.70 undecided
+{"writeid":0,"bucketid":536870912,"rowid":18} 3 308 65757 4294967430 49.28 38.04 false nick zipper 2013-03-01 09:11:58.703132 1.86 kindergarten
+{"writeid":0,"bucketid":536870912,"rowid":19} 4 460 65625 4294967360 5.51 22.6 true oscar laertes 2013-03-01 09:11:58.703293 42.86 nap time
+{"writeid":0,"bucketid":536870912,"rowid":20} 3 322 65672 4294967508 25.55 26.28 true jessica carson 2013-03-01 09:11:58.70312 52.60 education
+{"writeid":0,"bucketid":536870912,"rowid":21} 3 464 65617 4294967424 82.3 2.92 false ethan brown 2013-03-01 09:11:58.703076 18.51 wind surfing
+PREHOOK: query: delete from over10k_orc where t = 3
+PREHOOK: type: QUERY
+PREHOOK: Input: default@over10k_orc
+PREHOOK: Output: default@over10k_orc
+POSTHOOK: query: delete from over10k_orc where t = 3
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@over10k_orc
+POSTHOOK: Output: default@over10k_orc
+PREHOOK: query: select t, count(*) from over10k_orc
+group by t
+PREHOOK: type: QUERY
+PREHOOK: Input: default@over10k_orc
+#### A masked pattern was here ####
+POSTHOOK: query: select t, count(*) from over10k_orc
+group by t
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@over10k_orc
+#### A masked pattern was here ####
+4 8
+PREHOOK: query: select t, count(*) from over10k_orc
+group by t
+PREHOOK: type: QUERY
+PREHOOK: Input: default@over10k_orc
+#### A masked pattern was here ####
+POSTHOOK: query: select t, count(*) from over10k_orc
+group by t
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@over10k_orc
+#### A masked pattern was here ####
+4 8