Repository: hive Updated Branches: refs/heads/master 6b8ddbfcc -> 236a32c64
HIVE-20418 : LLAP IO may not handle ORC files that have row index disabled correctly for queries with no columns selected (Sergey Shelukhin, reviewed by Gopal Vijayaraghavan) Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/236a32c6 Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/236a32c6 Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/236a32c6 Branch: refs/heads/master Commit: 236a32c645a21b04ccaf7f18db5c6a5aa53586e8 Parents: 6b8ddbf Author: sergey <ser...@apache.org> Authored: Wed Aug 22 11:41:50 2018 -0700 Committer: sergey <ser...@apache.org> Committed: Wed Aug 22 11:41:50 2018 -0700 ---------------------------------------------------------------------- .../test/resources/testconfiguration.properties | 1 + .../ql/io/orc/encoded/EncodedReaderImpl.java | 33 ++- .../test/queries/clientpositive/vector_acid4.q | 44 +++ .../clientpositive/llap/vector_acid4.q.out | 265 +++++++++++++++++++ 4 files changed, 335 insertions(+), 8 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/236a32c6/itests/src/test/resources/testconfiguration.properties ---------------------------------------------------------------------- diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties index d2fe788..983d5a7 100644 --- a/itests/src/test/resources/testconfiguration.properties +++ b/itests/src/test/resources/testconfiguration.properties @@ -747,6 +747,7 @@ minillaplocal.query.files=\ union_remove_26.q,\ union_top_level.q,\ update_access_time_non_current_db.q, \ + vector_acid4.q,\ vector_annotate_stats_select.q,\ vector_auto_smb_mapjoin_14.q,\ vector_char_varchar_1.q,\ http://git-wip-us.apache.org/repos/asf/hive/blob/236a32c6/ql/src/java/org/apache/hadoop/hive/ql/io/orc/encoded/EncodedReaderImpl.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/encoded/EncodedReaderImpl.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/encoded/EncodedReaderImpl.java index 1b11e0e..346ab5c 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/encoded/EncodedReaderImpl.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/encoded/EncodedReaderImpl.java @@ -57,6 +57,7 @@ import org.apache.orc.impl.OrcIndex; import org.apache.orc.impl.OutStream; import org.apache.orc.impl.RecordReaderUtils; import org.apache.orc.impl.StreamName; +import org.apache.orc.impl.RecordReaderImpl.SargApplier; import org.apache.orc.impl.StreamName.Area; import org.apache.orc.impl.WriterImpl; import org.apache.orc.StripeInformation; @@ -323,15 +324,17 @@ class EncodedReaderImpl implements EncodedReader { trace.logColumnRead(i, colRgIx, enc.getKind()); } CreateHelper listToRead = new CreateHelper(); - boolean hasIndexOnlyCols = false; + boolean hasIndexOnlyCols = false, hasAnyNonData = false; for (OrcProto.Stream stream : streamList) { long length = stream.getLength(); int colIx = stream.getColumn(); OrcProto.Stream.Kind streamKind = stream.getKind(); - if (!physicalFileIncludes[colIx] || StreamName.getArea(streamKind) != StreamName.Area.DATA) { - // We have a stream for included column, but in future it might have no data streams. - // It's more like "has at least one column included that has an index stream". - hasIndexOnlyCols = hasIndexOnlyCols || physicalFileIncludes[colIx]; + boolean isIndexCol = StreamName.getArea(streamKind) != StreamName.Area.DATA; + hasAnyNonData = hasAnyNonData || isIndexCol; + // We have a stream for included column, but in future it might have no data streams. + // It's more like "has at least one column included that has an index stream". + hasIndexOnlyCols = hasIndexOnlyCols || (isIndexCol && physicalFileIncludes[colIx]); + if (!physicalFileIncludes[colIx] || isIndexCol) { if (isTracingEnabled) { LOG.trace("Skipping stream for column " + colIx + ": " + streamKind + " at " + offset + ", " + length); @@ -367,8 +370,22 @@ class EncodedReaderImpl implements EncodedReader { boolean hasFileId = this.fileKey != null; if (listToRead.get() == null) { // No data to read for this stripe. Check if we have some included index-only columns. - // TODO: there may be a bug here. Could there be partial RG filtering on index-only column? - if (hasIndexOnlyCols && (rgs == null)) { + // For example, count(1) would have the root column, that has no data stream, included. + // It may also happen that we have a column included with no streams whatsoever. That + // should only be possible if the file has no index streams. + boolean hasAnyIncludes = false; + if (!hasIndexOnlyCols) { + for (int i = 0; i < physicalFileIncludes.length; ++i) { + if (!physicalFileIncludes[i]) continue; + hasAnyIncludes = true; + break; + } + } + boolean nonProjectionRead = hasIndexOnlyCols || (!hasAnyNonData && hasAnyIncludes); + + // TODO: Could there be partial RG filtering w/no projection? + // We should probably just disable filtering for such cases if they exist. + if (nonProjectionRead && (rgs == SargApplier.READ_ALL_RGS)) { OrcEncodedColumnBatch ecb = POOLS.ecbPool.take(); ecb.init(fileKey, stripeIx, OrcEncodedColumnBatch.ALL_RGS, physicalFileIncludes.length); try { @@ -1004,7 +1021,7 @@ class EncodedReaderImpl implements EncodedReader { if (current instanceof CacheChunk) { // 2a. This is a decoded compression buffer, add as is. CacheChunk cc = (CacheChunk)current; - if (isTracingEnabled) { // TODO# HERE unaccompanied lock + if (isTracingEnabled) { LOG.trace("Locking " + cc.getBuffer() + " due to reuse"); } cacheWrapper.reuseBuffer(cc.getBuffer()); http://git-wip-us.apache.org/repos/asf/hive/blob/236a32c6/ql/src/test/queries/clientpositive/vector_acid4.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/vector_acid4.q b/ql/src/test/queries/clientpositive/vector_acid4.q new file mode 100644 index 0000000..628ecb5 --- /dev/null +++ b/ql/src/test/queries/clientpositive/vector_acid4.q @@ -0,0 +1,44 @@ +--! qt:dataset:src + +set hive.support.concurrency=true; +set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager; +set hive.exec.dynamic.partition.mode=nonstrict; +set hive.exec.dynamic.partition=true; +set hive.vectorized.execution.enabled=true; +set hive.compute.query.using.stats=false; +set hive.fetch.task.conversion=none; +set hive.llap.io.enabled=true; +set hive.compute.query.using.stats=false; +SET hive.exec.orc.default.row.index.stride=1000; +set hive.mapred.mode=nonstrict; + +set hive.exec.orc.delta.streaming.optimizations.enabled=true; + + +drop table cross_numbers; +create table cross_numbers(i string); +insert into table cross_numbers select key from src limit 20; + +drop table lots_of_rows; +create table lots_of_rows(key string) stored as orc tblproperties("transactional"="false"); +insert into table lots_of_rows select concat(key, '', i) from src cross join cross_numbers; + +drop table testacid1; +create table testacid1(id string, id2 string) clustered by (id2) into 2 buckets stored as orc tblproperties("transactional"="true"); +insert into table testacid1 select key, key from lots_of_rows; + +drop table lots_of_row; + +select * from testacid1 order by id limit 30; +select sum(hash(*)) from testacid1 limit 10; + +select count(id) from testacid1; + +select count(1) from testacid1; + +select count(1) from testacid1 where id = '0128'; + +explain update testacid1 set id = '206' where id = '0128'; +update testacid1 set id = '206' where id = '0128'; + +select * from testacid1 order by id limit 30; http://git-wip-us.apache.org/repos/asf/hive/blob/236a32c6/ql/src/test/results/clientpositive/llap/vector_acid4.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/llap/vector_acid4.q.out b/ql/src/test/results/clientpositive/llap/vector_acid4.q.out new file mode 100644 index 0000000..b1f246d --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/vector_acid4.q.out @@ -0,0 +1,265 @@ +PREHOOK: query: drop table cross_numbers +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table cross_numbers +POSTHOOK: type: DROPTABLE +PREHOOK: query: create table cross_numbers(i string) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@cross_numbers +POSTHOOK: query: create table cross_numbers(i string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@cross_numbers +PREHOOK: query: insert into table cross_numbers select key from src limit 20 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@cross_numbers +POSTHOOK: query: insert into table cross_numbers select key from src limit 20 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@cross_numbers +POSTHOOK: Lineage: cross_numbers.i SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] +PREHOOK: query: drop table lots_of_rows +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table lots_of_rows +POSTHOOK: type: DROPTABLE +PREHOOK: query: create table lots_of_rows(key string) stored as orc tblproperties("transactional"="false") +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@lots_of_rows +POSTHOOK: query: create table lots_of_rows(key string) stored as orc tblproperties("transactional"="false") +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@lots_of_rows +Warning: Shuffle Join MERGEJOIN[16][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product +PREHOOK: query: insert into table lots_of_rows select concat(key, '', i) from src cross join cross_numbers +PREHOOK: type: QUERY +PREHOOK: Input: default@cross_numbers +PREHOOK: Input: default@src +PREHOOK: Output: default@lots_of_rows +POSTHOOK: query: insert into table lots_of_rows select concat(key, '', i) from src cross join cross_numbers +POSTHOOK: type: QUERY +POSTHOOK: Input: default@cross_numbers +POSTHOOK: Input: default@src +POSTHOOK: Output: default@lots_of_rows +POSTHOOK: Lineage: lots_of_rows.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), (cross_numbers)cross_numbers.FieldSchema(name:i, type:string, comment:null), ] +PREHOOK: query: drop table testacid1 +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table testacid1 +POSTHOOK: type: DROPTABLE +PREHOOK: query: create table testacid1(id string, id2 string) clustered by (id2) into 2 buckets stored as orc tblproperties("transactional"="true") +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@testacid1 +POSTHOOK: query: create table testacid1(id string, id2 string) clustered by (id2) into 2 buckets stored as orc tblproperties("transactional"="true") +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@testacid1 +PREHOOK: query: insert into table testacid1 select key, key from lots_of_rows +PREHOOK: type: QUERY +PREHOOK: Input: default@lots_of_rows +PREHOOK: Output: default@testacid1 +POSTHOOK: query: insert into table testacid1 select key, key from lots_of_rows +POSTHOOK: type: QUERY +POSTHOOK: Input: default@lots_of_rows +POSTHOOK: Output: default@testacid1 +POSTHOOK: Lineage: testacid1.id SIMPLE [(lots_of_rows)lots_of_rows.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: testacid1.id2 SIMPLE [(lots_of_rows)lots_of_rows.FieldSchema(name:key, type:string, comment:null), ] +PREHOOK: query: drop table lots_of_row +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table lots_of_row +POSTHOOK: type: DROPTABLE +PREHOOK: query: select * from testacid1 order by id limit 30 +PREHOOK: type: QUERY +PREHOOK: Input: default@testacid1 +#### A masked pattern was here #### +POSTHOOK: query: select * from testacid1 order by id limit 30 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@testacid1 +#### A masked pattern was here #### +0128 0128 +0128 0128 +0128 0128 +0150 0150 +0150 0150 +0150 0150 +0165 0165 +0165 0165 +0165 0165 +0193 0193 +0193 0193 +0193 0193 +0213 0213 +0213 0213 +0213 0213 +0224 0224 +0224 0224 +0224 0224 +0238 0238 +0238 0238 +0238 0238 +0255 0255 +0255 0255 +0255 0255 +0265 0265 +0265 0265 +0265 0265 +027 027 +027 027 +027 027 +PREHOOK: query: select sum(hash(*)) from testacid1 limit 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@testacid1 +#### A masked pattern was here #### +POSTHOOK: query: select sum(hash(*)) from testacid1 limit 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@testacid1 +#### A masked pattern was here #### +8838111640064 +PREHOOK: query: select count(id) from testacid1 +PREHOOK: type: QUERY +PREHOOK: Input: default@testacid1 +#### A masked pattern was here #### +POSTHOOK: query: select count(id) from testacid1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@testacid1 +#### A masked pattern was here #### +10000 +PREHOOK: query: select count(1) from testacid1 +PREHOOK: type: QUERY +PREHOOK: Input: default@testacid1 +#### A masked pattern was here #### +POSTHOOK: query: select count(1) from testacid1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@testacid1 +#### A masked pattern was here #### +10000 +PREHOOK: query: select count(1) from testacid1 where id = '0128' +PREHOOK: type: QUERY +PREHOOK: Input: default@testacid1 +#### A masked pattern was here #### +POSTHOOK: query: select count(1) from testacid1 where id = '0128' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@testacid1 +#### A masked pattern was here #### +3 +PREHOOK: query: explain update testacid1 set id = '206' where id = '0128' +PREHOOK: type: QUERY +POSTHOOK: query: explain update testacid1 set id = '206' where id = '0128' +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: testacid1 + filterExpr: (id = '0128') (type: boolean) + Statistics: Num rows: 10000 Data size: 1800000 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: (id = '0128') (type: boolean) + Statistics: Num rows: 2 Data size: 360 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: ROW__ID (type: struct<writeid:bigint,bucketid:int,rowid:bigint>), id2 (type: string) + outputColumnNames: _col0, _col2 + Statistics: Num rows: 2 Data size: 506 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: struct<writeid:bigint,bucketid:int,rowid:bigint>) + sort order: + + Map-reduce partition columns: UDFToInteger(_col0) (type: int) + Statistics: Num rows: 2 Data size: 506 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col2 (type: string) + Execution mode: vectorized, llap + LLAP IO: may be used (ACID table) + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: struct<writeid:bigint,bucketid:int,rowid:bigint>), '206' (type: string), VALUE._col0 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 2 Data size: 506 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 506 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.testacid1 + Write Type: UPDATE + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + replace: false + table: + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.testacid1 + Write Type: UPDATE + + Stage: Stage-3 + Stats Work + Basic Stats Work: + +PREHOOK: query: update testacid1 set id = '206' where id = '0128' +PREHOOK: type: QUERY +PREHOOK: Input: default@testacid1 +PREHOOK: Output: default@testacid1 +POSTHOOK: query: update testacid1 set id = '206' where id = '0128' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@testacid1 +POSTHOOK: Output: default@testacid1 +PREHOOK: query: select * from testacid1 order by id limit 30 +PREHOOK: type: QUERY +PREHOOK: Input: default@testacid1 +#### A masked pattern was here #### +POSTHOOK: query: select * from testacid1 order by id limit 30 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@testacid1 +#### A masked pattern was here #### +0150 0150 +0150 0150 +0150 0150 +0165 0165 +0165 0165 +0165 0165 +0193 0193 +0193 0193 +0193 0193 +0213 0213 +0213 0213 +0213 0213 +0224 0224 +0224 0224 +0224 0224 +0238 0238 +0238 0238 +0238 0238 +0255 0255 +0255 0255 +0255 0255 +0265 0265 +0265 0265 +0265 0265 +027 027 +027 027 +027 027 +0273 0273 +0273 0273 +0273 0273