hive git commit: HIVE-20418 : LLAP IO may not handle ORC files that have row index disabled correctly for queries with no columns selected (Sergey Shelukhin, reviewed by Gopal Vijayaraghavan)

sershe Wed, 22 Aug 2018 11:42:56 -0700

Repository: hive
Updated Branches:
  refs/heads/master 6b8ddbfcc -> 236a32c64



HIVE-20418 : LLAP IO may not handle ORC files that have row index disabled 
correctly for queries with no columns selected (Sergey Shelukhin, reviewed by 
Gopal Vijayaraghavan)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/236a32c6
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/236a32c6
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/236a32c6

Branch: refs/heads/master
Commit: 236a32c645a21b04ccaf7f18db5c6a5aa53586e8
Parents: 6b8ddbf
Author: sergey <ser...@apache.org>
Authored: Wed Aug 22 11:41:50 2018 -0700
Committer: sergey <ser...@apache.org>
Committed: Wed Aug 22 11:41:50 2018 -0700

----------------------------------------------------------------------
 .../test/resources/testconfiguration.properties |   1 +
 .../ql/io/orc/encoded/EncodedReaderImpl.java    |  33 ++-
 .../test/queries/clientpositive/vector_acid4.q  |  44 +++
 .../clientpositive/llap/vector_acid4.q.out      | 265 +++++++++++++++++++
 4 files changed, 335 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/236a32c6/itests/src/test/resources/testconfiguration.properties
----------------------------------------------------------------------
diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index d2fe788..983d5a7 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -747,6 +747,7 @@ minillaplocal.query.files=\
   union_remove_26.q,\
   union_top_level.q,\
   update_access_time_non_current_db.q, \
+  vector_acid4.q,\
   vector_annotate_stats_select.q,\
   vector_auto_smb_mapjoin_14.q,\
   vector_char_varchar_1.q,\

http://git-wip-us.apache.org/repos/asf/hive/blob/236a32c6/ql/src/java/org/apache/hadoop/hive/ql/io/orc/encoded/EncodedReaderImpl.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/encoded/EncodedReaderImpl.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/encoded/EncodedReaderImpl.java
index 1b11e0e..346ab5c 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/encoded/EncodedReaderImpl.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/encoded/EncodedReaderImpl.java
@@ -57,6 +57,7 @@ import org.apache.orc.impl.OrcIndex;
 import org.apache.orc.impl.OutStream;
 import org.apache.orc.impl.RecordReaderUtils;
 import org.apache.orc.impl.StreamName;
+import org.apache.orc.impl.RecordReaderImpl.SargApplier;
 import org.apache.orc.impl.StreamName.Area;
 import org.apache.orc.impl.WriterImpl;
 import org.apache.orc.StripeInformation;
@@ -323,15 +324,17 @@ class EncodedReaderImpl implements EncodedReader {
       trace.logColumnRead(i, colRgIx, enc.getKind());
     }
     CreateHelper listToRead = new CreateHelper();
-    boolean hasIndexOnlyCols = false;
+    boolean hasIndexOnlyCols = false, hasAnyNonData = false;
     for (OrcProto.Stream stream : streamList) {
       long length = stream.getLength();
       int colIx = stream.getColumn();
       OrcProto.Stream.Kind streamKind = stream.getKind();
-      if (!physicalFileIncludes[colIx] || StreamName.getArea(streamKind) != 
StreamName.Area.DATA) {
-        // We have a stream for included column, but in future it might have 
no data streams.
-        // It's more like "has at least one column included that has an index 
stream".
-        hasIndexOnlyCols = hasIndexOnlyCols || physicalFileIncludes[colIx];
+      boolean isIndexCol = StreamName.getArea(streamKind) != 
StreamName.Area.DATA;
+      hasAnyNonData = hasAnyNonData || isIndexCol;
+      // We have a stream for included column, but in future it might have no 
data streams.
+      // It's more like "has at least one column included that has an index 
stream".
+      hasIndexOnlyCols = hasIndexOnlyCols || (isIndexCol && 
physicalFileIncludes[colIx]);
+      if (!physicalFileIncludes[colIx] || isIndexCol) {
         if (isTracingEnabled) {
           LOG.trace("Skipping stream for column " + colIx + ": "
               + streamKind + " at " + offset + ", " + length);
@@ -367,8 +370,22 @@ class EncodedReaderImpl implements EncodedReader {
     boolean hasFileId = this.fileKey != null;
     if (listToRead.get() == null) {
       // No data to read for this stripe. Check if we have some included 
index-only columns.
-      // TODO: there may be a bug here. Could there be partial RG filtering on 
index-only column?
-      if (hasIndexOnlyCols && (rgs == null)) {
+      // For example, count(1) would have the root column, that has no data 
stream, included.
+      // It may also happen that we have a column included with no streams 
whatsoever. That
+      // should only be possible if the file has no index streams.
+      boolean hasAnyIncludes = false;
+      if (!hasIndexOnlyCols) {
+        for (int i = 0; i < physicalFileIncludes.length; ++i) {
+          if (!physicalFileIncludes[i]) continue;
+          hasAnyIncludes = true;
+          break;
+        }
+      }
+      boolean nonProjectionRead = hasIndexOnlyCols || (!hasAnyNonData && 
hasAnyIncludes);
+
+      // TODO: Could there be partial RG filtering w/no projection?
+      //       We should probably just disable filtering for such cases if 
they exist.
+      if (nonProjectionRead && (rgs == SargApplier.READ_ALL_RGS)) {
         OrcEncodedColumnBatch ecb = POOLS.ecbPool.take();
         ecb.init(fileKey, stripeIx, OrcEncodedColumnBatch.ALL_RGS, 
physicalFileIncludes.length);
         try {
@@ -1004,7 +1021,7 @@ class EncodedReaderImpl implements EncodedReader {
       if (current instanceof CacheChunk) {
         // 2a. This is a decoded compression buffer, add as is.
         CacheChunk cc = (CacheChunk)current;
-        if (isTracingEnabled) { // TODO# HERE unaccompanied lock
+        if (isTracingEnabled) {
           LOG.trace("Locking " + cc.getBuffer() + " due to reuse");
         }
         cacheWrapper.reuseBuffer(cc.getBuffer());

http://git-wip-us.apache.org/repos/asf/hive/blob/236a32c6/ql/src/test/queries/clientpositive/vector_acid4.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/vector_acid4.q b/ql/src/test/queries/clientpositive/vector_acid4.q
new file mode 100644
index 0000000..628ecb5
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/vector_acid4.q
@@ -0,0 +1,44 @@
+--! qt:dataset:src
+
+set hive.support.concurrency=true;
+set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
+set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.exec.dynamic.partition=true;
+set hive.vectorized.execution.enabled=true;
+set hive.compute.query.using.stats=false;
+set hive.fetch.task.conversion=none;
+set hive.llap.io.enabled=true;
+set hive.compute.query.using.stats=false;
+SET hive.exec.orc.default.row.index.stride=1000;
+set hive.mapred.mode=nonstrict;
+
+set hive.exec.orc.delta.streaming.optimizations.enabled=true;
+
+
+drop table cross_numbers;
+create table cross_numbers(i string);
+insert into table cross_numbers select key from src limit 20;
+
+drop table lots_of_rows;
+create table lots_of_rows(key string) stored as orc 
tblproperties("transactional"="false");
+insert into table lots_of_rows select concat(key, '', i) from src cross join 
cross_numbers;
+
+drop table testacid1;
+create table testacid1(id string, id2 string) clustered by (id2) into 2 
buckets stored as orc tblproperties("transactional"="true");
+insert into table testacid1 select key, key from lots_of_rows;
+
+drop table lots_of_row;
+
+select * from testacid1 order by id limit 30;
+select sum(hash(*)) from testacid1 limit 10;
+
+select count(id) from testacid1;
+
+select count(1) from testacid1;
+
+select count(1) from testacid1 where id = '0128';
+
+explain update testacid1 set id = '206' where id = '0128';
+update testacid1 set id = '206' where id = '0128';
+
+select * from testacid1 order by id limit 30;

http://git-wip-us.apache.org/repos/asf/hive/blob/236a32c6/ql/src/test/results/clientpositive/llap/vector_acid4.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/vector_acid4.q.out b/ql/src/test/results/clientpositive/llap/vector_acid4.q.out
new file mode 100644
index 0000000..b1f246d
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/vector_acid4.q.out
@@ -0,0 +1,265 @@
+PREHOOK: query: drop table cross_numbers
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table cross_numbers
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: create table cross_numbers(i string)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@cross_numbers
+POSTHOOK: query: create table cross_numbers(i string)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@cross_numbers
+PREHOOK: query: insert into table cross_numbers select key from src limit 20
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@cross_numbers
+POSTHOOK: query: insert into table cross_numbers select key from src limit 20
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@cross_numbers
+POSTHOOK: Lineage: cross_numbers.i SIMPLE [(src)src.FieldSchema(name:key, 
type:string, comment:default), ]
+PREHOOK: query: drop table lots_of_rows
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table lots_of_rows
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: create table lots_of_rows(key string) stored as orc 
tblproperties("transactional"="false")
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@lots_of_rows
+POSTHOOK: query: create table lots_of_rows(key string) stored as orc 
tblproperties("transactional"="false")
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@lots_of_rows
+Warning: Shuffle Join MERGEJOIN[16][tables = [$hdt$_0, $hdt$_1]] in Stage 
'Reducer 2' is a cross product
+PREHOOK: query: insert into table lots_of_rows select concat(key, '', i) from 
src cross join cross_numbers
+PREHOOK: type: QUERY
+PREHOOK: Input: default@cross_numbers
+PREHOOK: Input: default@src
+PREHOOK: Output: default@lots_of_rows
+POSTHOOK: query: insert into table lots_of_rows select concat(key, '', i) from 
src cross join cross_numbers
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@cross_numbers
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@lots_of_rows
+POSTHOOK: Lineage: lots_of_rows.key EXPRESSION [(src)src.FieldSchema(name:key, 
type:string, comment:default), (cross_numbers)cross_numbers.FieldSchema(name:i, 
type:string, comment:null), ]
+PREHOOK: query: drop table testacid1
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table testacid1
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: create table testacid1(id string, id2 string) clustered by 
(id2) into 2 buckets stored as orc tblproperties("transactional"="true")
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@testacid1
+POSTHOOK: query: create table testacid1(id string, id2 string) clustered by 
(id2) into 2 buckets stored as orc tblproperties("transactional"="true")
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@testacid1
+PREHOOK: query: insert into table testacid1 select key, key from lots_of_rows
+PREHOOK: type: QUERY
+PREHOOK: Input: default@lots_of_rows
+PREHOOK: Output: default@testacid1
+POSTHOOK: query: insert into table testacid1 select key, key from lots_of_rows
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@lots_of_rows
+POSTHOOK: Output: default@testacid1
+POSTHOOK: Lineage: testacid1.id SIMPLE 
[(lots_of_rows)lots_of_rows.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: testacid1.id2 SIMPLE 
[(lots_of_rows)lots_of_rows.FieldSchema(name:key, type:string, comment:null), ]
+PREHOOK: query: drop table lots_of_row
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table lots_of_row
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: select * from testacid1 order by id limit 30
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testacid1
+#### A masked pattern was here ####
+POSTHOOK: query: select * from testacid1 order by id limit 30
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testacid1
+#### A masked pattern was here ####
+0128   0128
+0128   0128
+0128   0128
+0150   0150
+0150   0150
+0150   0150
+0165   0165
+0165   0165
+0165   0165
+0193   0193
+0193   0193
+0193   0193
+0213   0213
+0213   0213
+0213   0213
+0224   0224
+0224   0224
+0224   0224
+0238   0238
+0238   0238
+0238   0238
+0255   0255
+0255   0255
+0255   0255
+0265   0265
+0265   0265
+0265   0265
+027    027
+027    027
+027    027
+PREHOOK: query: select sum(hash(*)) from testacid1 limit 10
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testacid1
+#### A masked pattern was here ####
+POSTHOOK: query: select sum(hash(*)) from testacid1 limit 10
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testacid1
+#### A masked pattern was here ####
+8838111640064
+PREHOOK: query: select count(id) from testacid1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testacid1
+#### A masked pattern was here ####
+POSTHOOK: query: select count(id) from testacid1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testacid1
+#### A masked pattern was here ####
+10000
+PREHOOK: query: select count(1) from testacid1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testacid1
+#### A masked pattern was here ####
+POSTHOOK: query: select count(1) from testacid1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testacid1
+#### A masked pattern was here ####
+10000
+PREHOOK: query: select count(1) from testacid1 where id = '0128'
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testacid1
+#### A masked pattern was here ####
+POSTHOOK: query: select count(1) from testacid1 where id = '0128'
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testacid1
+#### A masked pattern was here ####
+3
+PREHOOK: query: explain update testacid1 set id = '206' where id = '0128'
+PREHOOK: type: QUERY
+POSTHOOK: query: explain update testacid1 set id = '206' where id = '0128'
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-2 depends on stages: Stage-1
+  Stage-0 depends on stages: Stage-2
+  Stage-3 depends on stages: Stage-0
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Reducer 2 <- Map 1 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: testacid1
+                  filterExpr: (id = '0128') (type: boolean)
+                  Statistics: Num rows: 10000 Data size: 1800000 Basic stats: 
COMPLETE Column stats: COMPLETE
+                  Filter Operator
+                    predicate: (id = '0128') (type: boolean)
+                    Statistics: Num rows: 2 Data size: 360 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    Select Operator
+                      expressions: ROW__ID (type: 
struct<writeid:bigint,bucketid:int,rowid:bigint>), id2 (type: string)
+                      outputColumnNames: _col0, _col2
+                      Statistics: Num rows: 2 Data size: 506 Basic stats: 
COMPLETE Column stats: COMPLETE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: 
struct<writeid:bigint,bucketid:int,rowid:bigint>)
+                        sort order: +
+                        Map-reduce partition columns: UDFToInteger(_col0) 
(type: int)
+                        Statistics: Num rows: 2 Data size: 506 Basic stats: 
COMPLETE Column stats: COMPLETE
+                        value expressions: _col2 (type: string)
+            Execution mode: vectorized, llap
+            LLAP IO: may be used (ACID table)
+        Reducer 2 
+            Execution mode: vectorized, llap
+            Reduce Operator Tree:
+              Select Operator
+                expressions: KEY.reducesinkkey0 (type: 
struct<writeid:bigint,bucketid:int,rowid:bigint>), '206' (type: string), 
VALUE._col0 (type: string)
+                outputColumnNames: _col0, _col1, _col2
+                Statistics: Num rows: 2 Data size: 506 Basic stats: COMPLETE 
Column stats: COMPLETE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 2 Data size: 506 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  table:
+                      input format: 
org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+                      output format: 
org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+                      serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+                      name: default.testacid1
+                  Write Type: UPDATE
+
+  Stage: Stage-2
+    Dependency Collection
+
+  Stage: Stage-0
+    Move Operator
+      tables:
+          replace: false
+          table:
+              input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+              output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+              serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+              name: default.testacid1
+          Write Type: UPDATE
+
+  Stage: Stage-3
+    Stats Work
+      Basic Stats Work:
+
+PREHOOK: query: update testacid1 set id = '206' where id = '0128'
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testacid1
+PREHOOK: Output: default@testacid1
+POSTHOOK: query: update testacid1 set id = '206' where id = '0128'
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testacid1
+POSTHOOK: Output: default@testacid1
+PREHOOK: query: select * from testacid1 order by id limit 30
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testacid1
+#### A masked pattern was here ####
+POSTHOOK: query: select * from testacid1 order by id limit 30
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testacid1
+#### A masked pattern was here ####
+0150   0150
+0150   0150
+0150   0150
+0165   0165
+0165   0165
+0165   0165
+0193   0193
+0193   0193
+0193   0193
+0213   0213
+0213   0213
+0213   0213
+0224   0224
+0224   0224
+0224   0224
+0238   0238
+0238   0238
+0238   0238
+0255   0255
+0255   0255
+0255   0255
+0265   0265
+0265   0265
+0265   0265
+027    027
+027    027
+027    027
+0273   0273
+0273   0273
+0273   0273

hive git commit: HIVE-20418 : LLAP IO may not handle ORC files that have row index disabled correctly for queries with no columns selected (Sergey Shelukhin, reviewed by Gopal Vijayaraghavan)

Reply via email to