Repository: hive
Updated Branches:
  refs/heads/master 6b8ddbfcc -> 236a32c64


HIVE-20418 : LLAP IO may not handle ORC files that have row index disabled correctly for queries with no columns selected (Sergey Shelukhin, reviewed by Gopal Vijayaraghavan)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/236a32c6
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/236a32c6
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/236a32c6

Branch: refs/heads/master
Commit: 236a32c645a21b04ccaf7f18db5c6a5aa53586e8
Parents: 6b8ddbf
Author: sergey <ser...@apache.org>
Authored: Wed Aug 22 11:41:50 2018 -0700
Committer: sergey <ser...@apache.org>
Committed: Wed Aug 22 11:41:50 2018 -0700

----------------------------------------------------------------------
 .../test/resources/testconfiguration.properties |   1 +
 .../ql/io/orc/encoded/EncodedReaderImpl.java    |  33 ++-
 .../test/queries/clientpositive/vector_acid4.q  |  44 +++
 .../clientpositive/llap/vector_acid4.q.out      | 265 +++++++++++++++++++
 4 files changed, 335 insertions(+), 8 deletions(-)
----------------------------------------------------------------------
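
The failure mode is easiest to see with select count(1): only the root struct column is included, and the root column has no DATA-area streams. If the file's row index is disabled as well (as with ACID delta files written under hive.exec.orc.delta.streaming.optimizations.enabled=true), there are no index streams either, so the old hasIndexOnlyCols check never fired and the stripe produced no batch at all. The toy driver below walks the predicate this patch introduces for both situations; it is a sketch built from stand-in booleans, not the actual Hive API.

// Sketch only: stand-in flags mirroring the new EncodedReaderImpl logic.
public final class NonProjectionCases {
  public static void main(String[] args) {
    // Case 1: count(1) on a file with the row index disabled. The included
    // root column has no streams at all, so no index stream marks it.
    boolean hasIndexOnlyCols = false; // no index streams exist in the file
    boolean hasAnyNonData = false;    // no non-DATA streams at all
    boolean hasAnyIncludes = true;    // the root column is included
    boolean nonProjectionRead = hasIndexOnlyCols || (!hasAnyNonData && hasAnyIncludes);
    System.out.println("row index disabled: " + nonProjectionRead); // true; false before this patch

    // Case 2: count(1) on a file with row indexes present. The included root
    // column has an index stream, so the pre-existing path still fires.
    hasIndexOnlyCols = true;
    hasAnyNonData = true;
    nonProjectionRead = hasIndexOnlyCols || (!hasAnyNonData && hasAnyIncludes);
    System.out.println("row index present: " + nonProjectionRead); // true
  }
}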


http://git-wip-us.apache.org/repos/asf/hive/blob/236a32c6/itests/src/test/resources/testconfiguration.properties
----------------------------------------------------------------------
diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index d2fe788..983d5a7 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -747,6 +747,7 @@ minillaplocal.query.files=\
   union_remove_26.q,\
   union_top_level.q,\
   update_access_time_non_current_db.q, \
+  vector_acid4.q,\
   vector_annotate_stats_select.q,\
   vector_auto_smb_mapjoin_14.q,\
   vector_char_varchar_1.q,\

http://git-wip-us.apache.org/repos/asf/hive/blob/236a32c6/ql/src/java/org/apache/hadoop/hive/ql/io/orc/encoded/EncodedReaderImpl.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/encoded/EncodedReaderImpl.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/encoded/EncodedReaderImpl.java
index 1b11e0e..346ab5c 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/encoded/EncodedReaderImpl.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/encoded/EncodedReaderImpl.java
@@ -57,6 +57,7 @@ import org.apache.orc.impl.OrcIndex;
 import org.apache.orc.impl.OutStream;
 import org.apache.orc.impl.RecordReaderUtils;
 import org.apache.orc.impl.StreamName;
+import org.apache.orc.impl.RecordReaderImpl.SargApplier;
 import org.apache.orc.impl.StreamName.Area;
 import org.apache.orc.impl.WriterImpl;
 import org.apache.orc.StripeInformation;
@@ -323,15 +324,17 @@ class EncodedReaderImpl implements EncodedReader {
       trace.logColumnRead(i, colRgIx, enc.getKind());
     }
     CreateHelper listToRead = new CreateHelper();
-    boolean hasIndexOnlyCols = false;
+    boolean hasIndexOnlyCols = false, hasAnyNonData = false;
     for (OrcProto.Stream stream : streamList) {
       long length = stream.getLength();
       int colIx = stream.getColumn();
       OrcProto.Stream.Kind streamKind = stream.getKind();
-      if (!physicalFileIncludes[colIx] || StreamName.getArea(streamKind) != StreamName.Area.DATA) {
-        // We have a stream for included column, but in future it might have no data streams.
-        // It's more like "has at least one column included that has an index stream".
-        hasIndexOnlyCols = hasIndexOnlyCols || physicalFileIncludes[colIx];
+      boolean isIndexCol = StreamName.getArea(streamKind) != StreamName.Area.DATA;
+      hasAnyNonData = hasAnyNonData || isIndexCol;
+      // We have a stream for included column, but in future it might have no data streams.
+      // It's more like "has at least one column included that has an index stream".
+      hasIndexOnlyCols = hasIndexOnlyCols || (isIndexCol && physicalFileIncludes[colIx]);
+      if (!physicalFileIncludes[colIx] || isIndexCol) {
         if (isTracingEnabled) {
           LOG.trace("Skipping stream for column " + colIx + ": "
               + streamKind + " at " + offset + ", " + length);
@@ -367,8 +370,22 @@ class EncodedReaderImpl implements EncodedReader {
     boolean hasFileId = this.fileKey != null;
     if (listToRead.get() == null) {
       // No data to read for this stripe. Check if we have some included index-only columns.
-      // TODO: there may be a bug here. Could there be partial RG filtering on index-only column?
-      if (hasIndexOnlyCols && (rgs == null)) {
+      // For example, count(1) would have the root column, that has no data stream, included.
+      // It may also happen that we have a column included with no streams whatsoever. That
+      // should only be possible if the file has no index streams.
+      boolean hasAnyIncludes = false;
+      if (!hasIndexOnlyCols) {
+        for (int i = 0; i < physicalFileIncludes.length; ++i) {
+          if (!physicalFileIncludes[i]) continue;
+          hasAnyIncludes = true;
+          break;
+        }
+      }
+      boolean nonProjectionRead = hasIndexOnlyCols || (!hasAnyNonData && hasAnyIncludes);
+
+      // TODO: Could there be partial RG filtering w/no projection?
+      //       We should probably just disable filtering for such cases if they exist.
+      if (nonProjectionRead && (rgs == SargApplier.READ_ALL_RGS)) {
         OrcEncodedColumnBatch ecb = POOLS.ecbPool.take();
         ecb.init(fileKey, stripeIx, OrcEncodedColumnBatch.ALL_RGS, physicalFileIncludes.length);
         try {
@@ -1004,7 +1021,7 @@ class EncodedReaderImpl implements EncodedReader {
       if (current instanceof CacheChunk) {
         // 2a. This is a decoded compression buffer, add as is.
         CacheChunk cc = (CacheChunk)current;
-        if (isTracingEnabled) { // TODO# HERE unaccompanied lock
+        if (isTracingEnabled) {
           LOG.trace("Locking " + cc.getBuffer() + " due to reuse");
         }
         cacheWrapper.reuseBuffer(cc.getBuffer());
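
Pulled out of the diff context, the new decision reduces to the small predicate below. This is a self-contained sketch under simplifying assumptions: StreamInfo and Area are stand-ins for the ORC OrcProto.Stream and StreamName.Area types, and the row-group guard (rgs == SargApplier.READ_ALL_RGS above) is left out.

import java.util.List;

// Stand-in types; the real reader inspects OrcProto.Stream and StreamName.Area.
final class NonProjectionReadSketch {
  enum Area { DATA, INDEX }

  static final class StreamInfo {
    final int colIx;  // file column the stream belongs to
    final Area area;  // DATA stream vs. index-area stream
    StreamInfo(int colIx, Area area) { this.colIx = colIx; this.area = area; }
  }

  // True when a stripe that yields no data streams to read should still emit
  // an (empty) batch: either an included column has only index streams (the
  // root column under count(1)), or the file has no index streams at all
  // (row index disabled) while some column is still included.
  static boolean isNonProjectionRead(boolean[] includes, List<StreamInfo> streams) {
    boolean hasIndexOnlyCols = false, hasAnyNonData = false;
    for (StreamInfo s : streams) {
      boolean isIndexCol = s.area != Area.DATA;
      hasAnyNonData |= isIndexCol;
      hasIndexOnlyCols |= isIndexCol && includes[s.colIx];
    }
    if (hasIndexOnlyCols) {
      return true;
    }
    boolean hasAnyIncludes = false;
    for (boolean included : includes) {
      hasAnyIncludes |= included;
    }
    return !hasAnyNonData && hasAnyIncludes;
  }
}

The vector_acid4.q test that follows provokes exactly the no-index case: with hive.exec.orc.delta.streaming.optimizations.enabled=true the ACID delta files are written without a row index, and select count(1) then scans them with no data columns projected.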

http://git-wip-us.apache.org/repos/asf/hive/blob/236a32c6/ql/src/test/queries/clientpositive/vector_acid4.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/vector_acid4.q b/ql/src/test/queries/clientpositive/vector_acid4.q
new file mode 100644
index 0000000..628ecb5
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/vector_acid4.q
@@ -0,0 +1,44 @@
+--! qt:dataset:src
+
+set hive.support.concurrency=true;
+set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
+set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.exec.dynamic.partition=true;
+set hive.vectorized.execution.enabled=true;
+set hive.compute.query.using.stats=false;
+set hive.fetch.task.conversion=none;
+set hive.llap.io.enabled=true;
+set hive.compute.query.using.stats=false;
+SET hive.exec.orc.default.row.index.stride=1000;
+set hive.mapred.mode=nonstrict;
+
+set hive.exec.orc.delta.streaming.optimizations.enabled=true;
+
+
+drop table cross_numbers;
+create table cross_numbers(i string);
+insert into table cross_numbers select key from src limit 20;
+
+drop table lots_of_rows;
+create table lots_of_rows(key string) stored as orc tblproperties("transactional"="false");
+insert into table lots_of_rows select concat(key, '', i) from src cross join cross_numbers;
+
+drop table testacid1;
+create table testacid1(id string, id2 string) clustered by (id2) into 2 buckets stored as orc tblproperties("transactional"="true");
+insert into table testacid1 select key, key from lots_of_rows;
+
+drop table lots_of_row;
+
+select * from testacid1 order by id limit 30;
+select sum(hash(*)) from testacid1 limit 10;
+
+select count(id) from testacid1;
+
+select count(1) from testacid1;
+
+select count(1) from testacid1 where id = '0128';
+
+explain update testacid1 set id = '206' where id = '0128';
+update testacid1 set id = '206' where id = '0128';
+
+select * from testacid1 order by id limit 30;

http://git-wip-us.apache.org/repos/asf/hive/blob/236a32c6/ql/src/test/results/clientpositive/llap/vector_acid4.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/vector_acid4.q.out b/ql/src/test/results/clientpositive/llap/vector_acid4.q.out
new file mode 100644
index 0000000..b1f246d
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/vector_acid4.q.out
@@ -0,0 +1,265 @@
+PREHOOK: query: drop table cross_numbers
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table cross_numbers
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: create table cross_numbers(i string)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@cross_numbers
+POSTHOOK: query: create table cross_numbers(i string)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@cross_numbers
+PREHOOK: query: insert into table cross_numbers select key from src limit 20
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@cross_numbers
+POSTHOOK: query: insert into table cross_numbers select key from src limit 20
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@cross_numbers
+POSTHOOK: Lineage: cross_numbers.i SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+PREHOOK: query: drop table lots_of_rows
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table lots_of_rows
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: create table lots_of_rows(key string) stored as orc tblproperties("transactional"="false")
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@lots_of_rows
+POSTHOOK: query: create table lots_of_rows(key string) stored as orc tblproperties("transactional"="false")
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@lots_of_rows
+Warning: Shuffle Join MERGEJOIN[16][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product
+PREHOOK: query: insert into table lots_of_rows select concat(key, '', i) from src cross join cross_numbers
+PREHOOK: type: QUERY
+PREHOOK: Input: default@cross_numbers
+PREHOOK: Input: default@src
+PREHOOK: Output: default@lots_of_rows
+POSTHOOK: query: insert into table lots_of_rows select concat(key, '', i) from src cross join cross_numbers
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@cross_numbers
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@lots_of_rows
+POSTHOOK: Lineage: lots_of_rows.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), (cross_numbers)cross_numbers.FieldSchema(name:i, type:string, comment:null), ]
+PREHOOK: query: drop table testacid1
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table testacid1
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: create table testacid1(id string, id2 string) clustered by (id2) into 2 buckets stored as orc tblproperties("transactional"="true")
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@testacid1
+POSTHOOK: query: create table testacid1(id string, id2 string) clustered by (id2) into 2 buckets stored as orc tblproperties("transactional"="true")
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@testacid1
+PREHOOK: query: insert into table testacid1 select key, key from lots_of_rows
+PREHOOK: type: QUERY
+PREHOOK: Input: default@lots_of_rows
+PREHOOK: Output: default@testacid1
+POSTHOOK: query: insert into table testacid1 select key, key from lots_of_rows
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@lots_of_rows
+POSTHOOK: Output: default@testacid1
+POSTHOOK: Lineage: testacid1.id SIMPLE [(lots_of_rows)lots_of_rows.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: testacid1.id2 SIMPLE [(lots_of_rows)lots_of_rows.FieldSchema(name:key, type:string, comment:null), ]
+PREHOOK: query: drop table lots_of_row
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table lots_of_row
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: select * from testacid1 order by id limit 30
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testacid1
+#### A masked pattern was here ####
+POSTHOOK: query: select * from testacid1 order by id limit 30
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testacid1
+#### A masked pattern was here ####
+0128   0128
+0128   0128
+0128   0128
+0150   0150
+0150   0150
+0150   0150
+0165   0165
+0165   0165
+0165   0165
+0193   0193
+0193   0193
+0193   0193
+0213   0213
+0213   0213
+0213   0213
+0224   0224
+0224   0224
+0224   0224
+0238   0238
+0238   0238
+0238   0238
+0255   0255
+0255   0255
+0255   0255
+0265   0265
+0265   0265
+0265   0265
+027    027
+027    027
+027    027
+PREHOOK: query: select sum(hash(*)) from testacid1 limit 10
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testacid1
+#### A masked pattern was here ####
+POSTHOOK: query: select sum(hash(*)) from testacid1 limit 10
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testacid1
+#### A masked pattern was here ####
+8838111640064
+PREHOOK: query: select count(id) from testacid1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testacid1
+#### A masked pattern was here ####
+POSTHOOK: query: select count(id) from testacid1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testacid1
+#### A masked pattern was here ####
+10000
+PREHOOK: query: select count(1) from testacid1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testacid1
+#### A masked pattern was here ####
+POSTHOOK: query: select count(1) from testacid1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testacid1
+#### A masked pattern was here ####
+10000
+PREHOOK: query: select count(1) from testacid1 where id = '0128'
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testacid1
+#### A masked pattern was here ####
+POSTHOOK: query: select count(1) from testacid1 where id = '0128'
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testacid1
+#### A masked pattern was here ####
+3
+PREHOOK: query: explain update testacid1 set id = '206' where id = '0128'
+PREHOOK: type: QUERY
+POSTHOOK: query: explain update testacid1 set id = '206' where id = '0128'
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-2 depends on stages: Stage-1
+  Stage-0 depends on stages: Stage-2
+  Stage-3 depends on stages: Stage-0
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Reducer 2 <- Map 1 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: testacid1
+                  filterExpr: (id = '0128') (type: boolean)
+                  Statistics: Num rows: 10000 Data size: 1800000 Basic stats: COMPLETE Column stats: COMPLETE
+                  Filter Operator
+                    predicate: (id = '0128') (type: boolean)
+                    Statistics: Num rows: 2 Data size: 360 Basic stats: COMPLETE Column stats: COMPLETE
+                    Select Operator
+                      expressions: ROW__ID (type: struct<writeid:bigint,bucketid:int,rowid:bigint>), id2 (type: string)
+                      outputColumnNames: _col0, _col2
+                      Statistics: Num rows: 2 Data size: 506 Basic stats: COMPLETE Column stats: COMPLETE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: struct<writeid:bigint,bucketid:int,rowid:bigint>)
+                        sort order: +
+                        Map-reduce partition columns: UDFToInteger(_col0) (type: int)
+                        Statistics: Num rows: 2 Data size: 506 Basic stats: COMPLETE Column stats: COMPLETE
+                        value expressions: _col2 (type: string)
+            Execution mode: vectorized, llap
+            LLAP IO: may be used (ACID table)
+        Reducer 2 
+            Execution mode: vectorized, llap
+            Reduce Operator Tree:
+              Select Operator
+                expressions: KEY.reducesinkkey0 (type: struct<writeid:bigint,bucketid:int,rowid:bigint>), '206' (type: string), VALUE._col0 (type: string)
+                outputColumnNames: _col0, _col1, _col2
+                Statistics: Num rows: 2 Data size: 506 Basic stats: COMPLETE Column stats: COMPLETE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 2 Data size: 506 Basic stats: COMPLETE Column stats: COMPLETE
+                  table:
+                      input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+                      serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+                      name: default.testacid1
+                  Write Type: UPDATE
+
+  Stage: Stage-2
+    Dependency Collection
+
+  Stage: Stage-0
+    Move Operator
+      tables:
+          replace: false
+          table:
+              input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+              output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+              serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+              name: default.testacid1
+          Write Type: UPDATE
+
+  Stage: Stage-3
+    Stats Work
+      Basic Stats Work:
+
+PREHOOK: query: update testacid1 set id = '206' where id = '0128'
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testacid1
+PREHOOK: Output: default@testacid1
+POSTHOOK: query: update testacid1 set id = '206' where id = '0128'
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testacid1
+POSTHOOK: Output: default@testacid1
+PREHOOK: query: select * from testacid1 order by id limit 30
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testacid1
+#### A masked pattern was here ####
+POSTHOOK: query: select * from testacid1 order by id limit 30
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testacid1
+#### A masked pattern was here ####
+0150   0150
+0150   0150
+0150   0150
+0165   0165
+0165   0165
+0165   0165
+0193   0193
+0193   0193
+0193   0193
+0213   0213
+0213   0213
+0213   0213
+0224   0224
+0224   0224
+0224   0224
+0238   0238
+0238   0238
+0238   0238
+0255   0255
+0255   0255
+0255   0255
+0265   0265
+0265   0265
+0265   0265
+027    027
+027    027
+027    027
+0273   0273
+0273   0273
+0273   0273
