This is an automated email from the ASF dual-hosted git repository.
sankarh pushed a commit to branch branch-3
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/branch-3 by this push:
new 0e2d0757357 HIVE-27807: Backport of HIVE-20629, HIVE-20705, HIVE-20734 to branch-3 (#4809)
0e2d0757357 is described below
commit 0e2d07573570cb66fa9bf8af05ca79ccee55e21f
Author: Aman Raj <[email protected]>
AuthorDate: Tue Oct 24 12:03:15 2023 +0530
HIVE-27807: Backport of HIVE-20629, HIVE-20705, HIVE-20734 to branch-3 (#4809)
* HIVE-20629: Hive incremental replication fails with events missing error if database is kept idle for more than an hour (Mahesh Kumar Behera, reviewed by Sankar Hariappan)
* HIVE-20705: Vectorization: Native Vector MapJoin doesn't support Complex Big Table values
* HIVE-20734: Beeline: When beeline-site.xml is present and hive CLI redirects to beeline, it should use the system username/dummy password instead of prompting for one
---------
Co-authored-by: Sankar Hariappan <[email protected]>
Co-authored-by: Matt McCline <[email protected]>
Co-authored-by: Vaibhav Gumashta <[email protected]>
Signed-off-by: Sankar Hariappan <[email protected]>
Closes (#4809)
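For context on the HIVE-20629 item above, the following is a minimal, hypothetical sketch of the scenario the backport fixes (the database names, event id, and dump path are illustrative and not taken from this commit): an incremental REPL DUMP taken after an idle period produces an empty events directory, and REPL LOAD now advances the target's last repl id instead of failing.

    -- Source cluster: incremental dump after an idle period; no new events for this database.
    REPL DUMP src_db FROM 1000;    -- 1000 stands for the last replicated event id
    -- Target cluster: loading the empty incremental dump no longer fails with
    -- "No data to load in path ..."; the database's last repl id is set to the dump's end event id.
    REPL LOAD tgt_db FROM '/tmp/repl/dump_dir';
    REPL STATUS tgt_db;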
---
bin/ext/beeline.sh | 7 +-
bin/hive | 1 +
.../TestReplicationScenariosAcrossInstances.java | 40 +++
.../test/resources/testconfiguration.properties | 1 +
.../hadoop/hive/ql/exec/repl/ReplLoadWork.java | 9 +-
.../incremental/IncrementalLoadEventsIterator.java | 4 +-
.../incremental/IncrementalLoadTasksBuilder.java | 20 +-
.../hive/ql/optimizer/physical/Vectorizer.java | 18 +-
.../hive/ql/parse/ReplicationSemanticAnalyzer.java | 15 +-
.../apache/hadoop/hive/ql/plan/MapJoinDesc.java | 10 +
.../hadoop/hive/ql/plan/VectorMapJoinDesc.java | 14 +
.../clientpositive/vector_mapjoin_complex_values.q | 34 ++
.../llap/vector_mapjoin_complex_values.q.out | 355 +++++++++++++++++++++
13 files changed, 500 insertions(+), 28 deletions(-)
diff --git a/bin/ext/beeline.sh b/bin/ext/beeline.sh
index 8052c452bac..5bf7fe67503 100644
--- a/bin/ext/beeline.sh
+++ b/bin/ext/beeline.sh
@@ -32,7 +32,12 @@ beeline () {
export HADOOP_CLASSPATH="${hadoopClasspath}${HIVE_CONF_DIR}:${beelineJarPath}:${superCsvJarPath}:${jlineJarPath}"
export HADOOP_CLIENT_OPTS="$HADOOP_CLIENT_OPTS -Dlog4j.configurationFile=beeline-log4j2.properties "
-  exec $HADOOP jar ${beelineJarPath} $CLASS $HIVE_OPTS "$@"
+  # if CLIUSER is not empty, then pass it as user id / password during beeline redirect
+  if [ -z $CLIUSER ] ; then
+    exec $HADOOP jar ${beelineJarPath} $CLASS $HIVE_OPTS "$@"
+  else
+    exec $HADOOP jar ${beelineJarPath} $CLASS $HIVE_OPTS "$@" -n "${CLIUSER}" -p "${CLIUSER}"
+ fi
}
beeline_help () {
diff --git a/bin/hive b/bin/hive
index a7ae2f571e9..ef9ef955d23 100755
--- a/bin/hive
+++ b/bin/hive
@@ -86,6 +86,7 @@ if [ "$SERVICE" = "" ] ; then
fi
if [[ "$SERVICE" == "cli" && "$USE_BEELINE_FOR_HIVE_CLI" == "true" ]] ; then
+ CLIUSER=`whoami`
SERVICE="beeline"
fi
diff --git a/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/parse/TestReplicationScenariosAcrossInstances.java b/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/parse/TestReplicationScenariosAcrossInstances.java
index 1d0a9c8b447..12ec8e66731 100644
--- a/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/parse/TestReplicationScenariosAcrossInstances.java
+++ b/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/parse/TestReplicationScenariosAcrossInstances.java
@@ -961,6 +961,46 @@ public class TestReplicationScenariosAcrossInstances {
assertFalse(props.containsKey(SOURCE_OF_REPLICATION));
}
+ @Test
+ public void testIncrementalDumpEmptyDumpDirectory() throws Throwable {
+ WarehouseInstance.Tuple tuple = primary.dump(primaryDbName, null);
+
+ replica.load(replicatedDbName, tuple.dumpLocation)
+ .status(replicatedDbName)
+ .verifyResult(tuple.lastReplicationId);
+
+ tuple = primary.dump(primaryDbName, tuple.lastReplicationId);
+
+ replica.load(replicatedDbName, tuple.dumpLocation)
+ .status(replicatedDbName)
+ .verifyResult(tuple.lastReplicationId);
+
+    // create events for some other database and then dump the primaryDbName to dump an empty directory.
+ String testDbName = primaryDbName + "_test";
+ tuple = primary.run(" create database " + testDbName)
+ .run("create table " + testDbName + ".tbl (fld int)")
+ .dump(primaryDbName, tuple.lastReplicationId);
+
+    // Incremental load to existing database with empty dump directory should set the repl id to the last event at src.
+ replica.load(replicatedDbName, tuple.dumpLocation)
+ .status(replicatedDbName)
+ .verifyResult(tuple.lastReplicationId);
+
+    // Incremental load to non existing db should return database not exist error.
+    tuple = primary.dump("someJunkDB", tuple.lastReplicationId);
+    CommandProcessorResponse response = replica.runCommand("REPL LOAD someJunkDB from " + tuple.dumpLocation);
+    response.getErrorMessage().toLowerCase().contains("org.apache.hadoop.hive.ql.metadata.hiveException: " +
+        "database does not exist");
+
+    // Bootstrap load from an empty dump directory should return empty load directory error.
+    tuple = primary.dump("someJunkDB", null);
+    response = replica.runCommand("REPL LOAD someJunkDB from " + tuple.dumpLocation);
+    response.getErrorMessage().toLowerCase().contains("org.apache.hadoop.hive.ql.parse.semanticException:" +
+        " no data to load in path");
+
+ primary.run(" drop database if exists " + testDbName + " cascade");
+ }
+
@Test
public void testIncrementalDumpMultiIteration() throws Throwable {
WarehouseInstance.Tuple bootstrapTuple = primary.dump(primaryDbName, null);
diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index 16a3e082d99..52cde10efdc 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -807,6 +807,7 @@ minillaplocal.query.files=\
vector_like_2.q,\
vector_llap_io_data_conversion.q,\
vector_llap_text_1.q,\
+ vector_mapjoin_complex_values.q,\
vector_mapjoin_reduce.q,\
vector_null_map.q,\
vector_number_compare_projection.q,\
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/repl/ReplLoadWork.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/repl/ReplLoadWork.java
index fdbcb15c72d..ff21b6a601d 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/repl/ReplLoadWork.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/repl/ReplLoadWork.java
@@ -53,7 +53,7 @@ public class ReplLoadWork implements Serializable {
final LineageState sessionStateLineageState;
public ReplLoadWork(HiveConf hiveConf, String dumpDirectory, String dbNameToLoadIn,
-      String tableNameToLoadIn, LineageState lineageState, boolean isIncrementalDump) throws IOException {
+      String tableNameToLoadIn, LineageState lineageState, boolean isIncrementalDump, Long eventTo) throws IOException {
this.tableNameToLoadIn = tableNameToLoadIn;
sessionStateLineageState = lineageState;
this.dumpDirectory = dumpDirectory;
@@ -64,7 +64,7 @@ public class ReplLoadWork implements Serializable {
this.bootstrapIterator = null;
this.constraintsIterator = null;
incrementalLoad = new IncrementalLoadTasksBuilder(dbNameToLoadIn, tableNameToLoadIn, dumpDirectory,
-          incrementalIterator, hiveConf);
+          incrementalIterator, hiveConf, eventTo);
} else {
this.bootstrapIterator = new BootstrapEventsIterator(dumpDirectory, dbNameToLoadIn, hiveConf);
this.constraintsIterator = new ConstraintEventsIterator(dumpDirectory, hiveConf);
@@ -73,11 +73,6 @@ public class ReplLoadWork implements Serializable {
}
}
-  public ReplLoadWork(HiveConf hiveConf, String dumpDirectory, String dbNameOrPattern,
- LineageState lineageState) throws IOException {
- this(hiveConf, dumpDirectory, dbNameOrPattern, null, lineageState, false);
- }
-
public BootstrapEventsIterator iterator() {
return bootstrapIterator;
}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/repl/incremental/IncrementalLoadEventsIterator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/repl/incremental/IncrementalLoadEventsIterator.java
index 4b37c8dd989..5638ace714d 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/repl/incremental/IncrementalLoadEventsIterator.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/repl/incremental/IncrementalLoadEventsIterator.java
@@ -44,7 +44,9 @@ public class IncrementalLoadEventsIterator implements Iterator<FileStatus> {
FileSystem fs = eventPath.getFileSystem(conf);
eventDirs = fs.listStatus(eventPath, EximUtil.getDirectoryFilter(fs));
if ((eventDirs == null) || (eventDirs.length == 0)) {
-      throw new IllegalArgumentException("No data to load in path " + loadPath);
+ currentIndex = 0;
+ numEvents = 0;
+ return;
}
// For event dump, each sub-dir is an individual event dump.
// We need to guarantee that the directory listing we got is in order of event id.
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/repl/incremental/IncrementalLoadTasksBuilder.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/repl/incremental/IncrementalLoadTasksBuilder.java
index 2a9388772cf..60ab9b64a10 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/repl/incremental/IncrementalLoadTasksBuilder.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/repl/incremental/IncrementalLoadTasksBuilder.java
@@ -64,15 +64,16 @@ import java.util.HashSet;
public class IncrementalLoadTasksBuilder {
private final String dbName, tableName;
private final IncrementalLoadEventsIterator iterator;
- private HashSet<ReadEntity> inputs;
- private HashSet<WriteEntity> outputs;
+ private final HashSet<ReadEntity> inputs;
+ private final HashSet<WriteEntity> outputs;
private Logger log;
private final HiveConf conf;
private final ReplLogger replLogger;
private static long numIteration;
+ private final Long eventTo;
public IncrementalLoadTasksBuilder(String dbName, String tableName, String loadPath,
-                                     IncrementalLoadEventsIterator iterator, HiveConf conf) {
+                                     IncrementalLoadEventsIterator iterator, HiveConf conf, Long eventTo) {
this.dbName = dbName;
this.tableName = tableName;
this.iterator = iterator;
@@ -83,6 +84,7 @@ public class IncrementalLoadTasksBuilder {
replLogger = new IncrementalLoadLogger(dbName, loadPath, iterator.getNumEvents());
numIteration = 0;
replLogger.startLog();
+ this.eventTo = eventTo;
}
public Task<? extends Serializable> build(DriverContext driverContext, Hive hive, Logger log,
@@ -151,6 +153,18 @@ public class IncrementalLoadTasksBuilder {
// add load task to start the next iteration
taskChainTail.addDependentTask(TaskFactory.get(loadWork, conf));
} else {
+      // if no events were replayed, then add a task to update the last repl id of the database/table to last event id.
+      if (taskChainTail == evTaskRoot) {
+        String lastEventid = eventTo.toString();
+        if (StringUtils.isEmpty(tableName)) {
+          taskChainTail = dbUpdateReplStateTask(dbName, lastEventid, taskChainTail);
+          this.log.debug("no events to replay, set last repl id of db " + dbName + " to " + lastEventid);
+        } else {
+          taskChainTail = tableUpdateReplStateTask(dbName, tableName, null, lastEventid, taskChainTail);
+          this.log.debug("no events to replay, set last repl id of table " + dbName + "." + tableName + " to " +
+              lastEventid);
+ }
+ }
Map<String, String> dbProps = new HashMap<>();
dbProps.put(ReplicationSpec.KEY.CURR_STATE_ID.toString(), String.valueOf(lastReplayedEvent));
ReplStateLogWork replStateLogWork = new ReplStateLogWork(replLogger, dbProps);
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
index 2dd12ef1918..22915b50f68 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
@@ -3500,6 +3500,9 @@ public class Vectorizer implements PhysicalPlanResolver {
* Similarly, we need a mapping since a value expression can be a calculation and the value
* will go into a scratch column.
*/
+ boolean supportsValueTypes = true; // Assume.
+ HashSet<String> notSupportedValueTypes = new HashSet<String>();
+
int[] bigTableValueColumnMap = new int[allBigTableValueExpressions.length];
String[] bigTableValueColumnNames = new String[allBigTableValueExpressions.length];
TypeInfo[] bigTableValueTypeInfos = new TypeInfo[allBigTableValueExpressions.length];
@@ -3514,7 +3517,13 @@ public class Vectorizer implements PhysicalPlanResolver {
ExprNodeDesc exprNode = bigTableExprs.get(i);
bigTableValueColumnNames[i] = exprNode.toString();
- bigTableValueTypeInfos[i] = exprNode.getTypeInfo();
+ TypeInfo typeInfo = exprNode.getTypeInfo();
+ if (!(typeInfo instanceof PrimitiveTypeInfo)) {
+ supportsValueTypes = false;
+ Category category = typeInfo.getCategory();
+ notSupportedValueTypes.add(category.toString());
+ }
+ bigTableValueTypeInfos[i] = typeInfo;
}
if (bigTableValueExpressionsList.size() == 0) {
slimmedBigTableValueExpressions = null;
@@ -3747,6 +3756,10 @@ public class Vectorizer implements PhysicalPlanResolver {
if (!supportsKeyTypes) {
vectorDesc.setNotSupportedKeyTypes(new ArrayList(notSupportedKeyTypes));
}
+ vectorDesc.setSupportsValueTypes(supportsValueTypes);
+ if (!supportsValueTypes) {
+      vectorDesc.setNotSupportedValueTypes(new ArrayList(notSupportedValueTypes));
+ }
// Check common conditions for both Optimized and Fast Hash Tables.
boolean result = true; // Assume.
@@ -3756,7 +3769,8 @@ public class Vectorizer implements PhysicalPlanResolver {
!oneMapJoinCondition ||
hasNullSafes ||
!smallTableExprVectorizes ||
- outerJoinHasNoKeys) {
+ outerJoinHasNoKeys ||
+ !supportsValueTypes) {
result = false;
}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/ReplicationSemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/ReplicationSemanticAnalyzer.java
index f83146125f3..fe0cec010e0 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/ReplicationSemanticAnalyzer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/ReplicationSemanticAnalyzer.java
@@ -328,21 +328,8 @@ public class ReplicationSemanticAnalyzer extends BaseSemanticAnalyzer {
LOG.debug("{} contains an bootstrap dump", loadPath);
}
-      if ((!evDump) && (tblNameOrPattern != null) && !(tblNameOrPattern.isEmpty())) {
-        ReplLoadWork replLoadWork = new ReplLoadWork(conf, loadPath.toString(), dbNameOrPattern,
- tblNameOrPattern, queryState.getLineageState(), false);
- rootTasks.add(TaskFactory.get(replLoadWork, conf));
- return;
- }
-
- FileStatus[] srcs = LoadSemanticAnalyzer.matchFilesOrDir(fs, loadPath);
- if (srcs == null || (srcs.length == 0)) {
- LOG.warn("Nothing to load at {}", loadPath.toUri().toString());
- return;
- }
-
ReplLoadWork replLoadWork = new ReplLoadWork(conf, loadPath.toString(), dbNameOrPattern,
-          tblNameOrPattern, queryState.getLineageState(), evDump);
+          tblNameOrPattern, queryState.getLineageState(), evDump, dmd.getEventTo());
rootTasks.add(TaskFactory.get(replLoadWork, conf));
} catch (Exception e) {
// TODO : simple wrap & rethrow for now, clean up with error codes
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/MapJoinDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/MapJoinDesc.java
index 83b34161a75..7834b182a78 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/plan/MapJoinDesc.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/MapJoinDesc.java
@@ -478,6 +478,16 @@ public class MapJoinDesc extends JoinDesc implements Serializable {
vectorMapJoinDesc.getSupportsKeyTypes(),
"Optimized Table and Supports Key Types"));
}
+    final boolean supportsValueTypes = vectorMapJoinDesc.getSupportsValueTypes();
+ if (!supportsValueTypes) {
+
+ // Only add this condition when false to avoid mega-Q file update.
+ conditionList.add(
+ new VectorizationCondition(
+ false,
+ "Supports Value Types " +
+ vectorMapJoinDesc.getNotSupportedValueTypes().toString()));
+ }
VectorizationCondition[] conditions =
conditionList.toArray(new VectorizationCondition[0]);
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/VectorMapJoinDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/VectorMapJoinDesc.java
index 58032ca0572..3c7c69d5822 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/plan/VectorMapJoinDesc.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/VectorMapJoinDesc.java
@@ -204,6 +204,8 @@ public class VectorMapJoinDesc extends AbstractVectorDesc {
private boolean isHybridHashJoin;
private boolean supportsKeyTypes;
private List<String> notSupportedKeyTypes;
+ private boolean supportsValueTypes;
+ private List<String> notSupportedValueTypes;
private boolean smallTableExprVectorizes;
private boolean outerJoinHasNoKeys;
@@ -249,6 +251,18 @@ public class VectorMapJoinDesc extends AbstractVectorDesc {
public List<String> getNotSupportedKeyTypes() {
return notSupportedKeyTypes;
}
+ public void setSupportsValueTypes(boolean supportsValueTypes) {
+ this.supportsValueTypes = supportsValueTypes;
+ }
+ public boolean getSupportsValueTypes() {
+ return supportsValueTypes;
+ }
+ public void setNotSupportedValueTypes(List<String> notSupportedValueTypes) {
+ this.notSupportedValueTypes = notSupportedValueTypes;
+ }
+ public List<String> getNotSupportedValueTypes() {
+ return notSupportedValueTypes;
+ }
public void setSmallTableExprVectorizes(boolean smallTableExprVectorizes) {
this.smallTableExprVectorizes = smallTableExprVectorizes;
}
diff --git a/ql/src/test/queries/clientpositive/vector_mapjoin_complex_values.q b/ql/src/test/queries/clientpositive/vector_mapjoin_complex_values.q
new file mode 100644
index 00000000000..1c88daaefd4
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/vector_mapjoin_complex_values.q
@@ -0,0 +1,34 @@
+set hive.mapred.mode=nonstrict;
+set hive.explain.user=false;
+set hive.vectorized.execution.enabled=true;
+set hive.auto.convert.join=true;
+set hive.mapjoin.hybridgrace.hashtable=false;
+set hive.fetch.task.conversion=none;
+set hive.cli.print.header=true;
+set hive.support.concurrency=true;
+set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
+set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
+
+create table census(
+ssn int,
+name string,
+city string,
+email string)
+row format delimited
+fields terminated by ',';
+
+insert into census values(100,"raj","san jose","email");
+
+create table census_clus(
+ssn int,
+name string,
+city string,
+email string)
+clustered by (ssn) into 4 buckets stored as orc TBLPROPERTIES ('transactional'='true');
+
+insert into table census_clus select * from census;
+
+EXPLAIN VECTORIZATION DETAIL
+UPDATE census_clus SET name = 'updated name' where ssn=100 and EXISTS (select distinct ssn from census where ssn=census_clus.ssn);
+
+UPDATE census_clus SET name = 'updated name' where ssn=100 and EXISTS (select distinct ssn from census where ssn=census_clus.ssn);
\ No newline at end of file
diff --git a/ql/src/test/results/clientpositive/llap/vector_mapjoin_complex_values.q.out b/ql/src/test/results/clientpositive/llap/vector_mapjoin_complex_values.q.out
new file mode 100644
index 00000000000..d7fe5f1d0dc
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/vector_mapjoin_complex_values.q.out
@@ -0,0 +1,355 @@
+PREHOOK: query: create table census(
+ssn int,
+name string,
+city string,
+email string)
+row format delimited
+fields terminated by ','
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@census
+POSTHOOK: query: create table census(
+ssn int,
+name string,
+city string,
+email string)
+row format delimited
+fields terminated by ','
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@census
+PREHOOK: query: insert into census values(100,"raj","san jose","email")
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@census
+POSTHOOK: query: insert into census values(100,"raj","san jose","email")
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@census
+POSTHOOK: Lineage: census.city SCRIPT []
+POSTHOOK: Lineage: census.email SCRIPT []
+POSTHOOK: Lineage: census.name SCRIPT []
+POSTHOOK: Lineage: census.ssn SCRIPT []
+col1 col2 col3 col4
+PREHOOK: query: create table census_clus(
+ssn int,
+name string,
+city string,
+email string)
+clustered by (ssn) into 4 buckets stored as orc TBLPROPERTIES ('transactional'='true')
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@census_clus
+POSTHOOK: query: create table census_clus(
+ssn int,
+name string,
+city string,
+email string)
+clustered by (ssn) into 4 buckets stored as orc TBLPROPERTIES ('transactional'='true')
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@census_clus
+PREHOOK: query: insert into table census_clus select * from census
+PREHOOK: type: QUERY
+PREHOOK: Input: default@census
+PREHOOK: Output: default@census_clus
+POSTHOOK: query: insert into table census_clus select * from census
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@census
+POSTHOOK: Output: default@census_clus
+POSTHOOK: Lineage: census_clus.city SIMPLE [(census)census.FieldSchema(name:city, type:string, comment:null), ]
+POSTHOOK: Lineage: census_clus.email SIMPLE [(census)census.FieldSchema(name:email, type:string, comment:null), ]
+POSTHOOK: Lineage: census_clus.name SIMPLE [(census)census.FieldSchema(name:name, type:string, comment:null), ]
+POSTHOOK: Lineage: census_clus.ssn SIMPLE [(census)census.FieldSchema(name:ssn, type:int, comment:null), ]
+census.ssn census.name census.city census.email
+PREHOOK: query: EXPLAIN VECTORIZATION DETAIL
+UPDATE census_clus SET name = 'updated name' where ssn=100 and EXISTS (select distinct ssn from census where ssn=census_clus.ssn)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@census
+PREHOOK: Input: default@census_clus
+PREHOOK: Output: default@census_clus
+POSTHOOK: query: EXPLAIN VECTORIZATION DETAIL
+UPDATE census_clus SET name = 'updated name' where ssn=100 and EXISTS (select distinct ssn from census where ssn=census_clus.ssn)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@census
+POSTHOOK: Input: default@census_clus
+POSTHOOK: Output: default@census_clus
+Explain
+PLAN VECTORIZATION:
+ enabled: true
+ enabledConditionsMet: [hive.vectorized.execution.enabled IS true]
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 depends on stages: Stage-2
+ Stage-3 depends on stages: Stage-0
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Map 1 <- Reducer 4 (BROADCAST_EDGE)
+ Reducer 2 <- Map 1 (SIMPLE_EDGE)
+ Reducer 4 <- Map 3 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: census_clus
+                  Statistics: Num rows: 1 Data size: 185 Basic stats: COMPLETE Column stats: COMPLETE
+                  TableScan Vectorization:
+                      native: true
+                      vectorizationSchemaColumns: [0:ssn:int, 1:name:string, 2:city:string, 3:email:string, 4:ROW__ID:struct<writeid:bigint,bucketid:int,rowid:bigint>]
+ Filter Operator
+ Filter Vectorization:
+ className: VectorFilterOperator
+ native: true
+                        predicateExpression: FilterLongColEqualLongScalar(col 0:int, val 100)
+                    predicate: (ssn = 100) (type: boolean)
+                    Statistics: Num rows: 1 Data size: 185 Basic stats: COMPLETE Column stats: COMPLETE
+ Map Join Operator
+ condition map:
+ Left Semi Join 0 to 1
+ keys:
+ 0 100 (type: int)
+ 1 100 (type: int)
+ Map Join Vectorization:
+                        bigTableKeyExpressions: ConstantVectorExpression(val 100) -> 5:int
+                        bigTableValueExpressions: col 2:string, col 3:string, col 4:struct<writeid:bigint,bucketid:int,rowid:bigint>
+ className: VectorMapJoinOperator
+ native: false
+                        nativeConditionsMet: hive.mapjoin.optimized.hashtable IS true, hive.vectorized.execution.mapjoin.native.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true, One MapJoin Condition IS true, No nullsafe IS true, Small table vectorizes IS true, Optimized Table and Supports Key Types IS true
+                        nativeConditionsNotMet: Supports Value Types [STRUCT] IS false
+ outputColumnNames: _col2, _col3, _col6
+ input vertices:
+ 1 Reducer 4
+                      Statistics: Num rows: 1 Data size: 257 Basic stats: COMPLETE Column stats: COMPLETE
+                      Select Operator
+                        expressions: _col6 (type: struct<writeid:bigint,bucketid:int,rowid:bigint>), _col2 (type: string), _col3 (type: string)
+ outputColumnNames: _col0, _col3, _col4
+ Select Vectorization:
+ className: VectorSelectOperator
+ native: true
+ projectedOutputColumnNums: [2, 0, 1]
+                        Statistics: Num rows: 1 Data size: 357 Basic stats: COMPLETE Column stats: COMPLETE
+                        Reduce Output Operator
+                          key expressions: _col0 (type: struct<writeid:bigint,bucketid:int,rowid:bigint>)
+                          sort order: +
+                          Map-reduce partition columns: UDFToInteger(_col0) (type: int)
+ Reduce Sink Vectorization:
+ className: VectorReduceSinkObjectHashOperator
+ keyColumnNums: [2]
+ native: true
+                            nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true
+                            partitionColumnNums: [3]
+                            valueColumnNums: [0, 1]
+                          Statistics: Num rows: 1 Data size: 357 Basic stats: COMPLETE Column stats: COMPLETE
+                          value expressions: _col3 (type: string), _col4 (type: string)
+ Execution mode: vectorized, llap
+ LLAP IO: may be used (ACID table)
+ Map Vectorization:
+ enabled: true
+                enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
+                inputFormatFeatureSupport: [DECIMAL_64]
+                featureSupportInUse: [DECIMAL_64]
+                inputFileFormats: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+ allNative: false
+ usesVectorUDFAdaptor: false
+ vectorized: true
+ rowBatchContext:
+ dataColumnCount: 4
+ includeColumns: [0, 2, 3]
+                    dataColumns: ssn:int, name:string, city:string, email:string
+ neededVirtualColumns: [ROWID]
+ partitionColumnCount: 0
+ scratchColumnTypeNames: [bigint]
+ Map 3
+ Map Operator Tree:
+ TableScan
+ alias: census
+                  Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE
+                  TableScan Vectorization:
+                      native: true
+                      vectorizationSchemaColumns: [0:ssn:int, 1:name:string, 2:city:string, 3:email:string, 4:ROW__ID:struct<writeid:bigint,bucketid:int,rowid:bigint>]
+ Filter Operator
+ Filter Vectorization:
+ className: VectorFilterOperator
+ native: true
+                        predicateExpression: FilterLongColEqualLongScalar(col 0:int, val 100)
+                    predicate: (ssn = 100) (type: boolean)
+                    Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ Select Vectorization:
+ className: VectorSelectOperator
+ native: true
+ projectedOutputColumnNums: []
+                      Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE
+                      Group By Operator
+                        Group By Vectorization:
+                            className: VectorGroupByOperator
+                            groupByMode: HASH
+                            keyExpressions: ConstantVectorExpression(val 100) -> 5:int
+ native: false
+ vectorProcessingMode: HASH
+ projectedOutputColumnNums: []
+ keys: 100 (type: int)
+ mode: hash
+ outputColumnNames: _col0
+                        Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: 100 (type: int)
+ sort order: +
+ Map-reduce partition columns: 100 (type: int)
+ Reduce Sink Vectorization:
+ className: VectorReduceSinkLongOperator
+ keyColumnNums: [1]
+                            keyExpressions: ConstantVectorExpression(val 100) -> 1:int
+                            native: true
+                            nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true
+                            valueColumnNums: []
+                          Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE
+ Execution mode: vectorized, llap
+ LLAP IO: no inputs
+ Map Vectorization:
+ enabled: true
+                enabledConditionsMet: hive.vectorized.use.vector.serde.deserialize IS true
+ inputFormatFeatureSupport: [DECIMAL_64]
+ featureSupportInUse: [DECIMAL_64]
+ inputFileFormats: org.apache.hadoop.mapred.TextInputFormat
+ allNative: false
+ usesVectorUDFAdaptor: false
+ vectorized: true
+ rowBatchContext:
+ dataColumnCount: 4
+ includeColumns: [0]
+                    dataColumns: ssn:int, name:string, city:string, email:string
+ partitionColumnCount: 0
+ scratchColumnTypeNames: [bigint]
+ Reducer 2
+ Execution mode: vectorized, llap
+ Reduce Vectorization:
+ enabled: true
+                enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true
+ reduceColumnNullOrder: z
+ reduceColumnSortOrder: +
+ allNative: false
+ usesVectorUDFAdaptor: false
+ vectorized: true
+ rowBatchContext:
+ dataColumnCount: 3
+                    dataColumns: KEY.reducesinkkey0:struct<writeid:bigint,bucketid:int,rowid:bigint>, VALUE._col1:string, VALUE._col2:string
+ partitionColumnCount: 0
+ scratchColumnTypeNames: [bigint, string]
+ Reduce Operator Tree:
+ Select Operator
+                expressions: KEY.reducesinkkey0 (type: struct<writeid:bigint,bucketid:int,rowid:bigint>), 100 (type: int), 'updated name' (type: string), VALUE._col1 (type: string), VALUE._col2 (type: string)
+                outputColumnNames: _col0, _col1, _col2, _col3, _col4
+                Select Vectorization:
+                    className: VectorSelectOperator
+                    native: true
+                    projectedOutputColumnNums: [0, 3, 4, 1, 2]
+                    selectExpressions: ConstantVectorExpression(val 100) -> 3:int, ConstantVectorExpression(val updated name) -> 4:string
+                Statistics: Num rows: 1 Data size: 357 Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ File Sink Vectorization:
+ className: VectorFileSinkOperator
+ native: false
+                  Statistics: Num rows: 1 Data size: 357 Basic stats: COMPLETE Column stats: COMPLETE
+                  table:
+                      input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+ serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+ name: default.census_clus
+ Write Type: UPDATE
+ Reducer 4
+ Execution mode: vectorized, llap
+ Reduce Vectorization:
+ enabled: true
+                enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true
+ reduceColumnNullOrder: a
+ reduceColumnSortOrder: +
+ allNative: false
+ usesVectorUDFAdaptor: false
+ vectorized: true
+ rowBatchContext:
+ dataColumnCount: 1
+ dataColumns: KEY._col0:int
+ partitionColumnCount: 0
+ scratchColumnTypeNames: [bigint, bigint]
+ Reduce Operator Tree:
+ Group By Operator
+ Group By Vectorization:
+ className: VectorGroupByOperator
+ groupByMode: MERGEPARTIAL
+                    keyExpressions: ConstantVectorExpression(val 100) -> 1:int, ConstantVectorExpression(val 100) -> 2:int
+ native: false
+ vectorProcessingMode: MERGE_PARTIAL
+ projectedOutputColumnNums: []
+ keys: 100 (type: int), 100 (type: int)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ Select Vectorization:
+ className: VectorSelectOperator
+ native: true
+ projectedOutputColumnNums: []
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ Group By Vectorization:
+ className: VectorGroupByOperator
+ groupByMode: HASH
+                        keyExpressions: ConstantVectorExpression(val 100) -> 2:int
+ native: false
+ vectorProcessingMode: HASH
+ projectedOutputColumnNums: []
+ keys: 100 (type: int)
+ mode: hash
+ outputColumnNames: _col0
+                    Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: 100 (type: int)
+ sort order: +
+ Map-reduce partition columns: 100 (type: int)
+ Reduce Sink Vectorization:
+ className: VectorReduceSinkLongOperator
+ keyColumnNums: [1]
+                          keyExpressions: ConstantVectorExpression(val 100) -> 1:int
+                          native: true
+                          nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true
+                          valueColumnNums: []
+                        Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE
+
+ Stage: Stage-2
+ Dependency Collection
+
+ Stage: Stage-0
+ Move Operator
+ tables:
+ replace: false
+ table:
+ input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+ output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+ serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+ name: default.census_clus
+ Write Type: UPDATE
+
+ Stage: Stage-3
+ Stats Work
+ Basic Stats Work:
+
+PREHOOK: query: UPDATE census_clus SET name = 'updated name' where ssn=100 and EXISTS (select distinct ssn from census where ssn=census_clus.ssn)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@census
+PREHOOK: Input: default@census_clus
+PREHOOK: Output: default@census_clus
+POSTHOOK: query: UPDATE census_clus SET name = 'updated name' where ssn=100 and EXISTS (select distinct ssn from census where ssn=census_clus.ssn)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@census
+POSTHOOK: Input: default@census_clus
+POSTHOOK: Output: default@census_clus
+row__id ssn _c2 city email