This is an automated email from the ASF dual-hosted git repository.
sankarh pushed a commit to branch branch-3
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/branch-3 by this push:
new 0e2d0757357 HIVE-27807: Backport of HIVE-20629, HIVE-20705, HIVE-20734 to branch-3 (#4809)
0e2d0757357 is described below
commit 0e2d07573570cb66fa9bf8af05ca79ccee55e21f
Author: Aman Raj <[email protected]>
AuthorDate: Tue Oct 24 12:03:15 2023 +0530
HIVE-27807: Backport of HIVE-20629, HIVE-20705, HIVE-20734 to branch-3 (#4809)
* HIVE-20629: Hive incremental replication fails with events missing error if database is kept idle for more than an hour (Mahesh Kumar Behera, reviewed by Sankar Hariappan)
* HIVE-20705: Vectorization: Native Vector MapJoin doesn't support Complex Big Table values
* HIVE-20734: Beeline: When beeline-site.xml is present and hive CLI redirects to beeline, it should use the system username/dummy password instead of prompting for one
---------
Co-authored-by: Sankar Hariappan <[email protected]>
Co-authored-by: Matt McCline <[email protected]>
Co-authored-by: Vaibhav Gumashta <[email protected]>
Signed-off-by: Sankar Hariappan <[email protected]>
Closes (#4809)
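For context on the HIVE-20629 item above, the following is a minimal, hypothetical sketch of the scenario the backport fixes (the database names, event id, and dump path are illustrative and not taken from this commit): an incremental REPL DUMP taken after an idle period produces an empty events directory, and REPL LOAD now advances the target's last repl id instead of failing.

    -- Source cluster: incremental dump after an idle period; no new events for this database.
    REPL DUMP src_db FROM 1000;    -- 1000 stands for the last replicated event id
    -- Target cluster: loading the empty incremental dump no longer fails with
    -- "No data to load in path ..."; the database's last repl id is set to the dump's end event id.
    REPL LOAD tgt_db FROM '/tmp/repl/dump_dir';
    REPL STATUS tgt_db;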
---
bin/ext/beeline.sh | 7 +-
bin/hive | 1 +
.../TestReplicationScenariosAcrossInstances.java | 40 +++
.../test/resources/testconfiguration.properties | 1 +
.../hadoop/hive/ql/exec/repl/ReplLoadWork.java | 9 +-
.../incremental/IncrementalLoadEventsIterator.java | 4 +-
.../incremental/IncrementalLoadTasksBuilder.java | 20 +-
.../hive/ql/optimizer/physical/Vectorizer.java | 18 +-
.../hive/ql/parse/ReplicationSemanticAnalyzer.java | 15 +-
.../apache/hadoop/hive/ql/plan/MapJoinDesc.java | 10 +
.../hadoop/hive/ql/plan/VectorMapJoinDesc.java | 14 +
.../clientpositive/vector_mapjoin_complex_values.q | 34 ++
.../llap/vector_mapjoin_complex_values.q.out | 355 +++++++++++++++++++++
13 files changed, 500 insertions(+), 28 deletions(-)
diff --git a/bin/ext/beeline.sh b/bin/ext/beeline.sh
index 8052c452bac..5bf7fe67503 100644
--- a/bin/ext/beeline.sh
+++ b/bin/ext/beeline.sh
@@ -32,7 +32,12 @@ beeline () {
export HADOOP_CLASSPATH="${hadoopClasspath}${HIVE_CONF_DIR}:${beelineJarPath}:${superCsvJarPath}:${jlineJarPath}"
export HADOOP_CLIENT_OPTS="$HADOOP_CLIENT_OPTS -Dlog4j.configurationFile=beeline-log4j2.properties "
-  exec $HADOOP jar ${beelineJarPath} $CLASS $HIVE_OPTS "$@"
+  # if CLIUSER is not empty, then pass it as user id / password during beeline redirect
+  if [ -z $CLIUSER ] ; then
+    exec $HADOOP jar ${beelineJarPath} $CLASS $HIVE_OPTS "$@"
+  else
+    exec $HADOOP jar ${beelineJarPath} $CLASS $HIVE_OPTS "$@" -n "${CLIUSER}" -p "${CLIUSER}"
+ fi
}
beeline_help () {
diff --git a/bin/hive b/bin/hive
index a7ae2f571e9..ef9ef955d23 100755
--- a/bin/hive
+++ b/bin/hive
@@ -86,6 +86,7 @@ if [ "$SERVICE" = "" ] ; then
fi
if [[ "$SERVICE" == "cli" && "$USE_BEELINE_FOR_HIVE_CLI" == "true" ]] ; then
+ CLIUSER=`whoami`
SERVICE="beeline"
fi
diff --git a/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/parse/TestReplicationScenariosAcrossInstances.java b/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/parse/TestReplicationScenariosAcrossInstances.java
index 1d0a9c8b447..12ec8e66731 100644
--- a/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/parse/TestReplicationScenariosAcrossInstances.java
+++ b/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/parse/TestReplicationScenariosAcrossInstances.java
@@ -961,6 +961,46 @@ public class TestReplicationScenariosAcrossInstances {
assertFalse(props.containsKey(SOURCE_OF_REPLICATION));
}
+ @Test
+ public void testIncrementalDumpEmptyDumpDirectory() throws Throwable {
+ WarehouseInstance.Tuple tuple = primary.dump(primaryDbName, null);
+
+ replica.load(replicatedDbName, tuple.dumpLocation)
+ .status(replicatedDbName)
+ .verifyResult(tuple.lastReplicationId);
+
+ tuple = primary.dump(primaryDbName, tuple.lastReplicationId);
+
+ replica.load(replicatedDbName, tuple.dumpLocation)
+ .status(replicatedDbName)
+ .verifyResult(tuple.lastReplicationId);
+
+    // create events for some other database and then dump the primaryDbName to dump an empty directory.
+ String testDbName = primaryDbName + "_test";
+ tuple = primary.run(" create database " + testDbName)
+ .run("create table " + testDbName + ".tbl (fld int)")
+ .dump(primaryDbName, tuple.lastReplicationId);
+
+    // Incremental load to existing database with empty dump directory should set the repl id to the last event at src.
+ replica.load(replicatedDbName, tuple.dumpLocation)
+ .status(replicatedDbName)
+ .verifyResult(tuple.lastReplicationId);
+
+    // Incremental load to non existing db should return database not exist error.
+    tuple = primary.dump("someJunkDB", tuple.lastReplicationId);
+    CommandProcessorResponse response = replica.runCommand("REPL LOAD someJunkDB from " + tuple.dumpLocation);
+    response.getErrorMessage().toLowerCase().contains("org.apache.hadoop.hive.ql.metadata.hiveException: " +
+        "database does not exist");
+
+    // Bootstrap load from an empty dump directory should return empty load directory error.
+    tuple = primary.dump("someJunkDB", null);
+    response = replica.runCommand("REPL LOAD someJunkDB from " + tuple.dumpLocation);
+    response.getErrorMessage().toLowerCase().contains("org.apache.hadoop.hive.ql.parse.semanticException:" +
+        " no data to load in path");
+
+ primary.run(" drop database if exists " + testDbName + " cascade");
+ }
+
@Test
public void testIncrementalDumpMultiIteration() throws Throwable {
WarehouseInstance.Tuple bootstrapTuple = primary.dump(primaryDbName, null);
diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index 16a3e082d99..52cde10efdc 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -807,6 +807,7 @@ minillaplocal.query.files=\
vector_like_2.q,\
vector_llap_io_data_conversion.q,\
vector_llap_text_1.q,\
+ vector_mapjoin_complex_values.q,\
vector_mapjoin_reduce.q,\
vector_null_map.q,\
vector_number_compare_projection.q,\
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/repl/ReplLoadWork.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/repl/ReplLoadWork.java
index fdbcb15c72d..ff21b6a601d 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/repl/ReplLoadWork.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/repl/ReplLoadWork.java
@@ -53,7 +53,7 @@ public class ReplLoadWork implements Serializable {
final LineageState sessionStateLineageState;
public ReplLoadWork(HiveConf hiveConf, String dumpDirectory, String dbNameToLoadIn,
-      String tableNameToLoadIn, LineageState lineageState, boolean isIncrementalDump) throws IOException {
+      String tableNameToLoadIn, LineageState lineageState, boolean isIncrementalDump, Long eventTo) throws IOException {
this.tableNameToLoadIn = tableNameToLoadIn;
sessionStateLineageState = lineageState;
this.dumpDirectory = dumpDirectory;
@@ -64,7 +64,7 @@ public class ReplLoadWork implements Serializable {
this.bootstrapIterator = null;
this.constraintsIterator = null;
incrementalLoad = new IncrementalLoadTasksBuilder(dbNameToLoadIn, tableNameToLoadIn, dumpDirectory,
-          incrementalIterator, hiveConf);
+          incrementalIterator, hiveConf, eventTo);
} else {
this.bootstrapIterator = new BootstrapEventsIterator(dumpDirectory, dbNameToLoadIn, hiveConf);
this.constraintsIterator = new ConstraintEventsIterator(dumpDirectory, hiveConf);
@@ -73,11 +73,6 @@ public class ReplLoadWork implements Serializable {
}
}
-  public ReplLoadWork(HiveConf hiveConf, String dumpDirectory, String dbNameOrPattern,
- LineageState lineageState) throws IOException {
- this(hiveConf, dumpDirectory, dbNameOrPattern, null, lineageState, false);
- }
-
public BootstrapEventsIterator iterator() {
return bootstrapIterator;
}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/repl/incremental/IncrementalLoadEventsIterator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/repl/incremental/IncrementalLoadEventsIterator.java
index 4b37c8dd989..5638ace714d 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/repl/incremental/IncrementalLoadEventsIterator.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/repl/incremental/IncrementalLoadEventsIterator.java
@@ -44,7 +44,9 @@ public class IncrementalLoadEventsIterator implements Iterator<FileStatus> {
FileSystem fs = eventPath.getFileSystem(conf);
eventDirs = fs.listStatus(eventPath, EximUtil.getDirectoryFilter(fs));
if ((eventDirs == null) || (eventDirs.length == 0)) {
-      throw new IllegalArgumentException("No data to load in path " + loadPath);
+ currentIndex = 0;
+ numEvents = 0;
+ return;
}
// For event dump, each sub-dir is an individual event dump.
// We need to guarantee that the directory listing we got is in order of event id.
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/repl/incremental/IncrementalLoadTasksBuilder.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/repl/incremental/IncrementalLoadTasksBuilder.java
index 2a9388772cf..60ab9b64a10 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/repl/incremental/IncrementalLoadTasksBuilder.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/repl/incremental/IncrementalLoadTasksBuilder.java
@@ -64,15 +64,16 @@ import java.util.HashSet;
public class IncrementalLoadTasksBuilder {
private final String dbName, tableName;
private final IncrementalLoadEventsIterator iterator;
- private HashSet<ReadEntity> inputs;
- private HashSet<WriteEntity> outputs;
+ private final HashSet<ReadEntity> inputs;
+ private final HashSet<WriteEntity> outputs;
private Logger log;
private final HiveConf conf;
private final ReplLogger replLogger;
private static long numIteration;
+ private final Long eventTo;
public IncrementalLoadTasksBuilder(String dbName, String tableName, String loadPath,
-                                     IncrementalLoadEventsIterator iterator, HiveConf conf) {
+                                     IncrementalLoadEventsIterator iterator, HiveConf conf, Long eventTo) {
this.dbName = dbName;
this.tableName = tableName;
this.iterator = iterator;
@@ -83,6 +84,7 @@ public class IncrementalLoadTasksBuilder {
replLogger = new IncrementalLoadLogger(dbName, loadPath, iterator.getNumEvents());
numIteration = 0;
replLogger.startLog();
+ this.eventTo = eventTo;
}
public Task<? extends Serializable> build(DriverContext driverContext, Hive hive, Logger log,
@@ -151,6 +153,18 @@ public class IncrementalLoadTasksBuilder {
// add load task to start the next iteration
taskChainTail.addDependentTask(TaskFactory.get(loadWork, conf));
} else {
+      // if no events were replayed, then add a task to update the last repl id of the database/table to last event id.
+      if (taskChainTail == evTaskRoot) {
+        String lastEventid = eventTo.toString();
+        if (StringUtils.isEmpty(tableName)) {
+          taskChainTail = dbUpdateReplStateTask(dbName, lastEventid, taskChainTail);
+          this.log.debug("no events to replay, set last repl id of db " + dbName + " to " + lastEventid);
+        } else {
+          taskChainTail = tableUpdateReplStateTask(dbName, tableName, null, lastEventid, taskChainTail);
+          this.log.debug("no events to replay, set last repl id of table " + dbName + "." + tableName + " to " +
+              lastEventid);
+ }
+ }
Map<String, String> dbProps = new HashMap<>();
dbProps.put(ReplicationSpec.KEY.CURR_STATE_ID.toString(), String.valueOf(lastReplayedEvent));
ReplStateLogWork replStateLogWork = new ReplStateLogWork(replLogger, dbProps);
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
index 2dd12ef1918..22915b50f68 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
@@ -3500,6 +3500,9 @@ public class Vectorizer implements PhysicalPlanResolver {
* Similarly, we need a mapping since a value expression can be a calculation and the value
* will go into a scratch column.
*/
+ boolean supportsValueTypes = true; // Assume.
+ HashSet<String> notSupportedValueTypes = new HashSet<String>();
+
int[] bigTableValueColumnMap = new int[allBigTableValueExpressions.length];
String[] bigTableValueColumnNames = new String[allBigTableValueExpressions.length];
TypeInfo[] bigTableValueTypeInfos = new TypeInfo[allBigTableValueExpressions.length];
@@ -3514,7 +3517,13 @@ public class Vectorizer implements PhysicalPlanResolver {
ExprNodeDesc exprNode = bigTableExprs.get(i);
bigTableValueColumnNames[i] = exprNode.toString();
- bigTableValueTypeInfos[i] = exprNode.getTypeInfo();
+ TypeInfo typeInfo = exprNode.getTypeInfo();
+ if (!(typeInfo instanceof PrimitiveTypeInfo)) {
+ supportsValueTypes = false;
+ Category category = typeInfo.getCategory();
+ notSupportedValueTypes.add(category.toString());
+ }
+ bigTableValueTypeInfos[i] = typeInfo;
}
if (bigTableValueExpressionsList.size() == 0) {
slimmedBigTableValueExpressions = null;
@@ -3747,6 +3756,10 @@ public class Vectorizer implements PhysicalPlanResolver {
if (!supportsKeyTypes) {
vectorDesc.setNotSupportedKeyTypes(new ArrayList(notSupportedKeyTypes));
}
+ vectorDesc.setSupportsValueTypes(supportsValueTypes);
+ if (!supportsValueTypes) {
+      vectorDesc.setNotSupportedValueTypes(new ArrayList(notSupportedValueTypes));
+ }
// Check common conditions for both Optimized and Fast Hash Tables.
boolean result = true; // Assume.
@@ -3756,7 +3769,8 @@ public class Vectorizer implements PhysicalPlanResolver {
!oneMapJoinCondition ||
hasNullSafes ||
!smallTableExprVectorizes ||
- outerJoinHasNoKeys) {
+ outerJoinHasNoKeys ||
+ !supportsValueTypes) {
result = false;
}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/ReplicationSemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/ReplicationSemanticAnalyzer.java
index f83146125f3..fe0cec010e0 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/ReplicationSemanticAnalyzer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/ReplicationSemanticAnalyzer.java
@@ -328,21 +328,8 @@ public class ReplicationSemanticAnalyzer extends BaseSemanticAnalyzer {
LOG.debug("{} contains an bootstrap dump", loadPath);
}
-      if ((!evDump) && (tblNameOrPattern != null) && !(tblNameOrPattern.isEmpty())) {
-        ReplLoadWork replLoadWork = new ReplLoadWork(conf, loadPath.toString(), dbNameOrPattern,
- tblNameOrPattern, queryState.getLineageState(), false);
- rootTasks.add(TaskFactory.get(replLoadWork, conf));
- return;
- }
-
- FileStatus[] srcs = LoadSemanticAnalyzer.matchFilesOrDir(fs, loadPath);
- if (srcs == null || (srcs.length == 0)) {
- LOG.warn("Nothing to load at {}", loadPath.toUri().toString());
- return;
- }
-
ReplLoadWork replLoadWork = new ReplLoadWork(conf, loadPath.toString(), dbNameOrPattern,
-          tblNameOrPattern, queryState.getLineageState(), evDump);
+          tblNameOrPattern, queryState.getLineageState(), evDump, dmd.getEventTo());
rootTasks.add(TaskFactory.get(replLoadWork, conf));
} catch (Exception e) {
// TODO : simple wrap & rethrow for now, clean up with error codes
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/MapJoinDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/MapJoinDesc.java
index 83b34161a75..7834b182a78 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/plan/MapJoinDesc.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/MapJoinDesc.java
@@ -478,6 +478,16 @@ public class MapJoinDesc extends JoinDesc implements Serializable {
vectorMapJoinDesc.getSupportsKeyTypes(),
"Optimized Table and Supports Key Types"));
}
+    final boolean supportsValueTypes = vectorMapJoinDesc.getSupportsValueTypes();
+ if (!supportsValueTypes) {
+
+ // Only add this condition when false to avoid mega-Q file update.
+ conditionList.add(
+ new VectorizationCondition(
+ false,
+ "Supports Value Types " +
+ vectorMapJoinDesc.getNotSupportedValueTypes().toString()));
+ }
VectorizationCondition[] conditions =
conditionList.toArray(new VectorizationCondition[0]);
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/VectorMapJoinDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/VectorMapJoinDesc.java
index 58032ca0572..3c7c69d5822 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/plan/VectorMapJoinDesc.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/VectorMapJoinDesc.java
@@ -204,6 +204,8 @@ public class VectorMapJoinDesc extends AbstractVectorDesc {
private boolean isHybridHashJoin;
private boolean supportsKeyTypes;
private List<String> notSupportedKeyTypes;
+ private boolean supportsValueTypes;
+ private List<String> notSupportedValueTypes;
private boolean smallTableExprVectorizes;
private boolean outerJoinHasNoKeys;
@@ -249,6 +251,18 @@ public class VectorMapJoinDesc extends AbstractVectorDesc {
public List<String> getNotSupportedKeyTypes() {
return notSupportedKeyTypes;
}
+ public void setSupportsValueTypes(boolean supportsValueTypes) {
+ this.supportsValueTypes = supportsValueTypes;
+ }
+ public boolean getSupportsValueTypes() {
+ return supportsValueTypes;
+ }
+ public void setNotSupportedValueTypes(List<String> notSupportedValueTypes) {
+ this.notSupportedValueTypes = notSupportedValueTypes;
+ }
+ public List<String> getNotSupportedValueTypes() {
+ return notSupportedValueTypes;
+ }
public void setSmallTableExprVectorizes(boolean smallTableExprVectorizes) {
this.smallTableExprVectorizes = smallTableExprVectorizes;
}
diff --git a/ql/src/test/queries/clientpositive/vector_mapjoin_complex_values.q b/ql/src/test/queries/clientpositive/vector_mapjoin_complex_values.q
new file mode 100644
index 00000000000..1c88daaefd4
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/vector_mapjoin_complex_values.q
@@ -0,0 +1,34 @@
+set hive.mapred.mode=nonstrict;
+set hive.explain.user=false;
+set hive.vectorized.execution.enabled=true;
+set hive.auto.convert.join=true;
+set hive.mapjoin.hybridgrace.hashtable=false;
+set hive.fetch.task.conversion=none;
+set hive.cli.print.header=true;
+set hive.support.concurrency=true;
+set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
+set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
+
+create table census(
+ssn int,
+name string,
+city string,
+email string)
+row format delimited
+fields terminated by ',';
+
+insert into census values(100,"raj","san jose","email");
+
+create table census_clus(
+ssn int,
+name string,
+city string,
+email string)
+clustered by (ssn) into 4 buckets stored as orc TBLPROPERTIES ('transactional'='true');
+
+insert into table census_clus select * from census;
+
+EXPLAIN VECTORIZATION DETAIL
+UPDATE census_clus SET name = 'updated name' where ssn=100 and EXISTS (select distinct ssn from census where ssn=census_clus.ssn);
+
+UPDATE census_clus SET name = 'updated name' where ssn=100 and EXISTS (select distinct ssn from census where ssn=census_clus.ssn);
\ No newline at end of file
diff --git a/ql/src/test/results/clientpositive/llap/vector_mapjoin_complex_values.q.out b/ql/src/test/results/clientpositive/llap/vector_mapjoin_complex_values.q.out
new file mode 100644
index 00000000000..d7fe5f1d0dc
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/vector_mapjoin_complex_values.q.out
@@ -0,0 +1,355 @@
+PREHOOK: query: create table census(
+ssn int,
+name string,
+city string,
+email string)
+row format delimited
+fields terminated by ','
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@census
+POSTHOOK: query: create table census(
+ssn int,
+name string,
+city string,
+email string)
+row format delimited
+fields terminated by ','
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@census
+PREHOOK: query: insert into census values(100,"raj","san jose","email")
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@census
+POSTHOOK: query: insert into census values(100,"raj","san jose","email")
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@census
+POSTHOOK: Lineage: census.city SCRIPT []
+POSTHOOK: Lineage: census.email SCRIPT []
+POSTHOOK: Lineage: census.name SCRIPT []
+POSTHOOK: Lineage: census.ssn SCRIPT []
+col1 col2 col3 col4
+PREHOOK: query: create table census_clus(
+ssn int,
+name string,
+city string,
+email string)
+clustered by (ssn) into 4 buckets stored as orc TBLPROPERTIES ('transactional'='true')
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@census_clus
+POSTHOOK: query: create table census_clus(
+ssn int,
+name string,
+city string,
+email string)
+clustered by (ssn) into 4 buckets stored as orc TBLPROPERTIES ('transactional'='true')
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@census_clus
+PREHOOK: query: insert into table census_clus select * from census
+PREHOOK: type: QUERY
+PREHOOK: Input: default@census
+PREHOOK: Output: default@census_clus
+POSTHOOK: query: insert into table census_clus select * from census
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@census
+POSTHOOK: Output: default@census_clus
+POSTHOOK: Lineage: census_clus.city SIMPLE [(census)census.FieldSchema(name:city, type:string, comment:null), ]
+POSTHOOK: Lineage: census_clus.email SIMPLE [(census)census.FieldSchema(name:email, type:string, comment:null), ]
+POSTHOOK: Lineage: census_clus.name SIMPLE [(census)census.FieldSchema(name:name, type:string, comment:null), ]
+POSTHOOK: Lineage: census_clus.ssn SIMPLE [(census)census.FieldSchema(name:ssn, type:int, comment:null), ]
+census.ssn census.name census.city census.email
+PREHOOK: query: EXPLAIN VECTORIZATION DETAIL
+UPDATE census_clus SET name = 'updated name' where ssn=100 and EXISTS (select distinct ssn from census where ssn=census_clus.ssn)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@census
+PREHOOK: Input: default@census_clus
+PREHOOK: Output: default@census_clus
+POSTHOOK: query: EXPLAIN VECTORIZATION DETAIL
+UPDATE census_clus SET name = 'updated name' where ssn=100 and EXISTS (select distinct ssn from census where ssn=census_clus.ssn)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@census
+POSTHOOK: Input: default@census_clus
+POSTHOOK: Output: default@census_clus
+Explain
+PLAN VECTORIZATION:
+ enabled: true
+ enabledConditionsMet: [hive.vectorized.execution.enabled IS true]
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 depends on stages: Stage-2
+ Stage-3 depends on stages: Stage-0
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Map 1 <- Reducer 4 (BROADCAST_EDGE)
+ Reducer 2 <- Map 1 (SIMPLE_EDGE)
+ Reducer 4 <- Map 3 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: census_clus
+                  Statistics: Num rows: 1 Data size: 185 Basic stats: COMPLETE Column stats: COMPLETE
+                  TableScan Vectorization:
+                      native: true
+                      vectorizationSchemaColumns: [0:ssn:int, 1:name:string, 2:city:string, 3:email:string, 4:ROW__ID:struct<writeid:bigint,bucketid:int,rowid:bigint>]
+ Filter Operator
+ Filter Vectorization:
+ className: VectorFilterOperator
+ native: true
+                        predicateExpression: FilterLongColEqualLongScalar(col 0:int, val 100)
+                    predicate: (ssn = 100) (type: boolean)
+                    Statistics: Num rows: 1 Data size: 185 Basic stats: COMPLETE Column stats: COMPLETE
+ Map Join Operator
+ condition map:
+ Left Semi Join 0 to 1
+ keys:
+ 0 100 (type: int)
+ 1 100 (type: int)
+ Map Join Vectorization:
+                        bigTableKeyExpressions: ConstantVectorExpression(val 100) -> 5:int
+                        bigTableValueExpressions: col 2:string, col 3:string, col 4:struct<writeid:bigint,bucketid:int,rowid:bigint>
+ className: VectorMapJoinOperator
+ native: false
+                        nativeConditionsMet: hive.mapjoin.optimized.hashtable IS true, hive.vectorized.execution.mapjoin.native.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true, One MapJoin Condition IS true, No nullsafe IS true, Small table vectorizes IS true, Optimized Table and Supports Key Types IS true
+                        nativeConditionsNotMet: Supports Value Types [STRUCT] IS false
+ outputColumnNames: _col2, _col3, _col6
+ input vertices:
+ 1 Reducer 4
+                      Statistics: Num rows: 1 Data size: 257 Basic stats: COMPLETE Column stats: COMPLETE
+                      Select Operator
+                        expressions: _col6 (type: struct<writeid:bigint,bucketid:int,rowid:bigint>), _col2 (type: string), _col3 (type: string)
+ outputColumnNames: _col0, _col3, _col4
+ Select Vectorization:
+ className: VectorSelectOperator
+ native: true
+ projectedOutputColumnNums: [2, 0, 1]
+                        Statistics: Num rows: 1 Data size: 357 Basic stats: COMPLETE Column stats: COMPLETE
+                        Reduce Output Operator
+                          key expressions: _col0 (type: struct<writeid:bigint,bucketid:int,rowid:bigint>)
+                          sort order: +
+                          Map-reduce partition columns: UDFToInteger(_col0) (type: int)
+ Reduce Sink Vectorization:
+ className: VectorReduceSinkObjectHashOperator
+ keyColumnNums: [2]
+ native: true
+                            nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true
+                            partitionColumnNums: [3]
+                            valueColumnNums: [0, 1]
+                          Statistics: Num rows: 1 Data size: 357 Basic stats: COMPLETE Column stats: COMPLETE
+                          value expressions: _col3 (type: string), _col4 (type: string)
+ Execution mode: vectorized, llap
+ LLAP IO: may be used (ACID table)
+ Map Vectorization:
+ enabled: true
+                enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
+                inputFormatFeatureSupport: [DECIMAL_64]
+                featureSupportInUse: [DECIMAL_64]
+                inputFileFormats: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+ allNative: false
+ usesVectorUDFAdaptor: false
+ vectorized: true
+ rowBatchContext:
+ dataColumnCount: 4
+ includeColumns: [0, 2, 3]
+                    dataColumns: ssn:int, name:string, city:string, email:string
+ neededVirtualColumns: [ROWID]
+ partitionColumnCount: 0
+ scratchColumnTypeNames: [bigint]
+ Map 3
+ Map Operator Tree:
+ TableScan
+ alias: census
+                  Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE
+                  TableScan Vectorization:
+                      native: true
+                      vectorizationSchemaColumns: [0:ssn:int, 1:name:string, 2:city:string, 3:email:string, 4:ROW__ID:struct<writeid:bigint,bucketid:int,rowid:bigint>]
+ Filter Operator
+ Filter Vectorization:
+ className: VectorFilterOperator
+ native: true
+                        predicateExpression: FilterLongColEqualLongScalar(col 0:int, val 100)
+                    predicate: (ssn = 100) (type: boolean)
+                    Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ Select Vectorization:
+ className: VectorSelectOperator
+ native: true
+ projectedOutputColumnNums: []
+                      Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE
+                      Group By Operator
+                        Group By Vectorization:
+                            className: VectorGroupByOperator
+                            groupByMode: HASH
+                            keyExpressions: ConstantVectorExpression(val 100) -> 5:int
+ native: false
+ vectorProcessingMode: HASH
+ projectedOutputColumnNums: []
+ keys: 100 (type: int)
+ mode: hash
+ outputColumnNames: _col0
+                        Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: 100 (type: int)
+ sort order: +
+ Map-reduce partition columns: 100 (type: int)
+ Reduce Sink Vectorization:
+ className: VectorReduceSinkLongOperator
+ keyColumnNums: [1]
+                            keyExpressions: ConstantVectorExpression(val 100) -> 1:int
+                            native: true
+                            nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true
+                            valueColumnNums: []
+                          Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE
+ Execution mode: vectorized, llap
+ LLAP IO: no inputs
+ Map Vectorization:
+ enabled: true
+                enabledConditionsMet: hive.vectorized.use.vector.serde.deserialize IS true
+ inputFormatFeatureSupport: [DECIMAL_64]
+ featureSupportInUse: [DECIMAL_64]
+ inputFileFormats: org.apache.hadoop.mapred.TextInputFormat
+ allNative: false
+ usesVectorUDFAdaptor: false
+ vectorized: true
+ rowBatchContext:
+ dataColumnCount: 4
+ includeColumns: [0]
+                    dataColumns: ssn:int, name:string, city:string, email:string
+ partitionColumnCount: 0
+ scratchColumnTypeNames: [bigint]
+ Reducer 2
+ Execution mode: vectorized, llap
+ Reduce Vectorization:
+ enabled: true
+                enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true
+ reduceColumnNullOrder: z
+ reduceColumnSortOrder: +
+ allNative: false
+ usesVectorUDFAdaptor: false
+ vectorized: true
+ rowBatchContext:
+ dataColumnCount: 3
+                    dataColumns: KEY.reducesinkkey0:struct<writeid:bigint,bucketid:int,rowid:bigint>, VALUE._col1:string, VALUE._col2:string
+ partitionColumnCount: 0
+ scratchColumnTypeNames: [bigint, string]
+ Reduce Operator Tree:
+ Select Operator
+                expressions: KEY.reducesinkkey0 (type: struct<writeid:bigint,bucketid:int,rowid:bigint>), 100 (type: int), 'updated name' (type: string), VALUE._col1 (type: string), VALUE._col2 (type: string)
+                outputColumnNames: _col0, _col1, _col2, _col3, _col4
+                Select Vectorization:
+                    className: VectorSelectOperator
+                    native: true
+                    projectedOutputColumnNums: [0, 3, 4, 1, 2]
+                    selectExpressions: ConstantVectorExpression(val 100) -> 3:int, ConstantVectorExpression(val updated name) -> 4:string
+                Statistics: Num rows: 1 Data size: 357 Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ File Sink Vectorization:
+ className: VectorFileSinkOperator
+ native: false
+                  Statistics: Num rows: 1 Data size: 357 Basic stats: COMPLETE Column stats: COMPLETE
+                  table:
+                      input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+ serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+ name: default.census_clus
+ Write Type: UPDATE
+ Reducer 4
+ Execution mode: vectorized, llap
+ Reduce Vectorization:
+ enabled: true
+                enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true
+ reduceColumnNullOrder: a
+ reduceColumnSortOrder: +
+ allNative: false
+ usesVectorUDFAdaptor: false
+ vectorized: true
+ rowBatchContext:
+ dataColumnCount: 1
+ dataColumns: KEY._col0:int
+ partitionColumnCount: 0
+ scratchColumnTypeNames: [bigint, bigint]
+ Reduce Operator Tree:
+ Group By Operator
+ Group By Vectorization:
+ className: VectorGroupByOperator
+ groupByMode: MERGEPARTIAL
+                    keyExpressions: ConstantVectorExpression(val 100) -> 1:int, ConstantVectorExpression(val 100) -> 2:int
+ native: false
+ vectorProcessingMode: MERGE_PARTIAL
+ projectedOutputColumnNums: []
+ keys: 100 (type: int), 100 (type: int)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ Select Vectorization:
+ className: VectorSelectOperator
+ native: true
+ projectedOutputColumnNums: []
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ Group By Vectorization:
+ className: VectorGroupByOperator
+ groupByMode: HASH
+                        keyExpressions: ConstantVectorExpression(val 100) -> 2:int
+ native: false
+ vectorProcessingMode: HASH
+ projectedOutputColumnNums: []
+ keys: 100 (type: int)
+ mode: hash
+ outputColumnNames: _col0
+                    Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: 100 (type: int)
+ sort order: +
+ Map-reduce partition columns: 100 (type: int)
+ Reduce Sink Vectorization:
+ className: VectorReduceSinkLongOperator
+ keyColumnNums: [1]
+                          keyExpressions: ConstantVectorExpression(val 100) -> 1:int
+                          native: true
+                          nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true
+                          valueColumnNums: []
+                        Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE
+
+ Stage: Stage-2
+ Dependency Collection
+
+ Stage: Stage-0
+ Move Operator
+ tables:
+ replace: false
+ table:
+ input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+ output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+ serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+ name: default.census_clus
+ Write Type: UPDATE
+
+ Stage: Stage-3
+ Stats Work
+ Basic Stats Work:
+
+PREHOOK: query: UPDATE census_clus SET name = 'updated name' where ssn=100 and EXISTS (select distinct ssn from census where ssn=census_clus.ssn)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@census
+PREHOOK: Input: default@census_clus
+PREHOOK: Output: default@census_clus
+POSTHOOK: query: UPDATE census_clus SET name = 'updated name' where ssn=100 and EXISTS (select distinct ssn from census where ssn=census_clus.ssn)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@census
+POSTHOOK: Input: default@census_clus
+POSTHOOK: Output: default@census_clus
+row__id ssn _c2 city email