HIVE-11180: Enable native vectorized map join for spark [Spark Branch] (Rui reviewed by Xuefu)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/80f548af
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/80f548af
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/80f548af

Branch: refs/heads/spark
Commit: 80f548af3b762abc7775fdfeb21b0d2d9d417c09
Parents: 714b3db
Author: Rui Li <[email protected]>
Authored: Thu Aug 6 13:58:50 2015 +0800
Committer: Rui Li <[email protected]>
Committed: Thu Aug 6 14:09:36 2015 +0800

----------------------------------------------------------------------
 .../org/apache/hadoop/hive/conf/HiveConf.java   |    4 +-
 .../test/resources/testconfiguration.properties |    9 +-
 .../persistence/MapJoinTableContainerSerDe.java |   70 +
 .../hive/ql/exec/spark/HashTableLoader.java     |   18 +-
 .../mapjoin/VectorMapJoinCommonOperator.java    |    4 +-
 .../fast/VectorMapJoinFastTableContainer.java   |    2 +-
 .../hive/ql/optimizer/physical/Vectorizer.java  |    6 +-
 .../optimizer/spark/SparkMapJoinOptimizer.java  |   10 +
 .../spark/vector_inner_join.q.out               |  853 +++++++++++
 .../spark/vector_outer_join0.q.out              |  242 +++
 .../spark/vector_outer_join1.q.out              |  631 ++++++++
 .../spark/vector_outer_join2.q.out              |  327 ++++
 .../spark/vector_outer_join3.q.out              |  630 ++++++++
 .../spark/vector_outer_join4.q.out              | 1000 +++++++++++++
 .../spark/vector_outer_join5.q.out              | 1406 ++++++++++++++++++
 15 files changed, 5201 insertions(+), 11 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/hive/blob/80f548af/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
----------------------------------------------------------------------
diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index f593d7d..73610dc 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -765,8 +765,8 @@ public class HiveConf extends Configuration {
     HIVEMAPJOINBUCKETCACHESIZE("hive.mapjoin.bucket.cache.size", 100, ""),
     HIVEMAPJOINUSEOPTIMIZEDTABLE("hive.mapjoin.optimized.hashtable", true,
-        "Whether Hive should use memory-optimized hash table for MapJoin. Only works on Tez,\n" +
-        "because memory-optimized hashtable cannot be serialized."),
+        "Whether Hive should use memory-optimized hash table for MapJoin.\n" +
+        "Only works on Tez and Spark, because memory-optimized hashtable cannot be serialized."),
     HIVEUSEHYBRIDGRACEHASHJOIN("hive.mapjoin.hybridgrace.hashtable", true, "Whether to use hybrid" +
         "grace hash join as the join method for mapjoin. Tez only."),
     HIVEHYBRIDGRACEHASHJOINMEMCHECKFREQ("hive.mapjoin.hybridgrace.memcheckfrequency", 1024, "For " +
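With the Tez-only restriction lifted, the feature is driven entirely by configuration. As a minimal sketch of the switches involved (using only ConfVars that appear elsewhere in this patch; the combination shown is an assumed typical setup, not something the patch itself prescribes):

    // Illustrative only: the settings this patch consults.
    HiveConf conf = new HiveConf();
    conf.setVar(HiveConf.ConfVars.HIVE_EXECUTION_ENGINE, "spark");
    conf.setBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, true);
    conf.setBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_ENABLED, true);
    // "hive.mapjoin.optimized.hashtable", now valid on Spark as well as Tez:
    conf.setBoolVar(HiveConf.ConfVars.HIVEMAPJOINUSEOPTIMIZEDTABLE, true);
    // Opts in to the FAST container built by loadFastContainer() below:
    conf.setBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_FAST_HASHTABLE_ENABLED, true);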
Tez only."), HIVEHYBRIDGRACEHASHJOINMEMCHECKFREQ("hive.mapjoin.hybridgrace.memcheckfrequency", 1024, "For " + http://git-wip-us.apache.org/repos/asf/hive/blob/80f548af/itests/src/test/resources/testconfiguration.properties ---------------------------------------------------------------------- diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties index c710b0b..b04c5d5 100644 --- a/itests/src/test/resources/testconfiguration.properties +++ b/itests/src/test/resources/testconfiguration.properties @@ -1181,7 +1181,14 @@ miniSparkOnYarn.query.files=auto_sortmerge_join_16.q,\ stats_counter_partitioned.q,\ temp_table_external.q,\ truncate_column_buckets.q,\ - uber_reduce.q + uber_reduce.q,\ + vector_inner_join.q,\ + vector_outer_join0.q,\ + vector_outer_join1.q,\ + vector_outer_join2.q,\ + vector_outer_join3.q,\ + vector_outer_join4.q,\ + vector_outer_join5.q spark.query.negative.files=groupby2_map_skew_multi_distinct.q,\ groupby2_multi_distinct.q,\ http://git-wip-us.apache.org/repos/asf/hive/blob/80f548af/ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinTableContainerSerDe.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinTableContainerSerDe.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinTableContainerSerDe.java index e97a9f0..d6deabe 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinTableContainerSerDe.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinTableContainerSerDe.java @@ -32,7 +32,9 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.common.JavaUtils; import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast.VectorMapJoinFastTableContainer; import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.plan.MapJoinDesc; import org.apache.hadoop.hive.serde2.SerDe; import org.apache.hadoop.hive.serde2.SerDeException; import org.apache.hadoop.hive.shims.ShimLoader; @@ -195,6 +197,74 @@ public class MapJoinTableContainerSerDe { } } + /** + * Loads the small table into a VectorMapJoinFastTableContainer. Only used on Spark path. + * @param mapJoinDesc The descriptor for the map join + * @param fs FileSystem of the folder. + * @param folder The folder to load table container. + * @param hconf The hive configuration + * @return Loaded table. 
+   */
+  @SuppressWarnings("unchecked")
+  public MapJoinTableContainer loadFastContainer(MapJoinDesc mapJoinDesc,
+      FileSystem fs, Path folder, Configuration hconf) throws HiveException {
+    try {
+      if (!fs.isDirectory(folder)) {
+        throw new HiveException("Error, not a directory: " + folder);
+      }
+      FileStatus[] fileStatuses = fs.listStatus(folder);
+      if (fileStatuses == null || fileStatuses.length == 0) {
+        return null;
+      }
+
+      SerDe keySerDe = keyContext.getSerDe();
+      SerDe valueSerDe = valueContext.getSerDe();
+      Writable key = keySerDe.getSerializedClass().newInstance();
+      Writable value = valueSerDe.getSerializedClass().newInstance();
+
+      VectorMapJoinFastTableContainer tableContainer =
+          new VectorMapJoinFastTableContainer(mapJoinDesc, hconf, -1);
+
+      for (FileStatus fileStatus : fileStatuses) {
+        Path filePath = fileStatus.getPath();
+        if (ShimLoader.getHadoopShims().isDirectory(fileStatus)) {
+          throw new HiveException("Error, not a file: " + filePath);
+        }
+        InputStream is = null;
+        ObjectInputStream in = null;
+        try {
+          is = fs.open(filePath, 4096);
+          in = new ObjectInputStream(is);
+          // skip the name and metadata
+          in.readUTF();
+          in.readObject();
+          int numKeys = in.readInt();
+          for (int keyIndex = 0; keyIndex < numKeys; keyIndex++) {
+            key.readFields(in);
+            long numRows = in.readLong();
+            for (long rowIndex = 0L; rowIndex < numRows; rowIndex++) {
+              value.readFields(in);
+              tableContainer.putRow(null, key, null, value);
+            }
+          }
+        } finally {
+          if (in != null) {
+            in.close();
+          } else if (is != null) {
+            is.close();
+          }
+        }
+      }
+
+      tableContainer.seal();
+      return tableContainer;
+    } catch (IOException e) {
+      throw new HiveException("IO error while trying to create table container", e);
+    } catch (Exception e) {
+      throw new HiveException("Error while trying to create table container", e);
+    }
+  }
+
   public void persist(ObjectOutputStream out, MapJoinPersistableTableContainer tableContainer)
       throws HiveException {
     int numKeys = tableContainer.size();
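The stream layout that loadFastContainer() consumes is the one produced by the existing persist() method, whose opening lines appear as context above: a UTF name, a metadata object, the number of keys, then for each key the serialized key followed by a row count and that many serialized rows. A self-contained sketch of a writer for that layout (a hypothetical helper, shown only to make the format explicit; the real producer remains persist()):

    import java.io.IOException;
    import java.io.ObjectOutputStream;
    import java.util.List;
    import java.util.Map;
    import org.apache.hadoop.io.Writable;

    // Hypothetical illustration of the on-disk layout read back above.
    class SmallTableStreamSketch {
      static void write(ObjectOutputStream out, String name, Object metadata,
          Map<Writable, List<Writable>> rowsByKey) throws IOException {
        out.writeUTF(name);                    // skipped via in.readUTF()
        out.writeObject(metadata);             // skipped via in.readObject()
        out.writeInt(rowsByKey.size());        // numKeys
        for (Map.Entry<Writable, List<Writable>> e : rowsByKey.entrySet()) {
          e.getKey().write(out);               // matched by key.readFields(in)
          out.writeLong(e.getValue().size());  // numRows
          for (Writable value : e.getValue()) {
            value.write(out);                  // matched by value.readFields(in)
          }
        }
      }
    }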
http://git-wip-us.apache.org/repos/asf/hive/blob/80f548af/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/HashTableLoader.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/HashTableLoader.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/HashTableLoader.java
index 10e3497..c2462a0 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/HashTableLoader.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/HashTableLoader.java
@@ -46,6 +46,7 @@ import org.apache.hadoop.hive.ql.plan.MapJoinDesc;
 import org.apache.hadoop.hive.ql.plan.MapredLocalWork;
 import org.apache.hadoop.hive.ql.plan.OperatorDesc;
 import org.apache.hadoop.hive.ql.plan.SparkBucketMapJoinContext;
+import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 import org.apache.hadoop.mapred.JobConf;
 
@@ -62,6 +63,8 @@ public class HashTableLoader implements org.apache.hadoop.hive.ql.exec.HashTable
   private MapJoinOperator joinOp;
   private MapJoinDesc desc;
 
+  private boolean useFastContainer = false;
+
   @Override
   public void init(ExecMapperContext context, MapredContext mrContext, Configuration hconf,
       MapJoinOperator joinOp) {
@@ -69,6 +72,12 @@ public class HashTableLoader implements org.apache.hadoop.hive.ql.exec.HashTable
     this.hconf = hconf;
     this.joinOp = joinOp;
     this.desc = joinOp.getConf();
+    if (desc.getVectorMode() && HiveConf.getBoolVar(
+        hconf, HiveConf.ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_FAST_HASHTABLE_ENABLED)) {
+      VectorMapJoinDesc vectorDesc = desc.getVectorDesc();
+      useFastContainer = vectorDesc != null && vectorDesc.hashTableImplementationType() ==
+          VectorMapJoinDesc.HashTableImplementationType.FAST;
+    }
   }
 
   @Override
@@ -98,7 +107,7 @@ public class HashTableLoader implements org.apache.hadoop.hive.ql.exec.HashTable
       FileSystem fs = FileSystem.get(baseDir.toUri(), hconf);
       BucketMapJoinContext mapJoinCtx = localWork.getBucketMapjoinContext();
       boolean firstContainer = true;
-      boolean useOptimizedContainer = HiveConf.getBoolVar(
+      boolean useOptimizedContainer = !useFastContainer && HiveConf.getBoolVar(
           hconf, HiveConf.ConfVars.HIVEMAPJOINUSEOPTIMIZEDTABLE);
       for (int pos = 0; pos < mapJoinTables.length; pos++) {
         if (pos == desc.getPosBigTable() || mapJoinTables[pos] != null) {
@@ -146,14 +155,17 @@ public class HashTableLoader implements org.apache.hadoop.hive.ql.exec.HashTable
       MapJoinTableContainerSerDe mapJoinTableSerde) throws HiveException {
     LOG.info("\tLoad back all hashtable files from tmp folder uri:" + path);
     if (!SparkUtilities.isDedicatedCluster(hconf)) {
-      return mapJoinTableSerde.load(fs, path, hconf);
+      return useFastContainer ? mapJoinTableSerde.loadFastContainer(desc, fs, path, hconf) :
+          mapJoinTableSerde.load(fs, path, hconf);
     }
     MapJoinTableContainer mapJoinTable = SmallTableCache.get(path);
     if (mapJoinTable == null) {
       synchronized (path.toString().intern()) {
         mapJoinTable = SmallTableCache.get(path);
         if (mapJoinTable == null) {
-          mapJoinTable = mapJoinTableSerde.load(fs, path, hconf);
+          mapJoinTable = useFastContainer ?
+              mapJoinTableSerde.loadFastContainer(desc, fs, path, hconf) :
+              mapJoinTableSerde.load(fs, path, hconf);
           SmallTableCache.cache(path, mapJoinTable);
         }
       }
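The double-checked lookup above, synchronized on the interned path string, guarantees that each small table is deserialized at most once per executor even when many tasks request it concurrently. The same guarantee can be stated more compactly with a per-key computeIfAbsent; a generic stand-alone sketch (not Hive code) of the invariant being relied on:

    import java.util.concurrent.ConcurrentHashMap;
    import java.util.function.Function;

    // Generic sketch of the load-once-per-key behavior of the SmallTableCache use above.
    class LoadOnceCache<K, V> {
      private final ConcurrentHashMap<K, V> cache = new ConcurrentHashMap<>();

      V getOrLoad(K key, Function<K, V> loader) {
        // computeIfAbsent runs the loader at most once per key, which is what
        // the synchronized double-check on path.toString().intern() achieves.
        return cache.computeIfAbsent(key, loader);
      }
    }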
http://git-wip-us.apache.org/repos/asf/hive/blob/80f548af/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinCommonOperator.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinCommonOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinCommonOperator.java
index 87ebcf2..efad421 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinCommonOperator.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinCommonOperator.java
@@ -541,7 +541,9 @@ public abstract class VectorMapJoinCommonOperator extends MapJoinOperator implem
       break;
     case FAST:
       // Use our specialized hash table loader.
-      hashTableLoader = new VectorMapJoinFastHashTableLoader();
+      hashTableLoader = HiveConf.getVar(
+          hconf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("spark") ?
+          HashTableLoaderFactory.getLoader(hconf) : new VectorMapJoinFastHashTableLoader();
       break;
     default:
       throw new RuntimeException("Unknown vector map join hash table implementation type " + hashTableImplementationType.name());
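For context, HashTableLoaderFactory.getLoader() (used but not modified by this patch) resolves the engine-specific loader, so on Spark the FAST case now ends up in the HashTableLoader patched above, which in turn calls loadFastContainer(). A simplified sketch of that dispatch (treat the branch details as an approximation of the factory, not a quotation of it):

    // Approximate shape of the dispatch behind HashTableLoaderFactory.getLoader().
    static org.apache.hadoop.hive.ql.exec.HashTableLoader getLoaderSketch(Configuration hconf) {
      String engine = HiveConf.getVar(hconf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE);
      if ("spark".equals(engine)) {
        // The Spark loader patched above: honors useFastContainer.
        return new org.apache.hadoop.hive.ql.exec.spark.HashTableLoader();
      } else if ("tez".equals(engine)) {
        return new org.apache.hadoop.hive.ql.exec.tez.HashTableLoader();
      }
      return new org.apache.hadoop.hive.ql.exec.mr.HashTableLoader();
    }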
http://git-wip-us.apache.org/repos/asf/hive/blob/80f548af/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastTableContainer.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastTableContainer.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastTableContainer.java
index f2080f4..cf6c0e3 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastTableContainer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/VectorMapJoinFastTableContainer.java
@@ -195,7 +195,7 @@ public class VectorMapJoinFastTableContainer implements VectorMapJoinTableContai
 
   @Override
   public void clear() {
-    throw new RuntimeException("Not applicable");
+    // Do nothing
   }
 
   @Override

http://git-wip-us.apache.org/repos/asf/hive/blob/80f548af/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
index 82c3e50..4f66cd6 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
@@ -157,6 +157,7 @@ public class Vectorizer implements PhysicalPlanResolver {
   private PhysicalContext physicalContext = null;
   private HiveConf hiveConf;
+  private boolean isSpark;
 
   public Vectorizer() {
@@ -873,6 +874,7 @@ public class Vectorizer implements PhysicalPlanResolver {
       LOG.info("Vectorization is disabled");
       return physicalContext;
     }
+    isSpark = (HiveConf.getVar(hiveConf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("spark"));
     // create dispatcher and graph walker
     Dispatcher disp = new VectorizationDispatcher(physicalContext);
     TaskGraphWalker ogw = new TaskGraphWalker(disp);
@@ -1444,8 +1446,6 @@
     Operator<? extends OperatorDesc> vectorOp = null;
     Class<? extends Operator<?>> opClass = null;
 
-    boolean isOuterJoin = !desc.getNoOuterJoin();
-
     VectorMapJoinDesc.HashTableImplementationType hashTableImplementationType = HashTableImplementationType.NONE;
     VectorMapJoinDesc.HashTableKind hashTableKind = HashTableKind.NONE;
     VectorMapJoinDesc.HashTableKeyType hashTableKeyType = HashTableKeyType.NONE;
@@ -1666,7 +1666,7 @@
     case MAPJOIN:
       {
         MapJoinDesc desc = (MapJoinDesc) op.getConf();
-        boolean specialize = canSpecializeMapJoin(op, desc, isTez);
+        boolean specialize = canSpecializeMapJoin(op, desc, isTez || isSpark);
 
         if (!specialize) {

http://git-wip-us.apache.org/repos/asf/hive/blob/80f548af/ql/src/java/org/apache/hadoop/hive/ql/optimizer/spark/SparkMapJoinOptimizer.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/spark/SparkMapJoinOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/spark/SparkMapJoinOptimizer.java
index 39d1f18..46eab65 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/spark/SparkMapJoinOptimizer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/spark/SparkMapJoinOptimizer.java
@@ -46,6 +46,8 @@ import org.apache.hadoop.hive.ql.plan.MapJoinDesc;
 import org.apache.hadoop.hive.ql.plan.OpTraits;
 import org.apache.hadoop.hive.ql.plan.OperatorDesc;
 import org.apache.hadoop.hive.ql.plan.Statistics;
+import org.apache.hadoop.hive.serde.serdeConstants;
+import org.apache.hadoop.hive.serde2.binarysortable.BinarySortableSerDe;
 
 /**
  * SparkMapJoinOptimizer cloned from ConvertJoinMapJoin is an optimization that replaces a common join
@@ -89,6 +91,14 @@ public class SparkMapJoinOptimizer implements NodeProcessor {
     LOG.info("Convert to non-bucketed map join");
     MapJoinOperator mapJoinOp = convertJoinMapJoin(joinOp, context, mapJoinConversionPos);
+    // For native vectorized map join, the key SerDe must be BinarySortableSerDe,
+    // since the fast hash table stores and compares keys as serialized bytes.
+    // Note: the map join may not actually be vectorized natively later, but
+    // changing the SerDe does not hurt correctness.
+    if (conf.getBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_ENABLED) &&
+        conf.getBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED)) {
+      mapJoinOp.getConf().getKeyTblDesc().getProperties().setProperty(
+          serdeConstants.SERIALIZATION_LIB, BinarySortableSerDe.class.getName());
+    }
     if (conf.getBoolVar(HiveConf.ConfVars.HIVEOPTBUCKETMAPJOIN)) {
       LOG.info("Check if it can be converted to bucketed map join");
       numBuckets = convertJoinBucketMapJoin(joinOp, mapJoinOp,

http://git-wip-us.apache.org/repos/asf/hive/blob/80f548af/ql/src/test/results/clientpositive/spark/vector_inner_join.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/spark/vector_inner_join.q.out b/ql/src/test/results/clientpositive/spark/vector_inner_join.q.out
new file mode 100644
index 0000000..d1b775f
--- /dev/null
+++ b/ql/src/test/results/clientpositive/spark/vector_inner_join.q.out
@@ -0,0 +1,853 @@
+PREHOOK: query: CREATE TABLE orc_table_1a(a INT) STORED AS ORC
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@orc_table_1a
+POSTHOOK: query: CREATE TABLE orc_table_1a(a INT) STORED AS ORC
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@orc_table_1a
+PREHOOK: query: CREATE TABLE orc_table_2a(c INT) STORED AS ORC
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output:
default@orc_table_2a +POSTHOOK: query: CREATE TABLE orc_table_2a(c INT) STORED AS ORC +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@orc_table_2a +PREHOOK: query: insert into table orc_table_1a values(1),(1), (2),(3) +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__1 +PREHOOK: Output: default@orc_table_1a +POSTHOOK: query: insert into table orc_table_1a values(1),(1), (2),(3) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__1 +POSTHOOK: Output: default@orc_table_1a +POSTHOOK: Lineage: orc_table_1a.a EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: insert into table orc_table_2a values(0),(2), (3),(null),(4) +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__2 +PREHOOK: Output: default@orc_table_2a +POSTHOOK: query: insert into table orc_table_2a values(0),(2), (3),(null),(4) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__2 +POSTHOOK: Output: default@orc_table_2a +POSTHOOK: Lineage: orc_table_2a.c EXPRESSION [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: explain +select t1.a from orc_table_2a t2 join orc_table_1a t1 on t1.a = t2.c where t1.a > 2 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select t1.a from orc_table_2a t2 join orc_table_1a t1 on t1.a = t2.c where t1.a > 2 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-2 + Spark +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t2 + Statistics: Num rows: 5 Data size: 16 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (c > 2) (type: boolean) + Statistics: Num rows: 1 Data size: 3 Basic stats: COMPLETE Column stats: NONE + Spark HashTable Sink Operator + keys: + 0 c (type: int) + 1 a (type: int) + Local Work: + Map Reduce Local Work + Execution mode: vectorized + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 2 + Map Operator Tree: + TableScan + alias: t1 + Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (a > 2) (type: boolean) + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 c (type: int) + 1 a (type: int) + outputColumnNames: _col4 + input vertices: + 0 Map 1 + Statistics: Num rows: 1 Data size: 3 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col4 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 3 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 3 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Local Work: + Map Reduce Local Work + Execution mode: vectorized + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select t1.a from orc_table_2a t2 join orc_table_1a t1 on t1.a = t2.c where t1.a > 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_table_1a +PREHOOK: Input: default@orc_table_2a +#### A masked pattern 
was here #### +POSTHOOK: query: select t1.a from orc_table_2a t2 join orc_table_1a t1 on t1.a = t2.c where t1.a > 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_table_1a +POSTHOOK: Input: default@orc_table_2a +#### A masked pattern was here #### +3 +PREHOOK: query: explain +select t2.c from orc_table_2a t2 left semi join orc_table_1a t1 on t1.a = t2.c where t2.c > 2 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select t2.c from orc_table_2a t2 left semi join orc_table_1a t1 on t1.a = t2.c where t2.c > 2 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-2 + Spark +#### A masked pattern was here #### + Vertices: + Map 2 + Map Operator Tree: + TableScan + alias: t1 + Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: a is not null (type: boolean) + Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: a (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: _col0 (type: int) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Spark HashTable Sink Operator + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + Local Work: + Map Reduce Local Work + Execution mode: vectorized + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t2 + Statistics: Num rows: 5 Data size: 16 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (c > 2) (type: boolean) + Statistics: Num rows: 1 Data size: 3 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: c (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 3 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Left Semi Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col0 + input vertices: + 1 Map 2 + Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Local Work: + Map Reduce Local Work + Execution mode: vectorized + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select t2.c from orc_table_2a t2 left semi join orc_table_1a t1 on t1.a = t2.c where t2.c > 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_table_1a +PREHOOK: Input: default@orc_table_2a +#### A masked pattern was here #### +POSTHOOK: query: select t2.c from orc_table_2a t2 left semi join orc_table_1a t1 on t1.a = t2.c where t2.c > 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_table_1a +POSTHOOK: Input: default@orc_table_2a +#### A masked pattern was here #### +3 +PREHOOK: query: CREATE TABLE orc_table_1b(v1 STRING, a INT) STORED AS ORC +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@orc_table_1b +POSTHOOK: query: CREATE TABLE orc_table_1b(v1 STRING, a INT) STORED AS ORC +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: 
default@orc_table_1b +PREHOOK: query: CREATE TABLE orc_table_2b(c INT, v2 STRING) STORED AS ORC +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@orc_table_2b +POSTHOOK: query: CREATE TABLE orc_table_2b(c INT, v2 STRING) STORED AS ORC +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@orc_table_2b +PREHOOK: query: insert into table orc_table_1b values("one", 1),("one", 1), ("two", 2),("three", 3) +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__3 +PREHOOK: Output: default@orc_table_1b +POSTHOOK: query: insert into table orc_table_1b values("one", 1),("one", 1), ("two", 2),("three", 3) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__3 +POSTHOOK: Output: default@orc_table_1b +POSTHOOK: Lineage: orc_table_1b.a EXPRESSION [(values__tmp__table__3)values__tmp__table__3.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: orc_table_1b.v1 SIMPLE [(values__tmp__table__3)values__tmp__table__3.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: insert into table orc_table_2b values(0, "ZERO"),(2, "TWO"), (3, "THREE"),(null, "<NULL>"),(4, "FOUR") +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__4 +PREHOOK: Output: default@orc_table_2b +POSTHOOK: query: insert into table orc_table_2b values(0, "ZERO"),(2, "TWO"), (3, "THREE"),(null, "<NULL>"),(4, "FOUR") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__4 +POSTHOOK: Output: default@orc_table_2b +POSTHOOK: Lineage: orc_table_2b.c EXPRESSION [(values__tmp__table__4)values__tmp__table__4.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: orc_table_2b.v2 SIMPLE [(values__tmp__table__4)values__tmp__table__4.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +PREHOOK: query: explain +select t1.v1, t1.a from orc_table_2b t2 join orc_table_1b t1 on t1.a = t2.c where t1.a > 2 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select t1.v1, t1.a from orc_table_2b t2 join orc_table_1b t1 on t1.a = t2.c where t1.a > 2 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-2 + Spark +#### A masked pattern was here #### + Vertices: + Map 2 + Map Operator Tree: + TableScan + alias: t1 + Statistics: Num rows: 4 Data size: 364 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (a > 2) (type: boolean) + Statistics: Num rows: 1 Data size: 91 Basic stats: COMPLETE Column stats: NONE + Spark HashTable Sink Operator + keys: + 0 c (type: int) + 1 a (type: int) + Local Work: + Map Reduce Local Work + Execution mode: vectorized + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t2 + Statistics: Num rows: 5 Data size: 456 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (c > 2) (type: boolean) + Statistics: Num rows: 1 Data size: 91 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 c (type: int) + 1 a (type: int) + outputColumnNames: _col5, _col6 + input vertices: + 1 Map 2 + Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col5 (type: string), _col6 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE + 
File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Local Work: + Map Reduce Local Work + Execution mode: vectorized + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select t1.v1, t1.a from orc_table_2b t2 join orc_table_1b t1 on t1.a = t2.c where t1.a > 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_table_1b +PREHOOK: Input: default@orc_table_2b +#### A masked pattern was here #### +POSTHOOK: query: select t1.v1, t1.a from orc_table_2b t2 join orc_table_1b t1 on t1.a = t2.c where t1.a > 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_table_1b +POSTHOOK: Input: default@orc_table_2b +#### A masked pattern was here #### +three 3 +PREHOOK: query: explain +select t1.v1, t1.a, t2.c, t2.v2 from orc_table_2b t2 join orc_table_1b t1 on t1.a = t2.c where t1.a > 2 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select t1.v1, t1.a, t2.c, t2.v2 from orc_table_2b t2 join orc_table_1b t1 on t1.a = t2.c where t1.a > 2 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-2 + Spark +#### A masked pattern was here #### + Vertices: + Map 2 + Map Operator Tree: + TableScan + alias: t1 + Statistics: Num rows: 4 Data size: 364 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (a > 2) (type: boolean) + Statistics: Num rows: 1 Data size: 91 Basic stats: COMPLETE Column stats: NONE + Spark HashTable Sink Operator + keys: + 0 c (type: int) + 1 a (type: int) + Local Work: + Map Reduce Local Work + Execution mode: vectorized + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t2 + Statistics: Num rows: 5 Data size: 456 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (c > 2) (type: boolean) + Statistics: Num rows: 1 Data size: 91 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 c (type: int) + 1 a (type: int) + outputColumnNames: _col0, _col1, _col5, _col6 + input vertices: + 1 Map 2 + Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col5 (type: string), _col6 (type: int), _col0 (type: int), _col1 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Local Work: + Map Reduce Local Work + Execution mode: vectorized + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select t1.v1, t1.a, t2.c, t2.v2 from orc_table_2b t2 join orc_table_1b t1 on t1.a = t2.c where t1.a > 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_table_1b +PREHOOK: Input: default@orc_table_2b +#### A masked pattern was here #### +POSTHOOK: query: select t1.v1, t1.a, t2.c, t2.v2 from orc_table_2b t2 
join orc_table_1b t1 on t1.a = t2.c where t1.a > 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_table_1b +POSTHOOK: Input: default@orc_table_2b +#### A masked pattern was here #### +three 3 3 THREE +PREHOOK: query: explain +select t1.v1, t1.a*2, t2.c*5, t2.v2 from orc_table_2b t2 join orc_table_1b t1 on t1.a = t2.c where t1.a > 2 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select t1.v1, t1.a*2, t2.c*5, t2.v2 from orc_table_2b t2 join orc_table_1b t1 on t1.a = t2.c where t1.a > 2 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-2 + Spark +#### A masked pattern was here #### + Vertices: + Map 2 + Map Operator Tree: + TableScan + alias: t1 + Statistics: Num rows: 4 Data size: 364 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (a > 2) (type: boolean) + Statistics: Num rows: 1 Data size: 91 Basic stats: COMPLETE Column stats: NONE + Spark HashTable Sink Operator + keys: + 0 c (type: int) + 1 a (type: int) + Local Work: + Map Reduce Local Work + Execution mode: vectorized + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t2 + Statistics: Num rows: 5 Data size: 456 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (c > 2) (type: boolean) + Statistics: Num rows: 1 Data size: 91 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 c (type: int) + 1 a (type: int) + outputColumnNames: _col0, _col1, _col5, _col6 + input vertices: + 1 Map 2 + Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col5 (type: string), (_col6 * 2) (type: int), (_col0 * 5) (type: int), _col1 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Local Work: + Map Reduce Local Work + Execution mode: vectorized + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select t1.v1, t1.a*2, t2.c*5, t2.v2 from orc_table_2b t2 join orc_table_1b t1 on t1.a = t2.c where t1.a > 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_table_1b +PREHOOK: Input: default@orc_table_2b +#### A masked pattern was here #### +POSTHOOK: query: select t1.v1, t1.a*2, t2.c*5, t2.v2 from orc_table_2b t2 join orc_table_1b t1 on t1.a = t2.c where t1.a > 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_table_1b +POSTHOOK: Input: default@orc_table_2b +#### A masked pattern was here #### +three 6 15 THREE +PREHOOK: query: explain +select t1.v1, t2.v2, t2.c from orc_table_2b t2 join orc_table_1b t1 on t1.a = t2.c where t1.a > 2 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select t1.v1, t2.v2, t2.c from orc_table_2b t2 join orc_table_1b t1 on t1.a = t2.c where t1.a > 2 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-2 + Spark +#### A masked pattern was here #### + Vertices: + Map 2 + Map Operator 
Tree: + TableScan + alias: t1 + Statistics: Num rows: 4 Data size: 364 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (a > 2) (type: boolean) + Statistics: Num rows: 1 Data size: 91 Basic stats: COMPLETE Column stats: NONE + Spark HashTable Sink Operator + keys: + 0 c (type: int) + 1 a (type: int) + Local Work: + Map Reduce Local Work + Execution mode: vectorized + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t2 + Statistics: Num rows: 5 Data size: 456 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (c > 2) (type: boolean) + Statistics: Num rows: 1 Data size: 91 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 c (type: int) + 1 a (type: int) + outputColumnNames: _col0, _col1, _col5 + input vertices: + 1 Map 2 + Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col5 (type: string), _col1 (type: string), _col0 (type: int) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Local Work: + Map Reduce Local Work + Execution mode: vectorized + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select t1.v1, t2.v2, t2.c from orc_table_2b t2 join orc_table_1b t1 on t1.a = t2.c where t1.a > 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_table_1b +PREHOOK: Input: default@orc_table_2b +#### A masked pattern was here #### +POSTHOOK: query: select t1.v1, t2.v2, t2.c from orc_table_2b t2 join orc_table_1b t1 on t1.a = t2.c where t1.a > 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_table_1b +POSTHOOK: Input: default@orc_table_2b +#### A masked pattern was here #### +three THREE 3 +PREHOOK: query: explain +select t1.a, t1.v1, t2.v2 from orc_table_2b t2 join orc_table_1b t1 on t1.a = t2.c where t1.a > 2 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select t1.a, t1.v1, t2.v2 from orc_table_2b t2 join orc_table_1b t1 on t1.a = t2.c where t1.a > 2 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-2 + Spark +#### A masked pattern was here #### + Vertices: + Map 2 + Map Operator Tree: + TableScan + alias: t1 + Statistics: Num rows: 4 Data size: 364 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (a > 2) (type: boolean) + Statistics: Num rows: 1 Data size: 91 Basic stats: COMPLETE Column stats: NONE + Spark HashTable Sink Operator + keys: + 0 c (type: int) + 1 a (type: int) + Local Work: + Map Reduce Local Work + Execution mode: vectorized + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t2 + Statistics: Num rows: 5 Data size: 456 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (c > 2) (type: boolean) + Statistics: Num rows: 1 Data size: 91 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 c (type: int) + 1 a 
(type: int) + outputColumnNames: _col1, _col5, _col6 + input vertices: + 1 Map 2 + Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col6 (type: int), _col5 (type: string), _col1 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Local Work: + Map Reduce Local Work + Execution mode: vectorized + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select t1.a, t1.v1, t2.v2 from orc_table_2b t2 join orc_table_1b t1 on t1.a = t2.c where t1.a > 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_table_1b +PREHOOK: Input: default@orc_table_2b +#### A masked pattern was here #### +POSTHOOK: query: select t1.a, t1.v1, t2.v2 from orc_table_2b t2 join orc_table_1b t1 on t1.a = t2.c where t1.a > 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_table_1b +POSTHOOK: Input: default@orc_table_2b +#### A masked pattern was here #### +3 three THREE +PREHOOK: query: explain +select t1.v1, t2.v2, t2.c from orc_table_1b t1 join orc_table_2b t2 on t1.a = t2.c where t1.a > 2 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select t1.v1, t2.v2, t2.c from orc_table_1b t1 join orc_table_2b t2 on t1.a = t2.c where t1.a > 2 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-2 + Spark +#### A masked pattern was here #### + Vertices: + Map 2 + Map Operator Tree: + TableScan + alias: t2 + Statistics: Num rows: 5 Data size: 456 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (c > 2) (type: boolean) + Statistics: Num rows: 1 Data size: 91 Basic stats: COMPLETE Column stats: NONE + Spark HashTable Sink Operator + keys: + 0 a (type: int) + 1 c (type: int) + Local Work: + Map Reduce Local Work + Execution mode: vectorized + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t1 + Statistics: Num rows: 4 Data size: 364 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (a > 2) (type: boolean) + Statistics: Num rows: 1 Data size: 91 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 a (type: int) + 1 c (type: int) + outputColumnNames: _col0, _col5, _col6 + input vertices: + 1 Map 2 + Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col6 (type: string), _col5 (type: int) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Local Work: + Map Reduce Local Work + Execution mode: vectorized + + Stage: Stage-0 + Fetch Operator + limit: -1 + 
Processor Tree: + ListSink + +PREHOOK: query: select t1.v1, t2.v2, t2.c from orc_table_1b t1 join orc_table_2b t2 on t1.a = t2.c where t1.a > 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_table_1b +PREHOOK: Input: default@orc_table_2b +#### A masked pattern was here #### +POSTHOOK: query: select t1.v1, t2.v2, t2.c from orc_table_1b t1 join orc_table_2b t2 on t1.a = t2.c where t1.a > 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_table_1b +POSTHOOK: Input: default@orc_table_2b +#### A masked pattern was here #### +three THREE 3 +PREHOOK: query: explain +select t1.a, t1.v1, t2.v2 from orc_table_1b t1 join orc_table_2b t2 on t1.a = t2.c where t1.a > 2 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select t1.a, t1.v1, t2.v2 from orc_table_1b t1 join orc_table_2b t2 on t1.a = t2.c where t1.a > 2 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-2 + Spark +#### A masked pattern was here #### + Vertices: + Map 2 + Map Operator Tree: + TableScan + alias: t2 + Statistics: Num rows: 5 Data size: 456 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (c > 2) (type: boolean) + Statistics: Num rows: 1 Data size: 91 Basic stats: COMPLETE Column stats: NONE + Spark HashTable Sink Operator + keys: + 0 a (type: int) + 1 c (type: int) + Local Work: + Map Reduce Local Work + Execution mode: vectorized + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t1 + Statistics: Num rows: 4 Data size: 364 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (a > 2) (type: boolean) + Statistics: Num rows: 1 Data size: 91 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 a (type: int) + 1 c (type: int) + outputColumnNames: _col0, _col1, _col6 + input vertices: + 1 Map 2 + Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col1 (type: int), _col0 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Local Work: + Map Reduce Local Work + Execution mode: vectorized + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select t1.a, t1.v1, t2.v2 from orc_table_1b t1 join orc_table_2b t2 on t1.a = t2.c where t1.a > 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_table_1b +PREHOOK: Input: default@orc_table_2b +#### A masked pattern was here #### +POSTHOOK: query: select t1.a, t1.v1, t2.v2 from orc_table_1b t1 join orc_table_2b t2 on t1.a = t2.c where t1.a > 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_table_1b +POSTHOOK: Input: default@orc_table_2b +#### A masked pattern was here #### +3 three THREE http://git-wip-us.apache.org/repos/asf/hive/blob/80f548af/ql/src/test/results/clientpositive/spark/vector_outer_join0.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/spark/vector_outer_join0.q.out 
b/ql/src/test/results/clientpositive/spark/vector_outer_join0.q.out new file mode 100644 index 0000000..cc66db5 --- /dev/null +++ b/ql/src/test/results/clientpositive/spark/vector_outer_join0.q.out @@ -0,0 +1,242 @@ +PREHOOK: query: CREATE TABLE orc_table_1(v1 STRING, a INT) STORED AS ORC +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@orc_table_1 +POSTHOOK: query: CREATE TABLE orc_table_1(v1 STRING, a INT) STORED AS ORC +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@orc_table_1 +PREHOOK: query: CREATE TABLE orc_table_2(c INT, v2 STRING) STORED AS ORC +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@orc_table_2 +POSTHOOK: query: CREATE TABLE orc_table_2(c INT, v2 STRING) STORED AS ORC +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@orc_table_2 +PREHOOK: query: insert into table orc_table_1 values ("<null1>", null),("one", 1),("one", 1),("two", 2),("three", 3),("<null2>", null) +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__1 +PREHOOK: Output: default@orc_table_1 +POSTHOOK: query: insert into table orc_table_1 values ("<null1>", null),("one", 1),("one", 1),("two", 2),("three", 3),("<null2>", null) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__1 +POSTHOOK: Output: default@orc_table_1 +POSTHOOK: Lineage: orc_table_1.a EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: orc_table_1.v1 SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: insert into table orc_table_2 values (0, "ZERO"),(2, "TWO"), (3, "THREE"),(null, "<NULL1>"),(4, "FOUR"),(null, "<NULL2>") +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__2 +PREHOOK: Output: default@orc_table_2 +POSTHOOK: query: insert into table orc_table_2 values (0, "ZERO"),(2, "TWO"), (3, "THREE"),(null, "<NULL1>"),(4, "FOUR"),(null, "<NULL2>") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__2 +POSTHOOK: Output: default@orc_table_2 +POSTHOOK: Lineage: orc_table_2.c EXPRESSION [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: orc_table_2.v2 SIMPLE [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +PREHOOK: query: select * from orc_table_1 +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_table_1 +#### A masked pattern was here #### +POSTHOOK: query: select * from orc_table_1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_table_1 +#### A masked pattern was here #### +<null1> NULL +<null2> NULL +one 1 +one 1 +three 3 +two 2 +PREHOOK: query: select * from orc_table_2 +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_table_2 +#### A masked pattern was here #### +POSTHOOK: query: select * from orc_table_2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_table_2 +#### A masked pattern was here #### +0 ZERO +2 TWO +3 THREE +4 FOUR +NULL <NULL1> +NULL <NULL2> +PREHOOK: query: explain +select t1.v1, t1.a, t2.c, t2.v2 from orc_table_1 t1 left outer join orc_table_2 t2 on t1.a = t2.c +PREHOOK: type: QUERY +POSTHOOK: query: explain +select t1.v1, t1.a, t2.c, t2.v2 from orc_table_1 t1 left outer join orc_table_2 t2 on t1.a = t2.c +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-2 is a root stage + Stage-1 depends on 
stages: Stage-2 + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-2 + Spark +#### A masked pattern was here #### + Vertices: + Map 2 + Map Operator Tree: + TableScan + alias: t2 + Statistics: Num rows: 6 Data size: 550 Basic stats: COMPLETE Column stats: NONE + Spark HashTable Sink Operator + keys: + 0 a (type: int) + 1 c (type: int) + Local Work: + Map Reduce Local Work + Execution mode: vectorized + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t1 + Statistics: Num rows: 6 Data size: 544 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Left Outer Join0 to 1 + keys: + 0 a (type: int) + 1 c (type: int) + outputColumnNames: _col0, _col1, _col5, _col6 + input vertices: + 1 Map 2 + Statistics: Num rows: 6 Data size: 598 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: int), _col5 (type: int), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 6 Data size: 598 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 6 Data size: 598 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Local Work: + Map Reduce Local Work + Execution mode: vectorized + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: -- SORT_QUERY_RESULTS + +select t1.v1, t1.a, t2.c, t2.v2 from orc_table_1 t1 left outer join orc_table_2 t2 on t1.a = t2.c +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_table_1 +PREHOOK: Input: default@orc_table_2 +#### A masked pattern was here #### +POSTHOOK: query: -- SORT_QUERY_RESULTS + +select t1.v1, t1.a, t2.c, t2.v2 from orc_table_1 t1 left outer join orc_table_2 t2 on t1.a = t2.c +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_table_1 +POSTHOOK: Input: default@orc_table_2 +#### A masked pattern was here #### +<null1> NULL NULL NULL +<null2> NULL NULL NULL +one 1 NULL NULL +one 1 NULL NULL +three 3 3 THREE +two 2 2 TWO +PREHOOK: query: explain +select t1.v1, t1.a, t2.c, t2.v2 from orc_table_1 t1 right outer join orc_table_2 t2 on t1.a = t2.c +PREHOOK: type: QUERY +POSTHOOK: query: explain +select t1.v1, t1.a, t2.c, t2.v2 from orc_table_1 t1 right outer join orc_table_2 t2 on t1.a = t2.c +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-2 + Spark +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t1 + Statistics: Num rows: 6 Data size: 544 Basic stats: COMPLETE Column stats: NONE + Spark HashTable Sink Operator + keys: + 0 a (type: int) + 1 c (type: int) + Local Work: + Map Reduce Local Work + Execution mode: vectorized + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 2 + Map Operator Tree: + TableScan + alias: t2 + Statistics: Num rows: 6 Data size: 550 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Right Outer Join0 to 1 + keys: + 0 a (type: int) + 1 c (type: int) + outputColumnNames: _col0, _col1, _col5, _col6 + input vertices: + 0 Map 1 + Statistics: Num rows: 6 Data size: 598 Basic stats: COMPLETE Column stats: NONE + Select Operator + 
expressions: _col0 (type: string), _col1 (type: int), _col5 (type: int), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 6 Data size: 598 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 6 Data size: 598 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Local Work: + Map Reduce Local Work + Execution mode: vectorized + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: -- SORT_QUERY_RESULTS + +select t1.v1, t1.a, t2.c, t2.v2 from orc_table_1 t1 right outer join orc_table_2 t2 on t1.a = t2.c +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_table_1 +PREHOOK: Input: default@orc_table_2 +#### A masked pattern was here #### +POSTHOOK: query: -- SORT_QUERY_RESULTS + +select t1.v1, t1.a, t2.c, t2.v2 from orc_table_1 t1 right outer join orc_table_2 t2 on t1.a = t2.c +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_table_1 +POSTHOOK: Input: default@orc_table_2 +#### A masked pattern was here #### +NULL NULL 0 ZERO +NULL NULL 4 FOUR +NULL NULL NULL <NULL1> +NULL NULL NULL <NULL2> +three 3 3 THREE +two 2 2 TWO
