http://git-wip-us.apache.org/repos/asf/hive/blob/c72d073c/ql/src/java/org/apache/hadoop/hive/ql/plan/MapJoinDesc.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/MapJoinDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/MapJoinDesc.java index 0192fb5..cee9100 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/MapJoinDesc.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/MapJoinDesc.java @@ -67,7 +67,7 @@ public class MapJoinDesc extends JoinDesc implements Serializable { private boolean isBucketMapJoin; // Hash table memory usage allowed; used in case of non-staged mapjoin. - private float hashtableMemoryUsage; + private float hashtableMemoryUsage; // This is a percentage value between 0 and 1 protected boolean genJoinKeys = true; private boolean isHybridHashJoin;
http://git-wip-us.apache.org/repos/asf/hive/blob/c72d073c/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastLongHashMap.java ---------------------------------------------------------------------- diff --git a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastLongHashMap.java b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastLongHashMap.java index eb38b19..a45275b 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastLongHashMap.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastLongHashMap.java @@ -35,7 +35,7 @@ public class TestVectorMapJoinFastLongHashMap extends CommonFastHashTable { random = new Random(47496); VectorMapJoinFastLongHashMap map = - new VectorMapJoinFastLongHashMap(false, false, HashTableKeyType.LONG, CAPACITY, LOAD_FACTOR, WB_SIZE, 0); + new VectorMapJoinFastLongHashMap(false, false, HashTableKeyType.LONG, CAPACITY, LOAD_FACTOR, WB_SIZE); RandomLongStream randomLongKeyStream = new RandomLongStream(random); RandomByteArrayStream randomByteArrayValueStream = new RandomByteArrayStream(random); @@ -55,7 +55,7 @@ public class TestVectorMapJoinFastLongHashMap extends CommonFastHashTable { public void testPutGetMultiple() throws Exception { random = new Random(2990); - VectorMapJoinFastLongHashMap map = new VectorMapJoinFastLongHashMap(false, false, HashTableKeyType.LONG, CAPACITY, LOAD_FACTOR, WB_SIZE, 0); + VectorMapJoinFastLongHashMap map = new VectorMapJoinFastLongHashMap(false, false, HashTableKeyType.LONG, CAPACITY, LOAD_FACTOR, WB_SIZE); RandomLongStream randomLongKeyStream = new RandomLongStream(random); RandomByteArrayStream randomByteArrayValueStream = new RandomByteArrayStream(random); @@ -77,7 +77,7 @@ public class TestVectorMapJoinFastLongHashMap extends CommonFastHashTable { public void testGetNonExistent() throws Exception { random = new 
Random(16916); - VectorMapJoinFastLongHashMap map = new VectorMapJoinFastLongHashMap(false, false, HashTableKeyType.LONG, CAPACITY, LOAD_FACTOR, WB_SIZE, 0); + VectorMapJoinFastLongHashMap map = new VectorMapJoinFastLongHashMap(false, false, HashTableKeyType.LONG, CAPACITY, LOAD_FACTOR, WB_SIZE); RandomLongStream randomLongKeyStream = new RandomLongStream(random); RandomByteArrayStream randomByteArrayValueStream = new RandomByteArrayStream(random); @@ -101,7 +101,7 @@ public class TestVectorMapJoinFastLongHashMap extends CommonFastHashTable { random = new Random(26078); // Make sure the map does not expand; should be able to find space. - VectorMapJoinFastLongHashMap map = new VectorMapJoinFastLongHashMap(false, false, HashTableKeyType.LONG, CAPACITY, 1f, WB_SIZE, 0); + VectorMapJoinFastLongHashMap map = new VectorMapJoinFastLongHashMap(false, false, HashTableKeyType.LONG, CAPACITY, 1f, WB_SIZE); RandomLongStream randomLongKeyStream = new RandomLongStream(random); RandomByteArrayStream randomByteArrayValueStream = new RandomByteArrayStream(random); @@ -126,7 +126,7 @@ public class TestVectorMapJoinFastLongHashMap extends CommonFastHashTable { random = new Random(22470); // Start with capacity 1; make sure we expand on every put. - VectorMapJoinFastLongHashMap map = new VectorMapJoinFastLongHashMap(false, false, HashTableKeyType.LONG, 1, 0.0000001f, WB_SIZE, 0); + VectorMapJoinFastLongHashMap map = new VectorMapJoinFastLongHashMap(false, false, HashTableKeyType.LONG, 1, 0.0000001f, WB_SIZE); RandomLongStream randomLongKeyStream = new RandomLongStream(random); RandomByteArrayStream randomByteArrayValueStream = new RandomByteArrayStream(random); @@ -147,7 +147,7 @@ public class TestVectorMapJoinFastLongHashMap extends CommonFastHashTable { random = new Random(40719); // Use a large capacity that doesn't require expansion, yet. 
- VectorMapJoinFastLongHashMap map = new VectorMapJoinFastLongHashMap(false, false, HashTableKeyType.LONG, LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE, 0); + VectorMapJoinFastLongHashMap map = new VectorMapJoinFastLongHashMap(false, false, HashTableKeyType.LONG, LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE); RandomLongStream randomLongKeyStream = new RandomLongStream(random); @@ -172,7 +172,7 @@ public class TestVectorMapJoinFastLongHashMap extends CommonFastHashTable { random = new Random(46809); // Use a large capacity that doesn't require expansion, yet. - VectorMapJoinFastLongHashMap map = new VectorMapJoinFastLongHashMap(false, false, HashTableKeyType.LONG, MODERATE_CAPACITY, LOAD_FACTOR, MODERATE_WB_SIZE, 0); + VectorMapJoinFastLongHashMap map = new VectorMapJoinFastLongHashMap(false, false, HashTableKeyType.LONG, MODERATE_CAPACITY, LOAD_FACTOR, MODERATE_WB_SIZE); RandomLongStream randomLongKeyStream = new RandomLongStream(random); http://git-wip-us.apache.org/repos/asf/hive/blob/c72d073c/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastMultiKeyHashMap.java ---------------------------------------------------------------------- diff --git a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastMultiKeyHashMap.java b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastMultiKeyHashMap.java index 3c1b29a..944bda6 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastMultiKeyHashMap.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/fast/TestVectorMapJoinFastMultiKeyHashMap.java @@ -35,7 +35,7 @@ public class TestVectorMapJoinFastMultiKeyHashMap extends CommonFastHashTable { random = new Random(47496); VectorMapJoinFastMultiKeyHashMap map = - new VectorMapJoinFastMultiKeyHashMap(false, CAPACITY, LOAD_FACTOR, WB_SIZE, 0); + new VectorMapJoinFastMultiKeyHashMap(false, CAPACITY, LOAD_FACTOR, WB_SIZE); 
RandomByteArrayStream randomByteArrayKeyStream = new RandomByteArrayStream(random); RandomByteArrayStream randomByteArrayValueStream = new RandomByteArrayStream(random); @@ -55,7 +55,7 @@ public class TestVectorMapJoinFastMultiKeyHashMap extends CommonFastHashTable { public void testPutGetMultiple() throws Exception { random = new Random(2990); - VectorMapJoinFastMultiKeyHashMap map = new VectorMapJoinFastMultiKeyHashMap(false, CAPACITY, LOAD_FACTOR, WB_SIZE, 0); + VectorMapJoinFastMultiKeyHashMap map = new VectorMapJoinFastMultiKeyHashMap(false, CAPACITY, LOAD_FACTOR, WB_SIZE); RandomByteArrayStream randomByteArrayKeyStream = new RandomByteArrayStream(random); RandomByteArrayStream randomByteArrayValueStream = new RandomByteArrayStream(random); @@ -77,7 +77,7 @@ public class TestVectorMapJoinFastMultiKeyHashMap extends CommonFastHashTable { public void testGetNonExistent() throws Exception { random = new Random(16916); - VectorMapJoinFastMultiKeyHashMap map = new VectorMapJoinFastMultiKeyHashMap(false, CAPACITY, LOAD_FACTOR, WB_SIZE, 0); + VectorMapJoinFastMultiKeyHashMap map = new VectorMapJoinFastMultiKeyHashMap(false, CAPACITY, LOAD_FACTOR, WB_SIZE); RandomByteArrayStream randomByteArrayKeyStream = new RandomByteArrayStream(random); RandomByteArrayStream randomByteArrayValueStream = new RandomByteArrayStream(random); @@ -101,7 +101,7 @@ public class TestVectorMapJoinFastMultiKeyHashMap extends CommonFastHashTable { random = new Random(26078); // Make sure the map does not expand; should be able to find space. 
- VectorMapJoinFastMultiKeyHashMap map = new VectorMapJoinFastMultiKeyHashMap(false, CAPACITY, 1f, WB_SIZE, 0); + VectorMapJoinFastMultiKeyHashMap map = new VectorMapJoinFastMultiKeyHashMap(false, CAPACITY, 1f, WB_SIZE); RandomByteArrayStream randomByteArrayKeyStream = new RandomByteArrayStream(random); RandomByteArrayStream randomByteArrayValueStream = new RandomByteArrayStream(random); @@ -126,7 +126,7 @@ public class TestVectorMapJoinFastMultiKeyHashMap extends CommonFastHashTable { random = new Random(22470); // Start with capacity 1; make sure we expand on every put. - VectorMapJoinFastMultiKeyHashMap map = new VectorMapJoinFastMultiKeyHashMap(false, 1, 0.0000001f, WB_SIZE, 0); + VectorMapJoinFastMultiKeyHashMap map = new VectorMapJoinFastMultiKeyHashMap(false, 1, 0.0000001f, WB_SIZE); RandomByteArrayStream randomByteArrayKeyStream = new RandomByteArrayStream(random); RandomByteArrayStream randomByteArrayValueStream = new RandomByteArrayStream(random); @@ -147,7 +147,7 @@ public class TestVectorMapJoinFastMultiKeyHashMap extends CommonFastHashTable { random = new Random(5231); // Use a large capacity that doesn't require expansion, yet. - VectorMapJoinFastMultiKeyHashMap map = new VectorMapJoinFastMultiKeyHashMap(false, LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE, 0); + VectorMapJoinFastMultiKeyHashMap map = new VectorMapJoinFastMultiKeyHashMap(false, LARGE_CAPACITY, LOAD_FACTOR, LARGE_WB_SIZE); RandomByteArrayStream randomByteArrayKeyStream = new RandomByteArrayStream(random, 10); @@ -178,7 +178,7 @@ public class TestVectorMapJoinFastMultiKeyHashMap extends CommonFastHashTable { random = new Random(46809); // Use a large capacity that doesn't require expansion, yet. 
- VectorMapJoinFastMultiKeyHashMap map = new VectorMapJoinFastMultiKeyHashMap(false, MODERATE_CAPACITY, LOAD_FACTOR, MODERATE_WB_SIZE, 0); + VectorMapJoinFastMultiKeyHashMap map = new VectorMapJoinFastMultiKeyHashMap(false, MODERATE_CAPACITY, LOAD_FACTOR, MODERATE_WB_SIZE); RandomByteArrayStream randomByteArrayKeyStream = new RandomByteArrayStream(random, 10); http://git-wip-us.apache.org/repos/asf/hive/blob/c72d073c/ql/src/test/queries/clientpositive/auto_sortmerge_join_13.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/auto_sortmerge_join_13.q b/ql/src/test/queries/clientpositive/auto_sortmerge_join_13.q index 096c890..e92504a 100644 --- a/ql/src/test/queries/clientpositive/auto_sortmerge_join_13.q +++ b/ql/src/test/queries/clientpositive/auto_sortmerge_join_13.q @@ -42,6 +42,8 @@ select * from dest2; set hive.auto.convert.join.noconditionaltask=true; set hive.auto.convert.join.noconditionaltask.size=200; +set hive.mapjoin.hybridgrace.minwbsize=100; +set hive.mapjoin.hybridgrace.minnumpartitions=2; -- A SMB join followed by a multi-insert explain http://git-wip-us.apache.org/repos/asf/hive/blob/c72d073c/ql/src/test/queries/clientpositive/hybridgrace_hashjoin_1.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/hybridgrace_hashjoin_1.q b/ql/src/test/queries/clientpositive/hybridgrace_hashjoin_1.q new file mode 100644 index 0000000..c7d925e --- /dev/null +++ b/ql/src/test/queries/clientpositive/hybridgrace_hashjoin_1.q @@ -0,0 +1,258 @@ +-- Hybrid Grace Hash Join +-- Test basic functionalities: +-- 1. Various cases when hash partitions spill +-- 2. Partitioned table spilling +-- 3. 
Vectorization + +SELECT 1; + +set hive.auto.convert.join=true; +set hive.auto.convert.join.noconditionaltask.size=1300000; +set hive.mapjoin.optimized.hashtable.wbsize=880000; +set hive.mapjoin.hybridgrace.memcheckfrequency=1024; + +set hive.mapjoin.hybridgrace.hashtable=false; + +-- Base result for inner join +explain +select count(*) from +(select c.ctinyint + from alltypesorc c + inner join alltypesorc cd + on cd.cint = c.cint + where c.cint < 2000000000) t1 +; + +select count(*) from +(select c.ctinyint + from alltypesorc c + inner join alltypesorc cd + on cd.cint = c.cint + where c.cint < 2000000000) t1 +; + +set hive.mapjoin.hybridgrace.hashtable=true; + +-- Two partitions are created. One in memory, one on disk on creation. +-- The one in memory will eventually exceed memory limit, but won't spill. +explain +select count(*) from +(select c.ctinyint + from alltypesorc c + inner join alltypesorc cd + on cd.cint = c.cint + where c.cint < 2000000000) t1 +; + +select count(*) from +(select c.ctinyint + from alltypesorc c + inner join alltypesorc cd + on cd.cint = c.cint + where c.cint < 2000000000) t1 +; + +set hive.auto.convert.join.noconditionaltask.size=3000000; +set hive.mapjoin.optimized.hashtable.wbsize=100000; + +set hive.mapjoin.hybridgrace.hashtable=false; + +-- Base result for inner join +explain +select count(*) from +(select c.ctinyint + from alltypesorc c + inner join alltypesorc cd + on cd.cint = c.cint) t1 +; + +select count(*) from +(select c.ctinyint + from alltypesorc c + inner join alltypesorc cd + on cd.cint = c.cint) t1 +; + +set hive.mapjoin.hybridgrace.hashtable=true; + +-- 16 partitions are created: 3 in memory, 13 on disk on creation. 
+-- 1 partition is spilled during first round processing, which ends up having 2 in memory, 14 on disk +explain +select count(*) from +(select c.ctinyint + from alltypesorc c + inner join alltypesorc cd + on cd.cint = c.cint) t1 +; + +select count(*) from +(select c.ctinyint + from alltypesorc c + inner join alltypesorc cd + on cd.cint = c.cint) t1 +; + + + +set hive.mapjoin.hybridgrace.hashtable=false; + +-- Base result for outer join +explain +select count(*) from +(select c.ctinyint + from alltypesorc c + left outer join alltypesorc cd + on cd.cint = c.cint) t1 +; + +select count(*) from +(select c.ctinyint + from alltypesorc c + left outer join alltypesorc cd + on cd.cint = c.cint) t1 +; + +set hive.mapjoin.hybridgrace.hashtable=true; + +-- 32 partitions are created. 3 in memory, 29 on disk on creation. +explain +select count(*) from +(select c.ctinyint + from alltypesorc c + left outer join alltypesorc cd + on cd.cint = c.cint) t1 +; + +select count(*) from +(select c.ctinyint + from alltypesorc c + left outer join alltypesorc cd + on cd.cint = c.cint) t1 +; + + +-- Partitioned table +create table parttbl (key string, value char(20)) partitioned by (dt char(10)); +insert overwrite table parttbl partition(dt='2000-01-01') + select * from src; +insert overwrite table parttbl partition(dt='2000-01-02') + select * from src1; + +set hive.auto.convert.join.noconditionaltask.size=30000000; +set hive.mapjoin.optimized.hashtable.wbsize=10000000; + +set hive.mapjoin.hybridgrace.hashtable=false; + +-- No spill, base result +explain +select count(*) from +(select p1.value + from parttbl p1 + inner join parttbl p2 + on p1.key = p2.key) t1 +; + +select count(*) from +(select p1.value + from parttbl p1 + inner join parttbl p2 + on p1.key = p2.key) t1 +; + +set hive.mapjoin.hybridgrace.hashtable=true; + +-- No spill, 2 partitions created in memory +explain +select count(*) from +(select p1.value + from parttbl p1 + inner join parttbl p2 + on p1.key = p2.key) t1 +; + +select 
count(*) from +(select p1.value + from parttbl p1 + inner join parttbl p2 + on p1.key = p2.key) t1 +; + + +set hive.auto.convert.join.noconditionaltask.size=20000; +set hive.mapjoin.optimized.hashtable.wbsize=10000; + +set hive.mapjoin.hybridgrace.hashtable=false; + +-- Spill case base result +explain +select count(*) from +(select p1.value + from parttbl p1 + inner join parttbl p2 + on p1.key = p2.key) t1 +; + +select count(*) from +(select p1.value + from parttbl p1 + inner join parttbl p2 + on p1.key = p2.key) t1 +; + +set hive.mapjoin.hybridgrace.hashtable=true; + +-- Spill case, one partition in memory, one spilled on creation +explain +select count(*) from +(select p1.value + from parttbl p1 + inner join parttbl p2 + on p1.key = p2.key) t1 +; + +select count(*) from +(select p1.value + from parttbl p1 + inner join parttbl p2 + on p1.key = p2.key) t1 +; + +drop table parttbl; + + +-- Test vectorization +-- Test case borrowed from vector_decimal_mapjoin.q +CREATE TABLE decimal_mapjoin STORED AS ORC AS + SELECT cdouble, CAST (((cdouble*22.1)/37) AS DECIMAL(20,10)) AS cdecimal1, + CAST (((cdouble*9.3)/13) AS DECIMAL(23,14)) AS cdecimal2, + cint + FROM alltypesorc; + +SET hive.auto.convert.join=true; +SET hive.auto.convert.join.noconditionaltask=true; +SET hive.auto.convert.join.noconditionaltask.size=50000000; +set hive.mapjoin.optimized.hashtable.wbsize=10000; +SET hive.vectorized.execution.enabled=true; +set hive.mapjoin.hybridgrace.hashtable=false; + +EXPLAIN SELECT l.cint, r.cint, l.cdecimal1, r.cdecimal2 + FROM decimal_mapjoin l + JOIN decimal_mapjoin r ON l.cint = r.cint + WHERE l.cint = 6981; +SELECT l.cint, r.cint, l.cdecimal1, r.cdecimal2 + FROM decimal_mapjoin l + JOIN decimal_mapjoin r ON l.cint = r.cint + WHERE l.cint = 6981; + +set hive.mapjoin.hybridgrace.hashtable=true; + +EXPLAIN SELECT l.cint, r.cint, l.cdecimal1, r.cdecimal2 + FROM decimal_mapjoin l + JOIN decimal_mapjoin r ON l.cint = r.cint + WHERE l.cint = 6981; +SELECT l.cint, r.cint, 
l.cdecimal1, r.cdecimal2 + FROM decimal_mapjoin l + JOIN decimal_mapjoin r ON l.cint = r.cint + WHERE l.cint = 6981; + +DROP TABLE decimal_mapjoin; http://git-wip-us.apache.org/repos/asf/hive/blob/c72d073c/ql/src/test/queries/clientpositive/hybridgrace_hashjoin_2.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/hybridgrace_hashjoin_2.q b/ql/src/test/queries/clientpositive/hybridgrace_hashjoin_2.q new file mode 100644 index 0000000..dd425f4 --- /dev/null +++ b/ql/src/test/queries/clientpositive/hybridgrace_hashjoin_2.q @@ -0,0 +1,152 @@ +-- Hybrid Grace Hash Join +-- Test n-way join +SELECT 1; + +set hive.auto.convert.join=true; +set hive.auto.convert.join.noconditionaltask=true; +set hive.auto.convert.join.noconditionaltask.size=10000000; +set hive.cbo.enable=false; + + +-- 3-way mapjoin (1 big table, 2 small tables) +SELECT 1; + +set hive.mapjoin.hybridgrace.hashtable=false; + +EXPLAIN +SELECT COUNT(*) +FROM src1 x JOIN srcpart z ON (x.key = z.key) +JOIN src y ON (y.key = x.key); + +SELECT COUNT(*) +FROM src1 x JOIN srcpart z ON (x.key = z.key) +JOIN src y ON (y.key = x.key); + +set hive.mapjoin.hybridgrace.hashtable=true; + +EXPLAIN +SELECT COUNT(*) +FROM src1 x JOIN srcpart z ON (x.key = z.key) +JOIN src y ON (y.key = x.key); + +SELECT COUNT(*) +FROM src1 x JOIN srcpart z ON (x.key = z.key) +JOIN src y ON (y.key = x.key); + + +-- 4-way mapjoin (1 big table, 3 small tables) +SELECT 1; + +set hive.mapjoin.hybridgrace.hashtable=false; + +EXPLAIN +SELECT COUNT(*) +FROM src1 x JOIN srcpart z ON (x.key = z.key) +JOIN srcpart w ON (x.key = w.key) +JOIN src y ON (y.key = x.key); + +SELECT COUNT(*) +FROM src1 x JOIN srcpart z ON (x.key = z.key) +JOIN srcpart w ON (x.key = w.key) +JOIN src y ON (y.key = x.key); + +set hive.mapjoin.hybridgrace.hashtable=true; + +EXPLAIN +SELECT COUNT(*) +FROM src1 x JOIN srcpart z ON (x.key = z.key) +JOIN srcpart w ON (x.key = w.key) +JOIN src y ON (y.key = x.key); + 
+SELECT COUNT(*) +FROM src1 x JOIN srcpart z ON (x.key = z.key) +JOIN srcpart w ON (x.key = w.key) +JOIN src y ON (y.key = x.key); + + +-- 2 sets of 3-way mapjoin under 2 different tasks +SELECT 1; + +set hive.mapjoin.hybridgrace.hashtable=false; + +EXPLAIN +SELECT COUNT(*) +FROM src1 x JOIN srcpart z ON (x.key = z.key) +JOIN src y ON (y.key = x.key) +UNION +SELECT COUNT(*) +FROM src1 x JOIN srcpart z ON (x.value = z.value) +JOIN src y ON (y.value = x.value); + +SELECT COUNT(*) +FROM src1 x JOIN srcpart z ON (x.key = z.key) +JOIN src y ON (y.key = x.key) +UNION +SELECT COUNT(*) +FROM src1 x JOIN srcpart z ON (x.value = z.value) +JOIN src y ON (y.value = x.value); + +set hive.mapjoin.hybridgrace.hashtable=true; + +EXPLAIN +SELECT COUNT(*) +FROM src1 x JOIN srcpart z ON (x.key = z.key) +JOIN src y ON (y.key = x.key) +UNION +SELECT COUNT(*) +FROM src1 x JOIN srcpart z ON (x.value = z.value) +JOIN src y ON (y.value = x.value); + +SELECT COUNT(*) +FROM src1 x JOIN srcpart z ON (x.key = z.key) +JOIN src y ON (y.key = x.key) +UNION +SELECT COUNT(*) +FROM src1 x JOIN srcpart z ON (x.value = z.value) +JOIN src y ON (y.value = x.value); + + +-- A chain of 2 sets of 3-way mapjoin under the same task +SELECT 1; + +set hive.mapjoin.hybridgrace.hashtable=false; + +EXPLAIN +SELECT COUNT(*) +FROM src1 x +JOIN srcpart z1 ON (x.key = z1.key) +JOIN src y1 ON (x.key = y1.key) +JOIN srcpart z2 ON (x.value = z2.value) +JOIN src y2 ON (x.value = y2.value) +WHERE z1.key < 'zzzzzzzz' AND z2.key < 'zzzzzzzzzz' + AND y1.value < 'zzzzzzzz' AND y2.value < 'zzzzzzzzzz'; + +SELECT COUNT(*) +FROM src1 x +JOIN srcpart z1 ON (x.key = z1.key) +JOIN src y1 ON (x.key = y1.key) +JOIN srcpart z2 ON (x.value = z2.value) +JOIN src y2 ON (x.value = y2.value) +WHERE z1.key < 'zzzzzzzz' AND z2.key < 'zzzzzzzzzz' + AND y1.value < 'zzzzzzzz' AND y2.value < 'zzzzzzzzzz'; + +set hive.mapjoin.hybridgrace.hashtable=true; + +EXPLAIN +SELECT COUNT(*) +FROM src1 x +JOIN srcpart z1 ON (x.key = z1.key) +JOIN src y1 ON 
(x.key = y1.key) +JOIN srcpart z2 ON (x.value = z2.value) +JOIN src y2 ON (x.value = y2.value) +WHERE z1.key < 'zzzzzzzz' AND z2.key < 'zzzzzzzzzz' + AND y1.value < 'zzzzzzzz' AND y2.value < 'zzzzzzzzzz'; + +SELECT COUNT(*) +FROM src1 x +JOIN srcpart z1 ON (x.key = z1.key) +JOIN src y1 ON (x.key = y1.key) +JOIN srcpart z2 ON (x.value = z2.value) +JOIN src y2 ON (x.value = y2.value) +WHERE z1.key < 'zzzzzzzz' AND z2.key < 'zzzzzzzzzz' + AND y1.value < 'zzzzzzzz' AND y2.value < 'zzzzzzzzzz'; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/hive/blob/c72d073c/ql/src/test/queries/clientpositive/hybridhashjoin.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/hybridhashjoin.q b/ql/src/test/queries/clientpositive/hybridhashjoin.q deleted file mode 100644 index fbd48ea..0000000 --- a/ql/src/test/queries/clientpositive/hybridhashjoin.q +++ /dev/null @@ -1,250 +0,0 @@ -set hive.auto.convert.join=true; -set hive.auto.convert.join.noconditionaltask.size=1300000; -set hive.mapjoin.optimized.hashtable.wbsize=880000; -set hive.mapjoin.hybridgrace.memcheckfrequency=1024; - -set hive.mapjoin.hybridgrace.hashtable=false; - --- Base result for inner join -explain -select count(*) from -(select c.ctinyint - from alltypesorc c - inner join alltypesorc cd - on cd.cint = c.cint - where c.cint < 2000000000) t1 -; - -select count(*) from -(select c.ctinyint - from alltypesorc c - inner join alltypesorc cd - on cd.cint = c.cint - where c.cint < 2000000000) t1 -; - -set hive.mapjoin.hybridgrace.hashtable=true; - --- Two partitions are created. One in memory, one on disk on creation. --- The one in memory will eventually exceed memory limit, but won't spill. 
-explain -select count(*) from -(select c.ctinyint - from alltypesorc c - inner join alltypesorc cd - on cd.cint = c.cint - where c.cint < 2000000000) t1 -; - -select count(*) from -(select c.ctinyint - from alltypesorc c - inner join alltypesorc cd - on cd.cint = c.cint - where c.cint < 2000000000) t1 -; - -set hive.auto.convert.join.noconditionaltask.size=3000000; -set hive.mapjoin.optimized.hashtable.wbsize=100000; - -set hive.mapjoin.hybridgrace.hashtable=false; - --- Base result for inner join -explain -select count(*) from -(select c.ctinyint - from alltypesorc c - inner join alltypesorc cd - on cd.cint = c.cint) t1 -; - -select count(*) from -(select c.ctinyint - from alltypesorc c - inner join alltypesorc cd - on cd.cint = c.cint) t1 -; - -set hive.mapjoin.hybridgrace.hashtable=true; - --- 16 partitions are created: 3 in memory, 13 on disk on creation. --- 1 partition is spilled during first round processing, which ends up having 2 in memory, 14 on disk -explain -select count(*) from -(select c.ctinyint - from alltypesorc c - inner join alltypesorc cd - on cd.cint = c.cint) t1 -; - -select count(*) from -(select c.ctinyint - from alltypesorc c - inner join alltypesorc cd - on cd.cint = c.cint) t1 -; - - - -set hive.mapjoin.hybridgrace.hashtable=false; - --- Base result for outer join -explain -select count(*) from -(select c.ctinyint - from alltypesorc c - left outer join alltypesorc cd - on cd.cint = c.cint) t1 -; - -select count(*) from -(select c.ctinyint - from alltypesorc c - left outer join alltypesorc cd - on cd.cint = c.cint) t1 -; - -set hive.mapjoin.hybridgrace.hashtable=true; - --- 32 partitions are created. 3 in memory, 29 on disk on creation. 
-explain -select count(*) from -(select c.ctinyint - from alltypesorc c - left outer join alltypesorc cd - on cd.cint = c.cint) t1 -; - -select count(*) from -(select c.ctinyint - from alltypesorc c - left outer join alltypesorc cd - on cd.cint = c.cint) t1 -; - - --- Partitioned table -create table parttbl (key string, value char(20)) partitioned by (dt char(10)); -insert overwrite table parttbl partition(dt='2000-01-01') - select * from src; -insert overwrite table parttbl partition(dt='2000-01-02') - select * from src1; - -set hive.auto.convert.join.noconditionaltask.size=30000000; -set hive.mapjoin.optimized.hashtable.wbsize=10000000; - -set hive.mapjoin.hybridgrace.hashtable=false; - --- No spill, base result -explain -select count(*) from -(select p1.value - from parttbl p1 - inner join parttbl p2 - on p1.key = p2.key) t1 -; - -select count(*) from -(select p1.value - from parttbl p1 - inner join parttbl p2 - on p1.key = p2.key) t1 -; - -set hive.mapjoin.hybridgrace.hashtable=true; - --- No spill, 2 partitions created in memory -explain -select count(*) from -(select p1.value - from parttbl p1 - inner join parttbl p2 - on p1.key = p2.key) t1 -; - -select count(*) from -(select p1.value - from parttbl p1 - inner join parttbl p2 - on p1.key = p2.key) t1 -; - - -set hive.auto.convert.join.noconditionaltask.size=20000; -set hive.mapjoin.optimized.hashtable.wbsize=10000; - -set hive.mapjoin.hybridgrace.hashtable=false; - --- Spill case base result -explain -select count(*) from -(select p1.value - from parttbl p1 - inner join parttbl p2 - on p1.key = p2.key) t1 -; - -select count(*) from -(select p1.value - from parttbl p1 - inner join parttbl p2 - on p1.key = p2.key) t1 -; - -set hive.mapjoin.hybridgrace.hashtable=true; - --- Spill case, one partition in memory, one spilled on creation -explain -select count(*) from -(select p1.value - from parttbl p1 - inner join parttbl p2 - on p1.key = p2.key) t1 -; - -select count(*) from -(select p1.value - from parttbl p1 - 
inner join parttbl p2 - on p1.key = p2.key) t1 -; - -drop table parttbl; - - --- Test vectorization --- Test case borrowed from vector_decimal_mapjoin.q -CREATE TABLE decimal_mapjoin STORED AS ORC AS - SELECT cdouble, CAST (((cdouble*22.1)/37) AS DECIMAL(20,10)) AS cdecimal1, - CAST (((cdouble*9.3)/13) AS DECIMAL(23,14)) AS cdecimal2, - cint - FROM alltypesorc; - -SET hive.auto.convert.join=true; -SET hive.auto.convert.join.noconditionaltask=true; -SET hive.auto.convert.join.noconditionaltask.size=50000000; -set hive.mapjoin.optimized.hashtable.wbsize=10000; -SET hive.vectorized.execution.enabled=true; -set hive.mapjoin.hybridgrace.hashtable=false; - -EXPLAIN SELECT l.cint, r.cint, l.cdecimal1, r.cdecimal2 - FROM decimal_mapjoin l - JOIN decimal_mapjoin r ON l.cint = r.cint - WHERE l.cint = 6981; -SELECT l.cint, r.cint, l.cdecimal1, r.cdecimal2 - FROM decimal_mapjoin l - JOIN decimal_mapjoin r ON l.cint = r.cint - WHERE l.cint = 6981; - -set hive.mapjoin.hybridgrace.hashtable=true; - -EXPLAIN SELECT l.cint, r.cint, l.cdecimal1, r.cdecimal2 - FROM decimal_mapjoin l - JOIN decimal_mapjoin r ON l.cint = r.cint - WHERE l.cint = 6981; -SELECT l.cint, r.cint, l.cdecimal1, r.cdecimal2 - FROM decimal_mapjoin l - JOIN decimal_mapjoin r ON l.cint = r.cint - WHERE l.cint = 6981; - -DROP TABLE decimal_mapjoin; http://git-wip-us.apache.org/repos/asf/hive/blob/c72d073c/ql/src/test/queries/clientpositive/tez_join_hash.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/tez_join_hash.q b/ql/src/test/queries/clientpositive/tez_join_hash.q index 3571cd5..67d89f8 100644 --- a/ql/src/test/queries/clientpositive/tez_join_hash.q +++ b/ql/src/test/queries/clientpositive/tez_join_hash.q @@ -14,6 +14,8 @@ SELECT count(*) FROM src, orc_src where src.key=orc_src.key; set hive.auto.convert.join=true; set hive.auto.convert.join.noconditionaltask=true; set hive.auto.convert.join.noconditionaltask.size=3000; +set 
hive.mapjoin.hybridgrace.minwbsize=350; +set hive.mapjoin.hybridgrace.minnumpartitions=8; explain select count(*) from (select x.key as key, y.value as value from http://git-wip-us.apache.org/repos/asf/hive/blob/c72d073c/ql/src/test/queries/clientpositive/tez_smb_main.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/tez_smb_main.q b/ql/src/test/queries/clientpositive/tez_smb_main.q index 6398762..1802709 100644 --- a/ql/src/test/queries/clientpositive/tez_smb_main.q +++ b/ql/src/test/queries/clientpositive/tez_smb_main.q @@ -42,6 +42,8 @@ select count(*) from tab a join tab_part b on a.key = b.key; set hive.auto.convert.join.noconditionaltask.size=2000; +set hive.mapjoin.hybridgrace.minwbsize=500; +set hive.mapjoin.hybridgrace.minnumpartitions=4; explain select count (*) from tab a join tab_part b on a.key = b.key; @@ -50,6 +52,8 @@ select count(*) from tab a join tab_part b on a.key = b.key; set hive.auto.convert.join.noconditionaltask.size=1000; +set hive.mapjoin.hybridgrace.minwbsize=250; +set hive.mapjoin.hybridgrace.minnumpartitions=4; explain select count (*) from tab a join tab_part b on a.key = b.key; @@ -58,6 +62,8 @@ select count(*) from tab a join tab_part b on a.key = b.key; set hive.auto.convert.join.noconditionaltask.size=500; +set hive.mapjoin.hybridgrace.minwbsize=125; +set hive.mapjoin.hybridgrace.minnumpartitions=4; explain select count(*) from tab a join tab_part b on a.key = b.key join src1 c on a.value = c.value; select count(*) from tab a join tab_part b on a.key = b.key join src1 c on a.value = c.value;