[
https://issues.apache.org/jira/browse/HIVE-22567?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Zhang Xiaoyang updated HIVE-22567:
----------------------------------
Description:
sql :
select count(1) from (
select department_code
from tmp.tmp_mon
where business_unit_code in (select business_unit_code from
tmp.business_unit_config)
group by department_code
)tmp
I get diffrent results when the map join is off and the reduce tasks is
diffrent !
the tmp.tmp_mon is a big table and tmp.business_unit_config has only 7 records;
when set hive.auto.convert.join=false,the result is diffrent when the number of
the reduce tasks changed;
set mapred.reduce.tasks=1 the result seems right and when set
mapred.reduce.tasks=2 or other,the result is missing some data;
what can cause this ?
{code:java}
scot_dws.dws_sales_contribution_kpi_mon is same as tmp.tmp_mon.
case 1:
set hive.auto.convert.join=true;
set mapred.reduce.tasks=-1;
select count(1) from (
select department_code
from scot_dws.dws_sales_contribution_kpi_mon
where business_unit_code in (select business_unit_code from
tmp.business_unit_config)
group by department_code
)tmp;
+------+
| _c0 |
+------+
| 62 |
+------+
+----------------------------------------------------+
| Explain |
+----------------------------------------------------+
| Plan optimized by CBO. |
| |
| Vertex dependency in root stage |
| Map 1 <- Map 4 (BROADCAST_EDGE) |
| Reducer 2 <- Map 1 (SIMPLE_EDGE) |
| Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) |
| |
| Stage-0 |
| Fetch Operator |
| limit:-1 |
| Stage-1 |
| Reducer 3 |
| File Output Operator [FS_21] |
| Group By Operator [GBY_19] (rows=1 width=8) |
| Output:["_col0"],aggregations:["count(VALUE._col0)"] |
| <-Reducer 2 [CUSTOM_SIMPLE_EDGE] |
| PARTITION_ONLY_SHUFFLE [RS_18] |
| Group By Operator [GBY_17] (rows=1 width=8) |
| Output:["_col0"],aggregations:["count()"] |
| Select Operator [SEL_15] (rows=3673352 width=178) |
| Group By Operator [GBY_14] (rows=3673352 width=178) |
| Output:["_col0"],keys:KEY._col0 |
| <-Map 1 [SIMPLE_EDGE] |
| SHUFFLE [RS_13] |
| PartitionCols:_col0 |
| Group By Operator [GBY_12] (rows=7346704 width=178) |
| Output:["_col0"],keys:_col1 |
| Map Join Operator [MAPJOIN_36] (rows=7346704 width=178)
|
| Conds:SEL_2._col0=RS_9._col0(Left
Semi),HybridGraceHashJoin:true,Output:["_col1"] |
| <-Map 4 [BROADCAST_EDGE] |
| BROADCAST [RS_9] |
| PartitionCols:_col0 |
| Group By Operator [GBY_7] (rows=7 width=184) |
| Output:["_col0"],keys:_col0 |
| Select Operator [SEL_5] (rows=7 width=184) |
| Output:["_col0"] |
| Filter Operator [FIL_25] (rows=7 width=184) |
| predicate:business_unit_code is not null |
| TableScan [TS_3] (rows=7 width=184) |
|
tmp@business_unit_config,business_unit_config,Tbl:COMPLETE,Col:NONE,Output:["business_unit_code"]
|
| <-Select Operator [SEL_2] (rows=6678822 width=178) |
| Output:["_col0","_col1"] |
| Filter Operator [FIL_24] (rows=6678822 width=178) |
| predicate:business_unit_code is not null |
| TableScan [TS_0] (rows=6678822 width=178) |
|
scot_dws@dws_sales_contribution_kpi_mon,dws_sales_contribution_kpi_mon,Tbl:COMPLETE,Col:COMPLETE,Output:["business_unit_code","department_code"]
|
| |
+----------------------------------------------------+
47 rows selected (1.589 seconds)
case 2:
set hive.auto.convert.join=true;
set mapred.reduce.tasks=100;
select count(1) from (
select department_code
from scot_dws.dws_sales_contribution_kpi_mon
where business_unit_code in (select business_unit_code from
tmp.business_unit_config)
group by department_code
)tmp;
+------+
| _c0 |
+------+
| 62 |
+------+
+----------------------------------------------------+
| Explain |
+----------------------------------------------------+
| Plan optimized by CBO. |
| |
| Vertex dependency in root stage |
| Map 1 <- Map 4 (BROADCAST_EDGE) |
| Reducer 2 <- Map 1 (SIMPLE_EDGE) |
| Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) |
| |
| Stage-0 |
| Fetch Operator |
| limit:-1 |
| Stage-1 |
| Reducer 3 |
| File Output Operator [FS_21] |
| Group By Operator [GBY_19] (rows=1 width=8) |
| Output:["_col0"],aggregations:["count(VALUE._col0)"] |
| <-Reducer 2 [CUSTOM_SIMPLE_EDGE] |
| PARTITION_ONLY_SHUFFLE [RS_18] |
| Group By Operator [GBY_17] (rows=1 width=8) |
| Output:["_col0"],aggregations:["count()"] |
| Select Operator [SEL_15] (rows=3673352 width=178) |
| Group By Operator [GBY_14] (rows=3673352 width=178) |
| Output:["_col0"],keys:KEY._col0 |
| <-Map 1 [SIMPLE_EDGE] |
| SHUFFLE [RS_13] |
| PartitionCols:_col0 |
| Group By Operator [GBY_12] (rows=7346704 width=178) |
| Output:["_col0"],keys:_col1 |
| Map Join Operator [MAPJOIN_36] (rows=7346704 width=178)
|
| Conds:SEL_2._col0=RS_9._col0(Left
Semi),HybridGraceHashJoin:true,Output:["_col1"] |
| <-Map 4 [BROADCAST_EDGE] |
| BROADCAST [RS_9] |
| PartitionCols:_col0 |
| Group By Operator [GBY_7] (rows=7 width=184) |
| Output:["_col0"],keys:_col0 |
| Select Operator [SEL_5] (rows=7 width=184) |
| Output:["_col0"] |
| Filter Operator [FIL_25] (rows=7 width=184) |
| predicate:business_unit_code is not null |
| TableScan [TS_3] (rows=7 width=184) |
|
tmp@business_unit_config,business_unit_config,Tbl:COMPLETE,Col:NONE,Output:["business_unit_code"]
|
| <-Select Operator [SEL_2] (rows=6678822 width=178) |
| Output:["_col0","_col1"] |
| Filter Operator [FIL_24] (rows=6678822 width=178) |
| predicate:business_unit_code is not null |
| TableScan [TS_0] (rows=6678822 width=178) |
|
scot_dws@dws_sales_contribution_kpi_mon,dws_sales_contribution_kpi_mon,Tbl:COMPLETE,Col:COMPLETE,Output:["business_unit_code","department_code"]
|
| |
+----------------------------------------------------+
47 rows selected (1.541 seconds)
case 3:
set hive.auto.convert.join=false;
set mapred.reduce.tasks=-1;
select count(1) from (
select department_code
from scot_dws.dws_sales_contribution_kpi_mon
where business_unit_code in (select business_unit_code from
tmp.business_unit_config)
group by department_code
)tmp;
+------+
| _c0 |
+------+
| 62 |
+------+
+----------------------------------------------------+
| Explain |
+----------------------------------------------------+
| Plan optimized by CBO. |
| |
| Vertex dependency in root stage |
| Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE) |
| Reducer 3 <- Reducer 2 (SIMPLE_EDGE) |
| Reducer 4 <- Reducer 3 (CUSTOM_SIMPLE_EDGE) |
| |
| Stage-0 |
| Fetch Operator |
| limit:-1 |
| Stage-1 |
| Reducer 4 |
| File Output Operator [FS_21] |
| Group By Operator [GBY_19] (rows=1 width=8) |
| Output:["_col0"],aggregations:["count(VALUE._col0)"] |
| <-Reducer 3 [CUSTOM_SIMPLE_EDGE] |
| PARTITION_ONLY_SHUFFLE [RS_18] |
| Group By Operator [GBY_17] (rows=1 width=8) |
| Output:["_col0"],aggregations:["count()"] |
| Select Operator [SEL_15] (rows=3673352 width=178) |
| Group By Operator [GBY_14] (rows=3673352 width=178) |
| Output:["_col0"],keys:KEY._col0 |
| <-Reducer 2 [SIMPLE_EDGE] |
| SHUFFLE [RS_13] |
| PartitionCols:_col0 |
| Group By Operator [GBY_12] (rows=7346704 width=178) |
| Output:["_col0"],keys:_col1 |
| Merge Join Operator [MERGEJOIN_36] (rows=7346704
width=178) |
| Conds:RS_8._col0=RS_9._col0(Left
Semi),Output:["_col1"] |
| <-Map 1 [SIMPLE_EDGE] |
| SHUFFLE [RS_8] |
| PartitionCols:_col0 |
| Select Operator [SEL_2] (rows=6678822 width=178) |
| Output:["_col0","_col1"] |
| Filter Operator [FIL_24] (rows=6678822 width=178)
|
| predicate:business_unit_code is not null |
| TableScan [TS_0] (rows=6678822 width=178) |
|
scot_dws@dws_sales_contribution_kpi_mon,dws_sales_contribution_kpi_mon,Tbl:COMPLETE,Col:COMPLETE,Output:["business_unit_code","department_code"]
|
| <-Map 5 [SIMPLE_EDGE] |
| SHUFFLE [RS_9] |
| PartitionCols:_col0 |
| Group By Operator [GBY_7] (rows=7 width=184) |
| Output:["_col0"],keys:_col0 |
| Select Operator [SEL_5] (rows=7 width=184) |
| Output:["_col0"] |
| Filter Operator [FIL_25] (rows=7 width=184) |
| predicate:business_unit_code is not null |
| TableScan [TS_3] (rows=7 width=184) |
|
tmp@business_unit_config,business_unit_config,Tbl:COMPLETE,Col:NONE,Output:["business_unit_code"]
|
| |
+----------------------------------------------------+
50 rows selected (1.545 seconds)
case 4:
set hive.auto.convert.join=false;
set mapred.reduce.tasks=100;
select count(1) from (
select department_code
from scot_dws.dws_sales_contribution_kpi_mon
where business_unit_code in (select business_unit_code from
tmp.business_unit_config)
group by department_code
)tmp;
+------+
| _c0 |
+------+
| 0 |
+------+
+----------------------------------------------------+
| Explain |
+----------------------------------------------------+
| Plan optimized by CBO. |
| |
| Vertex dependency in root stage |
| Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE) |
| Reducer 3 <- Reducer 2 (SIMPLE_EDGE) |
| Reducer 4 <- Reducer 3 (CUSTOM_SIMPLE_EDGE) |
| |
| Stage-0 |
| Fetch Operator |
| limit:-1 |
| Stage-1 |
| Reducer 4 |
| File Output Operator [FS_21] |
| Group By Operator [GBY_19] (rows=1 width=8) |
| Output:["_col0"],aggregations:["count(VALUE._col0)"] |
| <-Reducer 3 [CUSTOM_SIMPLE_EDGE] |
| PARTITION_ONLY_SHUFFLE [RS_18] |
| Group By Operator [GBY_17] (rows=1 width=8) |
| Output:["_col0"],aggregations:["count()"] |
| Select Operator [SEL_15] (rows=3673352 width=178) |
| Group By Operator [GBY_14] (rows=3673352 width=178) |
| Output:["_col0"],keys:KEY._col0 |
| <-Reducer 2 [SIMPLE_EDGE] |
| SHUFFLE [RS_13] |
| PartitionCols:_col0 |
| Group By Operator [GBY_12] (rows=7346704 width=178) |
| Output:["_col0"],keys:_col1 |
| Merge Join Operator [MERGEJOIN_36] (rows=7346704
width=178) |
| Conds:RS_8._col0=RS_9._col0(Left
Semi),Output:["_col1"] |
| <-Map 1 [SIMPLE_EDGE] |
| SHUFFLE [RS_8] |
| PartitionCols:_col0 |
| Select Operator [SEL_2] (rows=6678822 width=178) |
| Output:["_col0","_col1"] |
| Filter Operator [FIL_24] (rows=6678822 width=178)
|
| predicate:business_unit_code is not null |
| TableScan [TS_0] (rows=6678822 width=178) |
|
scot_dws@dws_sales_contribution_kpi_mon,dws_sales_contribution_kpi_mon,Tbl:COMPLETE,Col:COMPLETE,Output:["business_unit_code","department_code"]
|
| <-Map 5 [SIMPLE_EDGE] |
| SHUFFLE [RS_9] |
| PartitionCols:_col0 |
| Group By Operator [GBY_7] (rows=7 width=184) |
| Output:["_col0"],keys:_col0 |
| Select Operator [SEL_5] (rows=7 width=184) |
| Output:["_col0"] |
| Filter Operator [FIL_25] (rows=7 width=184) |
| predicate:business_unit_code is not null |
| TableScan [TS_3] (rows=7 width=184) |
|
tmp@business_unit_config,business_unit_config,Tbl:COMPLETE,Col:NONE,Output:["business_unit_code"]
|
| |
+----------------------------------------------------+
50 rows selected (1.547 seconds)
in this case,the result is wrong!{code}
was:
sql :
select count(1) from (
select department_code
from tmp.tmp_mon
where business_unit_code in (select business_unit_code from
tmp.business_unit_config)
group by department_code
)tmp
I get diffrent results when the map join is off and the reduce tasks is
diffrent !
the tmp.tmp_mon is a big table and tmp.business_unit_config has only 7 records;
when set hive.auto.convert.join=false,the result is diffrent when the number of
the reduce tasks changed;
set mapred.reduce.tasks=1 the result seems right and when set
mapred.reduce.tasks=2 or other,the result is missing some data;
what can cause this ?
scot_dws.dws_sales_contribution_kpi_mon is same as tmp.tmp_mon.
case 1:
set hive.auto.convert.join=true;
set mapred.reduce.tasks=-1;
select count(1) from (
select department_code
from scot_dws.dws_sales_contribution_kpi_mon
where business_unit_code in (select business_unit_code from
tmp.business_unit_config)
group by department_code
)tmp;
+------+
|_c0|
+------+
|62|
+------+
+----------------------------------------------------+
|Explain|
+----------------------------------------------------+
|Plan optimized by CBO.|
| |
|Vertex dependency in root stage|
|Map 1 <- Map 4 (BROADCAST_EDGE)|
|Reducer 2 <- Map 1 (SIMPLE_EDGE)|
|Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE)|
| |
|Stage-0|
|Fetch Operator|
|limit:-1|
|Stage-1|
|Reducer 3|
|File Output Operator [FS_21]|
|Group By Operator [GBY_19] (rows=1 width=8)|
|Output:["_col0"],aggregations:["count(VALUE._col0)"]|
|<-Reducer 2 [CUSTOM_SIMPLE_EDGE]|
|PARTITION_ONLY_SHUFFLE [RS_18]|
|Group By Operator [GBY_17] (rows=1 width=8)|
|Output:["_col0"],aggregations:["count()"]|
|Select Operator [SEL_15] (rows=3673352 width=178)|
|Group By Operator [GBY_14] (rows=3673352 width=178)|
|Output:["_col0"],keys:KEY._col0|
|<-Map 1 [SIMPLE_EDGE]|
|SHUFFLE [RS_13]|
|PartitionCols:_col0|
|Group By Operator [GBY_12] (rows=7346704 width=178)|
|Output:["_col0"],keys:_col1|
|Map Join Operator [MAPJOIN_36] (rows=7346704 width=178)|
|Conds:SEL_2._col0=RS_9._col0(Left
Semi),HybridGraceHashJoin:true,Output:["_col1"]|
|<-Map 4 [BROADCAST_EDGE]|
|BROADCAST [RS_9]|
|PartitionCols:_col0|
|Group By Operator [GBY_7] (rows=7 width=184)|
|Output:["_col0"],keys:_col0|
|Select Operator [SEL_5] (rows=7 width=184)|
|Output:["_col0"]|
|Filter Operator [FIL_25] (rows=7 width=184)|
|predicate:business_unit_code is not null|
|TableScan [TS_3] (rows=7 width=184)|
|tmp@business_unit_config,business_unit_config,Tbl:COMPLETE,Col:NONE,Output:["business_unit_code"]|
|<-Select Operator [SEL_2] (rows=6678822 width=178)|
|Output:["_col0","_col1"]|
|Filter Operator [FIL_24] (rows=6678822 width=178)|
|predicate:business_unit_code is not null|
|TableScan [TS_0] (rows=6678822 width=178)|
|scot_dws@dws_sales_contribution_kpi_mon,dws_sales_contribution_kpi_mon,Tbl:COMPLETE,Col:COMPLETE,Output:["business_unit_code","department_code"]|
| |
+----------------------------------------------------+
47 rows selected (1.589 seconds)
case 2:
set hive.auto.convert.join=true;
set mapred.reduce.tasks=100;
select count(1) from (
select department_code
from scot_dws.dws_sales_contribution_kpi_mon
where business_unit_code in (select business_unit_code from
tmp.business_unit_config)
group by department_code
)tmp;
+------+
|_c0|
+------+
|62|
+------+
+----------------------------------------------------+
|Explain|
+----------------------------------------------------+
|Plan optimized by CBO.|
| |
|Vertex dependency in root stage|
|Map 1 <- Map 4 (BROADCAST_EDGE)|
|Reducer 2 <- Map 1 (SIMPLE_EDGE)|
|Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE)|
| |
|Stage-0|
|Fetch Operator|
|limit:-1|
|Stage-1|
|Reducer 3|
|File Output Operator [FS_21]|
|Group By Operator [GBY_19] (rows=1 width=8)|
|Output:["_col0"],aggregations:["count(VALUE._col0)"]|
|<-Reducer 2 [CUSTOM_SIMPLE_EDGE]|
|PARTITION_ONLY_SHUFFLE [RS_18]|
|Group By Operator [GBY_17] (rows=1 width=8)|
|Output:["_col0"],aggregations:["count()"]|
|Select Operator [SEL_15] (rows=3673352 width=178)|
|Group By Operator [GBY_14] (rows=3673352 width=178)|
|Output:["_col0"],keys:KEY._col0|
|<-Map 1 [SIMPLE_EDGE]|
|SHUFFLE [RS_13]|
|PartitionCols:_col0|
|Group By Operator [GBY_12] (rows=7346704 width=178)|
|Output:["_col0"],keys:_col1|
|Map Join Operator [MAPJOIN_36] (rows=7346704 width=178)|
|Conds:SEL_2._col0=RS_9._col0(Left
Semi),HybridGraceHashJoin:true,Output:["_col1"]|
|<-Map 4 [BROADCAST_EDGE]|
|BROADCAST [RS_9]|
|PartitionCols:_col0|
|Group By Operator [GBY_7] (rows=7 width=184)|
|Output:["_col0"],keys:_col0|
|Select Operator [SEL_5] (rows=7 width=184)|
|Output:["_col0"]|
|Filter Operator [FIL_25] (rows=7 width=184)|
|predicate:business_unit_code is not null|
|TableScan [TS_3] (rows=7 width=184)|
|tmp@business_unit_config,business_unit_config,Tbl:COMPLETE,Col:NONE,Output:["business_unit_code"]|
|<-Select Operator [SEL_2] (rows=6678822 width=178)|
|Output:["_col0","_col1"]|
|Filter Operator [FIL_24] (rows=6678822 width=178)|
|predicate:business_unit_code is not null|
|TableScan [TS_0] (rows=6678822 width=178)|
|scot_dws@dws_sales_contribution_kpi_mon,dws_sales_contribution_kpi_mon,Tbl:COMPLETE,Col:COMPLETE,Output:["business_unit_code","department_code"]|
| |
+----------------------------------------------------+
47 rows selected (1.541 seconds)
case 3:
set hive.auto.convert.join=false;
set mapred.reduce.tasks=-1;
select count(1) from (
select department_code
from scot_dws.dws_sales_contribution_kpi_mon
where business_unit_code in (select business_unit_code from
tmp.business_unit_config)
group by department_code
)tmp;
+------+
|_c0|
+------+
|62|
+------+
+----------------------------------------------------+
|Explain|
+----------------------------------------------------+
|Plan optimized by CBO.|
| |
|Vertex dependency in root stage|
|Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE)|
|Reducer 3 <- Reducer 2 (SIMPLE_EDGE)|
|Reducer 4 <- Reducer 3 (CUSTOM_SIMPLE_EDGE)|
| |
|Stage-0|
|Fetch Operator|
|limit:-1|
|Stage-1|
|Reducer 4|
|File Output Operator [FS_21]|
|Group By Operator [GBY_19] (rows=1 width=8)|
|Output:["_col0"],aggregations:["count(VALUE._col0)"]|
|<-Reducer 3 [CUSTOM_SIMPLE_EDGE]|
|PARTITION_ONLY_SHUFFLE [RS_18]|
|Group By Operator [GBY_17] (rows=1 width=8)|
|Output:["_col0"],aggregations:["count()"]|
|Select Operator [SEL_15] (rows=3673352 width=178)|
|Group By Operator [GBY_14] (rows=3673352 width=178)|
|Output:["_col0"],keys:KEY._col0|
|<-Reducer 2 [SIMPLE_EDGE]|
|SHUFFLE [RS_13]|
|PartitionCols:_col0|
|Group By Operator [GBY_12] (rows=7346704 width=178)|
|Output:["_col0"],keys:_col1|
|Merge Join Operator [MERGEJOIN_36] (rows=7346704 width=178)|
|Conds:RS_8._col0=RS_9._col0(Left Semi),Output:["_col1"]|
|<-Map 1 [SIMPLE_EDGE]|
|SHUFFLE [RS_8]|
|PartitionCols:_col0|
|Select Operator [SEL_2] (rows=6678822 width=178)|
|Output:["_col0","_col1"]|
|Filter Operator [FIL_24] (rows=6678822 width=178)|
|predicate:business_unit_code is not null|
|TableScan [TS_0] (rows=6678822 width=178)|
|scot_dws@dws_sales_contribution_kpi_mon,dws_sales_contribution_kpi_mon,Tbl:COMPLETE,Col:COMPLETE,Output:["business_unit_code","department_code"]|
|<-Map 5 [SIMPLE_EDGE]|
|SHUFFLE [RS_9]|
|PartitionCols:_col0|
|Group By Operator [GBY_7] (rows=7 width=184)|
|Output:["_col0"],keys:_col0|
|Select Operator [SEL_5] (rows=7 width=184)|
|Output:["_col0"]|
|Filter Operator [FIL_25] (rows=7 width=184)|
|predicate:business_unit_code is not null|
|TableScan [TS_3] (rows=7 width=184)|
|tmp@business_unit_config,business_unit_config,Tbl:COMPLETE,Col:NONE,Output:["business_unit_code"]|
| |
+----------------------------------------------------+
50 rows selected (1.545 seconds)
case 4:
set hive.auto.convert.join=false;
set mapred.reduce.tasks=100;
select count(1) from (
select department_code
from scot_dws.dws_sales_contribution_kpi_mon
where business_unit_code in (select business_unit_code from
tmp.business_unit_config)
group by department_code
)tmp;
+------+
|_c0|
+------+
|0|
+------+
+----------------------------------------------------+
|Explain|
+----------------------------------------------------+
|Plan optimized by CBO.|
| |
|Vertex dependency in root stage|
|Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE)|
|Reducer 3 <- Reducer 2 (SIMPLE_EDGE)|
|Reducer 4 <- Reducer 3 (CUSTOM_SIMPLE_EDGE)|
| |
|Stage-0|
|Fetch Operator|
|limit:-1|
|Stage-1|
|Reducer 4|
|File Output Operator [FS_21]|
|Group By Operator [GBY_19] (rows=1 width=8)|
|Output:["_col0"],aggregations:["count(VALUE._col0)"]|
|<-Reducer 3 [CUSTOM_SIMPLE_EDGE]|
|PARTITION_ONLY_SHUFFLE [RS_18]|
|Group By Operator [GBY_17] (rows=1 width=8)|
|Output:["_col0"],aggregations:["count()"]|
|Select Operator [SEL_15] (rows=3673352 width=178)|
|Group By Operator [GBY_14] (rows=3673352 width=178)|
|Output:["_col0"],keys:KEY._col0|
|<-Reducer 2 [SIMPLE_EDGE]|
|SHUFFLE [RS_13]|
|PartitionCols:_col0|
|Group By Operator [GBY_12] (rows=7346704 width=178)|
|Output:["_col0"],keys:_col1|
|Merge Join Operator [MERGEJOIN_36] (rows=7346704 width=178)|
|Conds:RS_8._col0=RS_9._col0(Left Semi),Output:["_col1"]|
|<-Map 1 [SIMPLE_EDGE]|
|SHUFFLE [RS_8]|
|PartitionCols:_col0|
|Select Operator [SEL_2] (rows=6678822 width=178)|
|Output:["_col0","_col1"]|
|Filter Operator [FIL_24] (rows=6678822 width=178)|
|predicate:business_unit_code is not null|
|TableScan [TS_0] (rows=6678822 width=178)|
|scot_dws@dws_sales_contribution_kpi_mon,dws_sales_contribution_kpi_mon,Tbl:COMPLETE,Col:COMPLETE,Output:["business_unit_code","department_code"]|
|<-Map 5 [SIMPLE_EDGE]|
|SHUFFLE [RS_9]|
|PartitionCols:_col0|
|Group By Operator [GBY_7] (rows=7 width=184)|
|Output:["_col0"],keys:_col0|
|Select Operator [SEL_5] (rows=7 width=184)|
|Output:["_col0"]|
|Filter Operator [FIL_25] (rows=7 width=184)|
|predicate:business_unit_code is not null|
|TableScan [TS_3] (rows=7 width=184)|
|tmp@business_unit_config,business_unit_config,Tbl:COMPLETE,Col:NONE,Output:["business_unit_code"]|
| |
+----------------------------------------------------+
50 rows selected (1.547 seconds)
> Data loss when map join is off ,the result is diffrent when the number of
> reduce tasks is diffrent;
> ---------------------------------------------------------------------------------------------------
>
> Key: HIVE-22567
> URL: https://issues.apache.org/jira/browse/HIVE-22567
> Project: Hive
> Issue Type: Bug
> Affects Versions: 3.1.1, 3.1.2
> Reporter: Zhang Xiaoyang
> Assignee: Aditya Shah
> Priority: Blocker
>
>
> sql :
> select count(1) from (
> select department_code
> from tmp.tmp_mon
> where business_unit_code in (select business_unit_code from
> tmp.business_unit_config)
> group by department_code
> )tmp
> I get diffrent results when the map join is off and the reduce tasks is
> diffrent !
> the tmp.tmp_mon is a big table and tmp.business_unit_config has only 7
> records;
> when set hive.auto.convert.join=false,the result is diffrent when the number
> of the reduce tasks changed;
> set mapred.reduce.tasks=1 the result seems right and when set
> mapred.reduce.tasks=2 or other,the result is missing some data;
> what can cause this ?
> {code:java}
> scot_dws.dws_sales_contribution_kpi_mon is same as tmp.tmp_mon.
> case 1:
> set hive.auto.convert.join=true;
> set mapred.reduce.tasks=-1;
> select count(1) from (
> select department_code
> from scot_dws.dws_sales_contribution_kpi_mon
> where business_unit_code in (select business_unit_code from
> tmp.business_unit_config)
> group by department_code
> )tmp;
> +------+
> | _c0 |
> +------+
> | 62 |
> +------+
> +----------------------------------------------------+
> | Explain |
> +----------------------------------------------------+
> | Plan optimized by CBO. |
> | |
> | Vertex dependency in root stage |
> | Map 1 <- Map 4 (BROADCAST_EDGE) |
> | Reducer 2 <- Map 1 (SIMPLE_EDGE) |
> | Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) |
> | |
> | Stage-0 |
> | Fetch Operator |
> | limit:-1 |
> | Stage-1 |
> | Reducer 3 |
> | File Output Operator [FS_21] |
> | Group By Operator [GBY_19] (rows=1 width=8) |
> | Output:["_col0"],aggregations:["count(VALUE._col0)"] |
> | <-Reducer 2 [CUSTOM_SIMPLE_EDGE] |
> | PARTITION_ONLY_SHUFFLE [RS_18] |
> | Group By Operator [GBY_17] (rows=1 width=8) |
> | Output:["_col0"],aggregations:["count()"] |
> | Select Operator [SEL_15] (rows=3673352 width=178) |
> | Group By Operator [GBY_14] (rows=3673352 width=178) |
> | Output:["_col0"],keys:KEY._col0 |
> | <-Map 1 [SIMPLE_EDGE] |
> | SHUFFLE [RS_13] |
> | PartitionCols:_col0 |
> | Group By Operator [GBY_12] (rows=7346704 width=178) |
> | Output:["_col0"],keys:_col1 |
> | Map Join Operator [MAPJOIN_36] (rows=7346704
> width=178) |
> | Conds:SEL_2._col0=RS_9._col0(Left
> Semi),HybridGraceHashJoin:true,Output:["_col1"] |
> | <-Map 4 [BROADCAST_EDGE] |
> | BROADCAST [RS_9] |
> | PartitionCols:_col0 |
> | Group By Operator [GBY_7] (rows=7 width=184) |
> | Output:["_col0"],keys:_col0 |
> | Select Operator [SEL_5] (rows=7 width=184) |
> | Output:["_col0"] |
> | Filter Operator [FIL_25] (rows=7 width=184) |
> | predicate:business_unit_code is not null |
> | TableScan [TS_3] (rows=7 width=184) |
> |
> tmp@business_unit_config,business_unit_config,Tbl:COMPLETE,Col:NONE,Output:["business_unit_code"]
> |
> | <-Select Operator [SEL_2] (rows=6678822 width=178) |
> | Output:["_col0","_col1"] |
> | Filter Operator [FIL_24] (rows=6678822 width=178)
> |
> | predicate:business_unit_code is not null |
> | TableScan [TS_0] (rows=6678822 width=178) |
> |
> scot_dws@dws_sales_contribution_kpi_mon,dws_sales_contribution_kpi_mon,Tbl:COMPLETE,Col:COMPLETE,Output:["business_unit_code","department_code"]
> |
> | |
> +----------------------------------------------------+
> 47 rows selected (1.589 seconds)
> case 2:
> set hive.auto.convert.join=true;
> set mapred.reduce.tasks=100;
> select count(1) from (
> select department_code
> from scot_dws.dws_sales_contribution_kpi_mon
> where business_unit_code in (select business_unit_code from
> tmp.business_unit_config)
> group by department_code
> )tmp;
> +------+
> | _c0 |
> +------+
> | 62 |
> +------+
> +----------------------------------------------------+
> | Explain |
> +----------------------------------------------------+
> | Plan optimized by CBO. |
> | |
> | Vertex dependency in root stage |
> | Map 1 <- Map 4 (BROADCAST_EDGE) |
> | Reducer 2 <- Map 1 (SIMPLE_EDGE) |
> | Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) |
> | |
> | Stage-0 |
> | Fetch Operator |
> | limit:-1 |
> | Stage-1 |
> | Reducer 3 |
> | File Output Operator [FS_21] |
> | Group By Operator [GBY_19] (rows=1 width=8) |
> | Output:["_col0"],aggregations:["count(VALUE._col0)"] |
> | <-Reducer 2 [CUSTOM_SIMPLE_EDGE] |
> | PARTITION_ONLY_SHUFFLE [RS_18] |
> | Group By Operator [GBY_17] (rows=1 width=8) |
> | Output:["_col0"],aggregations:["count()"] |
> | Select Operator [SEL_15] (rows=3673352 width=178) |
> | Group By Operator [GBY_14] (rows=3673352 width=178) |
> | Output:["_col0"],keys:KEY._col0 |
> | <-Map 1 [SIMPLE_EDGE] |
> | SHUFFLE [RS_13] |
> | PartitionCols:_col0 |
> | Group By Operator [GBY_12] (rows=7346704 width=178) |
> | Output:["_col0"],keys:_col1 |
> | Map Join Operator [MAPJOIN_36] (rows=7346704
> width=178) |
> | Conds:SEL_2._col0=RS_9._col0(Left
> Semi),HybridGraceHashJoin:true,Output:["_col1"] |
> | <-Map 4 [BROADCAST_EDGE] |
> | BROADCAST [RS_9] |
> | PartitionCols:_col0 |
> | Group By Operator [GBY_7] (rows=7 width=184) |
> | Output:["_col0"],keys:_col0 |
> | Select Operator [SEL_5] (rows=7 width=184) |
> | Output:["_col0"] |
> | Filter Operator [FIL_25] (rows=7 width=184) |
> | predicate:business_unit_code is not null |
> | TableScan [TS_3] (rows=7 width=184) |
> |
> tmp@business_unit_config,business_unit_config,Tbl:COMPLETE,Col:NONE,Output:["business_unit_code"]
> |
> | <-Select Operator [SEL_2] (rows=6678822 width=178) |
> | Output:["_col0","_col1"] |
> | Filter Operator [FIL_24] (rows=6678822 width=178)
> |
> | predicate:business_unit_code is not null |
> | TableScan [TS_0] (rows=6678822 width=178) |
> |
> scot_dws@dws_sales_contribution_kpi_mon,dws_sales_contribution_kpi_mon,Tbl:COMPLETE,Col:COMPLETE,Output:["business_unit_code","department_code"]
> |
> | |
> +----------------------------------------------------+
> 47 rows selected (1.541 seconds)
> case 3:
> set hive.auto.convert.join=false;
> set mapred.reduce.tasks=-1;
> select count(1) from (
> select department_code
> from scot_dws.dws_sales_contribution_kpi_mon
> where business_unit_code in (select business_unit_code from
> tmp.business_unit_config)
> group by department_code
> )tmp;
> +------+
> | _c0 |
> +------+
> | 62 |
> +------+
> +----------------------------------------------------+
> | Explain |
> +----------------------------------------------------+
> | Plan optimized by CBO. |
> | |
> | Vertex dependency in root stage |
> | Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE) |
> | Reducer 3 <- Reducer 2 (SIMPLE_EDGE) |
> | Reducer 4 <- Reducer 3 (CUSTOM_SIMPLE_EDGE) |
> | |
> | Stage-0 |
> | Fetch Operator |
> | limit:-1 |
> | Stage-1 |
> | Reducer 4 |
> | File Output Operator [FS_21] |
> | Group By Operator [GBY_19] (rows=1 width=8) |
> | Output:["_col0"],aggregations:["count(VALUE._col0)"] |
> | <-Reducer 3 [CUSTOM_SIMPLE_EDGE] |
> | PARTITION_ONLY_SHUFFLE [RS_18] |
> | Group By Operator [GBY_17] (rows=1 width=8) |
> | Output:["_col0"],aggregations:["count()"] |
> | Select Operator [SEL_15] (rows=3673352 width=178) |
> | Group By Operator [GBY_14] (rows=3673352 width=178) |
> | Output:["_col0"],keys:KEY._col0 |
> | <-Reducer 2 [SIMPLE_EDGE] |
> | SHUFFLE [RS_13] |
> | PartitionCols:_col0 |
> | Group By Operator [GBY_12] (rows=7346704 width=178) |
> | Output:["_col0"],keys:_col1 |
> | Merge Join Operator [MERGEJOIN_36] (rows=7346704
> width=178) |
> | Conds:RS_8._col0=RS_9._col0(Left
> Semi),Output:["_col1"] |
> | <-Map 1 [SIMPLE_EDGE] |
> | SHUFFLE [RS_8] |
> | PartitionCols:_col0 |
> | Select Operator [SEL_2] (rows=6678822 width=178) |
> | Output:["_col0","_col1"] |
> | Filter Operator [FIL_24] (rows=6678822
> width=178) |
> | predicate:business_unit_code is not null |
> | TableScan [TS_0] (rows=6678822 width=178) |
> |
> scot_dws@dws_sales_contribution_kpi_mon,dws_sales_contribution_kpi_mon,Tbl:COMPLETE,Col:COMPLETE,Output:["business_unit_code","department_code"]
> |
> | <-Map 5 [SIMPLE_EDGE] |
> | SHUFFLE [RS_9] |
> | PartitionCols:_col0 |
> | Group By Operator [GBY_7] (rows=7 width=184) |
> | Output:["_col0"],keys:_col0 |
> | Select Operator [SEL_5] (rows=7 width=184) |
> | Output:["_col0"] |
> | Filter Operator [FIL_25] (rows=7 width=184) |
> | predicate:business_unit_code is not null |
> | TableScan [TS_3] (rows=7 width=184) |
> |
> tmp@business_unit_config,business_unit_config,Tbl:COMPLETE,Col:NONE,Output:["business_unit_code"]
> |
> | |
> +----------------------------------------------------+
> 50 rows selected (1.545 seconds)
> case 4:
> set hive.auto.convert.join=false;
> set mapred.reduce.tasks=100;
> select count(1) from (
> select department_code
> from scot_dws.dws_sales_contribution_kpi_mon
> where business_unit_code in (select business_unit_code from
> tmp.business_unit_config)
> group by department_code
> )tmp;
> +------+
> | _c0 |
> +------+
> | 0 |
> +------+
> +----------------------------------------------------+
> | Explain |
> +----------------------------------------------------+
> | Plan optimized by CBO. |
> | |
> | Vertex dependency in root stage |
> | Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE) |
> | Reducer 3 <- Reducer 2 (SIMPLE_EDGE) |
> | Reducer 4 <- Reducer 3 (CUSTOM_SIMPLE_EDGE) |
> | |
> | Stage-0 |
> | Fetch Operator |
> | limit:-1 |
> | Stage-1 |
> | Reducer 4 |
> | File Output Operator [FS_21] |
> | Group By Operator [GBY_19] (rows=1 width=8) |
> | Output:["_col0"],aggregations:["count(VALUE._col0)"] |
> | <-Reducer 3 [CUSTOM_SIMPLE_EDGE] |
> | PARTITION_ONLY_SHUFFLE [RS_18] |
> | Group By Operator [GBY_17] (rows=1 width=8) |
> | Output:["_col0"],aggregations:["count()"] |
> | Select Operator [SEL_15] (rows=3673352 width=178) |
> | Group By Operator [GBY_14] (rows=3673352 width=178) |
> | Output:["_col0"],keys:KEY._col0 |
> | <-Reducer 2 [SIMPLE_EDGE] |
> | SHUFFLE [RS_13] |
> | PartitionCols:_col0 |
> | Group By Operator [GBY_12] (rows=7346704 width=178) |
> | Output:["_col0"],keys:_col1 |
> | Merge Join Operator [MERGEJOIN_36] (rows=7346704
> width=178) |
> | Conds:RS_8._col0=RS_9._col0(Left
> Semi),Output:["_col1"] |
> | <-Map 1 [SIMPLE_EDGE] |
> | SHUFFLE [RS_8] |
> | PartitionCols:_col0 |
> | Select Operator [SEL_2] (rows=6678822 width=178) |
> | Output:["_col0","_col1"] |
> | Filter Operator [FIL_24] (rows=6678822
> width=178) |
> | predicate:business_unit_code is not null |
> | TableScan [TS_0] (rows=6678822 width=178) |
> |
> scot_dws@dws_sales_contribution_kpi_mon,dws_sales_contribution_kpi_mon,Tbl:COMPLETE,Col:COMPLETE,Output:["business_unit_code","department_code"]
> |
> | <-Map 5 [SIMPLE_EDGE] |
> | SHUFFLE [RS_9] |
> | PartitionCols:_col0 |
> | Group By Operator [GBY_7] (rows=7 width=184) |
> | Output:["_col0"],keys:_col0 |
> | Select Operator [SEL_5] (rows=7 width=184) |
> | Output:["_col0"] |
> | Filter Operator [FIL_25] (rows=7 width=184) |
> | predicate:business_unit_code is not null |
> | TableScan [TS_3] (rows=7 width=184) |
> |
> tmp@business_unit_config,business_unit_config,Tbl:COMPLETE,Col:NONE,Output:["business_unit_code"]
> |
> | |
> +----------------------------------------------------+
> 50 rows selected (1.547 seconds)
> in this case,the result is wrong!{code}
>
>
--
This message was sent by Atlassian Jira
(v8.3.4#803005)