[
https://issues.apache.org/jira/browse/HIVE-21574?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Panda Song updated HIVE-21574:
------------------------------
Description:
when I use a table instead of the sub select,I get the right result,much more
rows are joined together(metrics old_uv is bigger!!!)
Is there some bugs here?
Please help me ,thanks a lot!!!!!!
select
{{a.event_date,}}
{{count(distinct a.device_id) as uv,}}
{{count(distinct case when b.device_id is not null then b.device_id end) as
old_uv,}}
{{count(distinct a.device_id) - count(distinct case when b.device_id is not
null then b.device_id end) as new_uv}}
{{from}}
{{(}}
{{select}}
{{event_date,}}
{{device_id,}}
{{qingting_id}}
{{from datacenter.bl_page_chain_day}}
{{where event_date = '2019-03-31'}}
{{and (current_content like 'https://a.qingting.fm/membership5%'}}
{{or current_content like 'https://m.qingting.fm/vips/members%'}}
{{or current_content like 'https://sss.qingting.fm/vips/members/v2/%')}}
{{)a}}
{{left join}}
{{(select}}
b.device_id
{{from}}
{{lzq_test.first_buy_vip a}}
{{inner join datacenter.device_qingting b on a.qingting_id = b.qingting_id}}
{{where a.first_buy < '2019-03-31'}}
{{group by b.device_id}}
{{)b}}
{{on a.device_id = b.device_id}}
{{group by a.event_date;}}
plan:
{quote}Plan optimized by CBO. |
Vertex dependency in root stage
Map 1 <- Map 3 (BROADCAST_EDGE)
Reducer 2 <- Map 1 (SIMPLE_EDGE)
Reducer 5 <- Map 4 (CUSTOM_SIMPLE_EDGE), Reducer 2 (ONE_TO_ONE_EDGE)
Reducer 6 <- Reducer 5 (SIMPLE_EDGE)
Stage-0
Fetch Operator
limit:-1
Stage-1
Reducer 6
File Output Operator [FS_26]
Select Operator [SEL_25] (rows=35527639 width=349)
Output:["_col0","_col1","_col2","_col3"]
Group By Operator [GBY_24] (rows=35527639 width=349)
Output:["_col0","_col1","_col2"],aggregations:["count(DISTINCT
KEY._col1:0._col0)","count(DISTINCT KEY._col1:1._col0)"],keys:KEY._col0
<-Reducer 5 [SIMPLE_EDGE]
SHUFFLE [RS_23]
PartitionCols:_col0
Group By Operator [GBY_22] (rows=71055278 width=349)
Output:["_col0","_col1","_col2","_col3","_col4"],aggregations:["count(DISTINCT
_col1)","count(DISTINCT _col2)"],keys:true, _col1, _col2
Select Operator [SEL_20] (rows=71055278 width=349)
Output:["_col1","_col2"]
Map Join Operator [MAPJOIN_45] (rows=71055278 width=349)
Conds:RS_17.KEY.reducesinkkey0=RS_18.KEY.reducesinkkey0(Right
Outer),Output:["_col0","_col1"]
<-Reducer 2 [ONE_TO_ONE_EDGE]
FORWARD [RS_17]
PartitionCols:_col0
Group By Operator [GBY_12] (rows=21738609 width=235)
Output:["_col0"],keys:KEY._col0
<-Map 1 [SIMPLE_EDGE]
SHUFFLE [RS_11]
PartitionCols:_col0
Group By Operator [GBY_10] (rows=43477219 width=235)
Output:["_col0"],keys:_col0
Map Join Operator [MAPJOIN_44] (rows=43477219 width=235)
Conds:SEL_2._col1=RS_7._col0(Inner),Output:["_col0"]
<-Map 3 [BROADCAST_EDGE]
BROADCAST [RS_7]
PartitionCols:_col0
Select Operator [SEL_5] (rows=301013 width=228)
Output:["_col0"]
Filter Operator [FIL_32] (rows=301013 width=228)
predicate:((first_buy < DATE'2019-03-31') and qingting_id is not null)
TableScan [TS_3] (rows=1062401 width=228)
lzq_test@first_buy_vip,a, transactional
table,Tbl:COMPLETE,Col:NONE,Output:["qingting_id","first_buy"]
<-Select Operator [SEL_2] (rows=39524744 width=235)
Output:["_col0","_col1"]
Filter Operator [FIL_31] (rows=39524744 width=235)
predicate:qingting_id is not null
TableScan [TS_0] (rows=39524744 width=235)
datacenter@device_qingting,b, ACID
table,Tbl:COMPLETE,Col:COMPLETE,Output:["device_id","qingting_id"]
<-Map 4 [CUSTOM_SIMPLE_EDGE]
PARTITION_ONLY_SHUFFLE [RS_18]
PartitionCols:_col0
Select Operator [SEL_16] (rows=64595706 width=349)
Output:["_col0"]
Filter Operator [FIL_33] (rows=64595706 width=349)
predicate:((current_content like 'https://a.qingting.fm/membership5%') or
(current_content like 'https://m.qingting.fm/vips/members%') or
(current_content like 'https://sss.qingting.fm/vips/members/v2/%'))
TableScan [TS_14] (rows=64595706 width=349)
datacenter@bl_page_chain_day,bl_page_chain_day,Tbl:COMPLETE,Col:NONE,Output:["device_id","current_content"]
{quote}
was:
when I use a table instead of the sub select,I get the right result,much more
rows are joined together(metrics old_uv is bigger!!!)
Is there some bugs here?
Please help me ,thanks a lot!!!!!!
{quote}select
{{a.event_date,}}
{{count(distinct a.device_id) as uv,}}
{{count(distinct case when b.device_id is not null then b.device_id end) as
old_uv,}}
{{count(distinct a.device_id) - count(distinct case when b.device_id is not
null then b.device_id end) as new_uv}}
{{from}}
{{(}}
{{select}}
{{event_date,}}
{{device_id,}}
{{qingting_id}}
{{from datacenter.bl_page_chain_day}}
{{where event_date = '2019-03-31'}}
{{and (current_content like 'https://a.qingting.fm/membership5%'}}
{{or current_content like 'https://m.qingting.fm/vips/members%'}}
{{or current_content like 'https://sss.qingting.fm/vips/members/v2/%')}}
{{)a}}
{{left join}}
{{(select}}
b.device_id
{{from}}
{{lzq_test.first_buy_vip a}}
{{inner join datacenter.device_qingting b on a.qingting_id = b.qingting_id}}
{{where a.first_buy < '2019-03-31'}}
{{group by b.device_id}}
{{)b}}
{{on a.device_id = b.device_id}}
{{group by a.event_date;}}
{quote}
plan:
{quote}Plan optimized by CBO. |
Vertex dependency in root stage
Map 1 <- Map 3 (BROADCAST_EDGE)
Reducer 2 <- Map 1 (SIMPLE_EDGE)
Reducer 5 <- Map 4 (CUSTOM_SIMPLE_EDGE), Reducer 2 (ONE_TO_ONE_EDGE)
Reducer 6 <- Reducer 5 (SIMPLE_EDGE)
Stage-0
Fetch Operator
limit:-1
Stage-1
Reducer 6
File Output Operator [FS_26]
Select Operator [SEL_25] (rows=35527639 width=349)
Output:["_col0","_col1","_col2","_col3"]
Group By Operator [GBY_24] (rows=35527639 width=349)
Output:["_col0","_col1","_col2"],aggregations:["count(DISTINCT
KEY._col1:0._col0)","count(DISTINCT KEY._col1:1._col0)"],keys:KEY._col0
<-Reducer 5 [SIMPLE_EDGE]
SHUFFLE [RS_23]
PartitionCols:_col0
Group By Operator [GBY_22] (rows=71055278 width=349)
Output:["_col0","_col1","_col2","_col3","_col4"],aggregations:["count(DISTINCT
_col1)","count(DISTINCT _col2)"],keys:true, _col1, _col2
Select Operator [SEL_20] (rows=71055278 width=349)
Output:["_col1","_col2"]
Map Join Operator [MAPJOIN_45] (rows=71055278 width=349)
Conds:RS_17.KEY.reducesinkkey0=RS_18.KEY.reducesinkkey0(Right
Outer),Output:["_col0","_col1"]
<-Reducer 2 [ONE_TO_ONE_EDGE]
FORWARD [RS_17]
PartitionCols:_col0
Group By Operator [GBY_12] (rows=21738609 width=235)
Output:["_col0"],keys:KEY._col0
<-Map 1 [SIMPLE_EDGE]
SHUFFLE [RS_11]
PartitionCols:_col0
Group By Operator [GBY_10] (rows=43477219 width=235)
Output:["_col0"],keys:_col0
Map Join Operator [MAPJOIN_44] (rows=43477219 width=235)
Conds:SEL_2._col1=RS_7._col0(Inner),Output:["_col0"]
<-Map 3 [BROADCAST_EDGE]
BROADCAST [RS_7]
PartitionCols:_col0
Select Operator [SEL_5] (rows=301013 width=228)
Output:["_col0"]
Filter Operator [FIL_32] (rows=301013 width=228)
predicate:((first_buy < DATE'2019-03-31') and qingting_id is not null)
TableScan [TS_3] (rows=1062401 width=228)
lzq_test@first_buy_vip,a, transactional
table,Tbl:COMPLETE,Col:NONE,Output:["qingting_id","first_buy"]
<-Select Operator [SEL_2] (rows=39524744 width=235)
Output:["_col0","_col1"]
Filter Operator [FIL_31] (rows=39524744 width=235)
predicate:qingting_id is not null
TableScan [TS_0] (rows=39524744 width=235)
datacenter@device_qingting,b, ACID
table,Tbl:COMPLETE,Col:COMPLETE,Output:["device_id","qingting_id"]
<-Map 4 [CUSTOM_SIMPLE_EDGE]
PARTITION_ONLY_SHUFFLE [RS_18]
PartitionCols:_col0
Select Operator [SEL_16] (rows=64595706 width=349)
Output:["_col0"]
Filter Operator [FIL_33] (rows=64595706 width=349)
predicate:((current_content like 'https://a.qingting.fm/membership5%') or
(current_content like 'https://m.qingting.fm/vips/members%') or
(current_content like 'https://sss.qingting.fm/vips/members/v2/%'))
TableScan [TS_14] (rows=64595706 width=349)
datacenter@bl_page_chain_day,bl_page_chain_day,Tbl:COMPLETE,Col:NONE,Output:["device_id","current_content"]
{quote}
> return wrong result when excuting left join sql
> -----------------------------------------------
>
> Key: HIVE-21574
> URL: https://issues.apache.org/jira/browse/HIVE-21574
> Project: Hive
> Issue Type: Bug
> Affects Versions: 3.1.0
> Environment: hive 3.1.0 hdfs 3.1.1
> Reporter: Panda Song
> Priority: Blocker
>
> when I use a table instead of the sub select,I get the right result,much more
> rows are joined together(metrics old_uv is bigger!!!)
> Is there some bugs here?
> Please help me ,thanks a lot!!!!!!
> select
> {{a.event_date,}}
> {{count(distinct a.device_id) as uv,}}
> {{count(distinct case when b.device_id is not null then b.device_id end) as
> old_uv,}}
> {{count(distinct a.device_id) - count(distinct case when b.device_id is not
> null then b.device_id end) as new_uv}}
> {{from}}
> {{(}}
> {{select}}
> {{event_date,}}
> {{device_id,}}
> {{qingting_id}}
> {{from datacenter.bl_page_chain_day}}
> {{where event_date = '2019-03-31'}}
> {{and (current_content like 'https://a.qingting.fm/membership5%'}}
> {{or current_content like 'https://m.qingting.fm/vips/members%'}}
> {{or current_content like 'https://sss.qingting.fm/vips/members/v2/%')}}
> {{)a}}
> {{left join}}
> {{(select}}
> b.device_id
> {{from}}
> {{lzq_test.first_buy_vip a}}
> {{inner join datacenter.device_qingting b on a.qingting_id = b.qingting_id}}
> {{where a.first_buy < '2019-03-31'}}
> {{group by b.device_id}}
> {{)b}}
> {{on a.device_id = b.device_id}}
> {{group by a.event_date;}}
> plan:
> {quote}Plan optimized by CBO. |
> Vertex dependency in root stage
> Map 1 <- Map 3 (BROADCAST_EDGE)
> Reducer 2 <- Map 1 (SIMPLE_EDGE)
> Reducer 5 <- Map 4 (CUSTOM_SIMPLE_EDGE), Reducer 2 (ONE_TO_ONE_EDGE)
> Reducer 6 <- Reducer 5 (SIMPLE_EDGE)
> Stage-0
> Fetch Operator
> limit:-1
> Stage-1
> Reducer 6
> File Output Operator [FS_26]
> Select Operator [SEL_25] (rows=35527639 width=349)
> Output:["_col0","_col1","_col2","_col3"]
> Group By Operator [GBY_24] (rows=35527639 width=349)
> Output:["_col0","_col1","_col2"],aggregations:["count(DISTINCT
> KEY._col1:0._col0)","count(DISTINCT KEY._col1:1._col0)"],keys:KEY._col0
> <-Reducer 5 [SIMPLE_EDGE]
> SHUFFLE [RS_23]
> PartitionCols:_col0
> Group By Operator [GBY_22] (rows=71055278 width=349)
>
> Output:["_col0","_col1","_col2","_col3","_col4"],aggregations:["count(DISTINCT
> _col1)","count(DISTINCT _col2)"],keys:true, _col1, _col2
> Select Operator [SEL_20] (rows=71055278 width=349)
> Output:["_col1","_col2"]
> Map Join Operator [MAPJOIN_45] (rows=71055278 width=349)
> Conds:RS_17.KEY.reducesinkkey0=RS_18.KEY.reducesinkkey0(Right
> Outer),Output:["_col0","_col1"]
> <-Reducer 2 [ONE_TO_ONE_EDGE]
> FORWARD [RS_17]
> PartitionCols:_col0
> Group By Operator [GBY_12] (rows=21738609 width=235)
> Output:["_col0"],keys:KEY._col0
> <-Map 1 [SIMPLE_EDGE]
> SHUFFLE [RS_11]
> PartitionCols:_col0
> Group By Operator [GBY_10] (rows=43477219 width=235)
> Output:["_col0"],keys:_col0
> Map Join Operator [MAPJOIN_44] (rows=43477219 width=235)
> Conds:SEL_2._col1=RS_7._col0(Inner),Output:["_col0"]
> <-Map 3 [BROADCAST_EDGE]
> BROADCAST [RS_7]
> PartitionCols:_col0
> Select Operator [SEL_5] (rows=301013 width=228)
> Output:["_col0"]
> Filter Operator [FIL_32] (rows=301013 width=228)
> predicate:((first_buy < DATE'2019-03-31') and qingting_id is not null)
> TableScan [TS_3] (rows=1062401 width=228)
> lzq_test@first_buy_vip,a, transactional
> table,Tbl:COMPLETE,Col:NONE,Output:["qingting_id","first_buy"]
> <-Select Operator [SEL_2] (rows=39524744 width=235)
> Output:["_col0","_col1"]
> Filter Operator [FIL_31] (rows=39524744 width=235)
> predicate:qingting_id is not null
> TableScan [TS_0] (rows=39524744 width=235)
> datacenter@device_qingting,b, ACID
> table,Tbl:COMPLETE,Col:COMPLETE,Output:["device_id","qingting_id"]
> <-Map 4 [CUSTOM_SIMPLE_EDGE]
> PARTITION_ONLY_SHUFFLE [RS_18]
> PartitionCols:_col0
> Select Operator [SEL_16] (rows=64595706 width=349)
> Output:["_col0"]
> Filter Operator [FIL_33] (rows=64595706 width=349)
> predicate:((current_content like 'https://a.qingting.fm/membership5%') or
> (current_content like 'https://m.qingting.fm/vips/members%') or
> (current_content like 'https://sss.qingting.fm/vips/members/v2/%'))
> TableScan [TS_14] (rows=64595706 width=349)
>
> datacenter@bl_page_chain_day,bl_page_chain_day,Tbl:COMPLETE,Col:NONE,Output:["device_id","current_content"]
> {quote}
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)