[ 
https://issues.apache.org/jira/browse/HIVE-21574?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Panda Song updated HIVE-21574:
------------------------------
    Description: 
when I use a table instead of the sub select,I get the right result,much more 
rows are joined together(metrics old_uv is bigger!!!) 

Is there some bugs here?

Please help me ,thanks a lot!!!!!!

select 
 {{a.event_date,}}
 {{count(distinct a.device_id) as uv,}}
 {{count(distinct case when b.device_id is not null then b.device_id end) as 
old_uv,}}
 {{count(distinct a.device_id) - count(distinct case when b.device_id is not 
null then b.device_id end) as new_uv}}
 {{from}}
 {{(}}
 {{select}}
 {{event_date,}}
 {{device_id,}}
 {{qingting_id}}
 {{from datacenter.bl_page_chain_day}}
 {{where event_date = '2019-03-31'}}
 {{and (current_content like 'https://a.qingting.fm/membership5%'}}
 {{or current_content like 'https://m.qingting.fm/vips/members%'}}
 {{or current_content like 'https://sss.qingting.fm/vips/members/v2/%')}}
 {{)a}}
 {{left join}}
 {{(select}}
   b.device_id
 {{from}}
 {{lzq_test.first_buy_vip a}}
 {{inner join datacenter.device_qingting b on a.qingting_id = b.qingting_id}}
 {{where a.first_buy < '2019-03-31'}}
 {{group by b.device_id}}
 {{)b}}
 {{on a.device_id = b.device_id}}
 {{group by a.event_date;}}

plan:
{quote}Plan optimized by CBO. |

Vertex dependency in root stage 
 Map 1 <- Map 3 (BROADCAST_EDGE) 
 Reducer 2 <- Map 1 (SIMPLE_EDGE) 
 Reducer 5 <- Map 4 (CUSTOM_SIMPLE_EDGE), Reducer 2 (ONE_TO_ONE_EDGE) 
 Reducer 6 <- Reducer 5 (SIMPLE_EDGE)

Stage-0 
 Fetch Operator 
 limit:-1 
 Stage-1 
 Reducer 6 
 File Output Operator [FS_26] 
 Select Operator [SEL_25] (rows=35527639 width=349) 
 Output:["_col0","_col1","_col2","_col3"] 
 Group By Operator [GBY_24] (rows=35527639 width=349) 
 Output:["_col0","_col1","_col2"],aggregations:["count(DISTINCT 
KEY._col1:0._col0)","count(DISTINCT KEY._col1:1._col0)"],keys:KEY._col0 
 <-Reducer 5 [SIMPLE_EDGE] 
 SHUFFLE [RS_23] 
 PartitionCols:_col0 
 Group By Operator [GBY_22] (rows=71055278 width=349) 
 Output:["_col0","_col1","_col2","_col3","_col4"],aggregations:["count(DISTINCT 
_col1)","count(DISTINCT _col2)"],keys:true, _col1, _col2 
 Select Operator [SEL_20] (rows=71055278 width=349) 
 Output:["_col1","_col2"] 
 Map Join Operator [MAPJOIN_45] (rows=71055278 width=349) 
 Conds:RS_17.KEY.reducesinkkey0=RS_18.KEY.reducesinkkey0(Right 
Outer),Output:["_col0","_col1"] 
 <-Reducer 2 [ONE_TO_ONE_EDGE] 
 FORWARD [RS_17] 
 PartitionCols:_col0 
 Group By Operator [GBY_12] (rows=21738609 width=235) 
 Output:["_col0"],keys:KEY._col0 
 <-Map 1 [SIMPLE_EDGE] 
 SHUFFLE [RS_11] 
 PartitionCols:_col0 
 Group By Operator [GBY_10] (rows=43477219 width=235) 
 Output:["_col0"],keys:_col0 
 Map Join Operator [MAPJOIN_44] (rows=43477219 width=235) 
 Conds:SEL_2._col1=RS_7._col0(Inner),Output:["_col0"] 
 <-Map 3 [BROADCAST_EDGE] 
 BROADCAST [RS_7] 
 PartitionCols:_col0 
 Select Operator [SEL_5] (rows=301013 width=228) 
 Output:["_col0"] 
 Filter Operator [FIL_32] (rows=301013 width=228) 
 predicate:((first_buy < DATE'2019-03-31') and qingting_id is not null) 
 TableScan [TS_3] (rows=1062401 width=228) 
 lzq_test@first_buy_vip,a, transactional 
table,Tbl:COMPLETE,Col:NONE,Output:["qingting_id","first_buy"] 
 <-Select Operator [SEL_2] (rows=39524744 width=235) 
 Output:["_col0","_col1"] 
 Filter Operator [FIL_31] (rows=39524744 width=235) 
 predicate:qingting_id is not null 
 TableScan [TS_0] (rows=39524744 width=235) 
 datacenter@device_qingting,b, ACID 
table,Tbl:COMPLETE,Col:COMPLETE,Output:["device_id","qingting_id"] 
 <-Map 4 [CUSTOM_SIMPLE_EDGE] 
 PARTITION_ONLY_SHUFFLE [RS_18] 
 PartitionCols:_col0 
 Select Operator [SEL_16] (rows=64595706 width=349) 
 Output:["_col0"] 
 Filter Operator [FIL_33] (rows=64595706 width=349) 
 predicate:((current_content like 'https://a.qingting.fm/membership5%') or 
(current_content like 'https://m.qingting.fm/vips/members%') or 
(current_content like 'https://sss.qingting.fm/vips/members/v2/%')) 
 TableScan [TS_14] (rows=64595706 width=349) 
 
datacenter@bl_page_chain_day,bl_page_chain_day,Tbl:COMPLETE,Col:NONE,Output:["device_id","current_content"]
{quote}

  was:
when I use a table instead of the sub select,I get the right result,much more 
rows are joined together(metrics old_uv is bigger!!!) 

Is there some bugs here?

Please help me ,thanks a lot!!!!!!
{quote}select 
 {{a.event_date,}}
 {{count(distinct a.device_id) as uv,}}
 {{count(distinct case when b.device_id is not null then b.device_id end) as 
old_uv,}}
 {{count(distinct a.device_id) - count(distinct case when b.device_id is not 
null then b.device_id end) as new_uv}}
 {{from}}
 {{(}}
 {{select}}
 {{event_date,}}
 {{device_id,}}
 {{qingting_id}}
 {{from datacenter.bl_page_chain_day}}
 {{where event_date = '2019-03-31'}}
 {{and (current_content like 'https://a.qingting.fm/membership5%'}}
 {{or current_content like 'https://m.qingting.fm/vips/members%'}}
 {{or current_content like 'https://sss.qingting.fm/vips/members/v2/%')}}
 {{)a}}
 {{left join}}
 {{(select}}
  b.device_id
 {{from}}
 {{lzq_test.first_buy_vip a}}
 {{inner join datacenter.device_qingting b on a.qingting_id = b.qingting_id}}
 {{where a.first_buy < '2019-03-31'}}
 {{group by b.device_id}}
 {{)b}}
 {{on a.device_id = b.device_id}}
 {{group by a.event_date;}}
{quote}
plan:
{quote}Plan optimized by CBO. |

Vertex dependency in root stage 
 Map 1 <- Map 3 (BROADCAST_EDGE) 
 Reducer 2 <- Map 1 (SIMPLE_EDGE) 
 Reducer 5 <- Map 4 (CUSTOM_SIMPLE_EDGE), Reducer 2 (ONE_TO_ONE_EDGE) 
 Reducer 6 <- Reducer 5 (SIMPLE_EDGE)

Stage-0 
 Fetch Operator 
 limit:-1 
 Stage-1 
 Reducer 6 
 File Output Operator [FS_26] 
 Select Operator [SEL_25] (rows=35527639 width=349) 
 Output:["_col0","_col1","_col2","_col3"] 
 Group By Operator [GBY_24] (rows=35527639 width=349) 
 Output:["_col0","_col1","_col2"],aggregations:["count(DISTINCT 
KEY._col1:0._col0)","count(DISTINCT KEY._col1:1._col0)"],keys:KEY._col0 
 <-Reducer 5 [SIMPLE_EDGE] 
 SHUFFLE [RS_23] 
 PartitionCols:_col0 
 Group By Operator [GBY_22] (rows=71055278 width=349) 
 Output:["_col0","_col1","_col2","_col3","_col4"],aggregations:["count(DISTINCT 
_col1)","count(DISTINCT _col2)"],keys:true, _col1, _col2 
 Select Operator [SEL_20] (rows=71055278 width=349) 
 Output:["_col1","_col2"] 
 Map Join Operator [MAPJOIN_45] (rows=71055278 width=349) 
 Conds:RS_17.KEY.reducesinkkey0=RS_18.KEY.reducesinkkey0(Right 
Outer),Output:["_col0","_col1"] 
 <-Reducer 2 [ONE_TO_ONE_EDGE] 
 FORWARD [RS_17] 
 PartitionCols:_col0 
 Group By Operator [GBY_12] (rows=21738609 width=235) 
 Output:["_col0"],keys:KEY._col0 
 <-Map 1 [SIMPLE_EDGE] 
 SHUFFLE [RS_11] 
 PartitionCols:_col0 
 Group By Operator [GBY_10] (rows=43477219 width=235) 
 Output:["_col0"],keys:_col0 
 Map Join Operator [MAPJOIN_44] (rows=43477219 width=235) 
 Conds:SEL_2._col1=RS_7._col0(Inner),Output:["_col0"] 
 <-Map 3 [BROADCAST_EDGE] 
 BROADCAST [RS_7] 
 PartitionCols:_col0 
 Select Operator [SEL_5] (rows=301013 width=228) 
 Output:["_col0"] 
 Filter Operator [FIL_32] (rows=301013 width=228) 
 predicate:((first_buy < DATE'2019-03-31') and qingting_id is not null) 
 TableScan [TS_3] (rows=1062401 width=228) 
 lzq_test@first_buy_vip,a, transactional 
table,Tbl:COMPLETE,Col:NONE,Output:["qingting_id","first_buy"] 
 <-Select Operator [SEL_2] (rows=39524744 width=235) 
 Output:["_col0","_col1"] 
 Filter Operator [FIL_31] (rows=39524744 width=235) 
 predicate:qingting_id is not null 
 TableScan [TS_0] (rows=39524744 width=235) 
 datacenter@device_qingting,b, ACID 
table,Tbl:COMPLETE,Col:COMPLETE,Output:["device_id","qingting_id"] 
 <-Map 4 [CUSTOM_SIMPLE_EDGE] 
 PARTITION_ONLY_SHUFFLE [RS_18] 
 PartitionCols:_col0 
 Select Operator [SEL_16] (rows=64595706 width=349) 
 Output:["_col0"] 
 Filter Operator [FIL_33] (rows=64595706 width=349) 
 predicate:((current_content like 'https://a.qingting.fm/membership5%') or 
(current_content like 'https://m.qingting.fm/vips/members%') or 
(current_content like 'https://sss.qingting.fm/vips/members/v2/%')) 
 TableScan [TS_14] (rows=64595706 width=349) 
 
datacenter@bl_page_chain_day,bl_page_chain_day,Tbl:COMPLETE,Col:NONE,Output:["device_id","current_content"]
{quote}


> return wrong result when excuting left join sql
> -----------------------------------------------
>
>                 Key: HIVE-21574
>                 URL: https://issues.apache.org/jira/browse/HIVE-21574
>             Project: Hive
>          Issue Type: Bug
>    Affects Versions: 3.1.0
>         Environment: hive 3.1.0 hdfs 3.1.1
>            Reporter: Panda Song
>            Priority: Blocker
>
> when I use a table instead of the sub select,I get the right result,much more 
> rows are joined together(metrics old_uv is bigger!!!) 
> Is there some bugs here?
> Please help me ,thanks a lot!!!!!!
> select 
>  {{a.event_date,}}
>  {{count(distinct a.device_id) as uv,}}
>  {{count(distinct case when b.device_id is not null then b.device_id end) as 
> old_uv,}}
>  {{count(distinct a.device_id) - count(distinct case when b.device_id is not 
> null then b.device_id end) as new_uv}}
>  {{from}}
>  {{(}}
>  {{select}}
>  {{event_date,}}
>  {{device_id,}}
>  {{qingting_id}}
>  {{from datacenter.bl_page_chain_day}}
>  {{where event_date = '2019-03-31'}}
>  {{and (current_content like 'https://a.qingting.fm/membership5%'}}
>  {{or current_content like 'https://m.qingting.fm/vips/members%'}}
>  {{or current_content like 'https://sss.qingting.fm/vips/members/v2/%')}}
>  {{)a}}
>  {{left join}}
>  {{(select}}
>    b.device_id
>  {{from}}
>  {{lzq_test.first_buy_vip a}}
>  {{inner join datacenter.device_qingting b on a.qingting_id = b.qingting_id}}
>  {{where a.first_buy < '2019-03-31'}}
>  {{group by b.device_id}}
>  {{)b}}
>  {{on a.device_id = b.device_id}}
>  {{group by a.event_date;}}
> plan:
> {quote}Plan optimized by CBO. |
> Vertex dependency in root stage 
>  Map 1 <- Map 3 (BROADCAST_EDGE) 
>  Reducer 2 <- Map 1 (SIMPLE_EDGE) 
>  Reducer 5 <- Map 4 (CUSTOM_SIMPLE_EDGE), Reducer 2 (ONE_TO_ONE_EDGE) 
>  Reducer 6 <- Reducer 5 (SIMPLE_EDGE)
> Stage-0 
>  Fetch Operator 
>  limit:-1 
>  Stage-1 
>  Reducer 6 
>  File Output Operator [FS_26] 
>  Select Operator [SEL_25] (rows=35527639 width=349) 
>  Output:["_col0","_col1","_col2","_col3"] 
>  Group By Operator [GBY_24] (rows=35527639 width=349) 
>  Output:["_col0","_col1","_col2"],aggregations:["count(DISTINCT 
> KEY._col1:0._col0)","count(DISTINCT KEY._col1:1._col0)"],keys:KEY._col0 
>  <-Reducer 5 [SIMPLE_EDGE] 
>  SHUFFLE [RS_23] 
>  PartitionCols:_col0 
>  Group By Operator [GBY_22] (rows=71055278 width=349) 
>  
> Output:["_col0","_col1","_col2","_col3","_col4"],aggregations:["count(DISTINCT
>  _col1)","count(DISTINCT _col2)"],keys:true, _col1, _col2 
>  Select Operator [SEL_20] (rows=71055278 width=349) 
>  Output:["_col1","_col2"] 
>  Map Join Operator [MAPJOIN_45] (rows=71055278 width=349) 
>  Conds:RS_17.KEY.reducesinkkey0=RS_18.KEY.reducesinkkey0(Right 
> Outer),Output:["_col0","_col1"] 
>  <-Reducer 2 [ONE_TO_ONE_EDGE] 
>  FORWARD [RS_17] 
>  PartitionCols:_col0 
>  Group By Operator [GBY_12] (rows=21738609 width=235) 
>  Output:["_col0"],keys:KEY._col0 
>  <-Map 1 [SIMPLE_EDGE] 
>  SHUFFLE [RS_11] 
>  PartitionCols:_col0 
>  Group By Operator [GBY_10] (rows=43477219 width=235) 
>  Output:["_col0"],keys:_col0 
>  Map Join Operator [MAPJOIN_44] (rows=43477219 width=235) 
>  Conds:SEL_2._col1=RS_7._col0(Inner),Output:["_col0"] 
>  <-Map 3 [BROADCAST_EDGE] 
>  BROADCAST [RS_7] 
>  PartitionCols:_col0 
>  Select Operator [SEL_5] (rows=301013 width=228) 
>  Output:["_col0"] 
>  Filter Operator [FIL_32] (rows=301013 width=228) 
>  predicate:((first_buy < DATE'2019-03-31') and qingting_id is not null) 
>  TableScan [TS_3] (rows=1062401 width=228) 
>  lzq_test@first_buy_vip,a, transactional 
> table,Tbl:COMPLETE,Col:NONE,Output:["qingting_id","first_buy"] 
>  <-Select Operator [SEL_2] (rows=39524744 width=235) 
>  Output:["_col0","_col1"] 
>  Filter Operator [FIL_31] (rows=39524744 width=235) 
>  predicate:qingting_id is not null 
>  TableScan [TS_0] (rows=39524744 width=235) 
>  datacenter@device_qingting,b, ACID 
> table,Tbl:COMPLETE,Col:COMPLETE,Output:["device_id","qingting_id"] 
>  <-Map 4 [CUSTOM_SIMPLE_EDGE] 
>  PARTITION_ONLY_SHUFFLE [RS_18] 
>  PartitionCols:_col0 
>  Select Operator [SEL_16] (rows=64595706 width=349) 
>  Output:["_col0"] 
>  Filter Operator [FIL_33] (rows=64595706 width=349) 
>  predicate:((current_content like 'https://a.qingting.fm/membership5%') or 
> (current_content like 'https://m.qingting.fm/vips/members%') or 
> (current_content like 'https://sss.qingting.fm/vips/members/v2/%')) 
>  TableScan [TS_14] (rows=64595706 width=349) 
>  
> datacenter@bl_page_chain_day,bl_page_chain_day,Tbl:COMPLETE,Col:NONE,Output:["device_id","current_content"]
> {quote}



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to