[jira] [Updated] (HIVE-18454) Incorrect rownum estimation in joins

Zoltan Haindrich (JIRA) Tue, 16 Jan 2018 05:58:34 -0800

     [ 
https://issues.apache.org/jira/browse/HIVE-18454?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]


Zoltan Haindrich updated HIVE-18454:
------------------------------------
    Description: 
row counts seems to be off the charts...sometimes ~120000 rows estimated when 
the table has only 10 rows
{code:java}
create table s (x int);

insert into s values
(1),(2),(3),(4),(5),
(6),(7),(8),(9),(10);

create table tu(id_uv int,id_uw int,u int);
create table tv(id_uv int,v int);
create table tw(id_uw int,w int);

from s
insert overwrite table tu
        select x,x,x 
        where x<=6 or x=10
insert overwrite table tv
        select x,x              
        where x<=3 or x=10
insert overwrite table tw
        select x,x              
;

set hive.explain.user=true;

explain analyze
select sum(u*v*w) from tu
        join tv on (tu.id_uv=tv.id_uv)
        join tw on (tu.id_uw=tw.id_uw)
        where w>9 and u>1 and v>3;

desc formatted tv;
{code}
explain analyze output:
{code:java}
| Plan optimized by CBO.                             |
|                                                    |
| Vertex dependency in root stage                    |
| Map 1 <- Map 3 (BROADCAST_EDGE), Map 4 (BROADCAST_EDGE) |
| Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE)            |
|                                                    |
| Stage-0                                            |
|   Fetch Operator                                   |
|     limit:-1                                       |
|     Stage-1                                        |
|       Reducer 2                                    |
|       File Output Operator [FS_21]                 |
|         Group By Operator [GBY_19] (rows=1/1 width=8) |
|           Output:["_col0"],aggregations:["sum(VALUE._col0)"] |
|         <-Map 1 [CUSTOM_SIMPLE_EDGE]               |
|           PARTITION_ONLY_SHUFFLE [RS_18]           |
|             Group By Operator [GBY_17] (rows=1/1 width=8) |
|               Output:["_col0"],aggregations:["sum(_col0)"] |
|               Select Operator [SEL_15] (rows=48400/1 width=5) |
|                 Output:["_col0"]                   |
|                 Map Join Operator [MAPJOIN_31] (rows=48400/1 width=5) |
|                   
Conds:MAPJOIN_30._col1=RS_13._col0(Inner),HybridGraceHashJoin:true,Output:["_col2","_col4","_col6"]
 |
|                 <-Map 4 [BROADCAST_EDGE]           |
|                   BROADCAST [RS_13]                |
|                     PartitionCols:_col0            |
|                     Select Operator [SEL_8] (rows=3/1 width=3) |
|                       Output:["_col0","_col1"]     |
|                       Filter Operator [FIL_29] (rows=3/1 width=3) |
|                         predicate:((w > 9) and id_uw is not null) |
|                         TableScan [TS_6] (rows=10/10 width=3) |
|                           
default@tw,tw,Tbl:COMPLETE,Col:NONE,Output:["id_uw","w"] |
|                 <-Map Join Operator [MAPJOIN_30] (rows=44000/1 width=5) |
|                     
Conds:SEL_2._col0=RS_10._col0(Inner),HybridGraceHashJoin:true,Output:["_col1","_col2","_col4"]
 |
|                   <-Map 3 [BROADCAST_EDGE]         |
|                     BROADCAST [RS_10]              |
|                       PartitionCols:_col0          |
|                       Select Operator [SEL_5] (rows=1632/1 width=3) |
|                         Output:["_col0","_col1"]   |
|                         Filter Operator [FIL_28] (rows=1632/1 width=3) |
|                           predicate:((v > 3) and id_uv is not null) |
|                           TableScan [TS_3] (rows=4898/4 width=3) |
|                             
default@tv,tv,Tbl:COMPLETE,Col:NONE,Output:["id_uv","v"] |
|                   <-Select Operator [SEL_2] (rows=40000/6 width=5) |
|                       Output:["_col0","_col1","_col2"] |
|                       Filter Operator [FIL_27] (rows=40000/6 width=5) |
|                         predicate:((u > 1) and id_uv is not null and id_uw is 
not null) |
|                         TableScan [TS_0] (rows=120000/7 width=5) |
|                           
default@tu,tu,Tbl:COMPLETE,Col:NONE,Output:["id_uv","id_uw","u"] |
{code}

  was:
I've seen this probably earlier...row counts seems to be off the 
charts...120000 rows estimated when the table has only 10 rows

{code}
create table s (x int);

insert into s values
(1),(2),(3),(4),(5),
(6),(7),(8),(9),(10);

create table tu(id_uv int,id_uw int,u int);
create table tv(id_uv int,v int);
create table tw(id_uw int,w int);

from s
insert overwrite table tu
        select x,x,x 
        where x<=6 or x=10
insert overwrite table tv
        select x,x              
        where x<=3 or x=10
insert overwrite table tw
        select x,x              
;

set hive.explain.user=true;

explain analyze
select sum(u*v*w) from tu
        join tv on (tu.id_uv=tv.id_uv)
        join tw on (tu.id_uw=tw.id_uw)
        where w>9 and u>1 and v>3;

desc formatted tv;
{code}




> Incorrect rownum estimation in joins
> ------------------------------------
>
>                 Key: HIVE-18454
>                 URL: https://issues.apache.org/jira/browse/HIVE-18454
>             Project: Hive
>          Issue Type: Sub-task
>          Components: Statistics
>            Reporter: Zoltan Haindrich
>            Priority: Major
>
> row counts seems to be off the charts...sometimes ~120000 rows estimated when 
> the table has only 10 rows
> {code:java}
> create table s (x int);
> insert into s values
> (1),(2),(3),(4),(5),
> (6),(7),(8),(9),(10);
> create table tu(id_uv int,id_uw int,u int);
> create table tv(id_uv int,v int);
> create table tw(id_uw int,w int);
> from s
> insert overwrite table tu
>         select x,x,x 
>         where x<=6 or x=10
> insert overwrite table tv
>         select x,x              
>         where x<=3 or x=10
> insert overwrite table tw
>         select x,x              
> ;
> set hive.explain.user=true;
> explain analyze
> select sum(u*v*w) from tu
>         join tv on (tu.id_uv=tv.id_uv)
>         join tw on (tu.id_uw=tw.id_uw)
>         where w>9 and u>1 and v>3;
> desc formatted tv;
> {code}
> explain analyze output:
> {code:java}
> | Plan optimized by CBO.                             |
> |                                                    |
> | Vertex dependency in root stage                    |
> | Map 1 <- Map 3 (BROADCAST_EDGE), Map 4 (BROADCAST_EDGE) |
> | Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE)            |
> |                                                    |
> | Stage-0                                            |
> |   Fetch Operator                                   |
> |     limit:-1                                       |
> |     Stage-1                                        |
> |       Reducer 2                                    |
> |       File Output Operator [FS_21]                 |
> |         Group By Operator [GBY_19] (rows=1/1 width=8) |
> |           Output:["_col0"],aggregations:["sum(VALUE._col0)"] |
> |         <-Map 1 [CUSTOM_SIMPLE_EDGE]               |
> |           PARTITION_ONLY_SHUFFLE [RS_18]           |
> |             Group By Operator [GBY_17] (rows=1/1 width=8) |
> |               Output:["_col0"],aggregations:["sum(_col0)"] |
> |               Select Operator [SEL_15] (rows=48400/1 width=5) |
> |                 Output:["_col0"]                   |
> |                 Map Join Operator [MAPJOIN_31] (rows=48400/1 width=5) |
> |                   
> Conds:MAPJOIN_30._col1=RS_13._col0(Inner),HybridGraceHashJoin:true,Output:["_col2","_col4","_col6"]
>  |
> |                 <-Map 4 [BROADCAST_EDGE]           |
> |                   BROADCAST [RS_13]                |
> |                     PartitionCols:_col0            |
> |                     Select Operator [SEL_8] (rows=3/1 width=3) |
> |                       Output:["_col0","_col1"]     |
> |                       Filter Operator [FIL_29] (rows=3/1 width=3) |
> |                         predicate:((w > 9) and id_uw is not null) |
> |                         TableScan [TS_6] (rows=10/10 width=3) |
> |                           
> default@tw,tw,Tbl:COMPLETE,Col:NONE,Output:["id_uw","w"] |
> |                 <-Map Join Operator [MAPJOIN_30] (rows=44000/1 width=5) |
> |                     
> Conds:SEL_2._col0=RS_10._col0(Inner),HybridGraceHashJoin:true,Output:["_col1","_col2","_col4"]
>  |
> |                   <-Map 3 [BROADCAST_EDGE]         |
> |                     BROADCAST [RS_10]              |
> |                       PartitionCols:_col0          |
> |                       Select Operator [SEL_5] (rows=1632/1 width=3) |
> |                         Output:["_col0","_col1"]   |
> |                         Filter Operator [FIL_28] (rows=1632/1 width=3) |
> |                           predicate:((v > 3) and id_uv is not null) |
> |                           TableScan [TS_3] (rows=4898/4 width=3) |
> |                             
> default@tv,tv,Tbl:COMPLETE,Col:NONE,Output:["id_uv","v"] |
> |                   <-Select Operator [SEL_2] (rows=40000/6 width=5) |
> |                       Output:["_col0","_col1","_col2"] |
> |                       Filter Operator [FIL_27] (rows=40000/6 width=5) |
> |                         predicate:((u > 1) and id_uv is not null and id_uw 
> is not null) |
> |                         TableScan [TS_0] (rows=120000/7 width=5) |
> |                           
> default@tu,tu,Tbl:COMPLETE,Col:NONE,Output:["id_uv","id_uw","u"] |
> {code}



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

[jira] [Updated] (HIVE-18454) Incorrect rownum estimation in joins

Reply via email to