[ 
https://issues.apache.org/jira/browse/HIVE-23304?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

gumengchao updated HIVE-23304:
------------------------------
    Description: 
The simplified question is like:

         sql_1: select * from (subquery)

         sql_2: create  temporary  table with subquery

                 and  then  select * from  temporary table

the result of sql_1 and sql_2 is diffrent.

 

The specific questions are as follows:

execute sql

 
{noformat}
SELECT AVG(a.rate), a.movieid, MAX(b.userid)
FROM t_rating a
        INNER JOIN (
                SELECT a.movieid AS movieid, a.userid AS userid, a.rate
                FROM t_rating a
                        JOIN (
                                SELECT COUNT(*) AS rate_times, a.userid AS 
userid
                                FROM t_rating a
                                        JOIN (
                                                SELECT *
                                                FROM t_user
                                                WHERE sex = 'F'
                                        ) b
                                        ON a.userid = b.userid
                                GROUP BY a.userid
                                ORDER BY rate_times DESC
                                LIMIT 0, 1
                        ) b
                        ON a.userid = b.userid
                ORDER BY a.rate DESC
                LIMIT 0, 10
        ) b
        ON a.movieid = b.movieid
GROUP BY a.movieid{noformat}
The result is below.But this is not the result I want 
{noformat}
4.063136456211812       162     1150
4.476190476190476       904     1150
4.249370277078086       951     1150
4.14167916041979        1230    1150
3.6464646464646466      1966    1150
4.163043478260869       2330    1150
3.7039473684210527      3163    1150
4.387453874538745       3307    1150
4.047363717605005       3671    1150
3.8265682656826567      3675    1150{noformat}
So I divided sql into two steps

fisrt: create temporary table with subquery

 
{noformat}
create temporary table tmp as
SELECT a.movieid AS movieid, a.userid AS userid, a.rate
                FROM t_rating a
                        JOIN (
                                SELECT COUNT(*) AS rate_times, a.userid AS 
userid
                                FROM t_rating a
                                        JOIN (
                                                SELECT *
                                                FROM t_user
                                                WHERE sex = 'F'
                                        ) b
                                        ON a.userid = b.userid
                                GROUP BY a.userid
                                ORDER BY rate_times DESC
                                LIMIT 0, 1
                        ) b
                        ON a.userid = b.userid
                ORDER BY a.rate DESC
                LIMIT 0, 10{noformat}
second: use temporary table replace subquery

 
{noformat}
SELECT AVG(a.rate), a.movieid, MAX(b.userid)
FROM t_rating a INNER JOIN tmp b
        ON a.movieid = b.movieid
GROUP BY a.movieid{noformat}
the result 
{noformat}
4.52054794520548        745     1150
4.4498902706656915      750     1150
4.476190476190476       904     1150
4.280748663101604       905     1150
3.7314890154597236      1094    1150
4.188888888888889       1236    1150
4.21043771043771        1256    1150
3.747422680412371       1279    1150
4.0739348370927315      2064    1150
4.125390450691656       2997    1150{noformat}
Why the results of two executions are different?

 

 

 

 

 

 

  was:
The simplified question is like

         sql_1: select * from (subquery)

         sql_2: create  temporary  table with subquery

                 and  then  select * from  temporary table

the result of sql_1 and sql_2 is dirffrent.

 

The specific questions are as follows:

execute sql

 
{noformat}
SELECT AVG(a.rate), a.movieid, MAX(b.userid)
FROM t_rating a
        INNER JOIN (
                SELECT a.movieid AS movieid, a.userid AS userid, a.rate
                FROM t_rating a
                        JOIN (
                                SELECT COUNT(*) AS rate_times, a.userid AS 
userid
                                FROM t_rating a
                                        JOIN (
                                                SELECT *
                                                FROM t_user
                                                WHERE sex = 'F'
                                        ) b
                                        ON a.userid = b.userid
                                GROUP BY a.userid
                                ORDER BY rate_times DESC
                                LIMIT 0, 1
                        ) b
                        ON a.userid = b.userid
                ORDER BY a.rate DESC
                LIMIT 0, 10
        ) b
        ON a.movieid = b.movieid
GROUP BY a.movieid{noformat}
The result is below.But this is not the result I want 
{noformat}
4.063136456211812       162     1150
4.476190476190476       904     1150
4.249370277078086       951     1150
4.14167916041979        1230    1150
3.6464646464646466      1966    1150
4.163043478260869       2330    1150
3.7039473684210527      3163    1150
4.387453874538745       3307    1150
4.047363717605005       3671    1150
3.8265682656826567      3675    1150{noformat}
So I divided sql into two steps

fisrt: create temporary table with subquery

 
{noformat}
create temporary table tmp as
SELECT a.movieid AS movieid, a.userid AS userid, a.rate
                FROM t_rating a
                        JOIN (
                                SELECT COUNT(*) AS rate_times, a.userid AS 
userid
                                FROM t_rating a
                                        JOIN (
                                                SELECT *
                                                FROM t_user
                                                WHERE sex = 'F'
                                        ) b
                                        ON a.userid = b.userid
                                GROUP BY a.userid
                                ORDER BY rate_times DESC
                                LIMIT 0, 1
                        ) b
                        ON a.userid = b.userid
                ORDER BY a.rate DESC
                LIMIT 0, 10{noformat}
second: use temporary table replace subquery

 
{noformat}
SELECT AVG(a.rate), a.movieid, MAX(b.userid)
FROM t_rating a INNER JOIN tmp b
        ON a.movieid = b.movieid
GROUP BY a.movieid{noformat}
the result 
{noformat}
4.52054794520548        745     1150
4.4498902706656915      750     1150
4.476190476190476       904     1150
4.280748663101604       905     1150
3.7314890154597236      1094    1150
4.188888888888889       1236    1150
4.21043771043771        1256    1150
3.747422680412371       1279    1150
4.0739348370927315      2064    1150
4.125390450691656       2997    1150{noformat}
Why the results of two executions are different?

 

 

 

 

 

 


> Two SQL execution results with the same semantics are not the same
> ------------------------------------------------------------------
>
>                 Key: HIVE-23304
>                 URL: https://issues.apache.org/jira/browse/HIVE-23304
>             Project: Hive
>          Issue Type: Bug
>          Components: Hive, SQL
>    Affects Versions: 2.3.7
>         Environment: hadoop version 2.7.7
> hive version 2.3.7
> hive.execution.engine=mr
>  
>            Reporter: gumengchao
>            Priority: Critical
>
> The simplified question is like:
>          sql_1: select * from (subquery)
>          sql_2: create  temporary  table with subquery
>                  and  then  select * from  temporary table
> the result of sql_1 and sql_2 is diffrent.
>  
> The specific questions are as follows:
> execute sql
>  
> {noformat}
> SELECT AVG(a.rate), a.movieid, MAX(b.userid)
> FROM t_rating a
>       INNER JOIN (
>               SELECT a.movieid AS movieid, a.userid AS userid, a.rate
>               FROM t_rating a
>                       JOIN (
>                               SELECT COUNT(*) AS rate_times, a.userid AS 
> userid
>                               FROM t_rating a
>                                       JOIN (
>                                               SELECT *
>                                               FROM t_user
>                                               WHERE sex = 'F'
>                                       ) b
>                                       ON a.userid = b.userid
>                               GROUP BY a.userid
>                               ORDER BY rate_times DESC
>                               LIMIT 0, 1
>                       ) b
>                       ON a.userid = b.userid
>               ORDER BY a.rate DESC
>               LIMIT 0, 10
>       ) b
>       ON a.movieid = b.movieid
> GROUP BY a.movieid{noformat}
> The result is below.But this is not the result I want 
> {noformat}
> 4.063136456211812     162     1150
> 4.476190476190476     904     1150
> 4.249370277078086     951     1150
> 4.14167916041979      1230    1150
> 3.6464646464646466    1966    1150
> 4.163043478260869     2330    1150
> 3.7039473684210527    3163    1150
> 4.387453874538745     3307    1150
> 4.047363717605005     3671    1150
> 3.8265682656826567    3675    1150{noformat}
> So I divided sql into two steps
> fisrt: create temporary table with subquery
>  
> {noformat}
> create temporary table tmp as
> SELECT a.movieid AS movieid, a.userid AS userid, a.rate
>               FROM t_rating a
>                       JOIN (
>                               SELECT COUNT(*) AS rate_times, a.userid AS 
> userid
>                               FROM t_rating a
>                                       JOIN (
>                                               SELECT *
>                                               FROM t_user
>                                               WHERE sex = 'F'
>                                       ) b
>                                       ON a.userid = b.userid
>                               GROUP BY a.userid
>                               ORDER BY rate_times DESC
>                               LIMIT 0, 1
>                       ) b
>                       ON a.userid = b.userid
>               ORDER BY a.rate DESC
>               LIMIT 0, 10{noformat}
> second: use temporary table replace subquery
>  
> {noformat}
> SELECT AVG(a.rate), a.movieid, MAX(b.userid)
> FROM t_rating a INNER JOIN tmp b
>       ON a.movieid = b.movieid
> GROUP BY a.movieid{noformat}
> the result 
> {noformat}
> 4.52054794520548      745     1150
> 4.4498902706656915    750     1150
> 4.476190476190476     904     1150
> 4.280748663101604     905     1150
> 3.7314890154597236    1094    1150
> 4.188888888888889     1236    1150
> 4.21043771043771      1256    1150
> 3.747422680412371     1279    1150
> 4.0739348370927315    2064    1150
> 4.125390450691656     2997    1150{noformat}
> Why the results of two executions are different?
>  
>  
>  
>  
>  
>  



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

Reply via email to