Dhanesh Padmanabhan created HIVE-20525:
------------------------------------------

             Summary: Hive UNION ALL query gives wrong results with 
hive.optimize.skewjoin=true
                 Key: HIVE-20525
                 URL: https://issues.apache.org/jira/browse/HIVE-20525
             Project: Hive
          Issue Type: Bug
    Affects Versions: 2.1.1, 1.2.1
            Reporter: Dhanesh Padmanabhan


Following query gives 0 rows instead of the expected 3 rows. Tested on dataproc 
1.2.47-deb8 with the default hive version 2.1.1. The query throws an 
ArrayIndexOutOfBoundsException if hive.auto.convert.join=true.

On HDP2.6.4 sandbox, hive 1.2.1 version this gives 2 rows instead of 3
{code:java}

set hive.optimize.skewjoin=true;
set hive.auto.convert.join=false;

CREATE TABLE t_ins (user_id bigint, app_id string, store string, oem string ,dt 
string);
CREATE TABLE t_fa (user_id bigint, app_type string, build_name string, dt 
string);
CREATE TABLE t_la (user_id bigint, oem string, dt string);
CREATE TABLE t_tu (user_id bigint, new_user_id bigint, dt string);
CREATE TABLE t_duc (country_code string, user_id bigint);
CREATE TABLE t_app (name string, app_id string, dt string);

INSERT INTO t_ins VALUES (1234567, 'xyzuvw','store1','OEM1','20180814');
INSERT INTO t_app VALUES ('app1','xyzuvw','20180814');

INSERT INTO t_fa VALUES (1234567,'app1','store1','20180814');
INSERT INTO t_la VALUES (1234567,'OEM1','20180814');

INSERT INTO t_tu VALUES (1234567,1111111,'20180814');
INSERT INTO t_duc VALUES ('IN',1234567);


INSERT INTO t_fa VALUES (7654321,'app2','store2','20180814');
INSERT INTO t_la VALUES (7654321,'OEM2','20180814');

INSERT INTO t_tu VALUES (7654321,1111111,'20180814');
INSERT INTO t_duc VALUES ('SE',7654321);


CREATE TABLE `abs`(
`country_code` string, 
`app_type` string, 
`build_name` string, 
`device` string, 
`new_user_id` string, 
`user_type_1` smallint, 
`user_type_2` smallint)
PARTITIONED BY ( 
`dt` string)
ROW FORMAT SERDE 
'org.apache.hadoop.hive.ql.io.orc.OrcSerde' 
STORED AS INPUTFORMAT 
'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' 
OUTPUTFORMAT 
'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
;


INSERT OVERWRITE TABLE abs PARTITION (dt=20180814)

SELECT
duc.country_code country_code,
app.name app_type,
i.store build_name,
i.oem device,
tu.new_user_id new_user_id,
1 user_type_1,
0 user_type_2
FROM
t_ins i
JOIN t_app app ON app.app_id = i.app_id AND app.dt = 20180814
LEFT JOIN t_duc duc on i.user_id = duc.user_id
LEFT JOIN t_tu tu on tu.user_id = i.user_id and tu.dt = 20180814
WHERE
i.dt = 20180814
GROUP BY
duc.country_code,
app.name,
i.store,
i.oem,
tu.new_user_id

UNION ALL

SELECT
duc.country_code country_code,
fa.app_type app_type,
fa.build_name build_name,
la.oem device,
tu.new_user_id new_user_id,
0 user_type_1,
1 user_type_2
FROM
t_fa fa
LEFT JOIN t_duc duc on fa.user_id = duc.user_id
LEFT JOIN t_tu tu on tu.user_id = fa.user_id and tu.dt = 20180814
LEFT JOIN t_la la ON fa.user_id = la.user_id AND la.dt = 20180814
WHERE
fa.dt = 20180814
GROUP BY
duc.country_code,
fa.app_type,
fa.build_name,
la.oem,
tu.new_user_id
;

select * from abs where dt=20180814;

{code}



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to