Dhanesh Padmanabhan created HIVE-20525: ------------------------------------------
Summary: Hive UNION ALL query gives wrong results with hive.optimize.skewjoin=true Key: HIVE-20525 URL: https://issues.apache.org/jira/browse/HIVE-20525 Project: Hive Issue Type: Bug Affects Versions: 2.1.1, 1.2.1 Reporter: Dhanesh Padmanabhan Following query gives 0 rows instead of the expected 3 rows. Tested on dataproc 1.2.47-deb8 with the default hive version 2.1.1. The query throws an ArrayIndexOutOfBoundsException if hive.auto.convert.join=true. On HDP2.6.4 sandbox, hive 1.2.1 version this gives 2 rows instead of 3 {code:java} set hive.optimize.skewjoin=true; set hive.auto.convert.join=false; CREATE TABLE t_ins (user_id bigint, app_id string, store string, oem string ,dt string); CREATE TABLE t_fa (user_id bigint, app_type string, build_name string, dt string); CREATE TABLE t_la (user_id bigint, oem string, dt string); CREATE TABLE t_tu (user_id bigint, new_user_id bigint, dt string); CREATE TABLE t_duc (country_code string, user_id bigint); CREATE TABLE t_app (name string, app_id string, dt string); INSERT INTO t_ins VALUES (1234567, 'xyzuvw','store1','OEM1','20180814'); INSERT INTO t_app VALUES ('app1','xyzuvw','20180814'); INSERT INTO t_fa VALUES (1234567,'app1','store1','20180814'); INSERT INTO t_la VALUES (1234567,'OEM1','20180814'); INSERT INTO t_tu VALUES (1234567,1111111,'20180814'); INSERT INTO t_duc VALUES ('IN',1234567); INSERT INTO t_fa VALUES (7654321,'app2','store2','20180814'); INSERT INTO t_la VALUES (7654321,'OEM2','20180814'); INSERT INTO t_tu VALUES (7654321,1111111,'20180814'); INSERT INTO t_duc VALUES ('SE',7654321); CREATE TABLE `abs`( `country_code` string, `app_type` string, `build_name` string, `device` string, `new_user_id` string, `user_type_1` smallint, `user_type_2` smallint) PARTITIONED BY ( `dt` string) ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde' STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat' ; INSERT OVERWRITE TABLE abs PARTITION (dt=20180814) SELECT duc.country_code country_code, app.name app_type, i.store build_name, i.oem device, tu.new_user_id new_user_id, 1 user_type_1, 0 user_type_2 FROM t_ins i JOIN t_app app ON app.app_id = i.app_id AND app.dt = 20180814 LEFT JOIN t_duc duc on i.user_id = duc.user_id LEFT JOIN t_tu tu on tu.user_id = i.user_id and tu.dt = 20180814 WHERE i.dt = 20180814 GROUP BY duc.country_code, app.name, i.store, i.oem, tu.new_user_id UNION ALL SELECT duc.country_code country_code, fa.app_type app_type, fa.build_name build_name, la.oem device, tu.new_user_id new_user_id, 0 user_type_1, 1 user_type_2 FROM t_fa fa LEFT JOIN t_duc duc on fa.user_id = duc.user_id LEFT JOIN t_tu tu on tu.user_id = fa.user_id and tu.dt = 20180814 LEFT JOIN t_la la ON fa.user_id = la.user_id AND la.dt = 20180814 WHERE fa.dt = 20180814 GROUP BY duc.country_code, fa.app_type, fa.build_name, la.oem, tu.new_user_id ; select * from abs where dt=20180814; {code} -- This message was sent by Atlassian JIRA (v7.6.3#76005)