zengxl created HIVE-27655:
-----------------------------
Summary: When a table is joined more than 2 times, the join result
is null
Key: HIVE-27655
URL: https://issues.apache.org/jira/browse/HIVE-27655
Project: Hive
Issue Type: Bug
Affects Versions: 3.1.2
Reporter: zengxl
Assignee: Ayush Saxena
*1.set up parameters*
close mapjoin and set the number of reduce to more than 2
{code:java}
set hive.auto.convert.join=false;
set mapreduce.job.reduces=2; {code}
*2.Prepare data*
* {*}{color:#FF0000}Note{color}{*}: The table storage format does not affect
the test
{code:java}
create table test_parquet(acc_nbr string,user_area_1 string,user_area_2
string,user_area_3 string,user_area_4 string) partitioned by (p_date_no string)
stored as parquet;
create table join_test_parquet(ms_area_id string,area_name string) stored as
textfile;
insert into table test_parquet partition (p_date_no='20230821') values
('acc_nbr_1','1','2','3','4');
insert into table test_parquet partition (p_date_no='20230821') values
('acc_nbr_2','5','6','7','8');
insert into table test_parquet partition (p_date_no='20230821') values
('acc_nbr_3','9','10','11','12');
insert into table join_test_parquet values
('1','area_name_1'),('2','area_name_2'),('3','area_name_3'),('4','area_name_4');
insert into table join_test_parquet values
('5','area_name_5'),('6','area_name_6'),('7','area_name_7'),('8','area_name_8');
insert into table join_test_parquet values
('9','area_name_9'),('10','area_name_10'),('11','area_name_11'),('12','area_name_12');
{code}
{*}Test SQL {color:#FF0000}case1 {color}as follows{*}{*}{*}
{code:java}
{code}
*create table test_join_result
as
select a.acc_nbr,user_area_1,user_area_2,user_area_3,user_area_4,
b1.area_name as a1,
b2.area_name as a2,
b3.area_name as a3,
b4.area_name as a4
from test_parquet a
LEFT join join_test_parquet b1 on a.user_area_1 = b1.ms_area_id
LEFT join join_test_parquet b2 on a.user_area_2 = b2.ms_area_id
LEFT join join_test_parquet b3 on a.user_area_3 = b3.ms_area_id
LEFT join join_test_parquet b4 on a.user_area_4 = b4.ms_area_id
where a.p_date_no = '20230821';*
*{color:#FF0000}incorrect{color}* *reuslt:*
{code:java}
hive> select * from test_join_result ;
OK
acc_nbr_1 1 2 3 4 area_name_1 NULL NULL
NULL
acc_nbr_2 5 6 7 8 area_name_5 NULL NULL
area_name_8
acc_nbr_3 9 10 11 12 area_name_9 NULL NULL
area_name_12
Time taken: 0.435 seconds, Fetched: 3 row(s) {code}
*{color:#FF0000}correct{color}* {*}reuslt:{*}{*}{*}
{code:java}
hive> select * from test_join_result ;
OK
acc_nbr_3 9 10 11 12 area_name_9 area_name_10
area_name_11 area_name_12
acc_nbr_1 1 2 3 4 area_name_1 area_name_2
area_name_3 area_name_4
acc_nbr_2 5 6 7 8 area_name_5 area_name_6
area_name_7 area_name_8
Time taken: 0.276 seconds, Fetched: 3 row(s){code}
{*}{*}{*}Test SQL {color:#FF0000}case2{color} as follows{*}{*}{*}
{code:java}
create table test_join_result
as
select a.acc_nbr,user_area_1,user_area_2,user_area_3,user_area_4,
b1.area_name as a1,
b2.area_name as a2
--b3.area_name as a3,
--b4.area_name as a4
from test_parquet a
LEFT join join_test_parquet b1 on a.user_area_1 = b1.ms_area_id
LEFT join join_test_parquet b2 on a.user_area_2 = b2.ms_area_id
--LEFT join join_test_parquet b3 on a.user_area_3 = b3.ms_area_id
--LEFT join join_test_parquet b4 on a.user_area_4 = b4.ms_area_id
where a.p_date_no = '20230821'; {code}
*{color:#FF0000}incorrect {color}result*
{code:java}
hive> select * from test_join_result;
OK
acc_nbr_1 1 2 3 4 area_name_1 NULL
acc_nbr_2 5 6 7 8 area_name_5 NULL
acc_nbr_3 9 10 11 12 area_name_9 NULL {code}
*correct reuslt*
{code:java}
hive> select * from test_join_result ;
OK
acc_nbr_3 9 10 11 12 area_name_9 area_name_10
acc_nbr_1 1 2 3 4 area_name_1 area_name_2
acc_nbr_2 5 6 7 8 area_name_5 area_name_6
{code}
--
This message was sent by Atlassian Jira
(v8.20.10#820010)