[ https://issues.apache.org/jira/browse/HIVE-28513 ]
zhaolong deleted comment on HIVE-28513: --------------------------------- was (Author: fsilent): on ( D.column4_4 = 1 ) OR ( T1.column10_2 = '0') should be on ( D.column4_4 = ‘1’) OR ( T1.column10_2 = '0') because D.column4_4 is a string type > when cbo is false,join with 'or' condition cause wrong result > ------------------------------------------------------------- > > Key: HIVE-28513 > URL: https://issues.apache.org/jira/browse/HIVE-28513 > Project: Hive > Issue Type: Bug > Components: CBO > Affects Versions: 4.0.0-beta-1, 4.0.1 > Reporter: lotan > Priority: Major > Attachments: Reproduce SQL statements..txt, > image-2024-09-10-09-56-28-750.png, image-2024-09-10-09-57-42-902.png > > > In the 4 table join scenario, when join on is set to "or" for filtering and > is a fixed value condition, data is incorrectly listed. > The problem is reproduced as follows: > CREATE TABLE `table1`( > `column1` varchar(90), > `column2` varchar(11) , > > `column3` varchar(18), > `column4` varchar(3) , > > `column5` varchar(12) , > `column6` varchar(29) , > > `column7` varchar(8) , > > `column8` varchar(11) , > > `column9` varchar(5), > `column10` varchar(2) ) > PARTITIONED BY ( > > `pt_dt` varchar(10) ) > ROW FORMAT SERDE > > 'org.apache.hadoop.hive.ql.io.orc.OrcSerde' > > WITH SERDEPROPERTIES ( > > 'field.delim'='27', > > 'serialization.format'='27') > > STORED AS INPUTFORMAT > > 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' > > OUTPUTFORMAT > > 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat' > ; > > > CREATE TABLE `table2`( > `column2_1` varchar(9) , > `column2_2` varchar(30) , > `column2_3` varchar(30), > `column2_4` varchar(90) , > > `column2_5` varchar(15)) > PARTITIONED BY ( > > `pt_dt` varchar(10)) > > ROW FORMAT SERDE > > 'org.apache.hadoop.hive.ql.io.orc.OrcSerde' > > WITH SERDEPROPERTIES ( > > 'field.delim'='27', > > 'serialization.format'='27') > > STORED AS INPUTFORMAT > > 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' > > OUTPUTFORMAT > > 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat' > ; > > > CREATE TABLE `table3`( > `column3_1` varchar(30) , > `column3_2` varchar(30) , > `column3_3` varchar(30) , > `column3_4` varchar(30), > `column3_5` varchar(30)) > > ROW FORMAT SERDE > > 'org.apache.hadoop.hive.ql.io.orc.OrcSerde' > > WITH SERDEPROPERTIES ( > > 'field.delim'='27', > > 'serialization.format'='27') > > STORED AS INPUTFORMAT > > 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' > > OUTPUTFORMAT > > 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat' ; > > > CREATE TABLE `table4`( > > `column4_2` string , > `column4_3` string , > `column4_1` string , > `column4_4` string ) > > PARTITIONED BY ( > `pt_dt` string) > ROW FORMAT SERDE > 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' > WITH SERDEPROPERTIES ( > 'field.delim'='27', > 'serialization.format'='27') > STORED AS INPUTFORMAT > 'org.apache.hadoop.mapred.TextInputFormat' > OUTPUTFORMAT > 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ; > Insert Reproduction Data: > insert into table1 partition (pt_dt='2023-11-30') > (column6,column3,column7,column2,column5,column10,column1,column8,column9,column4) > values > (16160200980000057,0140010000296,00001,0000001,'2022-11-30',1,1,1234567,12345,1),(27120403980000164,0140010000296,00002,0000001,'2022-11-30',1,1,1234567,12345,1),(26040204980001179,0140010000296,00001,0000001,'2022-11-30',1,1,1234567,12345,1),(20100213980049933,0140010000296,00001,0000001,'2022-11-30',1,1,1234567,12345,1),(16070091980002440,0140010000296,00001,0000001,'2022-11-30',1,1,1234567,12345,1),(16070017980004555,0030100000004,00001,0000001,'2022-11-30',1,1,1234567,12345,1),(06060822980002332,0030100000013,00001,0000001,'2022-11-30',1,1,1234567,12345,1); > insert into table2 partition (pt_dt='2023-11-30') > (column2_1,column2_4,column2_2,column2_3,column2_5) values > ('S6',1,016160020000000055,027120000600001061,0161600203),('S6',1,027120000600001061,016160020000000055,0271200006),('S6',1,026040000700001541,026040000700001541,0260400007),('S6',1,020100025900019726,016070001700001229,0201000259),('S6',1,016070009100000471,016070001700001229,0160700091),('S6',1,016070001700001229,016070009100000471,0160700296),('S6',1,006060000500001050,016070009100000471,0060600005); > Run the reproduction SQL statement: > SELECT > T2.column2_5 > ,T2.column2_2 > ,T6.column3_5 > ,T6.column3_2 > ,T6.column3_4 > ,T6.column3_3 > ,T1.column10 > ,T2.column2_4 > FROM (SELECT > column8 > ,column1 > ,column5 > ,column9 > ,column4 > ,PT_DT > ,column10 > ,column10 AS column10_2 > FROM table1 > WHERE PT_DT= '2023-11-30' > ) T1 > INNER JOIN (SELECT column2_4 > ,column2_3 > ,column2_2 > ,column2_5 > FROM table2 A > WHERE PT_DT = '2023-11-30' > )T2 > ON T1.column1 = T2.column2_4 > LEFT JOIN (SELECT column4_1,column4_2,column4_3,column4_4 > FROM table4 > WHERE PT_DT='2023-11-30' > ) D > on ( D.column4_4 = 1 ) OR ( T1.column10_2 = '0') > LEFT JOIN ( SELECT > F.column3_1 > ,F.column3_2 > ,F.column3_3 > ,F.column3_4 > ,F.column3_5 > FROM table3 F > ) T6 > ON T2.column2_5 = T6.column3_1 > WHERE D.column4_3 IS NULL ; > > Expected data: > !image-2024-09-10-09-56-28-750.png|width=981,height=219! > Abnormal data: > !image-2024-09-10-09-57-42-902.png! > > -- This message was sent by Atlassian Jira (v8.20.10#820010)