[
https://issues.apache.org/jira/browse/IMPALA-13262?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Fang-Yu Rao updated IMPALA-13262:
---------------------------------
Description:
We found that in some scenario Apache Impala could incorrectly push predicates
to scan nodes, which in turn produces the wrong result. The following is a
concrete example to reproduce the issue.
{code:sql}
create database impala_13262;
use impala_13262;
create table department ( dept_no integer, dept_rank integer, start_date
timestamp,end_date timestamp);
insert into department values(1,1,'2024-01-01','2024-01-02');
insert into department values(1,2,'2024-01-02','2024-01-03');
insert into department values(1,3,'2024-01-03','2024-01-03');
create table employee (employee_no integer, depart_no integer);
insert into employee values (1,1);
// The following should return 0 row. However Apache Impala produces one row.
select * from employee t1
inner join (
select * from
(
select dept_no,dept_rank,start_date,end_date
,row_number() over(partition by dept_no order by dept_rank) rn
from department
) t2
where rn=1
) t2
on t1.depart_no=t2.dept_no
where t2.start_date=t2.end_date
set explain_level=2;
// In the output of the EXPLAIN statement, we found that the predicate
"start_data = end_date" was pushed
// down to the scan node, which is wrong.
| 01:SCAN HDFS [impala_13262.department, RANDOM]
|
| HDFS partitions=1/1 files=3 size=132B
|
| predicates: start_date = end_date
|
| stored statistics:
|
| table: rows=unavailable size=unavailable
|
| columns: unavailable
|
| extrapolated-rows=disabled max-scan-range-rows=unavailable
|
| mem-estimate=32.00MB mem-reservation=8.00KB thread-reservation=1
|
| tuple-ids=1 row-size=40B cardinality=1
|
| in pipelines: 01(GETNEXT)
|
+-------------------------------------------------------------------------------------------------------+
{code}
was:
We found that in some scenario Apache Impala could incorrectly push predicates
to scan nodes, which in turn produces the wrong result. The following is a
concrete example to reproduce the issue.
{code:java}
create table department ( dept_no integer, dept_rank integer, start_date
timestamp,end_date timestamp);
insert into department values(1,1,'2024-01-01','2024-01-02');
insert into department values(1,2,'2024-01-02','2024-01-03');
insert into department values(1,3,'2024-01-03','2024-01-03');
create table employee (employee_no integer, depart_no integer);
insert into employee values (1,1);
// The following should return 0 row. However Apache Impala produces one row.
select * from employee t1
inner join (
select * from
(
select dept_no,dept_rank,start_date,end_date
,row_number() over(partition by dept_no order by dept_rank) rn
from department
) t2
where rn=1
) t2
on t1.depart_no=t2.dept_no
where t2.start_date=t2.end_date
//
| 01:SCAN HDFS [impala_13262.department, RANDOM]
|
| HDFS partitions=1/1 files=3 size=132B
|
| predicates: start_date = end_date
|
| stored statistics:
|
| table: rows=unavailable size=unavailable
|
| columns: unavailable
|
| extrapolated-rows=disabled max-scan-range-rows=unavailable
|
| mem-estimate=32.00MB mem-reservation=8.00KB thread-reservation=1
|
| tuple-ids=1 row-size=40B cardinality=1
|
| in pipelines: 01(GETNEXT)
|
+-------------------------------------------------------------------------------------------------------+
{code}
> Predicate pushdown causes incorrect results in join condition
> -------------------------------------------------------------
>
> Key: IMPALA-13262
> URL: https://issues.apache.org/jira/browse/IMPALA-13262
> Project: IMPALA
> Issue Type: Bug
> Reporter: Fang-Yu Rao
> Assignee: Fang-Yu Rao
> Priority: Major
>
> We found that in some scenario Apache Impala could incorrectly push
> predicates to scan nodes, which in turn produces the wrong result. The
> following is a concrete example to reproduce the issue.
> {code:sql}
> create database impala_13262;
> use impala_13262;
> create table department ( dept_no integer, dept_rank integer, start_date
> timestamp,end_date timestamp);
> insert into department values(1,1,'2024-01-01','2024-01-02');
> insert into department values(1,2,'2024-01-02','2024-01-03');
> insert into department values(1,3,'2024-01-03','2024-01-03');
> create table employee (employee_no integer, depart_no integer);
> insert into employee values (1,1);
> // The following should return 0 row. However Apache Impala produces one row.
> select * from employee t1
> inner join (
> select * from
> (
> select dept_no,dept_rank,start_date,end_date
> ,row_number() over(partition by dept_no order by dept_rank) rn
> from department
> ) t2
> where rn=1
> ) t2
> on t1.depart_no=t2.dept_no
> where t2.start_date=t2.end_date
> set explain_level=2;
> // In the output of the EXPLAIN statement, we found that the predicate
> "start_data = end_date" was pushed
> // down to the scan node, which is wrong.
> | 01:SCAN HDFS [impala_13262.department, RANDOM]
> |
> | HDFS partitions=1/1 files=3 size=132B
> |
> | predicates: start_date = end_date
> |
> | stored statistics:
> |
> | table: rows=unavailable size=unavailable
> |
> | columns: unavailable
> |
> | extrapolated-rows=disabled max-scan-range-rows=unavailable
> |
> | mem-estimate=32.00MB mem-reservation=8.00KB thread-reservation=1
> |
> | tuple-ids=1 row-size=40B cardinality=1
> |
> | in pipelines: 01(GETNEXT)
> |
> +-------------------------------------------------------------------------------------------------------+
> {code}
--
This message was sent by Atlassian Jira
(v8.20.10#820010)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]