This is an automated email from the ASF dual-hosted git repository. stigahuang pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit 3573db68c83918126337c9267140b7fa36f153e4 Author: Csaba Ringhofer <[email protected]> AuthorDate: Thu Mar 2 14:37:17 2023 +0100 IMPALA-11960: Fix constant propagation from TIMESTAMP to DATE The constant propagation introduced in IMPALA-10064 handled conversion of < and > predicates from timestamps to dates incorrectly. Example: select * from functional.alltypes_date_partition where date_col = cast(timestamp_col as date) and timestamp_col > '2009-01-01 01:00:00' and timestamp_col < '2009-02-01 01:00:00'; Before this change query rewrites added the following predicates: date_col > DATE '2009-01-01' AND date_col < DATE '2009-02-01' This incorrectly rejected all timestamps on the days of the lower / upper bounds. The fix is to rewrite < and > to <= and >= in the date predicates. < could be kept if the upper bound is a constant with no time-of-day part, e.g. timestamp_col < "2009-01-01" could be rewritten to date_col < "2009-01-01", but this optimization is not added in this patch to make it simpler. Testing: - added planner + EE regression tests Change-Id: I1938bf5e91057b220daf8a1892940f674aac3d68 Reviewed-on: http://gerrit.cloudera.org:8080/19572 Reviewed-by: Impala Public Jenkins <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> --- .../impala/analysis/ConstantPredicateHandler.java | 9 ++++++ .../queries/PlannerTest/constant-propagation.test | 36 ++++++++++++++++++++-- .../QueryTest/range-constant-propagation.test | 21 +++++++++++++ 3 files changed, 64 insertions(+), 2 deletions(-) diff --git a/fe/src/main/java/org/apache/impala/analysis/ConstantPredicateHandler.java b/fe/src/main/java/org/apache/impala/analysis/ConstantPredicateHandler.java index 36ae91360..1cf5e3822 100644 --- a/fe/src/main/java/org/apache/impala/analysis/ConstantPredicateHandler.java +++ b/fe/src/main/java/org/apache/impala/analysis/ConstantPredicateHandler.java @@ -172,6 +172,15 @@ public class ConstantPredicateHandler { private Expr rewriteWithOp(BinaryPredicate pred, BinaryPredicate.Operator newOp, Analyzer analyzer) { + // Allow equality to avoid problems with truncating the time-of-day part of + // timestamps (IMPALA-11960). + // TODO: the original (stricter) op could be kept if the upper bound is a + // TimestampLiteral without time component, e.g ts_col < "2000-01-01" + if (newOp == BinaryPredicate.Operator.LT) { + newOp = BinaryPredicate.Operator.LE; + } else if (newOp == BinaryPredicate.Operator.GT) { + newOp = BinaryPredicate.Operator.GE; + } if (pred.getOp() == newOp) return pred; BinaryPredicate newPred = new BinaryPredicate(newOp, pred.getChild(0), pred.getChild(1)); diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/constant-propagation.test b/testdata/workloads/functional-planner/queries/PlannerTest/constant-propagation.test index 350cfbddf..3455e227c 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/constant-propagation.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/constant-propagation.test @@ -434,6 +434,38 @@ PLAN-ROOT SINK predicates: functional.alltypes_date_partition.timestamp_col <= TIMESTAMP '2009-02-01 00:00:00', functional.alltypes_date_partition.timestamp_col >= TIMESTAMP '2009-01-01 00:00:00', date_col = CAST(timestamp_col AS DATE) row-size=65B cardinality=30 ==== +# Same as above but using < > and timestamps with time-of-day parts. +# Regression test for IMPALA-11960. +with dp_view as +(select * from functional.alltypes_date_partition +where date_col = cast(timestamp_col as date)) +select * from dp_view +where timestamp_col > '2009-01-01 01:00:00' and timestamp_col < '2009-02-01 01:00:00'; +---- PLAN +PLAN-ROOT SINK +| +00:SCAN HDFS [functional.alltypes_date_partition] + partition predicates: date_col >= DATE '2009-01-01' AND date_col <= DATE '2009-02-01' + HDFS partitions=32/55 files=32 size=15.99KB + predicates: functional.alltypes_date_partition.timestamp_col < TIMESTAMP '2009-02-01 01:00:00', functional.alltypes_date_partition.timestamp_col > TIMESTAMP '2009-01-01 01:00:00', date_col = CAST(timestamp_col AS DATE) + row-size=65B cardinality=30 +==== +# Same as above but using >= instead of >. +# Regression test for IMPALA-11960. +with dp_view as +(select * from functional.alltypes_date_partition +where date_col = cast(timestamp_col as date)) +select * from dp_view +where timestamp_col >= '2009-01-01 01:00:00' and timestamp_col < '2009-02-01 01:00:00'; +---- PLAN +PLAN-ROOT SINK +| +00:SCAN HDFS [functional.alltypes_date_partition] + partition predicates: date_col >= DATE '2009-01-01' AND date_col <= DATE '2009-02-01' + HDFS partitions=32/55 files=32 size=15.99KB + predicates: functional.alltypes_date_partition.timestamp_col < TIMESTAMP '2009-02-01 01:00:00', functional.alltypes_date_partition.timestamp_col >= TIMESTAMP '2009-01-01 01:00:00', date_col = CAST(timestamp_col AS DATE) + row-size=65B cardinality=30 +==== # Mix of various predicates some of which are eligible for propagation with dp_view as (select * from functional.alltypes_date_partition @@ -521,8 +553,8 @@ and timestamp_col > from_timestamp('2009-02-01 00:02:00', 'yyyy-mm-dd'); PLAN-ROOT SINK | 00:SCAN HDFS [functional.alltypes_date_partition] - partition predicates: date_col > DATE '2009-02-01' - HDFS partitions=23/55 files=23 size=11.28KB + partition predicates: date_col >= DATE '2009-02-01' + HDFS partitions=24/55 files=24 size=11.82KB predicates: timestamp_col > TIMESTAMP '2009-02-01 00:00:00', date_col = CAST(timestamp_col AS DATE) row-size=65B cardinality=21 ==== diff --git a/testdata/workloads/functional-query/queries/QueryTest/range-constant-propagation.test b/testdata/workloads/functional-query/queries/QueryTest/range-constant-propagation.test index aa714147d..ba7b3b11d 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/range-constant-propagation.test +++ b/testdata/workloads/functional-query/queries/QueryTest/range-constant-propagation.test @@ -10,6 +10,27 @@ and timestamp_col between '2009-01-01' and '2009-02-01'; BIGINT, BIGINT ==== ---- QUERY +# Same query as above with time-of-day parts. Regression test for IMPALA-11960. +select count(*), sum(int_col) from alltypes_date_partition +where date_col = cast(timestamp_col as date) +and timestamp_col > '2009-01-01 01:00:00' and timestamp_col < '2009-02-01 01:00:00'; +---- RESULTS +155,620 +---- TYPES +BIGINT, BIGINT +==== +---- QUERY +# Same as above but using >= instead of > and using a lower bound that matches with +# a row. Regression test for IMPALA-11960. +select count(*), sum(int_col) from alltypes_date_partition +where date_col = cast(timestamp_col as date) +and timestamp_col >= '2009-01-01 00:08:00.280' and timestamp_col < '2009-02-01 01:00:00'; +---- RESULTS +156,628 +---- TYPES +BIGINT, BIGINT +==== +---- QUERY # Mix of various predicates some of which are eligible for propagation with dp_view as (select * from alltypes_date_partition
