This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 0dfed88861c03ecf466f8762ae1f03756518da88
Author: stiga-huang <huangquanl...@gmail.com>
AuthorDate: Wed Sep 3 20:11:07 2025 +0800

    IMPALA-14383: Fix crash in casting timestamp string with timezone offsets 
to DATE
    
    Timestamp string can have a timezone offset at its end, e.g.
    "2025-08-31 06:23:24.9392129 +08:00" has "+08:00" as the timezone
    offset. When casting strings to DATE type, we try to find the default
    format by matching the separators, i.e. '-', ':', ' ', etc in
    SimpleDateFormatTokenizer::GetDefaultFormatContext(). The one that
    matches this example is DEFAULT_DATE_TIME_CTX[] which represents the
    default date/time context for "yyyy-MM-dd HH:mm:ss.SSSSSSSSS". The
    fractional part at the end can have length from 0 to 9, matching
    DEFAULT_DATE_TIME_CTX[0] to DEFAULT_DATE_TIME_CTX[9] respectively.
    When calculating which item in DEFAULT_DATE_TIME_CTX is the matched
    format, we use the index as str_len - 20 where 20 is the length of
    "yyyy-MM-dd HH:mm:ss.". This causes the index overflow if the string
    length is larger than 29. A wild pointer is returned from
    GetDefaultFormatContext(), leading crash in following codes.
    
    This patch fixes the issue by adding a check to make sure the string
    length is smaller than the max length of the default date time format,
    i.e. DEFAULT_DATE_TIME_FMT_LEN (29). Longer strings will use
    DateTimeFormatContext created lazily.
    
    Note that this just fixes the crash. Converting timestamp strings with
    timezone offset at the end to DATE type is not supported yet and will be
    followed up in IMPALA-14391.
    
    Tests
     - Added e2e tests on constant expressions. Also added a test table with
       such timestamp strings and added test on it.
    
    Change-Id: I36d73f4a71432588732b2284ac66552f75628a62
    Reviewed-on: http://gerrit.cloudera.org:8080/23371
    Reviewed-by: Daniel Becker <daniel.bec...@cloudera.com>
    Tested-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com>
---
 .../runtime/datetime-simple-date-format-parser.cc  |  1 +
 .../functional/functional_schema_template.sql      | 13 ++++++++++
 .../datasets/functional/schema_constraints.csv     |  4 +++
 .../functional-query/queries/QueryTest/date.test   | 29 ++++++++++++++++++++++
 4 files changed, 47 insertions(+)

diff --git a/be/src/runtime/datetime-simple-date-format-parser.cc 
b/be/src/runtime/datetime-simple-date-format-parser.cc
index 5a937540e..4626dd71d 100644
--- a/be/src/runtime/datetime-simple-date-format-parser.cc
+++ b/be/src/runtime/datetime-simple-date-format-parser.cc
@@ -401,6 +401,7 @@ const DateTimeFormatContext* 
SimpleDateFormatTokenizer::GetDefaultFormatContext(
           // We will need to work out which default context to use that 
corresponds to
           // the fractional length in the string.
           if (LIKELY(len > DEFAULT_SHORT_DATE_TIME_FMT_LEN)
+              && LIKELY(len < DEFAULT_DATE_TIME_FMT_LEN)
               && LIKELY(str[19] == '.') && LIKELY(str[13] == ':')) {
             switch (str[10]) {
               case ' ': {
diff --git a/testdata/datasets/functional/functional_schema_template.sql 
b/testdata/datasets/functional/functional_schema_template.sql
index b15d698d9..99c26a602 100644
--- a/testdata/datasets/functional/functional_schema_template.sql
+++ b/testdata/datasets/functional/functional_schema_template.sql
@@ -4787,3 +4787,16 @@ CREATE TABLE {db_name}{db_suffix}.{table_name} (
 )
 PARTITION BY HASH (tkey) PARTITIONS 3 STORED AS KUDU;
 ====
+---- DATASET
+functional
+---- BASE_TABLE_NAME
+timestamp_with_tz_str
+---- COLUMNS
+id INT
+timestamp_str STRING
+---- ROW_FORMAT
+delimited fields terminated by ','
+---- DEPENDENT_LOAD
+insert overwrite table {db_name}{db_suffix}.{table_name}
+select id, concat(cast(timestamp_col as string format 'YYYY-MM-DD 
HH24:MM:SS'), ' +08:00') from functional.alltypestiny;
+====
diff --git a/testdata/datasets/functional/schema_constraints.csv 
b/testdata/datasets/functional/schema_constraints.csv
index a3d279df3..c5229ecac 100644
--- a/testdata/datasets/functional/schema_constraints.csv
+++ b/testdata/datasets/functional/schema_constraints.csv
@@ -436,3 +436,7 @@ table_name:timestamp_primary_key, constraint:restrict_to, 
table_format:kudu/none
 # Table where all column values are unique but have some NULLs.
 # Depends on functional.alltypessmall.
 table_name:unique_with_nulls, constraint:restrict_to, 
table_format:parquet/none/none
+
+# The table is used to test casting string values to date type. Don't need to 
test on
+# all formats.
+table_name:timestamp_with_tz_str, constraint:restrict_to, 
table_format:parquet/none/none
diff --git a/testdata/workloads/functional-query/queries/QueryTest/date.test 
b/testdata/workloads/functional-query/queries/QueryTest/date.test
index 6fcdfc242..afec7221d 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/date.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/date.test
@@ -884,3 +884,32 @@ select cast(cast(NULL as date) as string format 
"YYYY-MM-DD");
 ---- TYPES
 STRING
 ====
+---- QUERY
+select cast('2025-08-31 06:23:24.1234567890123456789' as DATE);
+---- RESULTS
+2025-08-31
+---- TYPES
+DATE
+====
+---- QUERY
+# TODO(IMPALA-14391): support timezone offset in the string
+select cast('2025-08-31 06:23:24.9392129 +00:00' as DATE)
+---- CATCH
+UDF ERROR: String to Date parse failed. Invalid string val: '2025-08-31 
06:23:24.9392129 +00:00'
+====
+---- QUERY
+select cast('2025-08-31 06:23:24.123456789abcd' as DATE)
+---- CATCH
+UDF ERROR: String to Date parse failed. Invalid string val: '2025-08-31 
06:23:24.123456789abcd'
+====
+---- QUERY
+select cast('aaaa-aa-aa aa:aa:aa.123456789abcd' as DATE)
+---- CATCH
+UDF ERROR: String to Date parse failed. Invalid string val: 'aaaa-aa-aa 
aa:aa:aa.123456789abcd'
+====
+---- QUERY
+# TODO(IMPALA-14391): support timezone offset in the string
+select cast(timestamp_str as date) from 
functional_parquet.timestamp_with_tz_str where id=0;
+---- CATCH
+UDF ERROR: String to Date parse failed. Invalid string val: '2009-01-01 
00:01:00 +08:00'
+====

Reply via email to