This is an automated email from the ASF dual-hosted git repository. stigahuang pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit 0dfed88861c03ecf466f8762ae1f03756518da88 Author: stiga-huang <huangquanl...@gmail.com> AuthorDate: Wed Sep 3 20:11:07 2025 +0800 IMPALA-14383: Fix crash in casting timestamp string with timezone offsets to DATE Timestamp string can have a timezone offset at its end, e.g. "2025-08-31 06:23:24.9392129 +08:00" has "+08:00" as the timezone offset. When casting strings to DATE type, we try to find the default format by matching the separators, i.e. '-', ':', ' ', etc in SimpleDateFormatTokenizer::GetDefaultFormatContext(). The one that matches this example is DEFAULT_DATE_TIME_CTX[] which represents the default date/time context for "yyyy-MM-dd HH:mm:ss.SSSSSSSSS". The fractional part at the end can have length from 0 to 9, matching DEFAULT_DATE_TIME_CTX[0] to DEFAULT_DATE_TIME_CTX[9] respectively. When calculating which item in DEFAULT_DATE_TIME_CTX is the matched format, we use the index as str_len - 20 where 20 is the length of "yyyy-MM-dd HH:mm:ss.". This causes the index overflow if the string length is larger than 29. A wild pointer is returned from GetDefaultFormatContext(), leading crash in following codes. This patch fixes the issue by adding a check to make sure the string length is smaller than the max length of the default date time format, i.e. DEFAULT_DATE_TIME_FMT_LEN (29). Longer strings will use DateTimeFormatContext created lazily. Note that this just fixes the crash. Converting timestamp strings with timezone offset at the end to DATE type is not supported yet and will be followed up in IMPALA-14391. Tests - Added e2e tests on constant expressions. Also added a test table with such timestamp strings and added test on it. Change-Id: I36d73f4a71432588732b2284ac66552f75628a62 Reviewed-on: http://gerrit.cloudera.org:8080/23371 Reviewed-by: Daniel Becker <daniel.bec...@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com> --- .../runtime/datetime-simple-date-format-parser.cc | 1 + .../functional/functional_schema_template.sql | 13 ++++++++++ .../datasets/functional/schema_constraints.csv | 4 +++ .../functional-query/queries/QueryTest/date.test | 29 ++++++++++++++++++++++ 4 files changed, 47 insertions(+) diff --git a/be/src/runtime/datetime-simple-date-format-parser.cc b/be/src/runtime/datetime-simple-date-format-parser.cc index 5a937540e..4626dd71d 100644 --- a/be/src/runtime/datetime-simple-date-format-parser.cc +++ b/be/src/runtime/datetime-simple-date-format-parser.cc @@ -401,6 +401,7 @@ const DateTimeFormatContext* SimpleDateFormatTokenizer::GetDefaultFormatContext( // We will need to work out which default context to use that corresponds to // the fractional length in the string. if (LIKELY(len > DEFAULT_SHORT_DATE_TIME_FMT_LEN) + && LIKELY(len < DEFAULT_DATE_TIME_FMT_LEN) && LIKELY(str[19] == '.') && LIKELY(str[13] == ':')) { switch (str[10]) { case ' ': { diff --git a/testdata/datasets/functional/functional_schema_template.sql b/testdata/datasets/functional/functional_schema_template.sql index b15d698d9..99c26a602 100644 --- a/testdata/datasets/functional/functional_schema_template.sql +++ b/testdata/datasets/functional/functional_schema_template.sql @@ -4787,3 +4787,16 @@ CREATE TABLE {db_name}{db_suffix}.{table_name} ( ) PARTITION BY HASH (tkey) PARTITIONS 3 STORED AS KUDU; ==== +---- DATASET +functional +---- BASE_TABLE_NAME +timestamp_with_tz_str +---- COLUMNS +id INT +timestamp_str STRING +---- ROW_FORMAT +delimited fields terminated by ',' +---- DEPENDENT_LOAD +insert overwrite table {db_name}{db_suffix}.{table_name} +select id, concat(cast(timestamp_col as string format 'YYYY-MM-DD HH24:MM:SS'), ' +08:00') from functional.alltypestiny; +==== diff --git a/testdata/datasets/functional/schema_constraints.csv b/testdata/datasets/functional/schema_constraints.csv index a3d279df3..c5229ecac 100644 --- a/testdata/datasets/functional/schema_constraints.csv +++ b/testdata/datasets/functional/schema_constraints.csv @@ -436,3 +436,7 @@ table_name:timestamp_primary_key, constraint:restrict_to, table_format:kudu/none # Table where all column values are unique but have some NULLs. # Depends on functional.alltypessmall. table_name:unique_with_nulls, constraint:restrict_to, table_format:parquet/none/none + +# The table is used to test casting string values to date type. Don't need to test on +# all formats. +table_name:timestamp_with_tz_str, constraint:restrict_to, table_format:parquet/none/none diff --git a/testdata/workloads/functional-query/queries/QueryTest/date.test b/testdata/workloads/functional-query/queries/QueryTest/date.test index 6fcdfc242..afec7221d 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/date.test +++ b/testdata/workloads/functional-query/queries/QueryTest/date.test @@ -884,3 +884,32 @@ select cast(cast(NULL as date) as string format "YYYY-MM-DD"); ---- TYPES STRING ==== +---- QUERY +select cast('2025-08-31 06:23:24.1234567890123456789' as DATE); +---- RESULTS +2025-08-31 +---- TYPES +DATE +==== +---- QUERY +# TODO(IMPALA-14391): support timezone offset in the string +select cast('2025-08-31 06:23:24.9392129 +00:00' as DATE) +---- CATCH +UDF ERROR: String to Date parse failed. Invalid string val: '2025-08-31 06:23:24.9392129 +00:00' +==== +---- QUERY +select cast('2025-08-31 06:23:24.123456789abcd' as DATE) +---- CATCH +UDF ERROR: String to Date parse failed. Invalid string val: '2025-08-31 06:23:24.123456789abcd' +==== +---- QUERY +select cast('aaaa-aa-aa aa:aa:aa.123456789abcd' as DATE) +---- CATCH +UDF ERROR: String to Date parse failed. Invalid string val: 'aaaa-aa-aa aa:aa:aa.123456789abcd' +==== +---- QUERY +# TODO(IMPALA-14391): support timezone offset in the string +select cast(timestamp_str as date) from functional_parquet.timestamp_with_tz_str where id=0; +---- CATCH +UDF ERROR: String to Date parse failed. Invalid string val: '2009-01-01 00:01:00 +08:00' +====