Repository: impala Updated Branches: refs/heads/master fcbbae495 -> 02ee2d1c5
IMPALA-6995: avoid DCHECK in TimestampParse::Parse() The bug was that the string's length is checked before trimming leading and trailing spaces instead of afterwards. The bug has been present for a long time but couldn't hit a DCHECK until recently. Testing: Added some backend tests that reproduce the crash. Change-Id: I02a18ffd8893fe74f5830144300f745ce31477b1 Reviewed-on: http://gerrit.cloudera.org:8080/10349 Reviewed-by: Tim Armstrong <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/impala/repo Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/02ee2d1c Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/02ee2d1c Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/02ee2d1c Branch: refs/heads/master Commit: 02ee2d1c509bffc71817fdc4bd25b9589e7085e5 Parents: fcbbae4 Author: Tim Armstrong <[email protected]> Authored: Tue May 8 14:41:23 2018 -0700 Committer: Impala Public Jenkins <[email protected]> Committed: Wed May 16 20:02:20 2018 +0000 ---------------------------------------------------------------------- be/src/exprs/expr-test.cc | 6 ++ be/src/runtime/timestamp-parse-util.cc | 101 ++++++++++++++-------------- 2 files changed, 56 insertions(+), 51 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/impala/blob/02ee2d1c/be/src/exprs/expr-test.cc ---------------------------------------------------------------------- diff --git a/be/src/exprs/expr-test.cc b/be/src/exprs/expr-test.cc index af8e753..61cfceb 100644 --- a/be/src/exprs/expr-test.cc +++ b/be/src/exprs/expr-test.cc @@ -3157,6 +3157,12 @@ TEST_F(ExprTest, CastExprs) { TestTimestampValue("cast(' \t\r\n 2001-1-9 \t\r\n ' as timestamp)", TimestampValue::Parse("2001-01-09")); + // IMPALA-6995: whitespace-only strings should return NULL. + TestIsNull("cast(' ' as timestamp)", TYPE_TIMESTAMP); + TestIsNull("cast('\n' as timestamp)", TYPE_TIMESTAMP); + TestIsNull("cast('\t' as timestamp)", TYPE_TIMESTAMP); + TestIsNull("cast(' \t\r\n' as timestamp)", TYPE_TIMESTAMP); + // Test valid multi-space and 'T' separators between date and time // components TestTimestampValue("cast('2001-01-09 01:05:01' as timestamp)", http://git-wip-us.apache.org/repos/asf/impala/blob/02ee2d1c/be/src/runtime/timestamp-parse-util.cc ---------------------------------------------------------------------- diff --git a/be/src/runtime/timestamp-parse-util.cc b/be/src/runtime/timestamp-parse-util.cc index cbabeab..3063728 100644 --- a/be/src/runtime/timestamp-parse-util.cc +++ b/be/src/runtime/timestamp-parse-util.cc @@ -27,6 +27,8 @@ #include "runtime/timestamp-value.h" #include "util/string-parser.h" +#include "common/names.h" + namespace assign = boost::assign; using boost::unordered_map; using boost::gregorian::date; @@ -83,6 +85,12 @@ bool TimestampParser::initialized_ = false; /// Lazily initialized pseudo-constant hashmap for mapping month names to an index. static unordered_map<StringValue, int> REV_MONTH_INDEX; +const int TimestampParser::DEFAULT_DATE_FMT_LEN; +const int TimestampParser::DEFAULT_TIME_FMT_LEN; +const int TimestampParser::DEFAULT_TIME_FRAC_FMT_LEN; +const int TimestampParser::DEFAULT_SHORT_DATE_TIME_FMT_LEN; +const int TimestampParser::DEFAULT_DATE_TIME_FMT_LEN; + DateTimeFormatContext TimestampParser::DEFAULT_SHORT_DATE_TIME_CTX; DateTimeFormatContext TimestampParser::DEFAULT_SHORT_ISO_DATE_TIME_CTX; DateTimeFormatContext TimestampParser::DEFAULT_DATE_CTX; @@ -360,51 +368,55 @@ bool TimestampParser::ParseFormatTokensByStr(DateTimeFormatContext* dt_ctx) { return true; } +// Helper for TimestampParse::Parse to produce return value and set output parameters +// when parsing fails. 'd' and 't' must be non-NULL. +static bool IndicateTimestampParseFailure(date* d, time_duration* t) { + *d = date(); + *t = time_duration(not_a_date_time); + return false; +} + bool TimestampParser::Parse(const char* str, int len, boost::gregorian::date* d, boost::posix_time::time_duration* t) { - int lazy_len; - DCHECK(TimestampParser::initialized_); - DCHECK(d != NULL); - DCHECK(t != NULL); - if (UNLIKELY(str == NULL || len <= 0)) { - *d = boost::gregorian::date(); - *t = boost::posix_time::time_duration(boost::posix_time::not_a_date_time); - return false; - } + DCHECK(d != nullptr); + DCHECK(t != nullptr); + if (UNLIKELY(str == nullptr)) return IndicateTimestampParseFailure(d, t); + + int trimmed_len = len; // Remove leading white space. - while (len > 0 && isspace(*str)) { + while (trimmed_len > 0 && isspace(*str)) { ++str; - --len; + --trimmed_len; } // Strip the trailing blanks. - while (len > 0 && isspace(str[len - 1])) --len; + while (trimmed_len > 0 && isspace(str[trimmed_len - 1])) --trimmed_len; // Strip if there is a 'Z' suffix - if (len > 0 && str[len - 1] == 'Z') { - --len; - } else if (len > DEFAULT_TIME_FMT_LEN && (str[4] == '-' || str[2] == ':')) { + if (trimmed_len > 0 && str[trimmed_len - 1] == 'Z') { + --trimmed_len; + } else if (trimmed_len > DEFAULT_TIME_FMT_LEN && (str[4] == '-' || str[2] == ':')) { // Strip timezone offset if it seems like a valid timestamp string. int curr_pos = DEFAULT_TIME_FMT_LEN; // Timezone offset will be at least two bytes long, no need to check last // two bytes. - while (curr_pos < len - 2) { + while (curr_pos < trimmed_len - 2) { if (str[curr_pos] == '+' || str[curr_pos] == '-') { - len = curr_pos; + trimmed_len = curr_pos; break; } ++curr_pos; } } + if (UNLIKELY(trimmed_len <= 0)) return IndicateTimestampParseFailure(d, t); - lazy_len = len; - // Only process what we have to. - if (len > DEFAULT_DATE_TIME_FMT_LEN) len = DEFAULT_DATE_TIME_FMT_LEN; + // Determine the length of relevant input, if we're using one of the default formats. + int default_fmt_len = min(trimmed_len, DEFAULT_DATE_TIME_FMT_LEN); // Determine the default formatting context that's required for parsing. DateTimeFormatContext* dt_ctx = NULL; - if (LIKELY(len >= DEFAULT_TIME_FMT_LEN)) { + if (LIKELY(default_fmt_len >= DEFAULT_TIME_FMT_LEN)) { // This string starts with a date component if (str[4] == '-' && str[7] == '-') { - switch (len) { + switch (default_fmt_len) { case DEFAULT_DATE_FMT_LEN: { dt_ctx = &DEFAULT_DATE_CTX; break; @@ -439,17 +451,17 @@ bool TimestampParser::Parse(const char* str, int len, boost::gregorian::date* d, // There is likely a fractional component that's below the expected 9 chars. // We will need to work out which default context to use that corresponds to // the fractional length in the string. - if (LIKELY(len > DEFAULT_SHORT_DATE_TIME_FMT_LEN) && LIKELY(str[19] == '.') - && LIKELY(str[13] == ':')) { + if (LIKELY(default_fmt_len > DEFAULT_SHORT_DATE_TIME_FMT_LEN) + && LIKELY(str[19] == '.') && LIKELY(str[13] == ':')) { switch (str[10]) { case ' ': { - dt_ctx = - &DEFAULT_DATE_TIME_CTX[len - DEFAULT_SHORT_DATE_TIME_FMT_LEN - 1]; + dt_ctx = &DEFAULT_DATE_TIME_CTX + [default_fmt_len - DEFAULT_SHORT_DATE_TIME_FMT_LEN - 1]; break; } case 'T': { dt_ctx = &DEFAULT_ISO_DATE_TIME_CTX - [len - DEFAULT_SHORT_DATE_TIME_FMT_LEN - 1]; + [default_fmt_len - DEFAULT_SHORT_DATE_TIME_FMT_LEN - 1]; break; } } @@ -458,32 +470,23 @@ bool TimestampParser::Parse(const char* str, int len, boost::gregorian::date* d, } } } else if (str[2] == ':' && str[5] == ':' && isdigit(str[7])) { - if (len > DEFAULT_TIME_FRAC_FMT_LEN) len = DEFAULT_TIME_FRAC_FMT_LEN; - if (len > DEFAULT_TIME_FMT_LEN && str[8] == '.') { - dt_ctx = &DEFAULT_TIME_FRAC_CTX[len - DEFAULT_TIME_FMT_LEN - 1]; + default_fmt_len = min(default_fmt_len, DEFAULT_TIME_FRAC_FMT_LEN); + if (default_fmt_len > DEFAULT_TIME_FMT_LEN && str[8] == '.') { + dt_ctx = &DEFAULT_TIME_FRAC_CTX[default_fmt_len - DEFAULT_TIME_FMT_LEN - 1]; } else { dt_ctx = &DEFAULT_TIME_CTX; } } } + if (dt_ctx != nullptr) return Parse(str, default_fmt_len, *dt_ctx, d, t); // Generating context lazily as a fall back if default formats fail. // ParseFormatTokenByStr() does not require a template format string. - if (dt_ctx != nullptr) { - return Parse(str, len, *dt_ctx, d, t); - } else { - DateTimeFormatContext lazy_ctx; - lazy_ctx.Reset(str, lazy_len); - if (ParseFormatTokensByStr(&lazy_ctx)) { - dt_ctx = &lazy_ctx; - len = lazy_len; - return Parse(str, len, *dt_ctx, d, t); - } else { - *d = boost::gregorian::date(); - *t = boost::posix_time::time_duration(boost::posix_time::not_a_date_time); - return false; - } - } + DateTimeFormatContext lazy_ctx; + lazy_ctx.Reset(str, trimmed_len); + if (!ParseFormatTokensByStr(&lazy_ctx)) return IndicateTimestampParseFailure(d, t); + dt_ctx = &lazy_ctx; + return Parse(str, trimmed_len, *dt_ctx, d, t); } date TimestampParser::RealignYear(const DateTimeParseResult& dt_result, @@ -522,9 +525,7 @@ bool TimestampParser::Parse(const char* str, int len, const DateTimeFormatContex int day_offset = 0; if (UNLIKELY(str == NULL || len <= 0 || !ParseDateTime(str, len, dt_ctx, &dt_result))) { - *d = date(); - *t = time_duration(not_a_date_time); - return false; + return IndicateTimestampParseFailure(d, t); } if (dt_ctx.has_time_toks) { *t = time_duration(dt_result.hour, dt_result.minute, @@ -560,9 +561,7 @@ bool TimestampParser::Parse(const char* str, int len, const DateTimeFormatContex } catch (boost::exception&) { VLOG_ROW << "Invalid date: " << dt_result.year << "-" << dt_result.month << "-" << dt_result.day; - *d = date(); - *t = time_duration(not_a_date_time); - return false; + return IndicateTimestampParseFailure(d, t); } } else { *d = date();
