IMPALA-5867: Fix bugs parsing 2-digit year This patch fixes several bugs parsing 1 or 2-digit year formats. Existing code is broken in several ways: 1. With 1 or 2-digit year format and month/day missing, ParseDateTime() throws an uncaught exception. 2. If now() is 02/29 in a leap year but (now() - 80 years) isn't, DateTimeFormatContext::SetCenturyBreak() throws an uncaught exception. 3. If the year parsed is 02/29 in a leap year but it isn't a leap year 100 years ago, TimestampParser::Parse() will consider the date as invalid though it isn't. This patch fixes above bugs and adds a few test cases in be/src/runtime/timestamp-test.cc The behaviors after change is: 1. A date without month or day is considered invalid. This is a pre-existing difference from Hive, which defaults missing month/day to 01/01. 2. Century break would be set to 02/28 80 years ago. 3. If parsed date is 00/02/29 but 1900/02/29 does not exist, treat it as 03/01 when comparing to century break.
Change-Id: Ia4f430caea88b6c33f8050a1984ee0ee32ecb0a1 Reviewed-on: http://gerrit.cloudera.org:8080/7910 Reviewed-by: Tim Armstrong <[email protected]> Tested-by: Impala Public Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/2fbdc8e3 Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/2fbdc8e3 Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/2fbdc8e3 Branch: refs/heads/master Commit: 2fbdc8e37e4cb0a3b3408e90b5a972d778fea7eb Parents: ac68913 Author: Tianyi Wang <[email protected]> Authored: Wed Aug 30 14:14:52 2017 -0700 Committer: Impala Public Jenkins <[email protected]> Committed: Fri Sep 8 03:05:11 2017 +0000 ---------------------------------------------------------------------- be/src/runtime/timestamp-parse-util.cc | 95 +++++++++++++++++------------ be/src/runtime/timestamp-parse-util.h | 10 +++ be/src/runtime/timestamp-test.cc | 49 +++++++++++---- 3 files changed, 104 insertions(+), 50 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2fbdc8e3/be/src/runtime/timestamp-parse-util.cc ---------------------------------------------------------------------- diff --git a/be/src/runtime/timestamp-parse-util.cc b/be/src/runtime/timestamp-parse-util.cc index 9b9e8c8..e64d904 100644 --- a/be/src/runtime/timestamp-parse-util.cc +++ b/be/src/runtime/timestamp-parse-util.cc @@ -29,8 +29,10 @@ namespace assign = boost::assign; using boost::unordered_map; using boost::gregorian::date; using boost::gregorian::date_duration; +using boost::gregorian::gregorian_calendar; using boost::posix_time::hours; using boost::posix_time::not_a_date_time; +using boost::posix_time::ptime; using boost::posix_time::time_duration; namespace impala { @@ -45,6 +47,8 @@ struct DateTimeParseResult { int second; int32_t fraction; boost::posix_time::time_duration tz_offset; + // Whether to realign the year for 2-digit year format + bool realign_year; DateTimeParseResult() : year(0), @@ -54,14 +58,22 @@ struct DateTimeParseResult { minute(0), second(0), fraction(0), - tz_offset(0,0,0,0) { + tz_offset(0,0,0,0), + realign_year(false) { } }; void DateTimeFormatContext::SetCenturyBreak(const TimestampValue &now) { - const date& now_date = now.date(); - century_break_ptime = boost::posix_time::ptime( - date(now_date.year() - 80, now_date.month(), now_date.day()), now.time()); + auto& now_date = now.date(); + // If the century break is at an invalid 02/29, set it to 02/28 for consistency with + // Hive. + if (now_date.month() == 2 && now_date.day() == 29 && + !gregorian_calendar::is_leap_year(now_date.year() - 80)) { + century_break_ptime = ptime(date(now_date.year() - 80, 2, 28), now.time()); + } else { + century_break_ptime = ptime( + date(now_date.year() - 80, now_date.month(), now_date.day()), now.time()); + } } bool TimestampParser::initialized_ = false; @@ -301,6 +313,32 @@ bool TimestampParser::Parse(const char* str, int len, boost::gregorian::date* d, } } +date TimestampParser::RealignYear(const DateTimeParseResult& dt_result, + const DateTimeFormatContext& dt_ctx, int day_offset, const time_duration& t) { + DCHECK(!dt_ctx.century_break_ptime.is_special()); + // Let the century start at AABB and the year parsed be YY, this gives us AAYY. + int year = dt_result.year + (dt_ctx.century_break_ptime.date().year() / 100) * 100; + date unshifted_date; + // The potential actual date (02/29 in unshifted year + 100 years) might be valid + // even if unshifted date is not, so try to make unshifted date valid by adding 1 day. + // This makes the behavior closer to Hive. + if (dt_result.month == 2 && dt_result.day == 29 && + !gregorian_calendar::is_leap_year(year)) { + unshifted_date = date(year, 3, 1); + } else { + unshifted_date = date(year, dt_result.month, dt_result.day); + } + unshifted_date += date_duration(day_offset); + // Advance 100 years if parsed time is before the century break. + // For example if the century breaks at 1937 but dt_result->year = 1936, + // the correct year would be 2036. + if (ptime(unshifted_date, t) < dt_ctx.century_break_ptime) { + return date(year + 100, dt_result.month, dt_result.day) + date_duration(day_offset); + } else { + return date(year, dt_result.month, dt_result.day) + date_duration(day_offset); + } +} + bool TimestampParser::Parse(const char* str, int len, const DateTimeFormatContext& dt_ctx, date* d, time_duration* t) { DCHECK(TimestampParser::initialized_); @@ -330,25 +368,23 @@ bool TimestampParser::Parse(const char* str, int len, const DateTimeFormatContex *t = time_duration(0, 0, 0, 0); } if (dt_ctx.has_date_toks) { - bool is_valid_date = true; try { DCHECK(-1 <= day_offset && day_offset <= 1); - if ((dt_result.year == 1400 && dt_result.month == 1 && dt_result.day == 1 && - day_offset == -1) || - (dt_result.year == 9999 && dt_result.month == 12 && dt_result.day == 31 && - day_offset == 1)) { - // Have to check lower/upper bound explicitly. - // Tried date::is_not_a_date_time() but it doesn't complain value is out of range - // for "'1400-01-01' - 1 day" and "'9999-12-31' + 1 day". - is_valid_date = false; + if (dt_result.realign_year) { + *d = RealignYear(dt_result, dt_ctx, day_offset, *t); } else { - *d = date(dt_result.year, dt_result.month, dt_result.day); - *d += date_duration(day_offset); + *d = date(dt_result.year, dt_result.month, dt_result.day) + + date_duration(day_offset); + } + // Have to check year lower/upper bound [1400, 9999] here because + // operator + (date, date_duration) won't throw an exception even if the result is + // out-of-range. + if (d->year() < 1400 || d->year() > 9999) { + // Calling year() on out-of-range date throws an exception itself. This branch is + // to describe the checking logic but is never taken. + DCHECK(false); } } catch (boost::exception&) { - is_valid_date = false; - } - if (!is_valid_date) { VLOG_ROW << "Invalid date: " << dt_result.year << "-" << dt_result.month << "-" << dt_result.day; *d = date(); @@ -428,8 +464,6 @@ bool TimestampParser::ParseDateTime(const char* str, int str_len, // Keep track of the number of characters we need to shift token positions by. // Variable-length tokens will result in values > 0; int shift_len = 0; - // Whether to realign the year for 2-digit year format - bool realign_year = false; for (const DateTimeFormatToken& tok: dt_ctx.toks) { const char* tok_val = str + tok.pos + shift_len; if (tok.type == SEPARATOR) { @@ -449,10 +483,10 @@ bool TimestampParser::ParseDateTime(const char* str, int str_len, case YEAR: { dt_result->year = StringParser::StringToInt<int>(tok_val, tok_len, &status); if (UNLIKELY(StringParser::PARSE_SUCCESS != status)) return false; - if (UNLIKELY(dt_result->year < 1 || dt_result->year > 9999)) return false; + if (UNLIKELY(dt_result->year < 0 || dt_result->year > 9999)) return false; // Year in "Y" and "YY" format should be in the interval // [current time - 80 years, current time + 20 years) - if (tok_len <= 2) realign_year = true; + if (tok_len <= 2) dt_result->realign_year = true; break; } case MONTH_IN_YEAR: { @@ -546,23 +580,6 @@ bool TimestampParser::ParseDateTime(const char* str, int str_len, default: DCHECK(false) << "Unknown date/time format token"; } } - // Hive uses Java's SimpleDateFormat to parse timestamp: - // In SimpleDateFormat, the century for 2-digit-year breaks at current_time - 80 years. - // https://docs.oracle.com/javase/6/docs/api/java/text/SimpleDateFormat.html - if (realign_year) { - DCHECK(!dt_ctx.century_break_ptime.is_special()); - // Let the century start at AABB and the year parsed be YY, this gives us AAYY. - dt_result->year += (dt_ctx.century_break_ptime.date().year() / 100) * 100; - date parsed_date(dt_result->year, dt_result->month, dt_result->day); - time_duration parsed_time(dt_result->hour, dt_result->minute, dt_result->second, - dt_result->fraction); - // Advance 100 years if parsed time is before the century break - // For example if the century breaks at 1937 but dt_result->year = 1936, - // the correct year would be 2036. - if (boost::posix_time::ptime(parsed_date, parsed_time) < dt_ctx.century_break_ptime) { - dt_result->year += 100; - } - } return true; } http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2fbdc8e3/be/src/runtime/timestamp-parse-util.h ---------------------------------------------------------------------- diff --git a/be/src/runtime/timestamp-parse-util.h b/be/src/runtime/timestamp-parse-util.h index b68cc85..bbcc03f 100644 --- a/be/src/runtime/timestamp-parse-util.h +++ b/be/src/runtime/timestamp-parse-util.h @@ -219,6 +219,16 @@ class TimestampParser { static bool ParseDateTime(const char* str, int str_len, const DateTimeFormatContext& dt_ctx, DateTimeParseResult* dt_result); + /// Helper function finding the correct century for 1 or 2 digit year according to + /// century break. Throws bad_year, bad_day_of_month, or bad_day_month if the date is + /// invalid. The century break behavior is copied from Java SimpleDateFormat in order to + /// be consistent with Hive. + /// In SimpleDateFormat, the century for 2-digit-year breaks at current_time - 80 years. + /// https://docs.oracle.com/javase/6/docs/api/java/text/SimpleDateFormat.html + static boost::gregorian::date RealignYear(const DateTimeParseResult& dt_result, + const DateTimeFormatContext& dt_ctx, int day_offset, + const boost::posix_time::time_duration& t); + /// Check if the string is a TimeZone offset token. /// Valid offset token format are 'hh:mm', 'hhmm', 'hh'. static bool IsValidTZOffset(const char* str_begin, const char* str_end); http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2fbdc8e3/be/src/runtime/timestamp-test.cc ---------------------------------------------------------------------- diff --git a/be/src/runtime/timestamp-test.cc b/be/src/runtime/timestamp-test.cc index 5f3e0e6..b8919cc 100644 --- a/be/src/runtime/timestamp-test.cc +++ b/be/src/runtime/timestamp-test.cc @@ -231,7 +231,8 @@ void TestTimestampTokens(vector<TimestampToken>* toks, int year, int month, TEST(TimestampTest, Basic) { // Fix current time to determine the behavior parsing 2-digit year format - TimestampValue now(date(2017, 7, 28), time_duration(16, 14, 24)); + // Set it to 03/01 to test 02/29 edge cases. + TimestampValue now(date(1980, 3, 1), time_duration(16, 14, 24)); char s1[] = "2012-01-20 01:10:01"; char s2[] = "1990-10-20 10:10:10.123456789 "; @@ -426,6 +427,19 @@ TEST(TimestampTest, Basic) { } // Test parsing/formatting of complex date/time formats vector<TimestampTC> test_cases = boost::assign::list_of + // Test year upper/lower bound + (TimestampTC("yyyy-MM-dd HH:mm:ss", "1400-01-01 00:00:00", + false, true, false, 1400, 1, 1)) + (TimestampTC("yyyy-MM-dd HH:mm:ss", "1399-12-31 23:59:59", + false, true)) + (TimestampTC("yyyy-MM-dd HH:mm:ss", "9999-12-31 23:59:59", + false, true, false, 9999, 12, 31, 23, 59, 59)) + (TimestampTC("yyyy-MM-dd HH:mm:ss +hh", "1400-01-01 01:00:00 +01", false, true, false, + 1400, 1, 1, 0, 0, 0)) + (TimestampTC("yyyy-MM-dd HH:mm:ss +hh", "1400-01-01 01:00:00 +02", false, true)) + (TimestampTC("yyyy-MM-dd HH:mm:ss +hh", "9999-12-31 22:00:00 -01", false, true, false, + 9999, 12, 31, 23, 0, 0)) + (TimestampTC("yyyy-MM-dd HH:mm:ss +hh", "9999-12-31 22:00:00 -02", false, true)) // Test case on literal short months (TimestampTC("yyyy-MMM-dd", "2013-OCT-01", false, true, false, 2013, 10, 1)) // Test case on literal short months @@ -470,19 +484,32 @@ TEST(TimestampTest, Basic) { (TimestampTC("MMdd", "1201", false, true)) // Test missing month (TimestampTC("yyyydd", "201301", false, true)) - // Test missing month - (TimestampTC("yyyymm", "201301", false, true)) + (TimestampTC("yydd", "1301", false, true)) + // Test missing day + (TimestampTC("yyyyMM", "201301", false, true)) + (TimestampTC("yyMM", "8512", false, true)) + // Test missing month and day + (TimestampTC("yyyy", "2013", false, true)) + (TimestampTC("yy", "13", false, true)) // Test short year token (TimestampTC("y-MM-dd", "2013-11-13", false, true, false, 2013, 11, 13)) - (TimestampTC("y-MM-dd", "13-11-13", false, true, false, 2013, 11, 13)) + (TimestampTC("y-MM-dd", "13-11-13", false, true, false, 1913, 11, 13)) // Test 2-digit year format - (TimestampTC("yy-MM-dd", "37-07-28", false, true, false, 2037, 7, 28)) - (TimestampTC("yy-MM-dd", "37-07-29", false, true, false, 1937, 7, 29)) + (TimestampTC("yy-MM-dd", "17-08-31", false, true, false, 1917, 8, 31)) + (TimestampTC("yy-MM-dd", "99-08-31", false, true, false, 1999, 8, 31)) + // Test 02/29 edge cases of 2-digit year format + (TimestampTC("yy-MM-dd", "00-02-28", false, true, false, 2000, 2, 28)) + (TimestampTC("yy-MM-dd", "00-02-29", false, true, false, 2000, 2, 29)) + (TimestampTC("yy-MM-dd", "00-03-01", false, true, false, 2000, 3, 1)) + (TimestampTC("yy-MM-dd", "00-03-02", false, true, false, 1900, 3, 2)) + (TimestampTC("yy-MM-dd", "04-02-29", false, true, false, 1904, 2, 29)) + (TimestampTC("yy-MM-dd", "99-02-29", false, true)) // Test 1-digit year format with time to show the exact boundary - (TimestampTC("y-MM-dd HH:mm:ss", "37-07-28 16:14:23", false, true, false, - 2037, 7, 28, 16, 14, 23)) - (TimestampTC("y-MM-dd HH:mm:ss", "37-07-28 16:14:24", false, true, false, - 1937, 7, 28, 16, 14, 24)) + // Before the cutoff. Year should be 2000 + (TimestampTC("y-MM-dd HH:mm:ss", "00-02-29 16:14:23", false, true, false, + 2000, 2, 29, 16, 14, 23)) + // After the cutoff but 02/29/1900 is invalid + (TimestampTC("y-MM-dd HH:mm:ss", "00-02-29 16:14:24", false, true)) // Test short month token (TimestampTC("yyyy-M-dd", "2013-11-13", false, true, false, 2013, 11, 13)) (TimestampTC("yyyy-M-dd", "2013-1-13", false, true, false, 2013, 1, 13)) @@ -491,7 +518,7 @@ TEST(TimestampTest, Basic) { (TimestampTC("yyyy-MM-d", "2013-11-3", false, true, false, 2013, 11, 3)) // Test short all date tokens (TimestampTC("y-M-d", "2013-11-13", false, true, false, 2013, 11, 13)) - (TimestampTC("y-M-d", "13-1-3", false, true, false, 2013, 1, 3)) + (TimestampTC("y-M-d", "13-1-3", false, true, false, 1913, 1, 3)) // Test short hour token (TimestampTC("H:mm:ss", "14:24:34", false, false, true, 0, 0, 0, 14, 24, 34)) (TimestampTC("H:mm:ss", "4:24:34", false, false, true, 0, 0, 0, 4, 24, 34))
