http://git-wip-us.apache.org/repos/asf/impala/blob/cb493716/be/src/runtime/timestamp-parse-util.cc ---------------------------------------------------------------------- diff --git a/be/src/runtime/timestamp-parse-util.cc b/be/src/runtime/timestamp-parse-util.cc index 3063728..27568a3 100644 --- a/be/src/runtime/timestamp-parse-util.cc +++ b/be/src/runtime/timestamp-parse-util.cc @@ -17,20 +17,12 @@ #include "runtime/timestamp-parse-util.h" -#include <algorithm> - -#include <boost/assign/list_of.hpp> -#include <boost/date_time/gregorian/gregorian.hpp> -#include <boost/unordered_map.hpp> - #include "runtime/string-value.inline.h" #include "runtime/timestamp-value.h" #include "util/string-parser.h" #include "common/names.h" -namespace assign = boost::assign; -using boost::unordered_map; using boost::gregorian::date; using boost::gregorian::date_duration; using boost::gregorian::gregorian_calendar; @@ -41,332 +33,7 @@ using boost::posix_time::time_duration; namespace impala { -/// Stores the results of parsing a date/time string. -struct DateTimeParseResult { - int year; - int month; - int day; - int hour; - int minute; - int second; - int32_t fraction; - boost::posix_time::time_duration tz_offset; - // Whether to realign the year for 2-digit year format - bool realign_year; - - DateTimeParseResult() - : year(0), - month(0), - day(0), - hour(0), - minute(0), - second(0), - fraction(0), - tz_offset(0,0,0,0), - realign_year(false) { - } -}; - -void DateTimeFormatContext::SetCenturyBreak(const TimestampValue &now) { - auto& now_date = now.date(); - // If the century break is at an invalid 02/29, set it to 02/28 for consistency with - // Hive. 
- if (now_date.month() == 2 && now_date.day() == 29 && - !gregorian_calendar::is_leap_year(now_date.year() - 80)) { - century_break_ptime = ptime(date(now_date.year() - 80, 2, 28), now.time()); - } else { - century_break_ptime = ptime( - date(now_date.year() - 80, now_date.month(), now_date.day()), now.time()); - } -} - -bool TimestampParser::initialized_ = false; - -/// Lazily initialized pseudo-constant hashmap for mapping month names to an index. -static unordered_map<StringValue, int> REV_MONTH_INDEX; - -const int TimestampParser::DEFAULT_DATE_FMT_LEN; -const int TimestampParser::DEFAULT_TIME_FMT_LEN; -const int TimestampParser::DEFAULT_TIME_FRAC_FMT_LEN; -const int TimestampParser::DEFAULT_SHORT_DATE_TIME_FMT_LEN; -const int TimestampParser::DEFAULT_DATE_TIME_FMT_LEN; - -DateTimeFormatContext TimestampParser::DEFAULT_SHORT_DATE_TIME_CTX; -DateTimeFormatContext TimestampParser::DEFAULT_SHORT_ISO_DATE_TIME_CTX; -DateTimeFormatContext TimestampParser::DEFAULT_DATE_CTX; -DateTimeFormatContext TimestampParser::DEFAULT_TIME_CTX; -DateTimeFormatContext TimestampParser::DEFAULT_DATE_TIME_CTX[10]; -DateTimeFormatContext TimestampParser::DEFAULT_ISO_DATE_TIME_CTX[10]; -DateTimeFormatContext TimestampParser::DEFAULT_TIME_FRAC_CTX[10]; - -void TimestampParser::Init() { - if (TimestampParser::initialized_) return; - // This needs to be lazily init'd because a StringValues hash function will be invoked - // for each entry that's placed in the map. The hash function expects that - // CpuInfo::Init() has already been called. 
- REV_MONTH_INDEX = boost::unordered_map<StringValue, int>({ - {StringValue("jan"), 1}, {StringValue("feb"), 2}, - {StringValue("mar"), 3}, {StringValue("apr"), 4}, - {StringValue("may"), 5}, {StringValue("jun"), 6}, - {StringValue("jul"), 7}, {StringValue("aug"), 8}, - {StringValue("sep"), 9}, {StringValue("oct"), 10}, - {StringValue("nov"), 11}, {StringValue("dec"), 12} - }); - - // Setup the default date/time context yyyy-MM-dd HH:mm:ss.SSSSSSSSS - const char* DATE_TIME_CTX_FMT = "yyyy-MM-dd HH:mm:ss.SSSSSSSSS"; - const int FRACTIONAL_MAX_LEN = 9; - for (int i = FRACTIONAL_MAX_LEN; i >= 0; --i) { - DEFAULT_DATE_TIME_CTX[i].Reset(DATE_TIME_CTX_FMT, - DEFAULT_DATE_TIME_FMT_LEN - (FRACTIONAL_MAX_LEN - i)); - ParseFormatTokens(&DEFAULT_DATE_TIME_CTX[i]); - } - - // Setup the default ISO date/time context yyyy-MM-ddTHH:mm:ss.SSSSSSSSS - for (int i = FRACTIONAL_MAX_LEN; i >= 0; --i) { - DEFAULT_ISO_DATE_TIME_CTX[i].Reset("yyyy-MM-ddTHH:mm:ss.SSSSSSSSS", - DEFAULT_DATE_TIME_FMT_LEN - (FRACTIONAL_MAX_LEN - i)); - ParseFormatTokens(&DEFAULT_ISO_DATE_TIME_CTX[i]); - } - - // Setup the short default date/time context yyyy-MM-dd HH:mm:ss - DEFAULT_SHORT_DATE_TIME_CTX.Reset("yyyy-MM-dd HH:mm:ss", - DEFAULT_SHORT_DATE_TIME_FMT_LEN); - ParseFormatTokens(&DEFAULT_SHORT_DATE_TIME_CTX); - - // Setup the short default ISO date/time context yyyy-MM-ddTHH:mm:ss - DEFAULT_SHORT_ISO_DATE_TIME_CTX.Reset("yyyy-MM-ddTHH:mm:ss", - DEFAULT_SHORT_DATE_TIME_FMT_LEN); - ParseFormatTokens(&DEFAULT_SHORT_ISO_DATE_TIME_CTX); - - // Setup the default short date context yyyy-MM-dd - DEFAULT_DATE_CTX.Reset("yyyy-MM-dd", DEFAULT_DATE_FMT_LEN); - ParseFormatTokens(&DEFAULT_DATE_CTX); - - // Setup the default short time context HH:mm:ss - DEFAULT_TIME_CTX.Reset("HH:mm:ss", DEFAULT_TIME_FMT_LEN); - ParseFormatTokens(&DEFAULT_TIME_CTX); - - // Setup the default short time context with fractional seconds HH:mm:ss.SSSSSSSSS - for (int i = FRACTIONAL_MAX_LEN; i >= 0; --i) { - 
DEFAULT_TIME_FRAC_CTX[i].Reset(DATE_TIME_CTX_FMT + 11, - DEFAULT_TIME_FRAC_FMT_LEN - (FRACTIONAL_MAX_LEN - i)); - ParseFormatTokens(&DEFAULT_TIME_FRAC_CTX[i]); - } - // Flag that the parser is ready. - TimestampParser::initialized_ = true; -} - -bool TimestampParser::ParseFormatTokens(DateTimeFormatContext* dt_ctx) { - DCHECK(dt_ctx != NULL); - DCHECK(dt_ctx->fmt != NULL); - DCHECK(dt_ctx->fmt_len > 0); - DCHECK(dt_ctx->toks.size() == 0); - const char* str_begin = dt_ctx->fmt; - const char* str_end = str_begin + dt_ctx->fmt_len; - const char* str = str_begin; - // Parse the tokens from the format string - while (str < str_end) { - if (isdigit(*str)) return false; - // Ignore T|Z|non aA-zZ chars but track them as separators (required for printing). - if ((*str == 'T') || (*str == 'Z') || (!isalpha(*str))) { - if (dt_ctx->has_time_toks && IsValidTZOffset(str, str_end)) { - // TZ offset must come at the end of the format. - dt_ctx->toks.push_back(DateTimeFormatToken(TZ_OFFSET, str - str_begin, - str_end - str, str)); - break; - } - dt_ctx->toks.push_back(DateTimeFormatToken(SEPARATOR, str - str_begin, 1, str)); - ++str; - continue; - } - // Not a separator, verify that the previous token is either a separator or has - // length >1, i.e., it is not a variable length token. - if (!dt_ctx->toks.empty()) { - const DateTimeFormatToken& prev = dt_ctx->toks.back(); - if (UNLIKELY(prev.type != SEPARATOR && prev.len == 1)) return false; - } - DateTimeFormatTokenType tok_type = UNKNOWN; - switch (*str) { - case 'y': tok_type = YEAR; break; - case 'M': tok_type = MONTH_IN_YEAR; break; - case 'd': tok_type = DAY_IN_MONTH; break; - case 'H': tok_type = HOUR_IN_DAY; break; - case 'm': tok_type = MINUTE_IN_HOUR; break; - case 's': tok_type = SECOND_IN_MINUTE; break; - case 'S': tok_type = FRACTION; break; - // Error on aA-zZ reserved characters that are not used yet. 
- default: return false; - } - dt_ctx->has_date_toks |= tok_type < HOUR_IN_DAY; - dt_ctx->has_time_toks |= tok_type >= HOUR_IN_DAY; - // Get the token group length - int tok_len = 1; - char tok_chr = *str; - const char* curr_tok_chr = str + 1; - while (curr_tok_chr < str_end) { - if (*curr_tok_chr != tok_chr) break; - ++tok_len; - ++curr_tok_chr; - } - if (tok_type == MONTH_IN_YEAR) { - if (UNLIKELY(tok_len > 3)) return false; - if (tok_len == 3) tok_type = MONTH_IN_YEAR_SLT; - } - // In an output scenario, fmt_out_len is used to determine the print buffer size. - // If the format uses short token groups e.g. yyyy-MM-d, there must to be enough - // room in the buffer for wider values e.g. 2013-12-16. - if (tok_len == 1) ++dt_ctx->fmt_out_len; - DateTimeFormatToken tok(tok_type, str - str_begin, tok_len, str); - str += tok.len; - dt_ctx->toks.push_back(tok); - } - return dt_ctx->has_date_toks || dt_ctx->has_time_toks; -} - -const char* TimestampParser::ParseDigitToken(const char* str, const char* str_end) { - const char* tok_end = str; - while (tok_end < str_end) { - if (!isdigit(*tok_end)) return tok_end; - ++tok_end; - } - return tok_end; -} - -const char* TimestampParser::ParseSeparatorToken( - const char* str, const char* str_end, const char sep) { - const char* tok_end = str; - while (tok_end < str_end) { - if (*tok_end != sep) return tok_end; - ++tok_end; - } - return tok_end; -} - -bool TimestampParser::ParseFormatTokensByStr(DateTimeFormatContext* dt_ctx) { - DCHECK(dt_ctx != NULL); - DCHECK(dt_ctx->fmt != NULL); - DCHECK_GT(dt_ctx->fmt_len, 0); - DCHECK_EQ(dt_ctx->toks.size(), 0); - const char* str_begin = dt_ctx->fmt; - const char* str_end = str_begin + dt_ctx->fmt_len; - const char* str = str_begin; - const char* tok_end; - - // Parse the 4-digit year - tok_end = ParseDigitToken(str, str_end); - if (tok_end - str == 4) { - dt_ctx->toks.push_back( - DateTimeFormatToken(YEAR, str - str_begin, tok_end - str, str)); - str = tok_end; - - // Check for the date 
separator '-' - tok_end = ParseSeparatorToken(str, str_end, '-'); - if (tok_end - str != 1) return false; - dt_ctx->toks.push_back( - DateTimeFormatToken(SEPARATOR, str - str_begin, tok_end - str, str)); - str = tok_end; - - // Parse the 1 or 2 digit month. - tok_end = ParseDigitToken(str, str_end); - if (tok_end - str != 1 && tok_end - str != 2) return false; - dt_ctx->toks.push_back( - DateTimeFormatToken(MONTH_IN_YEAR, str - str_begin, tok_end - str, str)); - str = tok_end; - - // Check for the date separator '-' - tok_end = ParseSeparatorToken(str, str_end, '-'); - if (tok_end - str != 1) return false; - dt_ctx->toks.push_back( - DateTimeFormatToken(SEPARATOR, str - str_begin, tok_end - str, str)); - str = tok_end; - - // Parse the 1 or 2 digit day in month - tok_end = ParseDigitToken(str, str_end); - if (tok_end - str != 1 && tok_end - str != 2) return false; - dt_ctx->toks.push_back( - DateTimeFormatToken(DAY_IN_MONTH, str - str_begin, tok_end - str, str)); - str = tok_end; - dt_ctx->has_date_toks = true; - - // If the string ends here, we only have a date component - if (str == str_end) return true; - - // Check for the space between date and time component - if (*str != ' ' && *str != 'T') return false; - char sep = *str; - tok_end = ParseSeparatorToken(str, str_end, sep); - if (tok_end - str < 1) return false; - // IMPALA-6641: Multiple spaces are okay, 'T' separator must be single - if (sep == 'T' && tok_end - str > 1) return false; - dt_ctx->toks.push_back( - DateTimeFormatToken(SEPARATOR, str - str_begin, tok_end - str, str)); - str = tok_end; - - // Invalid format if date-time separator is not followed by more digits - if (str > str_end) return false; - tok_end = ParseDigitToken(str, str_end); - } - - // Parse the 1 or 2 digit hour - if (tok_end - str != 1 && tok_end - str != 2) return false; - dt_ctx->toks.push_back( - DateTimeFormatToken(HOUR_IN_DAY, str - str_begin, tok_end - str, str)); - str = tok_end; - - // Check for the time component separator 
':' - tok_end = ParseSeparatorToken(str, str_end, ':'); - if (tok_end - str != 1) return false; - dt_ctx->toks.push_back( - DateTimeFormatToken(SEPARATOR, str - str_begin, tok_end - str, str)); - str = tok_end; - - // Parse the 1 or 2 digit minute - tok_end = ParseDigitToken(str, str_end); - if (tok_end - str != 1 && tok_end - str != 2) return false; - dt_ctx->toks.push_back( - DateTimeFormatToken(MINUTE_IN_HOUR, str - str_begin, tok_end - str, str)); - str = tok_end; - - // Check for the time component separator ':' - tok_end = ParseSeparatorToken(str, str_end, ':'); - if (tok_end - str != 1) return false; - dt_ctx->toks.push_back( - DateTimeFormatToken(SEPARATOR, str - str_begin, tok_end - str, str)); - str = tok_end; - - // Parse the 1 or 2 digit second - tok_end = ParseDigitToken(str, str_end); - if (tok_end - str != 1 && tok_end - str != 2) return false; - dt_ctx->toks.push_back( - DateTimeFormatToken(SECOND_IN_MINUTE, str - str_begin, tok_end - str, str)); - str = tok_end; - dt_ctx->has_time_toks = true; - - // There is more to parse, there maybe a fractional component. - if (str < str_end) { - tok_end = ParseSeparatorToken(str, str_end, '.'); - if (tok_end - str != 1) return false; - dt_ctx->toks.push_back( - DateTimeFormatToken(SEPARATOR, str - str_begin, tok_end - str, str)); - str = tok_end; - - // Invalid format when there is no fractional component following '.' - if (str > str_end) return false; - - // Parse the fractional component. 
- // Like the non-lazy path, this will parse up to 9 fractional digits - tok_end = ParseDigitToken(str, str_end); - int num_digits = std::min<int>(9, tok_end - str); - dt_ctx->toks.push_back( - DateTimeFormatToken(FRACTION, str - str_begin, num_digits, str)); - str = tok_end; - - // Invalid format if there is more to parse after the fractional component - if (str < str_end) return false; - } - return true; -} +using namespace datetime_parse_util; // Helper for TimestampParse::Parse to produce return value and set output parameters // when parsing fails. 'd' and 't' must be non-NULL. @@ -378,7 +45,7 @@ static bool IndicateTimestampParseFailure(date* d, time_duration* t) { bool TimestampParser::Parse(const char* str, int len, boost::gregorian::date* d, boost::posix_time::time_duration* t) { - DCHECK(TimestampParser::initialized_); + DCHECK(IsParseCtxInitialized()); DCHECK(d != nullptr); DCHECK(t != nullptr); if (UNLIKELY(str == nullptr)) return IndicateTimestampParseFailure(d, t); @@ -482,8 +149,7 @@ bool TimestampParser::Parse(const char* str, int len, boost::gregorian::date* d, if (dt_ctx != nullptr) return Parse(str, default_fmt_len, *dt_ctx, d, t); // Generating context lazily as a fall back if default formats fail. // ParseFormatTokenByStr() does not require a template format string. 
- DateTimeFormatContext lazy_ctx; - lazy_ctx.Reset(str, trimmed_len); + DateTimeFormatContext lazy_ctx(str, trimmed_len); if (!ParseFormatTokensByStr(&lazy_ctx)) return IndicateTimestampParseFailure(d, t); dt_ctx = &lazy_ctx; return Parse(str, trimmed_len, *dt_ctx, d, t); @@ -517,14 +183,13 @@ date TimestampParser::RealignYear(const DateTimeParseResult& dt_result, bool TimestampParser::Parse(const char* str, int len, const DateTimeFormatContext& dt_ctx, date* d, time_duration* t) { - DCHECK(TimestampParser::initialized_); + DCHECK(IsParseCtxInitialized()); DCHECK(dt_ctx.toks.size() > 0); DCHECK(d != NULL); DCHECK(t != NULL); DateTimeParseResult dt_result; int day_offset = 0; - if (UNLIKELY(str == NULL || len <= 0 || - !ParseDateTime(str, len, dt_ctx, &dt_result))) { + if (UNLIKELY(str == NULL || len <= 0 || !ParseDateTime(str, len, dt_ctx, &dt_result))) { return IndicateTimestampParseFailure(d, t); } if (dt_ctx.has_time_toks) { @@ -572,7 +237,7 @@ bool TimestampParser::Parse(const char* str, int len, const DateTimeFormatContex int TimestampParser::Format(const DateTimeFormatContext& dt_ctx, const boost::gregorian::date& d, const boost::posix_time::time_duration& t, int len, char* buff) { - DCHECK(TimestampParser::initialized_); + DCHECK(IsParseCtxInitialized()); DCHECK(dt_ctx.toks.size() > 0); DCHECK(len > dt_ctx.fmt_out_len); DCHECK(buff != NULL); @@ -626,151 +291,4 @@ int TimestampParser::Format(const DateTimeFormatContext& dt_ctx, return str - buff; } -bool TimestampParser::ParseDateTime(const char* str, int str_len, - const DateTimeFormatContext& dt_ctx, DateTimeParseResult* dt_result) { - DCHECK(dt_ctx.fmt_len > 0); - DCHECK(dt_ctx.toks.size() > 0); - DCHECK(dt_result != NULL); - if (str_len <= 0 || str_len < dt_ctx.fmt_len || str == NULL) return false; - StringParser::ParseResult status; - // Keep track of the number of characters we need to shift token positions by. 
- // Variable-length tokens will result in values > 0; - int shift_len = 0; - for (const DateTimeFormatToken& tok: dt_ctx.toks) { - const char* tok_val = str + tok.pos + shift_len; - if (tok.type == SEPARATOR) { - if (UNLIKELY(*tok_val != *tok.val)) return false; - continue; - } - int tok_len = tok.len; - const char* str_end = str + str_len; - // In case of single-character tokens we scan ahead to the next separator. - if (UNLIKELY(tok_len == 1)) { - while ((tok_val + tok_len < str_end) && isdigit(*(tok_val + tok_len))) { - ++tok_len; - ++shift_len; - } - } - switch (tok.type) { - case YEAR: { - dt_result->year = StringParser::StringToInt<int>(tok_val, tok_len, &status); - if (UNLIKELY(StringParser::PARSE_SUCCESS != status)) return false; - if (UNLIKELY(dt_result->year < 0 || dt_result->year > 9999)) return false; - // Year in "Y" and "YY" format should be in the interval - // [current time - 80 years, current time + 20 years) - if (tok_len <= 2) dt_result->realign_year = true; - break; - } - case MONTH_IN_YEAR: { - dt_result->month = StringParser::StringToInt<int>(tok_val, tok_len, &status); - if (UNLIKELY(StringParser::PARSE_SUCCESS != status)) return false; - if (UNLIKELY(dt_result->month < 1 || dt_result->month > 12)) return false; - break; - } - case MONTH_IN_YEAR_SLT: { - char raw_buff[tok.len]; - std::transform(tok_val, tok_val + tok.len, raw_buff, ::tolower); - StringValue buff(raw_buff, tok.len); - boost::unordered_map<StringValue, int>::const_iterator iter = - REV_MONTH_INDEX.find(buff); - if (UNLIKELY(iter == REV_MONTH_INDEX.end())) return false; - dt_result->month = iter->second; - break; - } - case DAY_IN_MONTH: { - dt_result->day = StringParser::StringToInt<int>(tok_val, tok_len, &status); - if (UNLIKELY(StringParser::PARSE_SUCCESS != status)) return false; - // TODO: Validate that the value of day is correct for the given month. 
- if (UNLIKELY(dt_result->day < 1 || dt_result->day > 31)) return false; - break; - } - case HOUR_IN_DAY: { - dt_result->hour = StringParser::StringToInt<int>(tok_val, tok_len, &status); - if (UNLIKELY(StringParser::PARSE_SUCCESS != status)) return false; - if (UNLIKELY(dt_result->hour < 0 || dt_result->hour > 23)) return false; - break; - } - case MINUTE_IN_HOUR: { - dt_result->minute = StringParser::StringToInt<int>(tok_val, tok_len, &status); - if (UNLIKELY(StringParser::PARSE_SUCCESS != status)) return false; - if (UNLIKELY(dt_result->minute < 0 || dt_result->minute > 59)) return false; - break; - } - case SECOND_IN_MINUTE: { - dt_result->second = StringParser::StringToInt<int>(tok_val, tok_len, &status); - if (UNLIKELY(StringParser::PARSE_SUCCESS != status)) return false; - if (UNLIKELY(dt_result->second < 0 || dt_result->second > 59)) return false; - break; - } - case FRACTION: { - dt_result->fraction = - StringParser::StringToInt<int32_t>(tok_val, tok_len, &status); - if (UNLIKELY(StringParser::PARSE_SUCCESS != status)) return false; - // A user may specify a time of 04:30:22.1238, the parser will return 1238 for - // the fractional portion. This does not represent the intended value of - // 123800000, therefore the number must be scaled up. - for (int i = tok_len; i < 9; ++i) dt_result->fraction *= 10; - break; - } - case TZ_OFFSET: { - if (tok_val[0] != '+' && tok_val[0] != '-') return false; - int sign = tok_val[0] == '-' ? -1 : 1; - int minute = 0; - int hour = StringParser::StringToInt<int>(tok_val + 1, 2, &status); - if (UNLIKELY(StringParser::PARSE_SUCCESS != status || - hour < 0 || hour > 23)) { - return false; - } - switch (tok_len) { - case 6: { - // +hh:mm - minute = StringParser::StringToInt<int>(tok_val + 4, 2, &status); - break; - } - case 5: { - // +hh:mm - minute = StringParser::StringToInt<int>(tok_val + 3, 2, &status); - break; - } - case 3: { - // +hh - break; - } - default: { - // Invalid timezone offset length. 
- return false; - } - } - if (UNLIKELY(StringParser::PARSE_SUCCESS != status || - minute < 0 || minute > 59)) { - return false; - } - dt_result->tz_offset = boost::posix_time::time_duration(sign * hour, - sign * minute, 0, 0); - break; - } - default: DCHECK(false) << "Unknown date/time format token"; - } - } - return true; -} - -bool TimestampParser::IsValidTZOffset(const char* str_begin, const char* str_end) { - if (*str_begin == '+' || *str_begin == '-') { - ++str_begin; - switch(str_end - str_begin) { - case 5: // hh:mm - return strncmp(str_begin, "hh:mm", 5) == 0; - case 4: // hhmm - return strncmp(str_begin, "hhmm", 4) == 0; - case 2: // hh - return strncmp(str_begin, "hh", 2) == 0; - default: - break; - } - } - return false; -} - - }
http://git-wip-us.apache.org/repos/asf/impala/blob/cb493716/be/src/runtime/timestamp-parse-util.h ---------------------------------------------------------------------- diff --git a/be/src/runtime/timestamp-parse-util.h b/be/src/runtime/timestamp-parse-util.h index bccf0b7..ca309c3 100644 --- a/be/src/runtime/timestamp-parse-util.h +++ b/be/src/runtime/timestamp-parse-util.h @@ -15,17 +15,14 @@ // specific language governing permissions and limitations // under the License. -#ifndef IMPALA_RUNTIME_TIMESTAMP_PARSE_UTIL_H -#define IMPALA_RUNTIME_TIMESTAMP_PARSE_UTIL_H -#include <cstddef> -#include <vector> -#include <boost/date_time/posix_time/ptime.hpp> +#pragma once + +#include <boost/date_time/gregorian/gregorian.hpp> + +#include "runtime/datetime-parse-util.h" namespace boost { - namespace gregorian { - class date; - } namespace posix_time { class time_duration; } @@ -33,179 +30,9 @@ namespace boost { namespace impala { -struct DateTimeParseResult; -class TimestampValue; - -/// Add support for dealing with custom date/time formats in Impala. The following -/// date/time tokens are supported: -/// y – Year -/// M – Month -/// d – Day -/// H – Hour -/// m – Minute -/// s – second -/// S – Fractional second -/// -/// TimeZone offset formats (Must be at the end of format string): -/// +/-hh:mm -/// +/-hhmm -/// +/-hh -/// -/// -/// The token names and usage have been modeled after the SimpleDateFormat class used in -/// Java, with only the above list of tokens being supported. All fields will consume -/// variable length inputs when parsing an input string and must therefore use separators -/// to specify the boundaries of the fields, with the exception of TimeZone values, which -/// have to be of fixed width. Repeating tokens can be used to specify fields of exact -/// witdh, e.g. in yy-MM both fields must be of exactly length two. When using fixed width -/// fields values must be zero-padded and output values will be zero padded during -/// formatting. 
There is one exception to this: a month field of length 3 will specify -/// literal month names instead of zero padding, i.e., yyyy-MMM-dd will parse from and -/// format to strings like 2013-Nov-21. When using fields of fixed width the separators -/// can be omitted. -/// -/// -/// Formatting character groups can appear in any order along with any separators -/// except TimeZone offset. -/// e.g. -/// yyyy/MM/dd -/// dd-MMM-yy -/// (dd)(MM)(yyyy) HH:mm:ss -/// yyyy-MM-dd HH:mm:ss+hh:mm -/// ..etc.. -/// -/// The following features are not supported: -/// Long literal months e.g. MMMM -/// Nested strings e.g. “Year: ” yyyy “Month: ” mm “Day: ” dd -/// Lazy formatting - -/// Used to indicate the type of a date/time format token group. -enum DateTimeFormatTokenType { - UNKNOWN = 0, - SEPARATOR, - YEAR, - MONTH_IN_YEAR, - /// Indicates a short literal month e.g. MMM (Aug). Note that the month name is case - /// insensitive for an input scenario and printed in camel case for an output scenario. - MONTH_IN_YEAR_SLT, - DAY_IN_MONTH, - HOUR_IN_DAY, - MINUTE_IN_HOUR, - SECOND_IN_MINUTE, - /// Indicates fractional seconds e.g.14:52:36.2334. By default this provides nanosecond - /// resolution. - FRACTION, - TZ_OFFSET, -}; - -/// Used to store metadata about a token group within a date/time format. -struct DateTimeFormatToken { - /// Indicates the type of date/time format token e.g. year - DateTimeFormatTokenType type; - /// The position of where this token group is supposed to start in the date/time string - /// to be parsed - int pos; - /// The length of the token group - int len; - /// A pointer to the date/time format string that is positioned at the start of this - /// token group - const char* val; - - DateTimeFormatToken(DateTimeFormatTokenType type, int pos, int len, const char* val) - : type(type), - pos(pos), - len(len), - val(val) { - } -}; - -/// This structure is used to hold metadata for a date/time format. 
Each token group -/// within the raw format is parsed and placed in this structure along with other high -/// level information e.g. if the format contains date and/or time tokens. This context -/// is used during date/time parsing. -struct DateTimeFormatContext { - const char* fmt; - int fmt_len; - /// Holds the expanded length of fmt_len plus any required space when short format - /// tokens are used. The output buffer size is driven from this value. For example, in - /// an output scenario a user may provide the format yyyy-M-d, if the day and month - /// equates to 12, 21 then extra space is needed in the buffer to hold the values. The - /// short format type e.g. yyyy-M-d is valid where no zero padding is required on single - /// digits. - int fmt_out_len; - std::vector<DateTimeFormatToken> toks; - bool has_date_toks; - bool has_time_toks; - /// Current time - 80 years to determine the actual year when - /// parsing 1 or 2-digit year token. - boost::posix_time::ptime century_break_ptime; - - DateTimeFormatContext() { - Reset(NULL, 0); - } - - DateTimeFormatContext(const char* fmt, int fmt_len) { - Reset(fmt, fmt_len); - } - - /// Set the century break when parsing 1 or 2-digit year format. - /// When parsing 1 or 2-digit year, the year should be in the interval - /// [now - 80 years, now + 20 years), according to Hive. - void SetCenturyBreak(const TimestampValue &now); - - void Reset(const char* fmt, int fmt_len) { - this->fmt = fmt; - this->fmt_len = fmt_len; - this->fmt_out_len = fmt_len; - this->has_date_toks = false; - this->has_time_toks = false; - this->toks.clear(); - this->century_break_ptime = boost::posix_time::not_a_date_time; - } -}; - /// Used for parsing both default and custom formatted timestamp values. class TimestampParser { public: - /// Initializes the static parser context which includes default date/time formats and - /// lookup tables. This *must* be called before any of the Parse* related functions can - /// be used. 
- static void Init(); - - /// Parse the date/time format into tokens and place them in the context. - /// dt_ctx -- date/time format context - /// Return true if the parse was successful. - static bool ParseFormatTokens(DateTimeFormatContext* dt_ctx); - - // Parse out the next digit token from the date/time string by checking for contiguous - // digit characters and return a pointer to the end of that token. - // str -- pointer to the string to be parsed - // str_end -- the pointer to the end of the string to be parsed - // Returns the pointer within the string to the end of the valid digit token. - static const char* ParseDigitToken(const char* str, const char* str_end); - - // Parse out the next separator token from the date/time string against an expected - // character. - // str -- pointer to the string to be parsed - // str_end -- the pointer to the end of the string to be parsed - // sep -- the separator char to compare the token to - // Returns the pointer within the string to the end of the valid separator token. - static const char* ParseSeparatorToken( - const char* str, const char* str_end, const char sep); - - /// Parse the date/time string to generate the DateTimeFormatToken required by - /// DateTimeFormatContext. Similar to ParseFormatTokens() this function will take the - /// string and length, then heuristically determine whether the value contains date - // tokens, time tokens, or both. Unlike ParseFormatTokens, it does not require the - // template format string. - /// str -- valid pointer to the string to parse - /// len -- length of the string to parse (must be > 0) - /// dt_ctx -- date/time format context (must contain valid tokens) - /// d -- the date value where the results of the parsing will be placed - /// t -- the time value where the results of the parsing will be placed - /// Returns true if the date/time was successfully parsed. 
- static bool ParseFormatTokensByStr(DateTimeFormatContext* dt_ctx); - /// Parse a default date/time string. The default timestamp format is: /// yyyy-MM-dd HH:mm:ss.SSSSSSSSS or yyyy-MM-ddTHH:mm:ss.SSSSSSSSS. Either just the /// date or just the time may be specified. All components are required in either the @@ -214,7 +41,6 @@ class TimestampParser { /// date will be set to invalid. /// str -- valid pointer to the string to parse /// len -- length of the string to parse (must be > 0) - /// dt_ctx -- date/time format context (must contain valid tokens) /// d -- the date value where the results of the parsing will be placed /// t -- the time value where the results of the parsing will be placed /// Returns true if the date/time was successfully parsed. @@ -226,11 +52,13 @@ class TimestampParser { /// to 00:00:00. In the case of just a time, the date will be set to invalid. /// str -- valid pointer to the string to parse /// len -- length of the string to parse (must be > 0) + /// dt_ctx -- date/time format context (must contain valid tokens) /// d -- the date value where the results of the parsing will be placed /// t -- the time value where the results of the parsing will be placed /// Returns true if the date/time was successfully parsed. - static bool Parse(const char* str, int len, const DateTimeFormatContext& dt_ctx, - boost::gregorian::date* d, boost::posix_time::time_duration* t); + static bool Parse(const char* str, int len, + const datetime_parse_util::DateTimeFormatContext& dt_ctx, boost::gregorian::date* d, + boost::posix_time::time_duration* t); /// Format the date/time values using the given format context. Note that a string /// terminator will be appended to the string. @@ -240,50 +68,21 @@ class TimestampParser { /// len -- the output buffer length (should be at least dt_ctx.fmt_exp_len + 1) /// buff -- the output string buffer (must be large enough to hold value) /// Return the number of characters copied in to the buffer (excluding terminator). 
- static int Format(const DateTimeFormatContext& dt_ctx, - const boost::gregorian::date& d, const boost::posix_time::time_duration& t, - int len, char* buff); + static int Format(const datetime_parse_util::DateTimeFormatContext& dt_ctx, + const boost::gregorian::date& d, const boost::posix_time::time_duration& t, int len, + char* buff); private: - static bool ParseDateTime(const char* str, int str_len, - const DateTimeFormatContext& dt_ctx, DateTimeParseResult* dt_result); - /// Helper function finding the correct century for 1 or 2 digit year according to /// century break. Throws bad_year, bad_day_of_month, or bad_day_month if the date is /// invalid. The century break behavior is copied from Java SimpleDateFormat in order to /// be consistent with Hive. /// In SimpleDateFormat, the century for 2-digit-year breaks at current_time - 80 years. /// https://docs.oracle.com/javase/6/docs/api/java/text/SimpleDateFormat.html - static boost::gregorian::date RealignYear(const DateTimeParseResult& dt_result, - const DateTimeFormatContext& dt_ctx, int day_offset, + static boost::gregorian::date RealignYear( + const datetime_parse_util::DateTimeParseResult& dt_result, + const datetime_parse_util::DateTimeFormatContext& dt_ctx, int day_offset, const boost::posix_time::time_duration& t); - - /// Check if the string is a TimeZone offset token. - /// Valid offset token format are 'hh:mm', 'hhmm', 'hh'. - static bool IsValidTZOffset(const char* str_begin, const char* str_end); - - /// Constants to hold default format lengths. - static const int DEFAULT_DATE_FMT_LEN = 10; - static const int DEFAULT_TIME_FMT_LEN = 8; - static const int DEFAULT_TIME_FRAC_FMT_LEN = 18; - static const int DEFAULT_SHORT_DATE_TIME_FMT_LEN = 19; - static const int DEFAULT_DATE_TIME_FMT_LEN = 29; - - /// Used to indicate if the parsing state has been initialized. - static bool initialized_; - /// Pseudo-constant default date/time contexts. 
Backwards compatibility is provided on - /// variable length fractional components by defining a format context for each expected - /// length (0 - 9). This logic will be refactored when the parser supports lazy token - /// groups. - static DateTimeFormatContext DEFAULT_SHORT_DATE_TIME_CTX; - static DateTimeFormatContext DEFAULT_SHORT_ISO_DATE_TIME_CTX; - static DateTimeFormatContext DEFAULT_DATE_CTX; - static DateTimeFormatContext DEFAULT_TIME_CTX; - static DateTimeFormatContext DEFAULT_DATE_TIME_CTX[10]; - static DateTimeFormatContext DEFAULT_ISO_DATE_TIME_CTX[10]; - static DateTimeFormatContext DEFAULT_TIME_FRAC_CTX[10]; }; } - -#endif http://git-wip-us.apache.org/repos/asf/impala/blob/cb493716/be/src/runtime/timestamp-test.cc ---------------------------------------------------------------------- diff --git a/be/src/runtime/timestamp-test.cc b/be/src/runtime/timestamp-test.cc index 8afa957..3f3f01d 100644 --- a/be/src/runtime/timestamp-test.cc +++ b/be/src/runtime/timestamp-test.cc @@ -23,8 +23,8 @@ #include "common/status.h" #include "exprs/timezone_db.h" +#include "runtime/datetime-parse-util.h" #include "runtime/raw-value.inline.h" -#include "runtime/timestamp-parse-util.h" #include "runtime/timestamp-value.h" #include "runtime/timestamp-value.inline.h" #include "testutil/gtest-util.h" @@ -41,6 +41,9 @@ using boost::posix_time::time_duration; namespace impala { +using datetime_parse_util::ParseFormatTokens; +using datetime_parse_util::DateTimeFormatContext; + // Used for defining a custom date/time format test. The structure can be used to // indicate whether the format or value is expected to fail. In a happy path test, // the values for year, month, day etc will be validated against the parsed result. 
@@ -181,8 +184,8 @@ void TestTimestampTokens(vector<TimestampToken>* toks, int year, int month, } } string fmt_val = "Format: " + fmt + ", Val: " + val; - DateTimeFormatContext dt_ctx(fmt.c_str(), fmt.length()); - ASSERT_TRUE(TimestampParser::ParseFormatTokens(&dt_ctx)) << fmt_val; + DateTimeFormatContext dt_ctx(fmt.c_str()); + ASSERT_TRUE(ParseFormatTokens(&dt_ctx)) << fmt_val; TimestampValue tv = TimestampValue::Parse(val.c_str(), val.length(), dt_ctx); ValidateTimestamp(tv, fmt, val, fmt_val, year, month, day, hours, mins, secs, frac); @@ -211,8 +214,8 @@ void TestTimestampTokens(vector<TimestampToken>* toks, int year, int month, if (i + 1 < toks_len) val.push_back(*separator); } string fmt_val = "Format: " + fmt + ", Val: " + val; - DateTimeFormatContext dt_ctx(fmt.c_str(), fmt.length()); - ASSERT_TRUE(TimestampParser::ParseFormatTokens(&dt_ctx)) << fmt_val; + DateTimeFormatContext dt_ctx(fmt.c_str()); + ASSERT_TRUE(ParseFormatTokens(&dt_ctx)) << fmt_val; TimestampValue tv = TimestampValue::Parse(val.c_str(), val.length(), dt_ctx); ValidateTimestamp(tv, fmt, val, fmt_val, year, month, day, hours, mins, secs, frac); @@ -541,9 +544,9 @@ TEST(TimestampTest, Basic) { // or literal value. 
for (int i = 0; i < test_cases.size(); ++i) { TimestampTC test_case = test_cases[i]; - DateTimeFormatContext dt_ctx(test_case.fmt, strlen(test_case.fmt)); + DateTimeFormatContext dt_ctx(test_case.fmt); dt_ctx.SetCenturyBreak(now); - bool parse_result = TimestampParser::ParseFormatTokens(&dt_ctx); + bool parse_result = ParseFormatTokens(&dt_ctx); if (test_case.fmt_should_fail) { EXPECT_FALSE(parse_result) << "TC: " << i; continue; @@ -616,8 +619,8 @@ TEST(TimestampTest, Basic) { // Loop through format test cases for (int i = 0; i < fmt_test_cases.size(); ++i) { TimestampFormatTC test_case = fmt_test_cases[i]; - DateTimeFormatContext dt_ctx(test_case.fmt, strlen(test_case.fmt)); - ASSERT_TRUE(TimestampParser::ParseFormatTokens(&dt_ctx)) << "TC: " << i; + DateTimeFormatContext dt_ctx(test_case.fmt); + ASSERT_TRUE(ParseFormatTokens(&dt_ctx)) << "TC: " << i; TimestampValue cust_tv = TimestampValue::FromUnixTime(test_case.ts, utc_tz); EXPECT_NE(cust_tv.date(), not_a_date) << "TC: " << i; EXPECT_NE(cust_tv.time(), not_a_date_time) << "TC: " << i; http://git-wip-us.apache.org/repos/asf/impala/blob/cb493716/be/src/runtime/timestamp-value.cc ---------------------------------------------------------------------- diff --git a/be/src/runtime/timestamp-value.cc b/be/src/runtime/timestamp-value.cc index 50d1ee4..a3631e5 100644 --- a/be/src/runtime/timestamp-value.cc +++ b/be/src/runtime/timestamp-value.cc @@ -48,6 +48,8 @@ const int64_t EPOCH_DAY_NUMBER = namespace impala { +using datetime_parse_util::DateTimeFormatContext; + const char* TimestampValue::LLVM_CLASS_NAME = "class.impala::TimestampValue"; const double TimestampValue::ONE_BILLIONTH = 0.000000001; http://git-wip-us.apache.org/repos/asf/impala/blob/cb493716/be/src/runtime/timestamp-value.h ---------------------------------------------------------------------- diff --git a/be/src/runtime/timestamp-value.h b/be/src/runtime/timestamp-value.h index 2509f21..56f103e 100644 --- a/be/src/runtime/timestamp-value.h +++ 
b/be/src/runtime/timestamp-value.h @@ -37,7 +37,9 @@ DECLARE_bool(use_local_tz_for_unix_timestamp_conversions); namespace impala { +namespace datetime_parse_util { struct DateTimeFormatContext; +} /// Represents either a (1) date and time, (2) a date with an undefined time, or (3) /// a time with an undefined date. In all cases, times have up to nanosecond resolution @@ -89,7 +91,7 @@ class TimestampValue { static TimestampValue Parse(const std::string& str); static TimestampValue Parse(const char* str, int len); static TimestampValue Parse(const char* str, int len, - const DateTimeFormatContext& dt_ctx); + const datetime_parse_util::DateTimeFormatContext& dt_ctx); /// Unix time (seconds since 1970-01-01 UTC by definition) constructors. /// Return the corresponding timestamp in the 'local_tz' time zone if @@ -199,7 +201,8 @@ class TimestampValue { /// len -- the length of the buffer /// buff -- the buffer that will hold the result /// Returns the number of characters copied in to the buffer (minus the terminator) - int Format(const DateTimeFormatContext& dt_ctx, int len, char* buff) const; + int Format(const datetime_parse_util::DateTimeFormatContext& dt_ctx, int len, + char* buff) const; /// Interpret 'this' as a timestamp in UTC and convert to unix time. /// Returns false if the conversion failed ('unix_time' will be undefined), otherwise