formatter

tarmstrong Sun, 30 Sep 2018 00:26:32 -0700

http://git-wip-us.apache.org/repos/asf/impala/blob/cb493716/be/src/runtime/timestamp-parse-util.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/timestamp-parse-util.cc b/be/src/runtime/timestamp-parse-util.cc
index 3063728..27568a3 100644
--- a/be/src/runtime/timestamp-parse-util.cc
+++ b/be/src/runtime/timestamp-parse-util.cc
@@ -17,20 +17,12 @@
 
 #include "runtime/timestamp-parse-util.h"
 
-#include <algorithm>
-
-#include <boost/assign/list_of.hpp>
-#include <boost/date_time/gregorian/gregorian.hpp>
-#include <boost/unordered_map.hpp>
-
 #include "runtime/string-value.inline.h"
 #include "runtime/timestamp-value.h"
 #include "util/string-parser.h"
 
 #include "common/names.h"
 
-namespace assign = boost::assign;
-using boost::unordered_map;
 using boost::gregorian::date;
 using boost::gregorian::date_duration;
 using boost::gregorian::gregorian_calendar;
@@ -41,332 +33,7 @@ using boost::posix_time::time_duration;
 
 namespace impala {
 
-/// Stores the results of parsing a date/time string.
-struct DateTimeParseResult {
-  int year;
-  int month;
-  int day;
-  int hour;
-  int minute;
-  int second;
-  int32_t fraction;
-  boost::posix_time::time_duration tz_offset;
-  // Whether to realign the year for 2-digit year format
-  bool realign_year;
-
-  DateTimeParseResult()
-    : year(0),
-      month(0),
-      day(0),
-      hour(0),
-      minute(0),
-      second(0),
-      fraction(0),
-      tz_offset(0,0,0,0),
-      realign_year(false) {
-  }
-};
-
-void DateTimeFormatContext::SetCenturyBreak(const TimestampValue &now) {
-  auto& now_date = now.date();
-  // If the century break is at an invalid 02/29, set it to 02/28 for 
consistency with
-  // Hive.
-  if (now_date.month() == 2 && now_date.day() == 29 &&
-      !gregorian_calendar::is_leap_year(now_date.year() - 80)) {
-    century_break_ptime = ptime(date(now_date.year() - 80, 2, 28), now.time());
-  } else {
-    century_break_ptime = ptime(
-        date(now_date.year() - 80, now_date.month(), now_date.day()), 
now.time());
-  }
-}
-
-bool TimestampParser::initialized_ = false;
-
-/// Lazily initialized pseudo-constant hashmap for mapping month names to an 
index.
-static unordered_map<StringValue, int> REV_MONTH_INDEX;
-
-const int TimestampParser::DEFAULT_DATE_FMT_LEN;
-const int TimestampParser::DEFAULT_TIME_FMT_LEN;
-const int TimestampParser::DEFAULT_TIME_FRAC_FMT_LEN;
-const int TimestampParser::DEFAULT_SHORT_DATE_TIME_FMT_LEN;
-const int TimestampParser::DEFAULT_DATE_TIME_FMT_LEN;
-
-DateTimeFormatContext TimestampParser::DEFAULT_SHORT_DATE_TIME_CTX;
-DateTimeFormatContext TimestampParser::DEFAULT_SHORT_ISO_DATE_TIME_CTX;
-DateTimeFormatContext TimestampParser::DEFAULT_DATE_CTX;
-DateTimeFormatContext TimestampParser::DEFAULT_TIME_CTX;
-DateTimeFormatContext TimestampParser::DEFAULT_DATE_TIME_CTX[10];
-DateTimeFormatContext TimestampParser::DEFAULT_ISO_DATE_TIME_CTX[10];
-DateTimeFormatContext TimestampParser::DEFAULT_TIME_FRAC_CTX[10];
-
-void TimestampParser::Init() {
-  if (TimestampParser::initialized_) return;
-  // This needs to be lazily init'd because a StringValues hash function will 
be invoked
-  // for each entry that's placed in the map. The hash function expects that
-  // CpuInfo::Init() has already been called.
-  REV_MONTH_INDEX = boost::unordered_map<StringValue, int>({
-      {StringValue("jan"), 1}, {StringValue("feb"), 2},
-      {StringValue("mar"), 3}, {StringValue("apr"), 4},
-      {StringValue("may"), 5}, {StringValue("jun"), 6},
-      {StringValue("jul"), 7}, {StringValue("aug"), 8},
-      {StringValue("sep"), 9}, {StringValue("oct"), 10},
-      {StringValue("nov"), 11}, {StringValue("dec"), 12}
-  });
-
-  // Setup the default date/time context yyyy-MM-dd HH:mm:ss.SSSSSSSSS
-  const char* DATE_TIME_CTX_FMT = "yyyy-MM-dd HH:mm:ss.SSSSSSSSS";
-  const int FRACTIONAL_MAX_LEN = 9;
-  for (int i = FRACTIONAL_MAX_LEN; i >= 0; --i) {
-    DEFAULT_DATE_TIME_CTX[i].Reset(DATE_TIME_CTX_FMT,
-        DEFAULT_DATE_TIME_FMT_LEN - (FRACTIONAL_MAX_LEN - i));
-    ParseFormatTokens(&DEFAULT_DATE_TIME_CTX[i]);
-  }
-
-  // Setup the default ISO date/time context yyyy-MM-ddTHH:mm:ss.SSSSSSSSS
-  for (int i = FRACTIONAL_MAX_LEN; i >= 0; --i) {
-    DEFAULT_ISO_DATE_TIME_CTX[i].Reset("yyyy-MM-ddTHH:mm:ss.SSSSSSSSS",
-        DEFAULT_DATE_TIME_FMT_LEN - (FRACTIONAL_MAX_LEN - i));
-    ParseFormatTokens(&DEFAULT_ISO_DATE_TIME_CTX[i]);
-  }
-
-  // Setup the short default date/time context yyyy-MM-dd HH:mm:ss
-  DEFAULT_SHORT_DATE_TIME_CTX.Reset("yyyy-MM-dd HH:mm:ss",
-      DEFAULT_SHORT_DATE_TIME_FMT_LEN);
-  ParseFormatTokens(&DEFAULT_SHORT_DATE_TIME_CTX);
-
-  // Setup the short default ISO date/time context yyyy-MM-ddTHH:mm:ss
-  DEFAULT_SHORT_ISO_DATE_TIME_CTX.Reset("yyyy-MM-ddTHH:mm:ss",
-      DEFAULT_SHORT_DATE_TIME_FMT_LEN);
-  ParseFormatTokens(&DEFAULT_SHORT_ISO_DATE_TIME_CTX);
-
-  // Setup the default short date context yyyy-MM-dd
-  DEFAULT_DATE_CTX.Reset("yyyy-MM-dd", DEFAULT_DATE_FMT_LEN);
-  ParseFormatTokens(&DEFAULT_DATE_CTX);
-
-  // Setup the default short time context HH:mm:ss
-  DEFAULT_TIME_CTX.Reset("HH:mm:ss", DEFAULT_TIME_FMT_LEN);
-  ParseFormatTokens(&DEFAULT_TIME_CTX);
-
-  // Setup the default short time context with fractional seconds 
HH:mm:ss.SSSSSSSSS
-  for (int i = FRACTIONAL_MAX_LEN; i >= 0; --i) {
-    DEFAULT_TIME_FRAC_CTX[i].Reset(DATE_TIME_CTX_FMT + 11,
-        DEFAULT_TIME_FRAC_FMT_LEN - (FRACTIONAL_MAX_LEN - i));
-    ParseFormatTokens(&DEFAULT_TIME_FRAC_CTX[i]);
-  }
-  // Flag that the parser is ready.
-  TimestampParser::initialized_ = true;
-}
-
-bool TimestampParser::ParseFormatTokens(DateTimeFormatContext* dt_ctx) {
-  DCHECK(dt_ctx != NULL);
-  DCHECK(dt_ctx->fmt != NULL);
-  DCHECK(dt_ctx->fmt_len > 0);
-  DCHECK(dt_ctx->toks.size() == 0);
-  const char* str_begin = dt_ctx->fmt;
-  const char* str_end = str_begin + dt_ctx->fmt_len;
-  const char* str = str_begin;
-  // Parse the tokens from the format string
-  while (str < str_end) {
-    if (isdigit(*str)) return false;
-    // Ignore T|Z|non aA-zZ chars but track them as separators (required for 
printing).
-    if ((*str == 'T') || (*str == 'Z') || (!isalpha(*str))) {
-      if (dt_ctx->has_time_toks && IsValidTZOffset(str, str_end)) {
-        // TZ offset must come at the end of the format.
-        dt_ctx->toks.push_back(DateTimeFormatToken(TZ_OFFSET, str - str_begin,
-            str_end - str, str));
-        break;
-      }
-      dt_ctx->toks.push_back(DateTimeFormatToken(SEPARATOR, str - str_begin, 
1, str));
-      ++str;
-      continue;
-    }
-    // Not a separator, verify that the previous token is either a separator 
or has
-    // length >1, i.e., it is not a variable length token.
-    if (!dt_ctx->toks.empty()) {
-      const DateTimeFormatToken& prev = dt_ctx->toks.back();
-      if (UNLIKELY(prev.type != SEPARATOR && prev.len == 1)) return false;
-    }
-    DateTimeFormatTokenType tok_type = UNKNOWN;
-    switch (*str) {
-      case 'y': tok_type = YEAR; break;
-      case 'M': tok_type = MONTH_IN_YEAR; break;
-      case 'd': tok_type = DAY_IN_MONTH; break;
-      case 'H': tok_type = HOUR_IN_DAY; break;
-      case 'm': tok_type = MINUTE_IN_HOUR; break;
-      case 's': tok_type = SECOND_IN_MINUTE; break;
-      case 'S': tok_type = FRACTION; break;
-      // Error on aA-zZ reserved characters that are not used yet.
-      default: return false;
-    }
-    dt_ctx->has_date_toks |= tok_type < HOUR_IN_DAY;
-    dt_ctx->has_time_toks |= tok_type >= HOUR_IN_DAY;
-    // Get the token group length
-    int tok_len = 1;
-    char tok_chr = *str;
-    const char* curr_tok_chr = str + 1;
-    while (curr_tok_chr < str_end) {
-      if (*curr_tok_chr != tok_chr) break;
-      ++tok_len;
-      ++curr_tok_chr;
-    }
-    if (tok_type == MONTH_IN_YEAR) {
-      if (UNLIKELY(tok_len > 3)) return false;
-      if (tok_len == 3) tok_type = MONTH_IN_YEAR_SLT;
-    }
-    // In an output scenario, fmt_out_len is used to determine the print 
buffer size.
-    // If the format uses short token groups e.g. yyyy-MM-d, there must to be 
enough
-    // room in the buffer for wider values e.g. 2013-12-16.
-    if (tok_len == 1) ++dt_ctx->fmt_out_len;
-    DateTimeFormatToken tok(tok_type, str - str_begin, tok_len, str);
-    str += tok.len;
-    dt_ctx->toks.push_back(tok);
-  }
-  return dt_ctx->has_date_toks || dt_ctx->has_time_toks;
-}
-
-const char* TimestampParser::ParseDigitToken(const char* str, const char* 
str_end) {
-  const char* tok_end = str;
-  while (tok_end < str_end) {
-    if (!isdigit(*tok_end)) return tok_end;
-    ++tok_end;
-  }
-  return tok_end;
-}
-
-const char* TimestampParser::ParseSeparatorToken(
-    const char* str, const char* str_end, const char sep) {
-  const char* tok_end = str;
-  while (tok_end < str_end) {
-    if (*tok_end != sep) return tok_end;
-    ++tok_end;
-  }
-  return tok_end;
-}
-
-bool TimestampParser::ParseFormatTokensByStr(DateTimeFormatContext* dt_ctx) {
-  DCHECK(dt_ctx != NULL);
-  DCHECK(dt_ctx->fmt != NULL);
-  DCHECK_GT(dt_ctx->fmt_len, 0);
-  DCHECK_EQ(dt_ctx->toks.size(), 0);
-  const char* str_begin = dt_ctx->fmt;
-  const char* str_end = str_begin + dt_ctx->fmt_len;
-  const char* str = str_begin;
-  const char* tok_end;
-
-  // Parse the 4-digit year
-  tok_end = ParseDigitToken(str, str_end);
-  if (tok_end - str == 4) {
-    dt_ctx->toks.push_back(
-        DateTimeFormatToken(YEAR, str - str_begin, tok_end - str, str));
-    str = tok_end;
-
-    // Check for the date separator '-'
-    tok_end = ParseSeparatorToken(str, str_end, '-');
-    if (tok_end - str != 1) return false;
-    dt_ctx->toks.push_back(
-        DateTimeFormatToken(SEPARATOR, str - str_begin, tok_end - str, str));
-    str = tok_end;
-
-    // Parse the 1 or 2 digit month.
-    tok_end = ParseDigitToken(str, str_end);
-    if (tok_end - str != 1 && tok_end - str != 2) return false;
-    dt_ctx->toks.push_back(
-        DateTimeFormatToken(MONTH_IN_YEAR, str - str_begin, tok_end - str, 
str));
-    str = tok_end;
-
-    // Check for the date separator '-'
-    tok_end = ParseSeparatorToken(str, str_end, '-');
-    if (tok_end - str != 1) return false;
-    dt_ctx->toks.push_back(
-        DateTimeFormatToken(SEPARATOR, str - str_begin, tok_end - str, str));
-    str = tok_end;
-
-    // Parse the 1 or 2 digit day in month
-    tok_end = ParseDigitToken(str, str_end);
-    if (tok_end - str != 1 && tok_end - str != 2) return false;
-    dt_ctx->toks.push_back(
-        DateTimeFormatToken(DAY_IN_MONTH, str - str_begin, tok_end - str, 
str));
-    str = tok_end;
-    dt_ctx->has_date_toks = true;
-
-    // If the string ends here, we only have a date component
-    if (str == str_end) return true;
-
-    // Check for the space between date and time component
-    if (*str != ' ' && *str != 'T') return false;
-    char sep = *str;
-    tok_end = ParseSeparatorToken(str, str_end, sep);
-    if (tok_end - str < 1) return false;
-    // IMPALA-6641: Multiple spaces are okay, 'T' separator must be single
-    if (sep == 'T' && tok_end - str > 1) return false;
-    dt_ctx->toks.push_back(
-        DateTimeFormatToken(SEPARATOR, str - str_begin, tok_end - str, str));
-    str = tok_end;
-
-    // Invalid format if date-time separator is not followed by more digits
-    if (str > str_end) return false;
-    tok_end = ParseDigitToken(str, str_end);
-  }
-
-  // Parse the 1 or 2 digit hour
-  if (tok_end - str != 1 && tok_end - str != 2) return false;
-  dt_ctx->toks.push_back(
-      DateTimeFormatToken(HOUR_IN_DAY, str - str_begin, tok_end - str, str));
-  str = tok_end;
-
-  // Check for the time component separator ':'
-  tok_end = ParseSeparatorToken(str, str_end, ':');
-  if (tok_end - str != 1) return false;
-  dt_ctx->toks.push_back(
-      DateTimeFormatToken(SEPARATOR, str - str_begin, tok_end - str, str));
-  str = tok_end;
-
-  // Parse the 1 or 2 digit minute
-  tok_end = ParseDigitToken(str, str_end);
-  if (tok_end - str != 1 && tok_end - str != 2) return false;
-  dt_ctx->toks.push_back(
-      DateTimeFormatToken(MINUTE_IN_HOUR, str - str_begin, tok_end - str, 
str));
-  str = tok_end;
-
-  // Check for the time component separator ':'
-  tok_end = ParseSeparatorToken(str, str_end, ':');
-  if (tok_end - str != 1) return false;
-  dt_ctx->toks.push_back(
-      DateTimeFormatToken(SEPARATOR, str - str_begin, tok_end - str, str));
-  str = tok_end;
-
-  // Parse the 1 or 2 digit second
-  tok_end = ParseDigitToken(str, str_end);
-  if (tok_end - str != 1 && tok_end - str != 2) return false;
-  dt_ctx->toks.push_back(
-      DateTimeFormatToken(SECOND_IN_MINUTE, str - str_begin, tok_end - str, 
str));
-  str = tok_end;
-  dt_ctx->has_time_toks = true;
-
-  // There is more to parse, there maybe a fractional component.
-  if (str < str_end) {
-    tok_end = ParseSeparatorToken(str, str_end, '.');
-    if (tok_end - str != 1) return false;
-    dt_ctx->toks.push_back(
-        DateTimeFormatToken(SEPARATOR, str - str_begin, tok_end - str, str));
-    str = tok_end;
-
-    // Invalid format when there is no fractional component following '.'
-    if (str > str_end) return false;
-
-    // Parse the fractional component.
-    // Like the non-lazy path, this will parse up to 9 fractional digits
-    tok_end = ParseDigitToken(str, str_end);
-    int num_digits = std::min<int>(9, tok_end - str);
-    dt_ctx->toks.push_back(
-        DateTimeFormatToken(FRACTION, str - str_begin, num_digits, str));
-    str = tok_end;
-
-    // Invalid format if there is more to parse after the fractional component
-    if (str < str_end) return false;
-  }
-  return true;
-}
+using namespace datetime_parse_util;
 
 // Helper for TimestampParse::Parse to produce return value and set output parameters
 // when parsing fails. 'd' and 't' must be non-NULL.
@@ -378,7 +45,7 @@ static bool IndicateTimestampParseFailure(date* d, time_duration* t) {
 
 bool TimestampParser::Parse(const char* str, int len, boost::gregorian::date* d,
     boost::posix_time::time_duration* t) {
-  DCHECK(TimestampParser::initialized_);
+  DCHECK(IsParseCtxInitialized());
   DCHECK(d != nullptr);
   DCHECK(t != nullptr);
   if (UNLIKELY(str == nullptr)) return IndicateTimestampParseFailure(d, t);
@@ -482,8 +149,7 @@ bool TimestampParser::Parse(const char* str, int len, boost::gregorian::date* d,
   if (dt_ctx != nullptr) return Parse(str, default_fmt_len, *dt_ctx, d, t);
   // Generating context lazily as a fall back if default formats fail.
   // ParseFormatTokenByStr() does not require a template format string.
-  DateTimeFormatContext lazy_ctx;
-  lazy_ctx.Reset(str, trimmed_len);
+  DateTimeFormatContext lazy_ctx(str, trimmed_len);
   if (!ParseFormatTokensByStr(&lazy_ctx)) return IndicateTimestampParseFailure(d, t);
   dt_ctx = &lazy_ctx;
   return Parse(str, trimmed_len, *dt_ctx, d, t);
@@ -517,14 +183,13 @@ date TimestampParser::RealignYear(const DateTimeParseResult& dt_result,
 
 bool TimestampParser::Parse(const char* str, int len, const DateTimeFormatContext& dt_ctx,
     date* d, time_duration* t) {
-  DCHECK(TimestampParser::initialized_);
+  DCHECK(IsParseCtxInitialized());
   DCHECK(dt_ctx.toks.size() > 0);
   DCHECK(d != NULL);
   DCHECK(t != NULL);
   DateTimeParseResult dt_result;
   int day_offset = 0;
-  if (UNLIKELY(str == NULL || len <= 0 ||
-          !ParseDateTime(str, len, dt_ctx, &dt_result))) {
+  if (UNLIKELY(str == NULL || len <= 0 || !ParseDateTime(str, len, dt_ctx, &dt_result))) {
     return IndicateTimestampParseFailure(d, t);
   }
   if (dt_ctx.has_time_toks) {
@@ -572,7 +237,7 @@ bool TimestampParser::Parse(const char* str, int len, const DateTimeFormatContex
 int TimestampParser::Format(const DateTimeFormatContext& dt_ctx,
     const boost::gregorian::date& d, const boost::posix_time::time_duration& t,
     int len, char* buff) {
-  DCHECK(TimestampParser::initialized_);
+  DCHECK(IsParseCtxInitialized());
   DCHECK(dt_ctx.toks.size() > 0);
   DCHECK(len > dt_ctx.fmt_out_len);
   DCHECK(buff != NULL);
@@ -626,151 +291,4 @@ int TimestampParser::Format(const DateTimeFormatContext& dt_ctx,
   return str - buff;
 }
 
-bool TimestampParser::ParseDateTime(const char* str, int str_len,
-    const DateTimeFormatContext& dt_ctx, DateTimeParseResult* dt_result) {
-  DCHECK(dt_ctx.fmt_len > 0);
-  DCHECK(dt_ctx.toks.size() > 0);
-  DCHECK(dt_result != NULL);
-  if (str_len <= 0 || str_len < dt_ctx.fmt_len || str == NULL) return false;
-  StringParser::ParseResult status;
-  // Keep track of the number of characters we need to shift token positions 
by.
-  // Variable-length tokens will result in values > 0;
-  int shift_len = 0;
-  for (const DateTimeFormatToken& tok: dt_ctx.toks) {
-    const char* tok_val = str + tok.pos + shift_len;
-    if (tok.type == SEPARATOR) {
-      if (UNLIKELY(*tok_val != *tok.val)) return false;
-      continue;
-    }
-    int tok_len = tok.len;
-    const char* str_end = str + str_len;
-    // In case of single-character tokens we scan ahead to the next separator.
-    if (UNLIKELY(tok_len == 1)) {
-      while ((tok_val + tok_len < str_end) && isdigit(*(tok_val + tok_len))) {
-        ++tok_len;
-        ++shift_len;
-      }
-    }
-    switch (tok.type) {
-      case YEAR: {
-        dt_result->year = StringParser::StringToInt<int>(tok_val, tok_len, 
&status);
-        if (UNLIKELY(StringParser::PARSE_SUCCESS != status)) return false;
-        if (UNLIKELY(dt_result->year < 0 || dt_result->year > 9999)) return 
false;
-        // Year in "Y" and "YY" format should be in the interval
-        // [current time - 80 years, current time + 20 years)
-        if (tok_len <= 2) dt_result->realign_year = true;
-        break;
-      }
-      case MONTH_IN_YEAR: {
-        dt_result->month = StringParser::StringToInt<int>(tok_val, tok_len, 
&status);
-        if (UNLIKELY(StringParser::PARSE_SUCCESS != status)) return false;
-        if (UNLIKELY(dt_result->month < 1 || dt_result->month > 12)) return 
false;
-        break;
-      }
-      case MONTH_IN_YEAR_SLT: {
-        char raw_buff[tok.len];
-        std::transform(tok_val, tok_val + tok.len, raw_buff, ::tolower);
-        StringValue buff(raw_buff, tok.len);
-        boost::unordered_map<StringValue, int>::const_iterator iter =
-            REV_MONTH_INDEX.find(buff);
-        if (UNLIKELY(iter == REV_MONTH_INDEX.end())) return false;
-        dt_result->month = iter->second;
-        break;
-      }
-      case DAY_IN_MONTH: {
-        dt_result->day = StringParser::StringToInt<int>(tok_val, tok_len, 
&status);
-        if (UNLIKELY(StringParser::PARSE_SUCCESS != status)) return false;
-        // TODO: Validate that the value of day is correct for the given month.
-        if (UNLIKELY(dt_result->day < 1 || dt_result->day > 31)) return false;
-        break;
-      }
-      case HOUR_IN_DAY: {
-        dt_result->hour = StringParser::StringToInt<int>(tok_val, tok_len, 
&status);
-        if (UNLIKELY(StringParser::PARSE_SUCCESS != status)) return false;
-        if (UNLIKELY(dt_result->hour < 0 || dt_result->hour > 23)) return 
false;
-        break;
-      }
-      case MINUTE_IN_HOUR: {
-        dt_result->minute = StringParser::StringToInt<int>(tok_val, tok_len, 
&status);
-        if (UNLIKELY(StringParser::PARSE_SUCCESS != status)) return false;
-        if (UNLIKELY(dt_result->minute < 0 || dt_result->minute > 59)) return 
false;
-        break;
-      }
-      case SECOND_IN_MINUTE: {
-        dt_result->second = StringParser::StringToInt<int>(tok_val, tok_len, 
&status);
-        if (UNLIKELY(StringParser::PARSE_SUCCESS != status)) return false;
-        if (UNLIKELY(dt_result->second < 0 || dt_result->second > 59)) return 
false;
-        break;
-      }
-      case FRACTION: {
-        dt_result->fraction =
-            StringParser::StringToInt<int32_t>(tok_val, tok_len, &status);
-        if (UNLIKELY(StringParser::PARSE_SUCCESS != status)) return false;
-        // A user may specify a time of 04:30:22.1238, the parser will return 
1238 for
-        // the fractional portion. This does not represent the intended value 
of
-        // 123800000, therefore the number must be scaled up.
-        for (int i = tok_len; i < 9; ++i) dt_result->fraction *= 10;
-        break;
-      }
-      case TZ_OFFSET: {
-        if (tok_val[0] != '+' && tok_val[0] != '-') return false;
-        int sign = tok_val[0] == '-' ? -1 : 1;
-        int minute = 0;
-        int hour = StringParser::StringToInt<int>(tok_val + 1, 2, &status);
-        if (UNLIKELY(StringParser::PARSE_SUCCESS != status ||
-            hour < 0 || hour > 23)) {
-          return false;
-        }
-        switch (tok_len) {
-          case 6: {
-            // +hh:mm
-            minute = StringParser::StringToInt<int>(tok_val + 4, 2, &status);
-            break;
-          }
-          case 5: {
-            // +hh:mm
-            minute = StringParser::StringToInt<int>(tok_val + 3, 2, &status);
-            break;
-          }
-          case 3: {
-            // +hh
-            break;
-          }
-          default: {
-            // Invalid timezone offset length.
-            return false;
-          }
-        }
-        if (UNLIKELY(StringParser::PARSE_SUCCESS != status ||
-            minute < 0 || minute > 59)) {
-          return false;
-        }
-        dt_result->tz_offset = boost::posix_time::time_duration(sign * hour,
-            sign * minute, 0, 0);
-        break;
-      }
-      default: DCHECK(false) << "Unknown date/time format token";
-    }
-  }
-  return true;
-}
-
-bool TimestampParser::IsValidTZOffset(const char* str_begin, const char* 
str_end) {
-  if (*str_begin == '+' || *str_begin == '-') {
-    ++str_begin;
-    switch(str_end - str_begin) {
-      case 5:   // hh:mm
-        return strncmp(str_begin, "hh:mm", 5) == 0;
-      case 4:   // hhmm
-        return strncmp(str_begin, "hhmm", 4) == 0;
-      case 2:   // hh
-        return strncmp(str_begin, "hh", 2) == 0;
-      default:
-        break;
-    }
-  }
-  return false;
-}
-
-
 }


http://git-wip-us.apache.org/repos/asf/impala/blob/cb493716/be/src/runtime/timestamp-parse-util.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/timestamp-parse-util.h b/be/src/runtime/timestamp-parse-util.h
index bccf0b7..ca309c3 100644
--- a/be/src/runtime/timestamp-parse-util.h
+++ b/be/src/runtime/timestamp-parse-util.h
@@ -15,17 +15,14 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#ifndef IMPALA_RUNTIME_TIMESTAMP_PARSE_UTIL_H
-#define IMPALA_RUNTIME_TIMESTAMP_PARSE_UTIL_H
 
-#include <cstddef>
-#include <vector>
-#include <boost/date_time/posix_time/ptime.hpp>
+#pragma once
+
+#include <boost/date_time/gregorian/gregorian.hpp>
+
+#include "runtime/datetime-parse-util.h"
 
 namespace boost {
-  namespace gregorian {
-    class date;
-  }
   namespace posix_time {
     class time_duration;
   }
@@ -33,179 +30,9 @@ namespace boost {
 
 namespace impala {
 
-struct DateTimeParseResult;
-class TimestampValue;
-
-/// Add support for dealing with custom date/time formats in Impala. The following
-/// date/time tokens are supported:
-///   y – Year
-///   M – Month
-///   d – Day
-///   H – Hour
-///   m – Minute
-///   s – second
-///   S – Fractional second
-///
-///   TimeZone offset formats (Must be at the end of format string):
-///   +/-hh:mm
-///   +/-hhmm
-///   +/-hh
-///
-///
-/// The token names and usage have been modeled after the SimpleDateFormat class used in
-/// Java, with only the above list of tokens being supported. All fields will consume
-/// variable length inputs when parsing an input string and must therefore use separators
-/// to specify the boundaries of the fields, with the exception of TimeZone values, which
-/// have to be of fixed width. Repeating tokens can be used to specify fields of exact
-/// witdh, e.g. in yy-MM both fields must be of exactly length two. When using fixed width
-/// fields values must be zero-padded and output values will be zero padded during
-/// formatting. There is one exception to this: a month field of length 3 will specify
-/// literal month names instead of zero padding, i.e., yyyy-MMM-dd will parse from and
-/// format to strings like 2013-Nov-21. When using fields of fixed width the separators
-/// can be omitted.
-///
-///
-/// Formatting character groups can appear in any order along with any separators
-/// except TimeZone offset.
-/// e.g.
-///   yyyy/MM/dd
-///   dd-MMM-yy
-///   (dd)(MM)(yyyy) HH:mm:ss
-///   yyyy-MM-dd HH:mm:ss+hh:mm
-/// ..etc..
-///
-/// The following features are not supported:
-///   Long literal months e.g. MMMM
-///   Nested strings e.g. “Year: ” yyyy “Month: ” mm “Day: ” dd
-///   Lazy formatting
-
-/// Used to indicate the type of a date/time format token group.
-enum DateTimeFormatTokenType {
-  UNKNOWN = 0,
-  SEPARATOR,
-  YEAR,
-  MONTH_IN_YEAR,
-  /// Indicates a short literal month e.g. MMM (Aug). Note that the month name 
is case
-  /// insensitive for an input scenario and printed in camel case for an 
output scenario.
-  MONTH_IN_YEAR_SLT,
-  DAY_IN_MONTH,
-  HOUR_IN_DAY,
-  MINUTE_IN_HOUR,
-  SECOND_IN_MINUTE,
-  /// Indicates fractional seconds e.g.14:52:36.2334. By default this provides 
nanosecond
-  /// resolution.
-  FRACTION,
-  TZ_OFFSET,
-};
-
-/// Used to store metadata about a token group within a date/time format.
-struct DateTimeFormatToken {
-  /// Indicates the type of date/time format token e.g. year
-  DateTimeFormatTokenType type;
-  /// The position of where this token group is supposed to start in the 
date/time string
-  /// to be parsed
-  int pos;
-  /// The length of the token group
-  int len;
-  /// A pointer to the date/time format string that is positioned at the start 
of this
-  /// token group
-  const char* val;
-
-  DateTimeFormatToken(DateTimeFormatTokenType type, int pos, int len, const 
char* val)
-    : type(type),
-      pos(pos),
-      len(len),
-      val(val) {
-  }
-};
-
-/// This structure is used to hold metadata for a date/time format. Each token 
group
-/// within the raw format is parsed and placed in this structure along with 
other high
-/// level information e.g. if the format contains date and/or time tokens. 
This context
-/// is used during date/time parsing.
-struct DateTimeFormatContext {
-  const char* fmt;
-  int fmt_len;
-  /// Holds the expanded length of fmt_len plus any required space when short 
format
-  /// tokens are used. The output buffer size is driven from this value. For 
example, in
-  /// an output scenario a user may provide the format yyyy-M-d, if the day 
and month
-  /// equates to 12, 21 then extra space is needed in the buffer to hold the 
values. The
-  /// short format type e.g. yyyy-M-d is valid where no zero padding is 
required on single
-  /// digits.
-  int fmt_out_len;
-  std::vector<DateTimeFormatToken> toks;
-  bool has_date_toks;
-  bool has_time_toks;
-  /// Current time - 80 years to determine the actual year when
-  /// parsing 1 or 2-digit year token.
-  boost::posix_time::ptime century_break_ptime;
-
-  DateTimeFormatContext() {
-    Reset(NULL, 0);
-  }
-
-  DateTimeFormatContext(const char* fmt, int fmt_len) {
-    Reset(fmt, fmt_len);
-  }
-
-  /// Set the century break when parsing 1 or 2-digit year format.
-  /// When parsing 1 or 2-digit year, the year should be in the interval
-  /// [now - 80 years, now + 20 years), according to Hive.
-  void SetCenturyBreak(const TimestampValue &now);
-
-  void Reset(const char* fmt, int fmt_len) {
-    this->fmt = fmt;
-    this->fmt_len = fmt_len;
-    this->fmt_out_len = fmt_len;
-    this->has_date_toks = false;
-    this->has_time_toks = false;
-    this->toks.clear();
-    this->century_break_ptime = boost::posix_time::not_a_date_time;
-  }
-};
-
 /// Used for parsing both default and custom formatted timestamp values.
 class TimestampParser {
  public:
-  /// Initializes the static parser context which includes default date/time 
formats and
-  /// lookup tables. This *must* be called before any of the Parse* related 
functions can
-  /// be used.
-  static void Init();
-
-  /// Parse the date/time format into tokens and place them in the context.
-  /// dt_ctx -- date/time format context
-  /// Return true if the parse was successful.
-  static bool ParseFormatTokens(DateTimeFormatContext* dt_ctx);
-
-  // Parse out the next digit token from the date/time string by checking for 
contiguous
-  // digit characters and return a pointer to the end of that token.
-  // str -- pointer to the string to be parsed
-  // str_end -- the pointer to the end of the string to be parsed
-  // Returns the pointer within the string to the end of the valid digit token.
-  static const char* ParseDigitToken(const char* str, const char* str_end);
-
-  // Parse out the next separator token from the date/time string against an 
expected
-  // character.
-  // str -- pointer to the string to be parsed
-  // str_end -- the pointer to the end of the string to be parsed
-  // sep -- the separator char to compare the token to
-  // Returns the pointer within the string to the end of the valid separator 
token.
-  static const char* ParseSeparatorToken(
-      const char* str, const char* str_end, const char sep);
-
-  /// Parse the date/time string to generate the DateTimeFormatToken required 
by
-  /// DateTimeFormatContext. Similar to ParseFormatTokens() this function will 
take the
-  /// string and length, then heuristically determine whether the value 
contains date
-  //  tokens, time tokens, or both. Unlike ParseFormatTokens, it does not 
require the
-  //  template format string.
-  /// str -- valid pointer to the string to parse
-  /// len -- length of the string to parse (must be > 0)
-  /// dt_ctx -- date/time format context (must contain valid tokens)
-  /// d -- the date value where the results of the parsing will be placed
-  /// t -- the time value where the results of the parsing will be placed
-  /// Returns true if the date/time was successfully parsed.
-  static bool ParseFormatTokensByStr(DateTimeFormatContext* dt_ctx);
-
   /// Parse a default date/time string. The default timestamp format is:
   /// yyyy-MM-dd HH:mm:ss.SSSSSSSSS or yyyy-MM-ddTHH:mm:ss.SSSSSSSSS. Either 
just the
   /// date or just the time may be specified. All components are required in 
either the
@@ -214,7 +41,6 @@ class TimestampParser {
   /// date will be set to invalid.
   /// str -- valid pointer to the string to parse
   /// len -- length of the string to parse (must be > 0)
-  /// dt_ctx -- date/time format context (must contain valid tokens)
   /// d -- the date value where the results of the parsing will be placed
   /// t -- the time value where the results of the parsing will be placed
   /// Returns true if the date/time was successfully parsed.
@@ -226,11 +52,13 @@ class TimestampParser {
   /// to 00:00:00. In the case of just a time, the date will be set to invalid.
   /// str -- valid pointer to the string to parse
   /// len -- length of the string to parse (must be > 0)
+  /// dt_ctx -- date/time format context (must contain valid tokens)
   /// d -- the date value where the results of the parsing will be placed
   /// t -- the time value where the results of the parsing will be placed
   /// Returns true if the date/time was successfully parsed.
-  static bool Parse(const char* str, int len, const DateTimeFormatContext& 
dt_ctx,
-      boost::gregorian::date* d, boost::posix_time::time_duration* t);
+  static bool Parse(const char* str, int len,
+      const datetime_parse_util::DateTimeFormatContext& dt_ctx, 
boost::gregorian::date* d,
+      boost::posix_time::time_duration* t);
 
   /// Format the date/time values using the given format context. Note that a 
string
   /// terminator will be appended to the string.
@@ -240,50 +68,21 @@ class TimestampParser {
   /// len -- the output buffer length (should be at least dt_ctx.fmt_exp_len + 1)
   /// buff -- the output string buffer (must be large enough to hold value)
   /// Return the number of characters copied in to the buffer (excluding terminator).
-  static int Format(const DateTimeFormatContext& dt_ctx,
-      const boost::gregorian::date& d, const boost::posix_time::time_duration& t,
-      int len, char* buff);
+  static int Format(const datetime_parse_util::DateTimeFormatContext& dt_ctx,
+      const boost::gregorian::date& d, const boost::posix_time::time_duration& t, int len,
+      char* buff);
 
  private:
-  static bool ParseDateTime(const char* str, int str_len,
-      const DateTimeFormatContext& dt_ctx, DateTimeParseResult* dt_result);
-
   /// Helper function finding the correct century for 1 or 2 digit year according to
   /// century break. Throws bad_year, bad_day_of_month, or bad_day_month if the date is
   /// invalid. The century break behavior is copied from Java SimpleDateFormat in order to
   /// be consistent with Hive.
   /// In SimpleDateFormat, the century for 2-digit-year breaks at current_time - 80 years.
   /// https://docs.oracle.com/javase/6/docs/api/java/text/SimpleDateFormat.html
-  static boost::gregorian::date RealignYear(const DateTimeParseResult& dt_result,
-      const DateTimeFormatContext& dt_ctx, int day_offset,
+  static boost::gregorian::date RealignYear(
+      const datetime_parse_util::DateTimeParseResult& dt_result,
+      const datetime_parse_util::DateTimeFormatContext& dt_ctx, int day_offset,
       const boost::posix_time::time_duration& t);
-
-  /// Check if the string is a TimeZone offset token.
-  /// Valid offset token format are 'hh:mm', 'hhmm', 'hh'.
-  static bool IsValidTZOffset(const char* str_begin, const char* str_end);
-
-  /// Constants to hold default format lengths.
-  static const int DEFAULT_DATE_FMT_LEN = 10;
-  static const int DEFAULT_TIME_FMT_LEN = 8;
-  static const int DEFAULT_TIME_FRAC_FMT_LEN = 18;
-  static const int DEFAULT_SHORT_DATE_TIME_FMT_LEN = 19;
-  static const int DEFAULT_DATE_TIME_FMT_LEN = 29;
-
-  /// Used to indicate if the parsing state has been initialized.
-  static bool initialized_;
-  /// Pseudo-constant default date/time contexts. Backwards compatibility is provided on
-  /// variable length fractional components by defining a format context for each expected
-  /// length (0 - 9). This logic will be refactored when the parser supports lazy token
-  /// groups.
-  static DateTimeFormatContext DEFAULT_SHORT_DATE_TIME_CTX;
-  static DateTimeFormatContext DEFAULT_SHORT_ISO_DATE_TIME_CTX;
-  static DateTimeFormatContext DEFAULT_DATE_CTX;
-  static DateTimeFormatContext DEFAULT_TIME_CTX;
-  static DateTimeFormatContext DEFAULT_DATE_TIME_CTX[10];
-  static DateTimeFormatContext DEFAULT_ISO_DATE_TIME_CTX[10];
-  static DateTimeFormatContext DEFAULT_TIME_FRAC_CTX[10];
 };
 
 }
-
-#endif

http://git-wip-us.apache.org/repos/asf/impala/blob/cb493716/be/src/runtime/timestamp-test.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/timestamp-test.cc b/be/src/runtime/timestamp-test.cc
index 8afa957..3f3f01d 100644
--- a/be/src/runtime/timestamp-test.cc
+++ b/be/src/runtime/timestamp-test.cc
@@ -23,8 +23,8 @@
 
 #include "common/status.h"
 #include "exprs/timezone_db.h"
+#include "runtime/datetime-parse-util.h"
 #include "runtime/raw-value.inline.h"
-#include "runtime/timestamp-parse-util.h"
 #include "runtime/timestamp-value.h"
 #include "runtime/timestamp-value.inline.h"
 #include "testutil/gtest-util.h"
@@ -41,6 +41,9 @@ using boost::posix_time::time_duration;
 
 namespace impala {
 
+using datetime_parse_util::ParseFormatTokens;
+using datetime_parse_util::DateTimeFormatContext;
+
 // Used for defining a custom date/time format test. The structure can be used to
 // indicate whether the format or value is expected to fail. In a happy path test,
 // the values for year, month, day etc will be validated against the parsed result.
@@ -181,8 +184,8 @@ void TestTimestampTokens(vector<TimestampToken>* toks, int year, int month,
         }
       }
       string fmt_val = "Format: " + fmt + ", Val: " + val;
-      DateTimeFormatContext dt_ctx(fmt.c_str(), fmt.length());
-      ASSERT_TRUE(TimestampParser::ParseFormatTokens(&dt_ctx)) << fmt_val;
+      DateTimeFormatContext dt_ctx(fmt.c_str());
+      ASSERT_TRUE(ParseFormatTokens(&dt_ctx)) << fmt_val;
       TimestampValue tv = TimestampValue::Parse(val.c_str(), val.length(), dt_ctx);
       ValidateTimestamp(tv, fmt, val, fmt_val, year, month, day, hours, mins, secs,
           frac);
@@ -211,8 +214,8 @@ void TestTimestampTokens(vector<TimestampToken>* toks, int year, int month,
           if (i + 1 < toks_len) val.push_back(*separator);
         }
         string fmt_val = "Format: " + fmt + ", Val: " + val;
-        DateTimeFormatContext dt_ctx(fmt.c_str(), fmt.length());
-        ASSERT_TRUE(TimestampParser::ParseFormatTokens(&dt_ctx)) << fmt_val;
+        DateTimeFormatContext dt_ctx(fmt.c_str());
+        ASSERT_TRUE(ParseFormatTokens(&dt_ctx)) << fmt_val;
         TimestampValue tv = TimestampValue::Parse(val.c_str(), val.length(), dt_ctx);
         ValidateTimestamp(tv, fmt, val, fmt_val, year, month, day, hours, mins, secs,
             frac);
@@ -541,9 +544,9 @@ TEST(TimestampTest, Basic) {
   // or literal value.
   for (int i = 0; i < test_cases.size(); ++i) {
     TimestampTC test_case = test_cases[i];
-    DateTimeFormatContext dt_ctx(test_case.fmt, strlen(test_case.fmt));
+    DateTimeFormatContext dt_ctx(test_case.fmt);
     dt_ctx.SetCenturyBreak(now);
-    bool parse_result = TimestampParser::ParseFormatTokens(&dt_ctx);
+    bool parse_result = ParseFormatTokens(&dt_ctx);
     if (test_case.fmt_should_fail) {
       EXPECT_FALSE(parse_result) << "TC: " << i;
       continue;
@@ -616,8 +619,8 @@ TEST(TimestampTest, Basic) {
   // Loop through format test cases
   for (int i = 0; i < fmt_test_cases.size(); ++i) {
     TimestampFormatTC test_case = fmt_test_cases[i];
-    DateTimeFormatContext dt_ctx(test_case.fmt, strlen(test_case.fmt));
-    ASSERT_TRUE(TimestampParser::ParseFormatTokens(&dt_ctx))  << "TC: " << i;
+    DateTimeFormatContext dt_ctx(test_case.fmt);
+    ASSERT_TRUE(ParseFormatTokens(&dt_ctx))  << "TC: " << i;
     TimestampValue cust_tv = TimestampValue::FromUnixTime(test_case.ts, utc_tz);
     EXPECT_NE(cust_tv.date(), not_a_date) << "TC: " << i;
     EXPECT_NE(cust_tv.time(), not_a_date_time) << "TC: " << i;

http://git-wip-us.apache.org/repos/asf/impala/blob/cb493716/be/src/runtime/timestamp-value.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/timestamp-value.cc b/be/src/runtime/timestamp-value.cc
index 50d1ee4..a3631e5 100644
--- a/be/src/runtime/timestamp-value.cc
+++ b/be/src/runtime/timestamp-value.cc
@@ -48,6 +48,8 @@ const int64_t EPOCH_DAY_NUMBER =
 
 namespace impala {
 
+using datetime_parse_util::DateTimeFormatContext;
+
 const char* TimestampValue::LLVM_CLASS_NAME = "class.impala::TimestampValue";
 const double TimestampValue::ONE_BILLIONTH = 0.000000001;
 

http://git-wip-us.apache.org/repos/asf/impala/blob/cb493716/be/src/runtime/timestamp-value.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/timestamp-value.h b/be/src/runtime/timestamp-value.h
index 2509f21..56f103e 100644
--- a/be/src/runtime/timestamp-value.h
+++ b/be/src/runtime/timestamp-value.h
@@ -37,7 +37,9 @@ DECLARE_bool(use_local_tz_for_unix_timestamp_conversions);
 
 namespace impala {
 
+namespace datetime_parse_util {
 struct DateTimeFormatContext;
+}
 
 /// Represents either a (1) date and time, (2) a date with an undefined time, or (3)
 /// a time with an undefined date. In all cases, times have up to nanosecond resolution
@@ -89,7 +91,7 @@ class TimestampValue {
   static TimestampValue Parse(const std::string& str);
   static TimestampValue Parse(const char* str, int len);
   static TimestampValue Parse(const char* str, int len,
-      const DateTimeFormatContext& dt_ctx);
+      const datetime_parse_util::DateTimeFormatContext& dt_ctx);
 
   /// Unix time (seconds since 1970-01-01 UTC by definition) constructors.
   /// Return the corresponding timestamp in the 'local_tz' time zone if
@@ -199,7 +201,8 @@ class TimestampValue {
   /// len -- the length of the buffer
   /// buff -- the buffer that will hold the result
   /// Returns the number of characters copied in to the buffer (minus the terminator)
-  int Format(const DateTimeFormatContext& dt_ctx, int len, char* buff) const;
+  int Format(const datetime_parse_util::DateTimeFormatContext& dt_ctx, int len,
+      char* buff) const;
 
   /// Interpret 'this' as a timestamp in UTC and convert to unix time.
   /// Returns false if the conversion failed ('unix_time' will be undefined), otherwise

[4/9] impala git commit: IMPALA-7492: Add support for DATE text parser/formatter

Reply via email to