This is an automated email from the ASF dual-hosted git repository.

dbecker pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 607ae742e5b12df12e2e2e64c890190a02beb765
Author: Kurt Deschler <[email protected]>
AuthorDate: Thu May 11 08:32:11 2023 -0500

    IMPALA-12134: Optimize row materialization time
    
    This patch improves row materialization time by providing specialized
    formatting logic for the default date and timestamp formats. For the
    Beeswax protocol, performance is also improved by caching deserialized
    column metadata to avoid unnecessary per-row cost.
    
    Benchmarks:
    - Manual testing on a table with mixed data types showed a ~20%
      reduction in row materialization time
    - Added cases to date-benchmark for the new formatters. Date formatting
      improved by 3x and timestamp formatting by 2x
    
    Machine Info: Intel(R) Core(TM) i7-8700 CPU @ 3.20GHz
    ToYearMonthDay:            Function  iters/ms
    -------------------------------------------------------
                 TestCctzToYearMonthDay               16.5
                     TestToYearMonthDay               61.1
                             TestToYear                280
                           TestToString                 18
              TestToString_stringstream               1.86
               TestDefaultDateToCharBuf               25.5
                  TestTimestampToString               11.7
          TestDefaultTimestampToCharBuf               15.7
    
    Testing:
    - Ran core tests
    
    Change-Id: I1ef5e4137fa6c2d0a5f08b430e01e3fb7de86330
    Reviewed-on: http://gerrit.cloudera.org:8080/19875
    Reviewed-by: Impala Public Jenkins <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 be/src/benchmarks/date-benchmark.cc                | 105 +++++++++++++++++++--
 be/src/runtime/date-parse-util.cc                  |  18 ++++
 be/src/runtime/date-parse-util.h                   |   3 +
 be/src/runtime/date-value.cc                       |  16 +++-
 .../runtime/datetime-simple-date-format-parser.h   |  13 +++
 be/src/runtime/timestamp-parse-util.cc             |  27 ++++++
 be/src/runtime/timestamp-parse-util.h              |   4 +
 be/src/runtime/timestamp-value.cc                  |  21 ++++-
 be/src/service/query-result-set.cc                 |  24 +++--
 9 files changed, 210 insertions(+), 21 deletions(-)

diff --git a/be/src/benchmarks/date-benchmark.cc 
b/be/src/benchmarks/date-benchmark.cc
index 2c049ce69..84ccd6324 100644
--- a/be/src/benchmarks/date-benchmark.cc
+++ b/be/src/benchmarks/date-benchmark.cc
@@ -24,6 +24,8 @@
 #include "gutil/basictypes.h"
 #include "runtime/date-value.h"
 #include "runtime/datetime-simple-date-format-parser.h"
+#include "runtime/date-parse-util.h"
+#include "runtime/timestamp-parse-util.h"
 #include "util/benchmark.h"
 #include "util/cpu-info.h"
 
@@ -34,21 +36,30 @@ using std::mt19937;
 using std::uniform_int_distribution;
 
 using namespace impala;
+using datetime_parse_util::SimpleDateFormatTokenizer;
 
 // ToYearMonthDay:            Function  iters/ms   10%ile   50%ile   90%ile    
 10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
-//              TestCctzToYearMonthDay               23.1     23.4     23.7    
     1X         1X         1X
-//                  TestToYearMonthDay                 68     69.6     70.7    
  2.95X      2.98X      2.98X
-//                          TestToYear                443      446      448    
  19.2X      19.1X      18.9X
-//                        TestToString               9.02     9.04     9.06    
 0.391X     0.386X     0.382X
-//           TestToString_stringstream               2.04     2.04     2.08    
0.0883X    0.0871X    0.0875X
+//             TestCctzToYearMonthDay               16.5     16.6     16.7     
    1X         1X         1X
+//                 TestToYearMonthDay               61.1     62.1     62.3     
 3.69X      3.75X      3.73X
+//                         TestToYear                280      308      308     
 16.9X      18.6X      18.5X
+//                       TestToString                 18     19.5     19.7     
 1.09X      1.18X      1.18X
+//          TestToString_stringstream               1.86     2.08     2.12     
0.113X     0.125X     0.127X
+//           TestDefaultDateToCharBuf               25.5       27     27.2     
 1.54X      1.63X      1.63X
+//              TestTimestampToString               11.7     12.6     12.6     
0.707X      0.76X     0.757X
+//      TestDefaultTimestampToCharBuf               15.7     17.2     17.2     
0.949X      1.04X      1.03X
+
+
+
+
 
 const cctz::civil_day EPOCH_DATE(1970, 1, 1);
 
 class TestData {
 public:
-  void AddRandomRange(const DateValue& dv_min, const DateValue& dv_max, int 
data_size) {
+  void AddRandomRange(const DateValue& dv_min, const DateValue& dv_max,
+      int data_size) {
     DCHECK(dv_min.IsValid());
     DCHECK(dv_max.IsValid());
 
@@ -60,18 +71,29 @@ public:
     mt19937 gen(rd());
     // Random values in a [min_dse..max_dse] days range.
     uniform_int_distribution<int32_t> dis_dse(min_dse, max_dse);
+    uniform_int_distribution<int64_t> dis_utc(-9223372036, 9223372036);
 
     // Add random DateValue values in the [dv_min, dv_max] range.
     for (int i = 0; i <= data_size; ++i) {
       DateValue dv(dis_dse(gen));
       DCHECK(dv.IsValid());
       date_.push_back(dv);
+      timestamp_.push_back(TimestampValue::FromUnixTime(dis_utc(gen), UTCPTR));
     }
     cctz_to_ymd_result_.resize(date_.size());
     to_ymd_result_.resize(date_.size());
     to_year_result_.resize(date_.size());
     to_string_result_.resize(date_.size());
     to_string_old_result_.resize(date_.size());
+    date_to_char_buf_result_.resize(timestamp_.size());
+    timestamp_to_char_buf_result_.resize(timestamp_.size());
+    timestamp_to_string_result_.resize(timestamp_.size());
+    for(int i = 0; i < timestamp_.size(); ++i) {
+      timestamp_to_char_buf_result_[i].reserve(
+          SimpleDateFormatTokenizer::DEFAULT_DATE_TIME_FMT_LEN);
+      timestamp_to_string_result_[i].reserve(
+          SimpleDateFormatTokenizer::DEFAULT_DATE_TIME_FMT_LEN);
+    }
   }
 
   void CctzToYearMonthDay(const DateValue& dv, int* year, int* month, int* 
day) const {
@@ -129,6 +151,42 @@ public:
     }
   }
 
+  void TestDefaultDateToCharBuf(int batch_size) {
+    for (int i = 0; i < batch_size; ++i) {
+      int n = date_.size();
+      for (int j = 0; j < n; ++j) {
+        date_to_char_buf_result_[j].resize(
+            SimpleDateFormatTokenizer::DEFAULT_DATE_FMT_LEN);
+        DateParser::FormatDefault(date_[j], &(date_to_char_buf_result_[j][0]));
+      }
+    }
+  }
+
+  void TestTimestampToString(int batch_size) {
+    for (int i = 0; i < batch_size; ++i) {
+      int n = timestamp_.size();
+      for (int j = 0; j < n; ++j) {
+        timestamp_to_string_result_[j] = timestamp_[j].ToString();
+      }
+    }
+  }
+
+  void TestDefaultTimestampToCharBuf(int batch_size) {
+    for (int i = 0; i < batch_size; ++i) {
+      int n = timestamp_.size();
+      for (int j = 0; j < n; ++j) {
+        const uint32 len = timestamp_[j].time().fractional_seconds() ?
+            SimpleDateFormatTokenizer::DEFAULT_DATE_TIME_FMT_LEN :
+            SimpleDateFormatTokenizer::DEFAULT_SHORT_DATE_TIME_FMT_LEN;
+
+        char buf[len + 1];
+        TimestampParser::FormatDefault(timestamp_[j].date(), 
timestamp_[j].time(), buf);
+        buf[len] = '\0';
+        timestamp_to_char_buf_result_[j] = buf;
+      }
+    }
+  }
+
   void TestToString_stringstream(int batch_size) {
     for (int i = 0; i < batch_size; ++i) {
       int n = date_.size();
@@ -167,6 +225,18 @@ public:
              << to_string_result_[i] << " != " << to_string_old_result_[i] << 
endl;
         ok = false;
       }
+      if (date_to_char_buf_result_[i] != to_string_result_[i]) {
+        cerr << "Incorrect results (TestDefaultDateToCharBuf() vs ToString()): 
"
+             << date_to_char_buf_result_[i] << " != " << to_string_result_[i] 
<< endl;
+        ok = false;
+      }
+      if (timestamp_to_char_buf_result_[i] != timestamp_to_string_result_[i]) {
+        cerr << "Incorrect results (TestDefaultTimestampToCharBuf()"
+             << " vs TestTimestampToString()): "
+             << timestamp_to_char_buf_result_[i] << " != "
+             << timestamp_to_string_result_[i] << endl;
+        ok = false;
+      }
     }
     return ok;
   }
@@ -190,11 +260,15 @@ private:
   };
 
   vector<DateValue> date_;
+  vector<TimestampValue> timestamp_;
   vector<YearMonthDayResult> cctz_to_ymd_result_;
   vector<YearMonthDayResult> to_ymd_result_;
   vector<int> to_year_result_;
   vector<string> to_string_result_;
   vector<string> to_string_old_result_;
+  vector<string> date_to_char_buf_result_;
+  vector<string> timestamp_to_char_buf_result_;
+  vector<string> timestamp_to_string_result_;
 };
 
 void TestCctzToYearMonthDay(int batch_size, void* d) {
@@ -217,6 +291,21 @@ void TestToString(int batch_size, void* d) {
   data->TestToString(batch_size);
 }
 
+void TestDefaultDateToCharBuf(int batch_size, void* d) {
+  TestData* data = reinterpret_cast<TestData*>(d);
+  data->TestDefaultDateToCharBuf(batch_size);
+}
+
+void TestTimestampToString(int batch_size, void* d) {
+  TestData* data = reinterpret_cast<TestData*>(d);
+  data->TestTimestampToString(batch_size);
+}
+
+void TestDefaultTimestampToCharBuf(int batch_size, void* d) {
+  TestData* data = reinterpret_cast<TestData*>(d);
+  data->TestDefaultTimestampToCharBuf(batch_size);
+}
+
 void TestToString_stringstream(int batch_size, void* d) {
   TestData* data = reinterpret_cast<TestData*>(d);
   data->TestToString_stringstream(batch_size);
@@ -239,6 +328,10 @@ int main(int argc, char* argv[]) {
   suite.AddBenchmark("TestToYear", TestToYear, &data);
   suite.AddBenchmark("TestToString", TestToString, &data);
   suite.AddBenchmark("TestToString_stringstream", TestToString_stringstream, 
&data);
+  suite.AddBenchmark("TestDefaultDateToCharBuf", TestDefaultDateToCharBuf, 
&data);
+  suite.AddBenchmark("TestTimestampToString", TestTimestampToString, &data);
+  suite.AddBenchmark("TestDefaultTimestampToCharBuf", 
TestDefaultTimestampToCharBuf,
+      &data);
   cout << suite.Measure();
 
   return data.CheckResults() ? 0 : 1;
diff --git a/be/src/runtime/date-parse-util.cc 
b/be/src/runtime/date-parse-util.cc
index ea8698d82..a68532286 100644
--- a/be/src/runtime/date-parse-util.cc
+++ b/be/src/runtime/date-parse-util.cc
@@ -128,6 +128,24 @@ bool DateParser::ParseIsoSqlFormat(const char* str, int 
len,
   return date->IsValid();
 }
 
+// Formats date into dst using the default format
+// Format:  yyyy-MM-dd
+// Offsets: 0123456789
+int DateParser::FormatDefault(const DateValue& date, char* dst) {
+  int year, month, day;
+  if (!date.ToYearMonthDay(&year, &month, &day)) {
+    *dst = '\0';
+    return -1;
+  }
+  else {
+    ZeroPad(dst, year, 4);
+    ZeroPad(dst + 5, month, 2);
+    ZeroPad(dst + 8, day, 2);
+    dst[7] = dst[4] = '-';
+    return SimpleDateFormatTokenizer::DEFAULT_DATE_FMT_LEN;
+  }
+}
+
 string DateParser::Format(const DateTimeFormatContext& dt_ctx, const 
DateValue& date) {
   DCHECK(dt_ctx.toks.size() > 0);
   DCHECK(dt_ctx.has_date_toks && !dt_ctx.has_time_toks);
diff --git a/be/src/runtime/date-parse-util.h b/be/src/runtime/date-parse-util.h
index 731e8e7be..e0efaeb74 100644
--- a/be/src/runtime/date-parse-util.h
+++ b/be/src/runtime/date-parse-util.h
@@ -59,6 +59,9 @@ class DateParser {
       const datetime_parse_util::DateTimeFormatContext& dt_ctx, DateValue* 
date)
       WARN_UNUSED_RESULT;
 
+  /// Optimized formatter for default date format
+  static int FormatDefault(const DateValue& date, char* dst);
+
   /// Format the date values using the given format context.
   /// dt_ctx -- date format context
   /// date -- the date value
diff --git a/be/src/runtime/date-value.cc b/be/src/runtime/date-value.cc
index 78db95eb0..b0733c22d 100644
--- a/be/src/runtime/date-value.cc
+++ b/be/src/runtime/date-value.cc
@@ -429,11 +429,23 @@ bool DateValue::MonthsBetween(const DateValue& other, 
double* months_between) co
 }
 
 string DateValue::ToString() const {
-  return Format(*SimpleDateFormatTokenizer::GetDefaultDateFormatContext());
+  string s;
+  s.resize(SimpleDateFormatTokenizer::DEFAULT_DATE_FMT_LEN);
+  const int out_len = DateParser::FormatDefault(*this, s.data());
+  if (UNLIKELY(out_len != SimpleDateFormatTokenizer::DEFAULT_DATE_FMT_LEN)) {
+    s.clear();
+  }
+  return s;
 }
 
 ostream& operator<<(ostream& os, const DateValue& date_value) {
-  return os << date_value.ToString();
+  char dst[SimpleDateFormatTokenizer::DEFAULT_DATE_FMT_LEN + 1];
+  const int out_len = DateParser::FormatDefault(date_value, dst);
+  if (LIKELY(out_len >= 0)) {
+    dst[out_len] = '\0';
+    os << dst;
+  }
+  return os;
 }
 
 }
diff --git a/be/src/runtime/datetime-simple-date-format-parser.h 
b/be/src/runtime/datetime-simple-date-format-parser.h
index 7bd173cce..95bccbf93 100644
--- a/be/src/runtime/datetime-simple-date-format-parser.h
+++ b/be/src/runtime/datetime-simple-date-format-parser.h
@@ -182,6 +182,19 @@ public:
       const DateTimeFormatContext& dt_ctx, DateTimeParseResult* dt_result);
 };
 
+/// Helper function for formatting small numbers with leading zeros
+/// This is used inline with date and timestamp formatting functions
+inline void ZeroPad(char* const dst, uint32 val, const uint32 digits) {
+  char* p = dst + digits;
+  while(val) {
+    *--p = '0' + (val % 10);
+    val /= 10;
+  }
+  while(p != dst) {
+    *--p = '0';
+  }
+}
+
 }
 
 }
diff --git a/be/src/runtime/timestamp-parse-util.cc 
b/be/src/runtime/timestamp-parse-util.cc
index 98339dba7..d8d369725 100644
--- a/be/src/runtime/timestamp-parse-util.cc
+++ b/be/src/runtime/timestamp-parse-util.cc
@@ -227,6 +227,33 @@ bool TimestampParser::ParseIsoSqlFormat(const char* str, 
int len,
   return true;
 }
 
+// Formats date and time into dst using the default format
+// Short:   yyyy-MM-dd HH:mm:ss
+//  Long:   yyyy-MM-dd HH:mm:ss.SSSSSSSSS
+// Offsets: 01234567890123456789012345678
+
+int TimestampParser::FormatDefault(const date& d, const time_duration& t, 
char* dst) {
+  if (UNLIKELY(d.is_special() || t.is_special())) return -1;
+  const auto ymd = d.year_month_day();
+  ZeroPad(dst, ymd.year, 4);
+  ZeroPad(dst + 5, ymd.month, 2);
+  ZeroPad(dst + 8, ymd.day, 2);
+  const auto tot_sec = t.total_seconds();
+  ZeroPad(dst + 11, tot_sec / 3600, 2);
+  ZeroPad(dst + 14, (tot_sec / 60) % 60, 2);
+  ZeroPad(dst + 17, tot_sec % 60, 2);
+  dst[7] = dst[4] = '-';
+  dst[10] = ' ';
+  dst[16] = dst[13] = ':';
+
+  if (LIKELY(t.fractional_seconds() > 0)) {
+    dst[19] = '.';
+    ZeroPad(dst + 20, t.fractional_seconds(), 9);
+    return SimpleDateFormatTokenizer::DEFAULT_DATE_TIME_FMT_LEN;
+  }
+  return SimpleDateFormatTokenizer::DEFAULT_SHORT_DATE_TIME_FMT_LEN;
+}
+
 int TimestampParser::Format(const DateTimeFormatContext& dt_ctx, const date& d,
     const time_duration& t, int max_length, char* dst) {
   DCHECK(dt_ctx.toks.size() > 0);
diff --git a/be/src/runtime/timestamp-parse-util.h 
b/be/src/runtime/timestamp-parse-util.h
index 60eb8888a..a5c359f8e 100644
--- a/be/src/runtime/timestamp-parse-util.h
+++ b/be/src/runtime/timestamp-parse-util.h
@@ -73,6 +73,10 @@ class TimestampParser {
       const datetime_parse_util::DateTimeFormatContext& dt_ctx, 
boost::gregorian::date* d,
       boost::posix_time::time_duration* t) WARN_UNUSED_RESULT;
 
+  /// Optimized formatter for default short and long formats
+  static int FormatDefault(const boost::gregorian::date& d,
+      const boost::posix_time::time_duration& t, char* dst);
+
   /// Format the date/time values using the given format context.
   /// Caller must make sure that it has enough buffer space in 'dst' to hold 
the output.
   /// Return total output length that is written into 'dst'. Return -1 If 'd' 
or 't' is
diff --git a/be/src/runtime/timestamp-value.cc 
b/be/src/runtime/timestamp-value.cc
index 26d819944..cc8ef971e 100644
--- a/be/src/runtime/timestamp-value.cc
+++ b/be/src/runtime/timestamp-value.cc
@@ -180,7 +180,14 @@ void TimestampValue::LocalToUtc(const Timezone& local_tz) {
 }
 
 ostream& operator<<(ostream& os, const TimestampValue& timestamp_value) {
-  return os << timestamp_value.ToString();
+  char dst[SimpleDateFormatTokenizer::DEFAULT_DATE_TIME_FMT_LEN + 1];
+  const int out_len = TimestampParser::FormatDefault(timestamp_value.date(),
+      timestamp_value.time(), dst);
+  if (LIKELY(out_len >= 0)) {
+    dst[out_len] = '\0';
+    os << dst;
+  }
+  return os;
 }
 
 TimestampValue TimestampValue::UnixTimeToLocal(
@@ -207,12 +214,20 @@ TimestampValue TimestampValue::FromUnixTime(time_t 
unix_time, const Timezone* lo
 }
 
 void TimestampValue::ToString(string& dst) const {
-  Format(*SimpleDateFormatTokenizer::GetDefaultTimestampFormatContext(time_), 
dst);
+  dst.resize(SimpleDateFormatTokenizer::DEFAULT_DATE_TIME_FMT_LEN);
+  const int out_len = TimestampParser::FormatDefault(date(), time(), 
dst.data());
+  if (UNLIKELY(out_len != 
SimpleDateFormatTokenizer::DEFAULT_DATE_TIME_FMT_LEN)) {
+    if (UNLIKELY(out_len < 0)) {
+      dst.clear();
+    } else {
+      dst.resize(out_len);
+    }
+  }
 }
 
 string TimestampValue::ToString() const {
   string dst;
-  Format(*SimpleDateFormatTokenizer::GetDefaultTimestampFormatContext(time_), 
dst);
+  ToString(dst);
   return dst;
 }
 
diff --git a/be/src/service/query-result-set.cc 
b/be/src/service/query-result-set.cc
index e4a828709..f32f05fa1 100644
--- a/be/src/service/query-result-set.cc
+++ b/be/src/service/query-result-set.cc
@@ -60,7 +60,12 @@ class AsciiQueryResultSet : public QueryResultSet {
   /// Rows are added into 'rowset'.
   AsciiQueryResultSet(const TResultSetMetadata& metadata, vector<string>* 
rowset,
       bool stringify_map_keys)
-    : metadata_(metadata), result_set_(rowset), 
stringify_map_keys_(stringify_map_keys) {}
+    : metadata_(metadata), result_set_(rowset), 
stringify_map_keys_(stringify_map_keys) {
+    types_.reserve(metadata.columns.size());
+    for (int i = 0; i < metadata.columns.size(); ++i) {
+      
types_.push_back(ColumnType::FromThrift(metadata_.columns[i].columnType));
+    }
+  }
 
   virtual ~AsciiQueryResultSet() {}
 
@@ -87,6 +92,9 @@ class AsciiQueryResultSet : public QueryResultSet {
 
   // If true, converts map keys to strings; see IMPALA-11778.
   const bool stringify_map_keys_;
+
+  // De-serialized column metadata
+  vector<ColumnType> types_;
 };
 
 /// Result set container for Hive protocol versions >= V6, where results are 
returned in
@@ -210,16 +218,12 @@ Status AsciiQueryResultSet::AddRows(const 
vector<ScalarExprEvaluator*>& expr_eva
       // ODBC-187 - ODBC can only take "\t" as the delimiter
       out_stream << (i > 0 ? "\t" : "");
 
-      if (metadata_.columns[i].columnType.types.size() == 1) {
-        RawValue::PrintValue(expr_evals[i]->GetValue(it.Get()),
-            ColumnType::FromThrift(metadata_.columns[i].columnType), scales[i],
-            &out_stream);
-      } else if (metadata_.columns[i].columnType.types.size() > 1) {
-        ColumnType col_type = 
ColumnType::FromThrift(metadata_.columns[i].columnType);
-        PrintComplexValue(expr_evals[i], it.Get(), &out_stream, col_type,
-            stringify_map_keys_);
+      if (!types_[i].IsComplexType()) {
+        RawValue::PrintValue(expr_evals[i]->GetValue(it.Get()), types_[i],
+            scales[i], &out_stream);
       } else {
-        DCHECK(false);
+        PrintComplexValue(expr_evals[i], it.Get(), &out_stream, types_[i],
+            stringify_map_keys_);
       }
     }
     result_set_->push_back(out_stream.str());

Reply via email to