This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 4d9f50eb74208fa21c70964b0015209bb3f973a8
Author: Csaba Ringhofer <[email protected]>
AuthorDate: Thu Apr 27 20:04:12 2023 +0200

    IMPALA-12111: Speed up DATE to STRING conversion
    
    Before this patch DATE to STRING conversion seemed slow in
    general (slower than TIMESTAMP to STRING) which was visible
    especially on the coordinator where result DATEs are returned
    as STRINGs in HS2/Beeswax and the conversion happens on a
    single thread.
    
    The main cause seems to be using std::stringstream in
    DateValue::ToString(). The patch switches to using
    impala::TimestampParser::Format() similarly to TimestampValue.
    
    HS2 result set generation is also changed to avoid using
    stringstream for TIMESTAMP/DATE and call ToString() directly.
    
    Benchmarks:
    - Added benchmark that shows ~4x impovement for DateValue.ToString().
    - Manually tested EE scanerio: RowMaterializationTimer dropped from
      1.7s to 0.6s in
      impala-shell -B -q "select cast(l_shipdate as date) from 
tpch_parquet.lineitem;" > /dev/null
      (note that the query above converts STRING to DATE first and then
       from DATE to std::string before returning it in HS2 - the
       improvement comes from the second conversion)
    
    Testing:
    - ran core tests
    
    Change-Id: I9a233ae92b1461fc5c47d8345667e36c2632f4c4
    Reviewed-on: http://gerrit.cloudera.org:8080/19829
    Reviewed-by: Impala Public Jenkins <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 be/src/benchmarks/date-benchmark.cc                | 74 ++++++++++++++++++----
 be/src/runtime/date-value.cc                       | 10 +--
 .../runtime/datetime-simple-date-format-parser.h   |  4 ++
 be/src/runtime/raw-value.cc                        | 18 ++++--
 be/src/service/hs2-util.cc                         |  6 +-
 5 files changed, 85 insertions(+), 27 deletions(-)

diff --git a/be/src/benchmarks/date-benchmark.cc 
b/be/src/benchmarks/date-benchmark.cc
index 574dcc781..2c049ce69 100644
--- a/be/src/benchmarks/date-benchmark.cc
+++ b/be/src/benchmarks/date-benchmark.cc
@@ -23,6 +23,7 @@
 #include "cctz/civil_time.h"
 #include "gutil/basictypes.h"
 #include "runtime/date-value.h"
+#include "runtime/datetime-simple-date-format-parser.h"
 #include "util/benchmark.h"
 #include "util/cpu-info.h"
 
@@ -34,19 +35,20 @@ using std::uniform_int_distribution;
 
 using namespace impala;
 
-// Machine Info: Intel(R) Core(TM) i5-6600 CPU @ 3.30GHz
-// ToYearMonthDay:     Function  iters/ms   10%ile   50%ile   90%ile     
10%ile     50%ile     90%ile
-//                                                                   
(relative) (relative) (relative)
-// 
--------------------------------------------------------------------------------------------------
-//       TestCctzToYearMonthDay               0.24    0.245     0.25         
1X         1X         1X
-//           TestToYearMonthDay                1.3     1.33     1.38      
5.42X      5.44X      5.53X
-//                   TestToYear               8.67     8.86     9.04      
36.1X      36.1X      36.2X
+// ToYearMonthDay:            Function  iters/ms   10%ile   50%ile   90%ile    
 10%ile     50%ile     90%ile
+//                                                                          
(relative) (relative) (relative)
+// 
---------------------------------------------------------------------------------------------------------
+//              TestCctzToYearMonthDay               23.1     23.4     23.7    
     1X         1X         1X
+//                  TestToYearMonthDay                 68     69.6     70.7    
  2.95X      2.98X      2.98X
+//                          TestToYear                443      446      448    
  19.2X      19.1X      18.9X
+//                        TestToString               9.02     9.04     9.06    
 0.391X     0.386X     0.382X
+//           TestToString_stringstream               2.04     2.04     2.08    
0.0883X    0.0871X    0.0875X
 
 const cctz::civil_day EPOCH_DATE(1970, 1, 1);
 
 class TestData {
 public:
-  void AddRandomRange(const DateValue& dv_min, const DateValue& dv_max) {
+  void AddRandomRange(const DateValue& dv_min, const DateValue& dv_max, int 
data_size) {
     DCHECK(dv_min.IsValid());
     DCHECK(dv_max.IsValid());
 
@@ -60,7 +62,7 @@ public:
     uniform_int_distribution<int32_t> dis_dse(min_dse, max_dse);
 
     // Add random DateValue values in the [dv_min, dv_max] range.
-    for (int i = 0; i <= max_dse - min_dse; ++i) {
+    for (int i = 0; i <= data_size; ++i) {
       DateValue dv(dis_dse(gen));
       DCHECK(dv.IsValid());
       date_.push_back(dv);
@@ -68,6 +70,8 @@ public:
     cctz_to_ymd_result_.resize(date_.size());
     to_ymd_result_.resize(date_.size());
     to_year_result_.resize(date_.size());
+    to_string_result_.resize(date_.size());
+    to_string_old_result_.resize(date_.size());
   }
 
   void CctzToYearMonthDay(const DateValue& dv, int* year, int* month, int* 
day) const {
@@ -116,6 +120,31 @@ public:
     }
   }
 
+  void TestToString(int batch_size) {
+    for (int i = 0; i < batch_size; ++i) {
+      int n = date_.size();
+      for (int j = 0; j < n; ++j) {
+        to_string_result_[j] = date_[j].ToString();
+      }
+    }
+  }
+
+  void TestToString_stringstream(int batch_size) {
+    for (int i = 0; i < batch_size; ++i) {
+      int n = date_.size();
+      for (int j = 0; j < n; ++j) {
+        // Old implementation before IMPALA-12111.
+        stringstream ss;
+        int year, month, day;
+        if (date_[j].ToYearMonthDay(&year, &month, &day)) {
+          ss << std::setfill('0') << setw(4) << year << "-" << setw(2) << 
month << "-"
+            << setw(2) << day;
+        }
+        to_string_old_result_[j] = ss.str();
+      }
+    }
+  }
+
   bool CheckResults() {
     DCHECK(to_ymd_result_.size() == cctz_to_ymd_result_.size());
     DCHECK(to_year_result_.size() == cctz_to_ymd_result_.size());
@@ -133,6 +162,11 @@ public:
              << endl;
         ok = false;
       }
+      if (to_string_result_[i] != to_string_old_result_[i]) {
+        cerr << "Incorrect results (ToString() vs 
TestToString_stringstream()): "
+             << to_string_result_[i] << " != " << to_string_old_result_[i] << 
endl;
+        ok = false;
+      }
     }
     return ok;
   }
@@ -159,6 +193,8 @@ private:
   vector<YearMonthDayResult> cctz_to_ymd_result_;
   vector<YearMonthDayResult> to_ymd_result_;
   vector<int> to_year_result_;
+  vector<string> to_string_result_;
+  vector<string> to_string_old_result_;
 };
 
 void TestCctzToYearMonthDay(int batch_size, void* d) {
@@ -176,19 +212,33 @@ void TestToYear(int batch_size, void* d) {
   data->TestToYear(batch_size);
 }
 
+void TestToString(int batch_size, void* d) {
+  TestData* data = reinterpret_cast<TestData*>(d);
+  data->TestToString(batch_size);
+}
+
+void TestToString_stringstream(int batch_size, void* d) {
+  TestData* data = reinterpret_cast<TestData*>(d);
+  data->TestToString_stringstream(batch_size);
+}
+
 int main(int argc, char* argv[]) {
   CpuInfo::Init();
   cout << Benchmark::GetMachineInfo() << endl;
 
+  datetime_parse_util::SimpleDateFormatTokenizer::InitCtx();
+
   TestData data;
-  data.AddRandomRange(DateValue(1965, 1, 1), DateValue(2020, 12, 31));
+  data.AddRandomRange(DateValue(1965, 1, 1), DateValue(2020, 12, 31), 1000);
 
-  // Benchmark TestData::CctzToYearMonthDay(), DateValue::ToYearMonthDay() and
-  // DateValue::ToYear().
+  // Benchmark TestData::CctzToYearMonthDay(), DateValue::ToYearMonthDay(),
+  // DateValue::ToYear() and DateValue().ToString's current and old 
implementation.
   Benchmark suite("ToYearMonthDay");
   suite.AddBenchmark("TestCctzToYearMonthDay", TestCctzToYearMonthDay, &data);
   suite.AddBenchmark("TestToYearMonthDay", TestToYearMonthDay, &data);
   suite.AddBenchmark("TestToYear", TestToYear, &data);
+  suite.AddBenchmark("TestToString", TestToString, &data);
+  suite.AddBenchmark("TestToString_stringstream", TestToString_stringstream, 
&data);
   cout << suite.Measure();
 
   return data.CheckResults() ? 0 : 1;
diff --git a/be/src/runtime/date-value.cc b/be/src/runtime/date-value.cc
index f14436840..78db95eb0 100644
--- a/be/src/runtime/date-value.cc
+++ b/be/src/runtime/date-value.cc
@@ -21,6 +21,7 @@
 #include <iomanip>
 #include "cctz/civil_time.h"
 #include "runtime/date-parse-util.h"
+#include "runtime/datetime-simple-date-format-parser.h"
 
 #include "common/names.h"
 
@@ -29,6 +30,7 @@ namespace impala {
 using datetime_parse_util::DateTimeFormatContext;
 using datetime_parse_util::GetMonthAndDayFromDaysSinceJan1;
 using datetime_parse_util::IsLeapYear;
+using datetime_parse_util::SimpleDateFormatTokenizer;
 
 const int EPOCH_YEAR = 1970;
 const int MIN_YEAR = 1;
@@ -427,13 +429,7 @@ bool DateValue::MonthsBetween(const DateValue& other, 
double* months_between) co
 }
 
 string DateValue::ToString() const {
-  stringstream ss;
-  int year, month, day;
-  if (ToYearMonthDay(&year, &month, &day)) {
-    ss << std::setfill('0') << setw(4) << year << "-" << setw(2) << month << 
"-"
-       << setw(2) << day;
-  }
-  return ss.str();
+  return Format(*SimpleDateFormatTokenizer::GetDefaultDateFormatContext());
 }
 
 ostream& operator<<(ostream& os, const DateValue& date_value) {
diff --git a/be/src/runtime/datetime-simple-date-format-parser.h 
b/be/src/runtime/datetime-simple-date-format-parser.h
index 2e0ce98bb..7bd173cce 100644
--- a/be/src/runtime/datetime-simple-date-format-parser.h
+++ b/be/src/runtime/datetime-simple-date-format-parser.h
@@ -125,6 +125,10 @@ public:
                                            &DEFAULT_SHORT_DATE_TIME_CTX;
   }
 
+  static ALWAYS_INLINE const DateTimeFormatContext* 
GetDefaultDateFormatContext() {
+    return &DEFAULT_DATE_CTX;
+  }
+
   /// Initialize the default format contexts. This *must* be called before 
using
   /// GetDefaultFormatContext().
   static void InitCtx();
diff --git a/be/src/runtime/raw-value.cc b/be/src/runtime/raw-value.cc
index be8058db0..928d36016 100644
--- a/be/src/runtime/raw-value.cc
+++ b/be/src/runtime/raw-value.cc
@@ -97,11 +97,9 @@ void RawValue::PrintValue(const void* value, const 
ColumnType& type, int scale,
     return;
   }
 
-  stringstream out;
-  out.precision(ASCII_PRECISION);
   const StringValue* string_val = NULL;
-  string tmp;
   bool val;
+  string tmp;
 
   // Special case types that we can print more efficiently without using a 
stringstream
   switch (type.type) {
@@ -118,12 +116,24 @@ void RawValue::PrintValue(const void* value, const 
ColumnType& type, int scale,
     case TYPE_CHAR:
       *str = string(reinterpret_cast<const char*>(value), type.len);
       return;
+    case TYPE_TIMESTAMP:
+      *str = reinterpret_cast<const TimestampValue*>(value)->ToString();
+      return;
+    case TYPE_DATE:
+      *str = reinterpret_cast<const DateValue*>(value)->ToString();
+      return;
     case TYPE_FIXED_UDA_INTERMEDIATE:
       *str = "Intermediate UDA step, no value printed";
       return;
     default:
-      PrintValue(value, type, scale, &out);
+      break;
   }
+
+  stringstream out;
+  out.precision(ASCII_PRECISION);
+
+  PrintValue(value, type, scale, &out);
+
   *str = out.str();
 }
 
diff --git a/be/src/service/hs2-util.cc b/be/src/service/hs2-util.cc
index 20279699c..3d36876cb 100644
--- a/be/src/service/hs2-util.cc
+++ b/be/src/service/hs2-util.cc
@@ -255,8 +255,7 @@ static void 
TimestampExprValuesToHS2TColumn(ScalarExprEvaluator* expr_eval,
     column->stringVal.values.emplace_back();
     if (!val.is_null) {
       TimestampValue value = TimestampValue::FromTimestampVal(val);
-      RawValue::PrintValue(
-          &value, ColumnType(TYPE_TIMESTAMP), -1, 
&(column->stringVal.values.back()));
+      column->stringVal.values.back() = value.ToString();
     }
     SetNullBit(output_row_idx, val.is_null, &column->stringVal.nulls);
     ++output_row_idx;
@@ -273,8 +272,7 @@ static void DateExprValuesToHS2TColumn(ScalarExprEvaluator* 
expr_eval,
     column->stringVal.values.emplace_back();
     if (!val.is_null) {
       DateValue value = DateValue::FromDateVal(val);
-      RawValue::PrintValue(
-          &value, ColumnType(TYPE_DATE), -1, 
&(column->stringVal.values.back()));
+      column->stringVal.values.back() = value.ToString();
     }
     SetNullBit(output_row_idx, val.is_null, &column->stringVal.nulls);
     ++output_row_idx;

Reply via email to