This is an automated email from the ASF dual-hosted git repository. joemcdonnell pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit 4d9f50eb74208fa21c70964b0015209bb3f973a8 Author: Csaba Ringhofer <[email protected]> AuthorDate: Thu Apr 27 20:04:12 2023 +0200 IMPALA-12111: Speed up DATE to STRING conversion Before this patch DATE to STRING conversion seemed slow in general (slower than TIMESTAMP to STRING) which was visible especially on the coordinator where result DATEs are returned as STRINGs in HS2/Beeswax and the conversion happens on a single thread. The main cause seems to be using std::stringstream in DateValue::ToString(). The patch switches to using impala::TimestampParser::Format() similarly to TimestampValue. HS2 result set generation is also changed to avoid using stringstream for TIMESTAMP/DATE and call ToString() directly. Benchmarks: - Added benchmark that shows ~4x impovement for DateValue.ToString(). - Manually tested EE scanerio: RowMaterializationTimer dropped from 1.7s to 0.6s in impala-shell -B -q "select cast(l_shipdate as date) from tpch_parquet.lineitem;" > /dev/null (note that the query above converts STRING to DATE first and then from DATE to std::string before returning it in HS2 - the improvement comes from the second conversion) Testing: - ran core tests Change-Id: I9a233ae92b1461fc5c47d8345667e36c2632f4c4 Reviewed-on: http://gerrit.cloudera.org:8080/19829 Reviewed-by: Impala Public Jenkins <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> --- be/src/benchmarks/date-benchmark.cc | 74 ++++++++++++++++++---- be/src/runtime/date-value.cc | 10 +-- .../runtime/datetime-simple-date-format-parser.h | 4 ++ be/src/runtime/raw-value.cc | 18 ++++-- be/src/service/hs2-util.cc | 6 +- 5 files changed, 85 insertions(+), 27 deletions(-) diff --git a/be/src/benchmarks/date-benchmark.cc b/be/src/benchmarks/date-benchmark.cc index 574dcc781..2c049ce69 100644 --- a/be/src/benchmarks/date-benchmark.cc +++ b/be/src/benchmarks/date-benchmark.cc @@ -23,6 +23,7 @@ #include "cctz/civil_time.h" #include "gutil/basictypes.h" #include "runtime/date-value.h" +#include "runtime/datetime-simple-date-format-parser.h" #include "util/benchmark.h" #include "util/cpu-info.h" @@ -34,19 +35,20 @@ using std::uniform_int_distribution; using namespace impala; -// Machine Info: Intel(R) Core(TM) i5-6600 CPU @ 3.30GHz -// ToYearMonthDay: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile -// (relative) (relative) (relative) -// -------------------------------------------------------------------------------------------------- -// TestCctzToYearMonthDay 0.24 0.245 0.25 1X 1X 1X -// TestToYearMonthDay 1.3 1.33 1.38 5.42X 5.44X 5.53X -// TestToYear 8.67 8.86 9.04 36.1X 36.1X 36.2X +// ToYearMonthDay: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile +// (relative) (relative) (relative) +// --------------------------------------------------------------------------------------------------------- +// TestCctzToYearMonthDay 23.1 23.4 23.7 1X 1X 1X +// TestToYearMonthDay 68 69.6 70.7 2.95X 2.98X 2.98X +// TestToYear 443 446 448 19.2X 19.1X 18.9X +// TestToString 9.02 9.04 9.06 0.391X 0.386X 0.382X +// TestToString_stringstream 2.04 2.04 2.08 0.0883X 0.0871X 0.0875X const cctz::civil_day EPOCH_DATE(1970, 1, 1); class TestData { public: - void AddRandomRange(const DateValue& dv_min, const DateValue& dv_max) { + void AddRandomRange(const DateValue& dv_min, const DateValue& dv_max, int data_size) { DCHECK(dv_min.IsValid()); DCHECK(dv_max.IsValid()); @@ -60,7 +62,7 @@ public: uniform_int_distribution<int32_t> dis_dse(min_dse, max_dse); // Add random DateValue values in the [dv_min, dv_max] range. - for (int i = 0; i <= max_dse - min_dse; ++i) { + for (int i = 0; i <= data_size; ++i) { DateValue dv(dis_dse(gen)); DCHECK(dv.IsValid()); date_.push_back(dv); @@ -68,6 +70,8 @@ public: cctz_to_ymd_result_.resize(date_.size()); to_ymd_result_.resize(date_.size()); to_year_result_.resize(date_.size()); + to_string_result_.resize(date_.size()); + to_string_old_result_.resize(date_.size()); } void CctzToYearMonthDay(const DateValue& dv, int* year, int* month, int* day) const { @@ -116,6 +120,31 @@ public: } } + void TestToString(int batch_size) { + for (int i = 0; i < batch_size; ++i) { + int n = date_.size(); + for (int j = 0; j < n; ++j) { + to_string_result_[j] = date_[j].ToString(); + } + } + } + + void TestToString_stringstream(int batch_size) { + for (int i = 0; i < batch_size; ++i) { + int n = date_.size(); + for (int j = 0; j < n; ++j) { + // Old implementation before IMPALA-12111. + stringstream ss; + int year, month, day; + if (date_[j].ToYearMonthDay(&year, &month, &day)) { + ss << std::setfill('0') << setw(4) << year << "-" << setw(2) << month << "-" + << setw(2) << day; + } + to_string_old_result_[j] = ss.str(); + } + } + } + bool CheckResults() { DCHECK(to_ymd_result_.size() == cctz_to_ymd_result_.size()); DCHECK(to_year_result_.size() == cctz_to_ymd_result_.size()); @@ -133,6 +162,11 @@ public: << endl; ok = false; } + if (to_string_result_[i] != to_string_old_result_[i]) { + cerr << "Incorrect results (ToString() vs TestToString_stringstream()): " + << to_string_result_[i] << " != " << to_string_old_result_[i] << endl; + ok = false; + } } return ok; } @@ -159,6 +193,8 @@ private: vector<YearMonthDayResult> cctz_to_ymd_result_; vector<YearMonthDayResult> to_ymd_result_; vector<int> to_year_result_; + vector<string> to_string_result_; + vector<string> to_string_old_result_; }; void TestCctzToYearMonthDay(int batch_size, void* d) { @@ -176,19 +212,33 @@ void TestToYear(int batch_size, void* d) { data->TestToYear(batch_size); } +void TestToString(int batch_size, void* d) { + TestData* data = reinterpret_cast<TestData*>(d); + data->TestToString(batch_size); +} + +void TestToString_stringstream(int batch_size, void* d) { + TestData* data = reinterpret_cast<TestData*>(d); + data->TestToString_stringstream(batch_size); +} + int main(int argc, char* argv[]) { CpuInfo::Init(); cout << Benchmark::GetMachineInfo() << endl; + datetime_parse_util::SimpleDateFormatTokenizer::InitCtx(); + TestData data; - data.AddRandomRange(DateValue(1965, 1, 1), DateValue(2020, 12, 31)); + data.AddRandomRange(DateValue(1965, 1, 1), DateValue(2020, 12, 31), 1000); - // Benchmark TestData::CctzToYearMonthDay(), DateValue::ToYearMonthDay() and - // DateValue::ToYear(). + // Benchmark TestData::CctzToYearMonthDay(), DateValue::ToYearMonthDay(), + // DateValue::ToYear() and DateValue().ToString's current and old implementation. Benchmark suite("ToYearMonthDay"); suite.AddBenchmark("TestCctzToYearMonthDay", TestCctzToYearMonthDay, &data); suite.AddBenchmark("TestToYearMonthDay", TestToYearMonthDay, &data); suite.AddBenchmark("TestToYear", TestToYear, &data); + suite.AddBenchmark("TestToString", TestToString, &data); + suite.AddBenchmark("TestToString_stringstream", TestToString_stringstream, &data); cout << suite.Measure(); return data.CheckResults() ? 0 : 1; diff --git a/be/src/runtime/date-value.cc b/be/src/runtime/date-value.cc index f14436840..78db95eb0 100644 --- a/be/src/runtime/date-value.cc +++ b/be/src/runtime/date-value.cc @@ -21,6 +21,7 @@ #include <iomanip> #include "cctz/civil_time.h" #include "runtime/date-parse-util.h" +#include "runtime/datetime-simple-date-format-parser.h" #include "common/names.h" @@ -29,6 +30,7 @@ namespace impala { using datetime_parse_util::DateTimeFormatContext; using datetime_parse_util::GetMonthAndDayFromDaysSinceJan1; using datetime_parse_util::IsLeapYear; +using datetime_parse_util::SimpleDateFormatTokenizer; const int EPOCH_YEAR = 1970; const int MIN_YEAR = 1; @@ -427,13 +429,7 @@ bool DateValue::MonthsBetween(const DateValue& other, double* months_between) co } string DateValue::ToString() const { - stringstream ss; - int year, month, day; - if (ToYearMonthDay(&year, &month, &day)) { - ss << std::setfill('0') << setw(4) << year << "-" << setw(2) << month << "-" - << setw(2) << day; - } - return ss.str(); + return Format(*SimpleDateFormatTokenizer::GetDefaultDateFormatContext()); } ostream& operator<<(ostream& os, const DateValue& date_value) { diff --git a/be/src/runtime/datetime-simple-date-format-parser.h b/be/src/runtime/datetime-simple-date-format-parser.h index 2e0ce98bb..7bd173cce 100644 --- a/be/src/runtime/datetime-simple-date-format-parser.h +++ b/be/src/runtime/datetime-simple-date-format-parser.h @@ -125,6 +125,10 @@ public: &DEFAULT_SHORT_DATE_TIME_CTX; } + static ALWAYS_INLINE const DateTimeFormatContext* GetDefaultDateFormatContext() { + return &DEFAULT_DATE_CTX; + } + /// Initialize the default format contexts. This *must* be called before using /// GetDefaultFormatContext(). static void InitCtx(); diff --git a/be/src/runtime/raw-value.cc b/be/src/runtime/raw-value.cc index be8058db0..928d36016 100644 --- a/be/src/runtime/raw-value.cc +++ b/be/src/runtime/raw-value.cc @@ -97,11 +97,9 @@ void RawValue::PrintValue(const void* value, const ColumnType& type, int scale, return; } - stringstream out; - out.precision(ASCII_PRECISION); const StringValue* string_val = NULL; - string tmp; bool val; + string tmp; // Special case types that we can print more efficiently without using a stringstream switch (type.type) { @@ -118,12 +116,24 @@ void RawValue::PrintValue(const void* value, const ColumnType& type, int scale, case TYPE_CHAR: *str = string(reinterpret_cast<const char*>(value), type.len); return; + case TYPE_TIMESTAMP: + *str = reinterpret_cast<const TimestampValue*>(value)->ToString(); + return; + case TYPE_DATE: + *str = reinterpret_cast<const DateValue*>(value)->ToString(); + return; case TYPE_FIXED_UDA_INTERMEDIATE: *str = "Intermediate UDA step, no value printed"; return; default: - PrintValue(value, type, scale, &out); + break; } + + stringstream out; + out.precision(ASCII_PRECISION); + + PrintValue(value, type, scale, &out); + *str = out.str(); } diff --git a/be/src/service/hs2-util.cc b/be/src/service/hs2-util.cc index 20279699c..3d36876cb 100644 --- a/be/src/service/hs2-util.cc +++ b/be/src/service/hs2-util.cc @@ -255,8 +255,7 @@ static void TimestampExprValuesToHS2TColumn(ScalarExprEvaluator* expr_eval, column->stringVal.values.emplace_back(); if (!val.is_null) { TimestampValue value = TimestampValue::FromTimestampVal(val); - RawValue::PrintValue( - &value, ColumnType(TYPE_TIMESTAMP), -1, &(column->stringVal.values.back())); + column->stringVal.values.back() = value.ToString(); } SetNullBit(output_row_idx, val.is_null, &column->stringVal.nulls); ++output_row_idx; @@ -273,8 +272,7 @@ static void DateExprValuesToHS2TColumn(ScalarExprEvaluator* expr_eval, column->stringVal.values.emplace_back(); if (!val.is_null) { DateValue value = DateValue::FromDateVal(val); - RawValue::PrintValue( - &value, ColumnType(TYPE_DATE), -1, &(column->stringVal.values.back())); + column->stringVal.values.back() = value.ToString(); } SetNullBit(output_row_idx, val.is_null, &column->stringVal.nulls); ++output_row_idx;
