This is an automated email from the ASF dual-hosted git repository.
lihaopeng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new bb5b05b1f44 [opt](parse) optimize parsing string to datetime (#38385)
bb5b05b1f44 is described below
commit bb5b05b1f447bddf21fe1b4ed8c8b720aaa8291f
Author: zclllhhjj <[email protected]>
AuthorDate: Mon Jul 29 23:52:41 2024 +0800
[opt](parse) optimize parsing string to datetime (#38385)
---
be/src/vec/functions/function_cast.h | 81 ++++++++++++++--------------------
be/src/vec/runtime/vdatetime_value.cpp | 29 +++++++-----
2 files changed, 51 insertions(+), 59 deletions(-)
diff --git a/be/src/vec/functions/function_cast.h
b/be/src/vec/functions/function_cast.h
index af2fadc84c2..5f3968e512b 100644
--- a/be/src/vec/functions/function_cast.h
+++ b/be/src/vec/functions/function_cast.h
@@ -978,9 +978,9 @@ struct NameToDateTime {
static constexpr auto name = "toDateTime";
};
-template <typename DataType, typename Additions = void*, typename FromDataType
= void*>
+template <typename DataType, typename FromDataType = void*>
bool try_parse_impl(typename DataType::FieldType& x, ReadBuffer& rb,
FunctionContext* context,
- Additions additions [[maybe_unused]] = Additions()) {
+ UInt32 scale [[maybe_unused]] = 0) {
if constexpr (IsDateTimeType<DataType>) {
return try_read_datetime_text(x, rb, context->state()->timezone_obj());
}
@@ -994,7 +994,6 @@ bool try_parse_impl(typename DataType::FieldType& x,
ReadBuffer& rb, FunctionCon
}
if constexpr (IsDateTimeV2Type<DataType>) {
- UInt32 scale = additions;
return try_read_datetime_v2_text(x, rb,
context->state()->timezone_obj(), scale);
}
@@ -1032,7 +1031,6 @@ bool try_parse_impl(typename DataType::FieldType& x,
ReadBuffer& rb, FunctionCon
template <typename DataType, typename Additions = void*>
StringParser::ParseResult try_parse_decimal_impl(typename DataType::FieldType&
x, ReadBuffer& rb,
- const cctz::time_zone&
local_time_zone,
Additions additions
[[maybe_unused]] =
Additions()) {
if constexpr (IsDataTypeDecimalV2<DataType>) {
@@ -1461,15 +1459,9 @@ private:
const char* name;
};
-struct NameCast {
- static constexpr auto name = "CAST";
-};
-
-template <typename FromDataType, typename ToDataType, typename Name>
-struct ConvertThroughParsing {
- static_assert(std::is_same_v<FromDataType, DataTypeString>,
- "ConvertThroughParsing is only applicable for String or
FixedString data types");
-
+// always from DataTypeString
+template <typename ToDataType, typename Name>
+struct StringParsing {
using ToFieldType = typename ToDataType::FieldType;
static bool is_all_read(ReadBuffer& in) { return in.eof(); }
@@ -1482,48 +1474,38 @@ struct ConvertThroughParsing {
ColumnDecimal<ToFieldType>,
ColumnVector<ToFieldType>>;
const IColumn* col_from =
block.get_by_position(arguments[0]).column.get();
- const ColumnString* col_from_string =
check_and_get_column<ColumnString>(col_from);
+ const auto* col_from_string =
check_and_get_column<ColumnString>(col_from);
- if (std::is_same_v<FromDataType, DataTypeString> && !col_from_string) {
+ if (!col_from_string) {
return Status::RuntimeError("Illegal column {} of first argument
of function {}",
col_from->get_name(), Name::name);
}
- size_t size = input_rows_count;
+ size_t row = input_rows_count;
typename ColVecTo::MutablePtr col_to = nullptr;
if constexpr (IsDataTypeDecimal<ToDataType>) {
UInt32 scale = ((PrecisionScaleArg)additions).scale;
ToDataType::check_type_scale(scale);
- col_to = ColVecTo::create(size, scale);
+ col_to = ColVecTo::create(row, scale);
} else {
- col_to = ColVecTo::create(size);
+ col_to = ColVecTo::create(row);
}
typename ColVecTo::Container& vec_to = col_to->get_data();
ColumnUInt8::MutablePtr col_null_map_to;
ColumnUInt8::Container* vec_null_map_to [[maybe_unused]] = nullptr;
- col_null_map_to = ColumnUInt8::create(size);
+ col_null_map_to = ColumnUInt8::create(row);
vec_null_map_to = &col_null_map_to->get_data();
- const ColumnString::Chars* chars = nullptr;
- const IColumn::Offsets* offsets = nullptr;
- size_t fixed_string_size = 0;
-
- if constexpr (std::is_same_v<FromDataType, DataTypeString>) {
- chars = &col_from_string->get_chars();
- offsets = &col_from_string->get_offsets();
- }
+ const ColumnString::Chars* chars = &col_from_string->get_chars();
+ const IColumn::Offsets* offsets = &col_from_string->get_offsets();
size_t current_offset = 0;
- for (size_t i = 0; i < size; ++i) {
- size_t next_offset = std::is_same_v<FromDataType, DataTypeString>
- ? (*offsets)[i]
- : (current_offset +
fixed_string_size);
- size_t string_size = std::is_same_v<FromDataType, DataTypeString>
- ? next_offset - current_offset
- : fixed_string_size;
+ for (size_t i = 0; i < row; ++i) {
+ size_t next_offset = (*offsets)[i];
+ size_t string_size = next_offset - current_offset;
ReadBuffer read_buffer(&(*chars)[current_offset], string_size);
@@ -1531,8 +1513,7 @@ struct ConvertThroughParsing {
if constexpr (IsDataTypeDecimal<ToDataType>) {
ToDataType::check_type_precision((PrecisionScaleArg(additions).precision));
StringParser::ParseResult res =
try_parse_decimal_impl<ToDataType>(
- vec_to[i], read_buffer,
context->state()->timezone_obj(),
- PrecisionScaleArg(additions));
+ vec_to[i], read_buffer, PrecisionScaleArg(additions));
parsed = (res == StringParser::PARSE_SUCCESS ||
res == StringParser::PARSE_OVERFLOW ||
res == StringParser::PARSE_UNDERFLOW);
@@ -1542,8 +1523,8 @@ struct ConvertThroughParsing {
parsed = try_parse_impl<ToDataType>(vec_to[i], read_buffer,
context,
type->get_scale());
} else {
- parsed = try_parse_impl<ToDataType, void*,
FromDataType>(vec_to[i], read_buffer,
-
context);
+ parsed =
+ try_parse_impl<ToDataType, DataTypeString>(vec_to[i],
read_buffer, context);
}
(*vec_null_map_to)[i] = !parsed || !is_all_read(read_buffer);
current_offset = next_offset;
@@ -1557,25 +1538,27 @@ struct ConvertThroughParsing {
template <typename Name>
struct ConvertImpl<DataTypeString, DataTypeDecimal<Decimal32>, Name>
- : ConvertThroughParsing<DataTypeString, DataTypeDecimal<Decimal32>,
Name> {};
+ : StringParsing<DataTypeDecimal<Decimal32>, Name> {};
template <typename Name>
struct ConvertImpl<DataTypeString, DataTypeDecimal<Decimal64>, Name>
- : ConvertThroughParsing<DataTypeString, DataTypeDecimal<Decimal64>,
Name> {};
+ : StringParsing<DataTypeDecimal<Decimal64>, Name> {};
template <typename Name>
struct ConvertImpl<DataTypeString, DataTypeDecimal<Decimal128V2>, Name>
- : ConvertThroughParsing<DataTypeString, DataTypeDecimal<Decimal128V2>,
Name> {};
+ : StringParsing<DataTypeDecimal<Decimal128V2>, Name> {};
template <typename Name>
struct ConvertImpl<DataTypeString, DataTypeDecimal<Decimal128V3>, Name>
- : ConvertThroughParsing<DataTypeString, DataTypeDecimal<Decimal128V3>,
Name> {};
+ : StringParsing<DataTypeDecimal<Decimal128V3>, Name> {};
template <typename Name>
struct ConvertImpl<DataTypeString, DataTypeDecimal<Decimal256>, Name>
- : ConvertThroughParsing<DataTypeString, DataTypeDecimal<Decimal256>,
Name> {};
+ : StringParsing<DataTypeDecimal<Decimal256>, Name> {};
template <typename Name>
-struct ConvertImpl<DataTypeString, DataTypeIPv4, Name>
- : ConvertThroughParsing<DataTypeString, DataTypeIPv4, Name> {};
+struct ConvertImpl<DataTypeString, DataTypeIPv4, Name> :
StringParsing<DataTypeIPv4, Name> {};
template <typename Name>
-struct ConvertImpl<DataTypeString, DataTypeIPv6, Name>
- : ConvertThroughParsing<DataTypeString, DataTypeIPv6, Name> {};
+struct ConvertImpl<DataTypeString, DataTypeIPv6, Name> :
StringParsing<DataTypeIPv6, Name> {};
+
+struct NameCast { // name tag; presumably passed as the `Name` template argument, whose Name::name is used in error messages
+    static constexpr auto name = "CAST";
+};
template <typename ToDataType, typename Name>
class FunctionConvertFromString : public IFunction {
@@ -1610,8 +1593,8 @@ public:
const IDataType* from_type =
block.get_by_position(arguments[0]).type.get();
if (check_and_get_data_type<DataTypeString>(from_type)) {
- return ConvertThroughParsing<DataTypeString, ToDataType,
Name>::execute(
- context, block, arguments, result, input_rows_count);
+ return StringParsing<ToDataType, Name>::execute(context, block,
arguments, result,
+ input_rows_count);
}
return Status::RuntimeError(
diff --git a/be/src/vec/runtime/vdatetime_value.cpp
b/be/src/vec/runtime/vdatetime_value.cpp
index 610983a149d..877573bcccb 100644
--- a/be/src/vec/runtime/vdatetime_value.cpp
+++ b/be/src/vec/runtime/vdatetime_value.cpp
@@ -55,6 +55,15 @@ uint8_t mysql_week_mode(uint32_t mode) {
return mode;
}
+static bool check_space(char ch) {
+    // Whitespace here means ' ' or one of \t, \n, \v, \f, \r, which occupy the
+    // contiguous code range 9..13.
+    const bool is_ctrl_whitespace = ch >= 9 && ch <= 13;
+    return UNLIKELY(is_ctrl_whitespace || ch == ' ');
+}
+
+// Returns true when `ch` is neither a digit nor a letter; the date parsers use
+// this to skip separator characters between fields.
+static bool check_date_punct(char ch) {
+    // isdigit/isalpha require an argument representable as unsigned char (or
+    // EOF). A plain char holding a byte >= 0x80 can be negative, which is
+    // undefined behavior, so convert before classifying.
+    const auto uch = static_cast<unsigned char>(ch);
+    return UNLIKELY(!(isdigit(uch) || isalpha(uch)));
+}
+
static bool time_zone_begins(const char* ptr, const char* end) {
return *ptr == '+' || (*ptr == '-' && ptr + 3 < end && *(ptr + 3) == ':')
||
(isalpha(*ptr) && *ptr != 'T');
@@ -104,7 +113,7 @@ bool VecDateTimeValue::from_date_str_base(const char*
date_str, int len,
_neg = false;
// Skip space character
- while (ptr < end && isspace(*ptr)) {
+ while (ptr < end && check_space(*ptr)) {
ptr++;
}
if (ptr == end || !isdigit(*ptr)) {
@@ -202,8 +211,8 @@ bool VecDateTimeValue::from_date_str_base(const char*
date_str, int len,
continue;
}
// escape separator
- while (ptr < end && (ispunct(*ptr) || isspace(*ptr))) {
- if (isspace(*ptr)) {
+ while (ptr < end && (check_date_punct(*ptr) || check_space(*ptr))) {
+ if (check_space(*ptr)) {
if (((1 << field_idx) & allow_space_mask) == 0) {
return false;
}
@@ -1235,7 +1244,7 @@ bool VecDateTimeValue::from_date_format_str(const char*
format, int format_len,
auto [year, month, day, hour, minute, second] = std::tuple {0, 0, 0, 0, 0,
0};
while (ptr < end && val < val_end) {
// Skip space character
- while (val < val_end && isspace(*val)) {
+ while (val < val_end && check_space(*val)) {
val++;
}
if (val >= val_end) {
@@ -1500,7 +1509,7 @@ bool VecDateTimeValue::from_date_format_str(const char*
format, int format_len,
default:
return false;
}
- } else if (!isspace(*ptr)) {
+ } else if (!check_space(*ptr)) {
if (*ptr != *val) {
return false;
}
@@ -1987,13 +1996,13 @@ bool DateV2Value<T>::from_date_str(const char*
date_str, int len, int scale /* =
bool convert_zero) {
return from_date_str_base(date_str, len, scale, nullptr, convert_zero);
}
-// when we parse
template <typename T>
bool DateV2Value<T>::from_date_str(const char* date_str, int len,
const cctz::time_zone& local_time_zone, int
scale /* = -1*/,
bool convert_zero) {
return from_date_str_base(date_str, len, scale, &local_time_zone,
convert_zero);
}
+// If local_time_zone is null, only date/time strings without a time zone can be parsed.
template <typename T>
bool DateV2Value<T>::from_date_str_base(const char* date_str, int len, int
scale,
const cctz::time_zone*
local_time_zone, bool convert_zero) {
@@ -2005,7 +2014,7 @@ bool DateV2Value<T>::from_date_str_base(const char*
date_str, int len, int scale
int32_t date_len[MAX_DATE_PARTS] = {0};
// Skip space character
- while (ptr < end && isspace(*ptr)) {
+ while (ptr < end && check_space(*ptr)) {
ptr++;
}
if (ptr == end || !isdigit(*ptr)) {
@@ -2153,8 +2162,8 @@ bool DateV2Value<T>::from_date_str_base(const char*
date_str, int len, int scale
continue;
}
// escape separator
- while (ptr < end && (ispunct(*ptr) || isspace(*ptr))) {
- if (isspace(*ptr)) {
+ while (ptr < end && (check_date_punct(*ptr) || check_space(*ptr))) {
+ if (check_space(*ptr)) {
if (((1 << field_idx) & allow_space_mask) == 0) {
return false;
}
@@ -2286,7 +2295,7 @@ bool DateV2Value<T>::from_date_format_str(const char*
format, int format_len, co
auto [year, month, day, hour, minute, second, microsecond] = std::tuple
{0, 0, 0, 0, 0, 0, 0};
while (ptr < end && val < val_end) {
// Skip space character
- while (val < val_end && isspace(*val)) {
+ while (val < val_end && check_space(*val)) {
val++;
}
if (val >= val_end) {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]