This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push:
new 6932eef65e5 [opt](serde)Optimize the filling of fixed values into
block columns without repeated deserialization. (#37377) (#37530)
6932eef65e5 is described below
commit 6932eef65e54d2ae060b4973da3aec0d919d7154
Author: daidai <[email protected]>
AuthorDate: Tue Jul 16 10:56:13 2024 +0800
[opt](serde)Optimize the filling of fixed values into block columns
without repeated deserialization. (#37377) (#37530)
bp #37377
---
.../serde/data_type_datetimev2_serde.cpp | 21 +++++++++++++++++
.../data_types/serde/data_type_datetimev2_serde.h | 5 +++++
.../data_types/serde/data_type_datev2_serde.cpp | 21 +++++++++++++++++
.../vec/data_types/serde/data_type_datev2_serde.h | 6 +++++
.../data_types/serde/data_type_decimal_serde.cpp | 26 ++++++++++++++++++++++
.../vec/data_types/serde/data_type_decimal_serde.h | 6 +++++
.../data_types/serde/data_type_nullable_serde.cpp | 20 +++++++++++++++++
.../data_types/serde/data_type_nullable_serde.h | 3 +++
.../data_types/serde/data_type_number_serde.cpp | 22 ++++++++++++++++++
.../vec/data_types/serde/data_type_number_serde.h | 6 +++++
be/src/vec/data_types/serde/data_type_serde.h | 21 +++++++++++++++++
.../vec/data_types/serde/data_type_string_serde.h | 25 +++++++++++++++++++++
be/src/vec/exec/format/orc/vorc_reader.cpp | 9 +++-----
.../exec/format/parquet/vparquet_group_reader.cpp | 9 +++-----
be/src/vec/exec/scan/vfile_scanner.cpp | 9 +++-----
15 files changed, 191 insertions(+), 18 deletions(-)
diff --git a/be/src/vec/data_types/serde/data_type_datetimev2_serde.cpp
b/be/src/vec/data_types/serde/data_type_datetimev2_serde.cpp
index 63a199199a0..850ac5766fc 100644
--- a/be/src/vec/data_types/serde/data_type_datetimev2_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_datetimev2_serde.cpp
@@ -247,4 +247,25 @@ Status DataTypeDateTimeV2SerDe::write_column_to_orc(const
std::string& timezone,
return Status::OK();
}
+// Parses the single textual value in `slice` once and appends `rows` copies of it
+// to `column`, instead of deserializing the same text `rows` times.
+// On success *num_deserialized is set to `rows`. On a parse failure the error
+// status is returned and *num_deserialized is reset to 0, matching the default
+// implementation in DataTypeSerDe.
+Status DataTypeDateTimeV2SerDe::deserialize_column_from_fixed_json(
+        IColumn& column, Slice& slice, int rows, int* num_deserialized,
+        const FormatOptions& options) const {
+    // Nothing to fill: do not parse a cell that would leave a stray row behind,
+    // and never forward a negative repeat count (rows - 1).
+    if (rows < 1) {
+        *num_deserialized = 0;
+        return Status::OK();
+    }
+    Status st = deserialize_one_cell_from_json(column, slice, options);
+    if (!st.ok()) {
+        *num_deserialized = 0;
+        return st;
+    }
+    // The first row is already in place; only the remaining rows - 1 are copied.
+    if (rows > 1) {
+        DataTypeDateTimeV2SerDe::insert_column_last_value_multiple_times(column, rows - 1);
+    }
+    *num_deserialized = rows;
+    return Status::OK();
+}
+
+// Appends `times` more copies of the last element already stored in `column`.
+// `column` is treated as a ColumnVector<UInt64> and must hold at least one element.
+void DataTypeDateTimeV2SerDe::insert_column_last_value_multiple_times(IColumn& column,
+                                                                      int times) const {
+    // A non-positive count means there is nothing to append.
+    // NOTE(review): insert_many_vals presumably takes an unsigned count, so a
+    // negative int would be misinterpreted there -- confirm against ColumnVector.
+    if (times < 1) {
+        return;
+    }
+    auto& col = static_cast<ColumnVector<UInt64>&>(column);
+    const UInt64 val = col.get_element(col.size() - 1);
+    col.insert_many_vals(val, times);
+}
+
} // namespace doris::vectorized
diff --git a/be/src/vec/data_types/serde/data_type_datetimev2_serde.h
b/be/src/vec/data_types/serde/data_type_datetimev2_serde.h
index 00b05f5fcd6..ef4aa6843a0 100644
--- a/be/src/vec/data_types/serde/data_type_datetimev2_serde.h
+++ b/be/src/vec/data_types/serde/data_type_datetimev2_serde.h
@@ -77,6 +77,11 @@ public:
int start, int end,
std::vector<StringRef>& buffer_list) const
override;
+ // Parses `slice` once and fills `column` with `rows` copies of that value;
+ // *num_deserialized receives `rows` on success.
+ Status deserialize_column_from_fixed_json(IColumn& column, Slice& slice,
int rows,
+ int* num_deserialized,
+ const FormatOptions& options)
const override;
+ // Appends `times` more copies of the column's last element.
+ void insert_column_last_value_multiple_times(IColumn& column, int times)
const override;
+
private:
template <bool is_binary_format>
Status _write_column_to_mysql(const IColumn& column,
MysqlRowBuffer<is_binary_format>& result,
diff --git a/be/src/vec/data_types/serde/data_type_datev2_serde.cpp
b/be/src/vec/data_types/serde/data_type_datev2_serde.cpp
index eb9122dd240..f2d595b87c4 100644
--- a/be/src/vec/data_types/serde/data_type_datev2_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_datev2_serde.cpp
@@ -175,5 +175,26 @@ Status DataTypeDateV2SerDe::write_column_to_orc(const
std::string& timezone, con
return Status::OK();
}
+// Parses the single textual value in `slice` once and appends `rows` copies of it
+// to `column`, instead of deserializing the same text `rows` times.
+// On success *num_deserialized is set to `rows`. On a parse failure the error
+// status is returned and *num_deserialized is reset to 0, matching the default
+// implementation in DataTypeSerDe.
+Status DataTypeDateV2SerDe::deserialize_column_from_fixed_json(IColumn& column, Slice& slice,
+                                                               int rows, int* num_deserialized,
+                                                               const FormatOptions& options) const {
+    // Nothing to fill: do not parse a cell that would leave a stray row behind,
+    // and never forward a negative repeat count (rows - 1).
+    if (rows < 1) {
+        *num_deserialized = 0;
+        return Status::OK();
+    }
+    Status st = deserialize_one_cell_from_json(column, slice, options);
+    if (!st.ok()) {
+        *num_deserialized = 0;
+        return st;
+    }
+    // The first row is already in place; only the remaining rows - 1 are copied.
+    if (rows > 1) {
+        DataTypeDateV2SerDe::insert_column_last_value_multiple_times(column, rows - 1);
+    }
+    *num_deserialized = rows;
+    return Status::OK();
+}
+
+// Appends `times` more copies of the last element already stored in `column`.
+// `column` is treated as a ColumnVector<UInt32> and must hold at least one element.
+void DataTypeDateV2SerDe::insert_column_last_value_multiple_times(IColumn& column,
+                                                                  int times) const {
+    // A non-positive count means there is nothing to append.
+    // NOTE(review): insert_many_vals presumably takes an unsigned count, so a
+    // negative int would be misinterpreted there -- confirm against ColumnVector.
+    if (times < 1) {
+        return;
+    }
+    auto& col = static_cast<ColumnVector<UInt32>&>(column);
+    const UInt32 val = col.get_element(col.size() - 1);
+    col.insert_many_vals(val, times);
+}
+
} // namespace vectorized
} // namespace doris
diff --git a/be/src/vec/data_types/serde/data_type_datev2_serde.h
b/be/src/vec/data_types/serde/data_type_datev2_serde.h
index 9a8b050eeba..52e4cec364e 100644
--- a/be/src/vec/data_types/serde/data_type_datev2_serde.h
+++ b/be/src/vec/data_types/serde/data_type_datev2_serde.h
@@ -74,6 +74,12 @@ public:
int start, int end,
std::vector<StringRef>& buffer_list) const
override;
+ // Parses `slice` once and fills `column` with `rows` copies of that value;
+ // *num_deserialized receives `rows` on success.
+ Status deserialize_column_from_fixed_json(IColumn& column, Slice& slice,
int rows,
+ int* num_deserialized,
+ const FormatOptions& options)
const override;
+
+ // Appends `times` more copies of the column's last element.
+ void insert_column_last_value_multiple_times(IColumn& column, int times)
const override;
+
private:
template <bool is_binary_format>
Status _write_column_to_mysql(const IColumn& column,
MysqlRowBuffer<is_binary_format>& result,
diff --git a/be/src/vec/data_types/serde/data_type_decimal_serde.cpp
b/be/src/vec/data_types/serde/data_type_decimal_serde.cpp
index a59fdedbfe6..e979211d6d7 100644
--- a/be/src/vec/data_types/serde/data_type_decimal_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_decimal_serde.cpp
@@ -275,6 +275,32 @@ Status DataTypeDecimalSerDe<T>::write_column_to_orc(const
std::string& timezone,
}
return Status::OK();
}
+// Parses the single textual value in `slice` once and appends `rows` copies of it
+// to `column`, instead of deserializing the same text `rows` times.
+// On success *num_deserialized is set to `rows`. On a parse failure the error
+// status is returned and *num_deserialized is reset to 0, matching the default
+// implementation in DataTypeSerDe.
+template <typename T>
+Status DataTypeDecimalSerDe<T>::deserialize_column_from_fixed_json(
+        IColumn& column, Slice& slice, int rows, int* num_deserialized,
+        const FormatOptions& options) const {
+    // Nothing to fill: parsing a cell here would append one row to the column
+    // while reporting zero rows deserialized.
+    if (rows < 1) {
+        *num_deserialized = 0;
+        return Status::OK();
+    }
+    Status st = deserialize_one_cell_from_json(column, slice, options);
+    if (!st.ok()) {
+        *num_deserialized = 0;
+        return st;
+    }
+    // The first row is already in place; only the remaining rows - 1 are copied.
+    if (rows > 1) {
+        DataTypeDecimalSerDe::insert_column_last_value_multiple_times(column, rows - 1);
+    }
+    *num_deserialized = rows;
+    return Status::OK();
+}
+
+// Appends `times` more copies of the last element already stored in `column`.
+// `column` is treated as a ColumnDecimal<T> and must hold at least one element.
+template <typename T>
+void DataTypeDecimalSerDe<T>::insert_column_last_value_multiple_times(IColumn& column,
+                                                                      int times) const {
+    // Nothing to append: return before touching the last element.
+    if (times < 1) {
+        return;
+    }
+    auto& col = static_cast<ColumnDecimal<T>&>(column);
+    const auto sz = col.size();
+
+    // Take the value by copy before the column grows.
+    const T val = col.get_element(sz - 1);
+    // Reserve the final size up front so the loop does not grow the column
+    // one element at a time.
+    col.reserve(sz + times);
+    for (int i = 0; i < times; i++) {
+        col.insert_value(val);
+    }
+}
template class DataTypeDecimalSerDe<Decimal32>;
template class DataTypeDecimalSerDe<Decimal64>;
diff --git a/be/src/vec/data_types/serde/data_type_decimal_serde.h
b/be/src/vec/data_types/serde/data_type_decimal_serde.h
index 55e68699f01..484c6686bc5 100644
--- a/be/src/vec/data_types/serde/data_type_decimal_serde.h
+++ b/be/src/vec/data_types/serde/data_type_decimal_serde.h
@@ -114,6 +114,12 @@ public:
int start, int end,
std::vector<StringRef>& buffer_list) const
override;
+ // Parses `slice` once and fills `column` with `rows` copies of that value;
+ // *num_deserialized receives `rows` on success.
+ Status deserialize_column_from_fixed_json(IColumn& column, Slice& slice,
int rows,
+ int* num_deserialized,
+ const FormatOptions& options)
const override;
+
+ // Appends `times` more copies of the column's last element.
+ void insert_column_last_value_multiple_times(IColumn& column, int times)
const override;
+
private:
template <bool is_binary_format>
Status _write_column_to_mysql(const IColumn& column,
MysqlRowBuffer<is_binary_format>& result,
diff --git a/be/src/vec/data_types/serde/data_type_nullable_serde.cpp
b/be/src/vec/data_types/serde/data_type_nullable_serde.cpp
index faa3c8eb1f4..98ff1eb7f81 100644
--- a/be/src/vec/data_types/serde/data_type_nullable_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_nullable_serde.cpp
@@ -127,6 +127,26 @@ Status
DataTypeNullableSerDe::deserialize_column_from_hive_text_vector(
return Status::OK();
}
+// Parses `slice` once as a nullable cell and repeats that cell (null flag and
+// nested value) so that `rows` new rows are appended to `column`.
+// On success *num_deserialized is set to `rows`. On a parse failure the error
+// status is returned and *num_deserialized is reset to 0, matching the default
+// implementation in DataTypeSerDe.
+Status DataTypeNullableSerDe::deserialize_column_from_fixed_json(
+        IColumn& column, Slice& slice, int rows, int* num_deserialized,
+        const FormatOptions& options) const {
+    // Nothing to fill: parsing a cell and then resizing the null map to `rows`
+    // would leave the null map and the nested column with different lengths.
+    if (rows < 1) {
+        *num_deserialized = 0;
+        return Status::OK();
+    }
+    auto& col = static_cast<ColumnNullable&>(column);
+    Status st = deserialize_one_cell_from_json(column, slice, options);
+    if (!st.ok()) {
+        *num_deserialized = 0;
+        return st;
+    }
+    if (rows > 1) {
+        auto& null_map = col.get_null_map_data();
+        auto& nested_column = col.get_nested_column();
+        // Copy the flag first: it must not refer into the buffer being resized.
+        const auto null_flag = null_map.back();
+        // Grow relative to the current size. An absolute size of `rows` is only
+        // correct when the column was empty before this call.
+        null_map.resize_fill(null_map.size() + rows - 1, null_flag);
+        nested_serde->insert_column_last_value_multiple_times(nested_column, rows - 1);
+    }
+    *num_deserialized = rows;
+    return Status::OK();
+}
+
Status DataTypeNullableSerDe::deserialize_one_cell_from_json(IColumn& column,
Slice& slice,
const
FormatOptions& options) const {
auto& null_column = assert_cast<ColumnNullable&>(column);
diff --git a/be/src/vec/data_types/serde/data_type_nullable_serde.h
b/be/src/vec/data_types/serde/data_type_nullable_serde.h
index 09d2fbde409..7b4841dcbdf 100644
--- a/be/src/vec/data_types/serde/data_type_nullable_serde.h
+++ b/be/src/vec/data_types/serde/data_type_nullable_serde.h
@@ -47,6 +47,9 @@ public:
int* num_deserialized,
const FormatOptions& options)
const override;
+ // Parses `slice` once as a nullable cell, then repeats that cell's null flag
+ // and nested value so the fixed value fills `rows` rows;
+ // *num_deserialized receives `rows` on success.
+ Status deserialize_column_from_fixed_json(IColumn& column, Slice& slice,
int rows,
+ int* num_deserialized,
+ const FormatOptions& options)
const override;
Status deserialize_one_cell_from_hive_text(
IColumn& column, Slice& slice, const FormatOptions& options,
int hive_text_complex_type_delimiter_level = 1) const override;
diff --git a/be/src/vec/data_types/serde/data_type_number_serde.cpp
b/be/src/vec/data_types/serde/data_type_number_serde.cpp
index 0ba338ce399..299779ea267 100644
--- a/be/src/vec/data_types/serde/data_type_number_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_number_serde.cpp
@@ -224,6 +224,28 @@ void
DataTypeNumberSerDe<T>::read_column_from_arrow(IColumn& column,
const auto* raw_data = reinterpret_cast<const T*>(buffer->data()) + start;
col_data.insert(raw_data, raw_data + row_count);
}
+// Parses the single textual value in `slice` once and appends `rows` copies of it
+// to `column`, instead of deserializing the same text `rows` times.
+// On success *num_deserialized is set to `rows`. On a parse failure the error
+// status is returned and *num_deserialized is reset to 0, matching the default
+// implementation in DataTypeSerDe.
+template <typename T>
+Status DataTypeNumberSerDe<T>::deserialize_column_from_fixed_json(
+        IColumn& column, Slice& slice, int rows, int* num_deserialized,
+        const FormatOptions& options) const {
+    // Nothing to fill: do not parse a cell that would leave a stray row behind,
+    // and never forward a negative repeat count (rows - 1).
+    if (rows < 1) {
+        *num_deserialized = 0;
+        return Status::OK();
+    }
+    Status st = deserialize_one_cell_from_json(column, slice, options);
+    if (!st.ok()) {
+        *num_deserialized = 0;
+        return st;
+    }
+    // The first row is already in place; only the remaining rows - 1 are copied.
+    if (rows > 1) {
+        DataTypeNumberSerDe::insert_column_last_value_multiple_times(column, rows - 1);
+    }
+    *num_deserialized = rows;
+    return Status::OK();
+}
+
+// Appends `times` more copies of the last element already stored in `column`.
+// `column` is treated as a ColumnVector<T> and must hold at least one element.
+template <typename T>
+void DataTypeNumberSerDe<T>::insert_column_last_value_multiple_times(IColumn& column,
+                                                                     int times) const {
+    // A non-positive count means there is nothing to append.
+    // NOTE(review): insert_many_vals presumably takes an unsigned count, so a
+    // negative int would be misinterpreted there -- confirm against ColumnVector.
+    if (times < 1) {
+        return;
+    }
+    auto& col = static_cast<ColumnVector<T>&>(column);
+    const T val = col.get_element(col.size() - 1);
+    col.insert_many_vals(val, times);
+}
template <typename T>
template <bool is_binary_format>
diff --git a/be/src/vec/data_types/serde/data_type_number_serde.h
b/be/src/vec/data_types/serde/data_type_number_serde.h
index c66bc994605..18ba2fb26c7 100644
--- a/be/src/vec/data_types/serde/data_type_number_serde.h
+++ b/be/src/vec/data_types/serde/data_type_number_serde.h
@@ -70,6 +70,12 @@ public:
int* num_deserialized,
const FormatOptions& options)
const override;
+ // Parses `slice` once and fills `column` with `rows` copies of that value;
+ // *num_deserialized receives `rows` on success.
+ Status deserialize_column_from_fixed_json(IColumn& column, Slice& slice,
int rows,
+ int* num_deserialized,
+ const FormatOptions& options)
const override;
+
+ // Appends `times` more copies of the column's last element.
+ void insert_column_last_value_multiple_times(IColumn& column, int times)
const override;
+
Status write_column_to_pb(const IColumn& column, PValues& result, int
start,
int end) const override;
Status read_column_from_pb(IColumn& column, const PValues& arg) const
override;
diff --git a/be/src/vec/data_types/serde/data_type_serde.h
b/be/src/vec/data_types/serde/data_type_serde.h
index 77663e1d43a..1f6e24aef3f 100644
--- a/be/src/vec/data_types/serde/data_type_serde.h
+++ b/be/src/vec/data_types/serde/data_type_serde.h
@@ -234,6 +234,27 @@ public:
virtual Status deserialize_column_from_json_vector(IColumn& column,
std::vector<Slice>& slices,
int* num_deserialized,
const FormatOptions&
options) const = 0;
+    // Deserialize a fixed value: parse `slice` once, then repeat the parsed cell
+    // so that `rows` rows in total are appended to the column.
+    // *num_deserialized is set to `rows` on success and to 0 when parsing fails
+    // or when `rows` is not positive.
+    virtual Status deserialize_column_from_fixed_json(IColumn& column, Slice& slice, int rows,
+                                                      int* num_deserialized,
+                                                      const FormatOptions& options) const {
+        // Nothing to fill: do not parse a cell that would leave a stray row
+        // behind, and never forward a negative repeat count (rows - 1).
+        if (rows < 1) {
+            *num_deserialized = 0;
+            return Status::OK();
+        }
+        Status st = deserialize_one_cell_from_json(column, slice, options);
+        if (!st.ok()) {
+            *num_deserialized = 0;
+            return st;
+        }
+        // The first row is already in place; only the remaining rows - 1 are copied.
+        if (rows > 1) {
+            insert_column_last_value_multiple_times(column, rows - 1);
+        }
+        *num_deserialized = rows;
+        return Status::OK();
+    }
+    // Insert the last value to the end of this column `times` more times.
+    // The column must already hold at least one value.
+    virtual void insert_column_last_value_multiple_times(IColumn& column, int times) const {
+        // A non-positive count means there is nothing to append.
+        if (times < 1) {
+            return;
+        }
+        // If you try to simplify this operation by using
+        // `column.insert_many_from(column, column.size() - 1, times);`
+        // you are likely to get incorrect data results, so the last value is
+        // first copied into a separate one-row column and inserted from there.
+        MutableColumnPtr dum_col = column.clone_empty();
+        dum_col->insert_from(column, column.size() - 1);
+        column.insert_many_from(*dum_col.get(), 0, times);
+    }
virtual Status deserialize_one_cell_from_hive_text(
IColumn& column, Slice& slice, const FormatOptions& options,
diff --git a/be/src/vec/data_types/serde/data_type_string_serde.h
b/be/src/vec/data_types/serde/data_type_string_serde.h
index b74b5857086..0f0f1d0dfe8 100644
--- a/be/src/vec/data_types/serde/data_type_string_serde.h
+++ b/be/src/vec/data_types/serde/data_type_string_serde.h
@@ -132,6 +132,31 @@ public:
}
return Status::OK();
}
+
+    // Parses the single textual value in `slice` once and appends `rows` copies
+    // of it to `column`, instead of deserializing the same text `rows` times.
+    // On success *num_deserialized is set to `rows`. On a parse failure the error
+    // status is returned and *num_deserialized is reset to 0, matching the
+    // default implementation in DataTypeSerDe.
+    Status deserialize_column_from_fixed_json(IColumn& column, Slice& slice, int rows,
+                                              int* num_deserialized,
+                                              const FormatOptions& options) const override {
+        // Nothing to fill: do not parse a cell that would leave a stray row
+        // behind, and never forward a negative repeat count (rows - 1).
+        if (rows < 1) {
+            *num_deserialized = 0;
+            return Status::OK();
+        }
+        Status st = deserialize_one_cell_from_json(column, slice, options);
+        if (!st.ok()) {
+            *num_deserialized = 0;
+            return st;
+        }
+        // The first row is already in place; only the remaining rows - 1 are copied.
+        if (rows > 1) {
+            DataTypeStringSerDeBase::insert_column_last_value_multiple_times(column, rows - 1);
+        }
+        *num_deserialized = rows;
+        return Status::OK();
+    }
+
+    // Appends `times` more copies of the last string already stored in `column`.
+    // `column` is treated as a ColumnString and must hold at least one element.
+    void insert_column_last_value_multiple_times(IColumn& column, int times) const override {
+        // A negative count would be converted to a huge size_type by the
+        // std::vector(count, value) constructor below, so reject it up front.
+        if (times < 1) {
+            return;
+        }
+        auto& col = static_cast<ColumnString&>(column);
+        const auto sz = col.size();
+
+        // Copy the bytes into a local string before inserting.
+        // NOTE(review): presumably the refs must not point into col's own
+        // buffer while it grows -- confirm against ColumnString.
+        const StringRef ref = col.get_data_at(sz - 1);
+        const String str(ref.data, ref.size);
+        std::vector<StringRef> refs(static_cast<size_t>(times), {str.data(), str.size()});
+
+        col.insert_many_strings(refs.data(), refs.size());
+    }
+
Status read_column_from_pb(IColumn& column, const PValues& arg) const
override {
auto& column_dest = assert_cast<ColumnType&>(column);
column_dest.reserve(column_dest.size() + arg.string_value_size());
diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp
b/be/src/vec/exec/format/orc/vorc_reader.cpp
index 16909f0023a..54d94dcecc7 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.cpp
+++ b/be/src/vec/exec/format/orc/vorc_reader.cpp
@@ -935,13 +935,10 @@ Status OrcReader::_fill_partition_columns(
auto& [value, slot_desc] = kv.second;
auto _text_serde = slot_desc->get_data_type_ptr()->get_serde();
Slice slice(value.data(), value.size());
- vector<Slice> slices(rows);
- for (int i = 0; i < rows; i++) {
- slices[i] = {value.data(), value.size()};
- }
int num_deserialized = 0;
- if (_text_serde->deserialize_column_from_json_vector(*col_ptr, slices,
&num_deserialized,
-
_text_formatOptions) != Status::OK()) {
+ if (_text_serde->deserialize_column_from_fixed_json(*col_ptr, slice,
rows,
+ &num_deserialized,
+
_text_formatOptions) != Status::OK()) {
return Status::InternalError("Failed to fill partition column:
{}={}",
slot_desc->col_name(), value);
}
diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
index 5e824f34817..9ec1235be1d 100644
--- a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
@@ -631,13 +631,10 @@ Status RowGroupReader::_fill_partition_columns(
auto& [value, slot_desc] = kv.second;
auto _text_serde = slot_desc->get_data_type_ptr()->get_serde();
Slice slice(value.data(), value.size());
- vector<Slice> slices(rows);
- for (int i = 0; i < rows; i++) {
- slices[i] = {value.data(), value.size()};
- }
int num_deserialized = 0;
- if (_text_serde->deserialize_column_from_json_vector(*col_ptr, slices,
&num_deserialized,
-
_text_formatOptions) != Status::OK()) {
+ if (_text_serde->deserialize_column_from_fixed_json(*col_ptr, slice,
rows,
+ &num_deserialized,
+
_text_formatOptions) != Status::OK()) {
return Status::InternalError("Failed to fill partition column:
{}={}",
slot_desc->col_name(), value);
}
diff --git a/be/src/vec/exec/scan/vfile_scanner.cpp
b/be/src/vec/exec/scan/vfile_scanner.cpp
index 944884434f4..0688f2c0712 100644
--- a/be/src/vec/exec/scan/vfile_scanner.cpp
+++ b/be/src/vec/exec/scan/vfile_scanner.cpp
@@ -499,13 +499,10 @@ Status VFileScanner::_fill_columns_from_path(size_t rows)
{
auto& [value, slot_desc] = kv.second;
auto _text_serde = slot_desc->get_data_type_ptr()->get_serde();
Slice slice(value.data(), value.size());
- vector<Slice> slices(rows);
- for (int i = 0; i < rows; i++) {
- slices[i] = {value.data(), value.size()};
- }
int num_deserialized = 0;
- if (_text_serde->deserialize_column_from_json_vector(*col_ptr, slices,
&num_deserialized,
-
_text_formatOptions) != Status::OK()) {
+ if (_text_serde->deserialize_column_from_fixed_json(*col_ptr, slice,
rows,
+ &num_deserialized,
+
_text_formatOptions) != Status::OK()) {
return Status::InternalError("Failed to fill partition column:
{}={}",
slot_desc->col_name(), value);
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]