This is an automated email from the ASF dual-hosted git repository. kxiao pushed a commit to branch branch-2.0 in repository https://gitbox.apache.org/repos/asf/doris.git
commit 0da08d5d19919a8efc4910f536fc0b74386f124e Author: daidai <[email protected]> AuthorDate: Fri Sep 22 13:44:37 2023 +0800 [enhancement](csv_reader)Optimize the reading efficiency of nullable (string) columns. (#24698) Optimize the performance of stream load tsv by reducing virtual function calls. (Optimize read performance of nullable (string) columns by reducing virtual function calls.) Before: 600+ s; after: 560+ s --- be/src/clucene | 2 +- .../data_types/serde/data_type_nullable_serde.cpp | 6 ++- be/src/vec/exec/format/csv/csv_reader.cpp | 48 +++++++++++++++++++--- be/src/vec/exec/format/csv/csv_reader.h | 5 +++ 4 files changed, 52 insertions(+), 9 deletions(-) diff --git a/be/src/clucene b/be/src/clucene index 0be3c4aeb62..3b51f707d4c 160000 --- a/be/src/clucene +++ b/be/src/clucene @@ -1 +1 @@ -Subproject commit 0be3c4aeb62ecbc0ff1c79c9526e619742d54fcc +Subproject commit 3b51f707d4c51596d77b97f48b0baf49db0d2c30 diff --git a/be/src/vec/data_types/serde/data_type_nullable_serde.cpp b/be/src/vec/data_types/serde/data_type_nullable_serde.cpp index b10f2765995..64868b97c1c 100644 --- a/be/src/vec/data_types/serde/data_type_nullable_serde.cpp +++ b/be/src/vec/data_types/serde/data_type_nullable_serde.cpp @@ -24,6 +24,7 @@ #include <boost/iterator/iterator_facade.hpp> #include <memory> +#include "data_type_string_serde.h" #include "util/jsonb_document.h" #include "vec/columns/column.h" #include "vec/columns/column_const.h" @@ -95,8 +96,9 @@ Status DataTypeNullableSerDe::deserialize_one_cell_from_hive_text(IColumn& colum return Status::OK(); } - auto st = nested_serde->deserialize_one_cell_from_hive_text(null_column.get_nested_column(), - slice, options, nesting_level); + Status st = nested_serde->deserialize_one_cell_from_hive_text(null_column.get_nested_column(), + slice, options, nesting_level); + if (!st.ok()) { // fill null if fail null_column.insert_data(nullptr, 0); // 0 is meaningless here diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp 
b/be/src/vec/exec/format/csv/csv_reader.cpp index 58cdf97b44e..c997afe6a31 100644 --- a/be/src/vec/exec/format/csv/csv_reader.cpp +++ b/be/src/vec/exec/format/csv/csv_reader.cpp @@ -355,6 +355,15 @@ Status CsvReader::init_reader(bool is_load) { } else { _options.map_key_delim = _params.file_attributes.text_params.mapkv_delimiter[0]; } + _use_nullable_string_opt.resize(_file_slot_descs.size()); + for (int i = 0; i < _file_slot_descs.size(); ++i) { + auto data_type_ptr = _file_slot_descs[i]->get_data_type_ptr(); + if (data_type_ptr.get()->get_type_id() == TypeIndex::Nullable && + ((DataTypeNullable*)data_type_ptr.get())->get_nested_type()->get_type_id() == + TypeIndex::String) { + _use_nullable_string_opt[i] = 1; + } + } if (_params.file_attributes.__isset.trim_double_quotes) { _trim_double_quotes = _params.file_attributes.trim_double_quotes; @@ -611,6 +620,28 @@ Status CsvReader::_create_decompressor() { return Status::OK(); } +template <bool from_json> +Status CsvReader::deserialize_nullable_string(IColumn& column, Slice& slice) { + auto& null_column = assert_cast<ColumnNullable&>(column); + if (!(from_json && _options.converted_from_string && slice.trim_quote())) { + if (slice.size == 2 && slice[0] == '\\' && slice[1] == 'N') { + null_column.insert_data(nullptr, 0); + return Status::OK(); + } + } + static DataTypeStringSerDe stringSerDe; + auto st = stringSerDe.deserialize_one_cell_from_json(null_column.get_nested_column(), slice, + _options, 1); + if (!st.ok()) { + // fill null if fail + null_column.insert_data(nullptr, 0); // 0 is meaningless here + return Status::OK(); + } + // fill not null if success + null_column.get_null_map_data().push_back(0); + return Status::OK(); +} + Status CsvReader::_fill_dest_columns(const Slice& line, Block* block, std::vector<MutableColumnPtr>& columns, size_t* rows) { bool is_success = false; @@ -628,28 +659,33 @@ Status CsvReader::_fill_dest_columns(const Slice& line, Block* block, col_idx < _split_values.size() ? 
_split_values[col_idx] : _s_null_slice; Slice slice {value.data, value.size}; + IColumn* col_ptr = columns[i]; if (!_is_load) { - IColumn* col_ptr = const_cast<IColumn*>( + col_ptr = const_cast<IColumn*>( block->get_by_position(_file_slot_idx_map[i]).column.get()); + } + if (_use_nullable_string_opt[i]) { + // For load task, we always read "string" from file. + // So serdes[i] here must be DataTypeNullableSerDe, and DataTypeNullableSerDe -> nested_serde must be DataTypeStringSerDe. + // So we use deserialize_nullable_string and stringSerDe to reduce virtual function calls. switch (_text_serde_type) { case TTextSerdeType::JSON_TEXT_SERDE: - _serdes[i]->deserialize_one_cell_from_json(*col_ptr, slice, _options); + deserialize_nullable_string<true>(*col_ptr, slice); break; case TTextSerdeType::HIVE_TEXT_SERDE: - _serdes[i]->deserialize_one_cell_from_hive_text(*col_ptr, slice, _options); + deserialize_nullable_string<false>(*col_ptr, slice); break; default: break; } } else { - // For load task, we always read "string" from file. switch (_text_serde_type) { case TTextSerdeType::JSON_TEXT_SERDE: - _serdes[i]->deserialize_one_cell_from_json(*columns[i], slice, _options); + _serdes[i]->deserialize_one_cell_from_json(*col_ptr, slice, _options); break; case TTextSerdeType::HIVE_TEXT_SERDE: - _serdes[i]->deserialize_one_cell_from_hive_text(*columns[i], slice, _options); + _serdes[i]->deserialize_one_cell_from_hive_text(*col_ptr, slice, _options); break; default: break; diff --git a/be/src/vec/exec/format/csv/csv_reader.h b/be/src/vec/exec/format/csv/csv_reader.h index 86380672e28..3a6d721beea 100644 --- a/be/src/vec/exec/format/csv/csv_reader.h +++ b/be/src/vec/exec/format/csv/csv_reader.h @@ -213,6 +213,10 @@ private: void _init_system_properties(); void _init_file_description(); + //if from_json = false , deserialize from hive_text + template <bool from_json> + Status deserialize_nullable_string(IColumn& column, Slice& slice); + // used for parse table schema of csv file. 
// Currently, this feature is for table valued function. Status _prepare_parse(size_t* read_line, bool* is_parse_name); @@ -286,6 +290,7 @@ private: std::vector<Slice> _split_values; std::unique_ptr<LineFieldSplitterIf> _fields_splitter; TTextSerdeType::type _text_serde_type; + std::vector<int> _use_nullable_string_opt; }; } // namespace vectorized } // namespace doris --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
