[doris] 04/12: [enhancement](csv_reader)Optimize the reading efficiency of nullable (string) columns. (#24698)

kxiao Fri, 22 Sep 2023 08:13:06 -0700

This is an automated email from the ASF dual-hosted git repository.

kxiao pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git


commit 0da08d5d19919a8efc4910f536fc0b74386f124e
Author: daidai <[email protected]>
AuthorDate: Fri Sep 22 13:44:37 2023 +0800

    [enhancement](csv_reader)Optimize the reading efficiency of nullable 
(string) columns. (#24698)
    
    Optimize the performance of stream load for TSV files by reducing virtual 
function calls.
    (Optimize read performance of nullable (string) columns by reducing virtual 
function calls.)
    before : 600+ s
    after : 560+ s
---
 be/src/clucene                                     |  2 +-
 .../data_types/serde/data_type_nullable_serde.cpp  |  6 ++-
 be/src/vec/exec/format/csv/csv_reader.cpp          | 48 +++++++++++++++++++---
 be/src/vec/exec/format/csv/csv_reader.h            |  5 +++
 4 files changed, 52 insertions(+), 9 deletions(-)

diff --git a/be/src/clucene b/be/src/clucene
index 0be3c4aeb62..3b51f707d4c 160000
--- a/be/src/clucene
+++ b/be/src/clucene
@@ -1 +1 @@
-Subproject commit 0be3c4aeb62ecbc0ff1c79c9526e619742d54fcc
+Subproject commit 3b51f707d4c51596d77b97f48b0baf49db0d2c30
diff --git a/be/src/vec/data_types/serde/data_type_nullable_serde.cpp 
b/be/src/vec/data_types/serde/data_type_nullable_serde.cpp
index b10f2765995..64868b97c1c 100644
--- a/be/src/vec/data_types/serde/data_type_nullable_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_nullable_serde.cpp
@@ -24,6 +24,7 @@
 #include <boost/iterator/iterator_facade.hpp>
 #include <memory>
 
+#include "data_type_string_serde.h"
 #include "util/jsonb_document.h"
 #include "vec/columns/column.h"
 #include "vec/columns/column_const.h"
@@ -95,8 +96,9 @@ Status 
DataTypeNullableSerDe::deserialize_one_cell_from_hive_text(IColumn& colum
         return Status::OK();
     }
 
-    auto st = 
nested_serde->deserialize_one_cell_from_hive_text(null_column.get_nested_column(),
-                                                                slice, 
options, nesting_level);
+    Status st = 
nested_serde->deserialize_one_cell_from_hive_text(null_column.get_nested_column(),
+                                                                  slice, 
options, nesting_level);
+
     if (!st.ok()) {
         // fill null if fail
         null_column.insert_data(nullptr, 0); // 0 is meaningless here
diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp 
b/be/src/vec/exec/format/csv/csv_reader.cpp
index 58cdf97b44e..c997afe6a31 100644
--- a/be/src/vec/exec/format/csv/csv_reader.cpp
+++ b/be/src/vec/exec/format/csv/csv_reader.cpp
@@ -355,6 +355,15 @@ Status CsvReader::init_reader(bool is_load) {
     } else {
         _options.map_key_delim = 
_params.file_attributes.text_params.mapkv_delimiter[0];
     }
+    _use_nullable_string_opt.resize(_file_slot_descs.size());
+    for (int i = 0; i < _file_slot_descs.size(); ++i) {
+        auto data_type_ptr = _file_slot_descs[i]->get_data_type_ptr();
+        if (data_type_ptr.get()->get_type_id() == TypeIndex::Nullable &&
+            
((DataTypeNullable*)data_type_ptr.get())->get_nested_type()->get_type_id() ==
+                    TypeIndex::String) {
+            _use_nullable_string_opt[i] = 1;
+        }
+    }
 
     if (_params.file_attributes.__isset.trim_double_quotes) {
         _trim_double_quotes = _params.file_attributes.trim_double_quotes;
@@ -611,6 +620,28 @@ Status CsvReader::_create_decompressor() {
     return Status::OK();
 }
 
+template <bool from_json>
+Status CsvReader::deserialize_nullable_string(IColumn& column, Slice& slice) {
+    auto& null_column = assert_cast<ColumnNullable&>(column);
+    if (!(from_json && _options.converted_from_string && slice.trim_quote())) {
+        if (slice.size == 2 && slice[0] == '\\' && slice[1] == 'N') {
+            null_column.insert_data(nullptr, 0);
+            return Status::OK();
+        }
+    }
+    static DataTypeStringSerDe stringSerDe;
+    auto st = 
stringSerDe.deserialize_one_cell_from_json(null_column.get_nested_column(), 
slice,
+                                                         _options, 1);
+    if (!st.ok()) {
+        // fill null if fail
+        null_column.insert_data(nullptr, 0); // 0 is meaningless here
+        return Status::OK();
+    }
+    // fill not null if success
+    null_column.get_null_map_data().push_back(0);
+    return Status::OK();
+}
+
 Status CsvReader::_fill_dest_columns(const Slice& line, Block* block,
                                      std::vector<MutableColumnPtr>& columns, 
size_t* rows) {
     bool is_success = false;
@@ -628,28 +659,33 @@ Status CsvReader::_fill_dest_columns(const Slice& line, 
Block* block,
                 col_idx < _split_values.size() ? _split_values[col_idx] : 
_s_null_slice;
         Slice slice {value.data, value.size};
 
+        IColumn* col_ptr = columns[i];
         if (!_is_load) {
-            IColumn* col_ptr = const_cast<IColumn*>(
+            col_ptr = const_cast<IColumn*>(
                     
block->get_by_position(_file_slot_idx_map[i]).column.get());
+        }
 
+        if (_use_nullable_string_opt[i]) {
+            // For load task, we always read "string" from file.
+            // So serdes[i] here must be DataTypeNullableSerDe, and 
DataTypeNullableSerDe -> nested_serde must be DataTypeStringSerDe.
+            // So we use deserialize_nullable_string and stringSerDe to reduce 
virtual function calls.
             switch (_text_serde_type) {
             case TTextSerdeType::JSON_TEXT_SERDE:
-                _serdes[i]->deserialize_one_cell_from_json(*col_ptr, slice, 
_options);
+                deserialize_nullable_string<true>(*col_ptr, slice);
                 break;
             case TTextSerdeType::HIVE_TEXT_SERDE:
-                _serdes[i]->deserialize_one_cell_from_hive_text(*col_ptr, 
slice, _options);
+                deserialize_nullable_string<false>(*col_ptr, slice);
                 break;
             default:
                 break;
             }
         } else {
-            // For load task, we always read "string" from file.
             switch (_text_serde_type) {
             case TTextSerdeType::JSON_TEXT_SERDE:
-                _serdes[i]->deserialize_one_cell_from_json(*columns[i], slice, 
_options);
+                _serdes[i]->deserialize_one_cell_from_json(*col_ptr, slice, 
_options);
                 break;
             case TTextSerdeType::HIVE_TEXT_SERDE:
-                _serdes[i]->deserialize_one_cell_from_hive_text(*columns[i], 
slice, _options);
+                _serdes[i]->deserialize_one_cell_from_hive_text(*col_ptr, 
slice, _options);
                 break;
             default:
                 break;
diff --git a/be/src/vec/exec/format/csv/csv_reader.h 
b/be/src/vec/exec/format/csv/csv_reader.h
index 86380672e28..3a6d721beea 100644
--- a/be/src/vec/exec/format/csv/csv_reader.h
+++ b/be/src/vec/exec/format/csv/csv_reader.h
@@ -213,6 +213,10 @@ private:
     void _init_system_properties();
     void _init_file_description();
 
+    //if from_json = false , deserialize from hive_text
+    template <bool from_json>
+    Status deserialize_nullable_string(IColumn& column, Slice& slice);
+
     // used for parse table schema of csv file.
     // Currently, this feature is for table valued function.
     Status _prepare_parse(size_t* read_line, bool* is_parse_name);
@@ -286,6 +290,7 @@ private:
     std::vector<Slice> _split_values;
     std::unique_ptr<LineFieldSplitterIf> _fields_splitter;
     TTextSerdeType::type _text_serde_type;
+    std::vector<int> _use_nullable_string_opt;
 };
 } // namespace vectorized
 } // namespace doris


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[doris] 04/12: [enhancement](csv_reader)Optimize the reading efficiency of nullable (string) columns. (#24698)

Reply via email to