[doris] branch master updated: [Perfomance][export] Opt the export of CSV tranformer (#24003)

lihaopeng Fri, 08 Sep 2023 05:27:08 -0700

This is an automated email from the ASF dual-hosted git repository.

lihaopeng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git



The following commit(s) were added to refs/heads/master by this push:
     new e140938d81 [Perfomance][export] Opt the export of CSV tranformer (#24003)
e140938d81 is described below

commit e140938d811fb1fab51c035a334170fd12e09c1c
Author: HappenLee <[email protected]>
AuthorDate: Fri Sep 8 20:26:54 2023 +0800

    [Perfomance][export] Opt the export of CSV tranformer (#24003)
---
 be/src/vec/runtime/vcsv_transformer.cpp | 80 +++++++++++++++++----------------
 be/src/vec/runtime/vcsv_transformer.h   |  5 +--
 2 files changed, 43 insertions(+), 42 deletions(-)

diff --git a/be/src/vec/runtime/vcsv_transformer.cpp b/be/src/vec/runtime/vcsv_transformer.cpp
index da5a697460..1db5440b9c 100644
--- a/be/src/vec/runtime/vcsv_transformer.cpp
+++ b/be/src/vec/runtime/vcsv_transformer.cpp
@@ -90,29 +90,34 @@ Status VCSVTransformer::write(const Block& block) {
         for (size_t col_id = 0; col_id < block.columns(); col_id++) {
             auto col = block.get_by_position(col_id);
             if (col.column->is_null_at(i)) {
-                _plain_text_outstream << NULL_IN_CSV;
+                fmt::format_to(_outstream_buffer, "{}", NULL_IN_CSV);
             } else {
                 switch (_output_vexpr_ctxs[col_id]->root()->type().type) {
                 case TYPE_BOOLEAN:
                 case TYPE_TINYINT:
-                    _plain_text_outstream << (int)*reinterpret_cast<const 
int8_t*>(
-                            col.column->get_data_at(i).data);
+                    fmt::format_to(
+                            _outstream_buffer, "{}",
+                            (int)*reinterpret_cast<const 
int8_t*>(col.column->get_data_at(i).data));
                     break;
                 case TYPE_SMALLINT:
-                    _plain_text_outstream
-                            << *reinterpret_cast<const 
int16_t*>(col.column->get_data_at(i).data);
+                    fmt::format_to(
+                            _outstream_buffer, "{}",
+                            *reinterpret_cast<const 
int16_t*>(col.column->get_data_at(i).data));
                     break;
                 case TYPE_INT:
-                    _plain_text_outstream
-                            << *reinterpret_cast<const 
int32_t*>(col.column->get_data_at(i).data);
+                    fmt::format_to(
+                            _outstream_buffer, "{}",
+                            *reinterpret_cast<const 
int32_t*>(col.column->get_data_at(i).data));
                     break;
                 case TYPE_BIGINT:
-                    _plain_text_outstream
-                            << *reinterpret_cast<const 
int64_t*>(col.column->get_data_at(i).data);
+                    fmt::format_to(
+                            _outstream_buffer, "{}",
+                            *reinterpret_cast<const 
int64_t*>(col.column->get_data_at(i).data));
                     break;
                 case TYPE_LARGEINT:
-                    _plain_text_outstream
-                            << *reinterpret_cast<const 
__int128*>(col.column->get_data_at(i).data);
+                    fmt::format_to(
+                            _outstream_buffer, "{}",
+                            *reinterpret_cast<const 
__int128*>(col.column->get_data_at(i).data));
                     break;
                 case TYPE_FLOAT: {
                     char buffer[MAX_FLOAT_STR_LENGTH + 2];
@@ -121,7 +126,7 @@ Status VCSVTransformer::write(const Block& block) {
                     buffer[0] = '\0';
                     int length = FloatToBuffer(float_value, 
MAX_FLOAT_STR_LENGTH, buffer);
                     DCHECK(length >= 0) << "gcvt float failed, float value=" 
<< float_value;
-                    _plain_text_outstream << buffer;
+                    fmt::format_to(_outstream_buffer, "{}", buffer);
                     break;
                 }
                 case TYPE_DOUBLE: {
@@ -130,45 +135,45 @@ Status VCSVTransformer::write(const Block& block) {
                     // For example: For a double value 27361919854.929001,
                     // the direct output of using std::stringstream is 
2.73619e+10,
                     // and after conversion to a string, it outputs 
27361919854.929001
-                    char buffer[MAX_DOUBLE_STR_LENGTH + 2];
+                    char buffer[MAX_DOUBLE_STR_LENGTH + 2] = "\0";
                     double double_value =
                             *reinterpret_cast<const 
double*>(col.column->get_data_at(i).data);
                     buffer[0] = '\0';
                     int length = DoubleToBuffer(double_value, 
MAX_DOUBLE_STR_LENGTH, buffer);
                     DCHECK(length >= 0) << "gcvt double failed, double value=" 
<< double_value;
-                    _plain_text_outstream << buffer;
+                    fmt::format_to(_outstream_buffer, "{}", buffer);
                     break;
                 }
                 case TYPE_DATEV2: {
-                    char buf[64];
+                    char buf[64] = "\0";
                     const DateV2Value<DateV2ValueType>* time_val =
                             (const 
DateV2Value<DateV2ValueType>*)(col.column->get_data_at(i).data);
                     time_val->to_string(buf);
-                    _plain_text_outstream << buf;
+                    fmt::format_to(_outstream_buffer, "{}", buf);
                     break;
                 }
                 case TYPE_DATETIMEV2: {
-                    char buf[64];
+                    char buf[64] = "\0";
                     const DateV2Value<DateTimeV2ValueType>* time_val =
                             (const 
DateV2Value<DateTimeV2ValueType>*)(col.column->get_data_at(i)
                                                                               
.data);
                     time_val->to_string(buf, 
_output_vexpr_ctxs[col_id]->root()->type().scale);
-                    _plain_text_outstream << buf;
+                    fmt::format_to(_outstream_buffer, "{}", buf);
                     break;
                 }
                 case TYPE_DATE:
                 case TYPE_DATETIME: {
-                    char buf[64];
+                    char buf[64] = "\0";
                     const VecDateTimeValue* time_val =
                             (const 
VecDateTimeValue*)(col.column->get_data_at(i).data);
                     time_val->to_string(buf);
-                    _plain_text_outstream << buf;
+                    fmt::format_to(_outstream_buffer, "{}", buf);
                     break;
                 }
                 case TYPE_OBJECT:
                 case TYPE_HLL: {
                     if (!_output_object_data) {
-                        _plain_text_outstream << NULL_IN_CSV;
+                        fmt::format_to(_outstream_buffer, "{}", NULL_IN_CSV);
                         break;
                     }
                     [[fallthrough]];
@@ -177,70 +182,67 @@ Status VCSVTransformer::write(const Block& block) {
                 case TYPE_CHAR:
                 case TYPE_STRING: {
                     auto value = col.column->get_data_at(i);
-                    _plain_text_outstream << value;
+                    fmt::format_to(_outstream_buffer, "{}", value);
                     break;
                 }
                 case TYPE_DECIMALV2: {
                     const DecimalV2Value decimal_val(
                             reinterpret_cast<const 
PackedInt128*>(col.column->get_data_at(i).data)
                                     ->value);
-                    std::string decimal_str;
-                    decimal_str = decimal_val.to_string();
-                    _plain_text_outstream << decimal_str;
+                    fmt::format_to(_outstream_buffer, "{}", 
decimal_val.to_string());
                     break;
                 }
                 case TYPE_DECIMAL32: {
-                    _plain_text_outstream << col.type->to_string(*col.column, 
i);
+                    fmt::format_to(_outstream_buffer, "{}", 
col.type->to_string(*col.column, i));
                     break;
                 }
                 case TYPE_DECIMAL64: {
-                    _plain_text_outstream << col.type->to_string(*col.column, 
i);
+                    fmt::format_to(_outstream_buffer, "{}", 
col.type->to_string(*col.column, i));
                     break;
                 }
                 case TYPE_DECIMAL128I: {
-                    _plain_text_outstream << col.type->to_string(*col.column, 
i);
+                    fmt::format_to(_outstream_buffer, "{}", 
col.type->to_string(*col.column, i));
                     break;
                 }
                 case TYPE_ARRAY: {
-                    _plain_text_outstream << col.type->to_string(*col.column, 
i);
+                    fmt::format_to(_outstream_buffer, "{}", 
col.type->to_string(*col.column, i));
                     break;
                 }
                 case TYPE_MAP: {
-                    _plain_text_outstream << col.type->to_string(*col.column, 
i);
+                    fmt::format_to(_outstream_buffer, "{}", 
col.type->to_string(*col.column, i));
                     break;
                 }
                 case TYPE_STRUCT: {
-                    _plain_text_outstream << col.type->to_string(*col.column, 
i);
+                    fmt::format_to(_outstream_buffer, "{}", 
col.type->to_string(*col.column, i));
                     break;
                 }
                 default: {
                     // not supported type, like BITMAP, just export null
-                    _plain_text_outstream << NULL_IN_CSV;
+                    fmt::format_to(_outstream_buffer, "{}", NULL_IN_CSV);
                 }
                 }
             }
             if (col_id < block.columns() - 1) {
-                _plain_text_outstream << _column_separator;
+                fmt::format_to(_outstream_buffer, "{}", _column_separator);
             }
         }
-        _plain_text_outstream << _line_delimiter;
+        fmt::format_to(_outstream_buffer, "{}", _line_delimiter);
     }
 
     return _flush_plain_text_outstream();
 }
 
 Status VCSVTransformer::_flush_plain_text_outstream() {
-    size_t pos = _plain_text_outstream.tellp();
+    size_t pos = _outstream_buffer.size();
     if (pos == 0) {
         return Status::OK();
     }
 
-    const std::string& buf = _plain_text_outstream.str();
-    RETURN_IF_ERROR(_file_writer->append(buf));
+    RETURN_IF_ERROR(
+            _file_writer->append(Slice(_outstream_buffer.data(), 
_outstream_buffer.size())));
 
     // clear the stream
-    _plain_text_outstream.str("");
-    _plain_text_outstream.clear();
+    _outstream_buffer.clear();
 
     return Status::OK();
 }
diff --git a/be/src/vec/runtime/vcsv_transformer.h b/be/src/vec/runtime/vcsv_transformer.h
index fb3232ac93..f796ef52f5 100644
--- a/be/src/vec/runtime/vcsv_transformer.h
+++ b/be/src/vec/runtime/vcsv_transformer.h
@@ -63,13 +63,12 @@ private:
 
     doris::io::FileWriter* _file_writer;
     // Used to buffer the export data of plain text
-    // TODO(cmy): I simply use a stringstrteam to buffer the data, to avoid calling
+    // TODO(cmy): I simply use a fmt::memory_buffer to buffer the data, to avoid calling
     // file writer's write() for every single row.
     // But this cannot solve the problem of a row of data that is too large.
     // For example: bitmap_to_string() may return large volume of data.
     // And the speed is relative low, in my test, is about 6.5MB/s.
-    std::stringstream _plain_text_outstream;
-    static const size_t OUTSTREAM_BUFFER_SIZE_BYTES;
+    fmt::memory_buffer _outstream_buffer;
 };
 
 } // namespace doris::vectorized


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[doris] branch master updated: [Perfomance][export] Opt the export of CSV tranformer (#24003)

Reply via email to