This is an automated email from the ASF dual-hosted git repository.
lihaopeng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new e140938d81 [Perfomance][export] Opt the export of CSV tranformer
(#24003)
e140938d81 is described below
commit e140938d811fb1fab51c035a334170fd12e09c1c
Author: HappenLee <[email protected]>
AuthorDate: Fri Sep 8 20:26:54 2023 +0800
[Perfomance][export] Opt the export of CSV tranformer (#24003)
---
be/src/vec/runtime/vcsv_transformer.cpp | 80 +++++++++++++++++----------------
be/src/vec/runtime/vcsv_transformer.h | 5 +--
2 files changed, 43 insertions(+), 42 deletions(-)
diff --git a/be/src/vec/runtime/vcsv_transformer.cpp
b/be/src/vec/runtime/vcsv_transformer.cpp
index da5a697460..1db5440b9c 100644
--- a/be/src/vec/runtime/vcsv_transformer.cpp
+++ b/be/src/vec/runtime/vcsv_transformer.cpp
@@ -90,29 +90,34 @@ Status VCSVTransformer::write(const Block& block) {
for (size_t col_id = 0; col_id < block.columns(); col_id++) {
auto col = block.get_by_position(col_id);
if (col.column->is_null_at(i)) {
- _plain_text_outstream << NULL_IN_CSV;
+ fmt::format_to(_outstream_buffer, "{}", NULL_IN_CSV);
} else {
switch (_output_vexpr_ctxs[col_id]->root()->type().type) {
case TYPE_BOOLEAN:
case TYPE_TINYINT:
- _plain_text_outstream << (int)*reinterpret_cast<const
int8_t*>(
- col.column->get_data_at(i).data);
+ fmt::format_to(
+ _outstream_buffer, "{}",
+ (int)*reinterpret_cast<const
int8_t*>(col.column->get_data_at(i).data));
break;
case TYPE_SMALLINT:
- _plain_text_outstream
- << *reinterpret_cast<const
int16_t*>(col.column->get_data_at(i).data);
+ fmt::format_to(
+ _outstream_buffer, "{}",
+ *reinterpret_cast<const
int16_t*>(col.column->get_data_at(i).data));
break;
case TYPE_INT:
- _plain_text_outstream
- << *reinterpret_cast<const
int32_t*>(col.column->get_data_at(i).data);
+ fmt::format_to(
+ _outstream_buffer, "{}",
+ *reinterpret_cast<const
int32_t*>(col.column->get_data_at(i).data));
break;
case TYPE_BIGINT:
- _plain_text_outstream
- << *reinterpret_cast<const
int64_t*>(col.column->get_data_at(i).data);
+ fmt::format_to(
+ _outstream_buffer, "{}",
+ *reinterpret_cast<const
int64_t*>(col.column->get_data_at(i).data));
break;
case TYPE_LARGEINT:
- _plain_text_outstream
- << *reinterpret_cast<const
__int128*>(col.column->get_data_at(i).data);
+ fmt::format_to(
+ _outstream_buffer, "{}",
+ *reinterpret_cast<const
__int128*>(col.column->get_data_at(i).data));
break;
case TYPE_FLOAT: {
char buffer[MAX_FLOAT_STR_LENGTH + 2];
@@ -121,7 +126,7 @@ Status VCSVTransformer::write(const Block& block) {
buffer[0] = '\0';
int length = FloatToBuffer(float_value,
MAX_FLOAT_STR_LENGTH, buffer);
DCHECK(length >= 0) << "gcvt float failed, float value="
<< float_value;
- _plain_text_outstream << buffer;
+ fmt::format_to(_outstream_buffer, "{}", buffer);
break;
}
case TYPE_DOUBLE: {
@@ -130,45 +135,45 @@ Status VCSVTransformer::write(const Block& block) {
// For example: For a double value 27361919854.929001,
// the direct output of using std::stringstream is
2.73619e+10,
// and after conversion to a string, it outputs
27361919854.929001
- char buffer[MAX_DOUBLE_STR_LENGTH + 2];
+ char buffer[MAX_DOUBLE_STR_LENGTH + 2] = "\0";
double double_value =
*reinterpret_cast<const
double*>(col.column->get_data_at(i).data);
buffer[0] = '\0';
int length = DoubleToBuffer(double_value,
MAX_DOUBLE_STR_LENGTH, buffer);
DCHECK(length >= 0) << "gcvt double failed, double value="
<< double_value;
- _plain_text_outstream << buffer;
+ fmt::format_to(_outstream_buffer, "{}", buffer);
break;
}
case TYPE_DATEV2: {
- char buf[64];
+ char buf[64] = "\0";
const DateV2Value<DateV2ValueType>* time_val =
(const
DateV2Value<DateV2ValueType>*)(col.column->get_data_at(i).data);
time_val->to_string(buf);
- _plain_text_outstream << buf;
+ fmt::format_to(_outstream_buffer, "{}", buf);
break;
}
case TYPE_DATETIMEV2: {
- char buf[64];
+ char buf[64] = "\0";
const DateV2Value<DateTimeV2ValueType>* time_val =
(const
DateV2Value<DateTimeV2ValueType>*)(col.column->get_data_at(i)
.data);
time_val->to_string(buf,
_output_vexpr_ctxs[col_id]->root()->type().scale);
- _plain_text_outstream << buf;
+ fmt::format_to(_outstream_buffer, "{}", buf);
break;
}
case TYPE_DATE:
case TYPE_DATETIME: {
- char buf[64];
+ char buf[64] = "\0";
const VecDateTimeValue* time_val =
(const
VecDateTimeValue*)(col.column->get_data_at(i).data);
time_val->to_string(buf);
- _plain_text_outstream << buf;
+ fmt::format_to(_outstream_buffer, "{}", buf);
break;
}
case TYPE_OBJECT:
case TYPE_HLL: {
if (!_output_object_data) {
- _plain_text_outstream << NULL_IN_CSV;
+ fmt::format_to(_outstream_buffer, "{}", NULL_IN_CSV);
break;
}
[[fallthrough]];
@@ -177,70 +182,67 @@ Status VCSVTransformer::write(const Block& block) {
case TYPE_CHAR:
case TYPE_STRING: {
auto value = col.column->get_data_at(i);
- _plain_text_outstream << value;
+ fmt::format_to(_outstream_buffer, "{}", value);
break;
}
case TYPE_DECIMALV2: {
const DecimalV2Value decimal_val(
reinterpret_cast<const
PackedInt128*>(col.column->get_data_at(i).data)
->value);
- std::string decimal_str;
- decimal_str = decimal_val.to_string();
- _plain_text_outstream << decimal_str;
+ fmt::format_to(_outstream_buffer, "{}",
decimal_val.to_string());
break;
}
case TYPE_DECIMAL32: {
- _plain_text_outstream << col.type->to_string(*col.column,
i);
+ fmt::format_to(_outstream_buffer, "{}",
col.type->to_string(*col.column, i));
break;
}
case TYPE_DECIMAL64: {
- _plain_text_outstream << col.type->to_string(*col.column,
i);
+ fmt::format_to(_outstream_buffer, "{}",
col.type->to_string(*col.column, i));
break;
}
case TYPE_DECIMAL128I: {
- _plain_text_outstream << col.type->to_string(*col.column,
i);
+ fmt::format_to(_outstream_buffer, "{}",
col.type->to_string(*col.column, i));
break;
}
case TYPE_ARRAY: {
- _plain_text_outstream << col.type->to_string(*col.column,
i);
+ fmt::format_to(_outstream_buffer, "{}",
col.type->to_string(*col.column, i));
break;
}
case TYPE_MAP: {
- _plain_text_outstream << col.type->to_string(*col.column,
i);
+ fmt::format_to(_outstream_buffer, "{}",
col.type->to_string(*col.column, i));
break;
}
case TYPE_STRUCT: {
- _plain_text_outstream << col.type->to_string(*col.column,
i);
+ fmt::format_to(_outstream_buffer, "{}",
col.type->to_string(*col.column, i));
break;
}
default: {
// not supported type, like BITMAP, just export null
- _plain_text_outstream << NULL_IN_CSV;
+ fmt::format_to(_outstream_buffer, "{}", NULL_IN_CSV);
}
}
}
if (col_id < block.columns() - 1) {
- _plain_text_outstream << _column_separator;
+ fmt::format_to(_outstream_buffer, "{}", _column_separator);
}
}
- _plain_text_outstream << _line_delimiter;
+ fmt::format_to(_outstream_buffer, "{}", _line_delimiter);
}
return _flush_plain_text_outstream();
}
Status VCSVTransformer::_flush_plain_text_outstream() {
- size_t pos = _plain_text_outstream.tellp();
+ size_t pos = _outstream_buffer.size();
if (pos == 0) {
return Status::OK();
}
- const std::string& buf = _plain_text_outstream.str();
- RETURN_IF_ERROR(_file_writer->append(buf));
+ RETURN_IF_ERROR(
+ _file_writer->append(Slice(_outstream_buffer.data(),
_outstream_buffer.size())));
// clear the stream
- _plain_text_outstream.str("");
- _plain_text_outstream.clear();
+ _outstream_buffer.clear();
return Status::OK();
}
diff --git a/be/src/vec/runtime/vcsv_transformer.h
b/be/src/vec/runtime/vcsv_transformer.h
index fb3232ac93..f796ef52f5 100644
--- a/be/src/vec/runtime/vcsv_transformer.h
+++ b/be/src/vec/runtime/vcsv_transformer.h
@@ -63,13 +63,12 @@ private:
doris::io::FileWriter* _file_writer;
// Used to buffer the export data of plain text
- // TODO(cmy): I simply use a stringstrteam to buffer the data, to avoid
calling
+ // TODO(cmy): I simply use a fmt::memmory_buffer to buffer the data, to
avoid calling
// file writer's write() for every single row.
// But this cannot solve the problem of a row of data that is too large.
// For example: bitmap_to_string() may return large volume of data.
// And the speed is relative low, in my test, is about 6.5MB/s.
- std::stringstream _plain_text_outstream;
- static const size_t OUTSTREAM_BUFFER_SIZE_BYTES;
+ fmt::memory_buffer _outstream_buffer;
};
} // namespace doris::vectorized
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]