This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 9618427020 [improvement](multi-catalog) increase default batch_size to
4064 (#16326)
9618427020 is described below
commit 96184270203b02bd72148699abc96c34c1202eaf
Author: Ashin Gau <[email protected]>
AuthorDate: Thu Feb 2 11:51:09 2023 +0800
[improvement](multi-catalog) increase default batch_size to 4064 (#16326)
The performance of ClickBench Q30 is affected by batch_size:
| batch_size | 1024 | 4096 | 20480 |
| -- | -- | -- | -- |
| Q30 query time | 2.27 | 1.08 | 0.62 |
This is because the aggregation operator creates a new result block for each
batch block, and Q30 has 90 columns, so creating each result block is
time-consuming. A larger batch_size decreases the number of aggregation
blocks and therefore improves performance.
The Doris internal reader reads at least 4064 rows even if batch_size <
4064, so this PR keeps the process of reading external tables the same as
that of reading internal tables.
---
be/src/vec/exec/format/csv/csv_reader.cpp | 2 +-
be/src/vec/exec/format/generic_reader.h | 2 ++
be/src/vec/exec/format/json/new_json_reader.cpp | 2 +-
be/src/vec/exec/format/orc/vorc_reader.cpp | 2 +-
be/src/vec/exec/format/parquet/vparquet_reader.cpp | 2 +-
fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java | 4 ++--
6 files changed, 8 insertions(+), 6 deletions(-)
diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp
b/be/src/vec/exec/format/csv/csv_reader.cpp
index d5863c96de..cbd8954207 100644
--- a/be/src/vec/exec/format/csv/csv_reader.cpp
+++ b/be/src/vec/exec/format/csv/csv_reader.cpp
@@ -211,7 +211,7 @@ Status CsvReader::get_next_block(Block* block, size_t*
read_rows, bool* eof) {
return Status::OK();
}
- const int batch_size = _state->batch_size();
+ const int batch_size = std::max(_state->batch_size(),
(int)_MIN_BATCH_SIZE);
size_t rows = 0;
auto columns = block->mutate_columns();
while (rows < batch_size && !_line_reader_eof) {
diff --git a/be/src/vec/exec/format/generic_reader.h
b/be/src/vec/exec/format/generic_reader.h
index 30e93aacd8..9f4cfd00ee 100644
--- a/be/src/vec/exec/format/generic_reader.h
+++ b/be/src/vec/exec/format/generic_reader.h
@@ -60,6 +60,8 @@ public:
}
protected:
+ const size_t _MIN_BATCH_SIZE = 4064; // 4096 - 32(padding)
+
/// Whether the underlying FileReader has filled the partition&missing
columns
bool _fill_all_columns = false;
};
diff --git a/be/src/vec/exec/format/json/new_json_reader.cpp
b/be/src/vec/exec/format/json/new_json_reader.cpp
index d0d3acf284..57cb70f2de 100644
--- a/be/src/vec/exec/format/json/new_json_reader.cpp
+++ b/be/src/vec/exec/format/json/new_json_reader.cpp
@@ -108,7 +108,7 @@ Status NewJsonReader::get_next_block(Block* block, size_t*
read_rows, bool* eof)
return Status::OK();
}
- const int batch_size = _state->batch_size();
+ const int batch_size = std::max(_state->batch_size(),
(int)_MIN_BATCH_SIZE);
auto columns = block->mutate_columns();
while (columns[0]->size() < batch_size && !_reader_eof) {
diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp
b/be/src/vec/exec/format/orc/vorc_reader.cpp
index 2b2e0c1029..e5e43906d1 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.cpp
+++ b/be/src/vec/exec/format/orc/vorc_reader.cpp
@@ -76,7 +76,7 @@ OrcReader::OrcReader(RuntimeProfile* profile, const
TFileScanRangeParams& params
: _profile(profile),
_scan_params(params),
_scan_range(range),
- _batch_size(batch_size),
+ _batch_size(std::max(batch_size, _MIN_BATCH_SIZE)),
_range_start_offset(range.start_offset),
_range_size(range.size),
_ctz(ctz),
diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp
b/be/src/vec/exec/format/parquet/vparquet_reader.cpp
index 7579907c1c..1e9e4bd25a 100644
--- a/be/src/vec/exec/format/parquet/vparquet_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp
@@ -38,7 +38,7 @@ ParquetReader::ParquetReader(RuntimeProfile* profile, const
TFileScanRangeParams
: _profile(profile),
_scan_params(params),
_scan_range(range),
- _batch_size(batch_size),
+ _batch_size(std::max(batch_size, _MIN_BATCH_SIZE)),
_range_start_offset(range.start_offset),
_range_size(range.size),
_ctz(ctz),
diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
index e18770e9fd..dcb4054673 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
@@ -397,9 +397,9 @@ public class SessionVariable implements Serializable,
Writable {
@VariableMgr.VarAttr(name = CODEGEN_LEVEL)
public int codegenLevel = 0;
- // 1024 minus 16 + 16 bytes padding that in padding pod array
+ // 4096 minus (16 + 16) bytes of padding used by the padded pod array
@VariableMgr.VarAttr(name = BATCH_SIZE)
- public int batchSize = 992;
+ public int batchSize = 4064;
@VariableMgr.VarAttr(name = DISABLE_STREAMING_PREAGGREGATIONS)
public boolean disableStreamPreaggregations = false;
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]