This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new 633eaeeca83 branch-4.0: [opt](memory) set source content column by
column when flush memtable for partial update (#59547)
633eaeeca83 is described below
commit 633eaeeca8331c17a1fccbf404060a71a32a384e
Author: hui lai <[email protected]>
AuthorDate: Mon Jan 5 17:37:56 2026 +0800
branch-4.0: [opt](memory) set source content column by column when flush
memtable for partial update (#59547)
pick #58782
### What problem does this PR solve?
related issue: https://github.com/apache/doris/issues/58780
When loading into a 5000-column wide table, flushing the memtable consumes a lot
of memory:
<img width="526" height="430" alt="image"
src="https://github.com/user-attachments/assets/816bcc8b-d9d0-4105-a96b-3fbcc931b0b1"
/>
Setting the source content column by column when flushing the memtable for
partial update solves this problem; after this optimization, the memory usage
of `_append_block_with_partial_content` is hardly visible in the profile.
### What problem does this PR solve?
Issue Number: close #xxx
Related PR: #xxx
Problem Summary:
### Release note
None
### Check List (For Author)
- Test <!-- At least one of them must be included. -->
- [ ] Regression test
- [ ] Unit Test
- [ ] Manual test (add detailed scripts or steps below)
- [ ] No need to test or manual test. Explain why:
- [ ] This is a refactor/code format and no logic has been changed.
- [ ] Previous test can cover this change.
- [ ] No code files have been changed.
- [ ] Other reason <!-- Add your reason? -->
- Behavior changed:
- [ ] No.
- [ ] Yes. <!-- Explain the behavior change -->
- Does this need documentation?
- [ ] No.
- [ ] Yes. <!-- Add document PR link here. eg:
https://github.com/apache/doris-website/pull/1214 -->
### Check List (For Reviewer who merge this PR)
- [ ] Confirm the release note
- [ ] Confirm test cases
- [ ] Confirm document
- [ ] Add branch pick label <!-- Add branch pick label that this PR
should merge into -->
---
.../rowset/segment_v2/vertical_segment_writer.cpp | 24 ++++++++++++++++++----
1 file changed, 20 insertions(+), 4 deletions(-)
diff --git a/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp
b/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp
index 80c8f78e093..7e9a1cdf1e9 100644
--- a/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp
@@ -528,8 +528,6 @@ Status
VerticalSegmentWriter::_append_block_with_partial_content(RowsInBlock& da
for (auto i : including_cids) {
full_block.replace_by_position(i,
data.block->get_by_position(input_id++).column);
}
-
RETURN_IF_ERROR(_olap_data_convertor->set_source_content_with_specifid_columns(
- &full_block, data.row_pos, data.num_rows, including_cids));
bool have_input_seq_column = false;
// write including columns
@@ -537,6 +535,8 @@ Status
VerticalSegmentWriter::_append_block_with_partial_content(RowsInBlock& da
vectorized::IOlapColumnDataAccessor* seq_column = nullptr;
uint32_t segment_start_pos = 0;
for (auto cid : including_cids) {
+
RETURN_IF_ERROR(_olap_data_convertor->set_source_content_with_specifid_columns(
+ &full_block, data.row_pos, data.num_rows,
std::vector<uint32_t> {cid}));
// here we get segment column row num before append data.
segment_start_pos =
cast_set<uint32_t>(_column_writers[cid]->get_next_rowid());
// olap data convertor alway start from id = 0
@@ -554,6 +554,15 @@ Status
VerticalSegmentWriter::_append_block_with_partial_content(RowsInBlock& da
RETURN_IF_ERROR(_column_writers[cid]->append(column->get_nullmap(),
column->get_data(),
data.num_rows));
RETURN_IF_ERROR(_finalize_column_writer_and_update_meta(cid));
+ // Don't clear source content for key columns and sequence column here,
+ // as they will be used later in _full_encode_keys() and
_generate_primary_key_index().
+ // They will be cleared at the end of this method.
+ bool is_key_column = (cid < _num_sort_key_columns);
+ bool is_seq_column = (_tablet_schema->has_sequence_col() &&
+ cid == _tablet_schema->sequence_col_idx() &&
have_input_seq_column);
+ if (!is_key_column && !is_seq_column) {
+ _olap_data_convertor->clear_source_content(cid);
+ }
}
bool has_default_or_nullable = false;
@@ -629,9 +638,9 @@ Status
VerticalSegmentWriter::_append_block_with_partial_content(RowsInBlock& da
// convert missing columns and send to column writer
const auto& missing_cids =
_opts.rowset_ctx->partial_update_info->missing_cids;
-
RETURN_IF_ERROR(_olap_data_convertor->set_source_content_with_specifid_columns(
- &full_block, data.row_pos, data.num_rows, missing_cids));
for (auto cid : missing_cids) {
+
RETURN_IF_ERROR(_olap_data_convertor->set_source_content_with_specifid_columns(
+ &full_block, data.row_pos, data.num_rows,
std::vector<uint32_t> {cid}));
auto [status, column] = _olap_data_convertor->convert_column_data(cid);
if (!status.ok()) {
return status;
@@ -644,6 +653,13 @@ Status
VerticalSegmentWriter::_append_block_with_partial_content(RowsInBlock& da
RETURN_IF_ERROR(_column_writers[cid]->append(column->get_nullmap(),
column->get_data(),
data.num_rows));
RETURN_IF_ERROR(_finalize_column_writer_and_update_meta(cid));
+ // Don't clear source content for sequence column here if it will be
used later
+ // in _generate_primary_key_index(). It will be cleared at the end of
this method.
+ bool is_seq_column = (_tablet_schema->has_sequence_col() &&
!have_input_seq_column &&
+ cid == _tablet_schema->sequence_col_idx());
+ if (!is_seq_column) {
+ _olap_data_convertor->clear_source_content(cid);
+ }
}
_num_rows_updated += stats.num_rows_updated;
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]