github-actions[bot] commented on code in PR #63182:
URL: https://github.com/apache/doris/pull/63182#discussion_r3226438269
##########
be/src/storage/segment/vertical_segment_writer.cpp:
##########
@@ -736,13 +736,48 @@ Status
VerticalSegmentWriter::_append_block_with_flexible_partial_content(RowsIn
RETURN_IF_ERROR(_create_column_writer(cid,
_tablet_schema->column(cid), _tablet_schema));
}
+ std::vector<BitmapValue>* skip_bitmaps = &(
+ assert_cast<ColumnBitmap*>(
+
data.block->get_by_position(skip_bitmap_col_idx).column->assume_mutable().get())
+ ->get_data());
+ if (_tablet_schema->num_variant_columns() > 0) {
+ std::vector<uint32_t> variant_cids;
+ variant_cids.reserve(_tablet_schema->num_variant_columns());
+ for (size_t cid = _tablet_schema->num_key_columns(); cid <
_tablet_schema->num_columns();
+ ++cid) {
+ const auto& column = _tablet_schema->column(cid);
+ if (!column.is_variant_type()) {
+ continue;
+ }
+ variant_cids.push_back(cast_set<uint32_t>(cid));
+ }
+ RETURN_IF_ERROR(variant_util::parse_and_materialize_variant_columns(
+ *const_cast<Block*>(data.block), *_tablet_schema,
variant_cids, true));
+ for (auto cid : variant_cids) {
+ const auto& column = _tablet_schema->column(cid);
+ for (size_t block_pos = data.row_pos; block_pos < data.row_pos +
data.num_rows;
+ ++block_pos) {
+ auto& skip_bitmap = skip_bitmaps->at(block_pos);
+ if (!skip_bitmap.contains(column.unique_id())) {
+ RETURN_IF_ERROR(variant_util::mark_variant_patch_paths(
+ *data.block->get_by_position(cid).column,
block_pos, column.unique_id(),
+ &skip_bitmap));
+ }
+ }
+ }
+ }
+
// 1. aggregate duplicate rows in block
RETURN_IF_ERROR(_block_aggregator.aggregate_for_flexible_partial_update(
const_cast<Block*>(data.block), data.num_rows, specified_rowsets,
segment_caches));
Review Comment:
This new aggregation step is not row-slice safe. Just above, marker
collection uses the absolute slice range `data.row_pos .. data.row_pos +
data.num_rows`, but `aggregate_for_flexible_partial_update()` only receives
`data.num_rows` and internally processes rows starting at 0. When a
`RowsInBlock` slice has `row_pos > 0` (the segment writer API supports this and
later code still passes `data.row_pos` to converters), duplicate-key
aggregation will merge/read the wrong rows. If aggregation swaps the block, the
block now starts at 0 while `data.row_pos` is kept unchanged, so later
skip-bitmap/delete-sign/key-column accesses can skip the aggregated rows or go
out of range. Please pass `row_pos` through the aggregation path or
materialize/reset the slice consistently, and add coverage for a flexible
VARIANT partial update slice with non-zero `row_pos` and duplicate keys.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]