This is an automated email from the ASF dual-hosted git repository.
mrhhsg pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new df86f2784cb [opt](olap) Optimize the performance of
StructFileColumnIterator::read_by_rowids in scenarios where the rowids are
continuous (#58851)
df86f2784cb is described below
commit df86f2784cb2adcec12f5c904274ad3a51e09082
Author: Jerry Hu <[email protected]>
AuthorDate: Fri Dec 12 10:09:32 2025 +0800
[opt](olap) Optimize the performance of
StructFileColumnIterator::read_by_rowids in scenarios where the rowids are
continuous (#58851)
### What problem does this PR solve?
Avoid seeking and reading row by row: detect runs of consecutive row ids and read each run as a single batch instead of issuing one seek + one-row read per rowid.
Issue Number: close #xxx
Related PR: #xxx
Problem Summary:
### Release note
None
### Check List (For Author)
- Test <!-- At least one of them must be included. -->
- [ ] Regression test
- [ ] Unit Test
- [ ] Manual test (add detailed scripts or steps below)
- [ ] No need to test or manual test. Explain why:
- [ ] This is a refactor/code format and no logic has been changed.
- [ ] Previous test can cover this change.
- [ ] No code files have been changed.
- [ ] Other reason <!-- Add your reason? -->
- Behavior changed:
- [ ] No.
- [ ] Yes. <!-- Explain the behavior change -->
- Does this need documentation?
- [ ] No.
- [ ] Yes. <!-- Add document PR link here. eg:
https://github.com/apache/doris-website/pull/1214 -->
### Check List (For Reviewer who merge this PR)
- [ ] Confirm the release note
- [ ] Confirm test cases
- [ ] Confirm document
- [ ] Add branch pick label <!-- Add the branch pick label for the branch(es) this PR
should be merged into -->
---
be/src/olap/rowset/segment_v2/column_reader.cpp | 53 ++++++++++++++++-----
.../complex_types/test_pruned_columns.out | 55 +++++++++++++++++++++-
.../complex_types/test_pruned_columns.groovy | 25 ++++++++--
3 files changed, 115 insertions(+), 18 deletions(-)
diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp
b/be/src/olap/rowset/segment_v2/column_reader.cpp
index 43aae937e40..b9fb575e56e 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/column_reader.cpp
@@ -966,7 +966,7 @@ Status MapFileColumnIterator::next_batch(size_t* n,
vectorized::MutableColumnPtr
return Status::OK();
}
- auto& column_map = assert_cast<vectorized::ColumnMap&>(
+ auto& column_map = assert_cast<vectorized::ColumnMap&,
TypeCheckOnRelease::DISABLE>(
dst->is_nullable() ?
static_cast<vectorized::ColumnNullable&>(*dst).get_nested_column()
: *dst);
auto column_offsets_ptr = column_map.get_offsets_column().assume_mutable();
@@ -1010,7 +1010,8 @@ Status MapFileColumnIterator::next_batch(size_t* n,
vectorized::MutableColumnPtr
RETURN_IF_ERROR(
_null_iterator->next_batch(&num_read, null_map_ptr,
&null_signs_has_null));
} else {
- auto& null_map =
assert_cast<vectorized::ColumnUInt8&>(*null_map_ptr);
+ auto& null_map = assert_cast<vectorized::ColumnUInt8&,
TypeCheckOnRelease::DISABLE>(
+ *null_map_ptr);
null_map.insert_many_vals(0, num_read);
}
DCHECK(num_read == *n);
@@ -1095,7 +1096,8 @@ Status MapFileColumnIterator::read_by_rowids(const
rowid_t* rowids, const size_t
ordinal_t ns = 0;
RETURN_IF_ERROR(_offsets_iterator->_peek_one_offset(&ns));
// overwrite with sentinel
-
assert_cast<vectorized::ColumnOffset64&>(*next_starts_col).get_data()[i] = ns;
+ assert_cast<vectorized::ColumnOffset64&,
TypeCheckOnRelease::DISABLE>(*next_starts_col)
+ .get_data()[i] = ns;
}
}
@@ -1260,7 +1262,7 @@ Status StructFileColumnIterator::next_batch(size_t* n,
vectorized::MutableColumn
return Status::OK();
}
- auto& column_struct = assert_cast<vectorized::ColumnStruct&>(
+ auto& column_struct = assert_cast<vectorized::ColumnStruct&,
TypeCheckOnRelease::DISABLE>(
dst->is_nullable() ?
static_cast<vectorized::ColumnNullable&>(*dst).get_nested_column()
: *dst);
for (size_t i = 0; i < column_struct.tuple_size(); i++) {
@@ -1286,7 +1288,8 @@ Status StructFileColumnIterator::next_batch(size_t* n,
vectorized::MutableColumn
RETURN_IF_ERROR(
_null_iterator->next_batch(&num_read, null_map_ptr,
&null_signs_has_null));
} else {
- auto& null_map =
assert_cast<vectorized::ColumnUInt8&>(*null_map_ptr);
+ auto& null_map = assert_cast<vectorized::ColumnUInt8&,
TypeCheckOnRelease::DISABLE>(
+ *null_map_ptr);
null_map.insert_many_vals(0, num_read);
}
DCHECK(num_read == *n);
@@ -1318,12 +1321,33 @@ Status StructFileColumnIterator::read_by_rowids(const
rowid_t* rowids, const siz
return Status::OK();
}
- for (size_t i = 0; i < count; ++i) {
- RETURN_IF_ERROR(seek_to_ordinal(rowids[i]));
- size_t num_read = 1;
+ if (count == 0) {
+ return Status::OK();
+ }
+
+ size_t this_run = 1;
+ auto start_idx = rowids[0];
+ auto last_idx = rowids[0];
+ for (size_t i = 1; i < count; ++i) {
+ if (last_idx == rowids[i] - 1) {
+ last_idx = rowids[i];
+ this_run++;
+ continue;
+ }
+ RETURN_IF_ERROR(seek_to_ordinal(start_idx));
+ size_t num_read = this_run;
RETURN_IF_ERROR(next_batch(&num_read, dst, nullptr));
- DCHECK(num_read == 1);
+ DCHECK_EQ(num_read, this_run);
+
+ start_idx = rowids[i];
+ last_idx = rowids[i];
+ this_run = 1;
}
+
+ RETURN_IF_ERROR(seek_to_ordinal(start_idx));
+ size_t num_read = this_run;
+ RETURN_IF_ERROR(next_batch(&num_read, dst, nullptr));
+ DCHECK_EQ(num_read, this_run);
return Status::OK();
}
@@ -1425,8 +1449,9 @@ Status
OffsetFileColumnIterator::_peek_one_offset(ordinal_t* offset) {
_peek_tmp_col->clear();
RETURN_IF_ERROR(offset_page_decoder->peek_next_batch(&n,
_peek_tmp_col)); // not null
DCHECK(_peek_tmp_col->size() == 1);
- *offset =
- assert_cast<const
vectorized::ColumnOffset64*>(_peek_tmp_col.get())->get_element(0);
+ *offset = assert_cast<const vectorized::ColumnOffset64*,
TypeCheckOnRelease::DISABLE>(
+ _peek_tmp_col.get())
+ ->get_element(0);
} else {
*offset =
_offset_iterator->get_current_page()->next_array_item_ordinal;
}
@@ -1557,7 +1582,8 @@ Status ArrayFileColumnIterator::next_batch(size_t* n,
vectorized::MutableColumnP
RETURN_IF_ERROR(
_null_iterator->next_batch(&num_read, null_map_ptr,
&null_signs_has_null));
} else {
- auto& null_map =
assert_cast<vectorized::ColumnUInt8&>(*null_map_ptr);
+ auto& null_map = assert_cast<vectorized::ColumnUInt8&,
TypeCheckOnRelease::DISABLE>(
+ *null_map_ptr);
null_map.insert_many_vals(0, num_read);
}
DCHECK(num_read == *n);
@@ -2137,7 +2163,8 @@ void
DefaultValueColumnIterator::_insert_many_default(vectorized::MutableColumnP
Status RowIdColumnIteratorV2::next_batch(size_t* n,
vectorized::MutableColumnPtr& dst,
bool* has_null) {
- auto* string_column = assert_cast<vectorized::ColumnString*>(dst.get());
+ auto* string_column =
+ assert_cast<vectorized::ColumnString*,
TypeCheckOnRelease::DISABLE>(dst.get());
for (uint32_t i = 0; i < *n; ++i) {
uint32_t row_id = _current_rowid + i;
diff --git
a/regression-test/data/datatype_p0/complex_types/test_pruned_columns.out
b/regression-test/data/datatype_p0/complex_types/test_pruned_columns.out
index 86728bafd1c..b3312aa670c 100644
--- a/regression-test/data/datatype_p0/complex_types/test_pruned_columns.out
+++ b/regression-test/data/datatype_p0/complex_types/test_pruned_columns.out
@@ -1,7 +1,18 @@
-- This file is automatically generated. You should know what you did if you
want to edit this
-- !sql --
-1 {"city":"beijing", "data":[{1:{"a":10, "b":20}, 2:{"a":30, "b":40}}]}
-2 {"city":"shanghai", "data":[{2:{"a":50, "b":40}, 1:{"a":70, "b":80}}]}
+1 {"city":"beijing", "data":[{1:{"a":10, "b":20}, 2:{"a":30, "b":40}}],
"value":1}
+2 {"city":"shanghai", "data":[{2:{"a":50, "b":40}, 1:{"a":70, "b":80}}],
"value":2}
+3 {"city":"guangzhou", "data":[{1:{"a":90, "b":60}, 2:{"a":110,
"b":40}}], "value":3}
+4 {"city":"shenzhen", "data":[{2:{"a":130, "b":20}, 1:{"a":150,
"b":40}}], "value":4}
+5 {"city":"hangzhou", "data":[{1:{"a":170, "b":80}, 2:{"a":190,
"b":40}}], "value":5}
+6 {"city":"nanjing", "data":[{2:{"a":210, "b":60}, 1:{"a":230, "b":40}}],
"value":6}
+7 {"city":"tianjin", "data":[{1:{"a":250, "b":20}, 2:{"a":270, "b":40}}],
"value":7}
+8 {"city":"chongqing", "data":[{2:{"a":290, "b":80}, 1:{"a":310,
"b":40}}], "value":8}
+9 {"city":"wuhan", "data":[{1:{"a":330, "b":60}, 2:{"a":350, "b":40}}],
"value":9}
+10 {"city":"xian", "data":[{2:{"a":370, "b":20}, 1:{"a":390, "b":40}}],
"value":10}
+11 {"city":"changsha", "data":[{1:{"a":410, "b":80}, 2:{"a":430,
"b":40}}], "value":11}
+12 {"city":"qingdao", "data":[{2:{"a":450, "b":60}, 1:{"a":470, "b":40}}],
"value":12}
+13 {"city":"dalian", "data":[{1:{"a":490, "b":20}, 2:{"a":510, "b":40}}],
"value":13}
-- !sql1 --
1 [10]
@@ -9,18 +20,58 @@
-- !sql2 --
1 beijing
2 shanghai
+3 guangzhou
+4 shenzhen
+5 hangzhou
+6 nanjing
+7 tianjin
+8 chongqing
+9 wuhan
+10 xian
+11 changsha
+12 qingdao
+13 dalian
-- !sql3 --
1 [{1:{"a":10, "b":20}, 2:{"a":30, "b":40}}]
2 [{2:{"a":50, "b":40}, 1:{"a":70, "b":80}}]
+3 [{1:{"a":90, "b":60}, 2:{"a":110, "b":40}}]
+4 [{2:{"a":130, "b":20}, 1:{"a":150, "b":40}}]
+5 [{1:{"a":170, "b":80}, 2:{"a":190, "b":40}}]
+6 [{2:{"a":210, "b":60}, 1:{"a":230, "b":40}}]
+7 [{1:{"a":250, "b":20}, 2:{"a":270, "b":40}}]
+8 [{2:{"a":290, "b":80}, 1:{"a":310, "b":40}}]
+9 [{1:{"a":330, "b":60}, 2:{"a":350, "b":40}}]
+10 [{2:{"a":370, "b":20}, 1:{"a":390, "b":40}}]
+11 [{1:{"a":410, "b":80}, 2:{"a":430, "b":40}}]
+12 [{2:{"a":450, "b":60}, 1:{"a":470, "b":40}}]
+13 [{1:{"a":490, "b":20}, 2:{"a":510, "b":40}}]
-- !sql4 --
1 [{1:{"a":10, "b":20}, 2:{"a":30, "b":40}}]
2 [{2:{"a":50, "b":40}, 1:{"a":70, "b":80}}]
+3 [{1:{"a":90, "b":60}, 2:{"a":110, "b":40}}]
+5 [{1:{"a":170, "b":80}, 2:{"a":190, "b":40}}]
+7 [{1:{"a":250, "b":20}, 2:{"a":270, "b":40}}]
+9 [{1:{"a":330, "b":60}, 2:{"a":350, "b":40}}]
+11 [{1:{"a":410, "b":80}, 2:{"a":430, "b":40}}]
+13 [{1:{"a":490, "b":20}, 2:{"a":510, "b":40}}]
-- !sql5 --
1 beijing
2 shanghai
+3 guangzhou
+5 hangzhou
+7 tianjin
+9 wuhan
+11 changsha
+13 dalian
+
+-- !sql5_1 --
+61
+
+-- !sql5_2 --
+61
-- !sql6 --
2
diff --git
a/regression-test/suites/datatype_p0/complex_types/test_pruned_columns.groovy
b/regression-test/suites/datatype_p0/complex_types/test_pruned_columns.groovy
index c2a7e2b7146..a99b7b6da59 100644
---
a/regression-test/suites/datatype_p0/complex_types/test_pruned_columns.groovy
+++
b/regression-test/suites/datatype_p0/complex_types/test_pruned_columns.groovy
@@ -20,7 +20,7 @@ suite("test_pruned_columns") {
sql """
CREATE TABLE `tbl_test_pruned_columns` (
`id` int NULL,
- `s` struct<city:text,data:array<map<int,struct<a:int,b:double>>>>
NULL
+ `s` struct<city:text,data:array<map<int,struct<a:int,b:double>>>,
value:int> NULL
) ENGINE=OLAP
DUPLICATE KEY(`id`)
DISTRIBUTED BY RANDOM BUCKETS AUTO
@@ -31,8 +31,19 @@ suite("test_pruned_columns") {
sql """
insert into `tbl_test_pruned_columns` values
- (1, named_struct('city', 'beijing', 'data', array(map(1,
named_struct('a', 10, 'b', 20.0), 2, named_struct('a', 30, 'b', 40))))),
- (2, named_struct('city', 'shanghai', 'data', array(map(2,
named_struct('a', 50, 'b', 40.0), 1, named_struct('a', 70, 'b', 80)))));
+ (1, named_struct('city', 'beijing', 'data', array(map(1,
named_struct('a', 10, 'b', 20.0), 2, named_struct('a', 30, 'b', 40))), 'value',
1)),
+ (2, named_struct('city', 'shanghai', 'data', array(map(2,
named_struct('a', 50, 'b', 40.0), 1, named_struct('a', 70, 'b', 80))), 'value',
2)),
+ (3, named_struct('city', 'guangzhou', 'data', array(map(1,
named_struct('a', 90, 'b', 60.0), 2, named_struct('a', 110, 'b', 40))),
'value', 3)),
+ (4, named_struct('city', 'shenzhen', 'data', array(map(2,
named_struct('a', 130, 'b', 20.0), 1, named_struct('a', 150, 'b', 40))),
'value', 4)),
+ (5, named_struct('city', 'hangzhou', 'data', array(map(1,
named_struct('a', 170, 'b', 80.0), 2, named_struct('a', 190, 'b', 40))),
'value', 5)),
+ (6, named_struct('city', 'nanjing', 'data', array(map(2,
named_struct('a', 210, 'b', 60.0), 1, named_struct('a', 230, 'b', 40))),
'value', 6)),
+ (7, named_struct('city', 'tianjin', 'data', array(map(1,
named_struct('a', 250, 'b', 20.0), 2, named_struct('a', 270, 'b', 40))),
'value', 7)),
+ (8, named_struct('city', 'chongqing', 'data', array(map(2,
named_struct('a', 290, 'b', 80.0), 1, named_struct('a', 310, 'b', 40))),
'value', 8)),
+ (9, named_struct('city', 'wuhan', 'data', array(map(1,
named_struct('a', 330, 'b', 60.0), 2, named_struct('a', 350, 'b', 40))),
'value', 9)),
+ (10, named_struct('city', 'xian', 'data', array(map(2,
named_struct('a', 370, 'b', 20.0), 1, named_struct('a', 390, 'b', 40))),
'value', 10)),
+ (11, named_struct('city', 'changsha', 'data', array(map(1,
named_struct('a', 410, 'b', 80.0), 2, named_struct('a', 430, 'b', 40))),
'value', 11)),
+ (12, named_struct('city', 'qingdao', 'data', array(map(2,
named_struct('a', 450, 'b', 60.0), 1, named_struct('a', 470, 'b', 40))),
'value', 12)),
+ (13, named_struct('city', 'dalian', 'data', array(map(1,
named_struct('a', 490, 'b', 20.0), 2, named_struct('a', 510, 'b', 40))),
'value', 13));
"""
qt_sql """
@@ -59,6 +70,14 @@ suite("test_pruned_columns") {
select id, struct_element(s, 'city') from `tbl_test_pruned_columns`
where struct_element(struct_element(s, 'data')[1][2], 'b') = 40 order by 1;
"""
+ qt_sql5_1 """
+ select /*+ set enable_prune_nested_column = 1; */ sum(s.value) from
`tbl_test_pruned_columns` where id in(1,2,3,4,8,9,10,11,13);
+ """
+
+ qt_sql5_2 """
+ select /*+ set enable_prune_nested_column = 0; */ sum(s.value) from
`tbl_test_pruned_columns` where id in(1,2,3,4,8,9,10,11,13);
+ """
+
sql """DROP TABLE IF EXISTS `tbl_test_pruned_columns_map`"""
sql """
CREATE TABLE `tbl_test_pruned_columns_map` (
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]