This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new 343d0def3d4 branch-4.0: [Improve](Variant) use COUNT_ON_INDEX on variant subcolumns #60404 (#60461)
343d0def3d4 is described below
commit 343d0def3d46e6cfb5127e3177a8e822a89fca52
Author: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Wed Feb 4 10:06:07 2026 +0800
branch-4.0: [Improve](Variant) use COUNT_ON_INDEX on variant subcolumns #60404 (#60461)
Cherry-picked from #60404
Co-authored-by: lihangyu <[email protected]>
---
be/src/olap/rowset/segment_v2/segment_iterator.cpp | 14 +-
.../segment_iterator_no_need_read_data_test.cpp | 60 ++++++++
...t_variant_count_on_index_fault_injection.groovy | 157 +++++++++++++++++++++
3 files changed, 228 insertions(+), 3 deletions(-)
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index 4d97fe2611e..f7513d357a7 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -19,6 +19,7 @@
#include <assert.h>
#include <gen_cpp/Exprs_types.h>
+#include <gen_cpp/Opcodes_types.h>
#include <gen_cpp/Types_types.h>
#include <gen_cpp/olap_file.pb.h>
@@ -1149,10 +1150,14 @@ bool SegmentIterator::_need_read_data(ColumnId cid) {
// Then, return false.
const auto& column = _opts.tablet_schema->column(cid);
// Different subcolumns may share the same parent_unique_id, so we choose to abandon this optimization.
- if (column.is_extracted_column()) {
+ if (column.is_extracted_column() &&
+ _opts.push_down_agg_type_opt != TPushAggOp::COUNT_ON_INDEX) {
return true;
}
int32_t unique_id = column.unique_id();
+ if (unique_id < 0) {
+ unique_id = column.parent_unique_id();
+ }
if ((_need_read_data_indices.contains(cid) &&
!_need_read_data_indices[cid] &&
!_output_columns.contains(unique_id)) ||
(_need_read_data_indices.contains(cid) &&
!_need_read_data_indices[cid] &&
@@ -2787,8 +2792,11 @@ void SegmentIterator::_calculate_expr_in_remaining_conjunct_root() {
}
}
}
- if (child->is_slot_ref()) {
- auto* column_slot_ref = assert_cast<vectorized::VSlotRef*>(child.get());
+ // Example: CAST(v['a'] AS VARCHAR) MATCH 'hello', do not add CAST expr to index tracking.
+ auto expr_without_cast = vectorized::VExpr::expr_without_cast(child);
+ if (expr_without_cast->is_slot_ref() && expr->op() != TExprOpcode::CAST) {
+ auto* column_slot_ref =
+ assert_cast<vectorized::VSlotRef*>(expr_without_cast.get());
_common_expr_index_exec_status[_schema->column_id(column_slot_ref->column_id())]
[expr.get()] = false;
_common_expr_to_slotref_map[root_expr_ctx.get()][column_slot_ref->column_id()] =
diff --git a/be/test/olap/rowset/segment_v2/segment_iterator_no_need_read_data_test.cpp b/be/test/olap/rowset/segment_v2/segment_iterator_no_need_read_data_test.cpp
new file mode 100644
index 00000000000..81cbfc8e0bf
--- /dev/null
+++ b/be/test/olap/rowset/segment_v2/segment_iterator_no_need_read_data_test.cpp
@@ -0,0 +1,60 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gtest/gtest.h"
+#include "olap/rowset/segment_v2/segment_iterator.h"
+#include "olap/tablet_schema.h"
+
+namespace doris::segment_v2 {
+
+TEST(SegmentIteratorNoNeedReadDataTest, extracted_variant_count_on_index) {
+ TabletSchemaPB schema_pb;
+ schema_pb.set_keys_type(KeysType::DUP_KEYS);
+ auto* root = schema_pb.add_column();
+ root->set_unique_id(1);
+ root->set_name("data");
+ root->set_type("VARIANT");
+ root->set_is_key(false);
+ root->set_is_nullable(true);
+ root->set_variant_max_subcolumns_count(3);
+ root->set_variant_max_sparse_column_statistics_size(10000);
+ root->set_variant_sparse_hash_shard_count(1);
+
+ auto tablet_schema = std::make_shared<TabletSchema>();
+ tablet_schema->init_from_pb(schema_pb);
+
+ TabletColumn subcol =
+ TabletColumn::create_materialized_variant_column("data", {"items", "content"}, 1, 3);
+ tablet_schema->append_column(subcol, TabletSchema::ColumnType::VARIANT);
+
+ const ColumnId subcol_cid = tablet_schema->field_index(*subcol.path_info_ptr());
+ ASSERT_GE(subcol_cid, 0);
+
+ auto read_schema = std::make_shared<Schema>(tablet_schema);
+ SegmentIterator iter(nullptr, read_schema);
+ iter._opts.tablet_schema = tablet_schema;
+ iter._opts.push_down_agg_type_opt = TPushAggOp::COUNT_ON_INDEX;
+ iter._need_read_data_indices[static_cast<uint32_t>(subcol_cid)] = false;
+ iter._output_columns.emplace(1);
+
+ EXPECT_FALSE(iter._need_read_data(subcol_cid));
+
+ iter._opts.push_down_agg_type_opt = TPushAggOp::NONE;
+ EXPECT_TRUE(iter._need_read_data(subcol_cid));
+}
+
+} // namespace doris::segment_v2
diff --git a/regression-test/suites/fault_injection_p0/test_variant_count_on_index_fault_injection.groovy b/regression-test/suites/fault_injection_p0/test_variant_count_on_index_fault_injection.groovy
new file mode 100644
index 00000000000..81674214b31
--- /dev/null
+++ b/regression-test/suites/fault_injection_p0/test_variant_count_on_index_fault_injection.groovy
@@ -0,0 +1,157 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_variant_count_on_index_fault_injection", "p0, nonConcurrent") {
+ def tbl = "test_variant_count_on_index_tbl"
+
+ def toInt = { v -> Integer.parseInt(v.toString()) }
+ def col0List = { res -> res.collect { it[0] == null ? "NULL" : it[0].toString() } }
+
+ sql "DROP TABLE IF EXISTS ${tbl}"
+
+ sql "set enable_common_expr_pushdown = true"
+ sql "set enable_count_on_index_pushdown = true"
+ sql "set enable_match_without_inverted_index = false"
+ sql "set experimental_enable_nereids_planner = true"
+ sql "set enable_fallback_to_original_planner = false"
+
+ sql """
+ CREATE TABLE ${tbl} (
+ k INT,
+ v VARIANT<'c':bigint, PROPERTIES("variant_max_subcolumns_count"="0")> NOT NULL,
+ v2 VARIANT<PROPERTIES("variant_max_subcolumns_count"="0")> NOT NULL,
+ INDEX idx_v(v) USING INVERTED PROPERTIES("parser" = "english") COMMENT '',
+ INDEX idx_v2(v2) USING INVERTED PROPERTIES("parser" = "english") COMMENT '',
+ INDEX idx_v_c(v) USING INVERTED PROPERTIES("field_pattern" = "c")
+ ) ENGINE=OLAP
+ DUPLICATE KEY(k)
+ DISTRIBUTED BY HASH(k) BUCKETS 1
+ PROPERTIES(
+ "replication_num" = "1",
+ "disable_auto_compaction" = "true"
+ );
+ """
+
+ sql """
+ INSERT INTO ${tbl} VALUES
+ (1, '{"a":"hello","b":"world","c":1}', '{"b":"foo"}'),
+ (2, '{"a":"hello","b":"world","c":2}', '{"b":"world"}'),
+ (3, '{"a":"hello","b":"xxx","c":1}', '{"c":1}'),
+ (4, '{"a":"xxx","b":"world","c":3}', '{"b":"bar"}'),
+ (5, '{"a":"hello hello","b":"world world","c":1}', '{"b":"baz"}');
+ """
+
+ sql "sync"
+ sql "analyze table ${tbl} with sync"
+
+ // ------------------------------------------------------------
+ // Case1: Correctness tests for each SQL
+ // ------------------------------------------------------------
+
+ // count on index
+ def r1 = sql "select count() from ${tbl} where v['a'] match 'hello'"
+ assertEquals(4, toInt(r1[0][0]))
+
+ def r2 = sql "select count(v['b']) from ${tbl} where v['a'] match 'hello' and v['b'] match 'world'"
+ assertEquals(3, toInt(r2[0][0]))
+
+ def r3 = sql "select count(v) from ${tbl} where v['a'] match 'hello' and v['b'] match 'world'"
+ assertEquals(3, toInt(r3[0][0]))
+
+ def r4 = sql "select count(v2['b']) from ${tbl} where v['a'] match 'hello'"
+ assertEquals(3, toInt(r4[0][0]))
+
+ // non count on index
+ def r5 = sql "select v['b'] from ${tbl} where v['a'] match 'hello'"
+ def r5v = col0List(r5).sort()
+ assertEquals(4, r5v.size())
+ assertTrue(r5v.any { it.toLowerCase().contains("world") })
+ assertTrue(r5v.any { it.toLowerCase().contains("xxx") })
+
+ def r6 = sql "select v2['b'] from ${tbl} where v['a'] match 'hello'"
+ def r6v = col0List(r6).sort()
+ assertEquals(4, r6v.size())
+ assertTrue(r6v.any { it.toLowerCase().contains("foo") })
+ assertTrue(r6v.any { it.toLowerCase().contains("world") })
+ assertTrue(r6v.any { it == "NULL" || it.toLowerCase().contains("null") })
+
+ def r7 = sql "select v['a'] from ${tbl} where v['a'] match 'hello'"
+ def r7v = col0List(r7).sort()
+ assertEquals(4, r7v.size())
+ assertTrue(r7v.any { it.toLowerCase().contains("hello hello") })
+
+ // numeric subcolumn: ensure numeric predicates can still use index-only count optimization
+ def rn1 = sql "select count() from ${tbl} where cast(v['c'] as bigint) = 1"
+ assertEquals(3, toInt(rn1[0][0]))
+
+ def rn2 = sql "select cast(v['c'] as bigint) from ${tbl} where cast(v['c'] as bigint) = 1"
+ assertEquals(3, rn2.size())
+
+ // Extra: Ensure COUNT_ON_INDEX is chosen in plan for the 4 count queries.
+ explain {
+ sql("select count() from ${tbl} where v['a'] match 'hello'")
+ contains "pushAggOp=COUNT_ON_INDEX"
+ }
+ explain {
+ sql("select count(v['b']) from ${tbl} where v['a'] match 'hello' and v['b'] match 'world'")
+ contains "pushAggOp=COUNT_ON_INDEX"
+ }
+ explain {
+ sql("select count(v) from ${tbl} where v['a'] match 'hello' and v['b'] match 'world'")
+ contains "pushAggOp=COUNT_ON_INDEX"
+ }
+ explain {
+ sql("select count(v2['b']) from ${tbl} where v['a'] match 'hello'")
+ contains "pushAggOp=COUNT_ON_INDEX"
+ }
+ explain {
+ sql("select count() from ${tbl} where cast(v['c'] as bigint) = 1")
+ contains "pushAggOp=COUNT_ON_INDEX"
+ }
+
+ def dp3 = sql "select count(v) from ${tbl} where v['a'] match 'hello' and v['b'] match 'world'"
+ assertEquals(3, toInt(dp3[0][0]))
+
+ // ------------------------------------------------------------
+ // Case2: DebugPoint validation - COUNT_ON_INDEX works and doesn't read data
+ // Reference: fault_injection_p0/test_need_read_data_fault_injection.groovy
+ // ------------------------------------------------------------
+
+ try {
+ GetDebugPoint().enableDebugPointForAllBEs("segment_iterator._read_columns_by_index", [column_name: "v.a"])
+ def dp4 = sql "select count(v2['b']) from ${tbl} where v['a'] match 'hello'"
+ assertEquals(3, toInt(dp4[0][0]))
+ } finally {
+ GetDebugPoint().disableDebugPointForAllBEs("segment_iterator._read_columns_by_index")
+ }
+
+ try {
+ GetDebugPoint().enableDebugPointForAllBEs("segment_iterator._read_columns_by_index")
+
+ def dp1 = sql "select count() from ${tbl} where v['a'] match 'hello'"
+ assertEquals(4, toInt(dp1[0][0]))
+
+ def dp2 = sql "select count(v['b']) from ${tbl} where v['a'] match 'hello' and v['b'] match 'world'"
+ assertEquals(3, toInt(dp2[0][0]))
+
+ // TODO: FIXME
+ // def dpn1 = sql "select count() from ${tbl} where cast(v['c'] as bigint) = 1"
+ // assertEquals(3, toInt(dpn1[0][0]))
+ } finally {
+ GetDebugPoint().disableDebugPointForAllBEs("segment_iterator._read_columns_by_index")
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]