This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new 343d0def3d4 branch-4.0: [Improve](Variant) use COUNT_ON_INDEX on variant subcolumns #60404 (#60461)
343d0def3d4 is described below
commit 343d0def3d46e6cfb5127e3177a8e822a89fca52
Author: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Wed Feb 4 10:06:07 2026 +0800
branch-4.0: [Improve](Variant) use COUNT_ON_INDEX on variant subcolumns #60404 (#60461)
Cherry-picked from #60404
Co-authored-by: lihangyu <[email protected]>
---
be/src/olap/rowset/segment_v2/segment_iterator.cpp | 14 +-
.../segment_iterator_no_need_read_data_test.cpp | 60 ++++++++
...t_variant_count_on_index_fault_injection.groovy | 157 +++++++++++++++++++++
3 files changed, 228 insertions(+), 3 deletions(-)
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index 4d97fe2611e..f7513d357a7 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -19,6 +19,7 @@
#include <assert.h>
#include <gen_cpp/Exprs_types.h>
+#include <gen_cpp/Opcodes_types.h>
#include <gen_cpp/Types_types.h>
#include <gen_cpp/olap_file.pb.h>
@@ -1149,10 +1150,14 @@ bool SegmentIterator::_need_read_data(ColumnId cid) {
// Then, return false.
const auto& column = _opts.tablet_schema->column(cid);
// Different subcolumns may share the same parent_unique_id, so we choose to abandon this optimization.
- if (column.is_extracted_column()) {
+ if (column.is_extracted_column() &&
+ _opts.push_down_agg_type_opt != TPushAggOp::COUNT_ON_INDEX) {
return true;
}
int32_t unique_id = column.unique_id();
+ if (unique_id < 0) {
+ unique_id = column.parent_unique_id();
+ }
if ((_need_read_data_indices.contains(cid) &&
!_need_read_data_indices[cid] &&
!_output_columns.contains(unique_id)) ||
(_need_read_data_indices.contains(cid) &&
!_need_read_data_indices[cid] &&
@@ -2787,8 +2792,11 @@ void SegmentIterator::_calculate_expr_in_remaining_conjunct_root() {
}
}
}
- if (child->is_slot_ref()) {
- auto* column_slot_ref = assert_cast<vectorized::VSlotRef*>(child.get());
+ // Example: CAST(v['a'] AS VARCHAR) MATCH 'hello', do not add CAST expr to index tracking.
+ auto expr_without_cast = vectorized::VExpr::expr_without_cast(child);
+ if (expr_without_cast->is_slot_ref() && expr->op() != TExprOpcode::CAST) {
+ auto* column_slot_ref =
+ assert_cast<vectorized::VSlotRef*>(expr_without_cast.get());
_common_expr_index_exec_status[_schema->column_id(column_slot_ref->column_id())]
[expr.get()] = false;
_common_expr_to_slotref_map[root_expr_ctx.get()][column_slot_ref->column_id()] =
diff --git a/be/test/olap/rowset/segment_v2/segment_iterator_no_need_read_data_test.cpp b/be/test/olap/rowset/segment_v2/segment_iterator_no_need_read_data_test.cpp
new file mode 100644
index 00000000000..81cbfc8e0bf
--- /dev/null
+++ b/be/test/olap/rowset/segment_v2/segment_iterator_no_need_read_data_test.cpp
@@ -0,0 +1,60 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gtest/gtest.h"
+#include "olap/rowset/segment_v2/segment_iterator.h"
+#include "olap/tablet_schema.h"
+
+namespace doris::segment_v2 {
+
+TEST(SegmentIteratorNoNeedReadDataTest, extracted_variant_count_on_index) {
+ TabletSchemaPB schema_pb;
+ schema_pb.set_keys_type(KeysType::DUP_KEYS);
+ auto* root = schema_pb.add_column();
+ root->set_unique_id(1);
+ root->set_name("data");
+ root->set_type("VARIANT");
+ root->set_is_key(false);
+ root->set_is_nullable(true);
+ root->set_variant_max_subcolumns_count(3);
+ root->set_variant_max_sparse_column_statistics_size(10000);
+ root->set_variant_sparse_hash_shard_count(1);
+
+ auto tablet_schema = std::make_shared<TabletSchema>();
+ tablet_schema->init_from_pb(schema_pb);
+
+ TabletColumn subcol =
+ TabletColumn::create_materialized_variant_column("data", {"items", "content"}, 1, 3);
+ tablet_schema->append_column(subcol, TabletSchema::ColumnType::VARIANT);
+
+ const ColumnId subcol_cid = tablet_schema->field_index(*subcol.path_info_ptr());
+ ASSERT_GE(subcol_cid, 0);
+
+ auto read_schema = std::make_shared<Schema>(tablet_schema);
+ SegmentIterator iter(nullptr, read_schema);
+ iter._opts.tablet_schema = tablet_schema;
+ iter._opts.push_down_agg_type_opt = TPushAggOp::COUNT_ON_INDEX;
+ iter._need_read_data_indices[static_cast<uint32_t>(subcol_cid)] = false;
+ iter._output_columns.emplace(1);
+
+ EXPECT_FALSE(iter._need_read_data(subcol_cid));
+
+ iter._opts.push_down_agg_type_opt = TPushAggOp::NONE;
+ EXPECT_TRUE(iter._need_read_data(subcol_cid));
+}
+
+} // namespace doris::segment_v2
diff --git a/regression-test/suites/fault_injection_p0/test_variant_count_on_index_fault_injection.groovy b/regression-test/suites/fault_injection_p0/test_variant_count_on_index_fault_injection.groovy
new file mode 100644
index 00000000000..81674214b31
--- /dev/null
+++ b/regression-test/suites/fault_injection_p0/test_variant_count_on_index_fault_injection.groovy
@@ -0,0 +1,157 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_variant_count_on_index_fault_injection", "p0, nonConcurrent") {
+ def tbl = "test_variant_count_on_index_tbl"
+
+ def toInt = { v -> Integer.parseInt(v.toString()) }
+ def col0List = { res -> res.collect { it[0] == null ? "NULL" : it[0].toString() } }
+
+ sql "DROP TABLE IF EXISTS ${tbl}"
+
+ sql "set enable_common_expr_pushdown = true"
+ sql "set enable_count_on_index_pushdown = true"
+ sql "set enable_match_without_inverted_index = false"
+ sql "set experimental_enable_nereids_planner = true"
+ sql "set enable_fallback_to_original_planner = false"
+
+ sql """
+ CREATE TABLE ${tbl} (
+ k INT,
+ v VARIANT<'c':bigint, PROPERTIES("variant_max_subcolumns_count"="0")> NOT NULL,
+ v2 VARIANT<PROPERTIES("variant_max_subcolumns_count"="0")> NOT NULL,
+ INDEX idx_v(v) USING INVERTED PROPERTIES("parser" = "english") COMMENT '',
+ INDEX idx_v2(v2) USING INVERTED PROPERTIES("parser" = "english") COMMENT '',
+ INDEX idx_v_c(v) USING INVERTED PROPERTIES("field_pattern" = "c")
+ ) ENGINE=OLAP
+ DUPLICATE KEY(k)
+ DISTRIBUTED BY HASH(k) BUCKETS 1
+ PROPERTIES(
+ "replication_num" = "1",
+ "disable_auto_compaction" = "true"
+ );
+ """
+
+ sql """
+ INSERT INTO ${tbl} VALUES
+ (1, '{"a":"hello","b":"world","c":1}', '{"b":"foo"}'),
+ (2, '{"a":"hello","b":"world","c":2}', '{"b":"world"}'),
+ (3, '{"a":"hello","b":"xxx","c":1}', '{"c":1}'),
+ (4, '{"a":"xxx","b":"world","c":3}', '{"b":"bar"}'),
+ (5, '{"a":"hello hello","b":"world world","c":1}', '{"b":"baz"}');
+ """
+
+ sql "sync"
+ sql "analyze table ${tbl} with sync"
+
+ // ------------------------------------------------------------
+ // Case1: Correctness tests for each SQL
+ // ------------------------------------------------------------
+
+ // count on index
+ def r1 = sql "select count() from ${tbl} where v['a'] match 'hello'"
+ assertEquals(4, toInt(r1[0][0]))
+
+ def r2 = sql "select count(v['b']) from ${tbl} where v['a'] match 'hello' and v['b'] match 'world'"
+ assertEquals(3, toInt(r2[0][0]))
+
+ def r3 = sql "select count(v) from ${tbl} where v['a'] match 'hello' and v['b'] match 'world'"
+ assertEquals(3, toInt(r3[0][0]))
+
+ def r4 = sql "select count(v2['b']) from ${tbl} where v['a'] match 'hello'"
+ assertEquals(3, toInt(r4[0][0]))
+
+ // non count on index
+ def r5 = sql "select v['b'] from ${tbl} where v['a'] match 'hello'"
+ def r5v = col0List(r5).sort()
+ assertEquals(4, r5v.size())
+ assertTrue(r5v.any { it.toLowerCase().contains("world") })
+ assertTrue(r5v.any { it.toLowerCase().contains("xxx") })
+
+ def r6 = sql "select v2['b'] from ${tbl} where v['a'] match 'hello'"
+ def r6v = col0List(r6).sort()
+ assertEquals(4, r6v.size())
+ assertTrue(r6v.any { it.toLowerCase().contains("foo") })
+ assertTrue(r6v.any { it.toLowerCase().contains("world") })
+ assertTrue(r6v.any { it == "NULL" || it.toLowerCase().contains("null") })
+
+ def r7 = sql "select v['a'] from ${tbl} where v['a'] match 'hello'"
+ def r7v = col0List(r7).sort()
+ assertEquals(4, r7v.size())
+ assertTrue(r7v.any { it.toLowerCase().contains("hello hello") })
+
+ // numeric subcolumn: ensure numeric predicates can still use index-only count optimization
+ def rn1 = sql "select count() from ${tbl} where cast(v['c'] as bigint) = 1"
+ assertEquals(3, toInt(rn1[0][0]))
+
+ def rn2 = sql "select cast(v['c'] as bigint) from ${tbl} where cast(v['c'] as bigint) = 1"
+ assertEquals(3, rn2.size())
+
+ // Extra: Ensure COUNT_ON_INDEX is chosen in plan for the 4 count queries.
+ explain {
+ sql("select count() from ${tbl} where v['a'] match 'hello'")
+ contains "pushAggOp=COUNT_ON_INDEX"
+ }
+ explain {
+ sql("select count(v['b']) from ${tbl} where v['a'] match 'hello' and v['b'] match 'world'")
+ contains "pushAggOp=COUNT_ON_INDEX"
+ }
+ explain {
+ sql("select count(v) from ${tbl} where v['a'] match 'hello' and v['b'] match 'world'")
+ contains "pushAggOp=COUNT_ON_INDEX"
+ }
+ explain {
+ sql("select count(v2['b']) from ${tbl} where v['a'] match 'hello'")
+ contains "pushAggOp=COUNT_ON_INDEX"
+ }
+ explain {
+ sql("select count() from ${tbl} where cast(v['c'] as bigint) = 1")
+ contains "pushAggOp=COUNT_ON_INDEX"
+ }
+
+ def dp3 = sql "select count(v) from ${tbl} where v['a'] match 'hello' and v['b'] match 'world'"
+ assertEquals(3, toInt(dp3[0][0]))
+
+ // ------------------------------------------------------------
+ // Case2: DebugPoint validation - COUNT_ON_INDEX works and doesn't read data
+ // Reference: fault_injection_p0/test_need_read_data_fault_injection.groovy
+ // ------------------------------------------------------------
+
+ try {
+ GetDebugPoint().enableDebugPointForAllBEs("segment_iterator._read_columns_by_index", [column_name: "v.a"])
+ def dp4 = sql "select count(v2['b']) from ${tbl} where v['a'] match 'hello'"
+ assertEquals(3, toInt(dp4[0][0]))
+ } finally {
+ GetDebugPoint().disableDebugPointForAllBEs("segment_iterator._read_columns_by_index")
+ }
+
+ try {
+ GetDebugPoint().enableDebugPointForAllBEs("segment_iterator._read_columns_by_index")
+
+ def dp1 = sql "select count() from ${tbl} where v['a'] match 'hello'"
+ assertEquals(4, toInt(dp1[0][0]))
+
+ def dp2 = sql "select count(v['b']) from ${tbl} where v['a'] match 'hello' and v['b'] match 'world'"
+ assertEquals(3, toInt(dp2[0][0]))
+
+ // TODO: FIXME
+ // def dpn1 = sql "select count() from ${tbl} where cast(v['c'] as bigint) = 1"
+ // assertEquals(3, toInt(dpn1[0][0]))
+ } finally {
+ GetDebugPoint().disableDebugPointForAllBEs("segment_iterator._read_columns_by_index")
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]