This is an automated email from the ASF dual-hosted git repository.

airborne pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 7ee1bc437f0 [fix](search) Fix wildcard query on variant subcolumns 
returning empty results (#60793)
7ee1bc437f0 is described below

commit 7ee1bc437f069efe2d9553b51e9d8615a63c36f4
Author: Jack <[email protected]>
AuthorDate: Tue Feb 24 21:52:03 2026 +0800

    [fix](search) Fix wildcard query on variant subcolumns returning empty results (#60793)
    
    ### What problem does this PR solve?
    
    Related PR: #60782
    
    Problem Summary:
    When `search()` DSL uses wildcard patterns (e.g. `*ith`, `sm*th`,
    `sm?th`) on variant subcolumns with analyzer-based indexes
    (field_pattern), the queries return empty results even though regular
    TERM search works correctly.
    
    **Root cause:** In `FieldReaderResolver::resolve()`, only `EQUAL_QUERY`
    was upgraded to `MATCH_ANY_QUERY` for variant subcolumns with
    analyzer-based indexes. `WILDCARD_QUERY` was not upgraded, so
    `select_best_reader()` picked the `STRING_TYPE` reader instead of
    `FULLTEXT`. `WildcardWeight` then enumerated terms from the wrong
    (untokenized) index directory, finding no matches.
    
    **Fix:** Extend the query type upgrade condition to also cover
    `WILDCARD_QUERY`, so wildcard patterns correctly use the FULLTEXT index
    on variant subcolumns. Also fix a misleading comment in
    `inverted_index_iterator.cpp`: the `is_equal_query()` branch was described
    as covering EQUAL/WILDCARD/REGEXP queries, but it only checks `EQUAL_QUERY`.
---
 .../rowset/segment_v2/inverted_index_iterator.cpp  |   2 +-
 be/src/vec/functions/function_search.cpp           |  10 +-
 .../data/search/test_search_variant_wildcard.out   |  42 ++++++
 .../search/test_search_variant_wildcard.groovy     | 166 +++++++++++++++++++++
 4 files changed, 215 insertions(+), 5 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/inverted_index_iterator.cpp b/be/src/olap/rowset/segment_v2/inverted_index_iterator.cpp
index 4df1560183a..fa0a7488015 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_iterator.cpp
@@ -174,7 +174,7 @@ Result<InvertedIndexReaderPtr> InvertedIndexIterator::select_for_text(
         }
     }
 
-    // EQUAL/WILDCARD/REGEXP queries prefer STRING_TYPE
+    // EQUAL queries prefer STRING_TYPE for exact match
     if (is_equal_query(query_type)) {
         for (const auto* entry : match.candidates) {
             if (entry->type == InvertedIndexReaderType::STRING_TYPE) {
diff --git a/be/src/vec/functions/function_search.cpp b/be/src/vec/functions/function_search.cpp
index 1aa12b658f3..79c697ef558 100644
--- a/be/src/vec/functions/function_search.cpp
+++ b/be/src/vec/functions/function_search.cpp
@@ -154,16 +154,18 @@ Status FieldReaderResolver::resolve(const std::string& field_name,
 
     // For variant subcolumns, FE resolves the field pattern to a specific index and sends
     // its index_properties via TSearchFieldBinding. When FE picks an analyzer-based index,
-    // upgrade EQUAL_QUERY to MATCH_ANY_QUERY so select_best_reader picks the FULLTEXT reader
-    // instead of STRING_TYPE. Without this, TERM clauses from lucene-mode DSL would open the
-    // wrong (untokenized) index directory and tokenized search terms would never match.
+    // upgrade certain query types to MATCH_ANY_QUERY so select_best_reader picks the FULLTEXT
+    // reader instead of STRING_TYPE. Without this upgrade:
+    // - TERM (EQUAL_QUERY) clauses would open the wrong (untokenized) index directory
+    // - WILDCARD clauses would enumerate terms from the wrong index, returning empty results
     InvertedIndexQueryType effective_query_type = query_type;
     auto fb_it = _field_binding_map.find(field_name);
     if (is_variant_sub && fb_it != _field_binding_map.end() &&
         fb_it->second->__isset.index_properties && !fb_it->second->index_properties.empty()) {
         if (inverted_index::InvertedIndexAnalyzer::should_analyzer(
                     fb_it->second->index_properties) &&
-            effective_query_type == InvertedIndexQueryType::EQUAL_QUERY) {
+            (effective_query_type == InvertedIndexQueryType::EQUAL_QUERY ||
+             effective_query_type == InvertedIndexQueryType::WILDCARD_QUERY)) {
             effective_query_type = InvertedIndexQueryType::MATCH_ANY_QUERY;
         }
     }
diff --git a/regression-test/data/search/test_search_variant_wildcard.out b/regression-test/data/search/test_search_variant_wildcard.out
new file mode 100644
index 00000000000..d2f86c869ca
--- /dev/null
+++ b/regression-test/data/search/test_search_variant_wildcard.out
@@ -0,0 +1,42 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !term_smith --
+73095521135
+
+-- !term_smithson --
+73095446198
+
+-- !term_johnson --
+73095754047
+
+-- !wildcard_star_ith --
+73095521135
+
+-- !wildcard_sm_star_th --
+73095521135
+
+-- !wildcard_sm_q_th --
+73095521135
+
+-- !wildcard_smith_star --
+73095446198
+73095521135
+
+-- !wildcard_sm_star --
+73095446198
+73095521135
+
+-- !wildcard_star_son --
+73095446198
+73095754047
+
+-- !wildcard_firstname --
+73095521135
+
+-- !wildcard_and_term --
+73095521135
+
+-- !wildcard_star_all --
+73095446198
+73095521135
+73095754047
+
diff --git a/regression-test/suites/search/test_search_variant_wildcard.groovy b/regression-test/suites/search/test_search_variant_wildcard.groovy
new file mode 100644
index 00000000000..37fb79d35da
--- /dev/null
+++ b/regression-test/suites/search/test_search_variant_wildcard.groovy
@@ -0,0 +1,166 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/**
+ * Test wildcard search on variant subcolumns in Lucene mode.
+ *
+ * Bug: Wildcard queries (*, ?) on variant subcolumns return empty results
+ * even when the data exists and regular TERM search works correctly.
+ *
+ * Root cause: In FieldReaderResolver::resolve(), only EQUAL_QUERY is upgraded
+ * to MATCH_ANY_QUERY for variant subcolumns with analyzers. WILDCARD_QUERY is
+ * not upgraded, so it may select the wrong index reader (STRING_TYPE instead of
+ * FULLTEXT), causing term enumeration to fail.
+ *
+ * Scenario: Contacts with firstname/lastname stored in variant subcolumns.
+ * - TERM search for 'smith' correctly returns John Smith
+ * - WILDCARD searches '*ith', 'sm*th', 'sm?th' should also match but returned empty
+ */
+suite("test_search_variant_wildcard", "p0") {
+    def tableName = "test_search_variant_wildcard"
+
+    sql """ set enable_match_without_inverted_index = false """
+    sql """ set enable_common_expr_pushdown = true """
+    sql """ set default_variant_enable_typed_paths_to_sparse = false """
+    sql """ set default_variant_enable_doc_mode = false """
+
+    sql "DROP TABLE IF EXISTS ${tableName}"
+
+    // Create table with variant column using field_pattern index
+    sql """
+        CREATE TABLE ${tableName} (
+            `id` BIGINT NOT NULL,
+            `props` variant<
+                MATCH_NAME_GLOB 'string_*' : string,
+                properties("variant_max_subcolumns_count" = "100")
+            > NULL,
+            INDEX idx_props (props) USING INVERTED PROPERTIES(
+                "parser" = "unicode",
+                "field_pattern" = "string_*",
+                "lower_case" = "true"
+            )
+        ) ENGINE=OLAP
+        DUPLICATE KEY(`id`)
+        DISTRIBUTED BY HASH(`id`) BUCKETS 1
+        PROPERTIES (
+            "replication_allocation" = "tag.location.default: 1",
+            "disable_auto_compaction" = "true"
+        )
+    """
+
+    // Insert test data matching the reported scenario
+    // string_8 = firstname, string_17 = lastname
+    sql """INSERT INTO ${tableName} VALUES
+        (73095521135, '{"string_8": "John", "string_17": "Smith"}'),
+        (73095446198, '{"string_8": "Jane", "string_17": "Smithson"}'),
+        (73095754047, '{"string_8": "Michael David", "string_17": "Johnson"}')
+    """
+
+    sql "sync"
+    Thread.sleep(5000)
+
+    // ============ Baseline: TERM search works ============
+
+    // Test 1: TERM search for 'smith' on lastname - should return John Smith
+    qt_term_smith """
+        SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName}
+        WHERE search('smith', '{"default_field":"props.string_17","default_operator":"and","mode":"lucene"}')
+        ORDER BY id
+    """
+
+    // Test 2: TERM search for 'smithson' on lastname - should return Jane Smithson
+    qt_term_smithson """
+        SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName}
+        WHERE search('smithson', '{"default_field":"props.string_17","default_operator":"and","mode":"lucene"}')
+        ORDER BY id
+    """
+
+    // Test 3: TERM search for 'johnson' on lastname - should return Michael David Johnson
+    qt_term_johnson """
+        SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName}
+        WHERE search('johnson', '{"default_field":"props.string_17","default_operator":"and","mode":"lucene"}')
+        ORDER BY id
+    """
+
+    // ============ Bug: Wildcard searches return empty ============
+
+    // Test 4: Leading wildcard '*ith' - should match "Smith" (ends with "ith")
+    qt_wildcard_star_ith """
+        SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName}
+        WHERE search('*ith', '{"default_field":"props.string_17","default_operator":"and","mode":"lucene"}')
+        ORDER BY id
+    """
+
+    // Test 5: Middle wildcard 'sm*th' - should match "Smith"
+    qt_wildcard_sm_star_th """
+        SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName}
+        WHERE search('sm*th', '{"default_field":"props.string_17","default_operator":"and","mode":"lucene"}')
+        ORDER BY id
+    """
+
+    // Test 6: Single char wildcard 'sm?th' - should match "Smith"
+    qt_wildcard_sm_q_th """
+        SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName}
+        WHERE search('sm?th', '{"default_field":"props.string_17","default_operator":"and","mode":"lucene"}')
+        ORDER BY id
+    """
+
+    // Test 7: Trailing wildcard 'smith*' - should match "Smith" and "Smithson"
+    qt_wildcard_smith_star """
+        SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName}
+        WHERE search('smith*', '{"default_field":"props.string_17","default_operator":"and","mode":"lucene"}')
+        ORDER BY id
+    """
+
+    // Test 8: Wildcard 'sm*' - should match "Smith" and "Smithson"
+    qt_wildcard_sm_star """
+        SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName}
+        WHERE search('sm*', '{"default_field":"props.string_17","default_operator":"and","mode":"lucene"}')
+        ORDER BY id
+    """
+
+    // Test 9: Wildcard '*son' - should match "Smithson" and "Johnson"
+    qt_wildcard_star_son """
+        SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName}
+        WHERE search('*son', '{"default_field":"props.string_17","default_operator":"and","mode":"lucene"}')
+        ORDER BY id
+    """
+
+    // Test 10: Wildcard on firstname field 'jo?n' - should match "John"
+    qt_wildcard_firstname """
+        SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName}
+        WHERE search('jo?n', '{"default_field":"props.string_8","default_operator":"and","mode":"lucene"}')
+        ORDER BY id
+    """
+
+    // Test 11: Wildcard combined with AND - 'sm*th AND props.string_8:john'
+    qt_wildcard_and_term """
+        SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName}
+        WHERE search('props.string_17:sm*th AND props.string_8:john', '{"default_operator":"and","mode":"lucene"}')
+        ORDER BY id
+    """
+
+    // Test 12: Standalone wildcard '*' matches all non-null values
+    qt_wildcard_star_all """
+        SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName}
+        WHERE search('*', '{"default_field":"props.string_17","default_operator":"and","mode":"lucene"}')
+        ORDER BY id
+    """
+
+    // Clean up
+    sql "DROP TABLE IF EXISTS ${tableName}"
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to