This is an automated email from the ASF dual-hosted git repository.
airborne pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 7ee1bc437f0 [fix](search) Fix wildcard query on variant subcolumns
returning empty results (#60793)
7ee1bc437f0 is described below
commit 7ee1bc437f069efe2d9553b51e9d8615a63c36f4
Author: Jack <[email protected]>
AuthorDate: Tue Feb 24 21:52:03 2026 +0800
[fix](search) Fix wildcard query on variant subcolumns returning empty
results (#60793)
### What problem does this PR solve?
Related PR: #60782
Problem Summary:
When `search()` DSL uses wildcard patterns (e.g. `*ith`, `sm*th`,
`sm?th`) on variant subcolumns with analyzer-based indexes
(field_pattern), the queries return empty results even though regular
TERM search works correctly.
**Root cause:** In `FieldReaderResolver::resolve()`, only `EQUAL_QUERY`
was upgraded to `MATCH_ANY_QUERY` for variant subcolumns with
analyzer-based indexes. `WILDCARD_QUERY` was not upgraded, so
`select_best_reader()` picked the `STRING_TYPE` reader instead of
`FULLTEXT`. `WildcardWeight` then enumerated terms from the wrong
(untokenized) index directory, finding no matches.
**Fix:** Extend the query type upgrade condition to also cover
`WILDCARD_QUERY`, so wildcard patterns correctly use the FULLTEXT index
on variant subcolumns. Also fix a misleading comment in
`inverted_index_iterator.cpp` where `is_equal_query()` was described as
handling WILDCARD/REGEXP but actually only checks `EQUAL_QUERY`.
---
.../rowset/segment_v2/inverted_index_iterator.cpp | 2 +-
be/src/vec/functions/function_search.cpp | 10 +-
.../data/search/test_search_variant_wildcard.out | 42 ++++++
.../search/test_search_variant_wildcard.groovy | 166 +++++++++++++++++++++
4 files changed, 215 insertions(+), 5 deletions(-)
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_iterator.cpp
b/be/src/olap/rowset/segment_v2/inverted_index_iterator.cpp
index 4df1560183a..fa0a7488015 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_iterator.cpp
@@ -174,7 +174,7 @@ Result<InvertedIndexReaderPtr>
InvertedIndexIterator::select_for_text(
}
}
- // EQUAL/WILDCARD/REGEXP queries prefer STRING_TYPE
+ // EQUAL queries prefer STRING_TYPE for exact match
if (is_equal_query(query_type)) {
for (const auto* entry : match.candidates) {
if (entry->type == InvertedIndexReaderType::STRING_TYPE) {
diff --git a/be/src/vec/functions/function_search.cpp
b/be/src/vec/functions/function_search.cpp
index 1aa12b658f3..79c697ef558 100644
--- a/be/src/vec/functions/function_search.cpp
+++ b/be/src/vec/functions/function_search.cpp
@@ -154,16 +154,18 @@ Status FieldReaderResolver::resolve(const std::string&
field_name,
// For variant subcolumns, FE resolves the field pattern to a specific
index and sends
// its index_properties via TSearchFieldBinding. When FE picks an
analyzer-based index,
- // upgrade EQUAL_QUERY to MATCH_ANY_QUERY so select_best_reader picks the
FULLTEXT reader
- // instead of STRING_TYPE. Without this, TERM clauses from lucene-mode DSL
would open the
- // wrong (untokenized) index directory and tokenized search terms would
never match.
+ // upgrade certain query types to MATCH_ANY_QUERY so select_best_reader
picks the FULLTEXT
+ // reader instead of STRING_TYPE. Without this upgrade:
+ // - TERM (EQUAL_QUERY) clauses would open the wrong (untokenized) index
directory
+ // - WILDCARD clauses would enumerate terms from the wrong index,
returning empty results
InvertedIndexQueryType effective_query_type = query_type;
auto fb_it = _field_binding_map.find(field_name);
if (is_variant_sub && fb_it != _field_binding_map.end() &&
fb_it->second->__isset.index_properties &&
!fb_it->second->index_properties.empty()) {
if (inverted_index::InvertedIndexAnalyzer::should_analyzer(
fb_it->second->index_properties) &&
- effective_query_type == InvertedIndexQueryType::EQUAL_QUERY) {
+ (effective_query_type == InvertedIndexQueryType::EQUAL_QUERY ||
+ effective_query_type == InvertedIndexQueryType::WILDCARD_QUERY)) {
effective_query_type = InvertedIndexQueryType::MATCH_ANY_QUERY;
}
}
diff --git a/regression-test/data/search/test_search_variant_wildcard.out
b/regression-test/data/search/test_search_variant_wildcard.out
new file mode 100644
index 00000000000..d2f86c869ca
--- /dev/null
+++ b/regression-test/data/search/test_search_variant_wildcard.out
@@ -0,0 +1,42 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !term_smith --
+73095521135
+
+-- !term_smithson --
+73095446198
+
+-- !term_johnson --
+73095754047
+
+-- !wildcard_star_ith --
+73095521135
+
+-- !wildcard_sm_star_th --
+73095521135
+
+-- !wildcard_sm_q_th --
+73095521135
+
+-- !wildcard_smith_star --
+73095446198
+73095521135
+
+-- !wildcard_sm_star --
+73095446198
+73095521135
+
+-- !wildcard_star_son --
+73095446198
+73095754047
+
+-- !wildcard_firstname --
+73095521135
+
+-- !wildcard_and_term --
+73095521135
+
+-- !wildcard_star_all --
+73095446198
+73095521135
+73095754047
+
diff --git a/regression-test/suites/search/test_search_variant_wildcard.groovy
b/regression-test/suites/search/test_search_variant_wildcard.groovy
new file mode 100644
index 00000000000..37fb79d35da
--- /dev/null
+++ b/regression-test/suites/search/test_search_variant_wildcard.groovy
@@ -0,0 +1,166 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/**
+ * Test wildcard search on variant subcolumns in Lucene mode.
+ *
+ * Bug: Wildcard queries (*, ?) on variant subcolumns returned empty results
+ * even when the data existed and regular TERM search worked correctly.
+ *
+ * Root cause: In FieldReaderResolver::resolve(), only EQUAL_QUERY was upgraded
+ * to MATCH_ANY_QUERY for variant subcolumns with analyzers. WILDCARD_QUERY was
+ * not upgraded, so the wrong index reader was selected (STRING_TYPE instead of
+ * FULLTEXT), causing term enumeration to find no matches.
+ *
+ * Scenario: Contacts with firstname/lastname stored in variant subcolumns.
+ * - TERM search for 'smith' correctly returns John Smith
+ * - WILDCARD searches '*ith', 'sm*th', 'sm?th' should also match but returned
empty
+ */
+suite("test_search_variant_wildcard", "p0") {
+ def tableName = "test_search_variant_wildcard"
+
+ sql """ set enable_match_without_inverted_index = false """
+ sql """ set enable_common_expr_pushdown = true """
+ sql """ set default_variant_enable_typed_paths_to_sparse = false """
+ sql """ set default_variant_enable_doc_mode = false """
+
+ sql "DROP TABLE IF EXISTS ${tableName}"
+
+ // Create table with variant column using field_pattern index
+ sql """
+ CREATE TABLE ${tableName} (
+ `id` BIGINT NOT NULL,
+ `props` variant<
+ MATCH_NAME_GLOB 'string_*' : string,
+ properties("variant_max_subcolumns_count" = "100")
+ > NULL,
+ INDEX idx_props (props) USING INVERTED PROPERTIES(
+ "parser" = "unicode",
+ "field_pattern" = "string_*",
+ "lower_case" = "true"
+ )
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`id`)
+ DISTRIBUTED BY HASH(`id`) BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1",
+ "disable_auto_compaction" = "true"
+ )
+ """
+
+ // Insert test data matching the reported scenario
+ // string_8 = firstname, string_17 = lastname
+ sql """INSERT INTO ${tableName} VALUES
+ (73095521135, '{"string_8": "John", "string_17": "Smith"}'),
+ (73095446198, '{"string_8": "Jane", "string_17": "Smithson"}'),
+ (73095754047, '{"string_8": "Michael David", "string_17": "Johnson"}')
+ """
+
+ sql "sync"
+ Thread.sleep(5000)
+
+ // ============ Baseline: TERM search works ============
+
+ // Test 1: TERM search for 'smith' on lastname - should return John Smith
+ qt_term_smith """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM
${tableName}
+ WHERE search('smith',
'{"default_field":"props.string_17","default_operator":"and","mode":"lucene"}')
+ ORDER BY id
+ """
+
+ // Test 2: TERM search for 'smithson' on lastname - should return Jane
Smithson
+ qt_term_smithson """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM
${tableName}
+ WHERE search('smithson',
'{"default_field":"props.string_17","default_operator":"and","mode":"lucene"}')
+ ORDER BY id
+ """
+
+ // Test 3: TERM search for 'johnson' on lastname - should return Michael
David Johnson
+ qt_term_johnson """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM
${tableName}
+ WHERE search('johnson',
'{"default_field":"props.string_17","default_operator":"and","mode":"lucene"}')
+ ORDER BY id
+ """
+
+ // ============ Regression: wildcard searches must not return empty ============
+
+ // Test 4: Leading wildcard '*ith' - should match "Smith" (ends with "ith")
+ qt_wildcard_star_ith """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM
${tableName}
+ WHERE search('*ith',
'{"default_field":"props.string_17","default_operator":"and","mode":"lucene"}')
+ ORDER BY id
+ """
+
+ // Test 5: Middle wildcard 'sm*th' - should match "Smith"
+ qt_wildcard_sm_star_th """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM
${tableName}
+ WHERE search('sm*th',
'{"default_field":"props.string_17","default_operator":"and","mode":"lucene"}')
+ ORDER BY id
+ """
+
+ // Test 6: Single char wildcard 'sm?th' - should match "Smith"
+ qt_wildcard_sm_q_th """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM
${tableName}
+ WHERE search('sm?th',
'{"default_field":"props.string_17","default_operator":"and","mode":"lucene"}')
+ ORDER BY id
+ """
+
+ // Test 7: Trailing wildcard 'smith*' - should match "Smith" and "Smithson"
+ qt_wildcard_smith_star """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM
${tableName}
+ WHERE search('smith*',
'{"default_field":"props.string_17","default_operator":"and","mode":"lucene"}')
+ ORDER BY id
+ """
+
+ // Test 8: Wildcard 'sm*' - should match "Smith" and "Smithson"
+ qt_wildcard_sm_star """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM
${tableName}
+ WHERE search('sm*',
'{"default_field":"props.string_17","default_operator":"and","mode":"lucene"}')
+ ORDER BY id
+ """
+
+ // Test 9: Wildcard '*son' - should match "Smithson" and "Johnson"
+ qt_wildcard_star_son """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM
${tableName}
+ WHERE search('*son',
'{"default_field":"props.string_17","default_operator":"and","mode":"lucene"}')
+ ORDER BY id
+ """
+
+ // Test 10: Wildcard on firstname field 'jo?n' - should match "John"
+ qt_wildcard_firstname """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM
${tableName}
+ WHERE search('jo?n',
'{"default_field":"props.string_8","default_operator":"and","mode":"lucene"}')
+ ORDER BY id
+ """
+
+ // Test 11: Wildcard combined with AND - 'sm*th AND props.string_8:john'
+ qt_wildcard_and_term """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM
${tableName}
+ WHERE search('props.string_17:sm*th AND props.string_8:john',
'{"default_operator":"and","mode":"lucene"}')
+ ORDER BY id
+ """
+
+ // Test 12: Standalone wildcard '*' matches all non-null values
+ qt_wildcard_star_all """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM
${tableName}
+ WHERE search('*',
'{"default_field":"props.string_17","default_operator":"and","mode":"lucene"}')
+ ORDER BY id
+ """
+
+ // Clean up
+ sql "DROP TABLE IF EXISTS ${tableName}"
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]