This is an automated email from the ASF dual-hosted git repository.
airborne pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new eff59f423f3 [fix](search) Use FE-provided analyzer key for multi-index
columns in search() (#60798)
eff59f423f3 is described below
commit eff59f423f32655f8fa7b31a3ad9efd07d90493d
Author: Jack <[email protected]>
AuthorDate: Wed Feb 25 18:00:30 2026 +0800
[fix](search) Use FE-provided analyzer key for multi-index columns in
search() (#60798)
### What problem does this PR solve?
Issue Number: close #DORIS-24542
Problem Summary:
When a column has multiple inverted indexes with different analyzers
(e.g., one default untokenized index and one with English parser),
`search()` in Lucene/scalar mode returns empty results.
**Root cause:** In `FieldReaderResolver::resolve()`,
`select_best_reader()` was always called with an empty analyzer key
`""`, causing it to pick the wrong (untokenized) index for tokenized
queries. Additionally, the EQUAL_QUERY → MATCH_ANY_QUERY upgrade was
restricted to variant subcolumns only.
**Fix:**
1. Extract `analyzer_key` from FE-provided `index_properties` (when the
field binding for a variant subcolumn supplies them) before calling
`select_best_reader()`, and pass it through instead of the hard-coded `""`.
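2. For regular (non-variant) columns, have `build_leaf_query()` override
TERM/WILDCARD/PREFIX/REGEXP clauses to MATCH_ANY_QUERY (the
`is_variant_sub` check in `resolve()` is kept), so regular columns with
multiple indexes also get the correct FULLTEXT reader. EXACT stays
EQUAL_QUERY so it still prefers the untokenized reader.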
---
be/src/vec/functions/function_search.cpp | 34 +++-
.../search/test_search_multi_analyzer_lucene.out | 55 ++++++
.../test_search_multi_analyzer_lucene.groovy | 218 +++++++++++++++++++++
3 files changed, 302 insertions(+), 5 deletions(-)
diff --git a/be/src/vec/functions/function_search.cpp
b/be/src/vec/functions/function_search.cpp
index 79c697ef558..9bf1e1e8e54 100644
--- a/be/src/vec/functions/function_search.cpp
+++ b/be/src/vec/functions/function_search.cpp
@@ -154,14 +154,23 @@ Status FieldReaderResolver::resolve(const std::string&
field_name,
// For variant subcolumns, FE resolves the field pattern to a specific
index and sends
// its index_properties via TSearchFieldBinding. When FE picks an
analyzer-based index,
- // upgrade certain query types to MATCH_ANY_QUERY so select_best_reader
picks the FULLTEXT
- // reader instead of STRING_TYPE. Without this upgrade:
+ // upgrade EQUAL_QUERY/WILDCARD_QUERY to MATCH_ANY_QUERY so
select_best_reader picks the
+ // FULLTEXT reader instead of STRING_TYPE. Without this upgrade:
// - TERM (EQUAL_QUERY) clauses would open the wrong (untokenized) index
directory
// - WILDCARD clauses would enumerate terms from the wrong index,
returning empty results
+ //
+ // For regular (non-variant) columns with multiple indexes, the caller
(build_leaf_query)
+ // is responsible for passing the appropriate query_type: MATCH_ANY_QUERY
for tokenized
+ // queries (TERM) and EQUAL_QUERY for exact-match queries (EXACT). This
ensures
+ // select_best_reader picks FULLTEXT vs STRING_TYPE correctly without
needing an explicit
+ // analyzer key, since the query_type alone drives the reader type
preference.
InvertedIndexQueryType effective_query_type = query_type;
auto fb_it = _field_binding_map.find(field_name);
+ std::string analyzer_key;
if (is_variant_sub && fb_it != _field_binding_map.end() &&
fb_it->second->__isset.index_properties &&
!fb_it->second->index_properties.empty()) {
+ analyzer_key = normalize_analyzer_key(
+
build_analyzer_key_from_properties(fb_it->second->index_properties));
if (inverted_index::InvertedIndexAnalyzer::should_analyzer(
fb_it->second->index_properties) &&
(effective_query_type == InvertedIndexQueryType::EQUAL_QUERY ||
@@ -173,10 +182,10 @@ Status FieldReaderResolver::resolve(const std::string&
field_name,
Result<InvertedIndexReaderPtr> reader_result;
const auto& column_type = data_it->second.second;
if (column_type) {
- reader_result =
- inverted_iterator->select_best_reader(column_type,
effective_query_type, "");
+ reader_result = inverted_iterator->select_best_reader(column_type,
effective_query_type,
+ analyzer_key);
} else {
- reader_result = inverted_iterator->select_best_reader("");
+ reader_result = inverted_iterator->select_best_reader(analyzer_key);
}
if (!reader_result.has_value()) {
@@ -696,6 +705,21 @@ Status FunctionSearch::build_leaf_query(const
TSearchClause& clause,
const std::string& clause_type = clause.clause_type;
auto query_type = clause_type_to_query_type(clause_type);
+ // TERM, WILDCARD, PREFIX, and REGEXP in search DSL operate on individual
index terms
+ // (like Lucene TermQuery, WildcardQuery, PrefixQuery, RegexpQuery).
+ // Override to MATCH_ANY_QUERY so select_best_reader() prefers the
FULLTEXT reader
+ // when multiple indexes exist on the same column (one tokenized, one
untokenized).
+ // Without this, these queries would select the untokenized index and try
to match
+ // patterns like "h*llo" against full strings ("hello world") instead of
individual
+ // tokens ("hello"), returning empty results.
+ // EXACT must remain EQUAL_QUERY to prefer the untokenized STRING_TYPE
reader.
+ //
+ // Safe for single-index columns: select_best_reader() has a single-reader
fast path
+ // that returns the only reader directly, bypassing the query_type
preference logic.
+ if (clause_type == "TERM" || clause_type == "WILDCARD" || clause_type ==
"PREFIX" ||
+ clause_type == "REGEXP") {
+ query_type = InvertedIndexQueryType::MATCH_ANY_QUERY;
+ }
FieldReaderBinding binding;
RETURN_IF_ERROR(resolver.resolve(field_name, query_type, &binding));
diff --git a/regression-test/data/search/test_search_multi_analyzer_lucene.out
b/regression-test/data/search/test_search_multi_analyzer_lucene.out
new file mode 100644
index 00000000000..6dcff738147
--- /dev/null
+++ b/regression-test/data/search/test_search_multi_analyzer_lucene.out
@@ -0,0 +1,55 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !lucene_term --
+1 hello world
+2 hello doris
+
+-- !lucene_and --
+1 hello world
+
+-- !lucene_or --
+1 hello world
+2 hello doris
+3 world peace
+
+-- !lucene_wildcard --
+1 hello world
+2 hello doris
+
+-- !standard_term --
+1 hello world
+2 hello doris
+
+-- !standard_phrase --
+1 hello world
+
+-- !lucene_phrase --
+1 hello world
+
+-- !any_multi_index --
+1 hello world
+2 hello doris
+3 world peace
+
+-- !all_multi_index --
+1 hello world
+
+-- !untok_wildcard_no_match --
+
+-- !untok_wildcard_prefix --
+1 hello world
+2 hello doris
+
+-- !untok_wildcard_suffix --
+1 hello world
+
+-- !untok_prefix --
+1 hello world
+2 hello doris
+
+-- !untok_regexp --
+1 hello world
+2 hello doris
+
+-- !untok_exact --
+1 hello world
+
diff --git
a/regression-test/suites/search/test_search_multi_analyzer_lucene.groovy
b/regression-test/suites/search/test_search_multi_analyzer_lucene.groovy
new file mode 100644
index 00000000000..2d7657728bb
--- /dev/null
+++ b/regression-test/suites/search/test_search_multi_analyzer_lucene.groovy
@@ -0,0 +1,218 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/**
+ * Regression test for DORIS-24542:
+ * search() in Lucene/scalar mode returns empty results when a column has
+ * multiple inverted indexes with different analyzers (e.g., one default
+ * untokenized index and one with an English parser).
+ *
+ * Root cause: BE FieldReaderResolver::resolve() did not use FE-provided
+ * index_properties to select the correct reader, causing it to pick the
+ * wrong (untokenized) index for tokenized queries.
+ */
+suite("test_search_multi_analyzer_lucene") {
+ def tableName = "search_multi_analyzer_lucene_test"
+
+ // Pin enable_common_expr_pushdown to prevent CI flakiness from fuzzy
testing.
+ sql """ set enable_common_expr_pushdown = true """
+
+ sql "DROP TABLE IF EXISTS ${tableName}"
+
+ // Reproduce DORIS-24542: two inverted indexes on same column with
different analyzers
+ sql """
+ CREATE TABLE ${tableName} (
+ id INT,
+ title VARCHAR(255) NOT NULL,
+ INDEX idx_title0 (title) USING INVERTED,
+ INDEX idx_title3 (title) USING INVERTED PROPERTIES("lower_case" =
"true", "parser" = "english", "support_phrase" = "true")
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1",
+ "inverted_index_storage_format" = "V2"
+ )
+ """
+
+ sql """INSERT INTO ${tableName} VALUES
+ (1, 'hello world'),
+ (2, 'hello doris'),
+ (3, 'world peace'),
+ (4, 'foo bar baz')
+ """
+
+ Thread.sleep(3000)
+
+ // Test 1: Lucene mode TERM query - should use tokenized index
+ qt_lucene_term """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${tableName}
+ WHERE search('hello', '{"default_field":"title",
"default_operator":"AND", "mode":"lucene", "minimum_should_match": 0}')
+ ORDER BY id
+ """
+
+ // Test 2: Lucene mode AND query
+ qt_lucene_and """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${tableName}
+ WHERE search('hello AND world', '{"default_field":"title",
"default_operator":"AND", "mode":"lucene"}')
+ ORDER BY id
+ """
+
+ // Test 3: Lucene mode OR query
+ qt_lucene_or """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${tableName}
+ WHERE search('hello OR peace', '{"default_field":"title",
"default_operator":"OR", "mode":"lucene"}')
+ ORDER BY id
+ """
+
+ // Test 4: Lucene mode wildcard query
+ qt_lucene_wildcard """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${tableName}
+ WHERE search('h*llo', '{"default_field":"title",
"default_operator":"AND", "mode":"lucene", "minimum_should_match": 0}')
+ ORDER BY id
+ """
+
+ // Test 5: Standard mode TERM query (non-lucene) with multi-index
+ qt_standard_term """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${tableName}
+ WHERE search('title:hello')
+ ORDER BY id
+ """
+
+ // Test 6: Standard mode phrase query with multi-index
+ qt_standard_phrase """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${tableName}
+ WHERE search('title:"hello world"')
+ ORDER BY id
+ """
+
+ // Test 7: Lucene mode phrase query with multi-index
+ qt_lucene_phrase """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${tableName}
+ WHERE search('"hello world"', '{"default_field":"title",
"default_operator":"AND", "mode":"lucene"}')
+ ORDER BY id
+ """
+
+ // Test 8: ANY clause with multi-index
+ qt_any_multi_index """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${tableName}
+ WHERE search('title:ANY(hello peace)')
+ ORDER BY id
+ """
+
+ // Test 9: ALL clause with multi-index
+ qt_all_multi_index """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${tableName}
+ WHERE search('title:ALL(hello world)')
+ ORDER BY id
+ """
+
+ // Cleanup
+ sql "DROP TABLE IF EXISTS ${tableName}"
+
+ //
=========================================================================
+ // Untokenized-only index: verify WILDCARD/PREFIX/REGEXP still work
correctly
+ // when only a STRING_TYPE (no parser) index exists on the column.
+ // select_best_reader() single-reader fast path must return the only reader
+ // regardless of the MATCH_ANY_QUERY override.
+ //
=========================================================================
+ def untokTable = "search_untokenized_only_test"
+ sql "DROP TABLE IF EXISTS ${untokTable}"
+
+ sql """
+ CREATE TABLE ${untokTable} (
+ id INT,
+ title VARCHAR(255) NOT NULL,
+ INDEX idx_title (title) USING INVERTED
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1",
+ "inverted_index_storage_format" = "V2"
+ )
+ """
+
+ sql """INSERT INTO ${untokTable} VALUES
+ (1, 'hello world'),
+ (2, 'hello doris'),
+ (3, 'world peace'),
+ (4, 'foo bar baz')
+ """
+
+ Thread.sleep(3000)
+
+ // Test 10: Untokenized wildcard h*llo - no match because full string
"hello world" != h*llo
+ qt_untok_wildcard_no_match """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${untokTable}
+ WHERE search('h*llo', '{"default_field":"title", "mode":"lucene"}')
+ ORDER BY id
+ """
+
+ // Test 11: Untokenized wildcard hello* - matches full strings starting
with "hello"
+ qt_untok_wildcard_prefix """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${untokTable}
+ WHERE search('hello*', '{"default_field":"title", "mode":"lucene"}')
+ ORDER BY id
+ """
+
+ // Test 12: Untokenized wildcard *world - matches full strings ending with
"world"
+ qt_untok_wildcard_suffix """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${untokTable}
+ WHERE search('*world', '{"default_field":"title", "mode":"lucene"}')
+ ORDER BY id
+ """
+
+ // Test 13: Untokenized PREFIX hel* - matches full strings starting with
"hel"
+ qt_untok_prefix """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${untokTable}
+ WHERE search('title:hel*', '{"mode":"lucene"}')
+ ORDER BY id
+ """
+
+ // Test 14: Untokenized REGEXP hel.* - matches full strings matching regex
+ qt_untok_regexp """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${untokTable}
+ WHERE search('title:/hel.*/', '{"mode":"lucene"}')
+ ORDER BY id
+ """
+
+ // Test 15: Untokenized exact phrase match
+ qt_untok_exact """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${untokTable}
+ WHERE search('title:"hello world"', '{"mode":"lucene"}')
+ ORDER BY id
+ """
+
+ // Cleanup
+ sql "DROP TABLE IF EXISTS ${untokTable}"
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]