This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 2b0b87be27e [fix](inverted index) Fix empty string MATCH on keyword 
index returning wrong results (#60500)
2b0b87be27e is described below

commit 2b0b87be27e9cdbedecae23988fe9130b336d4f0
Author: Jack <[email protected]>
AuthorDate: Thu Feb 5 10:43:18 2026 +0800

    [fix](inverted index) Fix empty string MATCH on keyword index returning 
wrong results (#60500)
    
    ## Proposed changes
    
    Fix empty string MATCH on keyword index returning wrong results.
    
    The multi-analyzer feature commit (2c950e140a5) incorrectly added an
    empty string check that prevented `MATCH ''` from finding rows with
    empty string values in keyword indexes.
    
    For a keyword index (no tokenization), the empty string is a valid
    exact-match value and should be matchable. The previous code skipped
    empty strings with the comment "empty query should match nothing", which
    is wrong for keyword indexes.
    
    ## Problem
    
    ```sql
    -- Table with keyword index (no parser)
    CREATE TABLE test (id INT, col TEXT, INDEX idx(col) USING INVERTED);
    INSERT INTO test VALUES (1, ''), (2, 'data');
    
    -- Before fix: returns 0 (WRONG!)
    -- After fix: returns 1 (CORRECT!)
    SELECT count() FROM test WHERE col MATCH '';
    ```
    
    ## Changes
    
    This fix removes the empty-string check from the keyword index paths in:
    - `be/src/vec/functions/match.cpp` (slow path, including the fallback
      taken when the analyzer is nullptr)
    - `be/src/olap/rowset/segment_v2/inverted_index_reader.cpp` (index path)
    - `be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp`
      (`get_analyse_result`, non-analyzed branch)
    
    Added regression test `test_empty_string_match.groovy` to cover:
    - Empty string match on keyword index (both index and slow paths)
    - Empty string match on tokenized index (should return 0)
    - match_any and match_all with empty string
    
    ## Check List (For Author)
    
    - Test
        - [x] Regression test
        - [x] Unit Test
        - [ ] Manual test
        - [ ] No need to test
    
    - Behavior changed:
        - [x] Yes. `MATCH ''` on a keyword index now correctly matches rows
          with empty string values.
    
    - Does this need documentation?
        - [ ] No.
---
 .../inverted_index/analyzer/analyzer.cpp           |  6 +-
 .../rowset/segment_v2/inverted_index_reader.cpp    |  7 +-
 be/src/vec/functions/match.cpp                     | 13 ++--
 .../inverted_index_p0/test_empty_string_match.out  | 26 ++++++++
 .../test_empty_string_match.groovy                 | 78 ++++++++++++++++++++++
 5 files changed, 115 insertions(+), 15 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp
index ec9d81f1503..cad3837d081 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp
@@ -187,10 +187,10 @@ std::vector<TermInfo> 
InvertedIndexAnalyzer::get_analyse_result(
 std::vector<TermInfo> InvertedIndexAnalyzer::get_analyse_result(
         const std::string& search_str, const std::map<std::string, 
std::string>& properties) {
     if (!should_analyzer(properties)) {
+        // Keyword index: all strings (including empty) are valid tokens for 
exact match.
+        // Empty string is a valid value in keyword index and should be 
matchable.
         std::vector<TermInfo> result;
-        if (!search_str.empty()) {
-            result.emplace_back(search_str);
-        }
+        result.emplace_back(search_str);
         return result;
     }
     InvertedIndexAnalyzerConfig config;
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
index f34a546a105..fecf5b64462 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
@@ -315,10 +315,9 @@ Status FullTextIndexReader::query(const 
IndexQueryContextPtr& context,
         } else {
             SCOPED_RAW_TIMER(&context->stats->inverted_index_analyzer_timer);
             if (analyzer_ctx != nullptr && !analyzer_ctx->should_tokenize()) {
-                // Don't add empty string as token - empty query should match 
nothing
-                if (!search_str.empty()) {
-                    query_info.term_infos.emplace_back(search_str);
-                }
+                // Keyword index: all strings (including empty) are valid 
tokens for exact match.
+                // Empty string is a valid value in keyword index and should 
be matchable.
+                query_info.term_infos.emplace_back(search_str);
             } else if (analyzer_ctx != nullptr && analyzer_ctx->analyzer != 
nullptr) {
                 // Use analyzer from query context for consistent behavior 
across all segments.
                 // This ensures that the query uses the same analyzer settings 
(e.g., lowercase)
diff --git a/be/src/vec/functions/match.cpp b/be/src/vec/functions/match.cpp
index c3a2ec9fce9..502a636b8c8 100644
--- a/be/src/vec/functions/match.cpp
+++ b/be/src/vec/functions/match.cpp
@@ -205,20 +205,17 @@ std::vector<TermInfo> 
FunctionMatchBase::analyse_query_str_token(
     // - PARSER_NONE: no tokenization (keyword/exact match)
     // - Other parsers: tokenize using the analyzer
     if (!analyzer_ctx->should_tokenize()) {
-        // Keyword index or no tokenization needed
-        // Don't add empty string as token - empty query should match nothing
-        if (!match_query_str.empty()) {
-            query_tokens.emplace_back(match_query_str);
-        }
+        // Keyword index: all strings (including empty) are valid tokens for 
exact match.
+        // Empty string is a valid value in keyword index and should be 
matchable.
+        query_tokens.emplace_back(match_query_str);
         return query_tokens;
     }
 
     // Safety check: if analyzer is nullptr but tokenization is expected, fall 
back to no tokenization
     if (analyzer_ctx->analyzer == nullptr) {
         VLOG_DEBUG << "Analyzer is nullptr, falling back to no tokenization";
-        if (!match_query_str.empty()) {
-            query_tokens.emplace_back(match_query_str);
-        }
+        // For fallback case, also allow empty strings to be matched
+        query_tokens.emplace_back(match_query_str);
         return query_tokens;
     }
 
diff --git a/regression-test/data/inverted_index_p0/test_empty_string_match.out 
b/regression-test/data/inverted_index_p0/test_empty_string_match.out
new file mode 100644
index 00000000000..c05432b680b
--- /dev/null
+++ b/regression-test/data/inverted_index_p0/test_empty_string_match.out
@@ -0,0 +1,26 @@
+-- This file is automatically generated. You should know what you did if you 
want to edit this
+-- !keyword_index_path --
+1
+3
+
+-- !keyword_slow_path --
+1
+3
+
+-- !english_index_path --
+0
+
+-- !english_slow_path --
+0
+
+-- !keyword_nonempty --
+2
+
+-- !match_any_empty --
+1
+3
+
+-- !match_all_empty --
+1
+3
+
diff --git 
a/regression-test/suites/inverted_index_p0/test_empty_string_match.groovy 
b/regression-test/suites/inverted_index_p0/test_empty_string_match.groovy
new file mode 100644
index 00000000000..798e0100f1b
--- /dev/null
+++ b/regression-test/suites/inverted_index_p0/test_empty_string_match.groovy
@@ -0,0 +1,78 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_empty_string_match", "p0") {
+    def tableName = "test_empty_string_match"
+
+    sql "DROP TABLE IF EXISTS ${tableName}"
+    sql """
+        CREATE TABLE ${tableName} (
+            id INT,
+            keyword_col TEXT DEFAULT '',
+            english_col TEXT DEFAULT '',
+            INDEX keyword_idx(keyword_col) USING INVERTED COMMENT 'keyword 
index',
+            INDEX english_idx(english_col) USING INVERTED PROPERTIES("parser" 
= "english") COMMENT 'english parser'
+        ) ENGINE=OLAP
+        DUPLICATE KEY(id)
+        DISTRIBUTED BY HASH(id) BUCKETS 1
+        PROPERTIES("replication_allocation" = "tag.location.default: 1");
+    """
+
+    sql """
+        INSERT INTO ${tableName} VALUES
+        (1, '', 'hello world'),
+        (2, 'test', ''),
+        (3, '', ''),
+        (4, 'data', 'some text');
+    """
+
+    sql "SET enable_common_expr_pushdown = true"
+
+    // Test 1: Empty string match on keyword index (index path)
+    // Should match rows where keyword_col is empty string (rows 1 and 3)
+    sql "SET enable_inverted_index_query = true"
+    qt_keyword_index_path """SELECT id FROM ${tableName} WHERE keyword_col 
match '' ORDER BY id"""
+
+    // Test 2: Empty string match on keyword index (slow path)
+    // Should also match rows where keyword_col is empty string
+    sql "SET enable_inverted_index_query = false"
+    sql "SET enable_match_without_inverted_index = true"
+    qt_keyword_slow_path """SELECT id FROM ${tableName} WHERE keyword_col 
match '' ORDER BY id"""
+
+    // Test 3: Empty string match on tokenized index (index path)
+    // Should return no rows because empty string tokenizes to nothing
+    sql "SET enable_inverted_index_query = true"
+    qt_english_index_path """SELECT count() FROM ${tableName} WHERE 
english_col match ''"""
+
+    // Test 4: Empty string match on tokenized index (slow path)
+    // Should also return no rows
+    sql "SET enable_inverted_index_query = false"
+    qt_english_slow_path """SELECT count() FROM ${tableName} WHERE english_col 
match ''"""
+
+    // Test 5: Non-empty string match on keyword index should work as before
+    sql "SET enable_inverted_index_query = true"
+    qt_keyword_nonempty """SELECT id FROM ${tableName} WHERE keyword_col match 
'test' ORDER BY id"""
+
+    // Test 6: Verify match_any with empty string on keyword index
+    sql "SET enable_inverted_index_query = false"
+    qt_match_any_empty """SELECT id FROM ${tableName} WHERE keyword_col 
match_any '' ORDER BY id"""
+
+    // Test 7: Verify match_all with empty string on keyword index
+    qt_match_all_empty """SELECT id FROM ${tableName} WHERE keyword_col 
match_all '' ORDER BY id"""
+
+    sql "DROP TABLE IF EXISTS ${tableName}"
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to