This is an automated email from the ASF dual-hosted git repository.

airborne pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 574b84fa9bc [fix](search) Fix slash character in search query_string terms (#61599)
574b84fa9bc is described below

commit 574b84fa9bc8530c38912a1c21f2417a7b30fa9d
Author: Jack <[email protected]>
AuthorDate: Mon Mar 23 14:51:29 2026 +0800

    [fix](search) Fix slash character in search query_string terms (#61599)
    
    ### What problem does this PR solve?
    
    Issue Number: close #xxx
    
    Related PR: #xxx
    
    Problem Summary:
    
    The ANTLR lexer in the search() DSL parser excluded `/` from
    `TERM_CHAR`, causing terms like `AC/DC` to be incorrectly tokenized. The
    slash was silently skipped by ANTLR's default error recovery, splitting
    `AC/DC` into two separate terms `AC` and `DC` instead of treating it as
    a single term.
    
    This caused inconsistent behavior compared to Elasticsearch's
    query_string parsing, where `AC\/DC` (escaped slash) is handled as a
    single analyzed term.
    
    **Fix**: Add `/` to the `TERM_CHAR` fragment in `SearchLexer.g4`. This
    allows `/` to appear within terms (e.g., `AC/DC` -> single term) while
    regex patterns like `/[a-z]+/` still work correctly since `/` remains
    excluded from `TERM_START_CHAR`.
---
 .../org/apache/doris/analysis/SearchLexer.g4       |   1 +
 .../functions/scalar/SearchDslParserTest.java      |  75 +++++++++++++
 .../data/search/test_search_slash_in_term.out      |  32 ++++++
 .../suites/search/test_search_slash_in_term.groovy | 125 +++++++++++++++++++++
 4 files changed, 233 insertions(+)

diff --git a/fe/fe-core/src/main/antlr4/org/apache/doris/analysis/SearchLexer.g4 b/fe/fe-core/src/main/antlr4/org/apache/doris/analysis/SearchLexer.g4
index 15ee0eaeb36..7b691a61337 100644
--- a/fe/fe-core/src/main/antlr4/org/apache/doris/analysis/SearchLexer.g4
+++ b/fe/fe-core/src/main/antlr4/org/apache/doris/analysis/SearchLexer.g4
@@ -32,6 +32,7 @@ fragment TERM_CHAR
     : TERM_START_CHAR
     | '-'
     | '+'
+    | '/'
     ;
 
 fragment QUOTED_CHAR
diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParserTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParserTest.java
index 6dc16a1da7a..c078e569121 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParserTest.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParserTest.java
@@ -1094,6 +1094,81 @@ public class SearchDslParserTest {
         Assertions.assertEquals("path\\to\\file", plan.getRoot().getValue());
     }
 
+    @Test
+    public void testSlashInTerm() {
+        // DORIS-24624: slash within a term should be treated as a regular character
+        // e.g., AC/DC should parse as a single term, not trigger regex parsing
+        String dsl = "title:AC/DC";
+        QsPlan plan = SearchDslParser.parseDsl(dsl);
+
+        Assertions.assertNotNull(plan);
+        Assertions.assertEquals(QsClauseType.TERM, plan.getRoot().getType());
+        Assertions.assertEquals("title", plan.getRoot().getField());
+        Assertions.assertEquals("AC/DC", plan.getRoot().getValue());
+    }
+
+    @Test
+    public void testSlashInTermBareQuery() {
+        // DORIS-24624: slash within a bare term (using default_field)
+        String dsl = "AC/DC";
+        QsPlan plan = SearchDslParser.parseDsl(dsl, "{\"default_field\":\"title\",\"default_operator\":\"OR\"}");
+
+        Assertions.assertNotNull(plan);
+        Assertions.assertEquals(QsClauseType.TERM, plan.getRoot().getType());
+        Assertions.assertEquals("title", plan.getRoot().getField());
+        Assertions.assertEquals("AC/DC", plan.getRoot().getValue());
+    }
+
+    @Test
+    public void testSlashInTermLuceneMode() {
+        // DORIS-24624: slash within a bare term in Lucene mode
+        String dsl = "AC/DC";
+        QsPlan plan = SearchDslParser.parseDsl(dsl,
+                "{\"default_field\":\"title\",\"default_operator\":\"OR\",\"minimum_should_match\":0}");
+
+        Assertions.assertNotNull(plan);
+        Assertions.assertEquals(QsClauseType.TERM, plan.getRoot().getType());
+        Assertions.assertEquals("title", plan.getRoot().getField());
+        Assertions.assertEquals("AC/DC", plan.getRoot().getValue());
+    }
+
+    @Test
+    public void testEscapedSlashInTerm() {
+        // Escaped slash should also work and produce same result as unescaped
+        String dsl = "title:AC\\/DC";
+        QsPlan plan = SearchDslParser.parseDsl(dsl);
+
+        Assertions.assertNotNull(plan);
+        Assertions.assertEquals(QsClauseType.TERM, plan.getRoot().getType());
+        Assertions.assertEquals("title", plan.getRoot().getField());
+        // After unescape: AC\/DC -> AC/DC
+        Assertions.assertEquals("AC/DC", plan.getRoot().getValue());
+    }
+
+    @Test
+    public void testMultipleSlashesInTerm() {
+        // Multiple slashes within a term
+        String dsl = "path:foo/bar/baz";
+        QsPlan plan = SearchDslParser.parseDsl(dsl);
+
+        Assertions.assertNotNull(plan);
+        Assertions.assertEquals(QsClauseType.TERM, plan.getRoot().getType());
+        Assertions.assertEquals("path", plan.getRoot().getField());
+        Assertions.assertEquals("foo/bar/baz", plan.getRoot().getValue());
+    }
+
+    @Test
+    public void testSlashDoesNotBreakRegexp() {
+        // Regex pattern /pattern/ should still work correctly
+        String dsl = "title:/[a-z]+/";
+        QsPlan plan = SearchDslParser.parseDsl(dsl);
+
+        Assertions.assertNotNull(plan);
+        Assertions.assertEquals(QsClauseType.REGEXP, plan.getRoot().getType());
+        Assertions.assertEquals("title", plan.getRoot().getField());
+        Assertions.assertEquals("[a-z]+", plan.getRoot().getValue());
+    }
+
     @Test
     public void testUppercaseAndOperator() {
         // Test: uppercase AND should be treated as operator
diff --git a/regression-test/data/search/test_search_slash_in_term.out b/regression-test/data/search/test_search_slash_in_term.out
new file mode 100644
index 00000000000..3b0d6acd7bf
--- /dev/null
+++ b/regression-test/data/search/test_search_slash_in_term.out
@@ -0,0 +1,32 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !slash_in_term --
+1      AC/DC is a rock band
+2      AC power supply
+3      DC comics
+
+-- !escaped_slash_in_term --
+1      AC/DC is a rock band
+2      AC power supply
+3      DC comics
+
+-- !slash_bare_lucene --
+1      AC/DC is a rock band
+2      AC power supply
+3      DC comics
+
+-- !escaped_slash_bare_lucene --
+1      AC/DC is a rock band
+2      AC power supply
+3      DC comics
+
+-- !multi_slash --
+4      path/to/file
+
+-- !regex_still_works --
+1      AC/DC is a rock band
+
+-- !slash_standard_mode --
+1      AC/DC is a rock band
+2      AC power supply
+3      DC comics
+
diff --git a/regression-test/suites/search/test_search_slash_in_term.groovy b/regression-test/suites/search/test_search_slash_in_term.groovy
new file mode 100644
index 00000000000..0749929f252
--- /dev/null
+++ b/regression-test/suites/search/test_search_slash_in_term.groovy
@@ -0,0 +1,125 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/**
+ * DORIS-24624: Tests for slash (/) character handling in search() function.
+ *
+ * The slash character is used as a regex delimiter in Lucene query_string syntax
+ * (e.g., /pattern/). However, when it appears in the middle of a term (e.g., AC/DC),
+ * it should be treated as a regular character, not as a regex delimiter.
+ *
+ * This test verifies that:
+ * 1. Slash within a term (AC/DC) is parsed correctly as a single term
+ * 2. Escaped slash (AC\/DC) produces the same result
+ * 3. Regex patterns (/pattern/) still work correctly
+ * 4. Both standard and lucene modes handle slashes consistently
+ */
+suite("test_search_slash_in_term", "p0") {
+    def tableName = "search_slash_in_term_test"
+
+    sql """ set enable_common_expr_pushdown = true """
+
+    sql "DROP TABLE IF EXISTS ${tableName}"
+
+    sql """
+        CREATE TABLE ${tableName} (
+            id INT,
+            title VARCHAR(200),
+            content VARCHAR(500),
+            INDEX idx_title(title) USING INVERTED PROPERTIES("parser" = "standard"),
+            INDEX idx_content(content) USING INVERTED PROPERTIES("parser" = "standard")
+        ) ENGINE=OLAP
+        DUPLICATE KEY(id)
+        DISTRIBUTED BY HASH(id) BUCKETS 1
+        PROPERTIES ("replication_allocation" = "tag.location.default: 1")
+    """
+
+    sql """INSERT INTO ${tableName} VALUES
+        (1, 'AC/DC is a rock band', 'rock music'),
+        (2, 'AC power supply', 'electrical engineering'),
+        (3, 'DC comics', 'entertainment'),
+        (4, 'path/to/file', 'file system'),
+        (5, 'a/b/c/d', 'multi slash path'),
+        (6, 'hello world', 'greeting'),
+        (7, 'acdc together', 'no slash')
+    """
+
+    // Wait for index building
+    Thread.sleep(3000)
+
+    // ============ Test 1: Slash in term with field prefix ============
+    // title:AC/DC should parse as single term, standard analyzer tokenizes to "ac" and "dc"
+    // With default OR operator, matches rows containing "ac" or "dc" in title
+    order_qt_slash_in_term """
+        SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+        FROM ${tableName}
+        WHERE search('title:AC/DC')
+        ORDER BY id
+    """
+
+    // ============ Test 2: Escaped slash should produce same result ============
+    // title:AC\/DC should produce the same result as title:AC/DC
+    // Groovy: \\\\/ -> SQL: \\/ -> DSL: \/ -> unescaped: /
+    order_qt_escaped_slash_in_term """
+        SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+        FROM ${tableName}
+        WHERE search('title:AC\\\\/DC')
+        ORDER BY id
+    """
+
+    // ============ Test 3: Slash in term with default_field (lucene mode) ============
+    // Bare AC/DC with default_field should work
+    order_qt_slash_bare_lucene """
+        SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+        FROM ${tableName}
+        WHERE search('AC/DC', '{"default_field":"title","default_operator":"OR","minimum_should_match":0}')
+        ORDER BY id
+    """
+
+    // ============ Test 4: Escaped slash with default_field should match ============
+    order_qt_escaped_slash_bare_lucene """
+        SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+        FROM ${tableName}
+        WHERE search('AC\\\\/DC', '{"default_field":"title","default_operator":"OR","minimum_should_match":0}')
+        ORDER BY id
+    """
+
+    // ============ Test 5: Multiple slashes in term ============
+    order_qt_multi_slash """
+        SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+        FROM ${tableName}
+        WHERE search('title:path/to/file')
+        ORDER BY id
+    """
+
+    // ============ Test 6: Regex pattern still works ============
+    // title:/rock/ should be parsed as a regex (leading slash), not as a term containing slashes
+    order_qt_regex_still_works """
+        SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+        FROM ${tableName}
+        WHERE search('title:/rock/')
+        ORDER BY id
+    """
+
+    // ============ Test 7: Slash in term with standard mode ============
+    order_qt_slash_standard_mode """
+        SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+        FROM ${tableName}
+        WHERE search('AC/DC', '{"default_field":"title","mode":"standard"}')
+        ORDER BY id
+    """
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to