This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new fef71d4cb05 branch-4.0: [feature](search) introduce lucene bool mode
for search function #59394 (#59745)
fef71d4cb05 is described below
commit fef71d4cb0586370e08eba0710122dd419c1036a
Author: github-actions[bot]
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Wed Jan 14 10:05:33 2026 +0800
branch-4.0: [feature](search) introduce lucene bool mode for search
function #59394 (#59745)
Cherry-picked from #59394
**Note:** This PR depends on #59766 (cherry-pick of #58545) being merged
first.
## Summary
Introduce a Lucene-style boolean mode for the search function.
## Test plan
- [ ] Regression tests (after the dependency PR is merged)
Related PRs: #59394
Depends on: #59766
Co-authored-by: Jack <[email protected]>
---
be/src/vec/functions/function_search.cpp | 50 +-
be/test/vec/function/function_search_test.cpp | 441 ++++++++++++
.../org/apache/doris/analysis/SearchPredicate.java | 24 +
.../trees/expressions/functions/scalar/Search.java | 60 +-
.../functions/scalar/SearchDslParser.java | 761 ++++++++++++++++++++-
.../functions/scalar/SearchDslParserTest.java | 348 ++++++++++
gensrc/thrift/Exprs.thrift | 14 +-
regression-test/data/search/test_search_escape.out | 46 ++
.../data/search/test_search_lucene_mode.out | 86 +++
.../test_search_default_field_operator.groovy | 48 +-
.../suites/search/test_search_escape.groovy | 189 +++++
.../test_search_inverted_is_null_pushdown.groovy | 74 ++
.../suites/search/test_search_lucene_mode.groovy | 250 +++++++
13 files changed, 2305 insertions(+), 86 deletions(-)
diff --git a/be/src/vec/functions/function_search.cpp
b/be/src/vec/functions/function_search.cpp
index 6fd7da39208..4a4a397e8d8 100644
--- a/be/src/vec/functions/function_search.cpp
+++ b/be/src/vec/functions/function_search.cpp
@@ -317,7 +317,8 @@ Status
FunctionSearch::evaluate_inverted_index_with_search_param(
// Aligned with FE QsClauseType enum - uses enum.name() as clause_type
FunctionSearch::ClauseTypeCategory FunctionSearch::get_clause_type_category(
const std::string& clause_type) const {
- if (clause_type == "AND" || clause_type == "OR" || clause_type == "NOT") {
+ if (clause_type == "AND" || clause_type == "OR" || clause_type == "NOT" ||
+ clause_type == "OCCUR_BOOLEAN") {
return ClauseTypeCategory::COMPOUND;
} else if (clause_type == "TERM" || clause_type == "PREFIX" || clause_type
== "WILDCARD" ||
clause_type == "REGEXP" || clause_type == "RANGE" ||
clause_type == "LIST" ||
@@ -377,6 +378,7 @@ InvertedIndexQueryType
FunctionSearch::clause_type_to_query_type(
{"AND", InvertedIndexQueryType::BOOLEAN_QUERY},
{"OR", InvertedIndexQueryType::BOOLEAN_QUERY},
{"NOT", InvertedIndexQueryType::BOOLEAN_QUERY},
+ {"OCCUR_BOOLEAN", InvertedIndexQueryType::BOOLEAN_QUERY},
// Non-tokenized queries (exact matching, pattern matching)
{"TERM", InvertedIndexQueryType::EQUAL_QUERY},
@@ -406,6 +408,20 @@ InvertedIndexQueryType
FunctionSearch::clause_type_to_query_type(
return InvertedIndexQueryType::EQUAL_QUERY;
}
+// Map Thrift TSearchOccur to query_v2::Occur
+static query_v2::Occur map_thrift_occur(TSearchOccur::type thrift_occur) {
+ switch (thrift_occur) {
+ case TSearchOccur::MUST:
+ return query_v2::Occur::MUST;
+ case TSearchOccur::SHOULD:
+ return query_v2::Occur::SHOULD;
+ case TSearchOccur::MUST_NOT:
+ return query_v2::Occur::MUST_NOT;
+ default:
+ return query_v2::Occur::MUST;
+ }
+}
+
Status FunctionSearch::build_query_recursive(const TSearchClause& clause,
const
std::shared_ptr<IndexQueryContext>& context,
FieldReaderResolver& resolver,
@@ -418,6 +434,38 @@ Status FunctionSearch::build_query_recursive(const
TSearchClause& clause,
}
const std::string& clause_type = clause.clause_type;
+
+ // Handle OCCUR_BOOLEAN - Lucene-style boolean query with
MUST/SHOULD/MUST_NOT
+ if (clause_type == "OCCUR_BOOLEAN") {
+ auto builder =
segment_v2::inverted_index::query_v2::create_occur_boolean_query_builder();
+
+ // Set minimum_should_match if specified
+ if (clause.__isset.minimum_should_match) {
+
builder->set_minimum_number_should_match(clause.minimum_should_match);
+ }
+
+ if (clause.__isset.children) {
+ for (const auto& child_clause : clause.children) {
+ query_v2::QueryPtr child_query;
+ std::string child_binding_key;
+ RETURN_IF_ERROR(build_query_recursive(child_clause, context,
resolver, &child_query,
+ &child_binding_key));
+
+ // Determine occur type from child clause
+ query_v2::Occur occur = query_v2::Occur::MUST; // default
+ if (child_clause.__isset.occur) {
+ occur = map_thrift_occur(child_clause.occur);
+ }
+
+ builder->add(child_query, occur);
+ }
+ }
+
+ *out = builder->build();
+ return Status::OK();
+ }
+
+ // Handle standard boolean operators (AND/OR/NOT)
if (clause_type == "AND" || clause_type == "OR" || clause_type == "NOT") {
query_v2::OperatorType op = query_v2::OperatorType::OP_AND;
if (clause_type == "OR") {
diff --git a/be/test/vec/function/function_search_test.cpp
b/be/test/vec/function/function_search_test.cpp
index a4f53068d6f..64b64b0d667 100644
--- a/be/test/vec/function/function_search_test.cpp
+++ b/be/test/vec/function/function_search_test.cpp
@@ -1760,4 +1760,445 @@ TEST_F(FunctionSearchTest, TestMultiPhraseQueryCase) {
ASSERT_NE(multi_phrase_weight, nullptr);
}
+// ============== Lucene Mode (OCCUR_BOOLEAN) Tests ==============
+
+TEST_F(FunctionSearchTest, TestOccurBooleanClauseTypeCategory) {
+ // Test that OCCUR_BOOLEAN is classified as COMPOUND
+ EXPECT_EQ(FunctionSearch::ClauseTypeCategory::COMPOUND,
+ function_search->get_clause_type_category("OCCUR_BOOLEAN"));
+}
+
+TEST_F(FunctionSearchTest, TestOccurBooleanQueryType) {
+ // Test that OCCUR_BOOLEAN maps to BOOLEAN_QUERY
+ EXPECT_EQ(segment_v2::InvertedIndexQueryType::BOOLEAN_QUERY,
+ function_search->clause_type_to_query_type("OCCUR_BOOLEAN"));
+}
+
+TEST_F(FunctionSearchTest, TestOccurBooleanSearchParam) {
+ // Test creating OCCUR_BOOLEAN search param (Lucene mode)
+ TSearchParam searchParam;
+ searchParam.original_dsl = "field:a AND field:b OR field:c";
+
+ // Create child clauses with occur types
+ TSearchClause mustClause1;
+ mustClause1.clause_type = "TERM";
+ mustClause1.field_name = "field";
+ mustClause1.value = "a";
+ mustClause1.__isset.field_name = true;
+ mustClause1.__isset.value = true;
+ mustClause1.occur = TSearchOccur::MUST;
+ mustClause1.__isset.occur = true;
+
+ TSearchClause mustClause2;
+ mustClause2.clause_type = "TERM";
+ mustClause2.field_name = "field";
+ mustClause2.value = "b";
+ mustClause2.__isset.field_name = true;
+ mustClause2.__isset.value = true;
+ mustClause2.occur = TSearchOccur::MUST;
+ mustClause2.__isset.occur = true;
+
+ // Create root OCCUR_BOOLEAN clause
+ TSearchClause rootClause;
+ rootClause.clause_type = "OCCUR_BOOLEAN";
+ rootClause.children = {mustClause1, mustClause2};
+ rootClause.__isset.children = true;
+ rootClause.minimum_should_match = 0;
+ rootClause.__isset.minimum_should_match = true;
+ searchParam.root = rootClause;
+
+ // Verify structure
+ EXPECT_EQ("OCCUR_BOOLEAN", searchParam.root.clause_type);
+ EXPECT_EQ(2, searchParam.root.children.size());
+ EXPECT_EQ(TSearchOccur::MUST, searchParam.root.children[0].occur);
+ EXPECT_EQ(TSearchOccur::MUST, searchParam.root.children[1].occur);
+ EXPECT_EQ(0, searchParam.root.minimum_should_match);
+}
+
+TEST_F(FunctionSearchTest, TestOccurBooleanWithMustNotClause) {
+ // Test OCCUR_BOOLEAN with MUST_NOT (NOT operator in Lucene mode)
+ TSearchParam searchParam;
+ searchParam.original_dsl = "NOT field:a";
+
+ TSearchClause mustNotClause;
+ mustNotClause.clause_type = "TERM";
+ mustNotClause.field_name = "field";
+ mustNotClause.value = "a";
+ mustNotClause.__isset.field_name = true;
+ mustNotClause.__isset.value = true;
+ mustNotClause.occur = TSearchOccur::MUST_NOT;
+ mustNotClause.__isset.occur = true;
+
+ TSearchClause rootClause;
+ rootClause.clause_type = "OCCUR_BOOLEAN";
+ rootClause.children = {mustNotClause};
+ rootClause.__isset.children = true;
+ searchParam.root = rootClause;
+
+ // Verify structure
+ EXPECT_EQ("OCCUR_BOOLEAN", searchParam.root.clause_type);
+ EXPECT_EQ(1, searchParam.root.children.size());
+ EXPECT_EQ(TSearchOccur::MUST_NOT, searchParam.root.children[0].occur);
+}
+
+TEST_F(FunctionSearchTest, TestOccurBooleanWithShouldClauses) {
+ // Test OCCUR_BOOLEAN with SHOULD clauses (OR in Lucene mode)
+ TSearchParam searchParam;
+ searchParam.original_dsl = "field:a OR field:b";
+
+ TSearchClause shouldClause1;
+ shouldClause1.clause_type = "TERM";
+ shouldClause1.field_name = "field";
+ shouldClause1.value = "a";
+ shouldClause1.__isset.field_name = true;
+ shouldClause1.__isset.value = true;
+ shouldClause1.occur = TSearchOccur::SHOULD;
+ shouldClause1.__isset.occur = true;
+
+ TSearchClause shouldClause2;
+ shouldClause2.clause_type = "TERM";
+ shouldClause2.field_name = "field";
+ shouldClause2.value = "b";
+ shouldClause2.__isset.field_name = true;
+ shouldClause2.__isset.value = true;
+ shouldClause2.occur = TSearchOccur::SHOULD;
+ shouldClause2.__isset.occur = true;
+
+ TSearchClause rootClause;
+ rootClause.clause_type = "OCCUR_BOOLEAN";
+ rootClause.children = {shouldClause1, shouldClause2};
+ rootClause.__isset.children = true;
+ rootClause.minimum_should_match = 1;
+ rootClause.__isset.minimum_should_match = true;
+ searchParam.root = rootClause;
+
+ // Verify structure
+ EXPECT_EQ("OCCUR_BOOLEAN", searchParam.root.clause_type);
+ EXPECT_EQ(2, searchParam.root.children.size());
+ EXPECT_EQ(TSearchOccur::SHOULD, searchParam.root.children[0].occur);
+ EXPECT_EQ(TSearchOccur::SHOULD, searchParam.root.children[1].occur);
+ EXPECT_EQ(1, searchParam.root.minimum_should_match);
+}
+
+TEST_F(FunctionSearchTest, TestOccurBooleanMixedOccurTypes) {
+ // Test OCCUR_BOOLEAN with mixed MUST, SHOULD, MUST_NOT (complex Lucene
query)
+ // Example: +a +b c -d (a AND b, c is optional, NOT d)
+ TSearchParam searchParam;
+ searchParam.original_dsl = "field:a AND field:b OR field:c NOT field:d";
+
+ TSearchClause mustClause1;
+ mustClause1.clause_type = "TERM";
+ mustClause1.field_name = "field";
+ mustClause1.value = "a";
+ mustClause1.__isset.field_name = true;
+ mustClause1.__isset.value = true;
+ mustClause1.occur = TSearchOccur::MUST;
+ mustClause1.__isset.occur = true;
+
+ TSearchClause mustClause2;
+ mustClause2.clause_type = "TERM";
+ mustClause2.field_name = "field";
+ mustClause2.value = "b";
+ mustClause2.__isset.field_name = true;
+ mustClause2.__isset.value = true;
+ mustClause2.occur = TSearchOccur::MUST;
+ mustClause2.__isset.occur = true;
+
+ TSearchClause shouldClause;
+ shouldClause.clause_type = "TERM";
+ shouldClause.field_name = "field";
+ shouldClause.value = "c";
+ shouldClause.__isset.field_name = true;
+ shouldClause.__isset.value = true;
+ shouldClause.occur = TSearchOccur::SHOULD;
+ shouldClause.__isset.occur = true;
+
+ TSearchClause mustNotClause;
+ mustNotClause.clause_type = "TERM";
+ mustNotClause.field_name = "field";
+ mustNotClause.value = "d";
+ mustNotClause.__isset.field_name = true;
+ mustNotClause.__isset.value = true;
+ mustNotClause.occur = TSearchOccur::MUST_NOT;
+ mustNotClause.__isset.occur = true;
+
+ TSearchClause rootClause;
+ rootClause.clause_type = "OCCUR_BOOLEAN";
+ rootClause.children = {mustClause1, mustClause2, shouldClause,
mustNotClause};
+ rootClause.__isset.children = true;
+ rootClause.minimum_should_match = 0;
+ rootClause.__isset.minimum_should_match = true;
+ searchParam.root = rootClause;
+
+ // Verify structure
+ EXPECT_EQ("OCCUR_BOOLEAN", searchParam.root.clause_type);
+ EXPECT_EQ(4, searchParam.root.children.size());
+ EXPECT_EQ(TSearchOccur::MUST, searchParam.root.children[0].occur);
+ EXPECT_EQ(TSearchOccur::MUST, searchParam.root.children[1].occur);
+ EXPECT_EQ(TSearchOccur::SHOULD, searchParam.root.children[2].occur);
+ EXPECT_EQ(TSearchOccur::MUST_NOT, searchParam.root.children[3].occur);
+ EXPECT_EQ(0, searchParam.root.minimum_should_match);
+}
+
+TEST_F(FunctionSearchTest, TestOccurBooleanMinimumShouldMatchZero) {
+ // Test that SHOULD clauses are effectively ignored when
minimum_should_match=0
+ // and MUST clauses exist
+ TSearchParam searchParam;
+ searchParam.original_dsl = "field:a AND field:b OR field:c";
+
+ TSearchClause mustClause1;
+ mustClause1.clause_type = "TERM";
+ mustClause1.field_name = "field";
+ mustClause1.value = "a";
+ mustClause1.__isset.field_name = true;
+ mustClause1.__isset.value = true;
+ mustClause1.occur = TSearchOccur::MUST;
+ mustClause1.__isset.occur = true;
+
+ TSearchClause mustClause2;
+ mustClause2.clause_type = "TERM";
+ mustClause2.field_name = "field";
+ mustClause2.value = "b";
+ mustClause2.__isset.field_name = true;
+ mustClause2.__isset.value = true;
+ mustClause2.occur = TSearchOccur::MUST;
+ mustClause2.__isset.occur = true;
+
+ // Note: In Lucene mode with minimum_should_match=0 and MUST clauses,
+ // SHOULD clauses are filtered out during FE parsing.
+ // So only MUST clauses should be present.
+ TSearchClause rootClause;
+ rootClause.clause_type = "OCCUR_BOOLEAN";
+ rootClause.children = {mustClause1, mustClause2};
+ rootClause.__isset.children = true;
+ rootClause.minimum_should_match = 0;
+ rootClause.__isset.minimum_should_match = true;
+ searchParam.root = rootClause;
+
+ // Verify structure
+ EXPECT_EQ("OCCUR_BOOLEAN", searchParam.root.clause_type);
+ EXPECT_EQ(2, searchParam.root.children.size());
+ EXPECT_EQ(0, searchParam.root.minimum_should_match);
+}
+
+TEST_F(FunctionSearchTest, TestOccurBooleanMinimumShouldMatchOne) {
+ // Test that at least one SHOULD clause must match when
minimum_should_match=1
+ TSearchParam searchParam;
+ searchParam.original_dsl = "field:a OR field:b OR field:c";
+
+ TSearchClause shouldClause1;
+ shouldClause1.clause_type = "TERM";
+ shouldClause1.field_name = "field";
+ shouldClause1.value = "a";
+ shouldClause1.__isset.field_name = true;
+ shouldClause1.__isset.value = true;
+ shouldClause1.occur = TSearchOccur::SHOULD;
+ shouldClause1.__isset.occur = true;
+
+ TSearchClause shouldClause2;
+ shouldClause2.clause_type = "TERM";
+ shouldClause2.field_name = "field";
+ shouldClause2.value = "b";
+ shouldClause2.__isset.field_name = true;
+ shouldClause2.__isset.value = true;
+ shouldClause2.occur = TSearchOccur::SHOULD;
+ shouldClause2.__isset.occur = true;
+
+ TSearchClause shouldClause3;
+ shouldClause3.clause_type = "TERM";
+ shouldClause3.field_name = "field";
+ shouldClause3.value = "c";
+ shouldClause3.__isset.field_name = true;
+ shouldClause3.__isset.value = true;
+ shouldClause3.occur = TSearchOccur::SHOULD;
+ shouldClause3.__isset.occur = true;
+
+ TSearchClause rootClause;
+ rootClause.clause_type = "OCCUR_BOOLEAN";
+ rootClause.children = {shouldClause1, shouldClause2, shouldClause3};
+ rootClause.__isset.children = true;
+ rootClause.minimum_should_match = 1;
+ rootClause.__isset.minimum_should_match = true;
+ searchParam.root = rootClause;
+
+ // Verify structure
+ EXPECT_EQ("OCCUR_BOOLEAN", searchParam.root.clause_type);
+ EXPECT_EQ(3, searchParam.root.children.size());
+ EXPECT_EQ(1, searchParam.root.minimum_should_match);
+}
+
+TEST_F(FunctionSearchTest, TestOccurBooleanAnalyzeFieldQueryType) {
+ // Test field query type analysis for OCCUR_BOOLEAN
+ TSearchClause mustClause;
+ mustClause.clause_type = "TERM";
+ mustClause.field_name = "title";
+ mustClause.value = "hello";
+ mustClause.__isset.field_name = true;
+ mustClause.__isset.value = true;
+ mustClause.occur = TSearchOccur::MUST;
+ mustClause.__isset.occur = true;
+
+ TSearchClause rootClause;
+ rootClause.clause_type = "OCCUR_BOOLEAN";
+ rootClause.children = {mustClause};
+ rootClause.__isset.children = true;
+
+ // Test field-specific query type analysis
+ auto title_query_type = function_search->analyze_field_query_type("title",
rootClause);
+ EXPECT_EQ(segment_v2::InvertedIndexQueryType::EQUAL_QUERY,
title_query_type);
+
+ // Test field not in query
+ auto other_query_type =
function_search->analyze_field_query_type("other_field", rootClause);
+ EXPECT_EQ(segment_v2::InvertedIndexQueryType::UNKNOWN_QUERY,
other_query_type);
+}
+
+TEST_F(FunctionSearchTest, TestOccurBooleanWithPhraseQuery) {
+ // Test OCCUR_BOOLEAN with PHRASE child clause
+ TSearchParam searchParam;
+ searchParam.original_dsl = "content:\"machine learning\" AND title:hello";
+
+ TSearchClause phraseClause;
+ phraseClause.clause_type = "PHRASE";
+ phraseClause.field_name = "content";
+ phraseClause.value = "machine learning";
+ phraseClause.__isset.field_name = true;
+ phraseClause.__isset.value = true;
+ phraseClause.occur = TSearchOccur::MUST;
+ phraseClause.__isset.occur = true;
+
+ TSearchClause termClause;
+ termClause.clause_type = "TERM";
+ termClause.field_name = "title";
+ termClause.value = "hello";
+ termClause.__isset.field_name = true;
+ termClause.__isset.value = true;
+ termClause.occur = TSearchOccur::MUST;
+ termClause.__isset.occur = true;
+
+ TSearchClause rootClause;
+ rootClause.clause_type = "OCCUR_BOOLEAN";
+ rootClause.children = {phraseClause, termClause};
+ rootClause.__isset.children = true;
+ searchParam.root = rootClause;
+
+ // Verify structure
+ EXPECT_EQ("OCCUR_BOOLEAN", searchParam.root.clause_type);
+ EXPECT_EQ(2, searchParam.root.children.size());
+ EXPECT_EQ("PHRASE", searchParam.root.children[0].clause_type);
+ EXPECT_EQ("TERM", searchParam.root.children[1].clause_type);
+
+ // Test field-specific query type analysis
+ auto content_query_type =
+ function_search->analyze_field_query_type("content",
searchParam.root);
+ EXPECT_EQ(segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY,
content_query_type);
+
+ auto title_query_type = function_search->analyze_field_query_type("title",
searchParam.root);
+ EXPECT_EQ(segment_v2::InvertedIndexQueryType::EQUAL_QUERY,
title_query_type);
+}
+
+TEST_F(FunctionSearchTest, TestOccurBooleanNestedQuery) {
+ // Test nested OCCUR_BOOLEAN query
+ TSearchParam searchParam;
+ searchParam.original_dsl = "(field:a AND field:b) OR field:c";
+
+ TSearchClause innerMust1;
+ innerMust1.clause_type = "TERM";
+ innerMust1.field_name = "field";
+ innerMust1.value = "a";
+ innerMust1.__isset.field_name = true;
+ innerMust1.__isset.value = true;
+ innerMust1.occur = TSearchOccur::MUST;
+ innerMust1.__isset.occur = true;
+
+ TSearchClause innerMust2;
+ innerMust2.clause_type = "TERM";
+ innerMust2.field_name = "field";
+ innerMust2.value = "b";
+ innerMust2.__isset.field_name = true;
+ innerMust2.__isset.value = true;
+ innerMust2.occur = TSearchOccur::MUST;
+ innerMust2.__isset.occur = true;
+
+ TSearchClause innerOccurBoolean;
+ innerOccurBoolean.clause_type = "OCCUR_BOOLEAN";
+ innerOccurBoolean.children = {innerMust1, innerMust2};
+ innerOccurBoolean.__isset.children = true;
+ innerOccurBoolean.occur = TSearchOccur::SHOULD;
+ innerOccurBoolean.__isset.occur = true;
+
+ TSearchClause shouldClause;
+ shouldClause.clause_type = "TERM";
+ shouldClause.field_name = "field";
+ shouldClause.value = "c";
+ shouldClause.__isset.field_name = true;
+ shouldClause.__isset.value = true;
+ shouldClause.occur = TSearchOccur::SHOULD;
+ shouldClause.__isset.occur = true;
+
+ TSearchClause rootClause;
+ rootClause.clause_type = "OCCUR_BOOLEAN";
+ rootClause.children = {innerOccurBoolean, shouldClause};
+ rootClause.__isset.children = true;
+ rootClause.minimum_should_match = 1;
+ rootClause.__isset.minimum_should_match = true;
+ searchParam.root = rootClause;
+
+ // Verify structure
+ EXPECT_EQ("OCCUR_BOOLEAN", searchParam.root.clause_type);
+ EXPECT_EQ(2, searchParam.root.children.size());
+ EXPECT_EQ("OCCUR_BOOLEAN", searchParam.root.children[0].clause_type);
+ EXPECT_EQ("TERM", searchParam.root.children[1].clause_type);
+ EXPECT_EQ(1, searchParam.root.minimum_should_match);
+}
+
+TEST_F(FunctionSearchTest, TestEvaluateInvertedIndexWithOccurBoolean) {
+ // Test evaluate_inverted_index_with_search_param with OCCUR_BOOLEAN
+ TSearchParam search_param;
+ search_param.original_dsl = "title:hello AND content:world";
+
+ TSearchClause mustClause1;
+ mustClause1.clause_type = "TERM";
+ mustClause1.field_name = "title";
+ mustClause1.value = "hello";
+ mustClause1.__isset.field_name = true;
+ mustClause1.__isset.value = true;
+ mustClause1.occur = TSearchOccur::MUST;
+ mustClause1.__isset.occur = true;
+
+ TSearchClause mustClause2;
+ mustClause2.clause_type = "TERM";
+ mustClause2.field_name = "content";
+ mustClause2.value = "world";
+ mustClause2.__isset.field_name = true;
+ mustClause2.__isset.value = true;
+ mustClause2.occur = TSearchOccur::MUST;
+ mustClause2.__isset.occur = true;
+
+ TSearchClause rootClause;
+ rootClause.clause_type = "OCCUR_BOOLEAN";
+ rootClause.children = {mustClause1, mustClause2};
+ rootClause.__isset.children = true;
+ rootClause.minimum_should_match = 0;
+ rootClause.__isset.minimum_should_match = true;
+ search_param.root = rootClause;
+
+ std::unordered_map<std::string, vectorized::IndexFieldNameAndTypePair>
data_types;
+ std::unordered_map<std::string, IndexIterator*> iterators;
+
+ // No real iterators - will fail but tests the code path
+ data_types["title"] = {"title", nullptr};
+ data_types["content"] = {"content", nullptr};
+ iterators["title"] = nullptr;
+ iterators["content"] = nullptr;
+
+ uint32_t num_rows = 100;
+ InvertedIndexResultBitmap bitmap_result;
+
+ auto status = function_search->evaluate_inverted_index_with_search_param(
+ search_param, data_types, iterators, num_rows, bitmap_result);
+ // No real iterators are supplied, so the call is expected to fail instead of returning OK
+ // EXPECT_TRUE(status.ok());
+ EXPECT_TRUE(status.is<ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND>());
+}
+
} // namespace doris::vectorized
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/analysis/SearchPredicate.java
b/fe/fe-core/src/main/java/org/apache/doris/analysis/SearchPredicate.java
index 659e3acee3a..961545a709f 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/SearchPredicate.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/SearchPredicate.java
@@ -26,6 +26,7 @@ import org.apache.doris.thrift.TExprNode;
import org.apache.doris.thrift.TExprNodeType;
import org.apache.doris.thrift.TSearchClause;
import org.apache.doris.thrift.TSearchFieldBinding;
+import org.apache.doris.thrift.TSearchOccur;
import org.apache.doris.thrift.TSearchParam;
import org.apache.logging.log4j.LogManager;
@@ -312,6 +313,16 @@ public class SearchPredicate extends Predicate {
clause.setValue(node.value);
}
+ // Convert occur type for Lucene-style boolean queries
+ if (node.occur != null) {
+ clause.setOccur(convertQsOccurToThrift(node.occur));
+ }
+
+ // Convert minimum_should_match for OCCUR_BOOLEAN
+ if (node.minimumShouldMatch != null) {
+ clause.setMinimumShouldMatch(node.minimumShouldMatch);
+ }
+
if (node.children != null && !node.children.isEmpty()) {
List<TSearchClause> childClauses = new ArrayList<>();
for (SearchDslParser.QsNode child : node.children) {
@@ -323,6 +334,19 @@ public class SearchPredicate extends Predicate {
return clause;
}
+ private TSearchOccur convertQsOccurToThrift(SearchDslParser.QsOccur occur)
{
+ switch (occur) {
+ case MUST:
+ return TSearchOccur.MUST;
+ case SHOULD:
+ return TSearchOccur.SHOULD;
+ case MUST_NOT:
+ return TSearchOccur.MUST_NOT;
+ default:
+ return TSearchOccur.MUST;
+ }
+ }
+
// Getters
public String getDslString() {
return dslString;
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/Search.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/Search.java
index f89af38cc22..3a98a6cbf05 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/Search.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/Search.java
@@ -34,10 +34,17 @@ import java.util.List;
* ScalarFunction 'search' - simplified architecture similar to MultiMatch.
* Handles DSL parsing and generates SearchPredicate during translation.
* <p>
- * Supports 1-3 parameters:
- * - search(dsl_string): Traditional usage
- * - search(dsl_string, default_field): Simplified syntax with default field
- * - search(dsl_string, default_field, default_operator): Full control over
expansion
+ * Supports 1-2 parameters:
+ * - search(dsl_string): Traditional usage with field specified in DSL
+ * - search(dsl_string, options): With JSON options for configuration
+ * <p>
+ * Options parameter (JSON format):
+ * - default_field: default field name when DSL doesn't specify field
+ * - default_operator: "and" or "or" for multi-term queries (default: "and")
+ * - mode: "standard" (default) or "lucene" (ES/Lucene-style boolean parsing)
+ * - minimum_should_match: integer for Lucene mode (default: 0 for filter
context)
+ * <p>
+ * Example options:
'{"default_field":"title","mode":"lucene","minimum_should_match":0}'
*/
public class Search extends ScalarFunction
implements ExplicitlyCastableSignature, AlwaysNotNullable {
@@ -45,11 +52,8 @@ public class Search extends ScalarFunction
public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
// Original signature: search(dsl_string)
FunctionSignature.ret(BooleanType.INSTANCE).args(StringType.INSTANCE),
- // With default field: search(dsl_string, default_field)
-
FunctionSignature.ret(BooleanType.INSTANCE).args(StringType.INSTANCE,
StringType.INSTANCE),
- // With default field and operator: search(dsl_string,
default_field, default_operator)
-
FunctionSignature.ret(BooleanType.INSTANCE).args(StringType.INSTANCE,
StringType.INSTANCE,
- StringType.INSTANCE)
+ // With options: search(dsl_string, options)
+
FunctionSignature.ret(BooleanType.INSTANCE).args(StringType.INSTANCE,
StringType.INSTANCE)
);
public Search(Expression... varArgs) {
@@ -62,8 +66,8 @@ public class Search extends ScalarFunction
@Override
public Search withChildren(List<Expression> children) {
- Preconditions.checkArgument(children.size() >= 1 && children.size() <=
3,
- "search() requires 1-3 arguments");
+ Preconditions.checkArgument(children.size() >= 1 && children.size() <=
2,
+ "search() requires 1-2 arguments");
return new Search(getFunctionParams(children));
}
@@ -89,31 +93,23 @@ public class Search extends ScalarFunction
}
/**
- * Get default field from second argument (optional)
+ * Get options JSON string from second argument (optional).
+ * Options is a JSON string containing all configuration:
+ * - default_field: default field name when DSL doesn't specify field
+ * - default_operator: "and" or "or" for multi-term queries
+ * - mode: "standard" or "lucene"
+ * - minimum_should_match: integer for Lucene mode
+ * Example:
'{"default_field":"title","mode":"lucene","minimum_should_match":0}'
*/
- public String getDefaultField() {
+ public String getOptionsJson() {
if (children().size() < 2) {
return null;
}
- Expression fieldArg = child(1);
- if (fieldArg instanceof StringLikeLiteral) {
- return ((StringLikeLiteral) fieldArg).getStringValue();
- }
- return fieldArg.toString();
- }
-
- /**
- * Get default operator from third argument (optional)
- */
- public String getDefaultOperator() {
- if (children().size() < 3) {
- return null;
- }
- Expression operatorArg = child(2);
- if (operatorArg instanceof StringLikeLiteral) {
- return ((StringLikeLiteral) operatorArg).getStringValue();
+ Expression optionsArg = child(1);
+ if (optionsArg instanceof StringLikeLiteral) {
+ return ((StringLikeLiteral) optionsArg).getStringValue();
}
- return operatorArg.toString();
+ return optionsArg.toString();
}
/**
@@ -122,7 +118,7 @@ public class Search extends ScalarFunction
*/
public SearchDslParser.QsPlan getQsPlan() {
// Lazy evaluation will be handled in SearchPredicate
- return SearchDslParser.parseDsl(getDslString(), getDefaultField(),
getDefaultOperator());
+ return SearchDslParser.parseDsl(getDslString(), getOptionsJson());
}
@Override
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParser.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParser.java
index 8dfd9febb68..b4c880546a7 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParser.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParser.java
@@ -61,11 +61,42 @@ public class SearchDslParser {
* Parse DSL string and return intermediate representation
*/
public static QsPlan parseDsl(String dsl) {
- return parseDsl(dsl, null, null);
+ return parseDsl(dsl, (String) null);
}
/**
- * Parse DSL string with default field and operator support
+ * Parse DSL string with JSON options support.
+ * This is the primary method for the new 2-parameter search function
signature.
+ *
+ * @param dsl DSL query string
+ * @param optionsJson JSON options string containing all configuration:
+ * - default_field: default field name when DSL doesn't
specify field
+ * - default_operator: "and" or "or" for multi-term
queries
+ * - mode: "standard" or "lucene"
+ * - minimum_should_match: integer for Lucene mode
+ * Example:
'{"default_field":"title","mode":"lucene","minimum_should_match":0}'
+ * @return Parsed QsPlan
+ */
+ public static QsPlan parseDsl(String dsl, String optionsJson) {
+ // Parse options from JSON
+ SearchOptions searchOptions = parseOptions(optionsJson);
+
+ // Extract default_field and default_operator from options
+ String defaultField = searchOptions.getDefaultField();
+ String defaultOperator = searchOptions.getDefaultOperator();
+
+ // Use Lucene mode parser if specified
+ if (searchOptions.isLuceneMode()) {
+ return parseDslLuceneMode(dsl, defaultField, defaultOperator,
searchOptions);
+ }
+
+ // Standard mode parsing
+ return parseDslStandardMode(dsl, defaultField, defaultOperator);
+ }
+
+ /**
+ * Parse DSL string with default field and operator support (legacy
method).
+ * Kept for backward compatibility.
*
* @param dsl DSL query string
* @param defaultField Default field name when DSL doesn't specify field
(optional)
@@ -73,6 +104,13 @@ public class SearchDslParser {
* @return Parsed QsPlan
*/
public static QsPlan parseDsl(String dsl, String defaultField, String
defaultOperator) {
+ return parseDslStandardMode(dsl, defaultField, defaultOperator);
+ }
+
+ /**
+ * Standard mode parsing (original behavior)
+ */
+ private static QsPlan parseDslStandardMode(String dsl, String
defaultField, String defaultOperator) {
if (dsl == null || dsl.trim().isEmpty()) {
return new QsPlan(new QsNode(QsClauseType.TERM, "error",
"empty_dsl"), new ArrayList<>());
}
@@ -204,14 +242,19 @@ public class SearchDslParser {
}
/**
- * Check if DSL contains field references (has colon not in quoted strings)
+ * Check if DSL contains field references (has colon not in quoted strings
or escaped)
*/
private static boolean containsFieldReference(String dsl) {
boolean inQuotes = false;
boolean inRegex = false;
for (int i = 0; i < dsl.length(); i++) {
char c = dsl.charAt(i);
- if (c == '"' && (i == 0 || dsl.charAt(i - 1) != '\\')) {
+ // Handle escape sequences - skip the escaped character
+ if (c == '\\' && i + 1 < dsl.length()) {
+ i++; // Skip next character (it's escaped)
+ continue;
+ }
+ if (c == '"') {
inQuotes = !inQuotes;
} else if (c == '/' && !inQuotes) {
inRegex = !inRegex;
@@ -247,6 +290,7 @@ public class SearchDslParser {
/**
* Add field prefix to expressions with explicit operators
* Example: "foo AND bar" → "field:foo AND field:bar"
+ * Handles escape sequences properly (e.g., "First\ Value" stays as single
term)
*/
private static String addFieldPrefixToOperatorExpression(String dsl,
String defaultField) {
StringBuilder result = new StringBuilder();
@@ -254,7 +298,7 @@ public class SearchDslParser {
int i = 0;
while (i < dsl.length()) {
- // Skip whitespace
+ // Skip whitespace (but not escaped whitespace)
while (i < dsl.length() && Character.isWhitespace(dsl.charAt(i))) {
i++;
}
@@ -262,6 +306,14 @@ public class SearchDslParser {
break;
}
+ // Handle escape sequences - include both backslash and next char
+ if (dsl.charAt(i) == '\\' && i + 1 < dsl.length()) {
+ currentTerm.append(dsl.charAt(i));
+ currentTerm.append(dsl.charAt(i + 1));
+ i += 2;
+ continue;
+ }
+
// Try to match operators
String remaining = dsl.substring(i);
String upperRemaining = remaining.toUpperCase();
@@ -333,7 +385,7 @@ public class SearchDslParser {
}
/**
- * Tokenize DSL into terms (split by whitespace, respecting quotes and
functions)
+ * Tokenize DSL into terms (split by whitespace, respecting quotes,
escapes, and functions)
*/
private static List<String> tokenizeDsl(String dsl) {
List<String> terms = new ArrayList<>();
@@ -358,8 +410,13 @@ public class SearchDslParser {
inParens = false;
}
currentTerm.append(c);
+ } else if (c == '\\' && i + 1 < dsl.length()) {
+ // Escape sequence - include both backslash and next char in
term
+ currentTerm.append(c);
+ currentTerm.append(dsl.charAt(i + 1));
+ i++; // Skip next character
} else if (Character.isWhitespace(c) && !inQuotes && !inParens) {
- // End of term
+ // End of term (only if not escaped - handled above)
if (currentTerm.length() > 0) {
terms.add(currentTerm.toString());
currentTerm = new StringBuilder();
@@ -379,6 +436,7 @@ public class SearchDslParser {
/**
* Check if a term contains wildcard characters (* or ?)
+ * Escaped wildcards (\* or \?) are not counted.
*/
private static boolean containsWildcard(String term) {
// Ignore wildcards in quoted strings or regex
@@ -388,26 +446,48 @@ public class SearchDslParser {
if (term.startsWith("/") && term.endsWith("/")) {
return false;
}
- return term.contains("*") || term.contains("?");
+ // Check for unescaped wildcards
+ for (int i = 0; i < term.length(); i++) {
+ char c = term.charAt(i);
+ if (c == '\\' && i + 1 < term.length()) {
+ // Skip escaped character
+ i++;
+ continue;
+ }
+ if (c == '*' || c == '?') {
+ return true;
+ }
+ }
+ return false;
}
/**
* Clause types supported
*/
public enum QsClauseType {
- TERM, // field:value
- PHRASE, // field:"phrase search"
- PREFIX, // field:prefix*
- WILDCARD, // field:*wild*card*
- REGEXP, // field:/pattern/
- RANGE, // field:[1 TO 10] or field:{1 TO 10}
- LIST, // field:IN(value1 value2)
- ANY, // field:ANY(value) - any match
- ALL, // field:ALL(value) - all match
- EXACT, // field:EXACT(value) - exact match without tokenization
- AND, // clause1 AND clause2
- OR, // clause1 OR clause2
- NOT // NOT clause
+ TERM, // field:value
+ PHRASE, // field:"phrase search"
+ PREFIX, // field:prefix*
+ WILDCARD, // field:*wild*card*
+ REGEXP, // field:/pattern/
+ RANGE, // field:[1 TO 10] or field:{1 TO 10}
+ LIST, // field:IN(value1 value2)
+ ANY, // field:ANY(value) - any match
+ ALL, // field:ALL(value) - all match
+ EXACT, // field:EXACT(value) - exact match without
tokenization
+ AND, // clause1 AND clause2 (standard boolean algebra)
+ OR, // clause1 OR clause2 (standard boolean algebra)
+ NOT, // NOT clause (standard boolean algebra)
+ OCCUR_BOOLEAN // Lucene-style boolean query with MUST/SHOULD/MUST_NOT
+ }
+
+ /**
+ * Occur type for Lucene-style boolean queries
+ */
+ public enum QsOccur {
+ MUST, // Term must appear (equivalent to +term)
+ SHOULD, // Term should appear (optional)
+ MUST_NOT // Term must not appear (equivalent to -term)
}
/**
@@ -601,15 +681,15 @@ public class SearchDslParser {
}
private QsNode createTermNode(String fieldName, String value) {
- return new QsNode(QsClauseType.TERM, fieldName, value);
+ return new QsNode(QsClauseType.TERM, fieldName,
unescapeTermValue(value));
}
private QsNode createPrefixNode(String fieldName, String value) {
- return new QsNode(QsClauseType.PREFIX, fieldName, value);
+ return new QsNode(QsClauseType.PREFIX, fieldName,
unescapeTermValue(value));
}
private QsNode createWildcardNode(String fieldName, String value) {
- return new QsNode(QsClauseType.WILDCARD, fieldName, value);
+ return new QsNode(QsClauseType.WILDCARD, fieldName,
unescapeTermValue(value));
}
private QsNode createRegexpNode(String fieldName, String regexpText) {
@@ -780,15 +860,35 @@ public class SearchDslParser {
@JsonProperty("children")
public List<QsNode> children;
+ @JsonProperty("occur")
+ public QsOccur occur;
+
+ @JsonProperty("minimumShouldMatch")
+ public Integer minimumShouldMatch;
+
+ /**
+ * Constructor for JSON deserialization
+ *
+ * @param type the clause type
+ * @param field the field name
+ * @param value the field value
+ * @param children the child nodes
+ * @param occur the occurrence type
+ * @param minimumShouldMatch the minimum should match value
+ */
@JsonCreator
public QsNode(@JsonProperty("type") QsClauseType type,
@JsonProperty("field") String field,
@JsonProperty("value") String value,
- @JsonProperty("children") List<QsNode> children) {
+ @JsonProperty("children") List<QsNode> children,
+ @JsonProperty("occur") QsOccur occur,
+ @JsonProperty("minimumShouldMatch") Integer
minimumShouldMatch) {
this.type = type;
this.field = field;
this.value = value;
this.children = children != null ? children : new ArrayList<>();
+ this.occur = occur;
+ this.minimumShouldMatch = minimumShouldMatch;
}
public QsNode(QsClauseType type, String field, String value) {
@@ -803,9 +903,20 @@ public class SearchDslParser {
this.children = children != null ? children : new ArrayList<>();
}
+ public QsNode(QsClauseType type, List<QsNode> children, Integer
minimumShouldMatch) {
+ this.type = type;
+ this.children = children != null ? children : new ArrayList<>();
+ this.minimumShouldMatch = minimumShouldMatch;
+ }
+
+ public QsNode withOccur(QsOccur occur) {
+ this.occur = occur;
+ return this;
+ }
+
@Override
public int hashCode() {
- return Objects.hash(type, field, value, children);
+ return Objects.hash(type, field, value, children, occur,
minimumShouldMatch);
}
@Override
@@ -820,7 +931,9 @@ public class SearchDslParser {
return type == qsNode.type
&& Objects.equals(field, qsNode.field)
&& Objects.equals(value, qsNode.value)
- && Objects.equals(children, qsNode.children);
+ && Objects.equals(children, qsNode.children)
+ && occur == qsNode.occur
+ && Objects.equals(minimumShouldMatch,
qsNode.minimumShouldMatch);
}
}
@@ -859,4 +972,598 @@ public class SearchDslParser {
&& Objects.equals(fieldName, that.fieldName);
}
}
+
+ /**
+ * Options for the search function, populated from a single JSON object.
+ * Recognized keys (read by parseOptions):
+ * - default_field: field applied when the DSL does not name a field
+ * - default_operator: "and" or "or" for multi-term queries; stored as null
+ *   when the key is absent
+ * - mode: "standard" (default) or "lucene" (ES/Lucene-style boolean parsing)
+ * - minimum_should_match: integer used in Lucene mode; stored as null when
+ *   the key is absent, in which case the Lucene-mode builder picks 0 if any
+ *   MUST or MUST_NOT clause exists and 1 otherwise
+ */
+ public static class SearchOptions {
+ // null means "not supplied" for every nullable option below.
+ private String defaultField = null;
+ private String defaultOperator = null;
+ // Parsing mode name; compared case-insensitively by isLuceneMode().
+ private String mode = "standard";
+ private Integer minimumShouldMatch = null;
+
+ public String getDefaultField() {
+ return defaultField;
+ }
+
+ public void setDefaultField(String defaultField) {
+ this.defaultField = defaultField;
+ }
+
+ public String getDefaultOperator() {
+ return defaultOperator;
+ }
+
+ public void setDefaultOperator(String defaultOperator) {
+ this.defaultOperator = defaultOperator;
+ }
+
+ /**
+ * @return true when mode equals "lucene" ignoring case; false for any
+ *         other value, including null
+ */
+ public boolean isLuceneMode() {
+ return "lucene".equalsIgnoreCase(mode);
+ }
+
+ public String getMode() {
+ return mode;
+ }
+
+ public void setMode(String mode) {
+ this.mode = mode;
+ }
+
+ public Integer getMinimumShouldMatch() {
+ return minimumShouldMatch;
+ }
+
+ public void setMinimumShouldMatch(Integer minimumShouldMatch) {
+ this.minimumShouldMatch = minimumShouldMatch;
+ }
+ }
+
+    /**
+     * Parse the options JSON string into a {@link SearchOptions}.
+     * <p>
+     * Supported keys:
+     * - default_field: default field name when DSL doesn't specify field
+     * - default_operator: "and" or "or" for multi-term queries
+     * - mode: "standard" or "lucene"
+     * - minimum_should_match: integer for Lucene mode
+     * <p>
+     * Parsing is best-effort: malformed JSON, a non-object document, or an unusable value is logged
+     * and the affected option keeps its default. Keys whose value is JSON {@code null} are treated
+     * as absent, so they no longer turn into the literal string "null".
+     *
+     * @param optionsJson raw options JSON; may be null or blank
+     * @return populated options, never null
+     */
+    private static SearchOptions parseOptions(String optionsJson) {
+        SearchOptions options = new SearchOptions();
+        if (optionsJson == null || optionsJson.trim().isEmpty()) {
+            return options;
+        }
+
+        try {
+            // Parse JSON using Jackson
+            com.fasterxml.jackson.databind.JsonNode jsonNode = JSON_MAPPER.readTree(optionsJson);
+            if (jsonNode == null || !jsonNode.isObject()) {
+                LOG.warn("Search options must be a JSON object, using defaults: {}", optionsJson);
+                return options;
+            }
+
+            // hasNonNull() skips explicit JSON nulls; asText() on a null node would yield "null".
+            if (jsonNode.hasNonNull("default_field")) {
+                options.setDefaultField(jsonNode.get("default_field").asText());
+            }
+            if (jsonNode.hasNonNull("default_operator")) {
+                options.setDefaultOperator(jsonNode.get("default_operator").asText());
+            }
+            if (jsonNode.hasNonNull("mode")) {
+                options.setMode(jsonNode.get("mode").asText());
+            }
+            if (jsonNode.hasNonNull("minimum_should_match")) {
+                com.fasterxml.jackson.databind.JsonNode msmNode = jsonNode.get("minimum_should_match");
+                Integer parsed = null;
+                if (msmNode.isNumber() && msmNode.canConvertToInt()) {
+                    parsed = msmNode.asInt();
+                } else if (msmNode.isTextual()) {
+                    try {
+                        parsed = Integer.valueOf(msmNode.asText().trim());
+                    } catch (NumberFormatException nfe) {
+                        // Leave parsed as null; reported by the warning below.
+                    }
+                }
+                if (parsed != null) {
+                    options.setMinimumShouldMatch(parsed);
+                } else {
+                    // asInt() would silently coerce this to 0 and change query semantics.
+                    LOG.warn("Ignoring non-integer minimum_should_match in search options: {}", optionsJson);
+                }
+            }
+        } catch (Exception e) {
+            LOG.warn("Failed to parse search options JSON: {}", optionsJson, e);
+        }
+
+        return options;
+    }
+
+ /**
+ * Lucene mode parsing - implements ES/Lucene-style boolean query
semantics.
+ * <p>
+ * Key differences from standard mode:
+ * - Operators are processed left-to-right as modifiers (not traditional
boolean algebra)
+ * - AND marks preceding and current terms as MUST (+)
+ * - OR marks preceding and current terms as SHOULD
+ * - NOT marks current term as MUST_NOT (-)
+ * - When minimum_should_match=0 and there are MUST clauses, SHOULD
clauses are ignored
+ * <p>
+ * Examples:
+ * - "a AND b OR c" → +a (with minimum_should_match=0, SHOULD terms
discarded)
+ * - "a AND b OR NOT c AND d" → +a -c +d
+ * - "a OR b OR c" → a b c (at least one must match)
+ */
+ private static QsPlan parseDslLuceneMode(String dsl, String defaultField,
String defaultOperator,
+ SearchOptions options) {
+ if (dsl == null || dsl.trim().isEmpty()) {
+ return new QsPlan(new QsNode(QsClauseType.TERM, "error",
"empty_dsl"), new ArrayList<>());
+ }
+
+ // Expand simplified DSL if default field is provided
+ String expandedDsl = dsl;
+ if (defaultField != null && !defaultField.trim().isEmpty()) {
+ expandedDsl = expandSimplifiedDsl(dsl.trim(), defaultField.trim(),
+ normalizeDefaultOperator(defaultOperator));
+ }
+
+ try {
+ // Create ANTLR lexer and parser
+ SearchLexer lexer = new SearchLexer(new
ANTLRInputStream(expandedDsl));
+ CommonTokenStream tokens = new CommonTokenStream(lexer);
+ SearchParser parser = new SearchParser(tokens);
+
+ // Add error listener
+ parser.removeErrorListeners();
+ parser.addErrorListener(new
org.antlr.v4.runtime.BaseErrorListener() {
+ @Override
+ public void syntaxError(org.antlr.v4.runtime.Recognizer<?, ?>
recognizer,
+ Object offendingSymbol,
+ int line, int charPositionInLine,
+ String msg, org.antlr.v4.runtime.RecognitionException
e) {
+ throw new RuntimeException("Invalid search DSL syntax at
line " + line
+ + ":" + charPositionInLine + " " + msg);
+ }
+ });
+
+ // Parse using standard parser first
+ ParseTree tree = parser.search();
+ if (tree == null) {
+ throw new RuntimeException("Invalid search DSL syntax");
+ }
+
+ // Build AST using Lucene-mode visitor
+ QsLuceneModeAstBuilder visitor = new
QsLuceneModeAstBuilder(options);
+ QsNode root = visitor.visit(tree);
+
+ // Extract field bindings
+ Set<String> fieldNames = visitor.getFieldNames();
+ List<QsFieldBinding> bindings = new ArrayList<>();
+ int slotIndex = 0;
+ for (String fieldName : fieldNames) {
+ bindings.add(new QsFieldBinding(fieldName, slotIndex++));
+ }
+
+ return new QsPlan(root, bindings);
+
+ } catch (Exception e) {
+ LOG.error("Failed to parse search DSL in Lucene mode: '{}'
(expanded: '{}')", dsl, expandedDsl, e);
+ throw new RuntimeException("Invalid search DSL syntax: " + dsl +
". Error: " + e.getMessage(), e);
+ }
+ }
+
+ /**
+ * ANTLR visitor for Lucene-mode AST building.
+ * Transforms standard boolean expressions into Lucene-style OCCUR_BOOLEAN
queries.
+ */
+ private static class QsLuceneModeAstBuilder extends
SearchParserBaseVisitor<QsNode> {
+ private final Set<String> fieldNames = new HashSet<>();
+ private final SearchOptions options;
+ private String currentFieldName = null;
+
+ public QsLuceneModeAstBuilder(SearchOptions options) {
+ this.options = options;
+ }
+
+ public Set<String> getFieldNames() {
+ return fieldNames;
+ }
+
+        /**
+         * Entry rule: visit the top-level clause and reject an empty result.
+         *
+         * @throws RuntimeException when the clause produces no node
+         */
+        @Override
+        public QsNode visitSearch(SearchParser.SearchContext ctx) {
+            QsNode root = visit(ctx.clause());
+            if (root != null) {
+                return root;
+            }
+            throw new RuntimeException("Invalid search clause");
+        }
+
+ /**
+ * Visit an OR clause by handing the whole chain to processLuceneBooleanChain.
+ */
+ @Override
+ public QsNode visitOrClause(SearchParser.OrClauseContext ctx) {
+ // The OR chain is processed as a whole so that MUST/SHOULD/MUST_NOT can be
+ // assigned from the sequence of operators rather than clause by clause.
+ return processLuceneBooleanChain(ctx);
+ }
+
+        /**
+         * Turn one OR-clause chain into a Lucene-style node.
+         * <p>
+         * Steps: flatten the chain into terms, assign an occur to each term, resolve
+         * minimum_should_match, drop SHOULD terms when they cannot affect the result, then build
+         * the node. A single positive term is returned as-is; a single negative term is wrapped
+         * in an OCCUR_BOOLEAN so the MUST_NOT is preserved.
+         *
+         * @param ctx the OR clause to process
+         * @return a plain term node or an OCCUR_BOOLEAN node
+         * @throws RuntimeException when no term is available to build a node from
+         */
+        private QsNode processLuceneBooleanChain(SearchParser.OrClauseContext ctx) {
+            List<TermWithOccur> collected = new ArrayList<>();
+            // default_operator = AND means MUST
+            collectTermsWithOperators(ctx, collected, QsOccur.MUST);
+
+            if (collected.isEmpty()) {
+                throw new RuntimeException("No terms found in boolean expression");
+            }
+            if (collected.size() == 1) {
+                TermWithOccur only = collected.get(0);
+                return only.isNegated ? wrapSingleMustNot(only.node) : only.node;
+            }
+
+            applyLuceneBooleanLogic(collected);
+
+            boolean anyMust = false;
+            boolean anyMustNot = false;
+            for (TermWithOccur entry : collected) {
+                anyMust |= entry.occur == QsOccur.MUST;
+                anyMustNot |= entry.occur == QsOccur.MUST_NOT;
+            }
+
+            // Explicit option wins; otherwise 0 when MUST/MUST_NOT exist and 1 for SHOULD-only queries.
+            Integer configured = options.getMinimumShouldMatch();
+            int minShouldMatch = configured != null ? configured : ((anyMust || anyMustNot) ? 0 : 1);
+
+            // With minimum_should_match=0 and a MUST clause present, SHOULD clauses are dropped.
+            List<TermWithOccur> kept = collected;
+            if (minShouldMatch == 0 && anyMust) {
+                kept = new ArrayList<>();
+                for (TermWithOccur entry : collected) {
+                    if (entry.occur != QsOccur.SHOULD) {
+                        kept.add(entry);
+                    }
+                }
+            }
+
+            if (kept.isEmpty()) {
+                throw new RuntimeException("All terms filtered out in Lucene boolean logic");
+            }
+            if (kept.size() == 1) {
+                TermWithOccur last = kept.get(0);
+                return last.occur == QsOccur.MUST_NOT ? wrapSingleMustNot(last.node) : last.node;
+            }
+
+            List<QsNode> children = new ArrayList<>();
+            for (TermWithOccur entry : kept) {
+                entry.node.occur = entry.occur;
+                children.add(entry.node);
+            }
+            return new QsNode(QsClauseType.OCCUR_BOOLEAN, children, minShouldMatch);
+        }
+
+        /**
+         * Wrap a lone negative term in an OCCUR_BOOLEAN with minimum_should_match=0.
+         */
+        private QsNode wrapSingleMustNot(QsNode node) {
+            node.occur = QsOccur.MUST_NOT;
+            List<QsNode> children = new ArrayList<>();
+            children.add(node);
+            return new QsNode(QsClauseType.OCCUR_BOOLEAN, children, 0);
+        }
+
+        /**
+         * Flatten an OR clause into {@code terms}, recording for each term which operator
+         * introduced it. Any other kind of parse tree is ignored.
+         *
+         * @param ctx          parse tree to flatten; only OrClauseContext is handled
+         * @param terms        output list, appended to in source order
+         * @param defaultOccur initial occur given to every collected term
+         */
+        private void collectTermsWithOperators(ParseTree ctx, List<TermWithOccur> terms, QsOccur defaultOccur) {
+            if (!(ctx instanceof SearchParser.OrClauseContext)) {
+                return;
+            }
+            // Every AND clause after the first one was reached through an OR.
+            boolean first = true;
+            for (SearchParser.AndClauseContext andClause : ((SearchParser.OrClauseContext) ctx).andClause()) {
+                collectTermsFromAndClause(andClause, terms, defaultOccur, !first);
+                first = false;
+            }
+        }
+
+        /**
+         * Collect the operands of one AND clause. Only the first operand can carry the
+         * {@code introducedByOr} flag; every later operand is marked as introduced by AND.
+         */
+        private void collectTermsFromAndClause(SearchParser.AndClauseContext ctx, List<TermWithOccur> terms,
+                QsOccur defaultOccur, boolean introducedByOr) {
+            boolean first = true;
+            for (SearchParser.NotClauseContext operand : ctx.notClause()) {
+                collectTermsFromNotClause(operand, terms, defaultOccur, first && introducedByOr, !first);
+                first = false;
+            }
+        }
+
+        /**
+         * Visit one (possibly negated) operand and append it to {@code terms} together with
+         * the operator flags it was reached with.
+         */
+        private void collectTermsFromNotClause(SearchParser.NotClauseContext ctx, List<TermWithOccur> terms,
+                QsOccur defaultOccur, boolean introducedByOr, boolean introducedByAnd) {
+            SearchParser.AtomClauseContext atom = ctx.atomClause();
+
+            // A parenthesized clause is visited recursively; anything else is a field query.
+            QsNode operand = atom.clause() != null ? visit(atom.clause()) : visit(atom.fieldQuery());
+
+            TermWithOccur entry = new TermWithOccur(operand, defaultOccur);
+            entry.introducedByOr = introducedByOr;
+            entry.introducedByAnd = introducedByAnd;
+            entry.isNegated = ctx.NOT() != null;
+            terms.add(entry);
+        }
+
+        /**
+         * Apply Lucene boolean logic to determine final MUST/SHOULD/MUST_NOT for each term.
+         * <p>
+         * Rules (processed left-to-right):
+         * 1. First term: MUST (due to default_operator=AND)
+         * 2. AND introduces: marks preceding and current as MUST
+         * 3. OR introduces: marks preceding and current as SHOULD
+         * 4. NOT modifier: marks current as MUST_NOT
+         * 5. A preceding MUST_NOT term is never changed by a later AND/OR
+         * <p>
+         * The conjunction's effect on the preceding term is applied independently of the NOT
+         * modifier on the current term, so "a OR b AND NOT c" yields a +b -c. Previously the
+         * "AND NOT" case left the preceding term untouched, which contradicted rule 2.
+         *
+         * @param terms terms in source order; their {@code occur} fields are updated in place
+         */
+        private void applyLuceneBooleanLogic(List<TermWithOccur> terms) {
+            for (int i = 0; i < terms.size(); i++) {
+                TermWithOccur current = terms.get(i);
+
+                // Step 1: the conjunction adjusts the preceding term unless it is prohibited.
+                if (i > 0) {
+                    TermWithOccur prev = terms.get(i - 1);
+                    if (prev.occur != QsOccur.MUST_NOT) {
+                        if (current.introducedByAnd) {
+                            prev.occur = QsOccur.MUST;
+                        } else if (current.introducedByOr) {
+                            prev.occur = QsOccur.SHOULD;
+                        }
+                    }
+                }
+
+                // Step 2: the modifier and conjunction decide the current term.
+                if (current.isNegated) {
+                    current.occur = QsOccur.MUST_NOT;
+                } else if (current.introducedByOr) {
+                    current.occur = QsOccur.SHOULD;
+                } else {
+                    // Introduced by AND, or the first term (default_operator=AND).
+                    current.occur = QsOccur.MUST;
+                }
+            }
+        }
+
+        /**
+         * Visit an AND clause directly. A single operand is returned unchanged; several
+         * operands are combined into a standard AND node.
+         */
+        @Override
+        public QsNode visitAndClause(SearchParser.AndClauseContext ctx) {
+            List<SearchParser.NotClauseContext> operands = ctx.notClause();
+            if (operands.size() == 1) {
+                return visit(operands.get(0));
+            }
+
+            // Operands that produce no node are skipped.
+            List<QsNode> visited = new ArrayList<>();
+            for (SearchParser.NotClauseContext operand : operands) {
+                QsNode node = visit(operand);
+                if (node == null) {
+                    continue;
+                }
+                visited.add(node);
+            }
+
+            return visited.size() == 1 ? visited.get(0) : new QsNode(QsClauseType.AND, visited);
+        }
+
+        /**
+         * Visit a NOT clause. Without the NOT token the atom is returned as-is; with it the
+         * atom is wrapped in a NOT node.
+         *
+         * @throws RuntimeException when a negated atom produces no node
+         */
+        @Override
+        public QsNode visitNotClause(SearchParser.NotClauseContext ctx) {
+            if (ctx.NOT() == null) {
+                return visit(ctx.atomClause());
+            }
+
+            QsNode operand = visit(ctx.atomClause());
+            if (operand == null) {
+                throw new RuntimeException("Invalid NOT clause: missing operand");
+            }
+            List<QsNode> wrapped = new ArrayList<>();
+            wrapped.add(operand);
+            return new QsNode(QsClauseType.NOT, wrapped);
+        }
+
+        /**
+         * Visit an atom: the nested clause when one is present, otherwise the field query.
+         */
+        @Override
+        public QsNode visitAtomClause(SearchParser.AtomClauseContext ctx) {
+            return ctx.clause() != null ? visit(ctx.clause()) : visit(ctx.fieldQuery());
+        }
+
+ @Override
+ public QsNode visitFieldQuery(SearchParser.FieldQueryContext ctx) {
+ // Build complete field path
+ StringBuilder fullPath = new StringBuilder();
+ List<SearchParser.FieldSegmentContext> segments =
ctx.fieldPath().fieldSegment();
+
+ for (int i = 0; i < segments.size(); i++) {
+ if (i > 0) {
+ fullPath.append('.');
+ }
+ String segment = segments.get(i).getText();
+ if (segment.startsWith("\"") && segment.endsWith("\"")) {
+ segment = segment.substring(1, segment.length() - 1);
+ }
+ fullPath.append(segment);
+ }
+
+ String fieldPath = fullPath.toString();
+ fieldNames.add(fieldPath);
+
+ String previousFieldName = currentFieldName;
+ currentFieldName = fieldPath;
+
+ try {
+ return visit(ctx.searchValue());
+ } finally {
+ currentFieldName = previousFieldName;
+ }
+ }
+
+ @Override
+ public QsNode visitSearchValue(SearchParser.SearchValueContext ctx) {
+ String fieldName = currentFieldName != null ? currentFieldName :
"_all";
+
+ if (ctx.TERM() != null) {
+ return new QsNode(QsClauseType.TERM, fieldName,
unescapeTermValue(ctx.TERM().getText()));
+ }
+ if (ctx.PREFIX() != null) {
+ return new QsNode(QsClauseType.PREFIX, fieldName,
unescapeTermValue(ctx.PREFIX().getText()));
+ }
+ if (ctx.WILDCARD() != null) {
+ return new QsNode(QsClauseType.WILDCARD, fieldName,
unescapeTermValue(ctx.WILDCARD().getText()));
+ }
+ if (ctx.REGEXP() != null) {
+ String regexp = ctx.REGEXP().getText();
+ if (regexp.startsWith("/") && regexp.endsWith("/")) {
+ regexp = regexp.substring(1, regexp.length() - 1);
+ }
+ return new QsNode(QsClauseType.REGEXP, fieldName, regexp);
+ }
+ if (ctx.QUOTED() != null) {
+ String quoted = ctx.QUOTED().getText();
+ if (quoted.startsWith("\"") && quoted.endsWith("\"")) {
+ quoted = quoted.substring(1, quoted.length() - 1);
+ }
+ return new QsNode(QsClauseType.PHRASE, fieldName, quoted);
+ }
+ if (ctx.rangeValue() != null) {
+ SearchParser.RangeValueContext rangeCtx = ctx.rangeValue();
+ String rangeText;
+ if (rangeCtx.LBRACKET() != null) {
+ rangeText = "[" + rangeCtx.rangeEndpoint(0).getText() + "
TO "
+ + rangeCtx.rangeEndpoint(1).getText() + "]";
+ } else {
+ rangeText = "{" + rangeCtx.rangeEndpoint(0).getText() + "
TO "
+ + rangeCtx.rangeEndpoint(1).getText() + "}";
+ }
+ return new QsNode(QsClauseType.RANGE, fieldName, rangeText);
+ }
+ if (ctx.listValue() != null) {
+ StringBuilder listText = new StringBuilder("IN(");
+ for (int i = 0; i < ctx.listValue().LIST_TERM().size(); i++) {
+ if (i > 0) {
+ listText.append(" ");
+ }
+ listText.append(ctx.listValue().LIST_TERM(i).getText());
+ }
+ listText.append(")");
+ return new QsNode(QsClauseType.LIST, fieldName,
listText.toString());
+ }
+ if (ctx.anyAllValue() != null) {
+ String text = ctx.anyAllValue().getText();
+ String innerContent = extractParenthesesContent(text);
+ String sanitizedContent = stripOuterQuotes(innerContent);
+ if (text.toUpperCase().startsWith("ANY(")) {
+ return new QsNode(QsClauseType.ANY, fieldName,
sanitizedContent);
+ }
+ return new QsNode(QsClauseType.ALL, fieldName,
sanitizedContent);
+ }
+ if (ctx.exactValue() != null) {
+ String innerContent =
extractParenthesesContent(ctx.exactValue().getText());
+ return new QsNode(QsClauseType.EXACT, fieldName, innerContent);
+ }
+
+ return new QsNode(QsClauseType.TERM, fieldName,
unescapeTermValue(ctx.getText()));
+ }
+
+        /**
+         * Return the trimmed text between the first '(' and the last ')'.
+         *
+         * @param text text such as "ANY(a b)"
+         * @return the inner content, or "" when no such pair of parentheses exists
+         */
+        private String extractParenthesesContent(String text) {
+            int open = text.indexOf('(');
+            int close = text.lastIndexOf(')');
+            boolean wellFormed = open >= 0 && close > open;
+            return wellFormed ? text.substring(open + 1, close).trim() : "";
+        }
+
+        /**
+         * Remove one pair of matching outer quotes (double or single) if present.
+         *
+         * @param text candidate text; null and texts shorter than two characters are returned unchanged
+         * @return the text without its outer quotes, or the input when it is not quoted
+         */
+        private String stripOuterQuotes(String text) {
+            if (text == null || text.length() < 2) {
+                return text;
+            }
+            char head = text.charAt(0);
+            char tail = text.charAt(text.length() - 1);
+            boolean quoted = head == tail && (head == '"' || head == '\'');
+            if (!quoted) {
+                return text;
+            }
+            return text.substring(1, text.length() - 1);
+        }
+ }
+
+ /**
+ * Mutable holder that pairs a parsed node with its occur status and the
+ * operator flags recorded while the boolean chain is being flattened.
+ */
+ private static class TermWithOccur {
+ // Parsed node for this term (a field query or a parenthesized clause).
+ QsNode node;
+ // Occur assigned to the term; starts as the value passed to the constructor.
+ QsOccur occur;
+ // True when the term is the first operand after an OR.
+ boolean introducedByOr = false;
+ // True when the term follows an AND inside the same AND clause.
+ boolean introducedByAnd = false;
+ // True when the term is preceded by NOT.
+ boolean isNegated = false;
+
+ TermWithOccur(QsNode node, QsOccur occur) {
+ this.node = node;
+ this.occur = occur;
+ }
+ }
+
+    /**
+     * Resolve backslash escape sequences in a term value.
+     * A backslash followed by any character is replaced by that character, for example:
+     * - "\ " (backslash space) -> " "
+     * - "\(" -> "(" and "\)" -> ")"
+     * - "\:" -> ":"
+     * - "\\" -> "\"
+     * - "\*" -> "*" and "\?" -> "?"
+     * A backslash that is the last character has nothing to escape and is kept.
+     *
+     * @param value the raw term value, possibly containing escape sequences
+     * @return the unescaped value; null and empty inputs are returned unchanged
+     */
+    private static String unescapeTermValue(String value) {
+        // Fast path: nothing to do without a backslash.
+        if (value == null || value.isEmpty() || value.indexOf('\\') < 0) {
+            return value;
+        }
+
+        int length = value.length();
+        StringBuilder unescaped = new StringBuilder(length);
+        for (int pos = 0; pos < length; pos++) {
+            char ch = value.charAt(pos);
+            if (ch == '\\' && pos + 1 < length) {
+                // Drop the backslash and emit the escaped character literally.
+                ch = value.charAt(++pos);
+            }
+            unescaped.append(ch);
+        }
+        return unescaped.toString();
+    }
}
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParserTest.java
b/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParserTest.java
index eb1bf3f5d3a..6279aead20a 100644
---
a/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParserTest.java
+++
b/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParserTest.java
@@ -20,6 +20,7 @@ package
org.apache.doris.nereids.trees.expressions.functions.scalar;
import
org.apache.doris.nereids.trees.expressions.functions.scalar.SearchDslParser.QsClauseType;
import
org.apache.doris.nereids.trees.expressions.functions.scalar.SearchDslParser.QsFieldBinding;
import
org.apache.doris.nereids.trees.expressions.functions.scalar.SearchDslParser.QsNode;
+import
org.apache.doris.nereids.trees.expressions.functions.scalar.SearchDslParser.QsOccur;
import
org.apache.doris.nereids.trees.expressions.functions.scalar.SearchDslParser.QsPlan;
import org.junit.jupiter.api.Assertions;
@@ -577,4 +578,351 @@ public class SearchDslParserTest {
Assertions.assertEquals("tags", plan.fieldBindings.get(0).fieldName);
Assertions.assertEquals(0, plan.fieldBindings.get(0).slotIndex);
}
+
+ // ============ Tests for Lucene Mode Parsing ============
+
+ @Test
+ public void testLuceneModeSimpleAndQuery() {
+ // Test: "a AND b" in Lucene mode → both MUST
+ String dsl = "field:a AND field:b";
+ String options = "{\"mode\":\"lucene\",\"minimum_should_match\":0}";
+ QsPlan plan = SearchDslParser.parseDsl(dsl, options);
+
+ Assertions.assertNotNull(plan);
+ Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN, plan.root.type);
+ Assertions.assertEquals(2, plan.root.children.size());
+ Assertions.assertEquals(Integer.valueOf(0),
plan.root.minimumShouldMatch);
+
+ // Both children should have MUST occur
+ for (QsNode child : plan.root.children) {
+ Assertions.assertEquals(SearchDslParser.QsOccur.MUST, child.occur);
+ }
+ }
+
+ @Test
+ public void testLuceneModeSimpleOrQuery() {
+ // Test: "a OR b OR c" in Lucene mode → all SHOULD, at least one must
match
+ String dsl = "field:a OR field:b OR field:c";
+ String options = "{\"mode\":\"lucene\"}";
+ QsPlan plan = SearchDslParser.parseDsl(dsl, options);
+
+ Assertions.assertNotNull(plan);
+ Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN, plan.root.type);
+ Assertions.assertEquals(3, plan.root.children.size());
+
+ // All children should have SHOULD occur
+ for (QsNode child : plan.root.children) {
+ Assertions.assertEquals(SearchDslParser.QsOccur.SHOULD,
child.occur);
+ }
+
+ // options omit minimum_should_match; an OR-only query is expected to get 1
+ Assertions.assertEquals(Integer.valueOf(1),
plan.root.minimumShouldMatch);
+ }
+
+ @Test
+ public void testLuceneModeAndOrMixed() {
+ // Test: "a AND b OR c" in Lucene mode with minimum_should_match=0
+ // Expected: +a (b and c end up SHOULD and are discarded because MUST exists)
+ String dsl = "field:a AND field:b OR field:c";
+ String options = "{\"mode\":\"lucene\",\"minimum_should_match\":0}";
+ QsPlan plan = SearchDslParser.parseDsl(dsl, options);
+
+ Assertions.assertNotNull(plan);
+ // With minimum_should_match=0 and MUST clauses present, SHOULD is
discarded
+ // Only "a" remains, so the root is the bare TERM rather than an OCCUR_BOOLEAN
+ Assertions.assertEquals(QsClauseType.TERM, plan.root.type);
+ Assertions.assertEquals("field", plan.root.field);
+ Assertions.assertEquals("a", plan.root.value);
+ }
+
+ @Test
+ public void testLuceneModeAndOrNotMixed() {
+ // Test: "a AND b OR NOT c AND d" in Lucene mode
+ // Expected processing:
+ // - a: MUST (first term, default_operator=AND)
+ // - b: MUST (AND introduces), then demoted to SHOULD by the following OR
+ // - c: MUST_NOT (OR + NOT, but OR makes preceding SHOULD, NOT makes
current MUST_NOT)
+ // - d: MUST (AND introduces)
+ // With minimum_should_match=0 and MUST clauses present, SHOULD b is discarded
+ // Result: +a -c +d
+ String dsl = "field:a AND field:b OR NOT field:c AND field:d";
+ String options = "{\"mode\":\"lucene\",\"minimum_should_match\":0}";
+ QsPlan plan = SearchDslParser.parseDsl(dsl, options);
+
+ Assertions.assertNotNull(plan);
+ Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN, plan.root.type);
+
+ // Should have 3 children: a(MUST), c(MUST_NOT), d(MUST)
+ // b is filtered out because it becomes SHOULD
+ Assertions.assertEquals(3, plan.root.children.size());
+
+ QsNode nodeA = plan.root.children.get(0);
+ Assertions.assertEquals("a", nodeA.value);
+ Assertions.assertEquals(SearchDslParser.QsOccur.MUST, nodeA.occur);
+
+ QsNode nodeC = plan.root.children.get(1);
+ Assertions.assertEquals("c", nodeC.value);
+ Assertions.assertEquals(SearchDslParser.QsOccur.MUST_NOT, nodeC.occur);
+
+ QsNode nodeD = plan.root.children.get(2);
+ Assertions.assertEquals("d", nodeD.value);
+ Assertions.assertEquals(SearchDslParser.QsOccur.MUST, nodeD.occur);
+ }
+
+ @Test
+ public void testLuceneModeWithDefaultField() {
+ // Test: bare terms (no field prefix) are bound to default_field in Lucene mode
+ String dsl = "aterm AND bterm OR cterm";
+ // default_field and default_operator are passed inside the options JSON
+ String options =
"{\"default_field\":\"firstname\",\"default_operator\":\"and\","
+ + "\"mode\":\"lucene\",\"minimum_should_match\":0}";
+ QsPlan plan = SearchDslParser.parseDsl(dsl, options);
+
+ Assertions.assertNotNull(plan);
+ // With minimum_should_match=0, only aterm (MUST) remains
+ Assertions.assertEquals(QsClauseType.TERM, plan.root.type);
+ Assertions.assertEquals("firstname", plan.root.field);
+ Assertions.assertEquals("aterm", plan.root.value);
+ }
+
+ @Test
+ public void testLuceneModeNotOperator() {
+ // Test: "NOT a" in Lucene mode
+ // In Lucene mode, single NOT produces OCCUR_BOOLEAN with a MUST_NOT
child
+ // (wrapped for BE to handle the negation properly)
+ String dsl = "NOT field:a";
+ String options = "{\"mode\":\"lucene\"}";
+ QsPlan plan = SearchDslParser.parseDsl(dsl, options);
+
+ Assertions.assertNotNull(plan);
+ Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN, plan.root.type);
+ Assertions.assertEquals(1, plan.root.children.size());
+ Assertions.assertEquals(QsClauseType.TERM,
plan.root.children.get(0).type);
+ Assertions.assertEquals(QsOccur.MUST_NOT,
plan.root.children.get(0).occur);
+ }
+
+ @Test
+ public void testLuceneModeMinimumShouldMatchExplicit() {
+ // Test: explicit minimum_should_match=1 keeps SHOULD clauses
+ String dsl = "field:a AND field:b OR field:c";
+ String options = "{\"mode\":\"lucene\",\"minimum_should_match\":1}";
+ QsPlan plan = SearchDslParser.parseDsl(dsl, options);
+
+ Assertions.assertNotNull(plan);
+ Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN, plan.root.type);
+ // All 3 terms should be present
+ Assertions.assertEquals(3, plan.root.children.size());
+ Assertions.assertEquals(Integer.valueOf(1),
plan.root.minimumShouldMatch);
+ }
+
+ @Test
+ public void testLuceneModeSingleTerm() {
+ // Test: single term should not create OCCUR_BOOLEAN wrapper
+ String dsl = "field:hello";
+ String options = "{\"mode\":\"lucene\"}";
+ QsPlan plan = SearchDslParser.parseDsl(dsl, options);
+
+ Assertions.assertNotNull(plan);
+ Assertions.assertEquals(QsClauseType.TERM, plan.root.type);
+ Assertions.assertEquals("field", plan.root.field);
+ Assertions.assertEquals("hello", plan.root.value);
+ }
+
+ @Test
+ public void testStandardModeUnchanged() {
+ // Test: standard mode (default) should work as before
+ String dsl = "field:a AND field:b OR field:c";
+ QsPlan plan = SearchDslParser.parseDsl(dsl, (String) null);
+
+ Assertions.assertNotNull(plan);
+ // Standard mode uses traditional boolean algebra: OR at top level
+ Assertions.assertEquals(QsClauseType.OR, plan.root.type);
+ }
+
+ @Test
+ public void testLuceneModeInvalidJson() {
+ // Test: invalid JSON options should fall back to standard mode
+ String dsl = "field:a AND field:b";
+ String options = "not valid json";
+ QsPlan plan = SearchDslParser.parseDsl(dsl, options);
+
+ Assertions.assertNotNull(plan);
+ // Should fall back to standard mode (AND type)
+ Assertions.assertEquals(QsClauseType.AND, plan.root.type);
+ }
+
+ @Test
+ public void testLuceneModeEmptyOptions() {
+ // Test: empty options string should use standard mode
+ String dsl = "field:a AND field:b";
+ QsPlan plan = SearchDslParser.parseDsl(dsl, "");
+
+ Assertions.assertNotNull(plan);
+ Assertions.assertEquals(QsClauseType.AND, plan.root.type);
+ }
+
+ // ============ Tests for Escape Handling ============
+
+ @Test
+ public void testEscapedSpaceInTerm() {
+ // Test: "First\ Value" should be treated as a single term "First
Value"
+ // The escape sequence is processed: \ + space -> space
+ String dsl = "field:First\\ Value";
+ QsPlan plan = SearchDslParser.parseDsl(dsl);
+
+ Assertions.assertNotNull(plan);
+ Assertions.assertEquals(QsClauseType.TERM, plan.root.type);
+ Assertions.assertEquals("field", plan.root.field);
+ // After unescape: "First\ Value" -> "First Value"
+ Assertions.assertEquals("First Value", plan.root.value);
+ }
+
+ @Test
+ public void testEscapedParentheses() {
+ // Test: \( and \) should be treated as literal characters, not
grouping
+ // The escape sequence is processed: \( -> ( and \) -> )
+ String dsl = "field:hello\\(world\\)";
+ QsPlan plan = SearchDslParser.parseDsl(dsl);
+
+ Assertions.assertNotNull(plan);
+ Assertions.assertEquals(QsClauseType.TERM, plan.root.type);
+ Assertions.assertEquals("field", plan.root.field);
+ // After unescape: "hello\(world\)" -> "hello(world)"
+ Assertions.assertEquals("hello(world)", plan.root.value);
+ }
+
+ @Test
+ public void testEscapedColon() {
+ // Test: \: should be treated as literal colon, not field separator
+ // The escape sequence is processed: \: -> :
+ String dsl = "field:value\\:with\\:colons";
+ QsPlan plan = SearchDslParser.parseDsl(dsl);
+
+ Assertions.assertNotNull(plan);
+ Assertions.assertEquals(QsClauseType.TERM, plan.root.type);
+ Assertions.assertEquals("field", plan.root.field);
+ // After unescape: "value\:with\:colons" -> "value:with:colons"
+ Assertions.assertEquals("value:with:colons", plan.root.value);
+ }
+
+ @Test
+ public void testEscapedBackslash() {
+ // Test: \\ should be treated as a literal backslash
+ // The escape sequence is processed: \\ -> \
+ String dsl = "field:path\\\\to\\\\file";
+ QsPlan plan = SearchDslParser.parseDsl(dsl);
+
+ Assertions.assertNotNull(plan);
+ Assertions.assertEquals(QsClauseType.TERM, plan.root.type);
+ Assertions.assertEquals("field", plan.root.field);
+ // After unescape: "path\\to\\file" -> "path\to\file"
+ Assertions.assertEquals("path\\to\\file", plan.root.value);
+ }
+
+ @Test
+ public void testUppercaseAndOperator() {
+ // Test: uppercase AND should be treated as operator
+ String dsl = "field:a AND field:b";
+ QsPlan plan = SearchDslParser.parseDsl(dsl);
+
+ Assertions.assertNotNull(plan);
+ Assertions.assertEquals(QsClauseType.AND, plan.root.type);
+ Assertions.assertEquals(2, plan.root.children.size());
+ }
+
+ @Test
+ public void testLowercaseAndOperator() {
+ // Test: Currently lowercase 'and' is also treated as operator
+ // According to PDF requirement, only uppercase should be operators
+ // This test documents current behavior - may need to change
+ String dsl = "field:a and field:b";
+ QsPlan plan = SearchDslParser.parseDsl(dsl);
+
+ Assertions.assertNotNull(plan);
+ // Current behavior: lowercase 'and' IS an operator
+ Assertions.assertEquals(QsClauseType.AND, plan.root.type);
+ // TODO: If PDF requires only uppercase, this should fail and return
OR or different structure
+ }
+
+ @Test
+ public void testUppercaseOrOperator() {
+ // Test: uppercase OR should be treated as operator
+ String dsl = "field:a OR field:b";
+ QsPlan plan = SearchDslParser.parseDsl(dsl);
+
+ Assertions.assertNotNull(plan);
+ Assertions.assertEquals(QsClauseType.OR, plan.root.type);
+ Assertions.assertEquals(2, plan.root.children.size());
+ }
+
+ @Test
+ public void testLowercaseOrOperator() {
+ // Test: Currently lowercase 'or' is also treated as operator
+ // According to PDF requirement, only uppercase should be operators
+ String dsl = "field:a or field:b";
+ QsPlan plan = SearchDslParser.parseDsl(dsl);
+
+ Assertions.assertNotNull(plan);
+ // Current behavior: lowercase 'or' IS an operator
+ Assertions.assertEquals(QsClauseType.OR, plan.root.type);
+ // TODO: If PDF requires only uppercase, this should fail
+ }
+
+ @Test
+ public void testUppercaseNotOperator() {
+ // Test: uppercase NOT should be treated as operator
+ String dsl = "NOT field:spam";
+ QsPlan plan = SearchDslParser.parseDsl(dsl);
+
+ Assertions.assertNotNull(plan);
+ Assertions.assertEquals(QsClauseType.NOT, plan.root.type);
+ }
+
+ @Test
+ public void testLowercaseNotOperator() {
+ // Test: Currently lowercase 'not' is also treated as operator
+ // According to PDF requirement, only uppercase should be operators
+ String dsl = "not field:spam";
+ QsPlan plan = SearchDslParser.parseDsl(dsl);
+
+ Assertions.assertNotNull(plan);
+ // Current behavior: lowercase 'not' IS an operator
+ Assertions.assertEquals(QsClauseType.NOT, plan.root.type);
+ // TODO: If PDF requires only uppercase, this should fail
+ }
+
+ @Test
+ public void testExclamationNotOperator() {
+ // Test: ! should be treated as NOT operator
+ String dsl = "!field:spam";
+ QsPlan plan = SearchDslParser.parseDsl(dsl);
+
+ Assertions.assertNotNull(plan);
+ // Current behavior: ! IS a NOT operator
+ Assertions.assertEquals(QsClauseType.NOT, plan.root.type);
+ }
+
+ @Test
+ public void testEscapedSpecialCharactersInQuoted() {
+ // Test: an escaped quote (\") inside a quoted string does not end the phrase
+ // Note: For PHRASE queries the escape is NOT processed here; the value keeps
+ // the backslash (hello\"world). The backend will handle phrase escape processing
+ String dsl = "field:\"hello\\\"world\"";
+ QsPlan plan = SearchDslParser.parseDsl(dsl);
+
+ Assertions.assertNotNull(plan);
+ Assertions.assertEquals(QsClauseType.PHRASE, plan.root.type);
+ Assertions.assertEquals("hello\\\"world", plan.root.value);
+ }
+
+ @Test
+ public void testNoEscapeWithoutBackslash() {
+ // Test: normal term without escape characters
+ String dsl = "field:normalterm";
+ QsPlan plan = SearchDslParser.parseDsl(dsl);
+
+ Assertions.assertNotNull(plan);
+ Assertions.assertEquals(QsClauseType.TERM, plan.root.type);
+ Assertions.assertEquals("normalterm", plan.root.value);
+ }
}
diff --git a/gensrc/thrift/Exprs.thrift b/gensrc/thrift/Exprs.thrift
index 137818634cb..2bca8c2b877 100644
--- a/gensrc/thrift/Exprs.thrift
+++ b/gensrc/thrift/Exprs.thrift
@@ -238,11 +238,21 @@ struct TSchemaChangeExpr {
}
// Search DSL parameter structure
+
+// Occur type for Lucene-style boolean queries
+enum TSearchOccur {
+ MUST = 0, // Term must appear (equivalent to +term)
+ SHOULD = 1, // Term should appear (optional, but contributes to matching)
+ MUST_NOT = 2 // Term must not appear (equivalent to -term)
+}
+
struct TSearchClause {
- 1: required string clause_type // TERM, QUOTED, PREFIX, WILDCARD, REGEXP,
RANGE, LIST, ANY_ALL, AND, OR, NOT
+ 1: required string clause_type // TERM, QUOTED, PREFIX, WILDCARD, REGEXP,
RANGE, LIST, ANY_ALL, AND, OR, NOT, OCCUR_BOOLEAN
2: optional string field_name // Field name for leaf clauses
3: optional string value // Search value for leaf clauses
- 4: optional list<TSearchClause> children // Child clauses for compound
clauses (AND, OR, NOT)
+ 4: optional list<TSearchClause> children // Child clauses for compound
clauses (AND, OR, NOT, OCCUR_BOOLEAN)
+ 5: optional TSearchOccur occur // Occur type for this clause (used with
OCCUR_BOOLEAN parent)
+ 6: optional i32 minimum_should_match // Minimum number of SHOULD clauses
that must match (for OCCUR_BOOLEAN)
}
struct TSearchFieldBinding {
diff --git a/regression-test/data/search/test_search_escape.out
b/regression-test/data/search/test_search_escape.out
new file mode 100644
index 00000000000..09bd9f80b2b
--- /dev/null
+++ b/regression-test/data/search/test_search_escape.out
@@ -0,0 +1,46 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !escape_space --
+1 First Value
+
+-- !phrase_query --
+1 First Value
+
+-- !escape_parentheses --
+3 hello(world)
+
+-- !escape_colon --
+5 key:value
+
+-- !escape_backslash --
+6 path\\to\\file
+
+-- !uppercase_and --
+7 first fruit
+
+-- !uppercase_or --
+1 first content
+2 second content
+7 first fruit
+8 second fruit
+
+-- !uppercase_not --
+8 second fruit
+
+-- !lowercase_and --
+7 first fruit
+
+-- !lowercase_or --
+1 first content
+2 second content
+7 first fruit
+8 second fruit
+
+-- !exclamation_not --
+8 second fruit
+
+-- !default_field_escape --
+1 First Value
+
+-- !lucene_mode_escape --
+1 First Value
+
diff --git a/regression-test/data/search/test_search_lucene_mode.out
b/regression-test/data/search/test_search_lucene_mode.out
new file mode 100644
index 00000000000..68d8e6c1279
--- /dev/null
+++ b/regression-test/data/search/test_search_lucene_mode.out
@@ -0,0 +1,86 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !standard_and --
+1 apple banana cherry
+2 apple banana
+
+-- !lucene_and --
+1 apple banana cherry
+2 apple banana
+
+-- !standard_or --
+1 apple banana cherry
+2 apple banana
+3 apple
+5 cherry date
+6 date elderberry
+8 apple fig
+
+-- !lucene_or --
+1 apple banana cherry
+2 apple banana
+3 apple
+5 cherry date
+6 date elderberry
+8 apple fig
+
+-- !lucene_complex_and_or --
+1 apple banana cherry
+2 apple banana
+3 apple
+8 apple fig
+
+-- !lucene_min_should_match_1 --
+1 apple banana cherry
+2 apple banana
+
+-- !lucene_not --
+
+-- !lucene_and_not --
+3 apple
+8 apple fig
+
+-- !lucene_or_not --
+3 apple
+8 apple fig
+
+-- !lucene_or_only --
+1 apple banana cherry
+2 apple banana
+3 apple
+5 cherry date
+6 date elderberry
+7 fig grape
+8 apple fig
+
+-- !lucene_cross_field --
+1 apple banana cherry fruit
+2 apple banana fruit
+3 apple fruit
+
+-- !standard_cross_field --
+1 apple banana cherry fruit
+2 apple banana fruit
+3 apple fruit
+
+-- !lucene_phrase --
+1 apple banana cherry
+2 apple banana
+
+-- !lucene_wildcard --
+1 apple banana cherry
+2 apple banana
+
+-- !standard_unchanged --
+1 apple banana cherry
+2 apple banana
+
+-- !empty_options --
+1 apple banana cherry
+2 apple banana
+
+-- !lucene_min_should_match_0 --
+1 apple banana cherry
+2 apple banana
+3 apple
+8 apple fig
+
diff --git
a/regression-test/suites/search/test_search_default_field_operator.groovy
b/regression-test/suites/search/test_search_default_field_operator.groovy
index fd5c7ce6198..23082586235 100644
--- a/regression-test/suites/search/test_search_default_field_operator.groovy
+++ b/regression-test/suites/search/test_search_default_field_operator.groovy
@@ -52,41 +52,41 @@ suite("test_search_default_field_operator") {
// ============ Test 1: Wildcard Prefix with Default Field ============
// Requirement: firstname EQ Chris*
- // SQL: search('Chris*', 'firstname')
+ // SQL: search('Chris*', '{"default_field":"firstname"}')
// Expected: Chris (1), Christopher (2)
// Note: Without parser, inverted index is case-sensitive
qt_wildcard_prefix """
SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname
FROM ${tableName}
- WHERE search('Chris*', 'firstname')
+ WHERE search('Chris*', '{"default_field":"firstname"}')
ORDER BY id
"""
// ============ Test 2: Multi-term AND with Default Operator ============
// Requirement: tags EQ foo bar (with AND semantics)
- // SQL: search('foo bar', 'tags', 'and')
+ // SQL: search('foo bar',
'{"default_field":"tags","default_operator":"and"}')
// Expected: 'foo bar' (1), 'bar foo' (3)
qt_multi_term_and """
SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags
FROM ${tableName}
- WHERE search('foo bar', 'tags', 'and')
+ WHERE search('foo bar',
'{"default_field":"tags","default_operator":"and"}')
ORDER BY id
"""
// ============ Test 3: Multi-term OR with Default Operator ============
// Requirement: tags EQ foo OR bark (with OR semantics)
- // SQL: search('foo bark', 'tags', 'or')
+ // SQL: search('foo bark',
'{"default_field":"tags","default_operator":"or"}')
// Expected: 'foo bar' (1), 'bar foo' (3), 'foolish bark' (4)
qt_multi_term_or """
SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags
FROM ${tableName}
- WHERE search('foo bark', 'tags', 'or')
+ WHERE search('foo bark',
'{"default_field":"tags","default_operator":"or"}')
ORDER BY id
"""
// ============ Test 4: Multi-wildcard AND ============
// Requirement: tags EQ foo* bar* (with AND semantics)
- // SQL: search('foo* bar*', 'tags', 'and')
+ // SQL: search('foo* bar*',
'{"default_field":"tags","default_operator":"and"}')
// Expands to: tags:foo* AND tags:bar*
// Expected: rows with tokens matching foo* AND tokens matching bar*
// - 'foo bar' (1): tokens=['foo','bar'] - matches foo* ✓ and bar* ✓
@@ -96,29 +96,29 @@ suite("test_search_default_field_operator") {
qt_wildcard_multi_and """
SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags
FROM ${tableName}
- WHERE search('foo* bar*', 'tags', 'and')
+ WHERE search('foo* bar*',
'{"default_field":"tags","default_operator":"and"}')
ORDER BY id
"""
// ============ Test 5: Explicit OR operator overrides default ============
- // SQL: search('foo OR bark', 'tags', 'and')
+ // SQL: search('foo OR bark',
'{"default_field":"tags","default_operator":"and"}')
// The explicit OR in DSL should override the default 'and' operator
// Expected: 'foo bar' (1), 'bar foo' (3), 'foolish bark' (4)
qt_explicit_or_override """
SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags
FROM ${tableName}
- WHERE search('foo OR bark', 'tags', 'and')
+ WHERE search('foo OR bark',
'{"default_field":"tags","default_operator":"and"}')
ORDER BY id
"""
// ============ Test 6: EXACT function with default field ============
// Requirement: EXACT(foo bar) on tags_exact field (no tokenization)
- // SQL: search('EXACT(foo bar)', 'tags_exact')
+ // SQL: search('EXACT(foo bar)', '{"default_field":"tags_exact"}')
// Expected: 'foo bar' (1) only - exact string match
qt_exact_function """
SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags_exact
FROM ${tableName}
- WHERE search('EXACT(foo bar)', 'tags_exact')
+ WHERE search('EXACT(foo bar)', '{"default_field":"tags_exact"}')
ORDER BY id
"""
@@ -135,7 +135,7 @@ suite("test_search_default_field_operator") {
qt_single_term """
SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags
FROM ${tableName}
- WHERE search('bar', 'tags')
+ WHERE search('bar', '{"default_field":"tags"}')
ORDER BY id
"""
@@ -143,7 +143,7 @@ suite("test_search_default_field_operator") {
qt_wildcard_middle """
SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname
FROM ${tableName}
- WHERE search('*ris*', 'firstname')
+ WHERE search('*ris*', '{"default_field":"firstname"}')
ORDER BY id
"""
@@ -153,7 +153,7 @@ suite("test_search_default_field_operator") {
qt_case_sensitive """
SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname
FROM ${tableName}
- WHERE search('CHRIS*', 'firstname')
+ WHERE search('CHRIS*', '{"default_field":"firstname"}')
ORDER BY id
"""
@@ -161,7 +161,7 @@ suite("test_search_default_field_operator") {
qt_default_or """
SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags
FROM ${tableName}
- WHERE search('foo bark', 'tags')
+ WHERE search('foo bark', '{"default_field":"tags"}')
ORDER BY id
"""
@@ -169,7 +169,7 @@ suite("test_search_default_field_operator") {
qt_any_function """
SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags
FROM ${tableName}
- WHERE search('ANY(foo bark)', 'tags')
+ WHERE search('ANY(foo bark)', '{"default_field":"tags"}')
ORDER BY id
"""
@@ -177,7 +177,7 @@ suite("test_search_default_field_operator") {
qt_all_function """
SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags
FROM ${tableName}
- WHERE search('ALL(foo bar)', 'tags')
+ WHERE search('ALL(foo bar)', '{"default_field":"tags"}')
ORDER BY id
"""
@@ -185,7 +185,7 @@ suite("test_search_default_field_operator") {
qt_complex_wildcard """
SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname
FROM ${tableName}
- WHERE search('?evin', 'firstname')
+ WHERE search('?evin', '{"default_field":"firstname"}')
ORDER BY id
"""
@@ -193,7 +193,7 @@ suite("test_search_default_field_operator") {
qt_explicit_and """
SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags
FROM ${tableName}
- WHERE search('foo AND bar', 'tags')
+ WHERE search('foo AND bar', '{"default_field":"tags"}')
ORDER BY id
"""
@@ -209,19 +209,19 @@ suite("test_search_default_field_operator") {
qt_not_operator """
SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags
FROM ${tableName}
- WHERE search('NOT foobar', 'tags')
+ WHERE search('NOT foobar', '{"default_field":"tags"}')
ORDER BY id
"""
// ============ Test 18: Combining different parameter counts ============
- // Tests mixing 1-param, 2-param, and 3-param search() calls in same query
+ // Tests mixing 1-param and 2-param search() calls in same query
// - search('firstname:Chris*'): 1-param, traditional syntax → matches id
1,2
- // - search('foo*', 'tags', 'or'): 3-param with wildcard → matches id 1,3,4
+ // - search('foo*', '{"default_field":"tags","default_operator":"or"}'):
2-param with JSON options → matches id 1,3,4
// - OR combination → matches id 1,2,3,4 (all rows)
qt_param_count_mix """
SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id
FROM ${tableName}
- WHERE search('firstname:Chris*') OR search('foo*', 'tags', 'or')
+ WHERE search('firstname:Chris*') OR search('foo*',
'{"default_field":"tags","default_operator":"or"}')
ORDER BY id
"""
diff --git a/regression-test/suites/search/test_search_escape.groovy
b/regression-test/suites/search/test_search_escape.groovy
new file mode 100644
index 00000000000..629d3fcc1f5
--- /dev/null
+++ b/regression-test/suites/search/test_search_escape.groovy
@@ -0,0 +1,189 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/**
+ * Tests for escape character handling in search() function.
+ *
+ * Escape semantics in DSL:
+ * - Backslash (\) escapes the next character
+ * - Escaped space (\ ) joins terms: "First\ Value" -> single term "First
Value"
+ * - Escaped parentheses (\( \)) are literal characters, not grouping
+ * - Escaped colon (\:) is literal, not field separator
+ * - Escaped backslash (\\) is a literal backslash
+ *
+ * Escape chain in Groovy regression tests:
+ * - Groovy string: \\\\ -> SQL string: \\ -> DSL: \ (escape char)
+ * - Groovy string: \\\\\\\\ -> SQL string: \\\\ -> DSL: \\ -> literal: \
+ */
+suite("test_search_escape") {
+ def tableName = "search_escape_test"
+
+ sql "DROP TABLE IF EXISTS ${tableName}"
+
+ // Create table with inverted indexes
+ // parser=none: store the entire value as a single term (no tokenization)
+ sql """
+ CREATE TABLE ${tableName} (
+ id INT,
+ title VARCHAR(200),
+ content VARCHAR(500),
+ INDEX idx_title(title) USING INVERTED PROPERTIES("parser" =
"none"),
+ INDEX idx_content(content) USING INVERTED PROPERTIES("parser" =
"english")
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES ("replication_allocation" = "tag.location.default: 1")
+ """
+
+ // Insert test data
+ // With parser=none, these values are stored as-is (single terms)
+ // Groovy \\\\ -> SQL \\ -> stored as single backslash \
+ sql """INSERT INTO ${tableName} VALUES
+ (1, 'First Value', 'first content'),
+ (2, 'FirstValue', 'second content'),
+ (3, 'hello(world)', 'third content'),
+ (4, 'hello world', 'fourth content'),
+ (5, 'key:value', 'fifth content'),
+ (6, 'path\\\\to\\\\file', 'sixth content'),
+ (7, 'apple', 'first fruit'),
+ (8, 'banana', 'second fruit')
+ """
+
+ // Wait for index building
+ Thread.sleep(3000)
+
+ // ============ Test 1: Escaped space - search for "First Value" as single
term ============
+ // DSL: title:First\ Value -> searches for term "First Value" (with space)
+ // Groovy: \\\\ -> SQL: \\ -> DSL: \ (escape)
+ // This should match row 1 which has "First Value" stored as single term
(parser=none)
+ qt_escape_space """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${tableName}
+ WHERE search('title:First\\\\ Value')
+ ORDER BY id
+ """
+
+ // ============ Test 2: Phrase query (contrast to the escaped space) ============
+ // Without the escape, DSL title:First Value splits at the space into "First"
+ // and "Value" as separate terms (syntax error: "Value" has no field),
+ // so that unescaped form is not run here.
+ // A quoted phrase query is used instead to show the contrast.
+ qt_phrase_query """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${tableName}
+ WHERE search('title:"First Value"')
+ ORDER BY id
+ """
+
+ // ============ Test 3: Escaped parentheses ============
+ // DSL: title:hello\(world\) -> searches for literal "hello(world)"
+ // Groovy: \\\\( -> SQL: \\( -> DSL: \( -> literal: (
+ qt_escape_parentheses """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${tableName}
+ WHERE search('title:hello\\\\(world\\\\)')
+ ORDER BY id
+ """
+
+ // ============ Test 4: Escaped colon ============
+ // DSL: title:key\:value -> searches for literal "key:value"
+ // Groovy: \\\\: -> SQL: \\: -> DSL: \: -> literal: :
+ qt_escape_colon """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${tableName}
+ WHERE search('title:key\\\\:value')
+ ORDER BY id
+ """
+
+ // ============ Test 5: Escaped backslash ============
+ // DSL: title:path\\to\\file -> searches for "path\to\file"
+ // Groovy: \\\\\\\\ -> SQL: \\\\ -> DSL: \\ -> literal: \
+ // Data stored: path\to\file (Groovy \\\\ -> SQL \\ -> stored \)
+ qt_escape_backslash """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${tableName}
+ WHERE search('title:path\\\\\\\\to\\\\\\\\file')
+ ORDER BY id
+ """
+
+ // ============ Test 6: Uppercase AND operator ============
+ qt_uppercase_and """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, content
+ FROM ${tableName}
+ WHERE search('content:first AND content:fruit')
+ ORDER BY id
+ """
+
+ // ============ Test 7: Uppercase OR operator ============
+ qt_uppercase_or """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, content
+ FROM ${tableName}
+ WHERE search('content:first OR content:second')
+ ORDER BY id
+ """
+
+ // ============ Test 8: Uppercase NOT operator ============
+ qt_uppercase_not """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, content
+ FROM ${tableName}
+ WHERE search('content:fruit AND NOT content:first')
+ ORDER BY id
+ """
+
+ // ============ Test 9: Lowercase and operator ============
+ qt_lowercase_and """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, content
+ FROM ${tableName}
+ WHERE search('content:first and content:fruit')
+ ORDER BY id
+ """
+
+ // ============ Test 10: Lowercase or operator ============
+ qt_lowercase_or """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, content
+ FROM ${tableName}
+ WHERE search('content:first or content:second')
+ ORDER BY id
+ """
+
+ // ============ Test 11: Exclamation NOT operator ============
+ qt_exclamation_not """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, content
+ FROM ${tableName}
+ WHERE search('content:fruit AND !content:first')
+ ORDER BY id
+ """
+
+ // ============ Test 12: Default field with escaped space ============
+ // DSL: First\ Value with default_field=title (JSON options format)
+ qt_default_field_escape """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${tableName}
+ WHERE search('First\\\\ Value',
'{"default_field":"title","default_operator":"and"}')
+ ORDER BY id
+ """
+
+ // ============ Test 13: Lucene mode with escaped space ============
+ qt_lucene_mode_escape """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${tableName}
+ WHERE search('First\\\\ Value',
'{"default_field":"title","default_operator":"and","mode":"lucene"}')
+ ORDER BY id
+ """
+
+ // Cleanup
+ sql "DROP TABLE IF EXISTS ${tableName}"
+}
diff --git
a/regression-test/suites/search/test_search_inverted_is_null_pushdown.groovy
b/regression-test/suites/search/test_search_inverted_is_null_pushdown.groovy
new file mode 100644
index 00000000000..8a4eec5f8ea
--- /dev/null
+++ b/regression-test/suites/search/test_search_inverted_is_null_pushdown.groovy
@@ -0,0 +1,74 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_search_inverted_is_null_pushdown", "p0") {
+ def tableName = "tbl_search_inverted_is_null_pushdown"
+ sql """DROP TABLE IF EXISTS ${tableName}"""
+ sql """
+ CREATE TABLE ${tableName} (
+ id INT,
+ dt DATE NULL,
+ str_col STRING NULL,
+ val INT NULL,
+ INDEX idx_dt (dt) USING INVERTED,
+ INDEX idx_str (str_col) USING INVERTED,
+ INDEX idx_val (val) USING INVERTED
+ )
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES(
+ "replication_allocation" = "tag.location.default: 1"
+ )
+ """
+ // Rows 1, 2 and 4 carry a NULL dt; only row 3 has a date. No str_col value contains 'no-hit'.
+ sql """INSERT INTO ${tableName} VALUES
+ (1, NULL, 'foo', 1),
+ (2, NULL, 'bar', -1),
+ (3, '2024-01-01', 'baz', 5),
+ (4, NULL, 'qux', 10)
+ """
+
+ sql "SET enable_common_expr_pushdown=true"
+ sql "SET inverted_index_skip_threshold=0"
+ // Both queries below are run twice (inverted index on, then off) and must return identical counts.
+ def nullBranchQuery = """
+ SELECT COUNT(*)
+ FROM ${tableName}
+ WHERE (str_col LIKE CONCAT('%', 'no-hit', '%'))
+ OR (dt IS NULL) AND NOT val BETWEEN -9223372036854775808 AND 0
+ """
+
+ def negatedNotNullQuery = """
+ SELECT COUNT(*)
+ FROM ${tableName}
+ WHERE NOT (dt IS NOT NULL)
+ """
+
+ sql "SET enable_inverted_index_query=true"
+ def resultWithIndex = sql(nullBranchQuery)
+ def resultWithIndexNegatedNotNull = sql(negatedNotNullQuery)
+ assertEquals(2, resultWithIndex[0][0]) // previously returned 0 when dt IS NULL relied on inverted index
+ assertEquals(3, resultWithIndexNegatedNotNull[0][0]) // previously returned 0 when NOT (dt IS NOT NULL) was evaluated via inverted index
+
+ sql "SET enable_inverted_index_query=false"
+ def resultWithoutIndex = sql(nullBranchQuery)
+ def resultWithoutIndexNegatedNotNull = sql(negatedNotNullQuery)
+ assertEquals(2, resultWithoutIndex[0][0]) // rows 1 and 4: dt IS NULL and val is not in [-9223372036854775808, 0]
+ assertEquals(3, resultWithoutIndexNegatedNotNull[0][0]) // rows 1, 2 and 4 have a NULL dt
+
+ sql """DROP TABLE IF EXISTS ${tableName}"""
+}
diff --git a/regression-test/suites/search/test_search_lucene_mode.groovy b/regression-test/suites/search/test_search_lucene_mode.groovy
new file mode 100644
index 00000000000..8e9d4edb7e3
--- /dev/null
+++ b/regression-test/suites/search/test_search_lucene_mode.groovy
@@ -0,0 +1,250 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/**
+ * Tests for Lucene mode parsing in search() function.
+ *
+ * Lucene mode mimics Elasticsearch/Lucene query_string behavior where boolean
+ * operators work as left-to-right modifiers, not traditional boolean algebra.
+ *
+ * Key differences from standard mode:
+ * - AND/OR/NOT are modifiers that affect adjacent terms
+ * - Operator precedence is left-to-right
+ * - Uses MUST/SHOULD/MUST_NOT internally (like Lucene's Occur enum)
+ * - minimum_should_match controls SHOULD clause behavior
+ *
+ * Enable Lucene mode with options parameter (JSON format):
+ * search(dsl, '{"default_field":"title","default_operator":"and","mode":"lucene"}')
+ */
+suite("test_search_lucene_mode") {
+ def tableName = "search_lucene_mode_test"
+
+ sql "DROP TABLE IF EXISTS ${tableName}"
+
+ // Create table with inverted indexes
+ sql """
+ CREATE TABLE ${tableName} (
+ id INT,
+ title VARCHAR(100),
+ content VARCHAR(200),
+ category VARCHAR(50),
+ INDEX idx_title(title) USING INVERTED PROPERTIES("parser" = "english"),
+ INDEX idx_content(content) USING INVERTED PROPERTIES("parser" = "english"),
+ INDEX idx_category(category) USING INVERTED
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES ("replication_allocation" = "tag.location.default: 1")
+ """
+
+ // Insert test data
+ // Test data designed to verify Lucene-style boolean logic
+ sql """INSERT INTO ${tableName} VALUES
+ (1, 'apple banana cherry', 'red green blue', 'fruit'),
+ (2, 'apple banana', 'red green', 'fruit'),
+ (3, 'apple', 'red', 'fruit'),
+ (4, 'banana cherry', 'green blue', 'fruit'),
+ (5, 'cherry date', 'blue yellow', 'fruit'),
+ (6, 'date elderberry', 'yellow purple', 'berry'),
+ (7, 'fig grape', 'orange pink', 'mixed'),
+ (8, 'apple fig', 'red orange', 'mixed')
+ """
+
+ // Wait for index building
+ Thread.sleep(3000)
+
+ // ============ Test 1: Standard mode AND behavior ============
+ // In standard mode, 'apple AND banana' behaves like boolean AND
+ qt_standard_and """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${tableName}
+ WHERE search('title:apple AND title:banana')
+ ORDER BY id
+ """
+
+ // ============ Test 2: Lucene mode AND behavior ============
+ // In Lucene mode, 'a AND b' marks both as MUST (+a +b)
+ // Expected same result as standard mode for simple AND
+ qt_lucene_and """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${tableName}
+ WHERE search('apple AND banana', '{"default_field":"title","default_operator":"and","mode":"lucene"}')
+ ORDER BY id
+ """
+
+ // ============ Test 3: Standard mode OR behavior ============
+ // In standard mode, 'apple OR date' returns any row matching either
+ qt_standard_or """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${tableName}
+ WHERE search('title:apple OR title:date')
+ ORDER BY id
+ """
+
+ // ============ Test 4: Lucene mode OR behavior ============
+ // In Lucene mode, 'a OR b' marks both as SHOULD with minimum_should_match=1
+ qt_lucene_or """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${tableName}
+ WHERE search('apple OR date', '{"default_field":"title","default_operator":"or","mode":"lucene"}')
+ ORDER BY id
+ """
+
+ // ============ Test 5: Lucene mode complex expression ============
+ // 'a AND b OR c' in Lucene mode (left-to-right parsing):
+ // - 'apple' starts as MUST (default_operator=AND)
+ // - 'AND banana' makes 'banana' MUST
+ // - 'OR cherry' makes 'cherry' SHOULD, AND changes 'banana' from MUST to SHOULD!
+ // Final state: +apple banana cherry (only 'apple' is MUST, 'banana' and 'cherry' are SHOULD)
+ // With minimum_should_match=0 (default when MUST exists), SHOULD clauses are discarded.
+ // So effectively: +apple only
+ // Expected: rows containing 'apple' -> 1, 2, 3, 8
+ qt_lucene_complex_and_or """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${tableName}
+ WHERE search('apple AND banana OR cherry', '{"default_field":"title","default_operator":"and","mode":"lucene"}')
+ ORDER BY id
+ """
+
+ // ============ Test 6: Lucene mode with explicit minimum_should_match=1 ============
+ // 'a AND b OR c' with minimum_should_match=1 (same Lucene left-to-right parsing):
+ // - 'apple': MUST
+ // - 'AND banana': banana becomes MUST
+ // - 'OR cherry': cherry becomes SHOULD, banana changes from MUST to SHOULD
+ // Final state: +apple banana cherry (apple is MUST, banana and cherry are SHOULD)
+ // With minimum_should_match=1, at least 1 SHOULD must match.
+ // So effectively: apple AND (banana OR cherry)
+ // Expected: rows with 'apple' AND ('banana' OR 'cherry') -> 1, 2
+ qt_lucene_min_should_match_1 """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${tableName}
+ WHERE search('apple AND banana OR cherry', '{"default_field":"title","default_operator":"and","mode":"lucene","minimum_should_match":1}')
+ ORDER BY id
+ """
+
+ // ============ Test 7: Lucene mode NOT operator (pure negative query) ============
+ // 'NOT a' in Lucene mode produces a pure MUST_NOT query.
+ // IMPORTANT: In Lucene/ES semantics, a pure negative query (only MUST_NOT, no MUST/SHOULD)
+ // returns EMPTY results because there's no positive clause to match against.
+ // This is correct Lucene behavior - to get "all except X", you need:
+ // match_all AND NOT X (i.e., a positive clause combined with negation)
+ // Expected: empty result (correct Lucene semantics)
+ qt_lucene_not """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${tableName}
+ WHERE search('NOT apple', '{"default_field":"title","default_operator":"and","mode":"lucene"}')
+ ORDER BY id
+ """
+
+ // ============ Test 8: Lucene mode AND NOT ============
+ // 'a AND NOT b' in Lucene mode:
+ // - 'a' is MUST
+ // - 'NOT b' makes 'b' MUST_NOT
+ // Expected: rows with 'apple' but NOT 'banana'
+ qt_lucene_and_not """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${tableName}
+ WHERE search('apple AND NOT banana', '{"default_field":"title","default_operator":"and","mode":"lucene"}')
+ ORDER BY id
+ """
+
+ // ============ Test 9: Lucene mode OR NOT ============
+ // 'a OR NOT b' in Lucene mode:
+ // - 'a' is SHOULD
+ // - 'NOT b' makes 'b' MUST_NOT
+ // NOTE(review): with 'apple' as the only SHOULD clause and minimum_should_match=1 this presumably yields rows with 'apple' but not 'banana' (3, 8), not a boolean OR - confirm against the .out file
+ qt_lucene_or_not """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${tableName}
+ WHERE search('apple OR NOT banana', '{"default_field":"title","default_operator":"or","mode":"lucene","minimum_should_match":1}')
+ ORDER BY id
+ """
+
+ // ============ Test 10: Lucene mode only OR (SHOULD only) ============
+ // 'a OR b OR c' with only SHOULD clauses
+ // minimum_should_match defaults to 1 (at least one must match)
+ qt_lucene_or_only """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${tableName}
+ WHERE search('apple OR date OR fig', '{"default_field":"title","default_operator":"or","mode":"lucene"}')
+ ORDER BY id
+ """
+
+ // ============ Test 11: Lucene mode cross-field query ============
+ // Multi-field query with Lucene mode
+ qt_lucene_cross_field """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title, category
+ FROM ${tableName}
+ WHERE search('title:apple AND category:fruit', '{"default_operator":"and","mode":"lucene"}')
+ ORDER BY id
+ """
+
+ // ============ Test 12: Standard mode for comparison ============
+ // Same query in standard mode for comparison
+ qt_standard_cross_field """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title, category
+ FROM ${tableName}
+ WHERE search('title:apple AND category:fruit')
+ ORDER BY id
+ """
+
+ // ============ Test 13: Lucene mode with phrase query ============
+ qt_lucene_phrase """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${tableName}
+ WHERE search('"apple banana"', '{"default_field":"title","default_operator":"and","mode":"lucene"}')
+ ORDER BY id
+ """
+
+ // ============ Test 14: Lucene mode with wildcard ============
+ qt_lucene_wildcard """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${tableName}
+ WHERE search('app* AND ban*', '{"default_field":"title","default_operator":"and","mode":"lucene"}')
+ ORDER BY id
+ """
+
+ // ============ Test 15: Verify standard mode unchanged ============
+ // Ensure standard mode is not affected by the Lucene mode addition
+ qt_standard_unchanged """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${tableName}
+ WHERE search('title:apple AND title:banana')
+ ORDER BY id
+ """
+
+ // ============ Test 16: Options without "mode" (should use standard mode) ============
+ qt_empty_options """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${tableName}
+ WHERE search('apple AND banana', '{"default_field":"title","default_operator":"and"}')
+ ORDER BY id
+ """
+
+ // ============ Test 17: Lucene mode minimum_should_match=0 default behavior ============
+ // With minimum_should_match=0 (default in filter context), SHOULD clauses are discarded
+ // when MUST clauses exist
+ qt_lucene_min_should_match_0 """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+ FROM ${tableName}
+ WHERE search('apple AND banana OR date', '{"default_field":"title","default_operator":"and","mode":"lucene","minimum_should_match":0}')
+ ORDER BY id
+ """
+
+ // Cleanup
+ sql "DROP TABLE IF EXISTS ${tableName}"
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]