This is an automated email from the ASF dual-hosted git repository.
airborne pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.1 by this push:
new 35a8a197916 [opt](inverted index) support builtin analyzer names in custom analyzer field (#57727)
35a8a197916 is described below
commit 35a8a197916cb629a0fb2829c2fd30ebc0779881
Author: zzzxl <[email protected]>
AuthorDate: Thu Nov 6 13:07:23 2025 +0800
[opt](inverted index) support builtin analyzer names in custom analyzer field (#57727)
picked from #57512
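
For context: with this change the built-in analyzer names (none, standard, unicode, english,
chinese, icu, basic, ik) are accepted directly wherever a custom analyzer name is expected,
without first creating an index policy. A minimal illustration, taken from the regression
test added below (the text literal is just an example):

    -- a built-in name passed through the "analyzer" property
    SELECT tokenize('Apache Doris is a fast MPP database', '"analyzer"="standard"');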
---
.../inverted_index/analyzer/analyzer.cpp | 137 +++++---
.../segment_v2/inverted_index/analyzer/analyzer.h | 12 +-
.../inverted_index/ananlyzer/analyzer_test.cpp | 348 +++++++++++++++++++++
.../org/apache/doris/indexpolicy/IndexPolicy.java | 3 +
.../apache/doris/indexpolicy/IndexPolicyMgr.java | 25 +-
.../test_builtin_analyzer_in_custom_analyzer.out | 49 +++
...test_builtin_analyzer_in_custom_analyzer.groovy | 140 +++++++++
7 files changed, 657 insertions(+), 57 deletions(-)
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp
index bc81f0a7685..c7c5110f1ac 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp
@@ -50,67 +50,100 @@ ReaderPtr InvertedIndexAnalyzer::create_reader(CharFilterMap& char_filter_map) {
return reader;
}
-std::shared_ptr<lucene::analysis::Analyzer> InvertedIndexAnalyzer::create_analyzer(
-        const InvertedIndexCtx* inverted_index_ctx) {
+bool InvertedIndexAnalyzer::is_builtin_analyzer(const std::string& analyzer_name) {
+ return analyzer_name == INVERTED_INDEX_PARSER_NONE ||
+ analyzer_name == INVERTED_INDEX_PARSER_STANDARD ||
+ analyzer_name == INVERTED_INDEX_PARSER_UNICODE ||
+ analyzer_name == INVERTED_INDEX_PARSER_ENGLISH ||
+ analyzer_name == INVERTED_INDEX_PARSER_CHINESE ||
+ analyzer_name == INVERTED_INDEX_PARSER_ICU ||
+ analyzer_name == INVERTED_INDEX_PARSER_BASIC ||
+ analyzer_name == INVERTED_INDEX_PARSER_IK;
+}
+
+AnalyzerPtr InvertedIndexAnalyzer::create_builtin_analyzer(InvertedIndexParserType parser_type,
+                                                           const std::string& parser_mode,
+                                                           const std::string& lower_case,
+                                                           const std::string& stop_words) {
     std::shared_ptr<lucene::analysis::Analyzer> analyzer;
-    if (!inverted_index_ctx->custom_analyzer.empty()) {
-        analyzer = doris::ExecEnv::GetInstance()->index_policy_mgr()->get_policy_by_name(
-                inverted_index_ctx->custom_analyzer);
- } else {
- auto analyser_type = inverted_index_ctx->parser_type;
- if (analyser_type == InvertedIndexParserType::PARSER_STANDARD ||
- analyser_type == InvertedIndexParserType::PARSER_UNICODE) {
-            analyzer = std::make_shared<lucene::analysis::standard95::StandardAnalyzer>();
-        } else if (analyser_type == InvertedIndexParserType::PARSER_ENGLISH) {
-            analyzer = std::make_shared<lucene::analysis::SimpleAnalyzer<char>>();
-        } else if (analyser_type == InvertedIndexParserType::PARSER_CHINESE) {
-            auto chinese_analyzer =
-                    std::make_shared<lucene::analysis::LanguageBasedAnalyzer>(L"chinese", false);
-            chinese_analyzer->initDict(config::inverted_index_dict_path);
-            auto mode = inverted_index_ctx->parser_mode;
-            if (mode == INVERTED_INDEX_PARSER_COARSE_GRANULARITY) {
-                chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::Default);
- } else {
- chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::All);
- }
- analyzer = std::move(chinese_analyzer);
- } else if (analyser_type == InvertedIndexParserType::PARSER_ICU) {
- analyzer = std::make_shared<ICUAnalyzer>();
- analyzer->initDict(config::inverted_index_dict_path + "/icu");
- } else if (analyser_type == InvertedIndexParserType::PARSER_BASIC) {
- analyzer = std::make_shared<BasicAnalyzer>();
- } else if (analyser_type == InvertedIndexParserType::PARSER_IK) {
- auto ik_analyzer = std::make_shared<IKAnalyzer>();
- ik_analyzer->initDict(config::inverted_index_dict_path + "/ik");
- auto mode = inverted_index_ctx->parser_mode;
- if (mode == INVERTED_INDEX_PARSER_SMART) {
- ik_analyzer->setMode(true);
- } else {
- ik_analyzer->setMode(false);
- }
- analyzer = std::move(ik_analyzer);
+
+ if (parser_type == InvertedIndexParserType::PARSER_STANDARD ||
+ parser_type == InvertedIndexParserType::PARSER_UNICODE) {
+        analyzer = std::make_shared<lucene::analysis::standard95::StandardAnalyzer>();
+    } else if (parser_type == InvertedIndexParserType::PARSER_ENGLISH) {
+        analyzer = std::make_shared<lucene::analysis::SimpleAnalyzer<char>>();
+    } else if (parser_type == InvertedIndexParserType::PARSER_CHINESE) {
+        auto chinese_analyzer =
+                std::make_shared<lucene::analysis::LanguageBasedAnalyzer>(L"chinese", false);
+        chinese_analyzer->initDict(config::inverted_index_dict_path);
+        if (parser_mode == INVERTED_INDEX_PARSER_COARSE_GRANULARITY) {
+            chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::Default);
         } else {
-            // default
-            analyzer = std::make_shared<lucene::analysis::SimpleAnalyzer<char>>();
+            chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::All);
}
- // set lowercase
- auto lowercase = inverted_index_ctx->lower_case;
- if (lowercase == INVERTED_INDEX_PARSER_TRUE) {
- analyzer->set_lowercase(true);
- } else if (lowercase == INVERTED_INDEX_PARSER_FALSE) {
- analyzer->set_lowercase(false);
- }
- // set stop words
- auto stop_words = inverted_index_ctx->stop_words;
- if (stop_words == "none") {
- analyzer->set_stopwords(nullptr);
+ analyzer = std::move(chinese_analyzer);
+ } else if (parser_type == InvertedIndexParserType::PARSER_ICU) {
+ analyzer = std::make_shared<ICUAnalyzer>();
+ analyzer->initDict(config::inverted_index_dict_path + "/icu");
+ } else if (parser_type == InvertedIndexParserType::PARSER_BASIC) {
+ analyzer = std::make_shared<BasicAnalyzer>();
+ } else if (parser_type == InvertedIndexParserType::PARSER_IK) {
+ auto ik_analyzer = std::make_shared<IKAnalyzer>();
+ ik_analyzer->initDict(config::inverted_index_dict_path + "/ik");
+ if (parser_mode == INVERTED_INDEX_PARSER_SMART) {
+ ik_analyzer->setMode(true);
} else {
- analyzer->set_stopwords(&lucene::analysis::standard95::stop_words);
+ ik_analyzer->setMode(false);
}
+ analyzer = std::move(ik_analyzer);
+ } else {
+ // default
+ analyzer = std::make_shared<lucene::analysis::SimpleAnalyzer<char>>();
+ }
+
+ // set lowercase
+ if (lower_case == INVERTED_INDEX_PARSER_TRUE) {
+ analyzer->set_lowercase(true);
+ } else if (lower_case == INVERTED_INDEX_PARSER_FALSE) {
+ analyzer->set_lowercase(false);
+ }
+
+ // set stop words
+ if (stop_words == "none") {
+ analyzer->set_stopwords(nullptr);
+ } else {
+ analyzer->set_stopwords(&lucene::analysis::standard95::stop_words);
}
+
return analyzer;
}
+std::shared_ptr<lucene::analysis::Analyzer> InvertedIndexAnalyzer::create_analyzer(
+        const InvertedIndexCtx* inverted_index_ctx) {
+    const std::string& analyzer_name = inverted_index_ctx->custom_analyzer;
+    if (analyzer_name.empty()) {
+        return create_builtin_analyzer(
+                inverted_index_ctx->parser_type, inverted_index_ctx->parser_mode,
+                inverted_index_ctx->lower_case, inverted_index_ctx->stop_words);
+    }
+
+    if (is_builtin_analyzer(analyzer_name)) {
+        InvertedIndexParserType parser_type =
+                get_inverted_index_parser_type_from_string(analyzer_name);
+        return create_builtin_analyzer(parser_type, inverted_index_ctx->parser_mode,
+                                       inverted_index_ctx->lower_case,
+                                       inverted_index_ctx->stop_words);
+ }
+
+ auto* index_policy_mgr = doris::ExecEnv::GetInstance()->index_policy_mgr();
+ if (!index_policy_mgr) {
+ throw Exception(ErrorCode::INVERTED_INDEX_ANALYZER_ERROR,
+ "Index policy manager is not initialized");
+ }
+
+ return index_policy_mgr->get_policy_by_name(analyzer_name);
+}
+
std::vector<TermInfo> InvertedIndexAnalyzer::get_analyse_result(
ReaderPtr reader, lucene::analysis::Analyzer* analyzer) {
std::vector<TermInfo> analyse_result;
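
With the dispatch above, a built-in name supplied as the "analyzer" of an inverted index is
mapped straight to the corresponding built-in parser on the BE instead of being looked up as
an index policy. A sketch of the user-facing effect, mirroring the table defined in the
regression suite further down (table and index names are illustrative):

    CREATE TABLE t_builtin_analyzer (
        `id` int(11) NOT NULL,
        `url` text NULL,
        INDEX idx_url (`url`) USING INVERTED PROPERTIES("support_phrase" = "true", "analyzer" = "basic")
    ) ENGINE=OLAP
    DUPLICATE KEY(`id`)
    DISTRIBUTED BY HASH(`id`) BUCKETS 1
    PROPERTIES ("replication_allocation" = "tag.location.default: 1");

    -- "basic" resolves to the built-in basic analyzer; no CREATE INVERTED INDEX ANALYZER is needed
    SELECT * FROM t_builtin_analyzer WHERE url MATCH 'logo' ORDER BY id;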
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h
index df777e88889..71f575c4c83 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h
@@ -36,12 +36,19 @@ class Analyzer;
} // namespace lucene
namespace doris::segment_v2::inverted_index {
+
+using AnalyzerPtr = std::shared_ptr<lucene::analysis::Analyzer>;
+
class InvertedIndexAnalyzer {
public:
static ReaderPtr create_reader(CharFilterMap& char_filter_map);
- static std::shared_ptr<lucene::analysis::Analyzer> create_analyzer(
- const InvertedIndexCtx* inverted_index_ctx);
+ static bool is_builtin_analyzer(const std::string& analyzer_name);
+    static AnalyzerPtr create_builtin_analyzer(InvertedIndexParserType parser_type,
+ const std::string& parser_mode,
+ const std::string& lower_case,
+ const std::string& stop_words);
+    static AnalyzerPtr create_analyzer(const InvertedIndexCtx* inverted_index_ctx);
static std::vector<TermInfo> get_analyse_result(ReaderPtr reader,
lucene::analysis::Analyzer* analyzer);
@@ -51,4 +58,5 @@ public:
     static bool should_analyzer(const std::map<std::string, std::string>& properties);
};
+
} // namespace doris::segment_v2::inverted_index
\ No newline at end of file
diff --git a/be/test/olap/rowset/segment_v2/inverted_index/ananlyzer/analyzer_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/ananlyzer/analyzer_test.cpp
new file mode 100644
index 00000000000..46fc3077419
--- /dev/null
+++ b/be/test/olap/rowset/segment_v2/inverted_index/ananlyzer/analyzer_test.cpp
@@ -0,0 +1,348 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h"
+
+#include <gtest/gtest.h>
+
+#include "gen_cpp/AgentService_types.h"
+#include "olap/inverted_index_parser.h"
+#include "olap/rowset/segment_v2/inverted_index/util/reader.h"
+#include "runtime/exec_env.h"
+#include "runtime/index_policy/index_policy_mgr.h"
+
+namespace doris::segment_v2::inverted_index {
+
+class AnalyzerTest : public ::testing::Test {
+protected:
+    void SetUp() override { _index_policy_mgr = std::make_unique<IndexPolicyMgr>(); }
+
+ void TearDown() override { _index_policy_mgr.reset(); }
+
+ void SetupCustomAnalyzerPolicies() {
+ TIndexPolicy tokenizer;
+ tokenizer.id = 1;
+ tokenizer.name = "test_tokenizer";
+ tokenizer.type = TIndexPolicyType::TOKENIZER;
+ tokenizer.properties["type"] = "standard";
+
+ TIndexPolicy filter;
+ filter.id = 2;
+ filter.name = "test_filter";
+ filter.type = TIndexPolicyType::TOKEN_FILTER;
+ filter.properties["type"] = "lowercase";
+
+ TIndexPolicy analyzer;
+ analyzer.id = 3;
+ analyzer.name = "test_custom_analyzer";
+ analyzer.type = TIndexPolicyType::ANALYZER;
+ analyzer.properties["tokenizer"] = "test_tokenizer";
+ analyzer.properties["token_filter"] = "test_filter";
+
+ std::vector<TIndexPolicy> policies = {tokenizer, filter, analyzer};
+ _index_policy_mgr->apply_policy_changes(policies, {});
+ }
+
+ std::unique_ptr<IndexPolicyMgr> _index_policy_mgr;
+};
+
+// ==================== Combined test for is_builtin_analyzer and create_builtin_analyzer ====================
+
+TEST_F(AnalyzerTest, TestBuiltinAnalyzers) {
+    // Test all builtin analyzer names with is_builtin_analyzer and create_builtin_analyzer together
+ struct BuiltinAnalyzerTestCase {
+ std::string name;
+ InvertedIndexParserType parser_type;
+ std::string parser_mode;
+        bool requires_dict; // Flag to indicate if dictionary files are required
+ };
+
+ std::vector<BuiltinAnalyzerTestCase> builtin_cases = {
+            {INVERTED_INDEX_PARSER_NONE, InvertedIndexParserType::PARSER_NONE, "", false},
+            {INVERTED_INDEX_PARSER_STANDARD, InvertedIndexParserType::PARSER_STANDARD, "", false},
+            {INVERTED_INDEX_PARSER_UNICODE, InvertedIndexParserType::PARSER_UNICODE, "", false},
+            {INVERTED_INDEX_PARSER_ENGLISH, InvertedIndexParserType::PARSER_ENGLISH, "", false},
+            {INVERTED_INDEX_PARSER_CHINESE, InvertedIndexParserType::PARSER_CHINESE,
+             INVERTED_INDEX_PARSER_COARSE_GRANULARITY, true},
+            {INVERTED_INDEX_PARSER_ICU, InvertedIndexParserType::PARSER_ICU, "", true},
+            {INVERTED_INDEX_PARSER_BASIC, InvertedIndexParserType::PARSER_BASIC, "", false},
+ {INVERTED_INDEX_PARSER_IK, InvertedIndexParserType::PARSER_IK,
+ INVERTED_INDEX_PARSER_SMART, true}};
+
+ // Test all builtin analyzers
+ for (const auto& test_case : builtin_cases) {
+ // Test is_builtin_analyzer returns true
+ EXPECT_TRUE(InvertedIndexAnalyzer::is_builtin_analyzer(test_case.name))
+ << "Failed for: " << test_case.name;
+
+ // Test create_builtin_analyzer works
+ // For analyzers that require dict files, allow exception
+ if (test_case.requires_dict) {
+ try {
+ auto analyzer = InvertedIndexAnalyzer::create_builtin_analyzer(
+ test_case.parser_type, test_case.parser_mode, "", "");
+ // If dict exists, analyzer should not be null
+ EXPECT_NE(analyzer, nullptr)
+                        << "Created analyzer for: " << test_case.name << " (dict available)";
+ } catch (const std::exception& e) {
+ // If dict doesn't exist, allow the exception and log it
+ LOG(INFO) << "Skipped creating " << test_case.name
+ << " due to missing dict: " << e.what();
+ }
+ } else {
+ auto analyzer = InvertedIndexAnalyzer::create_builtin_analyzer(
+ test_case.parser_type, test_case.parser_mode, "", "");
+            EXPECT_NE(analyzer, nullptr) << "Failed to create analyzer for: " << test_case.name;
+ }
+ }
+
+ // Test non-builtin names return false
+    EXPECT_FALSE(InvertedIndexAnalyzer::is_builtin_analyzer("my_custom_analyzer"));
+    EXPECT_FALSE(InvertedIndexAnalyzer::is_builtin_analyzer(""));
+    EXPECT_FALSE(InvertedIndexAnalyzer::is_builtin_analyzer("Standard")); // case sensitive
+
+ // Test different parser modes (Chinese with fine granularity)
+ try {
+ auto chinese_fine = InvertedIndexAnalyzer::create_builtin_analyzer(
+                InvertedIndexParserType::PARSER_CHINESE, INVERTED_INDEX_PARSER_FINE_GRANULARITY, "",
+                "");
+ EXPECT_NE(chinese_fine, nullptr);
+ } catch (const std::exception& e) {
+        LOG(INFO) << "Skipped Chinese fine granularity test due to missing dict: " << e.what();
+ }
+
+ // Test IK with max word mode
+ try {
+ auto ik_maxword = InvertedIndexAnalyzer::create_builtin_analyzer(
+                InvertedIndexParserType::PARSER_IK, INVERTED_INDEX_PARSER_MAX_WORD, "", "");
+ EXPECT_NE(ik_maxword, nullptr);
+ } catch (const std::exception& e) {
+        LOG(INFO) << "Skipped IK max word test due to missing dict: " << e.what();
+ }
+
+    // Test lowercase and stopwords settings (using STANDARD which doesn't require dict)
+    auto with_lower = InvertedIndexAnalyzer::create_builtin_analyzer(
+            InvertedIndexParserType::PARSER_STANDARD, "", INVERTED_INDEX_PARSER_TRUE, "");
+ EXPECT_NE(with_lower, nullptr);
+
+ auto without_lower = InvertedIndexAnalyzer::create_builtin_analyzer(
+            InvertedIndexParserType::PARSER_STANDARD, "", INVERTED_INDEX_PARSER_FALSE, "");
+ EXPECT_NE(without_lower, nullptr);
+
+ auto with_stopwords = InvertedIndexAnalyzer::create_builtin_analyzer(
+ InvertedIndexParserType::PARSER_STANDARD, "", "", "none");
+ EXPECT_NE(with_stopwords, nullptr);
+
+ // Test unknown parser type falls back to default
+ auto unknown = InvertedIndexAnalyzer::create_builtin_analyzer(
+ InvertedIndexParserType::PARSER_UNKNOWN, "", "", "");
+ EXPECT_NE(unknown, nullptr);
+}
+
+// ==================== Combined test for create_analyzer ====================
+
+TEST_F(AnalyzerTest, TestCreateAnalyzer) {
+ // Test Case 1: Empty custom_analyzer, use builtin parser_type
+ {
+ InvertedIndexCtx ctx;
+ ctx.custom_analyzer = "";
+ ctx.parser_type = InvertedIndexParserType::PARSER_STANDARD;
+ ctx.parser_mode = "";
+ ctx.lower_case = INVERTED_INDEX_PARSER_TRUE;
+ ctx.stop_words = "none";
+
+ auto analyzer = InvertedIndexAnalyzer::create_analyzer(&ctx);
+ EXPECT_NE(analyzer, nullptr);
+ }
+
+    // Test Case 2: custom_analyzer is a builtin name (using one that doesn't need dict)
+ {
+ InvertedIndexCtx ctx;
+ ctx.custom_analyzer = INVERTED_INDEX_PARSER_ENGLISH;
+ ctx.parser_type = InvertedIndexParserType::PARSER_UNKNOWN;
+ ctx.parser_mode = "";
+ ctx.lower_case = INVERTED_INDEX_PARSER_FALSE;
+ ctx.stop_words = "";
+
+ auto analyzer = InvertedIndexAnalyzer::create_analyzer(&ctx);
+ EXPECT_NE(analyzer, nullptr);
+ }
+
+ // Test Case 3: Test all builtin names work through create_analyzer
+ std::vector<std::pair<std::string, bool>> builtin_names = {
+ {INVERTED_INDEX_PARSER_STANDARD, false},
+ {INVERTED_INDEX_PARSER_UNICODE, false},
+ {INVERTED_INDEX_PARSER_ENGLISH, false},
+ {INVERTED_INDEX_PARSER_CHINESE, true}, // requires dict
+ {INVERTED_INDEX_PARSER_ICU, true}, // requires dict
+ {INVERTED_INDEX_PARSER_BASIC, false},
+ {INVERTED_INDEX_PARSER_IK, true} // requires dict
+ };
+
+ for (const auto& [name, requires_dict] : builtin_names) {
+ InvertedIndexCtx ctx;
+ ctx.custom_analyzer = name;
+ ctx.parser_type = InvertedIndexParserType::PARSER_UNKNOWN;
+ ctx.parser_mode = "";
+ ctx.lower_case = "";
+ ctx.stop_words = "";
+
+ if (requires_dict) {
+ try {
+ auto analyzer = InvertedIndexAnalyzer::create_analyzer(&ctx);
+                EXPECT_NE(analyzer, nullptr) << "Created analyzer for builtin name: " << name;
+ } catch (const std::exception& e) {
+                LOG(INFO) << "Skipped " << name << " due to missing dict: " << e.what();
+ }
+ } else {
+ auto analyzer = InvertedIndexAnalyzer::create_analyzer(&ctx);
+            EXPECT_NE(analyzer, nullptr) << "Failed for builtin name: " << name;
+ }
+ }
+
+ // Test Case 4: Test with different parser types
+ std::vector<std::pair<InvertedIndexParserType, bool>> parser_types = {
+ {InvertedIndexParserType::PARSER_STANDARD, false},
+ {InvertedIndexParserType::PARSER_UNICODE, false},
+ {InvertedIndexParserType::PARSER_ENGLISH, false},
+ {InvertedIndexParserType::PARSER_CHINESE, true}, // requires dict
+ {InvertedIndexParserType::PARSER_ICU, true}, // requires dict
+ {InvertedIndexParserType::PARSER_BASIC, false},
+ {InvertedIndexParserType::PARSER_IK, true} // requires dict
+ };
+
+ for (const auto& [parser_type, requires_dict] : parser_types) {
+ InvertedIndexCtx ctx;
+ ctx.custom_analyzer = "";
+ ctx.parser_type = parser_type;
+ ctx.parser_mode = "";
+ ctx.lower_case = "";
+ ctx.stop_words = "";
+
+ if (requires_dict) {
+ try {
+ auto analyzer = InvertedIndexAnalyzer::create_analyzer(&ctx);
+ EXPECT_NE(analyzer, nullptr)
+                        << "Created analyzer for parser_type: " << static_cast<int>(parser_type);
+ } catch (const std::exception& e) {
+                LOG(INFO) << "Skipped parser_type " << static_cast<int>(parser_type)
+ << " due to missing dict: " << e.what();
+ }
+ } else {
+ auto analyzer = InvertedIndexAnalyzer::create_analyzer(&ctx);
+ EXPECT_NE(analyzer, nullptr)
+                    << "Failed for parser_type: " << static_cast<int>(parser_type);
+ }
+ }
+}
+
+// ==================== Test create_analyzer with index_policy_mgr ====================
+
+TEST_F(AnalyzerTest, TestCreateAnalyzerWithCustomPolicy) {
+ // Test when index_policy_mgr is null - should throw exception
+ {
+ InvertedIndexCtx ctx;
+ ctx.custom_analyzer = "non_existent_custom";
+ ctx.parser_type = InvertedIndexParserType::PARSER_UNKNOWN;
+ ctx.parser_mode = "";
+ ctx.lower_case = "";
+ ctx.stop_words = "";
+
+ if (!doris::ExecEnv::GetInstance()->index_policy_mgr()) {
+ EXPECT_THROW(
+ {
+ try {
+ InvertedIndexAnalyzer::create_analyzer(&ctx);
+ } catch (const Exception& e) {
+                            EXPECT_EQ(e.code(), ErrorCode::INVERTED_INDEX_ANALYZER_ERROR);
+ EXPECT_TRUE(std::string(e.what()).find(
+                                    "Index policy manager is not initialized") != std::string::npos);
+ throw;
+ }
+ },
+ Exception);
+ }
+ }
+
+ // Test with properly configured index_policy_mgr
+ auto* mgr = doris::ExecEnv::GetInstance()->index_policy_mgr();
+ if (mgr) {
+ SetupCustomAnalyzerPolicies();
+
+ // Test successful custom analyzer retrieval
+ {
+ InvertedIndexCtx ctx;
+ ctx.custom_analyzer = "test_custom_analyzer";
+ ctx.parser_type = InvertedIndexParserType::PARSER_UNKNOWN;
+ ctx.parser_mode = "";
+ ctx.lower_case = "";
+ ctx.stop_words = "";
+
+ auto analyzer = InvertedIndexAnalyzer::create_analyzer(&ctx);
+ EXPECT_NE(analyzer, nullptr);
+ }
+
+ // Test non-existent custom analyzer throws exception
+ {
+ InvertedIndexCtx ctx;
+ ctx.custom_analyzer = "non_existent_analyzer";
+ ctx.parser_type = InvertedIndexParserType::PARSER_UNKNOWN;
+ ctx.parser_mode = "";
+ ctx.lower_case = "";
+ ctx.stop_words = "";
+
+            EXPECT_THROW(InvertedIndexAnalyzer::create_analyzer(&ctx), Exception);
+ }
+ }
+}
+
+// ==================== Integration test ====================
+
+TEST_F(AnalyzerTest, TestAnalyzerFunctionality) {
+ // Create an analyzer and test it can tokenize text properly
+ InvertedIndexCtx ctx;
+ ctx.custom_analyzer = "";
+ ctx.parser_type = InvertedIndexParserType::PARSER_STANDARD;
+ ctx.parser_mode = "";
+ ctx.lower_case = INVERTED_INDEX_PARSER_TRUE;
+ ctx.stop_words = "none";
+
+ auto analyzer = InvertedIndexAnalyzer::create_analyzer(&ctx);
+ ASSERT_NE(analyzer, nullptr);
+
+ // Test tokenization
+ auto reader = std::make_shared<lucene::util::SStringReader<char>>();
+ std::string text = "Hello World Test";
+ reader->init(text.data(), static_cast<int32_t>(text.size()), true);
+
+    auto result = InvertedIndexAnalyzer::get_analyse_result(reader, analyzer.get());
+ EXPECT_GT(result.size(), 0);
+
+ // Verify tokens are not empty
+ for (const auto& term_info : result) {
+ // term is a variant, need to check based on its actual type
+ if (term_info.is_single_term()) {
+ EXPECT_FALSE(term_info.get_single_term().empty());
+ } else {
+ EXPECT_FALSE(term_info.get_multi_terms().empty());
+ }
+ EXPECT_GE(term_info.position, 0);
+ }
+}
+
+} // namespace doris::segment_v2::inverted_index
\ No newline at end of file
diff --git a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicy.java b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicy.java
index 7abfe53ff4e..56dd885bb36 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicy.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicy.java
@@ -67,6 +67,9 @@ public class IndexPolicy implements Writable, GsonPostProcessable {
public static final Set<String> BUILTIN_CHAR_FILTERS = ImmutableSet.of(
"empty", "char_replace");
+ public static final Set<String> BUILTIN_ANALYZERS = ImmutableSet.of(
+            "none", "standard", "unicode", "english", "chinese", "icu", "basic", "ik");
+
private static final Logger LOG = LogManager.getLogger(IndexPolicy.class);
@SerializedName(value = "id")
diff --git a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyMgr.java b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyMgr.java
index 04205605669..876fe2483fc 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyMgr.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyMgr.java
@@ -45,7 +45,9 @@ import java.io.DataOutput;
import java.io.IOException;
import java.util.List;
import java.util.Map;
+import java.util.Set;
import java.util.concurrent.locks.ReentrantReadWriteLock;
+import java.util.stream.Collectors;
public class IndexPolicyMgr implements Writable, GsonPostProcessable {
     private static final Logger LOG = LogManager.getLogger(IndexPolicyMgr.class);
@@ -84,6 +86,11 @@ public class IndexPolicyMgr implements Writable, GsonPostProcessable {
     }
     public void validateAnalyzerExists(String analyzerName) throws DdlException {
+ // Allow built-in analyzers
+ if (IndexPolicy.BUILTIN_ANALYZERS.contains(analyzerName)) {
+ return;
+ }
+
readLock();
try {
IndexPolicy policy = nameToIndexPolicy.get(analyzerName);
@@ -115,6 +122,9 @@ public class IndexPolicyMgr implements Writable, GsonPostProcessable {
         if (IndexPolicy.BUILTIN_CHAR_FILTERS.contains(policyName)) {
             throw new DdlException("Policy name '" + policyName + "' conflicts with built-in char filter name");
         }
+        if (IndexPolicy.BUILTIN_ANALYZERS.contains(policyName)) {
+            throw new DdlException("Policy name '" + policyName + "' conflicts with built-in analyzer name");
+        }
         IndexPolicy indexPolicy = IndexPolicy.create(policyName, type, properties);
@@ -267,8 +277,11 @@ public class IndexPolicyMgr implements Writable, GsonPostProcessable {
validator = new CharGroupTokenizerValidator();
break;
default:
+                Set<String> userFacingTypes = IndexPolicy.BUILTIN_TOKEN_FILTERS.stream()
+                        .filter(t -> !t.equals("empty"))
+                        .collect(Collectors.toSet());
                 throw new DdlException("Unsupported tokenizer type: " + type
-                        + ". Supported types: " + IndexPolicy.BUILTIN_TOKENIZERS);
+                        + ". Supported types: " + userFacingTypes);
}
validator.validate(properties);
}
@@ -293,8 +306,11 @@ public class IndexPolicyMgr implements Writable, GsonPostProcessable {
validator = new NoOperationValidator("lowercase token filter");
break;
default:
+                Set<String> userFacingTypes = IndexPolicy.BUILTIN_TOKEN_FILTERS.stream()
+                        .filter(t -> !t.equals("empty"))
+                        .collect(Collectors.toSet());
                 throw new DdlException("Unsupported token filter type: " + type
-                        + ". Supported types: " + IndexPolicy.BUILTIN_TOKEN_FILTERS);
+                        + ". Supported types: " + userFacingTypes);
}
validator.validate(properties);
}
@@ -313,8 +329,11 @@ public class IndexPolicyMgr implements Writable, GsonPostProcessable {
validator = new CharReplaceCharFilterValidator();
break;
default:
+                Set<String> userFacingTypes = IndexPolicy.BUILTIN_CHAR_FILTERS.stream()
+                        .filter(t -> !t.equals("empty"))
+                        .collect(Collectors.toSet());
                 throw new DdlException("Unsupported char filter type: " + type
-                        + ". Supported types: " + IndexPolicy.BUILTIN_CHAR_FILTERS);
+                        + ". Supported types: " + userFacingTypes);
}
validator.validate(properties);
}
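
On the FE side, validateAnalyzerExists now accepts the built-in names without a policy lookup,
while creating a policy under one of those names is rejected. A short illustration, following
the negative case exercised by the regression test below:

    -- rejected: the policy name collides with a built-in analyzer
    CREATE INVERTED INDEX ANALYZER standard
    PROPERTIES ("tokenizer" = "basic", "token_filter" = "lowercase");
    -- ERROR: Policy name 'standard' conflicts with built-in analyzer name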
diff --git a/regression-test/data/inverted_index_p0/analyzer/test_builtin_analyzer_in_custom_analyzer.out b/regression-test/data/inverted_index_p0/analyzer/test_builtin_analyzer_in_custom_analyzer.out
new file mode 100644
index 00000000000..e72db5c6e79
--- /dev/null
+++ b/regression-test/data/inverted_index_p0/analyzer/test_builtin_analyzer_in_custom_analyzer.out
@@ -0,0 +1,49 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !tokenize_standard --
+[{\n    "token": "apache"\n  }, {\n    "token": "doris"\n  }, {\n    "token": "fast"\n  }, {\n    "token": "mpp"\n  }, {\n    "token": "database"\n  }]
+
+-- !tokenize_unicode --
+[{\n    "token": "hello"\n  }, {\n    "token": "world"\n  }, {\n    "token": "你"\n  }, {\n    "token": "好"\n  }, {\n    "token": "世"\n  }, {\n    "token": "界"\n  }]
+
+-- !tokenize_basic --
+[{\n    "token": "get"\n  }, {\n    "token": "images"\n  }, {\n    "token": "test"\n  }, {\n    "token": "jpg"\n  }, {\n    "token": "http"\n  }, {\n    "token": "1"\n  }, {\n    "token": "0"\n  }]
+
+-- !tokenize_icu --
+[{\n    "token": "让"\n  }, {\n    "token": "我们"\n  }, {\n    "token": "说"\n  }, {\n    "token": "hello"\n  }, {\n    "token": "そして"\n  }, {\n    "token": "世界"\n  }, {\n    "token": "と"\n  }, {\n    "token": "つ"\n  }, {\n    "token": "な"\n  }, {\n    "token": "が"\n  }, {\n    "token": "ろう"\n  }]
+
+-- !tokenize_chinese_standard --
+[{\n    "token": "apache"\n  }, {\n    "token": "doris"\n  }, {\n    "token": "是"\n  }, {\n    "token": "一"\n  }, {\n    "token": "个"\n  }, {\n    "token": "现"\n  }, {\n    "token": "代"\n  }, {\n    "token": "化"\n  }, {\n    "token": "的"\n  }, {\n    "token": "mpp"\n  }, {\n    "token": "数"\n  }, {\n    "token": "据"\n  }, {\n    "token": "库"\n  }]
+
+-- !tokenize_chinese_basic --
+[{\n    "token": "apache"\n  }, {\n    "token": "doris"\n  }, {\n    "token": "是"\n  }, {\n    "token": "一"\n  }, {\n    "token": "个"\n  }, {\n    "token": "现"\n  }, {\n    "token": "代"\n  }, {\n    "token": "化"\n  }, {\n    "token": "的"\n  }, {\n    "token": "mpp"\n  }, {\n    "token": "数"\n  }, {\n    "token": "据"\n  }, {\n    "token": "库"\n  }]
+
+-- !tokenize_chinese_icu --
+[{\n    "token": "apache"\n  }, {\n    "token": "doris"\n  }, {\n    "token": "是"\n  }, {\n    "token": "一个"\n  }, {\n    "token": "现代"\n  }, {\n    "token": "化"\n  }, {\n    "token": "的"\n  }, {\n    "token": "mpp"\n  }, {\n    "token": "数据"\n  }, {\n    "token": "库"\n  }]
+
+-- !tokenize_special_standard --
+[{\n    "token": "test"\n  }, {\n    "token": "example.com"\n  }, {\n    "token": "user_name"\n  }, {\n    "token": "123"\n  }, {\n    "token": "456"\n  }]
+
+-- !tokenize_empty --
+
+
+-- !tokenize_mixed --
+[{\n    "token": "中文"\n  }, {\n    "token": "english"\n  }, {\n    "token": "日本語"\n  }, {\n    "token": "한국어"\n  }]
+
+-- !sql_basic_match_logo --
+1 GET /images/logo.png HTTP/1.0
+
+-- !sql_basic_match_images --
+1 GET /images/logo.png HTTP/1.0
+
+-- !sql_basic_match_api --
+2 POST /api/v1/users HTTP/1.1
+
+-- !sql_basic_match_logo --
+1 GET /images/logo.png HTTP/1.0
+
+-- !sql_basic_match_images --
+1 GET /images/logo.png HTTP/1.0
+
+-- !sql_basic_match_api --
+2 POST /api/v1/users HTTP/1.1
+
diff --git a/regression-test/suites/inverted_index_p0/analyzer/test_builtin_analyzer_in_custom_analyzer.groovy b/regression-test/suites/inverted_index_p0/analyzer/test_builtin_analyzer_in_custom_analyzer.groovy
new file mode 100644
index 00000000000..25e9cda1a40
--- /dev/null
+++ b/regression-test/suites/inverted_index_p0/analyzer/test_builtin_analyzer_in_custom_analyzer.groovy
@@ -0,0 +1,140 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_builtin_analyzer_in_custom_analyzer", "p0") {
+ // Define all built-in analyzers
+    def builtinAnalyzers = ["none", "standard", "unicode", "english", "chinese", "icu", "basic", "ik"]
+
+    // Helper function to test that creating analyzer with builtin name should fail
+ def testBuiltinAnalyzerNameConflict = { String analyzerName ->
+ test {
+ sql """
+ CREATE INVERTED INDEX ANALYZER IF NOT EXISTS ${analyzerName}
+ PROPERTIES
+ (
+ "tokenizer" = "basic",
+ "token_filter" = "lowercase"
+ );
+ """
+ exception "conflicts with built-in"
+ }
+ }
+
+ // Test: Cannot create custom analyzer with built-in analyzer names
+    logger.info("Testing that built-in analyzer names cannot be used for custom analyzers")
+
+ builtinAnalyzers.each { analyzerName ->
+ logger.info("Testing conflict with built-in analyzer: ${analyzerName}")
+ testBuiltinAnalyzerNameConflict(analyzerName)
+ }
+
+ // Define tokenize test cases: [testName, testText, analyzerName]
+ def tokenizeTestCases = [
+ ["standard", "Apache Doris is a fast MPP database", "standard"],
+ ["unicode", "Hello World 你好世界", "unicode"],
+ ["basic", "GET /images/test.jpg HTTP/1.0", "basic"],
+ ["icu", "让我们说「Hello」そして世界とつながろう!", "icu"],
+ ["chinese_standard", "Apache Doris是一个现代化的MPP数据库", "standard"],
+ ["chinese_basic", "Apache Doris是一个现代化的MPP数据库", "basic"],
+ ["chinese_icu", "Apache Doris是一个现代化的MPP数据库", "icu"],
+ ["special_standard", "[email protected] user_name 123-456", "standard"],
+ ["empty", "", "standard"],
+ ["mixed", "中文English日本語한국어", "icu"]
+ ]
+
+ // Execute tokenize tests in a loop
+ tokenizeTestCases.each { testCase ->
+ def testName = testCase[0]
+ def testText = testCase[1]
+ def analyzerName = testCase[2]
+
+ logger.info("Testing tokenize with ${analyzerName}: ${testName}")
+ "qt_tokenize_${testName}"("""
+ select tokenize("${testText}", '"analyzer"="${analyzerName}"');
+ """)
+ }
+
+ // Test table creation with analyzer_with_standard
+ def indexTblName = "test_builtin_analyzer_table"
+
+ sql "DROP TABLE IF EXISTS ${indexTblName}"
+
+ sql """
+ CREATE TABLE ${indexTblName} (
+ `id` int(11) NOT NULL,
+ `url` text NULL,
+            INDEX idx_url (`url`) USING INVERTED PROPERTIES("support_phrase" = "true", "analyzer" = "basic")
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`id`)
+ DISTRIBUTED BY HASH(`id`) BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1"
+ );
+ """
+
+ // Insert test data
+    sql """ INSERT INTO ${indexTblName} VALUES (1, 'GET /images/logo.png HTTP/1.0'); """
+    sql """ INSERT INTO ${indexTblName} VALUES (2, 'POST /api/v1/users HTTP/1.1'); """
+    sql """ INSERT INTO ${indexTblName} VALUES (3, 'GET /docs/index.html HTTP/1.0'); """
+
+ try {
+ sql "sync"
+ sql """ set enable_common_expr_pushdown = true; """
+
+ // Test MATCH queries with analyzer_with_standard
+        qt_sql_basic_match_logo """ SELECT * FROM ${indexTblName} WHERE url MATCH 'logo' ORDER BY id; """
+        qt_sql_basic_match_images """ SELECT * FROM ${indexTblName} WHERE url MATCH 'images' ORDER BY id; """
+        qt_sql_basic_match_api """ SELECT * FROM ${indexTblName} WHERE url MATCH 'api' ORDER BY id; """
+
+ } finally {
+ }
+
+ // Test with another analyzer - analyzer_with_basic
+ def indexTblName2 = "test_builtin_analyzer_table_basic"
+
+ sql "DROP TABLE IF EXISTS ${indexTblName2}"
+
+ sql """
+ CREATE TABLE ${indexTblName2} (
+ `id` int(11) NOT NULL,
+ `url` text NULL,
+            INDEX idx_url (`url`) USING INVERTED PROPERTIES("support_phrase" = "true", "analyzer" = "basic")
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`id`)
+ DISTRIBUTED BY HASH(`id`) BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1"
+ );
+ """
+
+ // Insert URL test data
+    sql """ INSERT INTO ${indexTblName2} VALUES (1, 'GET /images/logo.png HTTP/1.0'); """
+    sql """ INSERT INTO ${indexTblName2} VALUES (2, 'POST /api/v1/users HTTP/1.1'); """
+    sql """ INSERT INTO ${indexTblName2} VALUES (3, 'GET /docs/index.html HTTP/1.0'); """
+
+ try {
+ sql "sync"
+ sql """ set enable_common_expr_pushdown = true; """
+
+ // Test basic analyzer on URL-like strings
+        qt_sql_basic_match_logo """ SELECT * FROM ${indexTblName2} WHERE url MATCH 'logo' ORDER BY id; """
+        qt_sql_basic_match_images """ SELECT * FROM ${indexTblName2} WHERE url MATCH 'images' ORDER BY id; """
+        qt_sql_basic_match_api """ SELECT * FROM ${indexTblName2} WHERE url MATCH 'api' ORDER BY id; """
+
+ } finally {
+ }
+}
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]