This is an automated email from the ASF dual-hosted git repository.
airborne pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.1 by this push:
new 35a8a197916 [opt](inverted index) support builtin analyzer names in custom analyzer field (#57727)
35a8a197916 is described below
commit 35a8a197916cb629a0fb2829c2fd30ebc0779881
Author: zzzxl <[email protected]>
AuthorDate: Thu Nov 6 13:07:23 2025 +0800
[opt](inverted index) support builtin analyzer names in custom analyzer field (#57727)
picked from #57512
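
For context: with this change the built-in analyzer names (none, standard, unicode, english,
chinese, icu, basic, ik) are accepted directly wherever a custom analyzer name is expected,
without first creating an index policy. A minimal illustration, taken from the regression
test added below (the text literal is just an example):

    -- a built-in name passed through the "analyzer" property
    SELECT tokenize('Apache Doris is a fast MPP database', '"analyzer"="standard"');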
---
.../inverted_index/analyzer/analyzer.cpp | 137 +++++---
.../segment_v2/inverted_index/analyzer/analyzer.h | 12 +-
.../inverted_index/ananlyzer/analyzer_test.cpp | 348 +++++++++++++++++++++
.../org/apache/doris/indexpolicy/IndexPolicy.java | 3 +
.../apache/doris/indexpolicy/IndexPolicyMgr.java | 25 +-
.../test_builtin_analyzer_in_custom_analyzer.out | 49 +++
...test_builtin_analyzer_in_custom_analyzer.groovy | 140 +++++++++
7 files changed, 657 insertions(+), 57 deletions(-)
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp
index bc81f0a7685..c7c5110f1ac 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp
@@ -50,67 +50,100 @@ ReaderPtr InvertedIndexAnalyzer::create_reader(CharFilterMap& char_filter_map) {
return reader;
}
-std::shared_ptr<lucene::analysis::Analyzer> InvertedIndexAnalyzer::create_analyzer(
-        const InvertedIndexCtx* inverted_index_ctx) {
+bool InvertedIndexAnalyzer::is_builtin_analyzer(const std::string& analyzer_name) {
+ return analyzer_name == INVERTED_INDEX_PARSER_NONE ||
+ analyzer_name == INVERTED_INDEX_PARSER_STANDARD ||
+ analyzer_name == INVERTED_INDEX_PARSER_UNICODE ||
+ analyzer_name == INVERTED_INDEX_PARSER_ENGLISH ||
+ analyzer_name == INVERTED_INDEX_PARSER_CHINESE ||
+ analyzer_name == INVERTED_INDEX_PARSER_ICU ||
+ analyzer_name == INVERTED_INDEX_PARSER_BASIC ||
+ analyzer_name == INVERTED_INDEX_PARSER_IK;
+}
+
+AnalyzerPtr InvertedIndexAnalyzer::create_builtin_analyzer(InvertedIndexParserType parser_type,
+                                                           const std::string& parser_mode,
+                                                           const std::string& lower_case,
+                                                           const std::string& stop_words) {
     std::shared_ptr<lucene::analysis::Analyzer> analyzer;
-    if (!inverted_index_ctx->custom_analyzer.empty()) {
-        analyzer = doris::ExecEnv::GetInstance()->index_policy_mgr()->get_policy_by_name(
-                inverted_index_ctx->custom_analyzer);
- } else {
- auto analyser_type = inverted_index_ctx->parser_type;
- if (analyser_type == InvertedIndexParserType::PARSER_STANDARD ||
- analyser_type == InvertedIndexParserType::PARSER_UNICODE) {
-            analyzer = std::make_shared<lucene::analysis::standard95::StandardAnalyzer>();
-        } else if (analyser_type == InvertedIndexParserType::PARSER_ENGLISH) {
-            analyzer = std::make_shared<lucene::analysis::SimpleAnalyzer<char>>();
-        } else if (analyser_type == InvertedIndexParserType::PARSER_CHINESE) {
-            auto chinese_analyzer =
-                    std::make_shared<lucene::analysis::LanguageBasedAnalyzer>(L"chinese", false);
-            chinese_analyzer->initDict(config::inverted_index_dict_path);
-            auto mode = inverted_index_ctx->parser_mode;
-            if (mode == INVERTED_INDEX_PARSER_COARSE_GRANULARITY) {
-                chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::Default);
- } else {
- chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::All);
- }
- analyzer = std::move(chinese_analyzer);
- } else if (analyser_type == InvertedIndexParserType::PARSER_ICU) {
- analyzer = std::make_shared<ICUAnalyzer>();
- analyzer->initDict(config::inverted_index_dict_path + "/icu");
- } else if (analyser_type == InvertedIndexParserType::PARSER_BASIC) {
- analyzer = std::make_shared<BasicAnalyzer>();
- } else if (analyser_type == InvertedIndexParserType::PARSER_IK) {
- auto ik_analyzer = std::make_shared<IKAnalyzer>();
- ik_analyzer->initDict(config::inverted_index_dict_path + "/ik");
- auto mode = inverted_index_ctx->parser_mode;
- if (mode == INVERTED_INDEX_PARSER_SMART) {
- ik_analyzer->setMode(true);
- } else {
- ik_analyzer->setMode(false);
- }
- analyzer = std::move(ik_analyzer);
+
+ if (parser_type == InvertedIndexParserType::PARSER_STANDARD ||
+ parser_type == InvertedIndexParserType::PARSER_UNICODE) {
+        analyzer = std::make_shared<lucene::analysis::standard95::StandardAnalyzer>();
+    } else if (parser_type == InvertedIndexParserType::PARSER_ENGLISH) {
+        analyzer = std::make_shared<lucene::analysis::SimpleAnalyzer<char>>();
+    } else if (parser_type == InvertedIndexParserType::PARSER_CHINESE) {
+        auto chinese_analyzer =
+                std::make_shared<lucene::analysis::LanguageBasedAnalyzer>(L"chinese", false);
+        chinese_analyzer->initDict(config::inverted_index_dict_path);
+        if (parser_mode == INVERTED_INDEX_PARSER_COARSE_GRANULARITY) {
+            chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::Default);
         } else {
-            // default
-            analyzer = std::make_shared<lucene::analysis::SimpleAnalyzer<char>>();
+            chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::All);
}
- // set lowercase
- auto lowercase = inverted_index_ctx->lower_case;
- if (lowercase == INVERTED_INDEX_PARSER_TRUE) {
- analyzer->set_lowercase(true);
- } else if (lowercase == INVERTED_INDEX_PARSER_FALSE) {
- analyzer->set_lowercase(false);
- }
- // set stop words
- auto stop_words = inverted_index_ctx->stop_words;
- if (stop_words == "none") {
- analyzer->set_stopwords(nullptr);
+ analyzer = std::move(chinese_analyzer);
+ } else if (parser_type == InvertedIndexParserType::PARSER_ICU) {
+ analyzer = std::make_shared<ICUAnalyzer>();
+ analyzer->initDict(config::inverted_index_dict_path + "/icu");
+ } else if (parser_type == InvertedIndexParserType::PARSER_BASIC) {
+ analyzer = std::make_shared<BasicAnalyzer>();
+ } else if (parser_type == InvertedIndexParserType::PARSER_IK) {
+ auto ik_analyzer = std::make_shared<IKAnalyzer>();
+ ik_analyzer->initDict(config::inverted_index_dict_path + "/ik");
+ if (parser_mode == INVERTED_INDEX_PARSER_SMART) {
+ ik_analyzer->setMode(true);
} else {
- analyzer->set_stopwords(&lucene::analysis::standard95::stop_words);
+ ik_analyzer->setMode(false);
}
+ analyzer = std::move(ik_analyzer);
+ } else {
+ // default
+ analyzer = std::make_shared<lucene::analysis::SimpleAnalyzer<char>>();
+ }
+
+ // set lowercase
+ if (lower_case == INVERTED_INDEX_PARSER_TRUE) {
+ analyzer->set_lowercase(true);
+ } else if (lower_case == INVERTED_INDEX_PARSER_FALSE) {
+ analyzer->set_lowercase(false);
+ }
+
+ // set stop words
+ if (stop_words == "none") {
+ analyzer->set_stopwords(nullptr);
+ } else {
+ analyzer->set_stopwords(&lucene::analysis::standard95::stop_words);
}
+
return analyzer;
}
+std::shared_ptr<lucene::analysis::Analyzer> InvertedIndexAnalyzer::create_analyzer(
+        const InvertedIndexCtx* inverted_index_ctx) {
+    const std::string& analyzer_name = inverted_index_ctx->custom_analyzer;
+    if (analyzer_name.empty()) {
+        return create_builtin_analyzer(
+                inverted_index_ctx->parser_type, inverted_index_ctx->parser_mode,
+                inverted_index_ctx->lower_case, inverted_index_ctx->stop_words);
+    }
+
+    if (is_builtin_analyzer(analyzer_name)) {
+        InvertedIndexParserType parser_type =
+                get_inverted_index_parser_type_from_string(analyzer_name);
+        return create_builtin_analyzer(parser_type, inverted_index_ctx->parser_mode,
+                                       inverted_index_ctx->lower_case,
+                                       inverted_index_ctx->stop_words);
+ }
+
+ auto* index_policy_mgr = doris::ExecEnv::GetInstance()->index_policy_mgr();
+ if (!index_policy_mgr) {
+ throw Exception(ErrorCode::INVERTED_INDEX_ANALYZER_ERROR,
+ "Index policy manager is not initialized");
+ }
+
+ return index_policy_mgr->get_policy_by_name(analyzer_name);
+}
+
std::vector<TermInfo> InvertedIndexAnalyzer::get_analyse_result(
ReaderPtr reader, lucene::analysis::Analyzer* analyzer) {
std::vector<TermInfo> analyse_result;
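
With the dispatch above, a built-in name supplied as the "analyzer" of an inverted index is
mapped straight to the corresponding built-in parser on the BE instead of being looked up as
an index policy. A sketch of the user-facing effect, mirroring the table defined in the
regression suite further down (table and index names are illustrative):

    CREATE TABLE t_builtin_analyzer (
        `id` int(11) NOT NULL,
        `url` text NULL,
        INDEX idx_url (`url`) USING INVERTED PROPERTIES("support_phrase" = "true", "analyzer" = "basic")
    ) ENGINE=OLAP
    DUPLICATE KEY(`id`)
    DISTRIBUTED BY HASH(`id`) BUCKETS 1
    PROPERTIES ("replication_allocation" = "tag.location.default: 1");

    -- "basic" resolves to the built-in basic analyzer; no CREATE INVERTED INDEX ANALYZER is needed
    SELECT * FROM t_builtin_analyzer WHERE url MATCH 'logo' ORDER BY id;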
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h
index df777e88889..71f575c4c83 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h
@@ -36,12 +36,19 @@ class Analyzer;
} // namespace lucene
namespace doris::segment_v2::inverted_index {
+
+using AnalyzerPtr = std::shared_ptr<lucene::analysis::Analyzer>;
+
class InvertedIndexAnalyzer {
public:
static ReaderPtr create_reader(CharFilterMap& char_filter_map);
- static std::shared_ptr<lucene::analysis::Analyzer> create_analyzer(
- const InvertedIndexCtx* inverted_index_ctx);
+ static bool is_builtin_analyzer(const std::string& analyzer_name);
+    static AnalyzerPtr create_builtin_analyzer(InvertedIndexParserType parser_type,
+ const std::string& parser_mode,
+ const std::string& lower_case,
+ const std::string& stop_words);
+    static AnalyzerPtr create_analyzer(const InvertedIndexCtx* inverted_index_ctx);
static std::vector<TermInfo> get_analyse_result(ReaderPtr reader,
lucene::analysis::Analyzer* analyzer);
@@ -51,4 +58,5 @@ public:
     static bool should_analyzer(const std::map<std::string, std::string>& properties);
};
+
} // namespace doris::segment_v2::inverted_index
\ No newline at end of file
diff --git a/be/test/olap/rowset/segment_v2/inverted_index/ananlyzer/analyzer_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/ananlyzer/analyzer_test.cpp
new file mode 100644
index 00000000000..46fc3077419
--- /dev/null
+++ b/be/test/olap/rowset/segment_v2/inverted_index/ananlyzer/analyzer_test.cpp
@@ -0,0 +1,348 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h"
+
+#include <gtest/gtest.h>
+
+#include "gen_cpp/AgentService_types.h"
+#include "olap/inverted_index_parser.h"
+#include "olap/rowset/segment_v2/inverted_index/util/reader.h"
+#include "runtime/exec_env.h"
+#include "runtime/index_policy/index_policy_mgr.h"
+
+namespace doris::segment_v2::inverted_index {
+
+class AnalyzerTest : public ::testing::Test {
+protected:
+    void SetUp() override { _index_policy_mgr = std::make_unique<IndexPolicyMgr>(); }
+
+ void TearDown() override { _index_policy_mgr.reset(); }
+
+ void SetupCustomAnalyzerPolicies() {
+ TIndexPolicy tokenizer;
+ tokenizer.id = 1;
+ tokenizer.name = "test_tokenizer";
+ tokenizer.type = TIndexPolicyType::TOKENIZER;
+ tokenizer.properties["type"] = "standard";
+
+ TIndexPolicy filter;
+ filter.id = 2;
+ filter.name = "test_filter";
+ filter.type = TIndexPolicyType::TOKEN_FILTER;
+ filter.properties["type"] = "lowercase";
+
+ TIndexPolicy analyzer;
+ analyzer.id = 3;
+ analyzer.name = "test_custom_analyzer";
+ analyzer.type = TIndexPolicyType::ANALYZER;
+ analyzer.properties["tokenizer"] = "test_tokenizer";
+ analyzer.properties["token_filter"] = "test_filter";
+
+ std::vector<TIndexPolicy> policies = {tokenizer, filter, analyzer};
+ _index_policy_mgr->apply_policy_changes(policies, {});
+ }
+
+ std::unique_ptr<IndexPolicyMgr> _index_policy_mgr;
+};
+
+// ==================== Combined test for is_builtin_analyzer and create_builtin_analyzer ====================
+
+TEST_F(AnalyzerTest, TestBuiltinAnalyzers) {
+    // Test all builtin analyzer names with is_builtin_analyzer and create_builtin_analyzer together
+ struct BuiltinAnalyzerTestCase {
+ std::string name;
+ InvertedIndexParserType parser_type;
+ std::string parser_mode;
+        bool requires_dict; // Flag to indicate if dictionary files are required
+ };
+
+ std::vector<BuiltinAnalyzerTestCase> builtin_cases = {
+            {INVERTED_INDEX_PARSER_NONE, InvertedIndexParserType::PARSER_NONE, "", false},
+            {INVERTED_INDEX_PARSER_STANDARD, InvertedIndexParserType::PARSER_STANDARD, "", false},
+            {INVERTED_INDEX_PARSER_UNICODE, InvertedIndexParserType::PARSER_UNICODE, "", false},
+            {INVERTED_INDEX_PARSER_ENGLISH, InvertedIndexParserType::PARSER_ENGLISH, "", false},
+            {INVERTED_INDEX_PARSER_CHINESE, InvertedIndexParserType::PARSER_CHINESE,
+             INVERTED_INDEX_PARSER_COARSE_GRANULARITY, true},
+            {INVERTED_INDEX_PARSER_ICU, InvertedIndexParserType::PARSER_ICU, "", true},
+            {INVERTED_INDEX_PARSER_BASIC, InvertedIndexParserType::PARSER_BASIC, "", false},
+ {INVERTED_INDEX_PARSER_IK, InvertedIndexParserType::PARSER_IK,
+ INVERTED_INDEX_PARSER_SMART, true}};
+
+ // Test all builtin analyzers
+ for (const auto& test_case : builtin_cases) {
+ // Test is_builtin_analyzer returns true
+ EXPECT_TRUE(InvertedIndexAnalyzer::is_builtin_analyzer(test_case.name))
+ << "Failed for: " << test_case.name;
+
+ // Test create_builtin_analyzer works
+ // For analyzers that require dict files, allow exception
+ if (test_case.requires_dict) {
+ try {
+ auto analyzer = InvertedIndexAnalyzer::create_builtin_analyzer(
+ test_case.parser_type, test_case.parser_mode, "", "");
+ // If dict exists, analyzer should not be null
+ EXPECT_NE(analyzer, nullptr)
+                        << "Created analyzer for: " << test_case.name << " (dict available)";
+ } catch (const std::exception& e) {
+ // If dict doesn't exist, allow the exception and log it
+ LOG(INFO) << "Skipped creating " << test_case.name
+ << " due to missing dict: " << e.what();
+ }
+ } else {
+ auto analyzer = InvertedIndexAnalyzer::create_builtin_analyzer(
+ test_case.parser_type, test_case.parser_mode, "", "");
+            EXPECT_NE(analyzer, nullptr) << "Failed to create analyzer for: " << test_case.name;
+ }
+ }
+
+ // Test non-builtin names return false
+    EXPECT_FALSE(InvertedIndexAnalyzer::is_builtin_analyzer("my_custom_analyzer"));
+    EXPECT_FALSE(InvertedIndexAnalyzer::is_builtin_analyzer(""));
+    EXPECT_FALSE(InvertedIndexAnalyzer::is_builtin_analyzer("Standard")); // case sensitive
+
+ // Test different parser modes (Chinese with fine granularity)
+ try {
+ auto chinese_fine = InvertedIndexAnalyzer::create_builtin_analyzer(
+                InvertedIndexParserType::PARSER_CHINESE, INVERTED_INDEX_PARSER_FINE_GRANULARITY, "",
+                "");
+ EXPECT_NE(chinese_fine, nullptr);
+ } catch (const std::exception& e) {
+        LOG(INFO) << "Skipped Chinese fine granularity test due to missing dict: " << e.what();
+ }
+
+ // Test IK with max word mode
+ try {
+ auto ik_maxword = InvertedIndexAnalyzer::create_builtin_analyzer(
+                InvertedIndexParserType::PARSER_IK, INVERTED_INDEX_PARSER_MAX_WORD, "", "");
+ EXPECT_NE(ik_maxword, nullptr);
+ } catch (const std::exception& e) {
+        LOG(INFO) << "Skipped IK max word test due to missing dict: " << e.what();
+ }
+
+    // Test lowercase and stopwords settings (using STANDARD which doesn't require dict)
+    auto with_lower = InvertedIndexAnalyzer::create_builtin_analyzer(
+            InvertedIndexParserType::PARSER_STANDARD, "", INVERTED_INDEX_PARSER_TRUE, "");
+ EXPECT_NE(with_lower, nullptr);
+
+ auto without_lower = InvertedIndexAnalyzer::create_builtin_analyzer(
+            InvertedIndexParserType::PARSER_STANDARD, "", INVERTED_INDEX_PARSER_FALSE, "");
+ EXPECT_NE(without_lower, nullptr);
+
+ auto with_stopwords = InvertedIndexAnalyzer::create_builtin_analyzer(
+ InvertedIndexParserType::PARSER_STANDARD, "", "", "none");
+ EXPECT_NE(with_stopwords, nullptr);
+
+ // Test unknown parser type falls back to default
+ auto unknown = InvertedIndexAnalyzer::create_builtin_analyzer(
+ InvertedIndexParserType::PARSER_UNKNOWN, "", "", "");
+ EXPECT_NE(unknown, nullptr);
+}
+
+// ==================== Combined test for create_analyzer ====================
+
+TEST_F(AnalyzerTest, TestCreateAnalyzer) {
+ // Test Case 1: Empty custom_analyzer, use builtin parser_type
+ {
+ InvertedIndexCtx ctx;
+ ctx.custom_analyzer = "";
+ ctx.parser_type = InvertedIndexParserType::PARSER_STANDARD;
+ ctx.parser_mode = "";
+ ctx.lower_case = INVERTED_INDEX_PARSER_TRUE;
+ ctx.stop_words = "none";
+
+ auto analyzer = InvertedIndexAnalyzer::create_analyzer(&ctx);
+ EXPECT_NE(analyzer, nullptr);
+ }
+
+    // Test Case 2: custom_analyzer is a builtin name (using one that doesn't need dict)
+ {
+ InvertedIndexCtx ctx;
+ ctx.custom_analyzer = INVERTED_INDEX_PARSER_ENGLISH;
+ ctx.parser_type = InvertedIndexParserType::PARSER_UNKNOWN;
+ ctx.parser_mode = "";
+ ctx.lower_case = INVERTED_INDEX_PARSER_FALSE;
+ ctx.stop_words = "";
+
+ auto analyzer = InvertedIndexAnalyzer::create_analyzer(&ctx);
+ EXPECT_NE(analyzer, nullptr);
+ }
+
+ // Test Case 3: Test all builtin names work through create_analyzer
+ std::vector<std::pair<std::string, bool>> builtin_names = {
+ {INVERTED_INDEX_PARSER_STANDARD, false},
+ {INVERTED_INDEX_PARSER_UNICODE, false},
+ {INVERTED_INDEX_PARSER_ENGLISH, false},
+ {INVERTED_INDEX_PARSER_CHINESE, true}, // requires dict
+ {INVERTED_INDEX_PARSER_ICU, true}, // requires dict
+ {INVERTED_INDEX_PARSER_BASIC, false},
+ {INVERTED_INDEX_PARSER_IK, true} // requires dict
+ };
+
+ for (const auto& [name, requires_dict] : builtin_names) {
+ InvertedIndexCtx ctx;
+ ctx.custom_analyzer = name;
+ ctx.parser_type = InvertedIndexParserType::PARSER_UNKNOWN;
+ ctx.parser_mode = "";
+ ctx.lower_case = "";
+ ctx.stop_words = "";
+
+ if (requires_dict) {
+ try {
+ auto analyzer = InvertedIndexAnalyzer::create_analyzer(&ctx);
+                EXPECT_NE(analyzer, nullptr) << "Created analyzer for builtin name: " << name;
+ } catch (const std::exception& e) {
+                LOG(INFO) << "Skipped " << name << " due to missing dict: " << e.what();
+ }
+ } else {
+ auto analyzer = InvertedIndexAnalyzer::create_analyzer(&ctx);
+            EXPECT_NE(analyzer, nullptr) << "Failed for builtin name: " << name;
+ }
+ }
+
+ // Test Case 4: Test with different parser types
+ std::vector<std::pair<InvertedIndexParserType, bool>> parser_types = {
+ {InvertedIndexParserType::PARSER_STANDARD, false},
+ {InvertedIndexParserType::PARSER_UNICODE, false},
+ {InvertedIndexParserType::PARSER_ENGLISH, false},
+ {InvertedIndexParserType::PARSER_CHINESE, true}, // requires dict
+ {InvertedIndexParserType::PARSER_ICU, true}, // requires dict
+ {InvertedIndexParserType::PARSER_BASIC, false},
+ {InvertedIndexParserType::PARSER_IK, true} // requires dict
+ };
+
+ for (const auto& [parser_type, requires_dict] : parser_types) {
+ InvertedIndexCtx ctx;
+ ctx.custom_analyzer = "";
+ ctx.parser_type = parser_type;
+ ctx.parser_mode = "";
+ ctx.lower_case = "";
+ ctx.stop_words = "";
+
+ if (requires_dict) {
+ try {
+ auto analyzer = InvertedIndexAnalyzer::create_analyzer(&ctx);
+ EXPECT_NE(analyzer, nullptr)
+                        << "Created analyzer for parser_type: " << static_cast<int>(parser_type);
+ } catch (const std::exception& e) {
+                LOG(INFO) << "Skipped parser_type " << static_cast<int>(parser_type)
+ << " due to missing dict: " << e.what();
+ }
+ } else {
+ auto analyzer = InvertedIndexAnalyzer::create_analyzer(&ctx);
+ EXPECT_NE(analyzer, nullptr)
+                    << "Failed for parser_type: " << static_cast<int>(parser_type);
+ }
+ }
+}
+
+// ==================== Test create_analyzer with index_policy_mgr ====================
+
+TEST_F(AnalyzerTest, TestCreateAnalyzerWithCustomPolicy) {
+ // Test when index_policy_mgr is null - should throw exception
+ {
+ InvertedIndexCtx ctx;
+ ctx.custom_analyzer = "non_existent_custom";
+ ctx.parser_type = InvertedIndexParserType::PARSER_UNKNOWN;
+ ctx.parser_mode = "";
+ ctx.lower_case = "";
+ ctx.stop_words = "";
+
+ if (!doris::ExecEnv::GetInstance()->index_policy_mgr()) {
+ EXPECT_THROW(
+ {
+ try {
+ InvertedIndexAnalyzer::create_analyzer(&ctx);
+ } catch (const Exception& e) {
+                            EXPECT_EQ(e.code(), ErrorCode::INVERTED_INDEX_ANALYZER_ERROR);
+ EXPECT_TRUE(std::string(e.what()).find(
+                                    "Index policy manager is not initialized") != std::string::npos);
+ throw;
+ }
+ },
+ Exception);
+ }
+ }
+
+ // Test with properly configured index_policy_mgr
+ auto* mgr = doris::ExecEnv::GetInstance()->index_policy_mgr();
+ if (mgr) {
+ SetupCustomAnalyzerPolicies();
+
+ // Test successful custom analyzer retrieval
+ {
+ InvertedIndexCtx ctx;
+ ctx.custom_analyzer = "test_custom_analyzer";
+ ctx.parser_type = InvertedIndexParserType::PARSER_UNKNOWN;
+ ctx.parser_mode = "";
+ ctx.lower_case = "";
+ ctx.stop_words = "";
+
+ auto analyzer = InvertedIndexAnalyzer::create_analyzer(&ctx);
+ EXPECT_NE(analyzer, nullptr);
+ }
+
+ // Test non-existent custom analyzer throws exception
+ {
+ InvertedIndexCtx ctx;
+ ctx.custom_analyzer = "non_existent_analyzer";
+ ctx.parser_type = InvertedIndexParserType::PARSER_UNKNOWN;
+ ctx.parser_mode = "";
+ ctx.lower_case = "";
+ ctx.stop_words = "";
+
+            EXPECT_THROW(InvertedIndexAnalyzer::create_analyzer(&ctx), Exception);
+ }
+ }
+}
+
+// ==================== Integration test ====================
+
+TEST_F(AnalyzerTest, TestAnalyzerFunctionality) {
+ // Create an analyzer and test it can tokenize text properly
+ InvertedIndexCtx ctx;
+ ctx.custom_analyzer = "";
+ ctx.parser_type = InvertedIndexParserType::PARSER_STANDARD;
+ ctx.parser_mode = "";
+ ctx.lower_case = INVERTED_INDEX_PARSER_TRUE;
+ ctx.stop_words = "none";
+
+ auto analyzer = InvertedIndexAnalyzer::create_analyzer(&ctx);
+ ASSERT_NE(analyzer, nullptr);
+
+ // Test tokenization
+ auto reader = std::make_shared<lucene::util::SStringReader<char>>();
+ std::string text = "Hello World Test";
+ reader->init(text.data(), static_cast<int32_t>(text.size()), true);
+
+    auto result = InvertedIndexAnalyzer::get_analyse_result(reader, analyzer.get());
+ EXPECT_GT(result.size(), 0);
+
+ // Verify tokens are not empty
+ for (const auto& term_info : result) {
+ // term is a variant, need to check based on its actual type
+ if (term_info.is_single_term()) {
+ EXPECT_FALSE(term_info.get_single_term().empty());
+ } else {
+ EXPECT_FALSE(term_info.get_multi_terms().empty());
+ }
+ EXPECT_GE(term_info.position, 0);
+ }
+}
+
+} // namespace doris::segment_v2::inverted_index
\ No newline at end of file
diff --git a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicy.java b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicy.java
index 7abfe53ff4e..56dd885bb36 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicy.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicy.java
@@ -67,6 +67,9 @@ public class IndexPolicy implements Writable, GsonPostProcessable {
public static final Set<String> BUILTIN_CHAR_FILTERS = ImmutableSet.of(
"empty", "char_replace");
+ public static final Set<String> BUILTIN_ANALYZERS = ImmutableSet.of(
+            "none", "standard", "unicode", "english", "chinese", "icu", "basic", "ik");
+
private static final Logger LOG = LogManager.getLogger(IndexPolicy.class);
@SerializedName(value = "id")
diff --git a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyMgr.java b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyMgr.java
index 04205605669..876fe2483fc 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyMgr.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyMgr.java
@@ -45,7 +45,9 @@ import java.io.DataOutput;
import java.io.IOException;
import java.util.List;
import java.util.Map;
+import java.util.Set;
import java.util.concurrent.locks.ReentrantReadWriteLock;
+import java.util.stream.Collectors;
public class IndexPolicyMgr implements Writable, GsonPostProcessable {
     private static final Logger LOG = LogManager.getLogger(IndexPolicyMgr.class);
@@ -84,6 +86,11 @@ public class IndexPolicyMgr implements Writable, GsonPostProcessable {
     }
     public void validateAnalyzerExists(String analyzerName) throws DdlException {
+ // Allow built-in analyzers
+ if (IndexPolicy.BUILTIN_ANALYZERS.contains(analyzerName)) {
+ return;
+ }
+
readLock();
try {
IndexPolicy policy = nameToIndexPolicy.get(analyzerName);
@@ -115,6 +122,9 @@ public class IndexPolicyMgr implements Writable, GsonPostProcessable {
         if (IndexPolicy.BUILTIN_CHAR_FILTERS.contains(policyName)) {
             throw new DdlException("Policy name '" + policyName + "' conflicts with built-in char filter name");
         }
+        if (IndexPolicy.BUILTIN_ANALYZERS.contains(policyName)) {
+            throw new DdlException("Policy name '" + policyName + "' conflicts with built-in analyzer name");
+        }
         IndexPolicy indexPolicy = IndexPolicy.create(policyName, type, properties);
@@ -267,8 +277,11 @@ public class IndexPolicyMgr implements Writable, GsonPostProcessable {
validator = new CharGroupTokenizerValidator();
break;
default:
+                Set<String> userFacingTypes = IndexPolicy.BUILTIN_TOKEN_FILTERS.stream()
+                        .filter(t -> !t.equals("empty"))
+                        .collect(Collectors.toSet());
                 throw new DdlException("Unsupported tokenizer type: " + type
-                        + ". Supported types: " + IndexPolicy.BUILTIN_TOKENIZERS);
+                        + ". Supported types: " + userFacingTypes);
}
validator.validate(properties);
}
@@ -293,8 +306,11 @@ public class IndexPolicyMgr implements Writable, GsonPostProcessable {
validator = new NoOperationValidator("lowercase token filter");
break;
default:
+                Set<String> userFacingTypes = IndexPolicy.BUILTIN_TOKEN_FILTERS.stream()
+                        .filter(t -> !t.equals("empty"))
+                        .collect(Collectors.toSet());
                 throw new DdlException("Unsupported token filter type: " + type
-                        + ". Supported types: " + IndexPolicy.BUILTIN_TOKEN_FILTERS);
+                        + ". Supported types: " + userFacingTypes);
}
validator.validate(properties);
}
@@ -313,8 +329,11 @@ public class IndexPolicyMgr implements Writable, GsonPostProcessable {
validator = new CharReplaceCharFilterValidator();
break;
default:
+                Set<String> userFacingTypes = IndexPolicy.BUILTIN_CHAR_FILTERS.stream()
+                        .filter(t -> !t.equals("empty"))
+                        .collect(Collectors.toSet());
                 throw new DdlException("Unsupported char filter type: " + type
-                        + ". Supported types: " + IndexPolicy.BUILTIN_CHAR_FILTERS);
+                        + ". Supported types: " + userFacingTypes);
}
validator.validate(properties);
}
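
On the FE side, validateAnalyzerExists now accepts the built-in names without a policy lookup,
while creating a policy under one of those names is rejected. A short illustration, following
the negative case exercised by the regression test below:

    -- rejected: the policy name collides with a built-in analyzer
    CREATE INVERTED INDEX ANALYZER standard
    PROPERTIES ("tokenizer" = "basic", "token_filter" = "lowercase");
    -- ERROR: Policy name 'standard' conflicts with built-in analyzer name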
diff --git a/regression-test/data/inverted_index_p0/analyzer/test_builtin_analyzer_in_custom_analyzer.out b/regression-test/data/inverted_index_p0/analyzer/test_builtin_analyzer_in_custom_analyzer.out
new file mode 100644
index 00000000000..e72db5c6e79
--- /dev/null
+++ b/regression-test/data/inverted_index_p0/analyzer/test_builtin_analyzer_in_custom_analyzer.out
@@ -0,0 +1,49 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !tokenize_standard --
+[{\n    "token": "apache"\n  }, {\n    "token": "doris"\n  }, {\n    "token": "fast"\n  }, {\n    "token": "mpp"\n  }, {\n    "token": "database"\n  }]
+
+-- !tokenize_unicode --
+[{\n    "token": "hello"\n  }, {\n    "token": "world"\n  }, {\n    "token": "你"\n  }, {\n    "token": "好"\n  }, {\n    "token": "世"\n  }, {\n    "token": "界"\n  }]
+
+-- !tokenize_basic --
+[{\n    "token": "get"\n  }, {\n    "token": "images"\n  }, {\n    "token": "test"\n  }, {\n    "token": "jpg"\n  }, {\n    "token": "http"\n  }, {\n    "token": "1"\n  }, {\n    "token": "0"\n  }]
+
+-- !tokenize_icu --
+[{\n    "token": "让"\n  }, {\n    "token": "我们"\n  }, {\n    "token": "说"\n  }, {\n    "token": "hello"\n  }, {\n    "token": "そして"\n  }, {\n    "token": "世界"\n  }, {\n    "token": "と"\n  }, {\n    "token": "つ"\n  }, {\n    "token": "な"\n  }, {\n    "token": "が"\n  }, {\n    "token": "ろう"\n  }]
+
+-- !tokenize_chinese_standard --
+[{\n    "token": "apache"\n  }, {\n    "token": "doris"\n  }, {\n    "token": "是"\n  }, {\n    "token": "一"\n  }, {\n    "token": "个"\n  }, {\n    "token": "现"\n  }, {\n    "token": "代"\n  }, {\n    "token": "化"\n  }, {\n    "token": "的"\n  }, {\n    "token": "mpp"\n  }, {\n    "token": "数"\n  }, {\n    "token": "据"\n  }, {\n    "token": "库"\n  }]
+
+-- !tokenize_chinese_basic --
+[{\n    "token": "apache"\n  }, {\n    "token": "doris"\n  }, {\n    "token": "是"\n  }, {\n    "token": "一"\n  }, {\n    "token": "个"\n  }, {\n    "token": "现"\n  }, {\n    "token": "代"\n  }, {\n    "token": "化"\n  }, {\n    "token": "的"\n  }, {\n    "token": "mpp"\n  }, {\n    "token": "数"\n  }, {\n    "token": "据"\n  }, {\n    "token": "库"\n  }]
+
+-- !tokenize_chinese_icu --
+[{\n    "token": "apache"\n  }, {\n    "token": "doris"\n  }, {\n    "token": "是"\n  }, {\n    "token": "一个"\n  }, {\n    "token": "现代"\n  }, {\n    "token": "化"\n  }, {\n    "token": "的"\n  }, {\n    "token": "mpp"\n  }, {\n    "token": "数据"\n  }, {\n    "token": "库"\n  }]
+
+-- !tokenize_special_standard --
+[{\n    "token": "test"\n  }, {\n    "token": "example.com"\n  }, {\n    "token": "user_name"\n  }, {\n    "token": "123"\n  }, {\n    "token": "456"\n  }]
+
+-- !tokenize_empty --
+
+
+-- !tokenize_mixed --
+[{\n    "token": "中文"\n  }, {\n    "token": "english"\n  }, {\n    "token": "日本語"\n  }, {\n    "token": "한국어"\n  }]
+
+-- !sql_basic_match_logo --
+1 GET /images/logo.png HTTP/1.0
+
+-- !sql_basic_match_images --
+1 GET /images/logo.png HTTP/1.0
+
+-- !sql_basic_match_api --
+2 POST /api/v1/users HTTP/1.1
+
+-- !sql_basic_match_logo --
+1 GET /images/logo.png HTTP/1.0
+
+-- !sql_basic_match_images --
+1 GET /images/logo.png HTTP/1.0
+
+-- !sql_basic_match_api --
+2 POST /api/v1/users HTTP/1.1
+
diff --git a/regression-test/suites/inverted_index_p0/analyzer/test_builtin_analyzer_in_custom_analyzer.groovy b/regression-test/suites/inverted_index_p0/analyzer/test_builtin_analyzer_in_custom_analyzer.groovy
new file mode 100644
index 00000000000..25e9cda1a40
--- /dev/null
+++ b/regression-test/suites/inverted_index_p0/analyzer/test_builtin_analyzer_in_custom_analyzer.groovy
@@ -0,0 +1,140 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_builtin_analyzer_in_custom_analyzer", "p0") {
+ // Define all built-in analyzers
+    def builtinAnalyzers = ["none", "standard", "unicode", "english", "chinese", "icu", "basic", "ik"]
+
+    // Helper function to test that creating analyzer with builtin name should fail
+ def testBuiltinAnalyzerNameConflict = { String analyzerName ->
+ test {
+ sql """
+ CREATE INVERTED INDEX ANALYZER IF NOT EXISTS ${analyzerName}
+ PROPERTIES
+ (
+ "tokenizer" = "basic",
+ "token_filter" = "lowercase"
+ );
+ """
+ exception "conflicts with built-in"
+ }
+ }
+
+ // Test: Cannot create custom analyzer with built-in analyzer names
+    logger.info("Testing that built-in analyzer names cannot be used for custom analyzers")
+
+ builtinAnalyzers.each { analyzerName ->
+ logger.info("Testing conflict with built-in analyzer: ${analyzerName}")
+ testBuiltinAnalyzerNameConflict(analyzerName)
+ }
+
+ // Define tokenize test cases: [testName, testText, analyzerName]
+ def tokenizeTestCases = [
+ ["standard", "Apache Doris is a fast MPP database", "standard"],
+ ["unicode", "Hello World 你好世界", "unicode"],
+ ["basic", "GET /images/test.jpg HTTP/1.0", "basic"],
+ ["icu", "让我们说「Hello」そして世界とつながろう!", "icu"],
+ ["chinese_standard", "Apache Doris是一个现代化的MPP数据库", "standard"],
+ ["chinese_basic", "Apache Doris是一个现代化的MPP数据库", "basic"],
+ ["chinese_icu", "Apache Doris是一个现代化的MPP数据库", "icu"],
+ ["special_standard", "[email protected] user_name 123-456", "standard"],
+ ["empty", "", "standard"],
+ ["mixed", "中文English日本語한국어", "icu"]
+ ]
+
+ // Execute tokenize tests in a loop
+ tokenizeTestCases.each { testCase ->
+ def testName = testCase[0]
+ def testText = testCase[1]
+ def analyzerName = testCase[2]
+
+ logger.info("Testing tokenize with ${analyzerName}: ${testName}")
+ "qt_tokenize_${testName}"("""
+ select tokenize("${testText}", '"analyzer"="${analyzerName}"');
+ """)
+ }
+
+ // Test table creation with analyzer_with_standard
+ def indexTblName = "test_builtin_analyzer_table"
+
+ sql "DROP TABLE IF EXISTS ${indexTblName}"
+
+ sql """
+ CREATE TABLE ${indexTblName} (
+ `id` int(11) NOT NULL,
+ `url` text NULL,
+            INDEX idx_url (`url`) USING INVERTED PROPERTIES("support_phrase" = "true", "analyzer" = "basic")
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`id`)
+ DISTRIBUTED BY HASH(`id`) BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1"
+ );
+ """
+
+ // Insert test data
+    sql """ INSERT INTO ${indexTblName} VALUES (1, 'GET /images/logo.png HTTP/1.0'); """
+    sql """ INSERT INTO ${indexTblName} VALUES (2, 'POST /api/v1/users HTTP/1.1'); """
+    sql """ INSERT INTO ${indexTblName} VALUES (3, 'GET /docs/index.html HTTP/1.0'); """
+
+ try {
+ sql "sync"
+ sql """ set enable_common_expr_pushdown = true; """
+
+ // Test MATCH queries with analyzer_with_standard
+        qt_sql_basic_match_logo """ SELECT * FROM ${indexTblName} WHERE url MATCH 'logo' ORDER BY id; """
+        qt_sql_basic_match_images """ SELECT * FROM ${indexTblName} WHERE url MATCH 'images' ORDER BY id; """
+        qt_sql_basic_match_api """ SELECT * FROM ${indexTblName} WHERE url MATCH 'api' ORDER BY id; """
+
+ } finally {
+ }
+
+ // Test with another analyzer - analyzer_with_basic
+ def indexTblName2 = "test_builtin_analyzer_table_basic"
+
+ sql "DROP TABLE IF EXISTS ${indexTblName2}"
+
+ sql """
+ CREATE TABLE ${indexTblName2} (
+ `id` int(11) NOT NULL,
+ `url` text NULL,
+            INDEX idx_url (`url`) USING INVERTED PROPERTIES("support_phrase" = "true", "analyzer" = "basic")
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`id`)
+ DISTRIBUTED BY HASH(`id`) BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1"
+ );
+ """
+
+ // Insert URL test data
+    sql """ INSERT INTO ${indexTblName2} VALUES (1, 'GET /images/logo.png HTTP/1.0'); """
+    sql """ INSERT INTO ${indexTblName2} VALUES (2, 'POST /api/v1/users HTTP/1.1'); """
+    sql """ INSERT INTO ${indexTblName2} VALUES (3, 'GET /docs/index.html HTTP/1.0'); """
+
+ try {
+ sql "sync"
+ sql """ set enable_common_expr_pushdown = true; """
+
+ // Test basic analyzer on URL-like strings
+        qt_sql_basic_match_logo """ SELECT * FROM ${indexTblName2} WHERE url MATCH 'logo' ORDER BY id; """
+        qt_sql_basic_match_images """ SELECT * FROM ${indexTblName2} WHERE url MATCH 'images' ORDER BY id; """
+        qt_sql_basic_match_api """ SELECT * FROM ${indexTblName2} WHERE url MATCH 'api' ORDER BY id; """
+
+ } finally {
+ }
+}
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]