This is an automated email from the ASF dual-hosted git repository.
jianliangqi pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene by this push:
     new 63ae98a8bc [fix](chinese) fix the issue where the BE crashes due to the missing Chinese dict (#182)
63ae98a8bc is described below
commit 63ae98a8bc280dc4728dca744c3fe06e7a38caf1
Author: zzzxl <[email protected]>
AuthorDate: Thu Feb 1 18:04:26 2024 +0800
    [fix](chinese) fix the issue where the BE crashes due to the missing Chinese dict (#182)
---
.../CLucene/analysis/LanguageBasedAnalyzer.cpp | 14 +++++++++++-
.../CLucene/analysis/jieba/ChineseTokenizer.cpp | 8 +++----
.../CLucene/analysis/jieba/ChineseTokenizer.h | 25 ++++++++++++++++------
3 files changed, 35 insertions(+), 12 deletions(-)
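
The heart of the fix, shown in the first hunk below, is a pre-flight existence check: each required jieba dictionary file is probed with std::ifstream before cppjieba::Jieba is ever constructed, so a missing file now surfaces as a catchable I/O error instead of crashing the BE inside the tokenizer. A minimal standalone sketch of the same pattern (the validateDictFiles helper is illustrative, not part of the patch; the file names are taken from the diff):

    #include <fstream>
    #include <stdexcept>
    #include <string>
    #include <vector>

    // Illustrative helper mirroring the check added in
    // LanguageBasedAnalyzer::initDict: probe every required dictionary
    // file up front and fail fast if any of them cannot be opened.
    void validateDictFiles(const std::string& dictPath) {
        const std::vector<std::string> files = {
                "jieba.dict.utf8", "hmm_model.utf8", "user.dict.utf8",
                "idf.utf8", "stop_words.utf8"};
        for (const auto& file : files) {
            const std::string path = dictPath + "/" + file;
            std::ifstream in(path);
            if (!in.good()) {
                throw std::runtime_error("chinese tokenizer dict file not found: " + path);
            }
        }
    }
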
diff --git a/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp b/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp
index 2a32ff04fa..6adfcf1e34 100644
--- a/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp
+++ b/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp
@@ -4,6 +4,7 @@
* Distributable under the terms of either the Apache License (Version 2.0) or
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
+#include <fstream>
#include "CLucene/_ApiHeader.h"
#include "CLucene/analysis/Analyzers.h"
@@ -64,7 +65,18 @@ void LanguageBasedAnalyzer::setMode(AnalyzerMode m) {
 void LanguageBasedAnalyzer::initDict(const std::string &dictPath) {
     if (_tcscmp(lang, _T("chinese")) == 0) {
-        CL_NS2(analysis, jieba)::ChineseTokenizer::init(dictPath);
+        ChineseDict chineseDict;
+        chineseDict.dictPath_ = dictPath;
+
+        for (const auto& file : chineseDict.files_) {
+            std::string path = dictPath + "/" + file;
+            std::ifstream in(path);
+            if (!in.good()) {
+                _CLTHROWA(CL_ERR_IO, std::string("chinese tokenizer dict file not found: " + path).c_str());
+            }
+        }
+
+        CL_NS2(analysis, jieba)::ChineseTokenizer::init(&chineseDict);
     }
 }
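
With the check in place, pointing initDict at a directory that is missing any of the five files raises an error the caller can handle instead of a process crash. A hedged caller-side sketch (assumes _CLTHROWA throws CLuceneError as in stock CLucene, and that namespaces follow this repository's conventions; the path is illustrative):

    #include <iostream>
    // CLucene headers as included elsewhere in this repository are elided.

    // Hypothetical caller: 'analyzer' is a LanguageBasedAnalyzer already
    // set to the "chinese" language.
    void tryInitDict(lucene::analysis::LanguageBasedAnalyzer& analyzer) {
        try {
            analyzer.initDict("/path/without/dicts"); // directory lacking the *.utf8 files
        } catch (CLuceneError& e) {
            // Previously: a crash inside cppjieba. Now: a recoverable error.
            std::cerr << e.what() << std::endl;
        }
    }
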
diff --git a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
index 9a7f5eddfd..ef46315ff5 100644
--- a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
+++ b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
@@ -17,11 +17,11 @@ ChineseTokenizer::ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode m,
     Tokenizer::lowercase = lowercase;
 }

-void ChineseTokenizer::init(const std::string &dictPath) {
-    JiebaSingleton::getInstance(dictPath);
+void ChineseTokenizer::init(const ChineseDict* chineseDict) {
+    JiebaSingleton::getInstance(chineseDict);
 }

-CL_NS(analysis)::Token *ChineseTokenizer::next(lucene::analysis::Token *token) {
+CL_NS(analysis)::Token* ChineseTokenizer::next(lucene::analysis::Token* token) {
     if (bufferIndex >= dataLen) {
         return nullptr;
     }
@@ -29,7 +29,7 @@ CL_NS(analysis)::Token *ChineseTokenizer::next(lucene::analysis::Token *token) {
     std::string_view& token_text = tokens_text[bufferIndex++];
     size_t size = std::min(token_text.size(), static_cast<size_t>(LUCENE_MAX_WORD_LEN));
     if (Tokenizer::lowercase) {
-        if (!token_text.empty() && token_text[0] < 0x80) {
+        if (!token_text.empty() && static_cast<uint8_t>(token_text[0]) < 0x80) {
             std::transform(token_text.begin(), token_text.end(), const_cast<char*>(token_text.data()),
                            [](char c) { return to_lower(c); });
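
The second hunk fixes a separate, signed-char pitfall: on platforms where char is signed, UTF-8 lead bytes (0x80 and above) are negative, so the old test token_text[0] < 0x80 also passed for Chinese text and the byte-wise lowercasing could trample multi-byte sequences. Routing the comparison through uint8_t restricts it to genuine ASCII. A small demonstration, assuming a platform with signed char (e.g. x86-64 Linux):

    #include <cstdint>
    #include <iostream>

    int main() {
        char lead = static_cast<char>(0xE4); // first byte of UTF-8 "中" (E4 B8 AD)
        std::cout << (lead < 0x80) << '\n';                       // 1: old test wrongly passes
        std::cout << (static_cast<uint8_t>(lead) < 0x80) << '\n'; // 0: fixed test rejects it
        return 0;
    }
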
diff --git a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h
index 9fe33f5805..09760b7b1c 100644
--- a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h
+++ b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h
@@ -14,14 +14,25 @@
 CL_NS_DEF2(analysis,jieba)
 CL_NS_USE(analysis)

+struct ChineseDict {
+    std::string dictPath_;
+    std::vector<std::string> files_ = {
+            "jieba.dict.utf8",
+            "hmm_model.utf8",
+            "user.dict.utf8",
+            "idf.utf8",
+            "stop_words.utf8"
+    };
+};
+
 class JiebaSingleton {
 public:
-    static cppjieba::Jieba& getInstance(const std::string& dictPath = "") {
-        static cppjieba::Jieba instance(dictPath + "/" + "jieba.dict.utf8",
-                                        dictPath + "/" + "hmm_model.utf8",
-                                        dictPath + "/" + "user.dict.utf8",
-                                        dictPath + "/" + "idf.utf8",
-                                        dictPath + "/" + "stop_words.utf8");
+    static cppjieba::Jieba& getInstance(const ChineseDict* dict = nullptr) {
+        static cppjieba::Jieba instance(dict->dictPath_ + "/" + dict->files_[0],
+                                        dict->dictPath_ + "/" + dict->files_[1],
+                                        dict->dictPath_ + "/" + dict->files_[2],
+                                        dict->dictPath_ + "/" + dict->files_[3],
+                                        dict->dictPath_ + "/" + dict->files_[4]);
         return instance;
     }
@@ -46,7 +57,7 @@ public:
     // Constructor
     explicit ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode mode);
     explicit ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode mode, bool lowercase);
-    static void init(const std::string& dictPath="");
+    static void init(const ChineseDict* chineseDict);

     // Destructor
     ~ChineseTokenizer() override = default;
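
Taken together, the entry point changes shape: callers now build a ChineseDict (dictionary path plus the fixed file list) and hand it to ChineseTokenizer::init, which forwards it to the Jieba singleton. Note that getInstance still defaults its argument to nullptr and dereferences it unconditionally, so init must run once with a validated dict before any tokenization; once the function-local static has been constructed, later getInstance() calls reuse it and ignore the argument. A hedged end-to-end sketch (the dictionary path is illustrative):

    // Assumes the directory has already passed the existence checks
    // performed by LanguageBasedAnalyzer::initDict.
    ChineseDict dict;
    dict.dictPath_ = "/opt/doris/dict/jieba"; // illustrative path
    CL_NS2(analysis, jieba)::ChineseTokenizer::init(&dict);
    // The first call constructs the function-local static cppjieba::Jieba;
    // subsequent getInstance() calls return it without touching the argument.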