This is an automated email from the ASF dual-hosted git repository.
jianliangqi pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene by this push:
new a23a45e6e1 [Fix](analyzer) add reader ownership for chinese and
standard analyzer (#223)
a23a45e6e1 is described below
commit a23a45e6e1846a8e82194a94f1678e006d638c31
Author: airborne12 <[email protected]>
AuthorDate: Thu Jun 13 10:54:12 2024 +0800
[Fix](analyzer) add reader ownership for chinese and standard analyzer
(#223)
---
src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp | 2 +-
src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp | 4 +++-
src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h | 2 +-
src/core/CLucene/analysis/standard95/StandardAnalyzer.h | 3 ++-
src/core/CLucene/analysis/standard95/StandardTokenizer.h | 4 +++-
5 files changed, 10 insertions(+), 5 deletions(-)
diff --git a/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp
b/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp
index 6adfcf1e34..2f2af354d5 100644
--- a/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp
+++ b/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp
@@ -121,7 +121,7 @@ TokenStream *LanguageBasedAnalyzer::tokenStream(const TCHAR
*fieldName, Reader *
if (_tcscmp(lang, _T("cjk")) == 0) {
ret = _CLNEW CL_NS2(analysis, cjk)::CJKTokenizer(reader);
} else if (_tcscmp(lang, _T("chinese")) == 0) {
- ret = _CLNEW CL_NS2(analysis, jieba)::ChineseTokenizer(reader, mode,
Analyzer::_lowercase);
+ ret = _CLNEW CL_NS2(analysis, jieba)::ChineseTokenizer(reader, mode,
Analyzer::_lowercase, Analyzer::_ownReader);
} else {
CL_NS(util)::BufferedReader* bufferedReader =
reader->__asBufferedReader();
diff --git a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
index ef46315ff5..3aa5e32a60 100644
--- a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
+++ b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
@@ -10,11 +10,13 @@ CL_NS_USE(util)
ChineseTokenizer::ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode
m) : Tokenizer(reader), mode(m) {
reset(reader);
Tokenizer::lowercase = false;
+ Tokenizer::ownReader = false;
}
-ChineseTokenizer::ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode
m, bool lowercase) : Tokenizer(reader), mode(m) {
+ChineseTokenizer::ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode
m, bool lowercase, bool ownReader) : Tokenizer(reader), mode(m) {
reset(reader);
Tokenizer::lowercase = lowercase;
+ Tokenizer::ownReader = ownReader;
}
void ChineseTokenizer::init(const ChineseDict* chineseDict) {
diff --git a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h
b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h
index 09760b7b1c..b973aabc1d 100644
--- a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h
+++ b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h
@@ -56,7 +56,7 @@ private:
public:
// Constructor
explicit ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode mode);
- explicit ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode mode,
bool lowercase);
+ explicit ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode mode,
bool lowercase, bool ownReader=false);
static void init(const ChineseDict* chineseDict);
// Destructor
diff --git a/src/core/CLucene/analysis/standard95/StandardAnalyzer.h
b/src/core/CLucene/analysis/standard95/StandardAnalyzer.h
index ccfd1030e1..60764abb41 100644
--- a/src/core/CLucene/analysis/standard95/StandardAnalyzer.h
+++ b/src/core/CLucene/analysis/standard95/StandardAnalyzer.h
@@ -8,6 +8,7 @@ class StandardAnalyzer : public Analyzer {
public:
StandardAnalyzer() : Analyzer() {
_lowercase = true;
+ _ownReader = false;
_stopwords = nullptr;
}
@@ -15,7 +16,7 @@ class StandardAnalyzer : public Analyzer {
TokenStream* tokenStream(const TCHAR* fieldName,
lucene::util::Reader* reader) override {
- return _CLNEW StandardTokenizer(reader, _lowercase, _stopwords);
+ return _CLNEW StandardTokenizer(reader, _lowercase, _stopwords,
_ownReader);
}
TokenStream* reusableTokenStream(const TCHAR* fieldName,
diff --git a/src/core/CLucene/analysis/standard95/StandardTokenizer.h
b/src/core/CLucene/analysis/standard95/StandardTokenizer.h
index 431673f00e..62c8b2d0ad 100644
--- a/src/core/CLucene/analysis/standard95/StandardTokenizer.h
+++ b/src/core/CLucene/analysis/standard95/StandardTokenizer.h
@@ -23,13 +23,15 @@ class StandardTokenizer : public Tokenizer {
: Tokenizer(in) {
scanner_ = std::make_unique<StandardTokenizerImpl>(in);
Tokenizer::lowercase = true;
+ Tokenizer::ownReader = false;
Tokenizer::stopwords = nullptr;
}
- StandardTokenizer(lucene::util::Reader* in, bool lowercase,
std::unordered_set<std::string_view>* stopwords)
+ StandardTokenizer(lucene::util::Reader* in, bool lowercase,
std::unordered_set<std::string_view>* stopwords, bool ownReader=false)
: Tokenizer(in) {
scanner_ = std::make_unique<StandardTokenizerImpl>(in);
Tokenizer::lowercase = lowercase;
Tokenizer::stopwords = stopwords;
+ Tokenizer::ownReader = ownReader;
}
Token* next(Token* token) override {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]