This is an automated email from the ASF dual-hosted git repository.
jianliangqi pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene by this push:
new 5754b41b [Optimize](chinese) Optimize chinese tokenizer index process
(#115)
5754b41b is described below
commit 5754b41bbfbf2a07971476d9390445a393961352
Author: zzzxl <[email protected]>
AuthorDate: Fri Sep 1 17:30:33 2023 +0800
[Optimize](chinese) Optimize chinese tokenizer index process (#115)
The Chinese tokenizer now uses SDocumentsWriter.
---
src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp | 7 +++++++
src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.h | 2 ++
src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp | 4 +---
src/core/CLucene/analysis/AnalysisHeader.h | 2 ++
src/core/CLucene/analysis/Analyzers.h | 3 +++
src/core/CLucene/analysis/standard95/StandardAnalyzer.h | 2 ++
src/core/CLucene/index/IndexWriter.cpp | 5 +----
7 files changed, 18 insertions(+), 7 deletions(-)
diff --git a/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp
b/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp
index 1e2a28ab..23de239d 100644
--- a/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp
+++ b/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp
@@ -41,6 +41,13 @@ LanguageBasedAnalyzer::~LanguageBasedAnalyzer() {
_CLLDELETE(stopSet);
}
+bool LanguageBasedAnalyzer::isSDocOpt() {
+ if (_tcscmp(lang, _T("chinese")) == 0) {
+ return true;
+ }
+ return false;
+}
+
void LanguageBasedAnalyzer::setStopWords(const TCHAR** stopwords) {
StopFilter::fillStopTable(stopSet, stopwords);
}
diff --git a/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.h
b/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.h
index a7f0c7cf..536a921c 100644
--- a/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.h
+++ b/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.h
@@ -41,6 +41,8 @@ class CLUCENE_CONTRIBS_EXPORT LanguageBasedAnalyzer : public
CL_NS(analysis)::An
public:
explicit LanguageBasedAnalyzer(const TCHAR *language = nullptr, bool stem
= true, AnalyzerMode mode = AnalyzerMode::All);
~LanguageBasedAnalyzer() override;
+
+ bool isSDocOpt() override;
void setStopWords(const TCHAR** stopwords);
void setLanguage(const TCHAR *language);
void setStem(bool s);
diff --git a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
index e6acd64f..ef126c97 100644
--- a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
+++ b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
@@ -51,9 +51,7 @@ CL_NS(analysis)::Token
*ChineseTokenizer::next(lucene::analysis::Token *token) {
}
if (bufferIndex < dataLen) {
auto token_text = tokens_text[bufferIndex++];
- lucene_utf8towcs(buffer, token_text.c_str(), LUCENE_MAX_WORD_LEN);
- auto length = _tcslen(buffer);
- token->set(buffer, 0, length);
+ token->setNoCopy(token_text.data(), 0, token_text.size());
return token;
}
return nullptr;
diff --git a/src/core/CLucene/analysis/AnalysisHeader.h
b/src/core/CLucene/analysis/AnalysisHeader.h
index 8a52350c..46ab0020 100644
--- a/src/core/CLucene/analysis/AnalysisHeader.h
+++ b/src/core/CLucene/analysis/AnalysisHeader.h
@@ -276,6 +276,8 @@ class CLUCENE_EXPORT Analyzer{
public:
Analyzer();
+ virtual bool isSDocOpt() { return false; }
+
/** Creates a TokenStream which tokenizes all the text in the provided
Reader. Default implementation forwards to tokenStream(Reader) for
compatibility with older version. Override to allow Analyzer to choose
diff --git a/src/core/CLucene/analysis/Analyzers.h
b/src/core/CLucene/analysis/Analyzers.h
index 17f88cff..432dde01 100644
--- a/src/core/CLucene/analysis/Analyzers.h
+++ b/src/core/CLucene/analysis/Analyzers.h
@@ -180,6 +180,9 @@ template <typename T>
class CLUCENE_EXPORT SimpleAnalyzer: public Analyzer {
public:
SimpleAnalyzer(){}
+
+ bool isSDocOpt() override { return true; }
+
TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader*
reader) override{
return _CLNEW SimpleTokenizer<T>(reader);
}
diff --git a/src/core/CLucene/analysis/standard95/StandardAnalyzer.h
b/src/core/CLucene/analysis/standard95/StandardAnalyzer.h
index 41446d14..7e29eec8 100644
--- a/src/core/CLucene/analysis/standard95/StandardAnalyzer.h
+++ b/src/core/CLucene/analysis/standard95/StandardAnalyzer.h
@@ -6,6 +6,8 @@ namespace lucene::analysis::standard95 {
class StandardAnalyzer : public Analyzer {
public:
+ bool isSDocOpt() override { return true; }
+
TokenStream* tokenStream(const TCHAR* fieldName,
lucene::util::Reader* reader) override {
return _CLNEW StandardTokenizer(reader, useStopWords_);
diff --git a/src/core/CLucene/index/IndexWriter.cpp
b/src/core/CLucene/index/IndexWriter.cpp
index fe269a92..e79d2475 100644
--- a/src/core/CLucene/index/IndexWriter.cpp
+++ b/src/core/CLucene/index/IndexWriter.cpp
@@ -23,7 +23,6 @@
#include "CLucene/util/Array.h"
#include "CLucene/util/PriorityQueue.h"
#include "CLucene/index/CodeMode.h"
-#include "CLucene/analysis/standard95/StandardAnalyzer.h"
#include "MergePolicy.h"
#include "MergeScheduler.h"
#include "SDocumentWriter.h"
@@ -285,9 +284,7 @@ void IndexWriter::init(Directory *d, Analyzer *a, const
bool create, const bool
rollbackSegmentInfos = NULL;
}
if (analyzer != nullptr) {
- if (auto *sa = dynamic_cast<SimpleAnalyzer<char> *>(analyzer); sa
!= nullptr) {
- docWriter = _CLNEW SDocumentsWriter<char>(directory, this);
- } else if (auto *sa =
dynamic_cast<standard95::StandardAnalyzer*>(analyzer); sa != nullptr) {
+ if (analyzer->isSDocOpt()) {
docWriter = _CLNEW SDocumentsWriter<char>(directory, this);
} else {
docWriter = _CLNEW DocumentsWriter(directory, this);
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]