This is an automated email from the ASF dual-hosted git repository.
kxiao pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene by this push:
new c736f8317b [opt](standard95) the ‘standard95’ tokenizer does not
include stop words by default. (#209)
c736f8317b is described below
commit c736f8317bc35cfe7783a4c10a1ef706cc546e2d
Author: zzzxl <[email protected]>
AuthorDate: Tue Apr 30 16:21:24 2024 +0800
[opt](standard95) the ‘standard95’ tokenizer does not include stop words by
default. (#209)
---
src/core/CLucene/analysis/AnalysisHeader.h | 10 ++++++++++
src/core/CLucene/analysis/standard95/StandardAnalyzer.h | 16 +++++++---------
src/core/CLucene/analysis/standard95/StandardTokenizer.h | 14 +++++++-------
src/test/analysis/TestStandard95.cpp | 4 +++-
4 files changed, 27 insertions(+), 17 deletions(-)
diff --git a/src/core/CLucene/analysis/AnalysisHeader.h b/src/core/CLucene/analysis/AnalysisHeader.h
index a98e26e4ab..fe10e396bc 100644
--- a/src/core/CLucene/analysis/AnalysisHeader.h
+++ b/src/core/CLucene/analysis/AnalysisHeader.h
@@ -11,6 +11,8 @@
#include "CLucene/util/VoidList.h"
#include "CLucene/LuceneThreads.h"
+#include <unordered_set>
+
CL_CLASS_DEF(util,Reader)
CL_CLASS_DEF(util,IReader)
@@ -297,6 +299,11 @@ public:
virtual void set_lowercase(bool lowercase) {
_lowercase = lowercase;
}
+
+ virtual void set_stopwords(std::unordered_set<std::string_view>* stopwords) {
+ _stopwords = stopwords;
+ }
+
private:
DEFINE_MUTEX(THIS_LOCK)
@@ -313,7 +320,9 @@ protected:
* to save a TokenStream for later re-use by the same
* thread. */
virtual void setPreviousTokenStream(TokenStream* obj);
+
bool _lowercase = false;
+ std::unordered_set<std::string_view>* _stopwords = nullptr;
public:
/**
@@ -350,6 +359,7 @@ protected:
/** The text source for this Tokenizer. */
CL_NS(util)::Reader* input;
bool lowercase = false;
+ std::unordered_set<std::string_view>* stopwords = nullptr;
public:
/** Construct a tokenizer with null input. */
diff --git a/src/core/CLucene/analysis/standard95/StandardAnalyzer.h b/src/core/CLucene/analysis/standard95/StandardAnalyzer.h
index 7460c8119f..ccfd1030e1 100644
--- a/src/core/CLucene/analysis/standard95/StandardAnalyzer.h
+++ b/src/core/CLucene/analysis/standard95/StandardAnalyzer.h
@@ -6,18 +6,22 @@ namespace lucene::analysis::standard95 {
class StandardAnalyzer : public Analyzer {
public:
- StandardAnalyzer() : Analyzer() { _lowercase = true; }
+ StandardAnalyzer() : Analyzer() {
+ _lowercase = true;
+ _stopwords = nullptr;
+ }
+
bool isSDocOpt() override { return true; }
TokenStream* tokenStream(const TCHAR* fieldName,
lucene::util::Reader* reader) override {
- return _CLNEW StandardTokenizer(reader, useStopWords_, _lowercase);
+ return _CLNEW StandardTokenizer(reader, _lowercase, _stopwords);
}
TokenStream* reusableTokenStream(const TCHAR* fieldName,
lucene::util::Reader* reader) override {
if (tokenizer_ == nullptr) {
- tokenizer_ = new StandardTokenizer(reader, useStopWords_, _lowercase);
+ tokenizer_ = new StandardTokenizer(reader, _lowercase, _stopwords);
} else {
tokenizer_->reset(reader);
}
@@ -31,13 +35,7 @@ class StandardAnalyzer : public Analyzer {
}
}
- void useStopWords(bool useStopWords) {
- useStopWords_ = useStopWords;
- }
-
private:
- bool useStopWords_ = true;
-
StandardTokenizer* tokenizer_ = nullptr;
};
diff --git a/src/core/CLucene/analysis/standard95/StandardTokenizer.h b/src/core/CLucene/analysis/standard95/StandardTokenizer.h
index 1aac86716d..431673f00e 100644
--- a/src/core/CLucene/analysis/standard95/StandardTokenizer.h
+++ b/src/core/CLucene/analysis/standard95/StandardTokenizer.h
@@ -19,15 +19,17 @@ static std::unordered_set<std::string_view> stop_words = {
class StandardTokenizer : public Tokenizer {
public:
- StandardTokenizer(lucene::util::Reader* in, bool useStopWords)
- : Tokenizer(in), useStopWords_(useStopWords) {
+ StandardTokenizer(lucene::util::Reader* in)
+ : Tokenizer(in) {
scanner_ = std::make_unique<StandardTokenizerImpl>(in);
Tokenizer::lowercase = true;
+ Tokenizer::stopwords = nullptr;
}
- StandardTokenizer(lucene::util::Reader* in, bool useStopWords, bool lowercase)
- : Tokenizer(in), useStopWords_(useStopWords) {
+ StandardTokenizer(lucene::util::Reader* in, bool lowercase, std::unordered_set<std::string_view>* stopwords)
+ : Tokenizer(in) {
scanner_ = std::make_unique<StandardTokenizerImpl>(in);
Tokenizer::lowercase = lowercase;
+ Tokenizer::stopwords = stopwords;
}
Token* next(Token* token) override {
@@ -47,7 +49,7 @@ class StandardTokenizer : public Tokenizer {
std::transform(term.begin(), term.end(),
const_cast<char*>(term.data()),
[](char c) { return to_lower(c); });
}
- if (useStopWords_ && stop_words.count(term)) {
+ if (stopwords && stopwords->count(term)) {
skippedPositions++;
continue;
}
@@ -70,8 +72,6 @@ class StandardTokenizer : public Tokenizer {
};
private:
- bool useStopWords_ = true;
-
std::unique_ptr<StandardTokenizerImpl> scanner_;
int32_t skippedPositions = 0;
diff --git a/src/test/analysis/TestStandard95.cpp b/src/test/analysis/TestStandard95.cpp
index 9c839ddbab..80f3ba8824 100644
--- a/src/test/analysis/TestStandard95.cpp
+++ b/src/test/analysis/TestStandard95.cpp
@@ -3,11 +3,13 @@
#include "CLucene/_ApiHeader.h"
#include "CLucene/analysis/standard95/StandardAnalyzer.h"
+#include "CLucene/analysis/standard95/StandardTokenizer.h"
#include "test.h"
void testCut(const std::string &str, std::vector<std::string> &tokens) {
auto standard =
std::make_unique<lucene::analysis::standard95::StandardAnalyzer>();
+ standard->set_stopwords(&lucene::analysis::standard95::stop_words);
auto tokenizer =
static_cast<lucene::analysis::standard95::StandardTokenizer *>(
standard->tokenStream(L"name", nullptr));
@@ -28,7 +30,7 @@ void testCut(const std::string &str, std::vector<std::string> &tokens) {
void testCutLines(std::vector<std::string>& datas, std::vector<std::string>
&tokens) {
auto standard =
std::make_unique<lucene::analysis::standard95::StandardAnalyzer>();
- standard->useStopWords(false);
+ standard->set_stopwords(nullptr);
auto tokenizer =
static_cast<lucene::analysis::standard95::StandardTokenizer *>(
standard->tokenStream(L"name", nullptr));
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]