This is an automated email from the ASF dual-hosted git repository.
kxiao pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene by this push:
new ae26e078 [Feature](jieba) jieba add stop words filter (#93)
ae26e078 is described below
commit ae26e078dd4ca8f390956a28f62dc8fa9dd34b0f
Author: zzzxl <[email protected]>
AuthorDate: Tue Jun 20 23:41:46 2023 +0800
[Feature](jieba) jieba add stop words filter (#93)
---
.../CLucene/analysis/LanguageBasedAnalyzer.cpp | 3 +--
.../CLucene/analysis/jieba/FullSegment.hpp | 22 ++++++++++++++++++++--
src/contribs-lib/CLucene/analysis/jieba/Jieba.hpp | 4 ++--
.../CLucene/analysis/jieba/MixSegment.hpp | 20 ++++++++++++++++++--
.../CLucene/analysis/jieba/Unicode.hpp | 9 +++++++++
5 files changed, 50 insertions(+), 8 deletions(-)
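For context before the patch body: the core of the change is a new filtering overload of GetStringsFromWords() in Unicode.hpp, which each string-returning Cut() now calls with a callback that drops any token found in the segmenter's stopWords_ set. A minimal standalone sketch of that pattern follows; only Word and the overload mirror the diff, and the sample data is hypothetical.

    #include <functional>
    #include <iostream>
    #include <string>
    #include <unordered_set>
    #include <vector>

    // Mirrors the Word/GetStringsFromWords shape from Unicode.hpp:
    // tokens for which the callback returns true are skipped.
    struct Word { std::string word; };

    void GetStringsFromWords(const std::vector<Word>& words,
                             std::vector<std::string>& strs,
                             const std::function<bool(const std::string&)>& cb) {
        strs.clear();
        for (size_t i = 0; i < words.size(); ++i) {
            if (!cb(words[i].word)) {
                strs.push_back(words[i].word);
            }
        }
    }

    int main() {
        // Hypothetical stop-word set and token stream, for illustration only.
        std::unordered_set<std::string> stopWords = {"的", "了"};
        std::vector<Word> tokens = {{"我们"}, {"的"}, {"测试"}};
        std::vector<std::string> kept;
        GetStringsFromWords(tokens, kept, [&](const std::string& w) {
            return stopWords.count(w) > 0;
        });
        for (const auto& w : kept) std::cout << w << '\n';  // prints 我们, 测试
        return 0;
    }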
diff --git a/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp b/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp
index 6874555a..1e2a28ab 100644
--- a/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp
+++ b/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp
@@ -72,8 +72,7 @@ TokenStream *LanguageBasedAnalyzer::reusableTokenStream(const TCHAR * /*fieldNam
_CLNEW StopFilter(streams->tokenStream, true, stopSet);
} else if (_tcscmp(lang, _T("chinese")) == 0) {
streams->tokenStream = _CLNEW CL_NS2(analysis, jieba)::ChineseTokenizer(reader, mode);
- streams->filteredTokenStream =
- _CLNEW StopFilter(streams->tokenStream, true, stopSet);
+ streams->filteredTokenStream = streams->tokenStream;
} else {
CL_NS(util)::BufferedReader* bufferedReader = reader->__asBufferedReader();
diff --git a/src/contribs-lib/CLucene/analysis/jieba/FullSegment.hpp b/src/contribs-lib/CLucene/analysis/jieba/FullSegment.hpp
index 8ae715f7..7b8362c5 100644
--- a/src/contribs-lib/CLucene/analysis/jieba/FullSegment.hpp
+++ b/src/contribs-lib/CLucene/analysis/jieba/FullSegment.hpp
@@ -16,9 +16,10 @@ class FullSegment: public SegmentBase {
dictTrie_ = new DictTrie(dictPath);
isNeedDestroy_ = true;
}
- FullSegment(const DictTrie* dictTrie)
+ FullSegment(const DictTrie* dictTrie, const string& stopWordPath = "")
: dictTrie_(dictTrie), isNeedDestroy_(false) {
assert(dictTrie_);
+ LoadStopWordDict(stopWordPath);
}
~FullSegment() {
if (isNeedDestroy_) {
@@ -29,7 +30,9 @@ class FullSegment: public SegmentBase {
vector<string>& words) const {
vector<Word> tmp;
Cut(sentence, tmp);
- GetStringsFromWords(tmp, words);
+ GetStringsFromWords(tmp, words, [this](const std::string& word) {
+ return stopWords_.count(word);
+ });
}
void Cut(const string& sentence,
vector<Word>& words) const {
@@ -84,9 +87,24 @@ class FullSegment: public SegmentBase {
uIdx++;
}
}
+
+ void LoadStopWordDict(const string& filePath) {
+ ifstream ifs(filePath.c_str());
+ if (ifs.is_open()) {
+ string line;
+ while (getline(ifs, line)) {
+ stopWords_.insert(line);
+ }
+ assert(stopWords_.size());
+ }
+ }
+
private:
const DictTrie* dictTrie_;
bool isNeedDestroy_;
+
+ unordered_set<string> stopWords_;
+
};
}
diff --git a/src/contribs-lib/CLucene/analysis/jieba/Jieba.hpp b/src/contribs-lib/CLucene/analysis/jieba/Jieba.hpp
index 8475404c..bfdb89fa 100644
--- a/src/contribs-lib/CLucene/analysis/jieba/Jieba.hpp
+++ b/src/contribs-lib/CLucene/analysis/jieba/Jieba.hpp
@@ -17,8 +17,8 @@ class Jieba {
model_(model_path),
mp_seg_(&dict_trie_),
hmm_seg_(&model_),
- mix_seg_(&dict_trie_, &model_),
- full_seg_(&dict_trie_),
+ mix_seg_(&dict_trie_, &model_, stopWordPath),
+ full_seg_(&dict_trie_, stopWordPath),
query_seg_(&dict_trie_, &model_),
extractor(&dict_trie_, &model_, idfPath, stopWordPath) {
}
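With this constructor change, the stopWordPath that previously fed only the keyword extractor is also forwarded to mix_seg_ and full_seg_, so both segmenters load the stop-word dictionary at construction. A hedged usage sketch follows; the dictionary paths are placeholders, and the five-argument constructor matches upstream cppjieba but should be verified against this fork.

    #include <string>
    #include <vector>
    #include "CLucene/analysis/jieba/Jieba.hpp"

    int main() {
        // Placeholder paths; adjust to the local dictionary layout.
        cppjieba::Jieba jieba("dict/jieba.dict.utf8",
                              "dict/hmm_model.utf8",
                              "dict/user.dict.utf8",
                              "dict/idf.utf8",
                              "dict/stop_words.utf8");
        std::vector<std::string> words;
        jieba.Cut("我们的测试", words);  // MixSegment path: stop words are filtered out
        return 0;
    }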
diff --git a/src/contribs-lib/CLucene/analysis/jieba/MixSegment.hpp b/src/contribs-lib/CLucene/analysis/jieba/MixSegment.hpp
index 95084daa..31839f8d 100644
--- a/src/contribs-lib/CLucene/analysis/jieba/MixSegment.hpp
+++ b/src/contribs-lib/CLucene/analysis/jieba/MixSegment.hpp
@@ -15,8 +15,9 @@ class MixSegment: public SegmentTagged {
: mpSeg_(mpSegDict, userDict),
hmmSeg_(hmmSegDict) {
}
- MixSegment(const DictTrie* dictTrie, const HMMModel* model)
+ MixSegment(const DictTrie* dictTrie, const HMMModel* model, const string& stopWordPath = "")
: mpSeg_(dictTrie), hmmSeg_(model) {
+ LoadStopWordDict(stopWordPath);
}
~MixSegment() {
}
@@ -27,7 +28,9 @@ class MixSegment: public SegmentTagged {
void Cut(const string& sentence, vector<string>& words, bool hmm) const {
vector<Word> tmp;
Cut(sentence, tmp, hmm);
- GetStringsFromWords(tmp, words);
+ GetStringsFromWords(tmp, words, [this](const std::string& word) {
+ return stopWords_.count(word);
+ });
}
void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
PreFilter pre_filter(symbols_, sentence);
@@ -97,11 +100,24 @@ class MixSegment: public SegmentTagged {
return tagger_.LookupTag(str, (SegmentTagged &)*this);
}
+ void LoadStopWordDict(const string& filePath) {
+ ifstream ifs(filePath.c_str());
+ if (ifs.is_open()) {
+ string line;
+ while (getline(ifs, line)) {
+ stopWords_.insert(line);
+ }
+ assert(stopWords_.size());
+ }
+ }
+
private:
MPSegment mpSeg_;
HMMSegment hmmSeg_;
PosTagger tagger_;
+ unordered_set<string> stopWords_;
+
}; // class MixSegment
} // namespace cppjieba
diff --git a/src/contribs-lib/CLucene/analysis/jieba/Unicode.hpp b/src/contribs-lib/CLucene/analysis/jieba/Unicode.hpp
index 28dbd23b..f6068868 100644
--- a/src/contribs-lib/CLucene/analysis/jieba/Unicode.hpp
+++ b/src/contribs-lib/CLucene/analysis/jieba/Unicode.hpp
@@ -222,6 +222,15 @@ inline void GetStringsFromWords(const vector<Word>& words, vector<string>& strs)
}
}
+inline void GetStringsFromWords(const vector<Word>& words, vector<string>& strs, const std::function<bool(const std::string& word)>& cb) {
+ strs.clear();
+ for (size_t i = 0; i < words.size(); ++i) {
+ if (!cb(words[i].word)) {
+ strs.push_back(words[i].word);
+ }
+ }
+}
+
} // namespace cppjieba
#endif // CPPJIEBA_UNICODE_H
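One behavioral note on the LoadStopWordDict() helper added to both segmenters: a missing file is silently ignored (is_open() fails, which also covers the default stopWordPath of ""), while a file that exists but is empty trips the assert in debug builds. A standalone sketch of the same loading logic, for reference:

    #include <cassert>
    #include <fstream>
    #include <string>
    #include <unordered_set>

    // Same shape as the loader added to FullSegment/MixSegment: read the
    // stop-word file line by line into a set; ignore a missing file, but
    // assert (in debug builds) that an existing file was not empty.
    void LoadStopWordDict(const std::string& filePath,
                          std::unordered_set<std::string>& stopWords) {
        std::ifstream ifs(filePath.c_str());
        if (ifs.is_open()) {
            std::string line;
            while (std::getline(ifs, line)) {
                stopWords.insert(line);
            }
            assert(!stopWords.empty());
        }
    }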
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]