This is an automated email from the ASF dual-hosted git repository.
kxiao pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene by this push:
new ae26e078 [Feature](jieba) jieba add stop words filter (#93)
ae26e078 is described below
commit ae26e078dd4ca8f390956a28f62dc8fa9dd34b0f
Author: zzzxl <[email protected]>
AuthorDate: Tue Jun 20 23:41:46 2023 +0800
[Feature](jieba) jieba add stop words filter (#93)
---
.../CLucene/analysis/LanguageBasedAnalyzer.cpp | 3 +--
.../CLucene/analysis/jieba/FullSegment.hpp | 22 ++++++++++++++++++++--
src/contribs-lib/CLucene/analysis/jieba/Jieba.hpp | 4 ++--
.../CLucene/analysis/jieba/MixSegment.hpp | 20 ++++++++++++++++++--
.../CLucene/analysis/jieba/Unicode.hpp | 9 +++++++++
5 files changed, 50 insertions(+), 8 deletions(-)
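For context before the patch body: the core of the change is a new filtering overload of GetStringsFromWords() in Unicode.hpp, which each string-returning Cut() now calls with a callback that drops any token found in the segmenter's stopWords_ set. A minimal standalone sketch of that pattern follows; only Word and the overload mirror the diff, and the sample data is hypothetical.

    #include <functional>
    #include <iostream>
    #include <string>
    #include <unordered_set>
    #include <vector>

    // Mirrors the Word/GetStringsFromWords shape from Unicode.hpp:
    // tokens for which the callback returns true are skipped.
    struct Word { std::string word; };

    void GetStringsFromWords(const std::vector<Word>& words,
                             std::vector<std::string>& strs,
                             const std::function<bool(const std::string&)>& cb) {
        strs.clear();
        for (size_t i = 0; i < words.size(); ++i) {
            if (!cb(words[i].word)) {
                strs.push_back(words[i].word);
            }
        }
    }

    int main() {
        // Hypothetical stop-word set and token stream, for illustration only.
        std::unordered_set<std::string> stopWords = {"的", "了"};
        std::vector<Word> tokens = {{"我们"}, {"的"}, {"测试"}};
        std::vector<std::string> kept;
        GetStringsFromWords(tokens, kept, [&](const std::string& w) {
            return stopWords.count(w) > 0;
        });
        for (const auto& w : kept) std::cout << w << '\n';  // prints 我们, 测试
        return 0;
    }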
diff --git a/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp b/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp
index 6874555a..1e2a28ab 100644
--- a/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp
+++ b/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp
@@ -72,8 +72,7 @@ TokenStream *LanguageBasedAnalyzer::reusableTokenStream(const TCHAR * /*fieldNam
_CLNEW StopFilter(streams->tokenStream, true, stopSet);
} else if (_tcscmp(lang, _T("chinese")) == 0) {
streams->tokenStream = _CLNEW CL_NS2(analysis, jieba)::ChineseTokenizer(reader, mode);
- streams->filteredTokenStream =
- _CLNEW StopFilter(streams->tokenStream, true, stopSet);
+ streams->filteredTokenStream = streams->tokenStream;
} else {
CL_NS(util)::BufferedReader* bufferedReader = reader->__asBufferedReader();
diff --git a/src/contribs-lib/CLucene/analysis/jieba/FullSegment.hpp b/src/contribs-lib/CLucene/analysis/jieba/FullSegment.hpp
index 8ae715f7..7b8362c5 100644
--- a/src/contribs-lib/CLucene/analysis/jieba/FullSegment.hpp
+++ b/src/contribs-lib/CLucene/analysis/jieba/FullSegment.hpp
@@ -16,9 +16,10 @@ class FullSegment: public SegmentBase {
dictTrie_ = new DictTrie(dictPath);
isNeedDestroy_ = true;
}
- FullSegment(const DictTrie* dictTrie)
+ FullSegment(const DictTrie* dictTrie, const string& stopWordPath = "")
: dictTrie_(dictTrie), isNeedDestroy_(false) {
assert(dictTrie_);
+ LoadStopWordDict(stopWordPath);
}
~FullSegment() {
if (isNeedDestroy_) {
@@ -29,7 +30,9 @@ class FullSegment: public SegmentBase {
vector<string>& words) const {
vector<Word> tmp;
Cut(sentence, tmp);
- GetStringsFromWords(tmp, words);
+ GetStringsFromWords(tmp, words, [this](const std::string& word) {
+ return stopWords_.count(word);
+ });
}
void Cut(const string& sentence,
vector<Word>& words) const {
@@ -84,9 +87,24 @@ class FullSegment: public SegmentBase {
uIdx++;
}
}
+
+ void LoadStopWordDict(const string& filePath) {
+ ifstream ifs(filePath.c_str());
+ if (ifs.is_open()) {
+ string line;
+ while (getline(ifs, line)) {
+ stopWords_.insert(line);
+ }
+ assert(stopWords_.size());
+ }
+ }
+
private:
const DictTrie* dictTrie_;
bool isNeedDestroy_;
+
+ unordered_set<string> stopWords_;
+
};
}
diff --git a/src/contribs-lib/CLucene/analysis/jieba/Jieba.hpp b/src/contribs-lib/CLucene/analysis/jieba/Jieba.hpp
index 8475404c..bfdb89fa 100644
--- a/src/contribs-lib/CLucene/analysis/jieba/Jieba.hpp
+++ b/src/contribs-lib/CLucene/analysis/jieba/Jieba.hpp
@@ -17,8 +17,8 @@ class Jieba {
model_(model_path),
mp_seg_(&dict_trie_),
hmm_seg_(&model_),
- mix_seg_(&dict_trie_, &model_),
- full_seg_(&dict_trie_),
+ mix_seg_(&dict_trie_, &model_, stopWordPath),
+ full_seg_(&dict_trie_, stopWordPath),
query_seg_(&dict_trie_, &model_),
extractor(&dict_trie_, &model_, idfPath, stopWordPath) {
}
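With this constructor change, the stopWordPath that previously fed only the keyword extractor is also forwarded to mix_seg_ and full_seg_, so both segmenters load the stop-word dictionary at construction. A hedged usage sketch follows; the dictionary paths are placeholders, and the five-argument constructor matches upstream cppjieba but should be verified against this fork.

    #include <string>
    #include <vector>
    #include "CLucene/analysis/jieba/Jieba.hpp"

    int main() {
        // Placeholder paths; adjust to the local dictionary layout.
        cppjieba::Jieba jieba("dict/jieba.dict.utf8",
                              "dict/hmm_model.utf8",
                              "dict/user.dict.utf8",
                              "dict/idf.utf8",
                              "dict/stop_words.utf8");
        std::vector<std::string> words;
        jieba.Cut("我们的测试", words);  // MixSegment path: stop words are filtered out
        return 0;
    }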
diff --git a/src/contribs-lib/CLucene/analysis/jieba/MixSegment.hpp b/src/contribs-lib/CLucene/analysis/jieba/MixSegment.hpp
index 95084daa..31839f8d 100644
--- a/src/contribs-lib/CLucene/analysis/jieba/MixSegment.hpp
+++ b/src/contribs-lib/CLucene/analysis/jieba/MixSegment.hpp
@@ -15,8 +15,9 @@ class MixSegment: public SegmentTagged {
: mpSeg_(mpSegDict, userDict),
hmmSeg_(hmmSegDict) {
}
- MixSegment(const DictTrie* dictTrie, const HMMModel* model)
+ MixSegment(const DictTrie* dictTrie, const HMMModel* model, const string& stopWordPath = "")
: mpSeg_(dictTrie), hmmSeg_(model) {
+ LoadStopWordDict(stopWordPath);
}
~MixSegment() {
}
@@ -27,7 +28,9 @@ class MixSegment: public SegmentTagged {
void Cut(const string& sentence, vector<string>& words, bool hmm) const {
vector<Word> tmp;
Cut(sentence, tmp, hmm);
- GetStringsFromWords(tmp, words);
+ GetStringsFromWords(tmp, words, [this](const std::string& word) {
+ return stopWords_.count(word);
+ });
}
void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
PreFilter pre_filter(symbols_, sentence);
@@ -97,11 +100,24 @@ class MixSegment: public SegmentTagged {
return tagger_.LookupTag(str, (SegmentTagged &)*this);
}
+ void LoadStopWordDict(const string& filePath) {
+ ifstream ifs(filePath.c_str());
+ if (ifs.is_open()) {
+ string line;
+ while (getline(ifs, line)) {
+ stopWords_.insert(line);
+ }
+ assert(stopWords_.size());
+ }
+ }
+
private:
MPSegment mpSeg_;
HMMSegment hmmSeg_;
PosTagger tagger_;
+ unordered_set<string> stopWords_;
+
}; // class MixSegment
} // namespace cppjieba
diff --git a/src/contribs-lib/CLucene/analysis/jieba/Unicode.hpp b/src/contribs-lib/CLucene/analysis/jieba/Unicode.hpp
index 28dbd23b..f6068868 100644
--- a/src/contribs-lib/CLucene/analysis/jieba/Unicode.hpp
+++ b/src/contribs-lib/CLucene/analysis/jieba/Unicode.hpp
@@ -222,6 +222,15 @@ inline void GetStringsFromWords(const vector<Word>& words, vector<string>& strs)
}
}
+inline void GetStringsFromWords(const vector<Word>& words, vector<string>& strs, const std::function<bool(const std::string& word)>& cb) {
+ strs.clear();
+ for (size_t i = 0; i < words.size(); ++i) {
+ if (!cb(words[i].word)) {
+ strs.push_back(words[i].word);
+ }
+ }
+}
+
} // namespace cppjieba
#endif // CPPJIEBA_UNICODE_H
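One behavioral note on the LoadStopWordDict() helper added to both segmenters: a missing file is silently ignored (is_open() fails, which also covers the default stopWordPath of ""), while a file that exists but is empty trips the assert in debug builds. A standalone sketch of the same loading logic, for reference:

    #include <cassert>
    #include <fstream>
    #include <string>
    #include <unordered_set>

    // Same shape as the loader added to FullSegment/MixSegment: read the
    // stop-word file line by line into a set; ignore a missing file, but
    // assert (in debug builds) that an existing file was not empty.
    void LoadStopWordDict(const std::string& filePath,
                          std::unordered_set<std::string>& stopWords) {
        std::ifstream ifs(filePath.c_str());
        if (ifs.is_open()) {
            std::string line;
            while (std::getline(ifs, line)) {
                stopWords.insert(line);
            }
            assert(!stopWords.empty());
        }
    }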
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]