This is an automated email from the ASF dual-hosted git repository.
jianliangqi pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene by this push:
new e0fb04c0 [optimize](chinese) optimize chinese tokenizer load (#123)
e0fb04c0 is described below
commit e0fb04c02762e60395d9a1d36d0764b3de108684
Author: zzzxl <[email protected]>
AuthorDate: Thu Sep 21 16:13:14 2023 +0800
[optimize](chinese) optimize chinese tokenizer load (#123)
---
.../CLucene/analysis/jieba/ChineseTokenizer.cpp | 64 ++++++++++------------
.../CLucene/analysis/jieba/ChineseTokenizer.h | 32 +++--------
.../CLucene/analysis/jieba/FullSegment.hpp | 37 +++++++++++--
src/contribs-lib/CLucene/analysis/jieba/Jieba.hpp | 10 ++++
.../CLucene/analysis/jieba/MixSegment.hpp | 33 +++++++++--
.../CLucene/analysis/jieba/QuerySegment.hpp | 19 +++++++
6 files changed, 127 insertions(+), 68 deletions(-)
diff --git a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
index c72e2451..a2795226 100644
--- a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
+++ b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
@@ -8,7 +8,7 @@ CL_NS_USE(analysis)
CL_NS_USE(util)
ChineseTokenizer::ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode m) : Tokenizer(reader), mode(m) {
- memset(buffer, 0, LUCENE_MAX_WORD_LEN + 1);
+ reset(reader);
}
void ChineseTokenizer::init(const std::string &dictPath) {
@@ -16,45 +16,39 @@ void ChineseTokenizer::init(const std::string &dictPath) {
}
CL_NS(analysis)::Token *ChineseTokenizer::next(lucene::analysis::Token *token) {
- // try to read all words
- const char *initBuffer;
- if (dataLen == 0 || bufferIndex >= dataLen) {
- int totalLen = 0;
- do {
- auto bufferLen = input->read((const void**)&ioBuffer, 1, LUCENE_IO_BUFFER_SIZE);
- if (bufferLen <= 0) {
- dataLen = 0;
- bufferIndex = 0;
- break;
- }
- if (totalLen < LUCENE_IO_BUFFER_SIZE) {
- initBuffer = ioBuffer;
- }
- totalLen+=bufferLen;
- } while (true);
-
- //char tmp_buffer[4 * totalLen + 1];
- //lucene_wcsntoutf8(tmp_buffer, initBuffer, totalLen, 4 * totalLen);
- std::string s(initBuffer, totalLen);
- switch (mode) {
+ if (bufferIndex >= dataLen) {
+ return nullptr;
+ }
+
+ std::string_view& token_text = tokens_text[bufferIndex++];
+ size_t size = std::min(token_text.size(), static_cast<size_t>(LUCENE_MAX_WORD_LEN));
+ token->setNoCopy(token_text.data(), 0, size);
+ return token;
+}
+
+void ChineseTokenizer::reset(lucene::util::Reader* reader) {
+ this->input = reader;
+ this->bufferIndex = 0;
+ this->dataLen = 0;
+ this->tokens_text.clear();
+
+ buffer_.resize(input->size());
+ int32_t numRead = input->readCopy(buffer_.data(), 0, buffer_.size());
+ assert(buffer_.size() == numRead);
+
+ switch (mode) {
case AnalyzerMode::Search:
- JiebaSingleton::getInstance().CutForSearch(s, tokens_text, true);
+ JiebaSingleton::getInstance().CutForSearch(buffer_, tokens_text, true);
break;
case AnalyzerMode::All:
- JiebaSingleton::getInstance().CutAll(s, tokens_text);
+ JiebaSingleton::getInstance().CutAll(buffer_, tokens_text);
break;
case AnalyzerMode::Default:
- JiebaSingleton::getInstance().Cut(s, tokens_text, true);
+ JiebaSingleton::getInstance().Cut(buffer_, tokens_text, true);
break;
- }
- dataLen = tokens_text.size();
}
- if (bufferIndex < dataLen) {
- std::string& token_text = tokens_text[bufferIndex++];
- size_t size = std::min(token_text.size(), static_cast<size_t>(LUCENE_MAX_WORD_LEN));
- token->setNoCopy(token_text.data(), 0, size);
- return token;
- }
- return nullptr;
-}
+
+ dataLen = tokens_text.size();
+};
+
CL_NS_END2
diff --git a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h
index 48de52b1..9bd34fb7 100644
--- a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h
+++ b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h
@@ -4,6 +4,7 @@
#include <CLucene.h>
#include <memory>
+#include <string_view>
#include "Jieba.hpp"
#include "CLucene/analysis/AnalysisHeader.h"
@@ -31,28 +32,15 @@ private:
class ChineseTokenizer : public lucene::analysis::Tokenizer {
private:
AnalyzerMode mode{};
- /** word offset, used to imply which character(in ) is parsed */
- int32_t offset{};
/** the index used only for ioBuffer */
- int32_t bufferIndex{};
+ int32_t bufferIndex = 0;
/** data length */
- int32_t dataLen{};
-
- /**
- * character buffer, store the characters which are used to compose <br>
- * the returned Token
- */
- TCHAR buffer[LUCENE_MAX_WORD_LEN + 1]{};
-
- /**
- * I/O buffer, used to store the content of the input(one of the <br>
- * members of Tokenizer)
- */
- const char* ioBuffer{};
- std::vector<std::string> tokens_text;
- //std::vector<std::unique_ptr<Token>> tokens;
+ int32_t dataLen = 0;
+
+ std::string buffer_;
+ std::vector<std::string_view> tokens_text;
public:
// Constructor
@@ -65,13 +53,7 @@ public:
// Override the next method to tokenize Chinese text using Jieba
lucene::analysis::Token* next(lucene::analysis::Token* token) override;
- void reset(lucene::util::Reader *reader) override {
- this->input = reader;
- this->offset = 0;
- this->bufferIndex = 0;
- this->dataLen = 0;
- this->tokens_text.clear();
- }
+ void reset(lucene::util::Reader *reader) override;
};
CL_NS_END2
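
For readers skimming the two hunks above: after this change the tokenizer reads the whole input once in reset(), lets Jieba segment it into std::string_view tokens that borrow from the owning buffer_, and next() hands those views out without copying. The following standalone sketch illustrates that pattern with a stand-in splitter instead of Jieba; every name in it is illustrative only and not part of the CLucene or Doris API.

// Standalone sketch (illustrative only): read the whole input once into an
// owning buffer, segment it into std::string_view tokens borrowing from that
// buffer, and hand tokens out one at a time without copying.
#include <iostream>
#include <string>
#include <string_view>
#include <vector>

class ViewTokenizer {
public:
    explicit ViewTokenizer(std::string text) { reset(std::move(text)); }

    void reset(std::string text) {
        buffer_ = std::move(text);   // owning storage; the views below borrow from it
        tokens_.clear();
        index_ = 0;
        // Stand-in for Jieba's Cut(): split buffer_ on spaces into views.
        size_t pos = 0;
        while (pos < buffer_.size()) {
            size_t end = buffer_.find(' ', pos);
            if (end == std::string::npos) end = buffer_.size();
            if (end > pos) tokens_.emplace_back(buffer_.data() + pos, end - pos);
            pos = end + 1;
        }
    }

    // Returns the next token view, or an empty view once exhausted.
    std::string_view next() {
        return index_ < tokens_.size() ? tokens_[index_++] : std::string_view{};
    }

private:
    std::string buffer_;                    // must outlive the returned views
    std::vector<std::string_view> tokens_;
    size_t index_ = 0;
};

int main() {
    ViewTokenizer t("doris inverted index chinese tokenizer");
    for (std::string_view tok = t.next(); !tok.empty(); tok = t.next())
        std::cout << tok << '\n';
}
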
diff --git a/src/contribs-lib/CLucene/analysis/jieba/FullSegment.hpp b/src/contribs-lib/CLucene/analysis/jieba/FullSegment.hpp
index 12127a92..df3c4c1c 100644
--- a/src/contribs-lib/CLucene/analysis/jieba/FullSegment.hpp
+++ b/src/contribs-lib/CLucene/analysis/jieba/FullSegment.hpp
@@ -26,6 +26,32 @@ class FullSegment: public SegmentBase {
delete dictTrie_;
}
}
+
+ void Cut(const std::string& sentence, vector<std::string_view>& words) const {
+ PreFilter pre_filter(symbols_, sentence, true);
+ PreFilter::Range range;
+ vector<WordRange> wrs;
+ wrs.reserve(sentence.size()/2);
+ while (pre_filter.HasNext()) {
+ range = pre_filter.Next();
+ if (range.type == PreFilter::Language::CHINESE) {
+ Cut(range.begin, range.end, wrs);
+ } else {
+ CutAlnum(range.begin, range.end, wrs);
+ }
+ }
+ words.clear();
+ words.reserve(wrs.size());
+ for (auto& wr : wrs) {
+ uint32_t len = wr.right->offset - wr.left->offset + wr.right->len;
+ std::string_view word(sentence.data() + wr.left->offset, len);
+ if (stopWords_.count(word)) {
+ continue;
+ }
+ words.emplace_back(word);
+ }
+ }
+
void Cut(const string& sentence,
vector<string>& words) const {
vector<Word> tmp;
@@ -137,10 +163,12 @@ class FullSegment: public SegmentBase {
ifstream ifs(filePath.c_str());
if (ifs.is_open()) {
string line;
- while (getline(ifs, line)) {
- stopWords_.insert(line);
+ while (getline(ifs, line)) {
+ stopWordList_.push_back(line);
+ }
+ for (auto& word : stopWordList_) {
+ stopWords_.insert(std::string_view(word.data(), word.size()));
}
- assert(stopWords_.size());
}
}
@@ -148,7 +176,8 @@ class FullSegment: public SegmentBase {
const DictTrie* dictTrie_;
bool isNeedDestroy_;
- unordered_set<string> stopWords_;
+ std::vector<std::string> stopWordList_;
+ unordered_set<std::string_view> stopWords_;
};
}
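
The stop-word change above (mirrored in MixSegment.hpp below) first collects every line into stopWordList_ and only afterwards builds the unordered_set<std::string_view>, so the views are created after the owning strings have stopped moving. A minimal sketch of that pattern, with hypothetical names, looks like this:

// Sketch of the stop-word loading pattern (names are hypothetical): populate
// the owning container completely before creating any string_view into it, so
// later growth of the container cannot leave views dangling.
#include <fstream>
#include <string>
#include <string_view>
#include <unordered_set>
#include <vector>

struct StopWords {
    std::vector<std::string> storage;           // owns the characters
    std::unordered_set<std::string_view> set;   // borrows from storage

    void load(const std::string& path) {
        std::ifstream ifs(path);
        std::string line;
        while (std::getline(ifs, line))
            storage.push_back(line);            // may reallocate; no views exist yet
        for (const auto& w : storage)           // storage is stable from here on
            set.insert(std::string_view(w));
    }

    bool contains(std::string_view word) const { return set.count(word) != 0; }
};
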
diff --git a/src/contribs-lib/CLucene/analysis/jieba/Jieba.hpp b/src/contribs-lib/CLucene/analysis/jieba/Jieba.hpp
index bfdb89fa..0e6e91de 100644
--- a/src/contribs-lib/CLucene/analysis/jieba/Jieba.hpp
+++ b/src/contribs-lib/CLucene/analysis/jieba/Jieba.hpp
@@ -1,6 +1,7 @@
#ifndef CPPJIEAB_JIEBA_H
#define CPPJIEAB_JIEBA_H
+#include <string_view>
#include "QuerySegment.hpp"
#include "KeywordExtractor.hpp"
@@ -34,18 +35,27 @@ class Jieba {
void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
mix_seg_.Cut(sentence, words, hmm);
}
+ void Cut(const string& sentence, vector<std::string_view>& words, bool hmm = true) const {
+ mix_seg_.Cut(sentence, words, hmm);
+ }
void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
mix_seg_.Cut(sentence, words, hmm);
}
void CutAll(const string& sentence, vector<string>& words) const {
full_seg_.Cut(sentence, words);
}
+ void CutAll(const string& sentence, vector<std::string_view>& words) const {
+ full_seg_.Cut(sentence, words);
+ }
void CutAll(const string& sentence, vector<Word>& words) const {
full_seg_.Cut(sentence, words);
}
void CutForSearch(const string& sentence, vector<string>& words, bool hmm = true) const {
query_seg_.Cut(sentence, words, hmm);
}
+ void CutForSearch(const string& sentence, vector<std::string_view>& words, bool hmm = true) const {
+ query_seg_.Cut(sentence, words, hmm);
+ }
void CutForSearch(const string& sentence, vector<Word>& words, bool hmm = true) const {
query_seg_.Cut(sentence, words, hmm);
}
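
The new Jieba overloads above expose the zero-copy path to callers that pass a vector<std::string_view>. A hypothetical usage sketch (assuming the usual cppjieba namespace; the returned views stay valid only while the sentence string is alive and unmodified):

#include <string>
#include <string_view>
#include <vector>
#include "Jieba.hpp"

// `jieba` is assumed to be constructed elsewhere with the dictionary paths.
void segment(const cppjieba::Jieba& jieba, const std::string& sentence) {
    std::vector<std::string_view> words;                 // views into `sentence`
    jieba.CutForSearch(sentence, words, /*hmm=*/true);   // search-mode segmentation
    // `words` must not be used after `sentence` is destroyed or modified.
}
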
diff --git a/src/contribs-lib/CLucene/analysis/jieba/MixSegment.hpp b/src/contribs-lib/CLucene/analysis/jieba/MixSegment.hpp
index 31839f8d..dcb99d23 100644
--- a/src/contribs-lib/CLucene/analysis/jieba/MixSegment.hpp
+++ b/src/contribs-lib/CLucene/analysis/jieba/MixSegment.hpp
@@ -2,6 +2,7 @@
#define CPPJIEBA_MIXSEGMENT_H
#include <cassert>
+#include <string_view>
#include "MPSegment.hpp"
#include "HMMSegment.hpp"
#include "StringUtil.hpp"
@@ -22,6 +23,27 @@ class MixSegment: public SegmentTagged {
~MixSegment() {
}
+ void Cut(const string& sentence, vector<std::string_view>& words, bool hmm = true) const {
+ PreFilter pre_filter(symbols_, sentence);
+ PreFilter::Range range;
+ vector<WordRange> wrs;
+ wrs.reserve(sentence.size() / 2);
+ while (pre_filter.HasNext()) {
+ range = pre_filter.Next();
+ Cut(range.begin, range.end, wrs, hmm);
+ }
+ words.clear();
+ words.reserve(wrs.size());
+ for (auto& wr : wrs) {
+ uint32_t len = wr.right->offset - wr.left->offset + wr.right->len;
+ std::string_view word(sentence.data() + wr.left->offset, len);
+ if (stopWords_.count(word)) {
+ continue;
+ }
+ words.emplace_back(word);
+ }
+ }
+
void Cut(const string& sentence, vector<string>& words) const {
Cut(sentence, words, true);
}
@@ -104,10 +126,12 @@ class MixSegment: public SegmentTagged {
ifstream ifs(filePath.c_str());
if (ifs.is_open()) {
string line;
- while (getline(ifs, line)) {
- stopWords_.insert(line);
+ while (getline(ifs, line)) {
+ stopWordList_.push_back(line);
+ }
+ for (auto& word : stopWordList_) {
+ stopWords_.insert(std::string_view(word.data(), word.size()));
}
- assert(stopWords_.size());
}
}
@@ -116,7 +140,8 @@ class MixSegment: public SegmentTagged {
HMMSegment hmmSeg_;
PosTagger tagger_;
- unordered_set<string> stopWords_;
+ std::vector<std::string> stopWordList_;
+ unordered_set<std::string_view> stopWords_;
}; // class MixSegment
diff --git a/src/contribs-lib/CLucene/analysis/jieba/QuerySegment.hpp b/src/contribs-lib/CLucene/analysis/jieba/QuerySegment.hpp
index b6a5f759..49a7f69a 100644
--- a/src/contribs-lib/CLucene/analysis/jieba/QuerySegment.hpp
+++ b/src/contribs-lib/CLucene/analysis/jieba/QuerySegment.hpp
@@ -4,6 +4,7 @@
#include <algorithm>
#include <set>
#include <cassert>
+#include <string_view>
#include "Logging.hpp"
#include "DictTrie.hpp"
#include "SegmentBase.hpp"
@@ -24,6 +25,24 @@ class QuerySegment: public SegmentBase {
~QuerySegment() {
}
+ void Cut(const string& sentence, vector<std::string_view>& words, bool hmm = true) const {
+ PreFilter pre_filter(symbols_, sentence);
+ PreFilter::Range range;
+ vector<WordRange> wrs;
+ wrs.reserve(sentence.size()/2);
+ while (pre_filter.HasNext()) {
+ range = pre_filter.Next();
+ Cut(range.begin, range.end, wrs, hmm);
+ }
+ words.clear();
+ words.reserve(wrs.size());
+ for (auto& wr : wrs) {
+ uint32_t len = wr.right->offset - wr.left->offset + wr.right->len;
+ std::string_view word(sentence.data() + wr.left->offset, len);
+ words.emplace_back(word);
+ }
+ }
+
void Cut(const string& sentence, vector<string>& words) const {
Cut(sentence, words, true);
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]