This is an automated email from the ASF dual-hosted git repository.
jianliangqi pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene by this push:
new e0fb04c0 [optimize](chinese) optimize chinese tokenizer load (#123)
e0fb04c0 is described below
commit e0fb04c02762e60395d9a1d36d0764b3de108684
Author: zzzxl <[email protected]>
AuthorDate: Thu Sep 21 16:13:14 2023 +0800
[optimize](chinese) optimize chinese tokenizer load (#123)
---
.../CLucene/analysis/jieba/ChineseTokenizer.cpp | 64 ++++++++++------------
.../CLucene/analysis/jieba/ChineseTokenizer.h | 32 +++--------
.../CLucene/analysis/jieba/FullSegment.hpp | 37 +++++++++++--
src/contribs-lib/CLucene/analysis/jieba/Jieba.hpp | 10 ++++
.../CLucene/analysis/jieba/MixSegment.hpp | 33 +++++++++--
.../CLucene/analysis/jieba/QuerySegment.hpp | 19 +++++++
6 files changed, 127 insertions(+), 68 deletions(-)
diff --git a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
index c72e2451..a2795226 100644
--- a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
+++ b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
@@ -8,7 +8,7 @@ CL_NS_USE(analysis)
CL_NS_USE(util)
ChineseTokenizer::ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode m) : Tokenizer(reader), mode(m) {
- memset(buffer, 0, LUCENE_MAX_WORD_LEN + 1);
+ reset(reader);
}
void ChineseTokenizer::init(const std::string &dictPath) {
@@ -16,45 +16,39 @@ void ChineseTokenizer::init(const std::string &dictPath) {
}
CL_NS(analysis)::Token *ChineseTokenizer::next(lucene::analysis::Token *token) {
- // try to read all words
- const char *initBuffer;
- if (dataLen == 0 || bufferIndex >= dataLen) {
- int totalLen = 0;
- do {
- auto bufferLen = input->read((const void**)&ioBuffer, 1, LUCENE_IO_BUFFER_SIZE);
- if (bufferLen <= 0) {
- dataLen = 0;
- bufferIndex = 0;
- break;
- }
- if (totalLen < LUCENE_IO_BUFFER_SIZE) {
- initBuffer = ioBuffer;
- }
- totalLen+=bufferLen;
- } while (true);
-
- //char tmp_buffer[4 * totalLen + 1];
- //lucene_wcsntoutf8(tmp_buffer, initBuffer, totalLen, 4 * totalLen);
- std::string s(initBuffer, totalLen);
- switch (mode) {
+ if (bufferIndex >= dataLen) {
+ return nullptr;
+ }
+
+ std::string_view& token_text = tokens_text[bufferIndex++];
+ size_t size = std::min(token_text.size(), static_cast<size_t>(LUCENE_MAX_WORD_LEN));
+ token->setNoCopy(token_text.data(), 0, size);
+ return token;
+}
+
+void ChineseTokenizer::reset(lucene::util::Reader* reader) {
+ this->input = reader;
+ this->bufferIndex = 0;
+ this->dataLen = 0;
+ this->tokens_text.clear();
+
+ buffer_.resize(input->size());
+ int32_t numRead = input->readCopy(buffer_.data(), 0, buffer_.size());
+ assert(buffer_.size() == numRead);
+
+ switch (mode) {
case AnalyzerMode::Search:
- JiebaSingleton::getInstance().CutForSearch(s, tokens_text, true);
+ JiebaSingleton::getInstance().CutForSearch(buffer_, tokens_text, true);
break;
case AnalyzerMode::All:
- JiebaSingleton::getInstance().CutAll(s, tokens_text);
+ JiebaSingleton::getInstance().CutAll(buffer_, tokens_text);
break;
case AnalyzerMode::Default:
- JiebaSingleton::getInstance().Cut(s, tokens_text, true);
+ JiebaSingleton::getInstance().Cut(buffer_, tokens_text, true);
break;
- }
- dataLen = tokens_text.size();
}
- if (bufferIndex < dataLen) {
- std::string& token_text = tokens_text[bufferIndex++];
- size_t size = std::min(token_text.size(), static_cast<size_t>(LUCENE_MAX_WORD_LEN));
- token->setNoCopy(token_text.data(), 0, size);
- return token;
- }
- return nullptr;
-}
+
+ dataLen = tokens_text.size();
+};
+
CL_NS_END2
diff --git a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h
index 48de52b1..9bd34fb7 100644
--- a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h
+++ b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h
@@ -4,6 +4,7 @@
#include <CLucene.h>
#include <memory>
+#include <string_view>
#include "Jieba.hpp"
#include "CLucene/analysis/AnalysisHeader.h"
@@ -31,28 +32,15 @@ private:
class ChineseTokenizer : public lucene::analysis::Tokenizer {
private:
AnalyzerMode mode{};
- /** word offset, used to imply which character(in ) is parsed */
- int32_t offset{};
/** the index used only for ioBuffer */
- int32_t bufferIndex{};
+ int32_t bufferIndex = 0;
/** data length */
- int32_t dataLen{};
-
- /**
- * character buffer, store the characters which are used to compose <br>
- * the returned Token
- */
- TCHAR buffer[LUCENE_MAX_WORD_LEN + 1]{};
-
- /**
- * I/O buffer, used to store the content of the input(one of the <br>
- * members of Tokenizer)
- */
- const char* ioBuffer{};
- std::vector<std::string> tokens_text;
- //std::vector<std::unique_ptr<Token>> tokens;
+ int32_t dataLen = 0;
+
+ std::string buffer_;
+ std::vector<std::string_view> tokens_text;
public:
// Constructor
@@ -65,13 +53,7 @@ public:
// Override the next method to tokenize Chinese text using Jieba
lucene::analysis::Token* next(lucene::analysis::Token* token) override;
- void reset(lucene::util::Reader *reader) override {
- this->input = reader;
- this->offset = 0;
- this->bufferIndex = 0;
- this->dataLen = 0;
- this->tokens_text.clear();
- }
+ void reset(lucene::util::Reader *reader) override;
};
CL_NS_END2
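
For readers skimming the two hunks above: after this change the tokenizer reads the whole input once in reset(), lets Jieba segment it into std::string_view tokens that borrow from the owning buffer_, and next() hands those views out without copying. The following standalone sketch illustrates that pattern with a stand-in splitter instead of Jieba; every name in it is illustrative only and not part of the CLucene or Doris API.

// Standalone sketch (illustrative only): read the whole input once into an
// owning buffer, segment it into std::string_view tokens borrowing from that
// buffer, and hand tokens out one at a time without copying.
#include <iostream>
#include <string>
#include <string_view>
#include <vector>

class ViewTokenizer {
public:
    explicit ViewTokenizer(std::string text) { reset(std::move(text)); }

    void reset(std::string text) {
        buffer_ = std::move(text);   // owning storage; the views below borrow from it
        tokens_.clear();
        index_ = 0;
        // Stand-in for Jieba's Cut(): split buffer_ on spaces into views.
        size_t pos = 0;
        while (pos < buffer_.size()) {
            size_t end = buffer_.find(' ', pos);
            if (end == std::string::npos) end = buffer_.size();
            if (end > pos) tokens_.emplace_back(buffer_.data() + pos, end - pos);
            pos = end + 1;
        }
    }

    // Returns the next token view, or an empty view once exhausted.
    std::string_view next() {
        return index_ < tokens_.size() ? tokens_[index_++] : std::string_view{};
    }

private:
    std::string buffer_;                    // must outlive the returned views
    std::vector<std::string_view> tokens_;
    size_t index_ = 0;
};

int main() {
    ViewTokenizer t("doris inverted index chinese tokenizer");
    for (std::string_view tok = t.next(); !tok.empty(); tok = t.next())
        std::cout << tok << '\n';
}
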
diff --git a/src/contribs-lib/CLucene/analysis/jieba/FullSegment.hpp b/src/contribs-lib/CLucene/analysis/jieba/FullSegment.hpp
index 12127a92..df3c4c1c 100644
--- a/src/contribs-lib/CLucene/analysis/jieba/FullSegment.hpp
+++ b/src/contribs-lib/CLucene/analysis/jieba/FullSegment.hpp
@@ -26,6 +26,32 @@ class FullSegment: public SegmentBase {
delete dictTrie_;
}
}
+
+ void Cut(const std::string& sentence, vector<std::string_view>& words) const {
+ PreFilter pre_filter(symbols_, sentence, true);
+ PreFilter::Range range;
+ vector<WordRange> wrs;
+ wrs.reserve(sentence.size()/2);
+ while (pre_filter.HasNext()) {
+ range = pre_filter.Next();
+ if (range.type == PreFilter::Language::CHINESE) {
+ Cut(range.begin, range.end, wrs);
+ } else {
+ CutAlnum(range.begin, range.end, wrs);
+ }
+ }
+ words.clear();
+ words.reserve(wrs.size());
+ for (auto& wr : wrs) {
+ uint32_t len = wr.right->offset - wr.left->offset + wr.right->len;
+ std::string_view word(sentence.data() + wr.left->offset, len);
+ if (stopWords_.count(word)) {
+ continue;
+ }
+ words.emplace_back(word);
+ }
+ }
+
void Cut(const string& sentence,
vector<string>& words) const {
vector<Word> tmp;
@@ -137,10 +163,12 @@ class FullSegment: public SegmentBase {
ifstream ifs(filePath.c_str());
if (ifs.is_open()) {
string line;
- while (getline(ifs, line)) {
- stopWords_.insert(line);
+ while (getline(ifs, line)) {
+ stopWordList_.push_back(line);
+ }
+ for (auto& word : stopWordList_) {
+ stopWords_.insert(std::string_view(word.data(), word.size()));
}
- assert(stopWords_.size());
}
}
@@ -148,7 +176,8 @@ class FullSegment: public SegmentBase {
const DictTrie* dictTrie_;
bool isNeedDestroy_;
- unordered_set<string> stopWords_;
+ std::vector<std::string> stopWordList_;
+ unordered_set<std::string_view> stopWords_;
};
}
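
The stop-word change above (mirrored in MixSegment.hpp below) first collects every line into stopWordList_ and only afterwards builds the unordered_set<std::string_view>, so the views are created after the owning strings have stopped moving. A minimal sketch of that pattern, with hypothetical names, looks like this:

// Sketch of the stop-word loading pattern (names are hypothetical): populate
// the owning container completely before creating any string_view into it, so
// later growth of the container cannot leave views dangling.
#include <fstream>
#include <string>
#include <string_view>
#include <unordered_set>
#include <vector>

struct StopWords {
    std::vector<std::string> storage;           // owns the characters
    std::unordered_set<std::string_view> set;   // borrows from storage

    void load(const std::string& path) {
        std::ifstream ifs(path);
        std::string line;
        while (std::getline(ifs, line))
            storage.push_back(line);            // may reallocate; no views exist yet
        for (const auto& w : storage)           // storage is stable from here on
            set.insert(std::string_view(w));
    }

    bool contains(std::string_view word) const { return set.count(word) != 0; }
};
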
diff --git a/src/contribs-lib/CLucene/analysis/jieba/Jieba.hpp b/src/contribs-lib/CLucene/analysis/jieba/Jieba.hpp
index bfdb89fa..0e6e91de 100644
--- a/src/contribs-lib/CLucene/analysis/jieba/Jieba.hpp
+++ b/src/contribs-lib/CLucene/analysis/jieba/Jieba.hpp
@@ -1,6 +1,7 @@
#ifndef CPPJIEAB_JIEBA_H
#define CPPJIEAB_JIEBA_H
+#include <string_view>
#include "QuerySegment.hpp"
#include "KeywordExtractor.hpp"
@@ -34,18 +35,27 @@ class Jieba {
void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
mix_seg_.Cut(sentence, words, hmm);
}
+ void Cut(const string& sentence, vector<std::string_view>& words, bool hmm = true) const {
+ mix_seg_.Cut(sentence, words, hmm);
+ }
void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
mix_seg_.Cut(sentence, words, hmm);
}
void CutAll(const string& sentence, vector<string>& words) const {
full_seg_.Cut(sentence, words);
}
+ void CutAll(const string& sentence, vector<std::string_view>& words) const {
+ full_seg_.Cut(sentence, words);
+ }
void CutAll(const string& sentence, vector<Word>& words) const {
full_seg_.Cut(sentence, words);
}
void CutForSearch(const string& sentence, vector<string>& words, bool hmm = true) const {
query_seg_.Cut(sentence, words, hmm);
}
+ void CutForSearch(const string& sentence, vector<std::string_view>& words, bool hmm = true) const {
+ query_seg_.Cut(sentence, words, hmm);
+ }
void CutForSearch(const string& sentence, vector<Word>& words, bool hmm = true) const {
query_seg_.Cut(sentence, words, hmm);
}
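
The new Jieba overloads above expose the zero-copy path to callers that pass a vector<std::string_view>. A hypothetical usage sketch (assuming the usual cppjieba namespace; the returned views stay valid only while the sentence string is alive and unmodified):

#include <string>
#include <string_view>
#include <vector>
#include "Jieba.hpp"

// `jieba` is assumed to be constructed elsewhere with the dictionary paths.
void segment(const cppjieba::Jieba& jieba, const std::string& sentence) {
    std::vector<std::string_view> words;                 // views into `sentence`
    jieba.CutForSearch(sentence, words, /*hmm=*/true);   // search-mode segmentation
    // `words` must not be used after `sentence` is destroyed or modified.
}
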
diff --git a/src/contribs-lib/CLucene/analysis/jieba/MixSegment.hpp b/src/contribs-lib/CLucene/analysis/jieba/MixSegment.hpp
index 31839f8d..dcb99d23 100644
--- a/src/contribs-lib/CLucene/analysis/jieba/MixSegment.hpp
+++ b/src/contribs-lib/CLucene/analysis/jieba/MixSegment.hpp
@@ -2,6 +2,7 @@
#define CPPJIEBA_MIXSEGMENT_H
#include <cassert>
+#include <string_view>
#include "MPSegment.hpp"
#include "HMMSegment.hpp"
#include "StringUtil.hpp"
@@ -22,6 +23,27 @@ class MixSegment: public SegmentTagged {
~MixSegment() {
}
+ void Cut(const string& sentence, vector<std::string_view>& words, bool hmm = true) const {
+ PreFilter pre_filter(symbols_, sentence);
+ PreFilter::Range range;
+ vector<WordRange> wrs;
+ wrs.reserve(sentence.size() / 2);
+ while (pre_filter.HasNext()) {
+ range = pre_filter.Next();
+ Cut(range.begin, range.end, wrs, hmm);
+ }
+ words.clear();
+ words.reserve(wrs.size());
+ for (auto& wr : wrs) {
+ uint32_t len = wr.right->offset - wr.left->offset + wr.right->len;
+ std::string_view word(sentence.data() + wr.left->offset, len);
+ if (stopWords_.count(word)) {
+ continue;
+ }
+ words.emplace_back(word);
+ }
+ }
+
void Cut(const string& sentence, vector<string>& words) const {
Cut(sentence, words, true);
}
@@ -104,10 +126,12 @@ class MixSegment: public SegmentTagged {
ifstream ifs(filePath.c_str());
if (ifs.is_open()) {
string line;
- while (getline(ifs, line)) {
- stopWords_.insert(line);
+ while (getline(ifs, line)) {
+ stopWordList_.push_back(line);
+ }
+ for (auto& word : stopWordList_) {
+ stopWords_.insert(std::string_view(word.data(), word.size()));
}
- assert(stopWords_.size());
}
}
@@ -116,7 +140,8 @@ class MixSegment: public SegmentTagged {
HMMSegment hmmSeg_;
PosTagger tagger_;
- unordered_set<string> stopWords_;
+ std::vector<std::string> stopWordList_;
+ unordered_set<std::string_view> stopWords_;
}; // class MixSegment
diff --git a/src/contribs-lib/CLucene/analysis/jieba/QuerySegment.hpp b/src/contribs-lib/CLucene/analysis/jieba/QuerySegment.hpp
index b6a5f759..49a7f69a 100644
--- a/src/contribs-lib/CLucene/analysis/jieba/QuerySegment.hpp
+++ b/src/contribs-lib/CLucene/analysis/jieba/QuerySegment.hpp
@@ -4,6 +4,7 @@
#include <algorithm>
#include <set>
#include <cassert>
+#include <string_view>
#include "Logging.hpp"
#include "DictTrie.hpp"
#include "SegmentBase.hpp"
@@ -24,6 +25,24 @@ class QuerySegment: public SegmentBase {
~QuerySegment() {
}
+ void Cut(const string& sentence, vector<std::string_view>& words, bool hmm = true) const {
+ PreFilter pre_filter(symbols_, sentence);
+ PreFilter::Range range;
+ vector<WordRange> wrs;
+ wrs.reserve(sentence.size()/2);
+ while (pre_filter.HasNext()) {
+ range = pre_filter.Next();
+ Cut(range.begin, range.end, wrs, hmm);
+ }
+ words.clear();
+ words.reserve(wrs.size());
+ for (auto& wr : wrs) {
+ uint32_t len = wr.right->offset - wr.left->offset + wr.right->len;
+ std::string_view word(sentence.data() + wr.left->offset, len);
+ words.emplace_back(word);
+ }
+ }
+
void Cut(const string& sentence, vector<string>& words) const {
Cut(sentence, words, true);
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]