This is an automated email from the ASF dual-hosted git repository.
jianliangqi pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene by this push:
new cf210eaaad [opt](chinese) chinese tokenizer lowercase interface (#203)
cf210eaaad is described below
commit cf210eaaadc3ad5d7b27ff2e7b9635ad45cf227b
Author: zzzxl <[email protected]>
AuthorDate: Mon Mar 18 17:43:54 2024 +0800
[opt](chinese) chinese tokenizer lowercase interface (#203)
---
src/core/CLucene/index/IndexWriter.cpp | 18 +++++++++++++++---
src/core/CLucene/index/IndexWriter.h | 4 ++--
2 files changed, 17 insertions(+), 5 deletions(-)
diff --git a/src/core/CLucene/index/IndexWriter.cpp b/src/core/CLucene/index/IndexWriter.cpp
index 0d770182ba..6b52e047f5 100644
--- a/src/core/CLucene/index/IndexWriter.cpp
+++ b/src/core/CLucene/index/IndexWriter.cpp
@@ -1255,7 +1255,7 @@ void IndexWriter::resetMergeExceptions() {
void IndexWriter::indexCompaction(std::vector<lucene::store::Directory *>
&src_dirs,
std::vector<lucene::store::Directory *>
dest_dirs,
std::vector<std::vector<std::pair<uint32_t,
uint32_t>>> trans_vec,
- std::vector<uint32_t> dest_index_docs) {
+ std::vector<uint32_t> dest_index_docs, bool
maybe_skip) {
CND_CONDITION(src_dirs.size() > 0, "Source directory not found.");
CND_CONDITION(dest_dirs.size() > 0, "Destination directory not found.");
this->_trans_vec = std::move(trans_vec);
@@ -1387,7 +1387,7 @@ void IndexWriter::indexCompaction(std::vector<lucene::store::Directory *> &src_d
}
/// merge terms
- mergeTerms(hasProx);
+ mergeTerms(hasProx, maybe_skip);
/// merge null_bitmap
mergeNullBitmap(srcNullBitmapValues, nullBitmapIndexOutputList);
@@ -1613,7 +1613,7 @@ protected:
};
-void IndexWriter::mergeTerms(bool hasProx) {
+void IndexWriter::mergeTerms(bool hasProx, bool maybe_skip) {
auto queue = _CLNEW SegmentMergeQueue(readers.size());
auto numSrcIndexes = readers.size();
//std::vector<TermPositions *> postingsList(numSrcIndexes);
@@ -1664,6 +1664,18 @@ void IndexWriter::mergeTerms(bool hasProx) {
top = queue->top();
}
+ if (maybe_skip && smallestTerm) {
+ auto containsUpperCase = [](const std::wstring_view& ws_term) {
+ return std::any_of(ws_term.begin(), ws_term.end(),
+ [](wchar_t ch) { return std::iswupper(ch)
!= 0; });
+ };
+
+ std::wstring_view ws_term(smallestTerm->text(),
smallestTerm->textLength());
+ if (containsUpperCase(ws_term)) {
+ _CLTHROWA(CL_ERR_InvalidState, "need rewrite, skip index
compaction");
+ }
+ }
+
std::vector<std::vector<uint32_t>> docDeltaBuffers(numDestIndexes);
std::vector<std::vector<uint32_t>> freqBuffers(numDestIndexes);
auto destPostingQueues = _CLNEW postingQueue(matchSize);
diff --git a/src/core/CLucene/index/IndexWriter.h b/src/core/CLucene/index/IndexWriter.h
index 7cfb67d2ca..0e8d40d8cc 100644
--- a/src/core/CLucene/index/IndexWriter.h
+++ b/src/core/CLucene/index/IndexWriter.h
@@ -317,14 +317,14 @@ public:
void indexCompaction(std::vector<lucene::store::Directory*>& src_dirs,
std::vector<lucene::store::Directory*> dest_dirs,
std::vector<std::vector<std::pair<uint32_t,
uint32_t>>> trans_vec,
- std::vector<uint32_t> dest_index_docs);
+ std::vector<uint32_t> dest_index_docs, bool
maybe_skip = false);
// create new fields info
void mergeFields(bool hasProx);
// write fields info file
void writeFields(lucene::store::Directory* d, std::string segment);
// merge terms and write files
- void mergeTerms(bool hasProx);
+ void mergeTerms(bool hasProx, bool maybe_skip = false);
// merge null_bitmap
void mergeNullBitmap(std::vector<std::vector<uint32_t>> srcBitmapValues,
std::vector<lucene::store::IndexOutput *> nullBitmapIndexOutputList);
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]