This is an automated email from the ASF dual-hosted git repository.
jianliangqi pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene by this push:
new ff2cd82f9e Revert "[opt](chinese) chinese tokenizer lowercase interface (#203)" (#204)
ff2cd82f9e is described below
commit ff2cd82f9e545a24318f1256eba312b4d0562a82
Author: qiye <[email protected]>
AuthorDate: Tue Mar 19 15:08:03 2024 +0800
Revert "[opt](chinese) chinese tokenizer lowercase interface (#203)" (#204)
This reverts commit cf210eaaadc3ad5d7b27ff2e7b9635ad45cf227b.
---
src/core/CLucene/index/IndexWriter.cpp | 18 +++---------------
src/core/CLucene/index/IndexWriter.h | 4 ++--
2 files changed, 5 insertions(+), 17 deletions(-)
diff --git a/src/core/CLucene/index/IndexWriter.cpp b/src/core/CLucene/index/IndexWriter.cpp
index 6b52e047f5..0d770182ba 100644
--- a/src/core/CLucene/index/IndexWriter.cpp
+++ b/src/core/CLucene/index/IndexWriter.cpp
@@ -1255,7 +1255,7 @@ void IndexWriter::resetMergeExceptions() {
void IndexWriter::indexCompaction(std::vector<lucene::store::Directory *> &src_dirs,
std::vector<lucene::store::Directory *> dest_dirs,
std::vector<std::vector<std::pair<uint32_t, uint32_t>>> trans_vec,
- std::vector<uint32_t> dest_index_docs, bool maybe_skip) {
+ std::vector<uint32_t> dest_index_docs) {
CND_CONDITION(src_dirs.size() > 0, "Source directory not found.");
CND_CONDITION(dest_dirs.size() > 0, "Destination directory not found.");
this->_trans_vec = std::move(trans_vec);
@@ -1387,7 +1387,7 @@ void IndexWriter::indexCompaction(std::vector<lucene::store::Directory *> &src_d
}
/// merge terms
- mergeTerms(hasProx, maybe_skip);
+ mergeTerms(hasProx);
/// merge null_bitmap
mergeNullBitmap(srcNullBitmapValues, nullBitmapIndexOutputList);
@@ -1613,7 +1613,7 @@ protected:
};
-void IndexWriter::mergeTerms(bool hasProx, bool maybe_skip) {
+void IndexWriter::mergeTerms(bool hasProx) {
auto queue = _CLNEW SegmentMergeQueue(readers.size());
auto numSrcIndexes = readers.size();
//std::vector<TermPositions *> postingsList(numSrcIndexes);
@@ -1664,18 +1664,6 @@ void IndexWriter::mergeTerms(bool hasProx, bool maybe_skip) {
top = queue->top();
}
- if (maybe_skip && smallestTerm) {
- auto containsUpperCase = [](const std::wstring_view& ws_term) {
- return std::any_of(ws_term.begin(), ws_term.end(),
- [](wchar_t ch) { return std::iswupper(ch) != 0; });
- };
-
- std::wstring_view ws_term(smallestTerm->text(), smallestTerm->textLength());
- if (containsUpperCase(ws_term)) {
- _CLTHROWA(CL_ERR_InvalidState, "need rewrite, skip index compaction");
- }
- }
-
std::vector<std::vector<uint32_t>> docDeltaBuffers(numDestIndexes);
std::vector<std::vector<uint32_t>> freqBuffers(numDestIndexes);
auto destPostingQueues = _CLNEW postingQueue(matchSize);
diff --git a/src/core/CLucene/index/IndexWriter.h b/src/core/CLucene/index/IndexWriter.h
index 0e8d40d8cc..7cfb67d2ca 100644
--- a/src/core/CLucene/index/IndexWriter.h
+++ b/src/core/CLucene/index/IndexWriter.h
@@ -317,14 +317,14 @@ public:
void indexCompaction(std::vector<lucene::store::Directory*>& src_dirs,
std::vector<lucene::store::Directory*> dest_dirs,
std::vector<std::vector<std::pair<uint32_t, uint32_t>>> trans_vec,
- std::vector<uint32_t> dest_index_docs, bool maybe_skip = false);
+ std::vector<uint32_t> dest_index_docs);
// create new fields info
void mergeFields(bool hasProx);
// write fields info file
void writeFields(lucene::store::Directory* d, std::string segment);
// merge terms and write files
- void mergeTerms(bool hasProx, bool maybe_skip = false);
+ void mergeTerms(bool hasProx);
// merge null_bitmap
void mergeNullBitmap(std::vector<std::vector<uint32_t>> srcBitmapValues,
std::vector<lucene::store::IndexOutput *> nullBitmapIndexOutputList);
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]