This is an automated email from the ASF dual-hosted git repository.
jianliangqi pushed a commit to branch clucene-2.0
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene-2.0 by this push:
new 0ab22b4f [fix](index compaction)support compact multi segments in one index (#152)
0ab22b4f is described below
commit 0ab22b4f4fa704e9040c8e0c4694c3bfc77769b0
Author: qiye <[email protected]>
AuthorDate: Mon Dec 18 11:18:04 2023 +0800
[fix](index compaction)support compact multi segments in one index (#152)
---
src/core/CLucene/index/DirectoryIndexReader.cpp | 6 ++
src/core/CLucene/index/DirectoryIndexReader.h | 2 +
src/core/CLucene/index/IndexReader.h | 6 ++
src/core/CLucene/index/IndexWriter.cpp | 76 ++++++++++---------------
src/core/CLucene/index/IndexWriter.h | 7 ---
src/core/CLucene/index/MultiReader.cpp | 7 +++
src/core/CLucene/index/MultiReader.h | 1 +
src/core/CLucene/index/MultiSegmentReader.cpp | 5 ++
src/core/CLucene/index/_MultiSegmentReader.h | 1 +
9 files changed, 57 insertions(+), 54 deletions(-)
diff --git a/src/core/CLucene/index/DirectoryIndexReader.cpp b/src/core/CLucene/index/DirectoryIndexReader.cpp
index d3acb6f1..ba3a42ff 100644
--- a/src/core/CLucene/index/DirectoryIndexReader.cpp
+++ b/src/core/CLucene/index/DirectoryIndexReader.cpp
@@ -256,6 +256,12 @@ CL_NS_DEF(index)
return segmentInfos->getVersion();
}
+/** Not implemented.
+* @throws UnsupportedOperationException
+*/
+FieldInfos* DirectoryIndexReader::getFieldInfos() {
+ _CLTHROWA(CL_ERR_UnsupportedOperation, "DirectoryIndexReader does not support this method.");
+}
/**
* Check whether this IndexReader is still using the
* current (i.e., most recently committed) version of the
diff --git a/src/core/CLucene/index/DirectoryIndexReader.h b/src/core/CLucene/index/DirectoryIndexReader.h
index e614f3d3..93f7a24d 100644
--- a/src/core/CLucene/index/DirectoryIndexReader.h
+++ b/src/core/CLucene/index/DirectoryIndexReader.h
@@ -102,6 +102,8 @@ public:
*/
int64_t getVersion();
+ FieldInfos* getFieldInfos();
+
/**
* Check whether this IndexReader is still using the
* current (i.e., most recently committed) version of the
diff --git a/src/core/CLucene/index/IndexReader.h b/src/core/CLucene/index/IndexReader.h
index 59e937df..060c0545 100644
--- a/src/core/CLucene/index/IndexReader.h
+++ b/src/core/CLucene/index/IndexReader.h
@@ -10,8 +10,10 @@
#include "CLucene/util/Array.h"
#include "CLucene/util/VoidList.h"
+#include "CLucene/util/VoidMap.h"
#include "CLucene/LuceneThreads.h"
#include "CLucene/index/IndexVersion.h"
+#include "CLucene/index/_FieldInfos.h"
CL_CLASS_DEF(store,Directory)
CL_CLASS_DEF(store,LuceneLock)
@@ -264,6 +266,10 @@ public:
*/
virtual void norms(const TCHAR* field, uint8_t* bytes) = 0;
+ /**
+ * Get index field infos
+ */
+ virtual FieldInfos* getFieldInfos() = 0;
/** Expert: Resets the normalization factor for the named field of the named
* document.
*
diff --git a/src/core/CLucene/index/IndexWriter.cpp b/src/core/CLucene/index/IndexWriter.cpp
index d200b353..bb4139ff 100644
--- a/src/core/CLucene/index/IndexWriter.cpp
+++ b/src/core/CLucene/index/IndexWriter.cpp
@@ -1246,39 +1246,33 @@ void IndexWriter::indexCompaction(std::vector<lucene::store::Directory *> &src_d
CND_CONDITION(dest_dirs.size() > 0, "Destination directory not found.");
this->_trans_vec = std::move(trans_vec);
- // order mapping: dir -> segment info -> segment reader
- addIndexesSegments(src_dirs);
-
// create segment readers
- int32_t totDocCount = 0;
- int numSegments = segmentInfos->size();
- assert(numSegments > 0);
+ int numIndices = src_dirs.size();
//Set of IndexReaders
if (infoStream != NULL) {
- message(string("src index dir size: ") + Misc::toString(numSegments));
+ message(string("src index dir size: ") + Misc::toString(numIndices));
}
- for (int32_t i = 0; i < numSegments; i++) {
- SegmentInfo *si = segmentInfos->info(i);
- IndexReader *reader = SegmentReader::get(si, MERGE_READ_BUFFER_SIZE, false /* mergeDocStores */);
+ for (int32_t i = 0; i < numIndices; i++) {
+ // One index dir may have more than one segment, so we change the code to open all segments by using IndexReader::open
+ // To keep the number of readers consistent with the number of src dirs.
+ // Using IndexWriter::segmentInfos will be incorrect when there are more than one segment in one index dir
+ IndexReader* reader = lucene::index::IndexReader::open(src_dirs[i], MERGE_READ_BUFFER_SIZE, false);
readers.push_back(reader);
- totDocCount += reader->numDocs();
if (infoStream != NULL) {
message(src_dirs[i]->toString());
}
}
- if (infoStream != NULL) {
- message(string("index compaction total doc count: ") + Misc::toString(totDocCount));
- }
+ assert(readers.size() == numIndices);
// check hasProx
bool hasProx = false;
{
if (!readers.empty()) {
- auto reader = static_cast<SegmentReader*>(readers[0]);
+ IndexReader* reader = readers[0];
hasProx = reader->getFieldInfos()->hasProx();
for (int32_t i = 1; i < readers.size(); i++) {
- if (hasProx != reader->getFieldInfos()->hasProx()) {
+ if (hasProx != readers[i]->getFieldInfos()->hasProx()) {
_CLTHROWA(CL_ERR_IllegalArgument, "src_dirs hasProx inconformity");
}
}
@@ -1506,25 +1500,6 @@ void IndexWriter::compareIndexes(lucene::store::Directory *other) {
}
}
-void IndexWriter::addIndexesSegments(std::vector<lucene::store::Directory *> &dirs) {
- ensureOpen();
- try {
- if (infoStream != NULL)
- message(string("add indexes segments"));
-
- {
- SCOPED_LOCK_MUTEX(this->THIS_LOCK)
- for (auto dir: dirs) {
- SegmentInfos sis;// read infos from dir
- sis.read(dir);
- segmentInfos->insert(&sis, true);
- }
- }
- } catch (CLuceneError &e) {
- throw e;
- }
-}
-
void IndexWriter::mergeFields(bool hasProx) {
//Create a new FieldInfos
fieldInfos = _CLNEW FieldInfos();
@@ -1536,14 +1511,11 @@ void IndexWriter::mergeFields(bool hasProx) {
// fields of all readers are the same, so we pick the first one.
IndexReader *reader = readers[0];
- if (reader->instanceOf(SegmentReader::getClassName())) {
- SegmentReader *segmentReader = (SegmentReader *) reader;
- for (size_t j = 0; j < segmentReader->getFieldInfos()->size(); j++) {
- FieldInfo *fi = segmentReader->getFieldInfos()->fieldInfo(j);
- fieldInfos->add(fi->name, fi->isIndexed, fi->storeTermVector,
- fi->storePositionWithTermVector, fi->storeOffsetWithTermVector,
- !reader->hasNorms(fi->name), hasProx, fi->storePayloads);
- }
+ for (size_t j = 0; j < reader->getFieldInfos()->size(); j++) {
+ FieldInfo *fi = reader->getFieldInfos()->fieldInfo(j);
+ fieldInfos->add(fi->name, fi->isIndexed, fi->storeTermVector,
+ fi->storePositionWithTermVector, fi->storeOffsetWithTermVector,
+ !reader->hasNorms(fi->name), hasProx, fi->storePayloads);
}
}
@@ -1618,11 +1590,21 @@ void IndexWriter::mergeTerms(bool hasProx) {
match[matchSize++] = queue->pop();
Term *smallestTerm = match[0]->term;
- // std::wstring ws = smallestTerm->text();
- // std::string name = std::string(ws.begin(), ws.end());
- // std::cout << name << std::endl;
-
SegmentMergeInfo *top = queue->top();
+ if (infoStream != nullptr) {
+ std::string name = lucene_wcstoutf8string(smallestTerm->text(), smallestTerm->textLength());
+ std::string field = lucene_wcstoutf8string(smallestTerm->field(), wcslen(smallestTerm->field()));
+ message("smallestTerm name: " + name);
+ message("smallestTerm field: " + field);
+
+ if (top != nullptr) {
+ Term* topTerm = top->term;
+ std::string name1 = lucene_wcstoutf8string(topTerm->text(), topTerm->textLength());
+ std::string field1 = lucene_wcstoutf8string(topTerm->field(), wcslen(topTerm->field()));
+ message("topTerm name: " + name1);
+ message("topTerm field: " + field1);
+ }
+ }
while (top != nullptr && smallestTerm->equals(top->term)) {
match[matchSize++] = queue->pop();
top = queue->top();
diff --git a/src/core/CLucene/index/IndexWriter.h b/src/core/CLucene/index/IndexWriter.h
index 21df3c0c..7dbf2b12 100644
--- a/src/core/CLucene/index/IndexWriter.h
+++ b/src/core/CLucene/index/IndexWriter.h
@@ -319,13 +319,6 @@ public:
std::vector<std::vector<std::pair<uint32_t, uint32_t>>> trans_vec,
std::vector<uint32_t> dest_index_docs);
- /**
- * Merges all segments from an array of indexes into this index, just merging segment infos.
- * Simplified from addIndexesNoOptimize(CL_NS(util)::ArrayBase<CL_NS(store)::Directory*>& dirs)
- * @param dirs
- */
- void addIndexesSegments(std::vector<lucene::store::Directory*>& dirs);
-
// create new fields info
void mergeFields(bool hasProx);
// write fields info file
diff --git a/src/core/CLucene/index/MultiReader.cpp b/src/core/CLucene/index/MultiReader.cpp
index 0a94c5f3..726b6e3d 100644
--- a/src/core/CLucene/index/MultiReader.cpp
+++ b/src/core/CLucene/index/MultiReader.cpp
@@ -354,6 +354,13 @@ int64_t MultiReader::getVersion() {
_CLTHROWA(CL_ERR_UnsupportedOperation, "MultiReader does not support this method.");
}
+/** Not implemented.
+ * @throws UnsupportedOperationException
+ */
+FieldInfos* MultiReader::getFieldInfos() {
+ _CLTHROWA(CL_ERR_UnsupportedOperation, "MultiReader does not support this method.");
+}
+
const char* MultiReader::getClassName(){
return "MultiReader";
}
diff --git a/src/core/CLucene/index/MultiReader.h b/src/core/CLucene/index/MultiReader.h
index 64d1cca4..301d1422 100644
--- a/src/core/CLucene/index/MultiReader.h
+++ b/src/core/CLucene/index/MultiReader.h
@@ -94,6 +94,7 @@ public:
bool hasDeletions() const;
uint8_t* norms(const TCHAR* field);
void norms(const TCHAR* field, uint8_t* result);
+ FieldInfos* getFieldInfos();
TermEnum* terms();
TermEnum* terms(const Term* term);
diff --git a/src/core/CLucene/index/MultiSegmentReader.cpp b/src/core/CLucene/index/MultiSegmentReader.cpp
index 495acaa4..ad37807e 100644
--- a/src/core/CLucene/index/MultiSegmentReader.cpp
+++ b/src/core/CLucene/index/MultiSegmentReader.cpp
@@ -325,6 +325,11 @@ void MultiSegmentReader::norms(const TCHAR* field, uint8_t* result) {
(*subReaders)[i]->norms(field, result + starts[i]);
}
+FieldInfos* MultiSegmentReader::getFieldInfos() {
+ // field infos of subReaders are same, so we return the first one.
+ assert(subReaders->length > 0);
+ return (*subReaders)[0]->getFieldInfos();
+}
void MultiSegmentReader::doSetNorm(int32_t n, const TCHAR* field, uint8_t value){
normsCache.removeitr( normsCache.find((TCHAR*)field) );
// clear cache
diff --git a/src/core/CLucene/index/_MultiSegmentReader.h b/src/core/CLucene/index/_MultiSegmentReader.h
index 52f700d1..ad600824 100644
--- a/src/core/CLucene/index/_MultiSegmentReader.h
+++ b/src/core/CLucene/index/_MultiSegmentReader.h
@@ -97,6 +97,7 @@ public:
// synchronized
uint8_t* norms(const TCHAR* field);
void norms(const TCHAR* field, uint8_t* result);
+ FieldInfos* getFieldInfos();
TermEnum* terms();
TermEnum* terms(const Term* term);
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]