(doris-thirdparty) branch clucene-2.0 updated: [fix](index compaction)support compact multi segments in one index (#152)

jianliangqi Sun, 17 Dec 2023 19:18:15 -0800

This is an automated email from the ASF dual-hosted git repository.

jianliangqi pushed a commit to branch clucene-2.0
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git



The following commit(s) were added to refs/heads/clucene-2.0 by this push:
     new 0ab22b4f [fix](index compaction)support compact multi segments in one index (#152)
0ab22b4f is described below

commit 0ab22b4f4fa704e9040c8e0c4694c3bfc77769b0
Author: qiye <[email protected]>
AuthorDate: Mon Dec 18 11:18:04 2023 +0800

    [fix](index compaction)support compact multi segments in one index (#152)
---
 src/core/CLucene/index/DirectoryIndexReader.cpp |  6 ++
 src/core/CLucene/index/DirectoryIndexReader.h   |  2 +
 src/core/CLucene/index/IndexReader.h            |  6 ++
 src/core/CLucene/index/IndexWriter.cpp          | 76 ++++++++++---------------
 src/core/CLucene/index/IndexWriter.h            |  7 ---
 src/core/CLucene/index/MultiReader.cpp          |  7 +++
 src/core/CLucene/index/MultiReader.h            |  1 +
 src/core/CLucene/index/MultiSegmentReader.cpp   |  5 ++
 src/core/CLucene/index/_MultiSegmentReader.h    |  1 +
 9 files changed, 57 insertions(+), 54 deletions(-)

diff --git a/src/core/CLucene/index/DirectoryIndexReader.cpp b/src/core/CLucene/index/DirectoryIndexReader.cpp
index d3acb6f1..ba3a42ff 100644
--- a/src/core/CLucene/index/DirectoryIndexReader.cpp
+++ b/src/core/CLucene/index/DirectoryIndexReader.cpp
@@ -256,6 +256,12 @@ CL_NS_DEF(index)
     return segmentInfos->getVersion();
   }
 
+/** Not implemented.
+* @throws UnsupportedOperationException
+*/
+FieldInfos* DirectoryIndexReader::getFieldInfos() {
+    _CLTHROWA(CL_ERR_UnsupportedOperation, "DirectoryIndexReader does not 
support this method.");
+}
   /**
    * Check whether this IndexReader is still using the
    * current (i.e., most recently committed) version of the
diff --git a/src/core/CLucene/index/DirectoryIndexReader.h b/src/core/CLucene/index/DirectoryIndexReader.h
index e614f3d3..93f7a24d 100644
--- a/src/core/CLucene/index/DirectoryIndexReader.h
+++ b/src/core/CLucene/index/DirectoryIndexReader.h
@@ -102,6 +102,8 @@ public:
    */
   int64_t getVersion();
 
+  FieldInfos* getFieldInfos();
+
   /**
    * Check whether this IndexReader is still using the
    * current (i.e., most recently committed) version of the
diff --git a/src/core/CLucene/index/IndexReader.h b/src/core/CLucene/index/IndexReader.h
index 59e937df..060c0545 100644
--- a/src/core/CLucene/index/IndexReader.h
+++ b/src/core/CLucene/index/IndexReader.h
@@ -10,8 +10,10 @@
 
 #include "CLucene/util/Array.h"
 #include "CLucene/util/VoidList.h"
+#include "CLucene/util/VoidMap.h"
 #include "CLucene/LuceneThreads.h"
 #include "CLucene/index/IndexVersion.h"
+#include "CLucene/index/_FieldInfos.h"
 
 CL_CLASS_DEF(store,Directory)
 CL_CLASS_DEF(store,LuceneLock)
@@ -264,6 +266,10 @@ public:
        */
        virtual void norms(const TCHAR* field, uint8_t* bytes) = 0;
 
+    /**
+     * Get index field infos
+     */
+    virtual FieldInfos* getFieldInfos() = 0;
   /** Expert: Resets the normalization factor for the named field of the named
   * document.
   *
diff --git a/src/core/CLucene/index/IndexWriter.cpp b/src/core/CLucene/index/IndexWriter.cpp
index d200b353..bb4139ff 100644
--- a/src/core/CLucene/index/IndexWriter.cpp
+++ b/src/core/CLucene/index/IndexWriter.cpp
@@ -1246,39 +1246,33 @@ void 
IndexWriter::indexCompaction(std::vector<lucene::store::Directory *> &src_d
     CND_CONDITION(dest_dirs.size() > 0, "Destination directory not found.");
     this->_trans_vec = std::move(trans_vec);
 
-    // order mapping: dir -> segment info -> segment reader
-    addIndexesSegments(src_dirs);
-
     // create segment readers
-    int32_t totDocCount = 0;
-    int numSegments = segmentInfos->size();
-    assert(numSegments > 0);
+    int numIndices = src_dirs.size();
 
     //Set of IndexReaders
     if (infoStream != NULL) {
-        message(string("src index dir size: ") + Misc::toString(numSegments));
+        message(string("src index dir size: ") + Misc::toString(numIndices));
     }
-    for (int32_t i = 0; i < numSegments; i++) {
-        SegmentInfo *si = segmentInfos->info(i);
-        IndexReader *reader = SegmentReader::get(si, MERGE_READ_BUFFER_SIZE, 
false /* mergeDocStores */);
+    for (int32_t i = 0; i < numIndices; i++) {
+        // One index dir may have more than one segment, so we change the code 
to open all segments by using IndexReader::open
+        // To keep the number of readers consistent with the number of src 
dirs.
+        // Using IndexWriter::segmentInfos will be incorrect when there are 
more than one segment in one index dir
+        IndexReader* reader = lucene::index::IndexReader::open(src_dirs[i], 
MERGE_READ_BUFFER_SIZE, false);
         readers.push_back(reader);
-        totDocCount += reader->numDocs();
         if (infoStream != NULL) {
             message(src_dirs[i]->toString());
         }
     }
-    if (infoStream != NULL) {
-        message(string("index compaction total doc count: ") + 
Misc::toString(totDocCount));
-    }
+    assert(readers.size() == numIndices);
 
     // check hasProx
     bool hasProx = false;
     {
         if (!readers.empty()) {
-            auto reader = static_cast<SegmentReader*>(readers[0]);
+            IndexReader* reader = readers[0];
             hasProx = reader->getFieldInfos()->hasProx();
             for (int32_t i = 1; i < readers.size(); i++) {
-                if (hasProx != reader->getFieldInfos()->hasProx()) {
+                if (hasProx != readers[i]->getFieldInfos()->hasProx()) {
                     _CLTHROWA(CL_ERR_IllegalArgument, "src_dirs hasProx 
inconformity");
                 }
             }
@@ -1506,25 +1500,6 @@ void 
IndexWriter::compareIndexes(lucene::store::Directory *other) {
     }
 }
 
-void IndexWriter::addIndexesSegments(std::vector<lucene::store::Directory *> 
&dirs) {
-    ensureOpen();
-    try {
-        if (infoStream != NULL)
-            message(string("add indexes segments"));
-
-        {
-            SCOPED_LOCK_MUTEX(this->THIS_LOCK)
-            for (auto dir: dirs) {
-                SegmentInfos sis;// read infos from dir
-                sis.read(dir);
-                segmentInfos->insert(&sis, true);
-            }
-        }
-    } catch (CLuceneError &e) {
-        throw e;
-    }
-}
-
 void IndexWriter::mergeFields(bool hasProx) {
     //Create a new FieldInfos
     fieldInfos = _CLNEW FieldInfos();
@@ -1536,14 +1511,11 @@ void IndexWriter::mergeFields(bool hasProx) {
     // fields of all readers are the same, so we pick the first one.
     IndexReader *reader = readers[0];
 
-    if (reader->instanceOf(SegmentReader::getClassName())) {
-        SegmentReader *segmentReader = (SegmentReader *) reader;
-        for (size_t j = 0; j < segmentReader->getFieldInfos()->size(); j++) {
-            FieldInfo *fi = segmentReader->getFieldInfos()->fieldInfo(j);
-            fieldInfos->add(fi->name, fi->isIndexed, fi->storeTermVector,
-                            fi->storePositionWithTermVector, 
fi->storeOffsetWithTermVector,
-                            !reader->hasNorms(fi->name), hasProx, 
fi->storePayloads);
-        }
+    for (size_t j = 0; j < reader->getFieldInfos()->size(); j++) {
+        FieldInfo *fi = reader->getFieldInfos()->fieldInfo(j);
+        fieldInfos->add(fi->name, fi->isIndexed, fi->storeTermVector,
+                        fi->storePositionWithTermVector, 
fi->storeOffsetWithTermVector,
+                        !reader->hasNorms(fi->name), hasProx, 
fi->storePayloads);
     }
 }
 
@@ -1618,11 +1590,21 @@ void IndexWriter::mergeTerms(bool hasProx) {
 
         match[matchSize++] = queue->pop();
         Term *smallestTerm = match[0]->term;
-        // std::wstring ws = smallestTerm->text();
-        // std::string name = std::string(ws.begin(), ws.end());
-        // std::cout << name << std::endl;
-
         SegmentMergeInfo *top = queue->top();
+        if (infoStream != nullptr) {
+            std::string name = lucene_wcstoutf8string(smallestTerm->text(), 
smallestTerm->textLength());
+            std::string field = lucene_wcstoutf8string(smallestTerm->field(), 
wcslen(smallestTerm->field()));
+            message("smallestTerm name: " + name);
+            message("smallestTerm field: " + field);
+
+            if (top != nullptr) {
+                Term* topTerm = top->term;
+                std::string name1 = lucene_wcstoutf8string(topTerm->text(), 
topTerm->textLength());
+                std::string field1 = lucene_wcstoutf8string(topTerm->field(), 
wcslen(topTerm->field()));
+                message("topTerm name: " + name1);
+                message("topTerm field: " + field1);
+            }
+        }
         while (top != nullptr && smallestTerm->equals(top->term)) {
             match[matchSize++] = queue->pop();
             top = queue->top();
diff --git a/src/core/CLucene/index/IndexWriter.h b/src/core/CLucene/index/IndexWriter.h
index 21df3c0c..7dbf2b12 100644
--- a/src/core/CLucene/index/IndexWriter.h
+++ b/src/core/CLucene/index/IndexWriter.h
@@ -319,13 +319,6 @@ public:
                             std::vector<std::vector<std::pair<uint32_t, 
uint32_t>>> trans_vec,
                             std::vector<uint32_t> dest_index_docs);
 
-    /**
-     * Merges all segments from an array of indexes into this index, just 
merging segment infos.
-     * Simplified from 
addIndexesNoOptimize(CL_NS(util)::ArrayBase<CL_NS(store)::Directory*>& dirs)
-     * @param dirs
-     */
-    void addIndexesSegments(std::vector<lucene::store::Directory*>& dirs);
-
     // create new fields info
     void mergeFields(bool hasProx);
     // write fields info file
diff --git a/src/core/CLucene/index/MultiReader.cpp b/src/core/CLucene/index/MultiReader.cpp
index 0a94c5f3..726b6e3d 100644
--- a/src/core/CLucene/index/MultiReader.cpp
+++ b/src/core/CLucene/index/MultiReader.cpp
@@ -354,6 +354,13 @@ int64_t MultiReader::getVersion() {
   _CLTHROWA(CL_ERR_UnsupportedOperation, "MultiReader does not support this 
method.");
 }
 
+/** Not implemented.
+ * @throws UnsupportedOperationException
+ */
+FieldInfos* MultiReader::getFieldInfos() {
+    _CLTHROWA(CL_ERR_UnsupportedOperation, "MultiReader does not support this 
method.");
+}
+
 const char* MultiReader::getClassName(){
   return "MultiReader";
 }
diff --git a/src/core/CLucene/index/MultiReader.h b/src/core/CLucene/index/MultiReader.h
index 64d1cca4..301d1422 100644
--- a/src/core/CLucene/index/MultiReader.h
+++ b/src/core/CLucene/index/MultiReader.h
@@ -94,6 +94,7 @@ public:
        bool hasDeletions() const;
        uint8_t* norms(const TCHAR* field);
        void norms(const TCHAR* field, uint8_t* result);
+    FieldInfos* getFieldInfos();
        TermEnum* terms();
        TermEnum* terms(const Term* term);
 
diff --git a/src/core/CLucene/index/MultiSegmentReader.cpp b/src/core/CLucene/index/MultiSegmentReader.cpp
index 495acaa4..ad37807e 100644
--- a/src/core/CLucene/index/MultiSegmentReader.cpp
+++ b/src/core/CLucene/index/MultiSegmentReader.cpp
@@ -325,6 +325,11 @@ void MultiSegmentReader::norms(const TCHAR* field, 
uint8_t* result) {
          (*subReaders)[i]->norms(field, result + starts[i]);
 }
 
+FieldInfos* MultiSegmentReader::getFieldInfos() {
+    // field infos of subReaders are same, so we return the first one.
+    assert(subReaders->length > 0);
+    return (*subReaders)[0]->getFieldInfos();
+}
 
 void MultiSegmentReader::doSetNorm(int32_t n, const TCHAR* field, uint8_t 
value){
        normsCache.removeitr( normsCache.find((TCHAR*)field) );                 
        // clear cache
diff --git a/src/core/CLucene/index/_MultiSegmentReader.h b/src/core/CLucene/index/_MultiSegmentReader.h
index 52f700d1..ad600824 100644
--- a/src/core/CLucene/index/_MultiSegmentReader.h
+++ b/src/core/CLucene/index/_MultiSegmentReader.h
@@ -97,6 +97,7 @@ public:
        // synchronized
        uint8_t* norms(const TCHAR* field);
        void norms(const TCHAR* field, uint8_t* result);
+    FieldInfos* getFieldInfos();
 
        TermEnum* terms();
        TermEnum* terms(const Term* term);


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(doris-thirdparty) branch clucene-2.0 updated: [fix](index compaction)support compact multi segments in one index (#152)

Reply via email to