(doris-thirdparty) branch clucene-3.1 updated: [opt](inverted index) add block-based reading with readRange and skipToBlock (#372)

airborne Tue, 06 Jan 2026 05:37:50 -0800

This is an automated email from the ASF dual-hosted git repository.

airborne pushed a commit to branch clucene-3.1
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git



The following commit(s) were added to refs/heads/clucene-3.1 by this push:
     new a4796e961e1 [opt](inverted index) add block-based reading with 
readRange and skipToBlock (#372)
a4796e961e1 is described below

commit a4796e961e1724f8b6b5145658bb2704981376e5
Author: Jack <[email protected]>
AuthorDate: Tue Jan 6 21:36:47 2026 +0800

    [opt](inverted index) add block-based reading with readRange and 
skipToBlock (#372)
    
    * [opt](inverted index) add block-based reading with readRange and 
skipToBlock (#371)
    
    * fix interface
    
    * fix interface
    
    ---------
    
    Co-authored-by: zzzxl <[email protected]>
---
 src/core/CLucene/index/DocRange.h               |   2 +
 src/core/CLucene/index/MultiSegmentReader.cpp   |  28 +-
 src/core/CLucene/index/SegmentTermDocs.cpp      |  45 +-
 src/core/CLucene/index/SegmentTermPositions.cpp |   8 +
 src/core/CLucene/index/Terms.h                  |   8 +
 src/core/CLucene/index/_MultiSegmentReader.h    |   6 +
 src/core/CLucene/index/_SegmentHeader.h         |  11 +-
 src/test/CMakeLists.txt                         |   1 +
 src/test/index/TestReadRange.cpp                | 854 ++++++++++++++++++++++++
 src/test/test.h                                 |   1 +
 src/test/tests.cpp                              |   1 +
 11 files changed, 960 insertions(+), 5 deletions(-)

diff --git a/src/core/CLucene/index/DocRange.h 
b/src/core/CLucene/index/DocRange.h
index ef7906a24fb..9412178eb6b 100644
--- a/src/core/CLucene/index/DocRange.h
+++ b/src/core/CLucene/index/DocRange.h
@@ -27,4 +27,6 @@ class DocRange {
   std::vector<uint32_t>* freq_many = nullptr;
 
   std::pair<uint32_t, uint32_t> doc_range;
+
+  bool need_positions = false;
 };
\ No newline at end of file
diff --git a/src/core/CLucene/index/MultiSegmentReader.cpp 
b/src/core/CLucene/index/MultiSegmentReader.cpp
index e5987023c94..6ab57b2c80f 100644
--- a/src/core/CLucene/index/MultiSegmentReader.cpp
+++ b/src/core/CLucene/index/MultiSegmentReader.cpp
@@ -672,11 +672,14 @@ bool MultiTermDocs::readRange(DocRange* docRange) {
                if (!current->readRange(docRange)) {
                        current = nullptr;
                } else {
-                       if (docRange->type_ == DocRangeType::kMany) {
+                       // Always update doc_many with base offset (doc_many is 
always valid)
+                       if (docRange->doc_many && docRange->doc_many_size_ > 0) 
{
                                auto begin = docRange->doc_many->begin();
                                auto end = docRange->doc_many->begin() + 
docRange->doc_many_size_;
                                std::transform(begin, end, begin, 
[this](int32_t val) { return val + base; });
-                       } else if (docRange->type_ == DocRangeType::kRange) {
+                       }
+                       // Also update doc_range if kRange type
+                       if (docRange->type_ == DocRangeType::kRange) {
                                docRange->doc_range.first += base;
                                docRange->doc_range.second += base;
                        }
@@ -703,6 +706,17 @@ bool MultiTermDocs::skipTo(const int32_t target) {
        }
 }
 
+// Block-level skip across sub-readers: selects the last sub-reader whose start
+// offset is <= target, then forwards the segment-local target to it.
+void MultiTermDocs::skipToBlock(const int32_t target) {
+    while (pointer < subReaders->length) {
+        if (target < starts[pointer]) {
+            break; // the next sub-reader begins after the target
+        }
+        base = starts[pointer];
+        current = termDocs(pointer++);
+    }
+
+    if (current == nullptr) {
+        return;
+    }
+    // Sub-readers use segment-local doc ids, so strip the base offset.
+    current->skipToBlock(target - base);
+}
+
 void MultiTermDocs::close() {
 //Func - Closes all MultiTermDocs managed by this instance
 //Pre  - true
@@ -959,6 +973,11 @@ int32_t MultiTermPositions::nextPosition() {
        return curAsTP->nextPosition();
 }
 
+// Forwards to the current sub-reader's TermPositions view.
+int32_t MultiTermPositions::nextDeltaPosition() {
+       return current->__asTermPositions()->nextDeltaPosition();
+}
+
 int32_t MultiTermPositions::getPayloadLength() const{
   TermPositions* curAsTP = current->__asTermPositions();
   return curAsTP->getPayloadLength();
@@ -974,4 +993,9 @@ bool MultiTermPositions::isPayloadAvailable() const{
   return curAsTP->isPayloadAvailable();
 }
 
+// Forwards the pending prox-skip count to the current sub-reader.
+void MultiTermPositions::addLazySkipProxCount(int32_t count) {
+  current->__asTermPositions()->addLazySkipProxCount(count);
+}
+
 CL_NS_END
diff --git a/src/core/CLucene/index/SegmentTermDocs.cpp 
b/src/core/CLucene/index/SegmentTermDocs.cpp
index 5f5df366f91..6014099c0d1 100644
--- a/src/core/CLucene/index/SegmentTermDocs.cpp
+++ b/src/core/CLucene/index/SegmentTermDocs.cpp
@@ -146,9 +146,26 @@ bool SegmentTermDocs::readRange(DocRange* docRange) {
     }
 
     buffer_.readRange(docRange);
-
     count += docRange->doc_many_size_;
 
+    if (docRange->need_positions && hasProx && docRange->doc_many_size_ > 0 && 
df >= skipInterval) {
+        if (skipListReader == nullptr) {
+            skipListReader = _CLNEW DefaultSkipListReader(freqStream->clone(), 
maxSkipLevels, skipInterval);
+            skipListReader->setIoContext(io_ctx_);
+        }
+        if (!haveSkipped) {
+            skipListReader->init(skipPointer, freqBasePointer, 
proxBasePointer, df, hasProx, currentFieldStoresPayloads);
+            haveSkipped = true;
+        }
+        
+        uint32_t firstDoc = (*docRange->doc_many)[0];
+        int32_t skippedCount = skipListReader->skipTo(firstDoc);
+
+        if (skipListReader->getDoc() >= 0) {
+            skipProx(skipListReader->getProxPointer(), 
skipListReader->getPayloadLength());
+        }
+    }
+
     if (docRange->doc_many_size_ > 0) {
         uint32_t start = (*docRange->doc_many)[0];
         uint32_t end = (*docRange->doc_many)[docRange->doc_many_size_ - 1];
@@ -195,6 +212,32 @@ bool SegmentTermDocs::skipTo(const int32_t target) {
     return true;
 }
 
+// Uses the skip list to move the freq/prox streams to the skip entry found for
+// `target`, without scanning to the exact document. The caller is expected to
+// continue with readRange(). Nothing is done when df < skipInterval.
+void SegmentTermDocs::skipToBlock(const int32_t target) {
+    if (df < skipInterval) {
+        // Nothing to skip. Caller will use readRange() sequentially.
+        return;
+    }
+
+    // The skip list reader is created on first use.
+    if (skipListReader == NULL) {
+        skipListReader = _CLNEW DefaultSkipListReader(freqStream->clone(), maxSkipLevels, skipInterval);
+        skipListReader->setIoContext(io_ctx_);
+    }
+
+    if (!haveSkipped) {
+        skipListReader->init(skipPointer, freqBasePointer, proxBasePointer, df, hasProx, currentFieldStoresPayloads);
+        haveSkipped = true;
+    }
+
+    const int32_t newCount = skipListReader->skipTo(target);
+    if (newCount > count) {
+        // Only move when the skip entry lies ahead of the current position.
+        freqStream->seek(skipListReader->getFreqPointer());
+        skipProx(skipListReader->getProxPointer(), skipListReader->getPayloadLength());
+
+        _doc = skipListReader->getDoc();
+        count = newCount;
+        // Note: We do NOT call buffer_.refill() here.
+        // The caller will use readRange() to read the next block.
+    }
+}
+
 void TermDocsBuffer::refill() {
     cur_doc_ = 0;
     cur_freq_ = 0;
diff --git a/src/core/CLucene/index/SegmentTermPositions.cpp 
b/src/core/CLucene/index/SegmentTermPositions.cpp
index 7ddb1a2ad18..973206fe5f0 100644
--- a/src/core/CLucene/index/SegmentTermPositions.cpp
+++ b/src/core/CLucene/index/SegmentTermPositions.cpp
@@ -69,6 +69,14 @@ int32_t SegmentTermPositions::nextPosition() {
     return position += readDeltaPosition();
 }
 
+// Returns the next raw position delta (not accumulated into `position`),
+// or 0 when hasProx is false.
+int32_t SegmentTermPositions::nextDeltaPosition() {
+    if (hasProx) {
+        lazySkip();
+        return readDeltaPosition();
+    }
+    return 0;
+}
+
 int32_t SegmentTermPositions::readDeltaPosition() {
        int32_t delta = buffer_.getPos();
        if (currentFieldStoresPayloads) {
diff --git a/src/core/CLucene/index/Terms.h b/src/core/CLucene/index/Terms.h
index 0af1102874c..7a29c9cc94b 100644
--- a/src/core/CLucene/index/Terms.h
+++ b/src/core/CLucene/index/Terms.h
@@ -74,6 +74,11 @@ public:
        // Some implementations are considerably more efficient than that.
        virtual bool skipTo(const int32_t target)=0;
 
+       // Skip to the block containing the target document using skip list.
+       // This is an optimization that positions the stream for subsequent
+       // readRange calls. Unlike skipTo, this does not scan to find the exact
+       // document. The default implementation does nothing.
+       virtual void skipToBlock(const int32_t target) {}
+
        // Frees associated resources.
        virtual void close() = 0;
 
@@ -195,6 +200,9 @@ public:
          */
        virtual TermDocs* __asTermDocs()=0;
        virtual TermPositions* __asTermPositions()=0;
+
+       virtual void addLazySkipProxCount(int32_t count) {} // default: no-op; implementations may override
+       virtual int32_t nextDeltaPosition() { return 0; } // default: always 0; implementations may override
 };
 CL_NS_END
 #endif
diff --git a/src/core/CLucene/index/_MultiSegmentReader.h 
b/src/core/CLucene/index/_MultiSegmentReader.h
index 830315208c2..569cb8fb8a1 100644
--- a/src/core/CLucene/index/_MultiSegmentReader.h
+++ b/src/core/CLucene/index/_MultiSegmentReader.h
@@ -168,6 +168,9 @@ public:
    /* A Possible future optimization could skip entire segments */
   bool skipTo(const int32_t target);
 
+  /** Skip to the block containing target using skip list. */
+  void skipToBlock(const int32_t target) override;
+
   void close();
 
   virtual TermPositions* __asTermPositions();
@@ -229,6 +232,7 @@ public:
   MultiTermPositions(CL_NS(util)::ArrayBase<IndexReader*>* subReaders, const 
int32_t* s);
   virtual ~MultiTermPositions() {};
   int32_t nextPosition();
+  int32_t nextDeltaPosition();
 
   /**
   * Not implemented.
@@ -249,6 +253,8 @@ public:
   // TODO: Remove warning after API has been finalized
   bool isPayloadAvailable() const;
 
+  void addLazySkipProxCount(int32_t count) override;
+
   virtual TermDocs* __asTermDocs();
   virtual TermPositions* __asTermPositions();
 };
diff --git a/src/core/CLucene/index/_SegmentHeader.h 
b/src/core/CLucene/index/_SegmentHeader.h
index 54e84ad4ffd..0b3cf1c75d7 100644
--- a/src/core/CLucene/index/_SegmentHeader.h
+++ b/src/core/CLucene/index/_SegmentHeader.h
@@ -25,8 +25,7 @@
 #include "_CompoundFile.h"
 #include "DirectoryIndexReader.h"
 #include "_SkipListReader.h"
-#include "CLucene/util/_ThreadLocal.h"
-#include "CLucene/index/IndexVersion.h"
+#include "_TermInfosReader.h"
 
 CL_NS_DEF(index)
 class SegmentReader;
@@ -198,6 +197,9 @@ public:
   /** Optimized implementation. */
   virtual bool skipTo(const int32_t target);
 
+    /** Skip to the block containing target using skip list. */
+    void skipToBlock(const int32_t target) override;
+
   virtual TermPositions* __asTermPositions();
 
   void setIoContext(const void* io_ctx) override;
@@ -241,6 +243,8 @@ public:
 
   void setIoContext(const void* io_ctx) override;
 
+    /** Adds count to the pending lazySkipProxCount. */
+    void addLazySkipProxCount(int32_t count) override {
+        lazySkipProxCount += count;
+    }
+
 private:
   void seek(const TermInfo* ti, Term* term);
 
@@ -248,6 +252,8 @@ public:
   void close();
 
   int32_t nextPosition();
+    int32_t nextDeltaPosition();
+
 private:
   int32_t readDeltaPosition();
 
@@ -296,6 +302,7 @@ private:
   int32_t doc() const{ return SegmentTermDocs::doc(); }
   int32_t freq() const{ return SegmentTermDocs::freq(); }
   bool skipTo(const int32_t target){ return SegmentTermDocs::skipTo(target); }
+    void skipToBlock(const int32_t target) {
+        SegmentTermDocs::skipToBlock(target);
+    }
 
 private:
   IndexVersion indexVersion_ = IndexVersion::kV0; 
diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt
index 083edeeab8c..4aa23796d31 100644
--- a/src/test/CMakeLists.txt
+++ b/src/test/CMakeLists.txt
@@ -112,6 +112,7 @@ SET(test_files ./tests.cpp
         ./query/TestMultiPhraseQuery.cpp
         ./store/TestUTF8Chars.cpp
         ./store/testPFOR.cpp
+        ./index/TestReadRange.cpp
         ${test_HEADERS})
 IF (USE_SHARED_OBJECT_FILES)
     GET_SHARED_FILES(clucene_shared_Files)
diff --git a/src/test/index/TestReadRange.cpp b/src/test/index/TestReadRange.cpp
new file mode 100644
index 00000000000..23d448c3fa6
--- /dev/null
+++ b/src/test/index/TestReadRange.cpp
@@ -0,0 +1,854 @@
+/*------------------------------------------------------------------------------
+ * Test for readRange and skipToBlock interfaces
+ * Compares block-based reading with traditional next() approach
+ * Verifies correctness and performance
+ *------------------------------------------------------------------------------*/
+#include <CLucene.h>
+#include <CLucene/index/DocRange.h>
+#include <CLucene/index/IndexReader.h>
+#include <CLucene/util/stringUtil.h>
+
+#include <algorithm>
+#include <chrono>
+#include <ctime>
+#include <random>
+#include <string>
+#include <vector>
+
+#include "CLucene/analysis/Analyzers.h"
+#include "CLucene/index/IndexVersion.h"
+#include "CLucene/index/Term.h"
+#include "CLucene/store/RAMDirectory.h"
+#include "test.h"
+
+CL_NS_USE(search)
+CL_NS_USE(store)
+CL_NS_USE(index)
+CL_NS_USE(util)
+
+// Test configuration
+static constexpr int32_t DEFAULT_DOC_COUNT = 10000;
+static constexpr int32_t PERF_ITERATIONS = 10;
+
+// Runs `cleanup` unconditionally, then rethrows the exception held in
+// `pending` if it is set.
+#define FINALLY(pending, cleanup)            \
+    {                                        \
+        cleanup;                             \
+        if (pending) {                       \
+            std::rethrow_exception(pending); \
+        }                                    \
+    }
+
+// Returns a seed derived from today's local midnight, so every call made on
+// the same local day yields the same value.
+static int32_t getDaySeed() {
+    constexpr std::time_t kSecondsPerDay = 60 * 60 * 24;
+    std::time_t now = std::time(nullptr);
+    const std::tm* localNow = std::localtime(&now);
+    if (localNow == nullptr) {
+        // localtime can fail; fall back to the day number of the raw timestamp.
+        return static_cast<int32_t>(now / kSecondsPerDay);
+    }
+    // Work on a copy: localtime returns a pointer to shared static storage.
+    std::tm midnight = *localNow;
+    midnight.tm_sec = 0;
+    midnight.tm_min = 0;
+    midnight.tm_hour = 0;
+    return static_cast<int32_t>(std::mktime(&midnight) / kSecondsPerDay);
+}
+
+// Builds a space-separated string of randomly chosen words. The number of
+// words is drawn uniformly from [minTokens, maxTokens]; used to produce
+// multi-token documents for position testing.
+static std::string generateRandomText(std::mt19937& rng, int minTokens, int maxTokens) {
+    static const std::vector<std::string> words = {
+            "apple",    "banana",    "cherry",     "date",      "elderberry",
+            "fig",      "grape",     "honeydew",   "kiwi",      "lemon",
+            "mango",    "nectarine", "orange",     "papaya",    "quince",
+            "raspberry", "strawberry", "tangerine", "watermelon", "blueberry"};
+
+    std::uniform_int_distribution<int> tokenDist(minTokens, maxTokens);
+    std::uniform_int_distribution<size_t> wordDist(0, words.size() - 1);
+
+    // Draw the token count first, then one word per token.
+    const int tokenCount = tokenDist(rng);
+    std::string text;
+    for (int remaining = tokenCount; remaining > 0; --remaining) {
+        if (!text.empty()) {
+            text += " ";
+        }
+        text += words[wordDist(rng)];
+    }
+    return text;
+}
+
+// Creates a new index in `dir` with one document per entry of `datas`.
+// Every entry is tokenized by a SimpleAnalyzer into the field `fieldName`
+// (not stored, tokenized, freqs and positions kept) using `indexVersion`.
+// A single Document/Field pair is reused for all entries.
+static void writeTestIndex(const std::string& fieldName, RAMDirectory* dir,
+                           IndexVersion indexVersion, const std::vector<std::string>& datas) {
+    auto* simpleAnalyzer = _CLNEW lucene::analysis::SimpleAnalyzer<char>;
+    simpleAnalyzer->set_stopwords(nullptr);
+
+    auto* writer = _CLNEW lucene::index::IndexWriter(dir, simpleAnalyzer, true);
+    writer->setRAMBufferSizeMB(512);
+    writer->setMaxBufferedDocs(-1);
+    writer->setMaxFieldLength(0x7FFFFFFFL);
+    writer->setMergeFactor(1000000000);
+    writer->setUseCompoundFile(false);
+
+    auto* textReader = _CLNEW lucene::util::SStringReader<char>;
+
+    auto* document = _CLNEW lucene::document::Document();
+    int32_t fieldFlags = lucene::document::Field::STORE_NO;
+    fieldFlags |= lucene::document::Field::INDEX_TOKENIZED;
+    auto wideFieldName = std::wstring(fieldName.begin(), fieldName.end());
+    auto* contentField = _CLNEW lucene::document::Field(wideFieldName.c_str(), fieldFlags);
+    contentField->setOmitTermFreqAndPositions(false);
+    contentField->setIndexVersion(indexVersion);
+    document->add(*contentField);
+
+    // Re-point the shared reader/field at each input and add the document.
+    for (const auto& text : datas) {
+        textReader->init(text.data(), text.size(), false);
+        auto* tokens = simpleAnalyzer->reusableTokenStream(contentField->name(), textReader);
+        contentField->setValue(tokens);
+        writer->addDocument(document);
+    }
+
+    writer->close();
+
+    _CLLDELETE(writer);
+    _CLLDELETE(document);
+    _CLLDELETE(simpleAnalyzer);
+    _CLLDELETE(textReader);
+}
+
+// Postings collected for one term, used to compare two read strategies; docs[i] pairs with freqs[i].
+struct TermDocsResult {
+    std::vector<int32_t> docs;  // doc ids in the order they were read
+    std::vector<int32_t> freqs; // frequency recorded for each entry of docs
+};
+
+struct TermPositionsResult { // postings plus positions collected for one term
+    std::vector<int32_t> docs;  // doc ids in the order they were read
+    std::vector<int32_t> freqs; // frequency recorded for each entry of docs
+    std::vector<std::vector<int32_t>> positions; // positions per doc
+};
+
+// Drains `termDocs` one posting at a time with next(), recording doc() and
+// freq() for each posting.
+static TermDocsResult readWithNext(TermDocs* termDocs) {
+    TermDocsResult collected;
+    for (bool more = termDocs->next(); more; more = termDocs->next()) {
+        collected.docs.push_back(termDocs->doc());
+        collected.freqs.push_back(termDocs->freq());
+    }
+    return collected;
+}
+
+// Drains `termDocs` block by block with readRange(). A doc without a
+// matching freq entry is recorded with frequency 1.
+static TermDocsResult readWithRange(TermDocs* termDocs) {
+    TermDocsResult collected;
+    DocRange block;
+    while (termDocs->readRange(&block)) {
+        for (uint32_t idx = 0; idx < block.doc_many_size_; ++idx) {
+            collected.docs.push_back((*block.doc_many)[idx]);
+
+            const bool hasFreq = block.freq_many && idx < block.freq_many_size_;
+            if (hasFreq) {
+                collected.freqs.push_back((*block.freq_many)[idx]);
+            } else {
+                collected.freqs.push_back(1);
+            }
+        }
+    }
+    return collected;
+}
+
+// Drains `termPos` with next(); for each doc, reads freq() positions through
+// nextPosition().
+static TermPositionsResult readPositionsWithNext(TermPositions* termPos) {
+    TermPositionsResult collected;
+    for (bool more = termPos->next(); more; more = termPos->next()) {
+        collected.docs.push_back(termPos->doc());
+        const int32_t docFreq = termPos->freq();
+        collected.freqs.push_back(docFreq);
+
+        std::vector<int32_t> docPositions;
+        for (int32_t remaining = docFreq; remaining > 0; --remaining) {
+            docPositions.push_back(termPos->nextPosition());
+        }
+        collected.positions.push_back(std::move(docPositions));
+    }
+    return collected;
+}
+
+// Drains `termPos` block by block with readRange() (need_positions set) and
+// rebuilds absolute positions by summing nextDeltaPosition() per doc.
+// A doc without a matching freq entry is treated as frequency 1.
+static TermPositionsResult readPositionsWithRange(TermPositions* termPos) {
+    TermPositionsResult collected;
+    DocRange block;
+    block.need_positions = true;
+
+    while (termPos->readRange(&block)) {
+        for (uint32_t idx = 0; idx < block.doc_many_size_; ++idx) {
+            collected.docs.push_back((*block.doc_many)[idx]);
+
+            int32_t docFreq = 1;
+            if (block.freq_many && idx < block.freq_many_size_) {
+                docFreq = (*block.freq_many)[idx];
+            }
+            collected.freqs.push_back(docFreq);
+
+            // Positions restart from 0 for every doc; deltas accumulate.
+            std::vector<int32_t> docPositions;
+            int32_t running = 0;
+            for (int32_t remaining = docFreq; remaining > 0; --remaining) {
+                running += termPos->nextDeltaPosition();
+                docPositions.push_back(running);
+            }
+            collected.positions.push_back(std::move(docPositions));
+        }
+    }
+    return collected;
+}
+
+// Compare TermDocs results.
+// Returns true when both results hold the same docs and freqs in the same
+// order; otherwise writes the first difference to std::cerr and returns false.
+// `checkNorms` is kept for interface compatibility and is not used.
+static bool compareTermDocsResults(const TermDocsResult& a, const TermDocsResult& b,
+                                   bool checkNorms = false) {
+    (void)checkNorms;
+
+    if (a.docs.size() != b.docs.size()) {
+        std::cerr << "Doc count mismatch: " << a.docs.size() << " vs " << b.docs.size()
+                  << std::endl;
+        return false;
+    }
+    // docs and freqs are indexed in parallel below; reject malformed results
+    // instead of reading out of bounds.
+    if (a.freqs.size() != a.docs.size() || b.freqs.size() != b.docs.size()) {
+        std::cerr << "Freq count mismatch: " << a.freqs.size() << " vs " << b.freqs.size()
+                  << std::endl;
+        return false;
+    }
+
+    for (size_t i = 0; i < a.docs.size(); ++i) {
+        if (a.docs[i] != b.docs[i]) {
+            std::cerr << "Doc mismatch at " << i << ": " << a.docs[i] << " vs " << b.docs[i]
+                      << std::endl;
+            return false;
+        }
+        if (a.freqs[i] != b.freqs[i]) {
+            std::cerr << "Freq mismatch at doc " << a.docs[i] << ": " << a.freqs[i] << " vs "
+                      << b.freqs[i] << std::endl;
+            return false;
+        }
+    }
+    return true;
+}
+
+// Compare TermPositions results.
+// Returns true when both results hold the same docs, freqs and per-doc
+// positions in the same order; otherwise writes the first difference to
+// std::cerr and returns false.
+static bool compareTermPositionsResults(const TermPositionsResult& a,
+                                        const TermPositionsResult& b) {
+    if (a.docs.size() != b.docs.size()) {
+        std::cerr << "Doc count mismatch: " << a.docs.size() << " vs " << b.docs.size()
+                  << std::endl;
+        return false;
+    }
+    // docs, freqs and positions are indexed in parallel below; reject
+    // malformed results instead of reading out of bounds.
+    if (a.freqs.size() != a.docs.size() || b.freqs.size() != b.docs.size()) {
+        std::cerr << "Freq count mismatch: " << a.freqs.size() << " vs " << b.freqs.size()
+                  << std::endl;
+        return false;
+    }
+    if (a.positions.size() != a.docs.size() || b.positions.size() != b.docs.size()) {
+        std::cerr << "Position list count mismatch: " << a.positions.size() << " vs "
+                  << b.positions.size() << std::endl;
+        return false;
+    }
+
+    for (size_t i = 0; i < a.docs.size(); ++i) {
+        if (a.docs[i] != b.docs[i]) {
+            std::cerr << "Doc mismatch at " << i << ": " << a.docs[i] << " vs " << b.docs[i]
+                      << std::endl;
+            return false;
+        }
+        if (a.freqs[i] != b.freqs[i]) {
+            std::cerr << "Freq mismatch at doc " << a.docs[i] << ": " << a.freqs[i] << " vs "
+                      << b.freqs[i] << std::endl;
+            return false;
+        }
+        if (a.positions[i].size() != b.positions[i].size()) {
+            std::cerr << "Position count mismatch at doc " << a.docs[i] << std::endl;
+            return false;
+        }
+        for (size_t j = 0; j < a.positions[i].size(); ++j) {
+            if (a.positions[i][j] != b.positions[i][j]) {
+                std::cerr << "Position mismatch at doc " << a.docs[i] << " pos " << j << ": "
+                          << a.positions[i][j] << " vs " << b.positions[i][j] << std::endl;
+                return false;
+            }
+        }
+    }
+    return true;
+}
+
+//=============================================================================
+// Test: Basic readRange correctness for TermDocs
+// For up to 50 terms, the postings read through next() must equal the
+// postings read through readRange().
+//=============================================================================
+void TestReadRangeBasic(CuTest* tc) {
+    std::srand(getDaySeed());
+    std::mt19937 rng(getDaySeed());
+
+    const std::string fieldName = "content";
+    std::vector<std::string> datas;
+    for (int32_t docIdx = 0; docIdx < DEFAULT_DOC_COUNT; ++docIdx) {
+        datas.push_back(generateRandomText(rng, 1, 5));
+    }
+
+    RAMDirectory dir;
+    writeTestIndex(fieldName, &dir, IndexVersion::kV2, datas);
+
+    auto* reader = IndexReader::open(&dir);
+    std::exception_ptr eptr;
+
+    try {
+        constexpr int kMaxTerms = 50; // Test first 50 terms
+        TermEnum* termEnum = reader->terms();
+
+        int checkedTerms = 0;
+        while (termEnum->next() && checkedTerms < kMaxTerms) {
+            Term* term = termEnum->term();
+            if (term == nullptr) {
+                continue;
+            }
+
+            // Reference result: read with next()
+            TermDocs* viaNext = reader->termDocs();
+            viaNext->seek(term);
+            TermDocsResult expected = readWithNext(viaNext);
+            viaNext->close();
+            _CLDELETE(viaNext);
+
+            // Result under test: read with readRange()
+            TermDocs* viaRange = reader->termDocs();
+            viaRange->seek(term);
+            TermDocsResult actual = readWithRange(viaRange);
+            viaRange->close();
+            _CLDELETE(viaRange);
+
+            // Compare results and name the offending term on failure
+            bool match = compareTermDocsResults(expected, actual);
+            if (!match) {
+                char termStr[256];
+                STRCPY_TtoA(termStr, term->text(), 255);
+                std::cerr << "Mismatch for term: " << termStr << std::endl;
+            }
+            assertTrue(match);
+
+            _CLDECDELETE(term);
+            ++checkedTerms;
+        }
+
+        termEnum->close();
+        _CLDELETE(termEnum);
+
+    } catch (...) {
+        eptr = std::current_exception();
+    }
+
+    FINALLY(eptr, {
+        reader->close();
+        _CLLDELETE(reader);
+    })
+
+    std::cout << "\nTestReadRangeBasic success" << std::endl;
+}
+
+//=============================================================================
+// Test: readRange with positions (TermPositions)
+// For up to 20 terms, positions read through next()/nextPosition() must equal
+// positions rebuilt from readRange()/nextDeltaPosition().
+//=============================================================================
+void TestReadRangePositions(CuTest* tc) {
+    std::srand(getDaySeed());
+    std::mt19937 rng(getDaySeed());
+
+    std::string fieldName = "content";
+    std::vector<std::string> datas;
+    for (int32_t i = 0; i < 1000; ++i) {
+        // Multiple tokens per doc
+        datas.push_back(generateRandomText(rng, 2, 8));
+    }
+
+    RAMDirectory dir;
+    writeTestIndex(fieldName, &dir, IndexVersion::kV2, datas);
+
+    auto* reader = IndexReader::open(&dir);
+    std::exception_ptr eptr;
+
+    try {
+        Term* term = nullptr;
+        TermEnum* enumerator = reader->terms();
+
+        int termCount = 0;
+        while (enumerator->next() && termCount < 20) {
+            term = enumerator->term();
+            if (term == nullptr) continue;
+
+            // Read with next()/nextPosition()
+            TermPositions* termPos1 = reader->termPositions();
+            termPos1->seek(term);
+            TermPositionsResult result1 = readPositionsWithNext(termPos1);
+            termPos1->close();
+            _CLDELETE(termPos1);
+
+            // Read with readRange()/nextDeltaPosition()
+            TermPositions* termPos2 = reader->termPositions();
+            termPos2->seek(term);
+            TermPositionsResult result2 = readPositionsWithRange(termPos2);
+            termPos2->close();
+            _CLDELETE(termPos2);
+
+            // Compare results
+            bool match = compareTermPositionsResults(result1, result2);
+            if (!match) {
+                char termStr[256];
+                STRCPY_TtoA(termStr, term->text(), 255);
+                std::cerr << "Position mismatch for term: " << termStr << std::endl;
+            }
+            assertTrue(match);
+
+            _CLDECDELETE(term);
+            termCount++;
+        }
+
+        enumerator->close();
+        _CLDELETE(enumerator);
+
+    } catch (...) {
+        eptr = std::current_exception();
+    }
+
+    FINALLY(eptr, {
+        reader->close();
+        _CLLDELETE(reader);
+    })
+
+    std::cout << "\nTestReadRangePositions success" << std::endl;
+}
+
+//=============================================================================
+// Test: skipToBlock correctness
+// For several targets, every doc produced by skipTo()+next() must also be
+// produced by skipToBlock()+readRange().
+//=============================================================================
+void TestSkipToBlock(CuTest* tc) {
+    std::srand(getDaySeed());
+    std::mt19937 rng(getDaySeed());
+
+    std::string fieldName = "content";
+    std::vector<std::string> datas;
+
+    // Create index with a common term appearing in many docs
+    std::string commonWord = "common";
+    for (int32_t i = 0; i < DEFAULT_DOC_COUNT; ++i) {
+        std::string text = generateRandomText(rng, 1, 3);
+        if (i % 3 == 0) { // Add common word to every 3rd doc
+            text += " " + commonWord;
+        }
+        datas.push_back(text);
+    }
+
+    RAMDirectory dir;
+    writeTestIndex(fieldName, &dir, IndexVersion::kV2, datas);
+
+    auto* reader = IndexReader::open(&dir);
+    std::exception_ptr eptr;
+
+    try {
+        std::wstring ws = StringUtil::string_to_wstring(commonWord);
+        std::wstring fieldNameW = StringUtil::string_to_wstring(fieldName);
+        Term* term = _CLNEW Term(fieldNameW.c_str(), ws.c_str());
+
+        // Targets at the start, middle and near the end of the doc id space
+        std::vector<int32_t> skipToTargets = {0, 100, 500, 1000, 5000, 9000};
+
+        for (int32_t target : skipToTargets) {
+            // Reference: skipTo followed by next()
+            TermDocs* termDocs1 = reader->termDocs();
+            termDocs1->seek(term);
+            std::vector<int32_t> docs1;
+            if (termDocs1->skipTo(target)) {
+                docs1.push_back(termDocs1->doc());
+                while (termDocs1->next()) {
+                    docs1.push_back(termDocs1->doc());
+                }
+            }
+            termDocs1->close();
+            _CLDELETE(termDocs1);
+
+            // Under test: skipToBlock + readRange.
+            // skipToBlock may return some docs before target within the same
+            // block, so those are filtered out here.
+            TermDocs* termDocs2 = reader->termDocs();
+            termDocs2->seek(term);
+            termDocs2->skipToBlock(target);
+            std::vector<int32_t> docs2;
+            DocRange docRange;
+            while (termDocs2->readRange(&docRange)) {
+                for (uint32_t i = 0; i < docRange.doc_many_size_; ++i) {
+                    int32_t doc = (*docRange.doc_many)[i];
+                    if (doc >= target) {
+                        docs2.push_back(doc);
+                    }
+                }
+            }
+            termDocs2->close();
+            _CLDELETE(termDocs2);
+
+            // Compare: docs2 must contain every doc from docs1.
+            bool allFound = true;
+            for (int32_t doc : docs1) {
+                if (std::find(docs2.begin(), docs2.end(), doc) == docs2.end()) {
+                    std::cerr << "Doc " << doc << " not found after skipToBlock(" << target << ")"
+                              << std::endl;
+                    allFound = false;
+                }
+            }
+            assertTrue(allFound);
+        }
+
+        _CLDECDELETE(term);
+
+    } catch (...) {
+        eptr = std::current_exception();
+    }
+
+    FINALLY(eptr, {
+        reader->close();
+        _CLLDELETE(reader);
+    })
+
+    std::cout << "\nTestSkipToBlock success" << std::endl;
+}
+
+//=============================================================================
+// Test: Performance comparison between next() and readRange()
+// Reads every term's postings PERF_ITERATIONS times with each method, checks
+// that both visit the same number of docs, and requires readRange() to take
+// at most twice as long as next().
+//=============================================================================
+void TestReadRangePerformance(CuTest* tc) {
+    std::srand(getDaySeed());
+    std::mt19937 rng(getDaySeed());
+
+    std::string fieldName = "content";
+    std::vector<std::string> datas;
+
+    // Create larger dataset for performance testing
+    int32_t perfDocCount = 50000;
+    for (int32_t i = 0; i < perfDocCount; ++i) {
+        datas.push_back(generateRandomText(rng, 1, 5));
+    }
+
+    RAMDirectory dir;
+    writeTestIndex(fieldName, &dir, IndexVersion::kV2, datas);
+
+    auto* reader = IndexReader::open(&dir);
+    std::exception_ptr eptr;
+
+    try {
+        // Collect all terms
+        std::vector<Term*> terms;
+        TermEnum* enumerator = reader->terms();
+        while (enumerator->next()) {
+            Term* term = enumerator->term();
+            if (term != nullptr) {
+                terms.push_back(term);
+            }
+        }
+        enumerator->close();
+        _CLDELETE(enumerator);
+
+        // Benchmark next() method
+        auto startNext = std::chrono::high_resolution_clock::now();
+        int64_t totalDocsNext = 0;
+        for (int iter = 0; iter < PERF_ITERATIONS; ++iter) {
+            for (Term* term : terms) {
+                TermDocs* termDocs = reader->termDocs();
+                termDocs->seek(term);
+                while (termDocs->next()) {
+                    totalDocsNext++;
+                }
+                termDocs->close();
+                _CLDELETE(termDocs);
+            }
+        }
+        auto endNext = std::chrono::high_resolution_clock::now();
+        auto durationNext =
+                std::chrono::duration_cast<std::chrono::milliseconds>(endNext - startNext);
+
+        // Benchmark readRange() method
+        auto startRange = std::chrono::high_resolution_clock::now();
+        int64_t totalDocsRange = 0;
+        for (int iter = 0; iter < PERF_ITERATIONS; ++iter) {
+            for (Term* term : terms) {
+                TermDocs* termDocs = reader->termDocs();
+                termDocs->seek(term);
+                DocRange docRange;
+                while (termDocs->readRange(&docRange)) {
+                    totalDocsRange += docRange.doc_many_size_;
+                }
+                termDocs->close();
+                _CLDELETE(termDocs);
+            }
+        }
+        auto endRange = std::chrono::high_resolution_clock::now();
+        auto durationRange =
+                std::chrono::duration_cast<std::chrono::milliseconds>(endRange - startRange);
+
+        // Release the collected terms before asserting, so they are freed even
+        // if an assertion below aborts the test.
+        const size_t termTotal = terms.size();
+        for (Term* term : terms) {
+            _CLDECDELETE(term);
+        }
+        terms.clear();
+
+        // Verify same number of docs read
+        assertEquals(totalDocsNext, totalDocsRange);
+
+        std::cout << "\n=== Performance Results ===" << std::endl;
+        std::cout << "Terms: " << termTotal << ", Iterations: " << PERF_ITERATIONS << std::endl;
+        std::cout << "next() method:      " << durationNext.count() << " ms" << std::endl;
+        std::cout << "readRange() method: " << durationRange.count() << " ms" << std::endl;
+        if (durationRange.count() > 0) {
+            std::cout << "Speedup: " << (double)durationNext.count() / durationRange.count() << "x"
+                      << std::endl;
+        } else {
+            // Avoid dividing by a duration that truncated to 0 ms.
+            std::cout << "Speedup: n/a (readRange() took < 1 ms)" << std::endl;
+        }
+
+        // readRange should not be significantly slower.
+        // Allow up to 2x slower as acceptable (it should actually be faster in
+        // most cases). Compare at the clock's native resolution: the values
+        // truncated to milliseconds can fail spuriously on very short runs.
+        assertTrue((endRange - startRange) <= (endNext - startNext) * 2);
+
+    } catch (...) {
+        eptr = std::current_exception();
+    }
+
+    FINALLY(eptr, {
+        reader->close();
+        _CLLDELETE(reader);
+    })
+
+    std::cout << "TestReadRangePerformance success" << std::endl;
+}
+
+//=============================================================================
+// Test: Large document count stress test
+//=============================================================================
+// Stress test over 100000 generated documents: for each word of a five-word
+// vocabulary, the results collected by readWithNext() and readWithRange()
+// must compare equal via compareTermDocsResults().
+void TestReadRangeLargeDataset(CuTest* tc) {
+    std::srand(getDaySeed());
+    std::mt19937 rng(getDaySeed());
+
+    // Small vocabulary so that every term matches many documents.
+    std::vector<std::string> vocabulary = {"alpha", "beta", "gamma",
+                                           "delta", "epsilon"};
+    std::string fieldName = "content";
+    std::vector<std::string> datas;
+
+    const int32_t docTotal = 100000;
+    int32_t docIdx = 0;
+    while (docIdx < docTotal) {
+        std::uniform_int_distribution<size_t> pick(0, vocabulary.size() - 1);
+        std::string text = vocabulary[pick(rng)];
+        // Append one to three further words, cycling with the doc index.
+        for (int remaining = (docIdx % 3) + 1; remaining > 0; --remaining) {
+            text += " " + vocabulary[pick(rng)];
+        }
+        datas.push_back(text);
+        ++docIdx;
+    }
+
+    RAMDirectory dir;
+    writeTestIndex(fieldName, &dir, IndexVersion::kV2, datas);
+
+    std::exception_ptr eptr;
+    auto* reader = IndexReader::open(&dir);
+
+    try {
+        std::wstring fieldNameW = StringUtil::string_to_wstring(fieldName);
+
+        // Check every vocabulary word in declaration order.
+        for (size_t w = 0; w < vocabulary.size(); ++w) {
+            const std::string& word = vocabulary[w];
+            std::wstring ws = StringUtil::string_to_wstring(word);
+            Term* term = _CLNEW Term(fieldNameW.c_str(), ws.c_str());
+
+            // Reference pass through readWithNext().
+            TermDocs* nextDocs = reader->termDocs();
+            nextDocs->seek(term);
+            TermDocsResult expected = readWithNext(nextDocs);
+            nextDocs->close();
+            _CLDELETE(nextDocs);
+
+            // Same term again on a fresh TermDocs, through readWithRange().
+            TermDocs* rangeDocs = reader->termDocs();
+            rangeDocs->seek(term);
+            TermDocsResult actual = readWithRange(rangeDocs);
+            rangeDocs->close();
+            _CLDELETE(rangeDocs);
+
+            // Name the offending term before the assertion fires.
+            bool match = compareTermDocsResults(expected, actual);
+            if (!match) {
+                std::cerr << "Mismatch for term: " << word << std::endl;
+            }
+            assertTrue(match);
+
+            std::cout << "Term '" << word << "': " << expected.docs.size()
+                      << " docs - OK" << std::endl;
+
+            _CLDECDELETE(term);
+        }
+    } catch (...) {
+        eptr = std::current_exception();
+    }
+
+    FINALLY(eptr, {
+        reader->close();
+        _CLLDELETE(reader);
+    })
+
+    std::cout << "\nTestReadRangeLargeDataset success" << std::endl;
+}
+
+//=============================================================================
+// Test: Index version compatibility (V1 and V2)
+//=============================================================================
+// Writes the same 5000 generated documents once per index version (kV1, kV2)
+// and, for up to 30 enumerated terms per version, asserts that
+// readWithNext() and readWithRange() produce results that compare equal.
+//
+// The term enumerator and the term currently held are released in the
+// FINALLY cleanup together with the reader, so they are not leaked when an
+// exception escapes the comparison loop.
+void TestReadRangeVersions(CuTest* tc) {
+    std::srand(getDaySeed());
+    std::mt19937 rng(getDaySeed());
+
+    std::string fieldName = "content";
+    std::vector<std::string> datas;
+    for (int32_t i = 0; i < 5000; ++i) {
+        datas.push_back(generateRandomText(rng, 1, 4));
+    }
+
+    std::vector<IndexVersion> versions = {IndexVersion::kV1,
+                                          IndexVersion::kV2};
+
+    for (IndexVersion version : versions) {
+        RAMDirectory dir;
+        writeTestIndex(fieldName, &dir, version, datas);
+
+        auto* reader = IndexReader::open(&dir);
+        // Declared outside the try block so the cleanup below can reach
+        // them on the failure path as well.
+        TermEnum* enumerator = nullptr;
+        Term* term = nullptr;
+        std::exception_ptr eptr;
+
+        try {
+            enumerator = reader->terms();
+
+            // Limit checked first so the enumerator is not advanced once
+            // more after the last compared term.
+            int termCount = 0;
+            while (termCount < 30 && enumerator->next()) {
+                term = enumerator->term();
+                if (term == nullptr) continue;
+
+                // Read with next()
+                TermDocs* termDocs1 = reader->termDocs();
+                termDocs1->seek(term);
+                TermDocsResult result1 = readWithNext(termDocs1);
+                termDocs1->close();
+                _CLDELETE(termDocs1);
+
+                // Read with readRange()
+                TermDocs* termDocs2 = reader->termDocs();
+                termDocs2->seek(term);
+                TermDocsResult result2 = readWithRange(termDocs2);
+                termDocs2->close();
+                _CLDELETE(termDocs2);
+
+                bool match = compareTermDocsResults(result1, result2);
+                assertTrue(match);
+
+                _CLDECDELETE(term);
+                // Reset explicitly so the cleanup never releases it twice.
+                term = nullptr;
+                termCount++;
+            }
+        } catch (...) {
+            eptr = std::current_exception();
+        }
+
+        FINALLY(eptr, {
+            if (term != nullptr) {
+                _CLDECDELETE(term);
+            }
+            if (enumerator != nullptr) {
+                enumerator->close();
+                _CLDELETE(enumerator);
+            }
+            reader->close();
+            _CLLDELETE(reader);
+        })
+
+        std::cout << "IndexVersion " << static_cast<int>(version)
+                  << " - OK" << std::endl;
+    }
+
+    std::cout << "\nTestReadRangeVersions success" << std::endl;
+}
+
+//=============================================================================
+// Test: Edge cases - empty results, single doc, etc.
+//=============================================================================
+// Edge cases for readRange(), each on its own freshly written kV2 index:
+//   1. a single document containing the queried term,
+//   2. a term that is absent from the index,
+//   3. a term that is present in all 1000 documents.
+//
+// Each case captures exceptions and releases its reader and term in a
+// FINALLY block, matching the cleanup pattern of the other tests in this
+// file.
+void TestReadRangeEdgeCases(CuTest* tc) {
+    std::srand(getDaySeed());
+
+    std::string fieldName = "content";
+    // Converted once; all three cases query the same field.
+    std::wstring fieldNameW = StringUtil::string_to_wstring(fieldName);
+
+    // Test 1: Single document
+    {
+        RAMDirectory dir;
+        std::vector<std::string> datas = {"single"};
+        writeTestIndex(fieldName, &dir, IndexVersion::kV2, datas);
+
+        auto* reader = IndexReader::open(&dir);
+        Term* term = nullptr;
+        std::exception_ptr eptr;
+
+        try {
+            std::wstring ws = StringUtil::string_to_wstring("single");
+            term = _CLNEW Term(fieldNameW.c_str(), ws.c_str());
+
+            TermDocs* termDocs1 = reader->termDocs();
+            termDocs1->seek(term);
+            TermDocsResult result1 = readWithNext(termDocs1);
+            termDocs1->close();
+            _CLDELETE(termDocs1);
+
+            TermDocs* termDocs2 = reader->termDocs();
+            termDocs2->seek(term);
+            TermDocsResult result2 = readWithRange(termDocs2);
+            termDocs2->close();
+            _CLDELETE(termDocs2);
+
+            assertTrue(compareTermDocsResults(result1, result2));
+            assertEquals(result1.docs.size(), 1);
+        } catch (...) {
+            eptr = std::current_exception();
+        }
+
+        FINALLY(eptr, {
+            if (term != nullptr) {
+                _CLDECDELETE(term);
+            }
+            reader->close();
+            _CLLDELETE(reader);
+        })
+
+        std::cout << "Single doc test - OK" << std::endl;
+    }
+
+    // Test 2: Non-existent term
+    {
+        RAMDirectory dir;
+        std::vector<std::string> datas = {"apple", "banana", "cherry"};
+        writeTestIndex(fieldName, &dir, IndexVersion::kV2, datas);
+
+        auto* reader = IndexReader::open(&dir);
+        Term* term = nullptr;
+        std::exception_ptr eptr;
+
+        try {
+            std::wstring ws = StringUtil::string_to_wstring("nonexistent");
+            term = _CLNEW Term(fieldNameW.c_str(), ws.c_str());
+
+            TermDocs* termDocs = reader->termDocs();
+            termDocs->seek(term);
+            DocRange docRange;
+            bool hasData = termDocs->readRange(&docRange);
+            // Non-existent term should return false or empty range
+            assertTrue(!hasData || docRange.doc_many_size_ == 0);
+            termDocs->close();
+            _CLDELETE(termDocs);
+        } catch (...) {
+            eptr = std::current_exception();
+        }
+
+        FINALLY(eptr, {
+            if (term != nullptr) {
+                _CLDECDELETE(term);
+            }
+            reader->close();
+            _CLLDELETE(reader);
+        })
+
+        std::cout << "Non-existent term test - OK" << std::endl;
+    }
+
+    // Test 3: Term appearing in all documents
+    {
+        RAMDirectory dir;
+        std::vector<std::string> datas;
+        for (int i = 0; i < 1000; ++i) {
+            datas.push_back("common");
+        }
+        writeTestIndex(fieldName, &dir, IndexVersion::kV2, datas);
+
+        auto* reader = IndexReader::open(&dir);
+        Term* term = nullptr;
+        std::exception_ptr eptr;
+
+        try {
+            std::wstring ws = StringUtil::string_to_wstring("common");
+            term = _CLNEW Term(fieldNameW.c_str(), ws.c_str());
+
+            TermDocs* termDocs1 = reader->termDocs();
+            termDocs1->seek(term);
+            TermDocsResult result1 = readWithNext(termDocs1);
+            termDocs1->close();
+            _CLDELETE(termDocs1);
+
+            TermDocs* termDocs2 = reader->termDocs();
+            termDocs2->seek(term);
+            TermDocsResult result2 = readWithRange(termDocs2);
+            termDocs2->close();
+            _CLDELETE(termDocs2);
+
+            assertTrue(compareTermDocsResults(result1, result2));
+            assertEquals(result1.docs.size(), 1000);
+        } catch (...) {
+            eptr = std::current_exception();
+        }
+
+        FINALLY(eptr, {
+            if (term != nullptr) {
+                _CLDECDELETE(term);
+            }
+            reader->close();
+            _CLLDELETE(reader);
+        })
+
+        std::cout << "All docs term test - OK" << std::endl;
+    }
+
+    std::cout << "\nTestReadRangeEdgeCases success" << std::endl;
+}
+
+//=============================================================================
+// Suite registration
+//=============================================================================
+// Creates the "CLucene ReadRange Test" suite and registers the seven
+// ReadRange test functions on it, in the order listed below.
+CuSuite* testReadRange() {
+    CuSuite* readRangeSuite = CuSuiteNew(_T("CLucene ReadRange Test"));
+
+    // Functional coverage.
+    SUITE_ADD_TEST(readRangeSuite, TestReadRangeBasic);
+    SUITE_ADD_TEST(readRangeSuite, TestReadRangePositions);
+    SUITE_ADD_TEST(readRangeSuite, TestSkipToBlock);
+
+    // Timing, scale, version and edge-case coverage.
+    SUITE_ADD_TEST(readRangeSuite, TestReadRangePerformance);
+    SUITE_ADD_TEST(readRangeSuite, TestReadRangeLargeDataset);
+    SUITE_ADD_TEST(readRangeSuite, TestReadRangeVersions);
+    SUITE_ADD_TEST(readRangeSuite, TestReadRangeEdgeCases);
+
+    return readRangeSuite;
+}
diff --git a/src/test/test.h b/src/test/test.h
index 762424eef53..81fd7623e00 100644
--- a/src/test/test.h
+++ b/src/test/test.h
@@ -93,6 +93,7 @@ CuSuite *testICU(void);
 CuSuite *testUTF8CharsSuite(void);
 CuSuite *testIndexReader2(void);
 CuSuite *testPFORSuite(void);
+CuSuite *testReadRange(void);
 #ifdef TEST_CONTRIB_LIBS
 //CuSuite *testGermanAnalyzer(void);
 CuSuite *testchinese(void);
diff --git a/src/test/tests.cpp b/src/test/tests.cpp
index 1d4a05a43ce..cf3b53097b0 100644
--- a/src/test/tests.cpp
+++ b/src/test/tests.cpp
@@ -28,4 +28,5 @@ unittest tests[] = {{"analysis", testanalysis},
                     {"chinese", testchinese},
 #endif
                     {"TestIndexReader2", testIndexReader2},
+                    {"testReadRange", testReadRange},
                     {"LastTest", NULL}};


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(doris-thirdparty) branch clucene-3.1 updated: [opt](inverted index) add block-based reading with readRange and skipToBlock (#372)

Reply via email to