This is an automated email from the ASF dual-hosted git repository.
airborne pushed a commit to branch clucene-3.1
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene-3.1 by this push:
new a4796e961e1 [opt](inverted index) add block-based reading with
readRange and skipToBlock (#372)
a4796e961e1 is described below
commit a4796e961e1724f8b6b5145658bb2704981376e5
Author: Jack <[email protected]>
AuthorDate: Tue Jan 6 21:36:47 2026 +0800
[opt](inverted index) add block-based reading with readRange and
skipToBlock (#372)
* [opt](inverted index) add block-based reading with readRange and
skipToBlock (#371)
* fix interface
* fix interface
---------
Co-authored-by: zzzxl <[email protected]>
---
src/core/CLucene/index/DocRange.h | 2 +
src/core/CLucene/index/MultiSegmentReader.cpp | 28 +-
src/core/CLucene/index/SegmentTermDocs.cpp | 45 +-
src/core/CLucene/index/SegmentTermPositions.cpp | 8 +
src/core/CLucene/index/Terms.h | 8 +
src/core/CLucene/index/_MultiSegmentReader.h | 6 +
src/core/CLucene/index/_SegmentHeader.h | 11 +-
src/test/CMakeLists.txt | 1 +
src/test/index/TestReadRange.cpp | 854 ++++++++++++++++++++++++
src/test/test.h | 1 +
src/test/tests.cpp | 1 +
11 files changed, 960 insertions(+), 5 deletions(-)
diff --git a/src/core/CLucene/index/DocRange.h
b/src/core/CLucene/index/DocRange.h
index ef7906a24fb..9412178eb6b 100644
--- a/src/core/CLucene/index/DocRange.h
+++ b/src/core/CLucene/index/DocRange.h
@@ -27,4 +27,6 @@ class DocRange {
std::vector<uint32_t>* freq_many = nullptr;
std::pair<uint32_t, uint32_t> doc_range;
+
+ bool need_positions = false;
};
\ No newline at end of file
diff --git a/src/core/CLucene/index/MultiSegmentReader.cpp
b/src/core/CLucene/index/MultiSegmentReader.cpp
index e5987023c94..6ab57b2c80f 100644
--- a/src/core/CLucene/index/MultiSegmentReader.cpp
+++ b/src/core/CLucene/index/MultiSegmentReader.cpp
@@ -672,11 +672,14 @@ bool MultiTermDocs::readRange(DocRange* docRange) {
if (!current->readRange(docRange)) {
current = nullptr;
} else {
- if (docRange->type_ == DocRangeType::kMany) {
+ // Always update doc_many with base offset (doc_many is
always valid)
+ if (docRange->doc_many && docRange->doc_many_size_ > 0)
{
auto begin = docRange->doc_many->begin();
auto end = docRange->doc_many->begin() +
docRange->doc_many_size_;
std::transform(begin, end, begin,
[this](int32_t val) { return val + base; });
- } else if (docRange->type_ == DocRangeType::kRange) {
+ }
+ // Also update doc_range if kRange type
+ if (docRange->type_ == DocRangeType::kRange) {
docRange->doc_range.first += base;
docRange->doc_range.second += base;
}
@@ -703,6 +706,17 @@ bool MultiTermDocs::skipTo(const int32_t target) {
}
}
+// Coarsely positions this multi-segment enumerator for a target document.
+// Switches to each sub-reader whose starting doc id is <= target, then asks
+// the active sub-reader to skip using the segment-local id (target - base).
+// When no sub-reader is active, nothing is forwarded.
+void MultiTermDocs::skipToBlock(const int32_t target) {
+    for (;;) {
+        const bool exhausted = !(pointer < subReaders->length);
+        if (exhausted || target < starts[pointer]) {
+            break;
+        }
+        base = starts[pointer];
+        current = termDocs(pointer++);
+    }
+
+    if (current == NULL) {
+        return;
+    }
+    current->skipToBlock(target - base);
+}
+
void MultiTermDocs::close() {
//Func - Closes all MultiTermDocs managed by this instance
//Pre - true
@@ -959,6 +973,11 @@ int32_t MultiTermPositions::nextPosition() {
return curAsTP->nextPosition();
}
+// Forwards to the active sub-reader's nextDeltaPosition().
+// Returns 0 when there is no active sub-reader: readRange() in this file
+// resets `current` to nullptr once a segment is exhausted, so `current`
+// must be checked before it is dereferenced (skipToBlock() does the same).
+int32_t MultiTermPositions::nextDeltaPosition() {
+    if (current == nullptr) {
+        return 0;
+    }
+    TermPositions* curAsTP = current->__asTermPositions();
+    if (curAsTP == nullptr) {
+        return 0;
+    }
+    return curAsTP->nextDeltaPosition();
+}
+
int32_t MultiTermPositions::getPayloadLength() const{
TermPositions* curAsTP = current->__asTermPositions();
return curAsTP->getPayloadLength();
@@ -974,4 +993,9 @@ bool MultiTermPositions::isPayloadAvailable() const{
return curAsTP->isPayloadAvailable();
}
+// Forwards `count` to the active sub-reader's addLazySkipProxCount().
+// A no-op when there is no active sub-reader: readRange() in this file
+// resets `current` to nullptr once a segment is exhausted, so `current`
+// must be checked before it is dereferenced (skipToBlock() does the same).
+void MultiTermPositions::addLazySkipProxCount(int32_t count) {
+    if (current == nullptr) {
+        return;
+    }
+    TermPositions* curAsTP = current->__asTermPositions();
+    if (curAsTP != nullptr) {
+        curAsTP->addLazySkipProxCount(count);
+    }
+}
+
CL_NS_END
diff --git a/src/core/CLucene/index/SegmentTermDocs.cpp
b/src/core/CLucene/index/SegmentTermDocs.cpp
index 5f5df366f91..6014099c0d1 100644
--- a/src/core/CLucene/index/SegmentTermDocs.cpp
+++ b/src/core/CLucene/index/SegmentTermDocs.cpp
@@ -146,9 +146,26 @@ bool SegmentTermDocs::readRange(DocRange* docRange) {
}
buffer_.readRange(docRange);
-
count += docRange->doc_many_size_;
+ if (docRange->need_positions && hasProx && docRange->doc_many_size_ > 0 &&
df >= skipInterval) {
+ if (skipListReader == nullptr) {
+ skipListReader = _CLNEW DefaultSkipListReader(freqStream->clone(),
maxSkipLevels, skipInterval);
+ skipListReader->setIoContext(io_ctx_);
+ }
+ if (!haveSkipped) {
+ skipListReader->init(skipPointer, freqBasePointer,
proxBasePointer, df, hasProx, currentFieldStoresPayloads);
+ haveSkipped = true;
+ }
+
+ uint32_t firstDoc = (*docRange->doc_many)[0];
+ int32_t skippedCount = skipListReader->skipTo(firstDoc);
+
+ if (skipListReader->getDoc() >= 0) {
+ skipProx(skipListReader->getProxPointer(),
skipListReader->getPayloadLength());
+ }
+ }
+
if (docRange->doc_many_size_ > 0) {
uint32_t start = (*docRange->doc_many)[0];
uint32_t end = (*docRange->doc_many)[docRange->doc_many_size_ - 1];
@@ -195,6 +212,32 @@ bool SegmentTermDocs::skipTo(const int32_t target) {
return true;
}
+// Uses the skip list to move the freq/prox streams toward `target` without
+// decoding any postings. buffer_ is deliberately not refilled here; the caller
+// is expected to continue with readRange() to read the next block.
+void SegmentTermDocs::skipToBlock(const int32_t target) {
+    // If df < skipInterval, nothing to skip. Caller will use readRange() sequentially.
+    if (!(df >= skipInterval)) {
+        return;
+    }
+
+    // Create the skip list reader on first use.
+    if (skipListReader == NULL) {
+        skipListReader =
+                _CLNEW DefaultSkipListReader(freqStream->clone(), maxSkipLevels, skipInterval);
+        skipListReader->setIoContext(io_ctx_);
+    }
+
+    // Initialise it once for the current term.
+    if (!haveSkipped) {
+        skipListReader->init(skipPointer, freqBasePointer, proxBasePointer, df, hasProx,
+                             currentFieldStoresPayloads);
+        haveSkipped = true;
+    }
+
+    const int32_t skippedCount = skipListReader->skipTo(target);
+    if (!(skippedCount > count)) {
+        // The skip list did not advance past what has already been consumed.
+        return;
+    }
+
+    freqStream->seek(skipListReader->getFreqPointer());
+    skipProx(skipListReader->getProxPointer(), skipListReader->getPayloadLength());
+
+    _doc = skipListReader->getDoc();
+    count = skippedCount;
+}
+
void TermDocsBuffer::refill() {
cur_doc_ = 0;
cur_freq_ = 0;
diff --git a/src/core/CLucene/index/SegmentTermPositions.cpp
b/src/core/CLucene/index/SegmentTermPositions.cpp
index 7ddb1a2ad18..973206fe5f0 100644
--- a/src/core/CLucene/index/SegmentTermPositions.cpp
+++ b/src/core/CLucene/index/SegmentTermPositions.cpp
@@ -69,6 +69,14 @@ int32_t SegmentTermPositions::nextPosition() {
return position += readDeltaPosition();
}
+// Returns the raw delta of the next position without adding it to `position`
+// (nextPosition() is the accumulating variant). Yields 0 when the field has
+// no prox data.
+int32_t SegmentTermPositions::nextDeltaPosition() {
+    if (hasProx) {
+        lazySkip();
+        return readDeltaPosition();
+    }
+    return 0;
+}
+
int32_t SegmentTermPositions::readDeltaPosition() {
int32_t delta = buffer_.getPos();
if (currentFieldStoresPayloads) {
diff --git a/src/core/CLucene/index/Terms.h b/src/core/CLucene/index/Terms.h
index 0af1102874c..7a29c9cc94b 100644
--- a/src/core/CLucene/index/Terms.h
+++ b/src/core/CLucene/index/Terms.h
@@ -74,6 +74,11 @@ public:
// Some implementations are considerably more efficient than that.
virtual bool skipTo(const int32_t target)=0;
+ // Skip to the block containing the target document using skip list.
+ // This is an optimization that positions the stream for subsequent
readRange calls.
+ // Unlike skipTo, this does not scan to find the exact document.
+ virtual void skipToBlock(const int32_t target) {}
+
// Frees associated resources.
virtual void close() = 0;
@@ -195,6 +200,9 @@ public:
*/
virtual TermDocs* __asTermDocs()=0;
virtual TermPositions* __asTermPositions()=0;
+
+ virtual void addLazySkipProxCount(int32_t count) {}
+ virtual int32_t nextDeltaPosition() { return 0; }
};
CL_NS_END
#endif
diff --git a/src/core/CLucene/index/_MultiSegmentReader.h
b/src/core/CLucene/index/_MultiSegmentReader.h
index 830315208c2..569cb8fb8a1 100644
--- a/src/core/CLucene/index/_MultiSegmentReader.h
+++ b/src/core/CLucene/index/_MultiSegmentReader.h
@@ -168,6 +168,9 @@ public:
/* A Possible future optimization could skip entire segments */
bool skipTo(const int32_t target);
+ /** Skip to the block containing target using skip list. */
+ void skipToBlock(const int32_t target) override;
+
void close();
virtual TermPositions* __asTermPositions();
@@ -229,6 +232,7 @@ public:
MultiTermPositions(CL_NS(util)::ArrayBase<IndexReader*>* subReaders, const
int32_t* s);
virtual ~MultiTermPositions() {};
int32_t nextPosition();
+ int32_t nextDeltaPosition();
/**
* Not implemented.
@@ -249,6 +253,8 @@ public:
// TODO: Remove warning after API has been finalized
bool isPayloadAvailable() const;
+ void addLazySkipProxCount(int32_t count) override;
+
virtual TermDocs* __asTermDocs();
virtual TermPositions* __asTermPositions();
};
diff --git a/src/core/CLucene/index/_SegmentHeader.h
b/src/core/CLucene/index/_SegmentHeader.h
index 54e84ad4ffd..0b3cf1c75d7 100644
--- a/src/core/CLucene/index/_SegmentHeader.h
+++ b/src/core/CLucene/index/_SegmentHeader.h
@@ -25,8 +25,7 @@
#include "_CompoundFile.h"
#include "DirectoryIndexReader.h"
#include "_SkipListReader.h"
-#include "CLucene/util/_ThreadLocal.h"
-#include "CLucene/index/IndexVersion.h"
+#include "_TermInfosReader.h"
CL_NS_DEF(index)
class SegmentReader;
@@ -198,6 +197,9 @@ public:
/** Optimized implementation. */
virtual bool skipTo(const int32_t target);
+ /** Skip to the block containing target using skip list. */
+ void skipToBlock(const int32_t target) override;
+
virtual TermPositions* __asTermPositions();
void setIoContext(const void* io_ctx) override;
@@ -241,6 +243,8 @@ public:
void setIoContext(const void* io_ctx) override;
+ void addLazySkipProxCount(int32_t count) override { lazySkipProxCount +=
count; }
+
private:
void seek(const TermInfo* ti, Term* term);
@@ -248,6 +252,8 @@ public:
void close();
int32_t nextPosition();
+ int32_t nextDeltaPosition();
+
private:
int32_t readDeltaPosition();
@@ -296,6 +302,7 @@ private:
int32_t doc() const{ return SegmentTermDocs::doc(); }
int32_t freq() const{ return SegmentTermDocs::freq(); }
bool skipTo(const int32_t target){ return SegmentTermDocs::skipTo(target); }
+ void skipToBlock(const int32_t target) {
SegmentTermDocs::skipToBlock(target); }
private:
IndexVersion indexVersion_ = IndexVersion::kV0;
diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt
index 083edeeab8c..4aa23796d31 100644
--- a/src/test/CMakeLists.txt
+++ b/src/test/CMakeLists.txt
@@ -112,6 +112,7 @@ SET(test_files ./tests.cpp
./query/TestMultiPhraseQuery.cpp
./store/TestUTF8Chars.cpp
./store/testPFOR.cpp
+ ./index/TestReadRange.cpp
${test_HEADERS})
IF (USE_SHARED_OBJECT_FILES)
GET_SHARED_FILES(clucene_shared_Files)
diff --git a/src/test/index/TestReadRange.cpp b/src/test/index/TestReadRange.cpp
new file mode 100644
index 00000000000..23d448c3fa6
--- /dev/null
+++ b/src/test/index/TestReadRange.cpp
@@ -0,0 +1,854 @@
+/*------------------------------------------------------------------------------
+ * Test for readRange and skipToBlock interfaces
+ * Compares block-based reading with traditional next() approach
+ * Verifies correctness and performance
+
*------------------------------------------------------------------------------*/
+#include <CLucene.h>
+#include <CLucene/index/DocRange.h>
+#include <CLucene/index/IndexReader.h>
+#include <CLucene/util/stringUtil.h>
+
+#include <algorithm>
+#include <chrono>
+#include <ctime>
+#include <random>
+#include <string>
+#include <vector>
+
+#include "CLucene/analysis/Analyzers.h"
+#include "CLucene/index/IndexVersion.h"
+#include "CLucene/index/Term.h"
+#include "CLucene/store/RAMDirectory.h"
+#include "test.h"
+
+CL_NS_USE(search)
+CL_NS_USE(store)
+CL_NS_USE(index)
+CL_NS_USE(util)
+
+// Test configuration
+static constexpr int32_t DEFAULT_DOC_COUNT = 10000;
+static constexpr int32_t PERF_ITERATIONS = 10;
+
+#define FINALLY(eptr, finallyBlock) \
+ { \
+ finallyBlock; \
+ if (eptr) { \
+ std::rethrow_exception(eptr); \
+ } \
+ }
+
+// Returns a seed that changes once per local calendar day: the time_t value of
+// today's local midnight divided by the number of seconds in a day. The tests
+// in this file feed it to std::srand / std::mt19937.
+static int32_t getDaySeed() {
+    constexpr std::time_t kSecondsPerDay = 60 * 60 * 24;
+
+    const std::time_t now = std::time(nullptr);
+    const std::tm* localNow = std::localtime(&now);
+    if (localNow == nullptr) {
+        // localtime() reports failure with a null pointer; fall back to the
+        // day count of the raw timestamp instead of dereferencing it.
+        return static_cast<int32_t>(now / kSecondsPerDay);
+    }
+
+    // Work on a copy: localtime() returns a pointer to shared static storage.
+    std::tm midnight = *localNow;
+    midnight.tm_sec = 0;
+    midnight.tm_min = 0;
+    midnight.tm_hour = 0;
+    return static_cast<int32_t>(std::mktime(&midnight) / kSecondsPerDay);
+}
+
+// Builds a space-separated string of randomly chosen fruit names.
+// The token count is drawn uniformly from [minTokens, maxTokens] first, then
+// one word is drawn per token, all from `rng`.
+static std::string generateRandomText(std::mt19937& rng, int minTokens, int maxTokens) {
+    static const std::vector<std::string> words = {
+            "apple",    "banana",    "cherry",     "date",      "elderberry",
+            "fig",      "grape",     "honeydew",   "kiwi",      "lemon",
+            "mango",    "nectarine", "orange",     "papaya",    "quince",
+            "raspberry", "strawberry", "tangerine", "watermelon", "blueberry"};
+
+    std::uniform_int_distribution<int> tokenDist(minTokens, maxTokens);
+    std::uniform_int_distribution<size_t> wordDist(0, words.size() - 1);
+
+    std::string text;
+    for (int remaining = tokenDist(rng); remaining > 0; --remaining) {
+        if (!text.empty()) {
+            text += " ";
+        }
+        text += words[wordDist(rng)];
+    }
+    return text;
+}
+
+// Writes a new index into `dir` containing one document per entry of `datas`.
+// Every document has a single field named `fieldName` that is tokenized by a
+// SimpleAnalyzer, not stored, keeps term freqs and positions, and is tagged
+// with `indexVersion`.
+static void writeTestIndex(const std::string& fieldName, RAMDirectory* dir,
+                           IndexVersion indexVersion, const std::vector<std::string>& datas) {
+    auto* simpleAnalyzer = _CLNEW lucene::analysis::SimpleAnalyzer<char>;
+    simpleAnalyzer->set_stopwords(nullptr);
+
+    auto* writer = _CLNEW lucene::index::IndexWriter(dir, simpleAnalyzer, true);
+    writer->setRAMBufferSizeMB(512);
+    writer->setMaxBufferedDocs(-1);
+    writer->setMaxFieldLength(0x7FFFFFFFL);
+    writer->setMergeFactor(1000000000);
+    writer->setUseCompoundFile(false);
+
+    auto* textReader = _CLNEW lucene::util::SStringReader<char>;
+
+    // One Document/Field pair is reused for every entry; only the field's
+    // token stream is replaced before each addDocument() call.
+    auto* document = _CLNEW lucene::document::Document();
+    int32_t fieldFlags = lucene::document::Field::STORE_NO;
+    fieldFlags |= lucene::document::Field::INDEX_TOKENIZED;
+    const std::wstring wideFieldName(fieldName.begin(), fieldName.end());
+    auto* field = _CLNEW lucene::document::Field(wideFieldName.c_str(), fieldFlags);
+    field->setOmitTermFreqAndPositions(false);
+    field->setIndexVersion(indexVersion);
+    document->add(*field);
+
+    for (size_t i = 0; i < datas.size(); ++i) {
+        const std::string& text = datas[i];
+        textReader->init(text.data(), text.size(), false);
+        auto* stream = simpleAnalyzer->reusableTokenStream(field->name(), textReader);
+        field->setValue(stream);
+        writer->addDocument(document);
+    }
+
+    writer->close();
+
+    _CLLDELETE(writer);
+    _CLLDELETE(document);
+    _CLLDELETE(simpleAnalyzer);
+    _CLLDELETE(textReader);
+}
+
+// Postings collected for one term so that two read paths can be compared.
+struct TermDocsResult { // filled by readWithNext() / readWithRange()
+ std::vector<int32_t> docs; // document ids in the order they were read
+ std::vector<int32_t> freqs; // freqs[i] is the frequency recorded for docs[i]
+};
+
+struct TermPositionsResult { // postings plus positions collected for one term
+ std::vector<int32_t> docs; // document ids in the order they were read
+ std::vector<int32_t> freqs; // freqs[i] is the frequency recorded for docs[i]
+ std::vector<std::vector<int32_t>> positions; // positions per doc: positions[i] belongs to docs[i]
+};
+
+// Drains `termDocs` one posting at a time via next(), recording doc() and
+// freq() for each posting.
+static TermDocsResult readWithNext(TermDocs* termDocs) {
+    TermDocsResult collected;
+    for (bool more = termDocs->next(); more; more = termDocs->next()) {
+        collected.docs.push_back(termDocs->doc());
+        collected.freqs.push_back(termDocs->freq());
+    }
+    return collected;
+}
+
+// Drains `termDocs` block by block via readRange(). Each block contributes its
+// first doc_many_size_ doc ids; a frequency of 1 is recorded wherever the block
+// carries no freq entry for that slot.
+static TermDocsResult readWithRange(TermDocs* termDocs) {
+    TermDocsResult collected;
+    DocRange block;
+    while (termDocs->readRange(&block)) {
+        for (uint32_t slot = 0; slot < block.doc_many_size_; ++slot) {
+            collected.docs.push_back((*block.doc_many)[slot]);
+
+            const bool hasFreq = block.freq_many && slot < block.freq_many_size_;
+            if (!hasFreq) {
+                collected.freqs.push_back(1);
+                continue;
+            }
+            collected.freqs.push_back((*block.freq_many)[slot]);
+        }
+    }
+    return collected;
+}
+
+// Drains `termPos` one posting at a time via next(), and for each posting
+// reads freq() absolute positions through nextPosition().
+static TermPositionsResult readPositionsWithNext(TermPositions* termPos) {
+    TermPositionsResult collected;
+    while (termPos->next()) {
+        collected.docs.push_back(termPos->doc());
+        const int32_t occurrences = termPos->freq();
+        collected.freqs.push_back(occurrences);
+
+        std::vector<int32_t> docPositions;
+        int32_t read = 0;
+        while (read < occurrences) {
+            docPositions.push_back(termPos->nextPosition());
+            ++read;
+        }
+        collected.positions.push_back(std::move(docPositions));
+    }
+    return collected;
+}
+
+// Drains `termPos` block by block via readRange() with need_positions set.
+// For every doc in a block, freq deltas are pulled with nextDeltaPosition()
+// and summed from 0 to rebuild that doc's absolute positions. A frequency of 1
+// is used wherever the block carries no freq entry for that slot.
+static TermPositionsResult readPositionsWithRange(TermPositions* termPos) {
+    TermPositionsResult collected;
+    DocRange block;
+    block.need_positions = true;
+
+    while (termPos->readRange(&block)) {
+        for (uint32_t slot = 0; slot < block.doc_many_size_; ++slot) {
+            collected.docs.push_back((*block.doc_many)[slot]);
+
+            int32_t occurrences = 1;
+            if (block.freq_many && slot < block.freq_many_size_) {
+                occurrences = (*block.freq_many)[slot];
+            }
+            collected.freqs.push_back(occurrences);
+
+            // Positions restart from 0 for every document.
+            std::vector<int32_t> docPositions;
+            int32_t running = 0;
+            for (int32_t k = 0; k < occurrences; ++k) {
+                running += termPos->nextDeltaPosition();
+                docPositions.push_back(running);
+            }
+            collected.positions.push_back(std::move(docPositions));
+        }
+    }
+    return collected;
+}
+
+// Compares two TermDocsResult values element by element.
+// Returns true when both hold the same doc ids and frequencies in the same
+// order; otherwise writes the first difference to std::cerr and returns false.
+// `checkNorms` is accepted for interface compatibility and is not consulted.
+static bool compareTermDocsResults(const TermDocsResult& a, const TermDocsResult& b,
+                                   bool checkNorms = false) {
+    (void)checkNorms;
+
+    if (a.docs.size() != b.docs.size()) {
+        std::cerr << "Doc count mismatch: " << a.docs.size() << " vs " << b.docs.size()
+                  << std::endl;
+        return false;
+    }
+
+    // freqs is indexed with doc indices below, so it must be as long as docs
+    // on both sides; otherwise the loop would read out of bounds.
+    if (a.freqs.size() != a.docs.size() || b.freqs.size() != b.docs.size()) {
+        std::cerr << "Freq count mismatch: " << a.freqs.size() << " vs " << b.freqs.size()
+                  << std::endl;
+        return false;
+    }
+
+    for (size_t i = 0; i < a.docs.size(); ++i) {
+        if (a.docs[i] != b.docs[i]) {
+            std::cerr << "Doc mismatch at " << i << ": " << a.docs[i] << " vs " << b.docs[i]
+                      << std::endl;
+            return false;
+        }
+        if (a.freqs[i] != b.freqs[i]) {
+            std::cerr << "Freq mismatch at doc " << a.docs[i] << ": " << a.freqs[i] << " vs "
+                      << b.freqs[i] << std::endl;
+            return false;
+        }
+    }
+    return true;
+}
+
+// Compares two TermPositionsResult values element by element.
+// Returns true when both hold the same doc ids, frequencies and per-doc
+// positions in the same order; otherwise writes the first difference to
+// std::cerr and returns false.
+static bool compareTermPositionsResults(const TermPositionsResult& a,
+                                        const TermPositionsResult& b) {
+    if (a.docs.size() != b.docs.size()) {
+        std::cerr << "Doc count mismatch: " << a.docs.size() << " vs " << b.docs.size()
+                  << std::endl;
+        return false;
+    }
+
+    // freqs and positions are indexed with doc indices below, so each must be
+    // as long as docs on both sides; otherwise the loop would read out of bounds.
+    if (a.freqs.size() != a.docs.size() || b.freqs.size() != b.docs.size()) {
+        std::cerr << "Freq count mismatch: " << a.freqs.size() << " vs " << b.freqs.size()
+                  << std::endl;
+        return false;
+    }
+    if (a.positions.size() != a.docs.size() || b.positions.size() != b.docs.size()) {
+        std::cerr << "Position list count mismatch: " << a.positions.size() << " vs "
+                  << b.positions.size() << std::endl;
+        return false;
+    }
+
+    for (size_t i = 0; i < a.docs.size(); ++i) {
+        if (a.docs[i] != b.docs[i]) {
+            std::cerr << "Doc mismatch at " << i << ": " << a.docs[i] << " vs " << b.docs[i]
+                      << std::endl;
+            return false;
+        }
+        if (a.freqs[i] != b.freqs[i]) {
+            std::cerr << "Freq mismatch at doc " << a.docs[i] << ": " << a.freqs[i] << " vs "
+                      << b.freqs[i] << std::endl;
+            return false;
+        }
+        if (a.positions[i].size() != b.positions[i].size()) {
+            std::cerr << "Position count mismatch at doc " << a.docs[i] << std::endl;
+            return false;
+        }
+        for (size_t j = 0; j < a.positions[i].size(); ++j) {
+            if (a.positions[i][j] != b.positions[i][j]) {
+                std::cerr << "Position mismatch at doc " << a.docs[i] << " pos " << j << ": "
+                          << a.positions[i][j] << " vs " << b.positions[i][j] << std::endl;
+                return false;
+            }
+        }
+    }
+    return true;
+}
+
+//=============================================================================
+// Test: Basic readRange correctness for TermDocs
+//=============================================================================
+// Indexes DEFAULT_DOC_COUNT random documents (kV2) and, for the first 50 terms
+// of the term enumeration, checks that readRange() yields the same docs and
+// freqs as next().
+void TestReadRangeBasic(CuTest* tc) {
+    std::srand(getDaySeed());
+    std::mt19937 rng(getDaySeed());
+
+    const std::string fieldName = "content";
+    std::vector<std::string> datas;
+    for (int32_t n = 0; n < DEFAULT_DOC_COUNT; ++n) {
+        datas.push_back(generateRandomText(rng, 1, 5));
+    }
+
+    RAMDirectory dir;
+    writeTestIndex(fieldName, &dir, IndexVersion::kV2, datas);
+
+    auto* reader = IndexReader::open(&dir);
+    std::exception_ptr eptr;
+
+    try {
+        TermEnum* termEnum = reader->terms();
+
+        int checkedTerms = 0;
+        for (;;) {
+            // next() is evaluated before the cap, as in `next() && count < 50`.
+            if (!termEnum->next() || !(checkedTerms < 50)) {
+                break;
+            }
+            Term* term = termEnum->term();
+            if (term == nullptr) {
+                continue;
+            }
+
+            // Reference path: one posting at a time.
+            TermDocs* viaNext = reader->termDocs();
+            viaNext->seek(term);
+            const TermDocsResult expected = readWithNext(viaNext);
+            viaNext->close();
+            _CLDELETE(viaNext);
+
+            // Path under test: block reads.
+            TermDocs* viaRange = reader->termDocs();
+            viaRange->seek(term);
+            const TermDocsResult actual = readWithRange(viaRange);
+            viaRange->close();
+            _CLDELETE(viaRange);
+
+            const bool match = compareTermDocsResults(expected, actual);
+            if (!match) {
+                char termStr[256];
+                STRCPY_TtoA(termStr, term->text(), 255);
+                std::cerr << "Mismatch for term: " << termStr << std::endl;
+            }
+            assertTrue(match);
+
+            _CLDECDELETE(term);
+            ++checkedTerms;
+        }
+
+        termEnum->close();
+        _CLDELETE(termEnum);
+
+    } catch (...) {
+        eptr = std::current_exception();
+    }
+
+    FINALLY(eptr, {
+        reader->close();
+        _CLLDELETE(reader);
+    })
+
+    std::cout << "\nTestReadRangeBasic success" << std::endl;
+}
+
+//=============================================================================
+// Test: readRange with positions (TermPositions)
+//=============================================================================
+// Indexes 1000 random multi-token documents (kV2) and, for the first 20 terms
+// of the term enumeration, checks that readRange() + nextDeltaPosition() yields
+// the same docs, freqs and positions as next() + nextPosition().
+void TestReadRangePositions(CuTest* tc) {
+    std::srand(getDaySeed());
+    std::mt19937 rng(getDaySeed());
+
+    const std::string fieldName = "content";
+    std::vector<std::string> datas;
+    for (int32_t n = 0; n < 1000; ++n) {
+        // Multiple tokens per doc
+        datas.push_back(generateRandomText(rng, 2, 8));
+    }
+
+    RAMDirectory dir;
+    writeTestIndex(fieldName, &dir, IndexVersion::kV2, datas);
+
+    auto* reader = IndexReader::open(&dir);
+    std::exception_ptr eptr;
+
+    try {
+        TermEnum* termEnum = reader->terms();
+
+        int checkedTerms = 0;
+        for (;;) {
+            // next() is evaluated before the cap, as in `next() && count < 20`.
+            if (!termEnum->next() || !(checkedTerms < 20)) {
+                break;
+            }
+            Term* term = termEnum->term();
+            if (term == nullptr) {
+                continue;
+            }
+
+            // Reference path: next()/nextPosition().
+            TermPositions* viaNext = reader->termPositions();
+            viaNext->seek(term);
+            const TermPositionsResult expected = readPositionsWithNext(viaNext);
+            viaNext->close();
+            _CLDELETE(viaNext);
+
+            // Path under test: readRange()/nextDeltaPosition().
+            TermPositions* viaRange = reader->termPositions();
+            viaRange->seek(term);
+            const TermPositionsResult actual = readPositionsWithRange(viaRange);
+            viaRange->close();
+            _CLDELETE(viaRange);
+
+            const bool match = compareTermPositionsResults(expected, actual);
+            if (!match) {
+                char termStr[256];
+                STRCPY_TtoA(termStr, term->text(), 255);
+                std::cerr << "Position mismatch for term: " << termStr << std::endl;
+            }
+            assertTrue(match);
+
+            _CLDECDELETE(term);
+            ++checkedTerms;
+        }
+
+        termEnum->close();
+        _CLDELETE(termEnum);
+
+    } catch (...) {
+        eptr = std::current_exception();
+    }
+
+    FINALLY(eptr, {
+        reader->close();
+        _CLLDELETE(reader);
+    })
+
+    std::cout << "\nTestReadRangePositions success" << std::endl;
+}
+
+//=============================================================================
+// Test: skipToBlock correctness
+//=============================================================================
+// Indexes DEFAULT_DOC_COUNT documents (kV2) where every third document also
+// contains the word "common", then for several targets checks that every doc
+// reached by skipTo(target) + next() is also delivered by
+// skipToBlock(target) + readRange().
+void TestSkipToBlock(CuTest* tc) {
+    std::srand(getDaySeed());
+    std::mt19937 rng(getDaySeed());
+
+    std::string fieldName = "content";
+    std::vector<std::string> datas;
+
+    // Create index with a common term appearing in many docs
+    std::string commonWord = "common";
+    for (int32_t i = 0; i < DEFAULT_DOC_COUNT; ++i) {
+        std::string text = generateRandomText(rng, 1, 3);
+        if (i % 3 == 0) { // Add common word to every 3rd doc
+            text += " " + commonWord;
+        }
+        datas.push_back(text);
+    }
+
+    RAMDirectory dir;
+    writeTestIndex(fieldName, &dir, IndexVersion::kV2, datas);
+
+    auto* reader = IndexReader::open(&dir);
+    std::exception_ptr eptr;
+
+    try {
+        std::wstring ws = StringUtil::string_to_wstring(commonWord);
+        std::wstring fieldNameW = StringUtil::string_to_wstring(fieldName);
+        Term* term = _CLNEW Term(fieldNameW.c_str(), ws.c_str());
+
+        std::vector<int32_t> skipToTargets = {0, 100, 500, 1000, 5000, 9000};
+
+        for (int32_t target : skipToTargets) {
+            // Reference path: skipTo() followed by next().
+            TermDocs* termDocs1 = reader->termDocs();
+            termDocs1->seek(term);
+            std::vector<int32_t> docs1;
+            if (termDocs1->skipTo(target)) {
+                docs1.push_back(termDocs1->doc());
+                while (termDocs1->next()) {
+                    docs1.push_back(termDocs1->doc());
+                }
+            }
+            termDocs1->close();
+            _CLDELETE(termDocs1);
+
+            // Path under test: skipToBlock() followed by readRange().
+            // readRange() may deliver docs that precede the target; those are
+            // dropped here so only docs at or beyond the target are kept.
+            TermDocs* termDocs2 = reader->termDocs();
+            termDocs2->seek(term);
+            termDocs2->skipToBlock(target);
+            std::vector<int32_t> docs2;
+            DocRange docRange;
+            while (termDocs2->readRange(&docRange)) {
+                for (uint32_t i = 0; i < docRange.doc_many_size_; ++i) {
+                    int32_t doc = (*docRange.doc_many)[i];
+                    if (doc >= target) {
+                        docs2.push_back(doc);
+                    }
+                }
+            }
+            termDocs2->close();
+            _CLDELETE(termDocs2);
+
+            // Every doc found by skipTo()/next() must also appear in docs2.
+            // Look each one up in a sorted copy instead of scanning docs2
+            // linearly per doc.
+            std::vector<int32_t> sortedDocs2(docs2);
+            std::sort(sortedDocs2.begin(), sortedDocs2.end());
+
+            bool allFound = true;
+            for (int32_t doc : docs1) {
+                if (!std::binary_search(sortedDocs2.begin(), sortedDocs2.end(), doc)) {
+                    std::cerr << "Doc " << doc << " not found after skipToBlock(" << target << ")"
+                              << std::endl;
+                    allFound = false;
+                }
+            }
+            assertTrue(allFound);
+        }
+
+        _CLDECDELETE(term);
+
+    } catch (...) {
+        eptr = std::current_exception();
+    }
+
+    FINALLY(eptr, {
+        reader->close();
+        _CLLDELETE(reader);
+    })
+
+    std::cout << "\nTestSkipToBlock success" << std::endl;
+}
+
+//=============================================================================
+// Test: Performance comparison between next() and readRange()
+//=============================================================================
+// Indexes 50000 random documents (kV2), then times PERF_ITERATIONS full passes
+// over every term using next() and using readRange(). Asserts that both paths
+// visit the same number of docs and that readRange() takes at most twice as
+// long as next().
+//
+// Elapsed time is kept as fractional milliseconds on a steady clock. Truncating
+// to whole milliseconds would distort the 2x comparison for short runs and
+// could make the speedup a division by zero.
+void TestReadRangePerformance(CuTest* tc) {
+    std::srand(getDaySeed());
+    std::mt19937 rng(getDaySeed());
+
+    std::string fieldName = "content";
+    std::vector<std::string> datas;
+
+    // Create larger dataset for performance testing
+    int32_t perfDocCount = 50000;
+    for (int32_t i = 0; i < perfDocCount; ++i) {
+        datas.push_back(generateRandomText(rng, 1, 5));
+    }
+
+    RAMDirectory dir;
+    writeTestIndex(fieldName, &dir, IndexVersion::kV2, datas);
+
+    auto* reader = IndexReader::open(&dir);
+    std::exception_ptr eptr;
+
+    try {
+        using Clock = std::chrono::steady_clock;
+        using Millis = std::chrono::duration<double, std::milli>;
+
+        // Collect all terms
+        std::vector<Term*> terms;
+        TermEnum* enumerator = reader->terms();
+        while (enumerator->next()) {
+            Term* term = enumerator->term();
+            if (term != nullptr) {
+                terms.push_back(term);
+            }
+        }
+        enumerator->close();
+        _CLDELETE(enumerator);
+
+        // Benchmark next() method
+        const auto startNext = Clock::now();
+        int64_t totalDocsNext = 0;
+        for (int iter = 0; iter < PERF_ITERATIONS; ++iter) {
+            for (Term* term : terms) {
+                TermDocs* termDocs = reader->termDocs();
+                termDocs->seek(term);
+                while (termDocs->next()) {
+                    totalDocsNext++;
+                }
+                termDocs->close();
+                _CLDELETE(termDocs);
+            }
+        }
+        const Millis durationNext = Clock::now() - startNext;
+
+        // Benchmark readRange() method
+        const auto startRange = Clock::now();
+        int64_t totalDocsRange = 0;
+        for (int iter = 0; iter < PERF_ITERATIONS; ++iter) {
+            for (Term* term : terms) {
+                TermDocs* termDocs = reader->termDocs();
+                termDocs->seek(term);
+                DocRange docRange;
+                while (termDocs->readRange(&docRange)) {
+                    totalDocsRange += docRange.doc_many_size_;
+                }
+                termDocs->close();
+                _CLDELETE(termDocs);
+            }
+        }
+        const Millis durationRange = Clock::now() - startRange;
+
+        // Verify same number of docs read
+        assertEquals(totalDocsNext, totalDocsRange);
+
+        std::cout << "\n=== Performance Results ===" << std::endl;
+        std::cout << "Terms: " << terms.size() << ", Iterations: " << PERF_ITERATIONS << std::endl;
+        std::cout << "next() method: " << durationNext.count() << " ms" << std::endl;
+        std::cout << "readRange() method: " << durationRange.count() << " ms" << std::endl;
+        std::cout << "Speedup: ";
+        if (durationRange.count() > 0.0) {
+            std::cout << durationNext.count() / durationRange.count() << "x";
+        } else {
+            // Nothing measurable elapsed; a ratio would be meaningless.
+            std::cout << "n/a";
+        }
+        std::cout << std::endl;
+
+        // readRange should not be significantly slower
+        // Allow up to 2x slower as acceptable (it should actually be faster in most cases)
+        assertTrue(durationRange.count() <= durationNext.count() * 2);
+
+        // Cleanup terms
+        for (Term* term : terms) {
+            _CLDECDELETE(term);
+        }
+
+    } catch (...) {
+        eptr = std::current_exception();
+    }
+
+    FINALLY(eptr, {
+        reader->close();
+        _CLLDELETE(reader);
+    })
+
+    std::cout << "TestReadRangePerformance success" << std::endl;
+}
+
+//=============================================================================
+// Test: Large document count stress test
+//=============================================================================
+// Indexes 100000 documents (kV2) built from a five-word vocabulary, then for
+// each vocabulary word checks that readRange() yields the same docs and freqs
+// as next().
+void TestReadRangeLargeDataset(CuTest* tc) {
+    std::srand(getDaySeed());
+    std::mt19937 rng(getDaySeed());
+
+    const std::string fieldName = "content";
+    std::vector<std::string> datas;
+
+    // Use a small vocabulary to ensure high term frequency
+    const std::vector<std::string> vocabulary = {"alpha", "beta", "gamma", "delta", "epsilon"};
+
+    const int32_t largeDocCount = 100000;
+    for (int32_t n = 0; n < largeDocCount; ++n) {
+        std::uniform_int_distribution<size_t> pick(0, vocabulary.size() - 1);
+        std::string text = vocabulary[pick(rng)];
+        // Add 1-3 more words
+        for (int extra = (n % 3) + 1; extra > 0; --extra) {
+            text += " " + vocabulary[pick(rng)];
+        }
+        datas.push_back(text);
+    }
+
+    RAMDirectory dir;
+    writeTestIndex(fieldName, &dir, IndexVersion::kV2, datas);
+
+    auto* reader = IndexReader::open(&dir);
+    std::exception_ptr eptr;
+
+    try {
+        const std::wstring fieldNameW = StringUtil::string_to_wstring(fieldName);
+
+        // Test each vocabulary term
+        for (size_t v = 0; v < vocabulary.size(); ++v) {
+            const std::string& word = vocabulary[v];
+            const std::wstring ws = StringUtil::string_to_wstring(word);
+            Term* term = _CLNEW Term(fieldNameW.c_str(), ws.c_str());
+
+            // Reference path: one posting at a time.
+            TermDocs* viaNext = reader->termDocs();
+            viaNext->seek(term);
+            const TermDocsResult result1 = readWithNext(viaNext);
+            viaNext->close();
+            _CLDELETE(viaNext);
+
+            // Path under test: block reads.
+            TermDocs* viaRange = reader->termDocs();
+            viaRange->seek(term);
+            const TermDocsResult result2 = readWithRange(viaRange);
+            viaRange->close();
+            _CLDELETE(viaRange);
+
+            const bool match = compareTermDocsResults(result1, result2);
+            if (!match) {
+                std::cerr << "Mismatch for term: " << word << std::endl;
+            }
+            assertTrue(match);
+
+            std::cout << "Term '" << word << "': " << result1.docs.size() << " docs - OK"
+                      << std::endl;
+
+            _CLDECDELETE(term);
+        }
+
+    } catch (...) {
+        eptr = std::current_exception();
+    }
+
+    FINALLY(eptr, {
+        reader->close();
+        _CLLDELETE(reader);
+    })
+
+    std::cout << "\nTestReadRangeLargeDataset success" << std::endl;
+}
+
+//=============================================================================
+// Test: Index version compatibility (V1 and V2)
+//=============================================================================
+// Writes the same 5000 random documents once with IndexVersion::kV1 and once
+// with IndexVersion::kV2; for each index and the first 30 terms of its term
+// enumeration, checks that readRange() yields the same docs and freqs as next().
+void TestReadRangeVersions(CuTest* tc) {
+    std::srand(getDaySeed());
+    std::mt19937 rng(getDaySeed());
+
+    const std::string fieldName = "content";
+    std::vector<std::string> datas;
+    for (int32_t n = 0; n < 5000; ++n) {
+        datas.push_back(generateRandomText(rng, 1, 4));
+    }
+
+    const std::vector<IndexVersion> versions = {IndexVersion::kV1, IndexVersion::kV2};
+
+    for (size_t v = 0; v < versions.size(); ++v) {
+        const IndexVersion version = versions[v];
+
+        RAMDirectory dir;
+        writeTestIndex(fieldName, &dir, version, datas);
+
+        auto* reader = IndexReader::open(&dir);
+        std::exception_ptr eptr;
+
+        try {
+            TermEnum* termEnum = reader->terms();
+
+            int checkedTerms = 0;
+            for (;;) {
+                // next() is evaluated before the cap, as in `next() && count < 30`.
+                if (!termEnum->next() || !(checkedTerms < 30)) {
+                    break;
+                }
+                Term* term = termEnum->term();
+                if (term == nullptr) {
+                    continue;
+                }
+
+                // Reference path: one posting at a time.
+                TermDocs* viaNext = reader->termDocs();
+                viaNext->seek(term);
+                const TermDocsResult expected = readWithNext(viaNext);
+                viaNext->close();
+                _CLDELETE(viaNext);
+
+                // Path under test: block reads.
+                TermDocs* viaRange = reader->termDocs();
+                viaRange->seek(term);
+                const TermDocsResult actual = readWithRange(viaRange);
+                viaRange->close();
+                _CLDELETE(viaRange);
+
+                const bool match = compareTermDocsResults(expected, actual);
+                assertTrue(match);
+
+                _CLDECDELETE(term);
+                ++checkedTerms;
+            }
+
+            termEnum->close();
+            _CLDELETE(termEnum);
+
+        } catch (...) {
+            eptr = std::current_exception();
+        }
+
+        FINALLY(eptr, {
+            reader->close();
+            _CLLDELETE(reader);
+        })
+
+        std::cout << "IndexVersion " << static_cast<int>(version) << " - OK" << std::endl;
+    }
+
+    std::cout << "\nTestReadRangeVersions success" << std::endl;
+}
+
+//=============================================================================
+// Test: Edge cases - empty results, single doc, etc.
+//=============================================================================
+// Exercises readRange() on three small kV2 indexes: a single document, a term
+// that is not in the index, and a term present in all 1000 documents.
+void TestReadRangeEdgeCases(CuTest* tc) {
+    std::srand(getDaySeed());
+
+    const std::string fieldName = "content";
+
+    // Test 1: Single document
+    {
+        RAMDirectory dir;
+        const std::vector<std::string> datas = {"single"};
+        writeTestIndex(fieldName, &dir, IndexVersion::kV2, datas);
+
+        auto* reader = IndexReader::open(&dir);
+        const std::wstring fieldNameW = StringUtil::string_to_wstring(fieldName);
+        const std::wstring ws = StringUtil::string_to_wstring("single");
+        Term* term = _CLNEW Term(fieldNameW.c_str(), ws.c_str());
+
+        TermDocs* viaNext = reader->termDocs();
+        viaNext->seek(term);
+        const TermDocsResult result1 = readWithNext(viaNext);
+        viaNext->close();
+        _CLDELETE(viaNext);
+
+        TermDocs* viaRange = reader->termDocs();
+        viaRange->seek(term);
+        const TermDocsResult result2 = readWithRange(viaRange);
+        viaRange->close();
+        _CLDELETE(viaRange);
+
+        assertTrue(compareTermDocsResults(result1, result2));
+        assertEquals(result1.docs.size(), 1);
+
+        _CLDECDELETE(term);
+        reader->close();
+        _CLLDELETE(reader);
+
+        std::cout << "Single doc test - OK" << std::endl;
+    }
+
+    // Test 2: Non-existent term
+    {
+        RAMDirectory dir;
+        const std::vector<std::string> datas = {"apple", "banana", "cherry"};
+        writeTestIndex(fieldName, &dir, IndexVersion::kV2, datas);
+
+        auto* reader = IndexReader::open(&dir);
+        const std::wstring fieldNameW = StringUtil::string_to_wstring(fieldName);
+        const std::wstring ws = StringUtil::string_to_wstring("nonexistent");
+        Term* term = _CLNEW Term(fieldNameW.c_str(), ws.c_str());
+
+        TermDocs* termDocs = reader->termDocs();
+        termDocs->seek(term);
+        DocRange docRange;
+        const bool hasData = termDocs->readRange(&docRange);
+        // Non-existent term should return false or empty range
+        assertTrue(!hasData || docRange.doc_many_size_ == 0);
+        termDocs->close();
+        _CLDELETE(termDocs);
+
+        _CLDECDELETE(term);
+        reader->close();
+        _CLLDELETE(reader);
+
+        std::cout << "Non-existent term test - OK" << std::endl;
+    }
+
+    // Test 3: Term appearing in all documents
+    {
+        RAMDirectory dir;
+        std::vector<std::string> datas;
+        for (int n = 0; n < 1000; ++n) {
+            datas.push_back("common");
+        }
+        writeTestIndex(fieldName, &dir, IndexVersion::kV2, datas);
+
+        auto* reader = IndexReader::open(&dir);
+        const std::wstring fieldNameW = StringUtil::string_to_wstring(fieldName);
+        const std::wstring ws = StringUtil::string_to_wstring("common");
+        Term* term = _CLNEW Term(fieldNameW.c_str(), ws.c_str());
+
+        TermDocs* viaNext = reader->termDocs();
+        viaNext->seek(term);
+        const TermDocsResult result1 = readWithNext(viaNext);
+        viaNext->close();
+        _CLDELETE(viaNext);
+
+        TermDocs* viaRange = reader->termDocs();
+        viaRange->seek(term);
+        const TermDocsResult result2 = readWithRange(viaRange);
+        viaRange->close();
+        _CLDELETE(viaRange);
+
+        assertTrue(compareTermDocsResults(result1, result2));
+        assertEquals(result1.docs.size(), 1000);
+
+        _CLDECDELETE(term);
+        reader->close();
+        _CLLDELETE(reader);
+
+        std::cout << "All docs term test - OK" << std::endl;
+    }
+
+    std::cout << "\nTestReadRangeEdgeCases success" << std::endl;
+}
+
+//=============================================================================
+// Suite registration
+//=============================================================================
+// Builds the "CLucene ReadRange Test" suite and registers every readRange /
+// skipToBlock test defined in this file, in execution order.
+CuSuite* testReadRange() {
+    CuSuite* readRangeSuite = CuSuiteNew(_T("CLucene ReadRange Test"));
+
+    // Correctness of block reads against the next()-based reference.
+    SUITE_ADD_TEST(readRangeSuite, TestReadRangeBasic);
+    SUITE_ADD_TEST(readRangeSuite, TestReadRangePositions);
+    SUITE_ADD_TEST(readRangeSuite, TestSkipToBlock);
+
+    // Timing and volume.
+    SUITE_ADD_TEST(readRangeSuite, TestReadRangePerformance);
+    SUITE_ADD_TEST(readRangeSuite, TestReadRangeLargeDataset);
+
+    // Index versions and boundary inputs.
+    SUITE_ADD_TEST(readRangeSuite, TestReadRangeVersions);
+    SUITE_ADD_TEST(readRangeSuite, TestReadRangeEdgeCases);
+
+    return readRangeSuite;
+}
diff --git a/src/test/test.h b/src/test/test.h
index 762424eef53..81fd7623e00 100644
--- a/src/test/test.h
+++ b/src/test/test.h
@@ -93,6 +93,7 @@ CuSuite *testICU(void);
CuSuite *testUTF8CharsSuite(void);
CuSuite *testIndexReader2(void);
CuSuite *testPFORSuite(void);
+CuSuite *testReadRange(void);
#ifdef TEST_CONTRIB_LIBS
//CuSuite *testGermanAnalyzer(void);
CuSuite *testchinese(void);
diff --git a/src/test/tests.cpp b/src/test/tests.cpp
index 1d4a05a43ce..cf3b53097b0 100644
--- a/src/test/tests.cpp
+++ b/src/test/tests.cpp
@@ -28,4 +28,5 @@ unittest tests[] = {{"analysis", testanalysis},
{"chinese", testchinese},
#endif
{"TestIndexReader2", testIndexReader2},
+ {"testReadRange", testReadRange},
{"LastTest", NULL}};
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]