This is an automated email from the ASF dual-hosted git repository.
airborne pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene by this push:
new eff3191d680 [fix](inverted index) fix BM25 scoring anomaly in term frequency calculation (#344)
eff3191d680 is described below
commit eff3191d6805644d0438d6fe2eecd1b26e8ee740
Author: zzzxl <[email protected]>
AuthorDate: Wed Jul 30 19:12:44 2025 +0800
[fix](inverted index) fix BM25 scoring anomaly in term frequency calculation (#344)
---
src/core/CLucene/index/IndexReader.cpp | 8 ++--
src/core/CLucene/index/IndexReader.h | 4 +-
src/core/CLucene/index/IndexWriter.cpp | 37 +++++++++++------
src/core/CLucene/index/IndexWriter.h | 6 ++-
src/core/CLucene/index/MultiReader.cpp | 6 ++-
src/core/CLucene/index/MultiReader.h | 4 +-
src/core/CLucene/index/MultiSegmentReader.cpp | 24 ++++++-----
src/core/CLucene/index/MultipleTermPositions.cpp | 4 +-
src/core/CLucene/index/MultipleTermPositions.h | 4 +-
src/core/CLucene/index/SDocumentWriter.cpp | 21 +++++++---
src/core/CLucene/index/SDocumentWriter.h | 3 ++
src/core/CLucene/index/SegmentReader.cpp | 32 +++++----------
src/core/CLucene/index/SegmentTermDocs.cpp | 20 ++++++----
src/core/CLucene/index/SegmentTermPositions.cpp | 8 +++-
src/core/CLucene/index/Terms.h | 6 ++-
src/core/CLucene/index/_MultiSegmentReader.h | 13 +++---
src/core/CLucene/index/_SegmentHeader.h | 22 +++++-----
src/core/CLucene/search/Similarity.cpp | 51 ++++++++++++++++++++++++
src/core/CLucene/search/Similarity.h | 2 +
src/test/search/TestSearch.cpp | 2 +-
20 files changed, 186 insertions(+), 91 deletions(-)
diff --git a/src/core/CLucene/index/IndexReader.cpp b/src/core/CLucene/index/IndexReader.cpp
index 77df957f75e..9fccfdbda31 100644
--- a/src/core/CLucene/index/IndexReader.cpp
+++ b/src/core/CLucene/index/IndexReader.cpp
@@ -278,9 +278,9 @@ CL_NS_DEF(index)
ensureOpen();
//Reference an instantiated TermDocs instance
- TermDocs* _termDocs = termDocs(io_ctx);
+ TermDocs* _termDocs = termDocs(load_stats, io_ctx);
//Seek all documents containing term
- _termDocs->seek(term, load_stats);
+ _termDocs->seek(term);
//return the enumaration
return _termDocs;
}
@@ -304,9 +304,9 @@ CL_NS_DEF(index)
ensureOpen();
//Reference an instantiated termPositions instance
- TermPositions* _termPositions = termPositions(io_ctx);
+ TermPositions* _termPositions = termPositions(load_stats, io_ctx);
//Seek all documents containing term
- _termPositions->seek(term, load_stats);
+ _termPositions->seek(term);
//return the enumeration
return _termPositions;
}
diff --git a/src/core/CLucene/index/IndexReader.h b/src/core/CLucene/index/IndexReader.h
index da73d64dcab..a7f2920a804 100644
--- a/src/core/CLucene/index/IndexReader.h
+++ b/src/core/CLucene/index/IndexReader.h
@@ -575,7 +575,7 @@ public:
* @throws IOException if there is a low-level IO error
* @memory Caller must clean up
*/
- virtual TermPositions* termPositions(const void* io_ctx = nullptr) = 0;
+ virtual TermPositions* termPositions(bool load_stats = false, const
void* io_ctx = nullptr) = 0;
/** Returns an enumeration of all the documents which contain
* <code>term</code>. For each document, in addition to the document
number
@@ -601,7 +601,7 @@ public:
* @throws IOException if there is a low-level IO error
* @memory Caller must clean up
*/
- virtual TermDocs* termDocs(const void* io_ctx = nullptr) = 0;
+ virtual TermDocs* termDocs(bool load_stats = false, const void* io_ctx
= nullptr) = 0;
/** Returns an enumeration of all the documents which contain
* <code>term</code>. For each document, the document number, the
frequency of
diff --git a/src/core/CLucene/index/IndexWriter.cpp b/src/core/CLucene/index/IndexWriter.cpp
index 7ec6f56817d..332e0a4e786 100644
--- a/src/core/CLucene/index/IndexWriter.cpp
+++ b/src/core/CLucene/index/IndexWriter.cpp
@@ -1282,8 +1282,9 @@ void IndexWriter::indexCompaction(std::vector<lucene::store::Directory *> &src_d
std::vector<lucene::store::IndexOutput *> normsOutputList;
// first level vector index is src_index_id
- // <TCHAR, ValueArray<uint8_t>> key is field name, value is the norm of
src_doc_id
- std::vector<map<TCHAR, std::vector<uint8_t>>>
srcFieldNormsMapValues(numIndices);
+ // <std::wstring, ValueArray<uint8_t>> key is field name, value is the
norm of src_doc_id
+ std::vector<std::map<std::wstring, std::vector<uint8_t>>>
srcFieldNormsMapValues(numIndices);
+ std::map<std::wstring, uint64_t> srcFieldTotalTermCountMap;
try {
// check hasProx, indexVersion
@@ -1336,14 +1337,17 @@ void IndexWriter::indexCompaction(std::vector<lucene::store::Directory *> &src_d
if (fi->isIndexed && !fi->omitNorms) {
CL_NS(util)::ValueArray<uint8_t> normBuffer;
size_t maxDoc = reader->maxDoc();
- if ( normBuffer.length < maxDoc){
+ if (normBuffer.length < maxDoc) {
normBuffer.resize(maxDoc);
memset(normBuffer.values, 0, sizeof(uint8_t) *
maxDoc);
}
reader->norms(fi->name, normBuffer.values);
for (int j = 0; j < normBuffer.length; j++) {
- srcFieldNormsMapValues[srcIndex][*fi->name].emplace_back(normBuffer.values[j]);
+ srcFieldNormsMapValues[srcIndex][fi->name].emplace_back(
+ normBuffer.values[j]);
}
+ srcFieldTotalTermCountMap[fi->name] +=
+ reader->sumTotalTermFreq(fi->name).value_or(0);
}
}
}
@@ -1406,7 +1410,7 @@ void IndexWriter::indexCompaction(std::vector<lucene::store::Directory *> &src_d
/// merge norms if have
if (hasNorms){
- mergeNorms(dest_index_docs, srcFieldNormsMapValues,
normsOutputList);
+ mergeNorms(dest_index_docs, srcFieldTotalTermCountMap,
srcFieldNormsMapValues, normsOutputList);
}
/// merge null_bitmap
@@ -1918,18 +1922,19 @@ void IndexWriter::mergeTerms(bool hasProx, IndexVersion indexVersion) {
}
void IndexWriter::mergeNorms(std::vector<uint32_t> dest_index_docs,
- std::vector<std::map<TCHAR,
std::vector<uint8_t>>> srcFieldNormsMapValues,
- std::vector<lucene::store::IndexOutput *>
normsOutputList) {
+ std::map<std::wstring, uint64_t>
srcFieldTotalTermCountMap,
+ std::vector<std::map<std::wstring,
std::vector<uint8_t>>> srcFieldNormsMapValues,
+ std::vector<lucene::store::IndexOutput *>
normsOutputList) {
//Func - Merges the norms for all fields
//Pre - fieldInfos != NULL
//Post - The norms for all fields have been merged
CND_PRECONDITION(fieldInfos != NULL, "fieldInfos is NULL");
- std::vector<std::map<TCHAR, std::vector<uint8_t>>>
destFieldNormsMapValues(numDestIndexes);
+ std::vector<std::map<std::wstring, std::vector<uint8_t>>>
destFieldNormsMapValues(numDestIndexes);
// iterate srcFieldNormsValues to construct destFieldNormsMapValues
for (size_t srcIndex = 0; srcIndex < srcFieldNormsMapValues.size();
++srcIndex) {
- std::map<TCHAR, std::vector<uint8_t>> &srcFieldNormsMap =
srcFieldNormsMapValues[srcIndex];
+ std::map<std::wstring, std::vector<uint8_t>> &srcFieldNormsMap =
srcFieldNormsMapValues[srcIndex];
if (srcFieldNormsMap.empty()) {
// empty indicates there is no nrm file in this index
continue;
@@ -1937,7 +1942,7 @@ void IndexWriter::mergeNorms(std::vector<uint32_t> dest_index_docs,
// find field has norms
for (int j =0; j < fieldInfos->size(); j++) {
FieldInfo* fi = fieldInfos->fieldInfo(j);
- TCHAR fieldName = *fi->name;
+ std::wstring fieldName = fi->name;
// Is this Field indexed and field need norms ?
if (fi->isIndexed && !fi->omitNorms) {
auto& srcFieldNorms = srcFieldNormsMap[fieldName];
@@ -1965,15 +1970,21 @@ void IndexWriter::mergeNorms(std::vector<uint32_t> dest_index_docs,
// construct nrm and write nrm to dest index
for (size_t i = 0; i < destFieldNormsMapValues.size(); ++i) {
auto& destFieldNormsMap = destFieldNormsMapValues[i];
- for (int j =0; j < fieldInfos->size(); j++) {
+ for (int j = 0; j < fieldInfos->size(); j++) {
FieldInfo* fi = fieldInfos->fieldInfo(j);
- TCHAR fieldName = *fi->name;
+ std::wstring fieldName = fi->name;
auto destDocCount = dest_index_docs[i];
if (fi->isIndexed && !fi->omitNorms) {
// if not find then norm is zero
if (destFieldNormsMap.find(fieldName) ==
destFieldNormsMap.end()) {
destFieldNormsMap[fieldName].resize(destDocCount);
- std::fill(destFieldNormsMap[fieldName].begin(),destFieldNormsMap[fieldName].end(), 0);
+ std::fill(destFieldNormsMap[fieldName].begin(),
+ destFieldNormsMap[fieldName].end(), 0);
+ }
+ if (i == 0) {
+ normsOutputList[i]->writeLong(srcFieldTotalTermCountMap[fieldName]);
+ } else {
+ normsOutputList[i]->writeLong(0);
}
auto& destFieldNorms = destFieldNormsMap[fieldName];
normsOutputList[i]->writeBytes(destFieldNorms.data(),
destDocCount);
diff --git a/src/core/CLucene/index/IndexWriter.h b/src/core/CLucene/index/IndexWriter.h
index 33df65cad46..e8c5fbd0ec3 100644
--- a/src/core/CLucene/index/IndexWriter.h
+++ b/src/core/CLucene/index/IndexWriter.h
@@ -330,7 +330,11 @@ public:
// merge terms and write files
void mergeTerms(bool hasProx, IndexVersion indexVersion);
// merge norms and write files
- void mergeNorms(std::vector<uint32_t> dest_index_docs,
std::vector<std::map<TCHAR, std::vector<uint8_t>>> srcFieldNormsMapValues,
std::vector<lucene::store::IndexOutput *> normsOutputList);
+ void mergeNorms(
+ std::vector<uint32_t> dest_index_docs,
+ std::map<std::wstring, uint64_t> srcFieldTotalTermCountMap,
+ std::vector<std::map<std::wstring, std::vector<uint8_t>>>
srcFieldNormsMapValues,
+ std::vector<lucene::store::IndexOutput*> normsOutputList);
// merge null_bitmap
void mergeNullBitmap(std::vector<std::vector<uint32_t>> srcBitmapValues,
std::vector<lucene::store::IndexOutput *> nullBitmapIndexOutputList);
diff --git a/src/core/CLucene/index/MultiReader.cpp b/src/core/CLucene/index/MultiReader.cpp
index 3ac68b4af8c..2f8d771cbce 100644
--- a/src/core/CLucene/index/MultiReader.cpp
+++ b/src/core/CLucene/index/MultiReader.cpp
@@ -307,16 +307,18 @@ std::optional<uint64_t> MultiReader::sumTotalTermFreq(const TCHAR* field) {
return std::nullopt;
}
-TermDocs* MultiReader::termDocs(const void* io_ctx) {
+TermDocs* MultiReader::termDocs(bool load_stats, const void* io_ctx) {
ensureOpen();
TermDocs* ret = _CLNEW MultiTermDocs(subReaders, starts);
+ ret->setLoadStats(load_stats);
ret->setIoContext(io_ctx);
return ret;
}
-TermPositions* MultiReader::termPositions(const void* io_ctx) {
+TermPositions* MultiReader::termPositions(bool load_stats, const void* io_ctx)
{
ensureOpen();
TermPositions* ret = (TermPositions*)_CLNEW
MultiTermPositions(subReaders, starts);
+ ret->setLoadStats(load_stats);
ret->setIoContext(io_ctx);
return ret;
}
diff --git a/src/core/CLucene/index/MultiReader.h b/src/core/CLucene/index/MultiReader.h
index 362415501c2..1e83bc6c54a 100644
--- a/src/core/CLucene/index/MultiReader.h
+++ b/src/core/CLucene/index/MultiReader.h
@@ -107,8 +107,8 @@ public:
// Returns the total norm of all terms appeared in all documents in this
field
std::optional<uint64_t> sumTotalTermFreq(const TCHAR* field);
- TermDocs* termDocs(const void* io_ctx = nullptr);
- TermPositions* termPositions(const void* io_ctx = nullptr);
+ TermDocs* termDocs(bool load_stats = false, const void* io_ctx =
nullptr);
+ TermPositions* termPositions(bool load_stats = false, const void*
io_ctx = nullptr);
/**
* @see IndexReader#getFieldNames(IndexReader.FieldOption fldOption)
diff --git a/src/core/CLucene/index/MultiSegmentReader.cpp b/src/core/CLucene/index/MultiSegmentReader.cpp
index 910f7c45c67..cedee01b765 100644
--- a/src/core/CLucene/index/MultiSegmentReader.cpp
+++ b/src/core/CLucene/index/MultiSegmentReader.cpp
@@ -391,16 +391,18 @@ std::optional<uint64_t> MultiSegmentReader::sumTotalTermFreq(const TCHAR* field)
return std::nullopt;
}
-TermDocs* MultiSegmentReader::termDocs(const void* io_ctx) {
+TermDocs* MultiSegmentReader::termDocs(bool load_stats, const void* io_ctx) {
ensureOpen();
TermDocs* ret = _CLNEW MultiTermDocs(subReaders, starts);
+ ret->setLoadStats(load_stats);
ret->setIoContext(io_ctx);
return ret;
}
-TermPositions* MultiSegmentReader::termPositions(const void* io_ctx) {
+TermPositions* MultiSegmentReader::termPositions(bool load_stats, const void*
io_ctx) {
ensureOpen();
TermPositions* ret = static_cast<TermPositions*>(_CLNEW
MultiTermPositions(subReaders, starts));
+ ret->setLoadStats(load_stats);
ret->setIoContext(io_ctx);
return ret;
}
@@ -597,6 +599,10 @@ int32_t MultiTermDocs::docFreq() {
return docFreq;
}
+void MultiTermDocs::setLoadStats(bool load_stats) {
+ load_stats_ = load_stats;
+}
+
void MultiTermDocs::setIoContext(const void* io_ctx) {
io_ctx_ = io_ctx;
}
@@ -623,11 +629,11 @@ int32_t MultiTermDocs::norm() const {
return current->norm();
}
-void MultiTermDocs::seek(TermEnum* termEnum, bool load_stats){
- seek(termEnum->term(false), load_stats);
+void MultiTermDocs::seek(TermEnum* termEnum){
+ seek(termEnum->term(false));
}
-void MultiTermDocs::seek( Term* tterm, bool load_stats) {
+void MultiTermDocs::seek( Term* tterm) {
//Func - Resets the instance for a new search
//Pre - tterm != NULL
//Post - The instance has been reset for a new search
@@ -797,10 +803,10 @@ void MultiTermDocs::close() {
}
TermDocs* MultiTermDocs::termDocs(IndexReader* reader) {
- return reader->termDocs(io_ctx_);
+ return reader->termDocs(load_stats_, io_ctx_);
}
-TermDocs* MultiTermDocs::termDocs(const int32_t i, bool local_stats) {
+TermDocs* MultiTermDocs::termDocs(const int32_t i) {
if (term == NULL)
return NULL;
TermDocs* result = (*readerTermDocs)[i];
@@ -809,7 +815,7 @@ TermDocs* MultiTermDocs::termDocs(const int32_t i, bool local_stats) {
readerTermDocs->values[i] = termDocs((*subReaders)[i]);
result = (*readerTermDocs)[i];
}
- result->seek(term, local_stats);
+ result->seek(term);
return result;
}
@@ -993,7 +999,7 @@ TermDocs* MultiTermPositions::termDocs(IndexReader* reader) {
// rather merely producing a SegmentTermDocs via the reader's termDocs
// method.
- TermPositions* tp = reader->termPositions(io_ctx_);
+ TermPositions* tp = reader->termPositions(load_stats_, io_ctx_);
TermDocs* ret = tp->__asTermDocs();
CND_CONDITION(ret != NULL,
diff --git a/src/core/CLucene/index/MultipleTermPositions.cpp b/src/core/CLucene/index/MultipleTermPositions.cpp
index b5846516f76..4a4bb0563e6 100644
--- a/src/core/CLucene/index/MultipleTermPositions.cpp
+++ b/src/core/CLucene/index/MultipleTermPositions.cpp
@@ -14,11 +14,11 @@ CL_NS_USE(util)
CL_NS_DEF(index)
-void MultipleTermPositions::seek(Term*, bool) {
+void MultipleTermPositions::seek(Term*) {
_CLTHROWA(CL_ERR_UnsupportedOperation, "Unsupported operation:
MultipleTermPositions::seek");
}
-void MultipleTermPositions::seek(TermEnum*, bool) {
+void MultipleTermPositions::seek(TermEnum*) {
_CLTHROWA(CL_ERR_UnsupportedOperation, "Unsupported operation:
MultipleTermPositions::seek");
}
diff --git a/src/core/CLucene/index/MultipleTermPositions.h b/src/core/CLucene/index/MultipleTermPositions.h
index 8ef7be1ac56..d50a4faaeee 100644
--- a/src/core/CLucene/index/MultipleTermPositions.h
+++ b/src/core/CLucene/index/MultipleTermPositions.h
@@ -53,13 +53,13 @@ public:
* Not implemented.
* @throws UnsupportedOperationException
*/
- void seek(Term*, bool);
+ void seek(Term*);
/**
* Not implemented.
* @throws UnsupportedOperationException
*/
- void seek(TermEnum*, bool);
+ void seek(TermEnum*);
/**
* Not implemented.
diff --git a/src/core/CLucene/index/SDocumentWriter.cpp b/src/core/CLucene/index/SDocumentWriter.cpp
index b41290741be..c272a575228 100644
--- a/src/core/CLucene/index/SDocumentWriter.cpp
+++ b/src/core/CLucene/index/SDocumentWriter.cpp
@@ -31,7 +31,7 @@ CL_NS_USE(document)
CL_NS_DEF(index)
template<typename T>
-const uint8_t SDocumentsWriter<T>::defaultNorm =
search::Similarity::encodeNorm(1.0f);
+const uint8_t SDocumentsWriter<T>::defaultNorm =
search::Similarity::encodeNorm(1);
template<typename T>
const int32_t SDocumentsWriter<T>::BYTE_BLOCK_SHIFT = 15;
@@ -298,8 +298,7 @@ void SDocumentsWriter<T>::ThreadState::writeDocument() {
assert(bn != nullptr);
assert(bn->upto <= docID);
bn->fill(docID);
- float_t norm = fp->boost *
_parent->writer->getSimilarity()->lengthNorm(fp->fieldInfo->name, fp->length);
- bn->add(norm);
+ bn->add(fp->length);
}
}
} catch (CLuceneError &t) {
@@ -982,9 +981,11 @@ void SDocumentsWriter<T>::writeNorms(const std::string &segmentName, int32_t tot
if (fi->isIndexed && !fi->omitNorms) {
BufferedNorms *n = norms[fieldIdx];
int64_t v;
- if (n == nullptr)
+ if (n == nullptr) {
+ normsOut->writeLong(0);
v = 0;
- else {
+ } else {
+ normsOut->writeLong(n->total_term_count_);
v = n->out.getFilePointer();
n->out.writeTo(normsOut);
n->reset();
@@ -1429,10 +1430,20 @@ void SDocumentsWriter<T>::BufferedNorms::add(float_t norm) {
out.writeByte(b);
upto++;
}
+
+template<typename T>
+void SDocumentsWriter<T>::BufferedNorms::add(int32_t norm) {
+ total_term_count_ += norm;
+ uint8_t b = search::Similarity::encodeNorm(norm);
+ out.writeByte(b);
+ upto++;
+}
+
template<typename T>
void SDocumentsWriter<T>::BufferedNorms::reset() {
out.reset();
upto = 0;
+ total_term_count_ = 0;
}
template<typename T>
void SDocumentsWriter<T>::BufferedNorms::fill(int32_t docID) {
diff --git a/src/core/CLucene/index/SDocumentWriter.h b/src/core/CLucene/index/SDocumentWriter.h
index a166703f1d0..5fc8247b7c4 100644
--- a/src/core/CLucene/index/SDocumentWriter.h
+++ b/src/core/CLucene/index/SDocumentWriter.h
@@ -88,8 +88,11 @@ public:
BufferedNorms();
void add(float_t norm);
+ void add(int32_t norm);
void reset();
void fill(int32_t docID);
+
+ int64_t total_term_count_ = 0;
};
template<typename T2>
class BlockPool {
diff --git a/src/core/CLucene/index/SegmentReader.cpp b/src/core/CLucene/index/SegmentReader.cpp
index 374322761ed..beb357fc3d5 100644
--- a/src/core/CLucene/index/SegmentReader.cpp
+++ b/src/core/CLucene/index/SegmentReader.cpp
@@ -507,24 +507,26 @@ bool SegmentReader::isDeleted(const int32_t n) {
return ret;
}
-TermDocs *SegmentReader::termDocs(const void* io_ctx) {
+TermDocs *SegmentReader::termDocs(bool load_stats, const void* io_ctx) {
//Func - Returns an unpositioned TermDocs enumerator.
//Pre - true
//Post - An unpositioned TermDocs enumerator has been returned
ensureOpen();
auto* ret = _CLNEW SegmentTermDocs(this);
+ ret->setLoadStats(load_stats);
ret->setIoContext(io_ctx);
return ret;
}
-TermPositions *SegmentReader::termPositions(const void* io_ctx) {
+TermPositions *SegmentReader::termPositions(bool load_stats, const void*
io_ctx) {
//Func - Returns an unpositioned TermPositions enumerator.
//Pre - true
//Post - An unpositioned TermPositions enumerator has been returned
ensureOpen();
auto* ret = _CLNEW SegmentTermPositions(this);
+ ret->setLoadStats(load_stats);
ret->setIoContext(io_ctx);
return ret;
}
@@ -902,28 +904,14 @@ void SegmentReader::openNorms(Directory *cfsDir, int32_t readBufferSize) {
normInput = d->openInput(fileName.c_str());
}
- _norms[fi->name] = _CLNEW Norm(normInput, singleNormFile,
fi->number, normSeek, this, segment.c_str());
-
- // read total norm info into cache
- std::vector<uint8_t> bytes(_maxDoc);
- IndexInput *normStream;
- if (_norms[fi->name]->useSingleNormStream) {
- normStream = singleNormStream;
- } else {
- normStream = _norms[fi->name]->in;
- }
+ normInput->seek(normSeek);
+ auto total_term_count = normInput->readLong();
+ sum_total_term_freq[*fi->name] = total_term_count;
+ normSeek += sizeof(int64_t);
- ensureOpen();
- SCOPED_LOCK_MUTEX(_norms[fi->name]->THIS_LOCK);
- normStream->seek(_norms[fi->name]->normSeek);
- normStream->readBytes(bytes.data(), _maxDoc);
- uint64_t sum = 0;
- for (int doc = 0; doc < _maxDoc; doc++) {
- sum += Similarity::decodeNorm(bytes[doc]);
- }
- sum_total_term_freq[*fi->name] = sum;
+ _norms[fi->name] = _CLNEW Norm(normInput, singleNormFile,
fi->number, normSeek, this, segment.c_str());
- nextNormSeek += _maxDoc;// increment also if some norms are
separate
+ nextNormSeek += (_maxDoc + sizeof(int64_t));// increment also if
some norms are separate
}
}
}
diff --git a/src/core/CLucene/index/SegmentTermDocs.cpp b/src/core/CLucene/index/SegmentTermDocs.cpp
index 35ffc8fd615..bd58b1b2cff 100644
--- a/src/core/CLucene/index/SegmentTermDocs.cpp
+++ b/src/core/CLucene/index/SegmentTermDocs.cpp
@@ -37,6 +37,10 @@ TermPositions *SegmentTermDocs::__asTermPositions() {
return NULL;
}
+void SegmentTermDocs::setLoadStats(bool load_stats) {
+ load_stats_ = load_stats;
+}
+
void SegmentTermDocs::setIoContext(const void* io_ctx) {
if (freqStream) {
freqStream->setIoContext(io_ctx);
@@ -58,13 +62,13 @@ int32_t SegmentTermDocs::docNorm() {
return 0;
}
-void SegmentTermDocs::seek(Term *term, bool load_stats) {
+void SegmentTermDocs::seek(Term *term) {
TermInfo *ti = parent->tis->get(term, io_ctx_);
- seek(ti, term, load_stats);
+ seek(ti, term);
_CLDELETE(ti);
}
-void SegmentTermDocs::seek(TermEnum *termEnum, bool load_stats) {
+void SegmentTermDocs::seek(TermEnum *termEnum) {
TermInfo *ti = NULL;
Term *term = NULL;
@@ -79,15 +83,15 @@ void SegmentTermDocs::seek(TermEnum *termEnum, bool load_stats) {
ti = parent->tis->get(term);
}
- seek(ti, term, load_stats);
+ seek(ti, term);
_CLDELETE(ti);
}
-void SegmentTermDocs::seek(const TermInfo *ti, Term *term, bool load_stats) {
+void SegmentTermDocs::seek(const TermInfo *ti, Term *term) {
count = 0;
FieldInfo *fi = parent->_fieldInfos->fieldInfo(term->field());
currentFieldStoresPayloads = (fi != NULL) ? fi->storePayloads : false;
- buffer_.needLoadStats(load_stats);
- if (load_stats && fi != NULL && fi->isIndexed && !fi->omitNorms) {
+ buffer_.needLoadStats(load_stats_);
+ if (load_stats_ && fi != NULL && fi->isIndexed && !fi->omitNorms) {
const TCHAR *curField = fi->name;
norms = parent->norms(curField);
buffer_.setAllDocNorms(norms);
@@ -356,7 +360,7 @@ void TermDocsBuffer::refillNorm(int32_t size) {
auto doc = docs_[i];
// avoid doc norms not set
if (doc < maxDoc && all_doc_norms_) {
- norms_[i] = search::Similarity::decodeNorm(all_doc_norms_[doc]);
+ norms_[i] = all_doc_norms_[doc];
} else {
norms_[i] = 0;
}
diff --git a/src/core/CLucene/index/SegmentTermPositions.cpp b/src/core/CLucene/index/SegmentTermPositions.cpp
index 7a17496b71f..edd20a0f22d 100644
--- a/src/core/CLucene/index/SegmentTermPositions.cpp
+++ b/src/core/CLucene/index/SegmentTermPositions.cpp
@@ -27,6 +27,10 @@ SegmentTermPositions::~SegmentTermPositions() {
close();
}
+void SegmentTermPositions::setLoadStats(bool load_stats) {
+ SegmentTermDocs::setLoadStats(load_stats);
+}
+
void SegmentTermPositions::setIoContext(const void* io_ctx) {
SegmentTermDocs::setIoContext(io_ctx);
}
@@ -38,8 +42,8 @@ TermPositions* SegmentTermPositions::__asTermPositions(){
return (TermPositions*) this;
}
-void SegmentTermPositions::seek(const TermInfo* ti, Term* term, bool
local_stats) {
- SegmentTermDocs::seek(ti, term, local_stats);
+void SegmentTermPositions::seek(const TermInfo* ti, Term* term) {
+ SegmentTermDocs::seek(ti, term);
if (ti != NULL)
lazySkipPointer = ti->proxPointer;
diff --git a/src/core/CLucene/index/Terms.h b/src/core/CLucene/index/Terms.h
index 771eff51873..57133322314 100644
--- a/src/core/CLucene/index/Terms.h
+++ b/src/core/CLucene/index/Terms.h
@@ -31,12 +31,12 @@ public:
// Sets this to the data for a term.
// The enumeration is reset to the start of the data for this term.
- virtual void seek(Term* term, bool load_stats = false) = 0;
+ virtual void seek(Term* term) = 0;
/** Sets this to the data for the current term in a {@link TermEnum}.
* This may be optimized in some implementations.
*/
- virtual void seek(TermEnum* termEnum, bool load_stats = false) = 0;
+ virtual void seek(TermEnum* termEnum) = 0;
// Returns the current document number. <p> This is invalid until
{@link
// #next()} is called for the first time.
@@ -88,6 +88,7 @@ public:
*/
virtual TermPositions* __asTermPositions()=0;
+ virtual void setLoadStats(bool load_stats) {}
virtual void setIoContext(const void*) {}
virtual int32_t docFreq() {
@@ -140,6 +141,7 @@ public:
*/
virtual bool skipTo(Term* target);
+ virtual void setLoadStats(bool load_stats) {}
virtual void setIoContext(const void*) {}
};
diff --git a/src/core/CLucene/index/_MultiSegmentReader.h b/src/core/CLucene/index/_MultiSegmentReader.h
index d3dc7c70486..3ace80765fb 100644
--- a/src/core/CLucene/index/_MultiSegmentReader.h
+++ b/src/core/CLucene/index/_MultiSegmentReader.h
@@ -104,8 +104,8 @@ public:
//Returns the document frequency of the current term in the set
int32_t docFreq(const Term* t=NULL);
- TermDocs* termDocs(const void* io_ctx = nullptr);
- TermPositions* termPositions(const void* io_ctx = nullptr);
+ TermDocs* termDocs(bool load_stats = false, const void* io_ctx =
nullptr);
+ TermPositions* termPositions(bool load_stats = false, const void*
io_ctx = nullptr);
// Returns the document norm
int32_t docNorm(const TCHAR* field, int32_t n);
@@ -152,7 +152,7 @@ protected:
size_t pointer;
TermDocs* current; // == segTermDocs[pointer]
- TermDocs* termDocs(const int32_t i, bool local_stats = false); //< internal
use only
+ TermDocs* termDocs(const int32_t i); //< internal use only
virtual TermDocs* termDocs(IndexReader* reader);
void init(CL_NS(util)::ArrayBase<IndexReader*>* subReaders, const int32_t*
starts);
public:
@@ -164,8 +164,8 @@ public:
int32_t freq() const;
int32_t norm() const;
- void seek(TermEnum* termEnum, bool load_stats = false);
- void seek(Term* tterm, bool load_stats = false);
+ void seek(TermEnum* termEnum);
+ void seek(Term* tterm);
bool next();
/** Optimized implementation. */
@@ -183,9 +183,11 @@ public:
int32_t docFreq() override;
int32_t docNorm() override;
+ void setLoadStats(bool load_stats) override;
void setIoContext(const void* io_ctx) override;
protected:
+ bool load_stats_ = false;
const void* io_ctx_ = nullptr;
};
@@ -224,6 +226,7 @@ public:
void setIoContext(const void*) override;
private:
+ bool load_stats_ = false;
const void* io_ctx_ = nullptr;
};
diff --git a/src/core/CLucene/index/_SegmentHeader.h b/src/core/CLucene/index/_SegmentHeader.h
index e09ed00969e..81657b386d4 100644
--- a/src/core/CLucene/index/_SegmentHeader.h
+++ b/src/core/CLucene/index/_SegmentHeader.h
@@ -114,7 +114,7 @@ private:
// save all doc norms in this term's field
uint32_t maxDoc = 0;
- uint8_t* all_doc_norms_;
+ uint8_t* all_doc_norms_ = nullptr;
bool hasProx_ = false;
bool compatibleRead_ = false;
@@ -211,6 +211,8 @@ protected:
bool currentFieldStoresPayloads;
bool hasProx = false;
IndexVersion indexVersion_ = IndexVersion::kV0;
+
+ bool load_stats_ = false;
const void* io_ctx_ = nullptr;
public:
@@ -218,9 +220,9 @@ public:
SegmentTermDocs( const SegmentReader* Parent);
virtual ~SegmentTermDocs();
- virtual void seek(Term* term, bool load_stats = false);
- virtual void seek(TermEnum* termEnum, bool load_stats = false);
- virtual void seek(const TermInfo* ti,Term* term, bool load_stats = false);
+ virtual void seek(Term* term);
+ virtual void seek(TermEnum* termEnum);
+ virtual void seek(const TermInfo* ti,Term* term);
virtual void close();
virtual int32_t doc()const;
@@ -241,6 +243,7 @@ public:
virtual TermPositions* __asTermPositions();
+ void setLoadStats(bool load_stats) override;
void setIoContext(const void* io_ctx) override;
int32_t docFreq() override;
@@ -282,10 +285,11 @@ public:
SegmentTermPositions(const SegmentReader* Parent);
virtual ~SegmentTermPositions();
+ void setLoadStats(bool load_stats) override;
void setIoContext(const void* io_ctx) override;
private:
- void seek(const TermInfo* ti, Term* term, bool load_stats = false);
+ void seek(const TermInfo* ti, Term* term);
public:
void close();
@@ -335,8 +339,8 @@ private:
virtual TermPositions* __asTermPositions();
//resolve SegmentTermDocs/TermPositions ambiguity
- void seek(Term* term, bool load_stats = false){ SegmentTermDocs::seek(term,
load_stats); }
- void seek(TermEnum* termEnum, bool load_stats = false){
SegmentTermDocs::seek(termEnum, load_stats); }
+ void seek(Term* term){ SegmentTermDocs::seek(term); }
+ void seek(TermEnum* termEnum){ SegmentTermDocs::seek(termEnum); }
int32_t doc() const{ return SegmentTermDocs::doc(); }
int32_t freq() const{ return SegmentTermDocs::freq(); }
int32_t norm() const{ return SegmentTermDocs::norm(); }
@@ -526,9 +530,9 @@ public:
bool isDeleted(const int32_t n);
///Returns an unpositioned TermDocs enumerator.
- TermDocs* termDocs(const void* io_ctx = nullptr);
+ TermDocs* termDocs(bool load_stats = false, const void* io_ctx = nullptr);
///Returns an unpositioned TermPositions enumerator.
- TermPositions* termPositions(const void* io_ctx = nullptr);
+ TermPositions* termPositions(bool load_stats = false, const void* io_ctx =
nullptr);
///Returns the number of documents which contain the term t
int32_t docFreq(const Term* t);
diff --git a/src/core/CLucene/search/Similarity.cpp b/src/core/CLucene/search/Similarity.cpp
index 922d644ec8f..826be432492 100644
--- a/src/core/CLucene/search/Similarity.cpp
+++ b/src/core/CLucene/search/Similarity.cpp
@@ -193,6 +193,57 @@ CL_NS_DEF(search)
#endif
}
+ int32_t number_of_leading_zeros(uint64_t value) {
+ if (value == 0) {
+ return 64;
+ }
+#if defined(__GNUC__) || defined(__clang__)
+ return __builtin_clzll(value);
+#else
+ int32_t count = 0;
+ for (uint64_t mask = 1ULL << 63; mask != 0; mask >>= 1) {
+ if (value & mask) break;
+ ++count;
+ }
+ return count;
+#endif
+ }
+
+ uint32_t long_to_int4(uint64_t i) {
+ if (i > std::numeric_limits<uint64_t>::max()) {
+ _CLTHROWA(CL_ERR_IllegalArgument, "Only supports positive
values");
+ }
+
+ int32_t numBits = 64 - number_of_leading_zeros(i);
+ if (numBits < 4) {
+ return static_cast<uint32_t>(i);
+ } else {
+ int32_t shift = numBits - 4;
+ uint32_t encoded = static_cast<uint32_t>(i >> shift) & 0x07;
+ return encoded | ((shift + 1) << 3);
+ }
+ }
+
+ const int32_t MAX_INT32 = std::numeric_limits<int32_t>::max();
+ const uint32_t MAX_INT4 = long_to_int4(static_cast<uint64_t>(MAX_INT32));
+ const int32_t NUM_FREE_VALUES = 255 - static_cast<int>(MAX_INT4);
+
+ uint8_t int_to_byte4(int32_t i) {
+ if (i < 0) {
+ _CLTHROWA(CL_ERR_IllegalArgument, "Only supports positive
values");
+ }
+
+ if (i < NUM_FREE_VALUES) {
+ return static_cast<uint8_t>(i);
+ } else {
+ uint32_t encoded = long_to_int4(i - NUM_FREE_VALUES);
+ return static_cast<uint8_t>(NUM_FREE_VALUES + encoded);
+ }
+ }
+
+ uint8_t Similarity::encodeNorm(int32_t i) {
+ return int_to_byte4(i);
+ }
float_t Similarity::idf(Term* term, Searcher* searcher) {
return idf(searcher->docFreq(term), searcher->maxDoc());
diff --git a/src/core/CLucene/search/Similarity.h b/src/core/CLucene/search/Similarity.h
index 74b7a819d06..472e4758983 100644
--- a/src/core/CLucene/search/Similarity.h
+++ b/src/core/CLucene/search/Similarity.h
@@ -95,6 +95,8 @@ public:
static uint8_t floatToByte(float_t f);
static float_t byteToFloat(uint8_t b);
+ static uint8_t encodeNorm(int32_t i);
+
/** Computes a score factor for a phrase.
*
* <p>The default implementation sums the {@link #idf(Term,Searcher)} factor
diff --git a/src/test/search/TestSearch.cpp b/src/test/search/TestSearch.cpp
index 14676ca7337..b7b1eb9928b 100644
--- a/src/test/search/TestSearch.cpp
+++ b/src/test/search/TestSearch.cpp
@@ -342,7 +342,7 @@ void testNormEncoding(CuTest *tc) {
CLUCENE_ASSERT( CL_NS(search)::Similarity::encodeNorm(0)==0 );
CLUCENE_ASSERT( CL_NS(search)::Similarity::encodeNorm(1)==124 );
CLUCENE_ASSERT( CL_NS(search)::Similarity::encodeNorm(1)==124 );
- CLUCENE_ASSERT( CL_NS(search)::Similarity::encodeNorm(7516192768.0
)==255);
+ CLUCENE_ASSERT(
CL_NS(search)::Similarity::encodeNorm(7516192768.0f)==255);
CLUCENE_ASSERT( CL_NS(search)::Similarity::decodeNorm(124)==1 );
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]