(doris-thirdparty) branch clucene updated: [fix](inverted index) fix BM25 scoring anomaly in term frequency calculation (#344)

airborne Wed, 30 Jul 2025 04:12:55 -0700

This is an automated email from the ASF dual-hosted git repository.

airborne pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git



The following commit(s) were added to refs/heads/clucene by this push:
     new eff3191d680 [fix](inverted index) fix BM25 scoring anomaly in term frequency calculation (#344)
eff3191d680 is described below

commit eff3191d6805644d0438d6fe2eecd1b26e8ee740
Author: zzzxl <[email protected]>
AuthorDate: Wed Jul 30 19:12:44 2025 +0800

    [fix](inverted index) fix BM25 scoring anomaly in term frequency calculation (#344)
---
 src/core/CLucene/index/IndexReader.cpp           |  8 ++--
 src/core/CLucene/index/IndexReader.h             |  4 +-
 src/core/CLucene/index/IndexWriter.cpp           | 37 +++++++++++------
 src/core/CLucene/index/IndexWriter.h             |  6 ++-
 src/core/CLucene/index/MultiReader.cpp           |  6 ++-
 src/core/CLucene/index/MultiReader.h             |  4 +-
 src/core/CLucene/index/MultiSegmentReader.cpp    | 24 ++++++-----
 src/core/CLucene/index/MultipleTermPositions.cpp |  4 +-
 src/core/CLucene/index/MultipleTermPositions.h   |  4 +-
 src/core/CLucene/index/SDocumentWriter.cpp       | 21 +++++++---
 src/core/CLucene/index/SDocumentWriter.h         |  3 ++
 src/core/CLucene/index/SegmentReader.cpp         | 32 +++++----------
 src/core/CLucene/index/SegmentTermDocs.cpp       | 20 ++++++----
 src/core/CLucene/index/SegmentTermPositions.cpp  |  8 +++-
 src/core/CLucene/index/Terms.h                   |  6 ++-
 src/core/CLucene/index/_MultiSegmentReader.h     | 13 +++---
 src/core/CLucene/index/_SegmentHeader.h          | 22 +++++-----
 src/core/CLucene/search/Similarity.cpp           | 51 ++++++++++++++++++++++++
 src/core/CLucene/search/Similarity.h             |  2 +
 src/test/search/TestSearch.cpp                   |  2 +-
 20 files changed, 186 insertions(+), 91 deletions(-)

diff --git a/src/core/CLucene/index/IndexReader.cpp 
b/src/core/CLucene/index/IndexReader.cpp
index 77df957f75e..9fccfdbda31 100644
--- a/src/core/CLucene/index/IndexReader.cpp
+++ b/src/core/CLucene/index/IndexReader.cpp
@@ -278,9 +278,9 @@ CL_NS_DEF(index)
 
       ensureOpen();
       //Reference an instantiated TermDocs instance
-      TermDocs* _termDocs = termDocs(io_ctx);
+      TermDocs* _termDocs = termDocs(load_stats, io_ctx);
       //Seek all documents containing term
-      _termDocs->seek(term, load_stats);
+      _termDocs->seek(term);
       //return the enumaration
       return _termDocs;
   }
@@ -304,9 +304,9 @@ CL_NS_DEF(index)
 
       ensureOpen();
       //Reference an instantiated termPositions instance
-      TermPositions* _termPositions = termPositions(io_ctx);
+      TermPositions* _termPositions = termPositions(load_stats, io_ctx);
          //Seek all documents containing term
-      _termPositions->seek(term, load_stats);
+      _termPositions->seek(term);
          //return the enumeration
       return _termPositions;
   }
diff --git a/src/core/CLucene/index/IndexReader.h 
b/src/core/CLucene/index/IndexReader.h
index da73d64dcab..a7f2920a804 100644
--- a/src/core/CLucene/index/IndexReader.h
+++ b/src/core/CLucene/index/IndexReader.h
@@ -575,7 +575,7 @@ public:
    * @throws IOException if there is a low-level IO error
         * @memory Caller must clean up
         */
-       virtual TermPositions* termPositions(const void* io_ctx = nullptr) = 0;
+       virtual TermPositions* termPositions(bool load_stats = false, const 
void* io_ctx = nullptr) = 0;
 
     /** Returns an enumeration of all the documents which contain
        * <code>term</code>.  For each document, in addition to the document 
number
@@ -601,7 +601,7 @@ public:
    * @throws IOException if there is a low-level IO error
         * @memory Caller must clean up
         */
-       virtual TermDocs* termDocs(const void* io_ctx = nullptr) = 0;
+       virtual TermDocs* termDocs(bool load_stats = false, const void* io_ctx 
= nullptr) = 0;
 
        /** Returns an enumeration of all the documents which contain
        * <code>term</code>. For each document, the document number, the 
frequency of
diff --git a/src/core/CLucene/index/IndexWriter.cpp 
b/src/core/CLucene/index/IndexWriter.cpp
index 7ec6f56817d..332e0a4e786 100644
--- a/src/core/CLucene/index/IndexWriter.cpp
+++ b/src/core/CLucene/index/IndexWriter.cpp
@@ -1282,8 +1282,9 @@ void 
IndexWriter::indexCompaction(std::vector<lucene::store::Directory *> &src_d
     std::vector<lucene::store::IndexOutput *> normsOutputList;
 
     // first level vector index is src_index_id
-    // <TCHAR, ValueArray<uint8_t>> key is field name, value is the norm of 
src_doc_id
-    std::vector<map<TCHAR, std::vector<uint8_t>>> 
srcFieldNormsMapValues(numIndices);
+    // <std::wstring, ValueArray<uint8_t>> key is field name, value is the 
norm of src_doc_id
+    std::vector<std::map<std::wstring, std::vector<uint8_t>>> 
srcFieldNormsMapValues(numIndices);
+    std::map<std::wstring, uint64_t> srcFieldTotalTermCountMap;
 
     try {
         // check hasProx, indexVersion
@@ -1336,14 +1337,17 @@ void 
IndexWriter::indexCompaction(std::vector<lucene::store::Directory *> &src_d
                     if (fi->isIndexed && !fi->omitNorms) {
                         CL_NS(util)::ValueArray<uint8_t> normBuffer;
                         size_t maxDoc = reader->maxDoc();
-                        if ( normBuffer.length < maxDoc){
+                        if (normBuffer.length < maxDoc) {
                             normBuffer.resize(maxDoc);
                             memset(normBuffer.values, 0, sizeof(uint8_t) * 
maxDoc);
                         }
                         reader->norms(fi->name, normBuffer.values);
                         for (int j = 0; j < normBuffer.length; j++) {
-                            
srcFieldNormsMapValues[srcIndex][*fi->name].emplace_back(normBuffer.values[j]);
+                            
srcFieldNormsMapValues[srcIndex][fi->name].emplace_back(
+                                    normBuffer.values[j]);
                         }
+                        srcFieldTotalTermCountMap[fi->name] +=
+                                reader->sumTotalTermFreq(fi->name).value_or(0);
                     }
                 }
             }
@@ -1406,7 +1410,7 @@ void 
IndexWriter::indexCompaction(std::vector<lucene::store::Directory *> &src_d
 
         /// merge norms if have
         if (hasNorms){
-            mergeNorms(dest_index_docs, srcFieldNormsMapValues, 
normsOutputList);
+            mergeNorms(dest_index_docs, srcFieldTotalTermCountMap, 
srcFieldNormsMapValues, normsOutputList);
         }
 
         /// merge null_bitmap
@@ -1918,18 +1922,19 @@ void IndexWriter::mergeTerms(bool hasProx, IndexVersion 
indexVersion) {
 }
 
 void IndexWriter::mergeNorms(std::vector<uint32_t> dest_index_docs,
-                                std::vector<std::map<TCHAR, 
std::vector<uint8_t>>> srcFieldNormsMapValues,
-                                std::vector<lucene::store::IndexOutput *> 
normsOutputList) {
+                             std::map<std::wstring, uint64_t> 
srcFieldTotalTermCountMap,
+                             std::vector<std::map<std::wstring, 
std::vector<uint8_t>>> srcFieldNormsMapValues,
+                             std::vector<lucene::store::IndexOutput *> 
normsOutputList) {
     //Func - Merges the norms for all fields
     //Pre  - fieldInfos != NULL
     //Post - The norms for all fields have been merged
     CND_PRECONDITION(fieldInfos != NULL, "fieldInfos is NULL");
 
-    std::vector<std::map<TCHAR, std::vector<uint8_t>>> 
destFieldNormsMapValues(numDestIndexes);
+    std::vector<std::map<std::wstring, std::vector<uint8_t>>> 
destFieldNormsMapValues(numDestIndexes);
 
     // iterate srcFieldNormsValues to construct destFieldNormsMapValues
     for (size_t srcIndex = 0; srcIndex < srcFieldNormsMapValues.size(); 
++srcIndex) {
-        std::map<TCHAR, std::vector<uint8_t>> &srcFieldNormsMap = 
srcFieldNormsMapValues[srcIndex];
+        std::map<std::wstring, std::vector<uint8_t>> &srcFieldNormsMap = 
srcFieldNormsMapValues[srcIndex];
         if (srcFieldNormsMap.empty()) {
             // empty indicates there is no nrm file in this index
             continue;
@@ -1937,7 +1942,7 @@ void IndexWriter::mergeNorms(std::vector<uint32_t> 
dest_index_docs,
         // find field has norms
         for (int j =0; j < fieldInfos->size(); j++) {
             FieldInfo* fi = fieldInfos->fieldInfo(j);
-            TCHAR fieldName = *fi->name;
+            std::wstring fieldName = fi->name;
             // Is this Field indexed and field need norms ?
             if (fi->isIndexed && !fi->omitNorms) {
                 auto& srcFieldNorms = srcFieldNormsMap[fieldName];
@@ -1965,15 +1970,21 @@ void IndexWriter::mergeNorms(std::vector<uint32_t> 
dest_index_docs,
     // construct nrm and write nrm to dest index
     for (size_t i = 0; i < destFieldNormsMapValues.size(); ++i) {
         auto& destFieldNormsMap = destFieldNormsMapValues[i];
-        for (int j =0; j < fieldInfos->size(); j++) {
+        for (int j = 0; j < fieldInfos->size(); j++) {
             FieldInfo* fi = fieldInfos->fieldInfo(j);
-            TCHAR fieldName = *fi->name;
+            std::wstring fieldName = fi->name;
             auto destDocCount = dest_index_docs[i];
             if (fi->isIndexed && !fi->omitNorms) {
                 // if not find then norm is zero
                 if (destFieldNormsMap.find(fieldName) == 
destFieldNormsMap.end()) {
                     destFieldNormsMap[fieldName].resize(destDocCount);
-                    
std::fill(destFieldNormsMap[fieldName].begin(),destFieldNormsMap[fieldName].end(),
 0);
+                    std::fill(destFieldNormsMap[fieldName].begin(),
+                              destFieldNormsMap[fieldName].end(), 0);
+                }
+                if (i == 0) {
+                    
normsOutputList[i]->writeLong(srcFieldTotalTermCountMap[fieldName]);
+                } else {
+                    normsOutputList[i]->writeLong(0);
                 }
                 auto& destFieldNorms = destFieldNormsMap[fieldName];
                 normsOutputList[i]->writeBytes(destFieldNorms.data(), 
destDocCount);
diff --git a/src/core/CLucene/index/IndexWriter.h 
b/src/core/CLucene/index/IndexWriter.h
index 33df65cad46..e8c5fbd0ec3 100644
--- a/src/core/CLucene/index/IndexWriter.h
+++ b/src/core/CLucene/index/IndexWriter.h
@@ -330,7 +330,11 @@ public:
     // merge terms and write files
     void mergeTerms(bool hasProx, IndexVersion indexVersion);
     // merge norms and write files
-    void mergeNorms(std::vector<uint32_t> dest_index_docs, 
std::vector<std::map<TCHAR, std::vector<uint8_t>>> srcFieldNormsMapValues, 
std::vector<lucene::store::IndexOutput *> normsOutputList);
+    void mergeNorms(
+            std::vector<uint32_t> dest_index_docs,
+            std::map<std::wstring, uint64_t> srcFieldTotalTermCountMap,
+            std::vector<std::map<std::wstring, std::vector<uint8_t>>> 
srcFieldNormsMapValues,
+            std::vector<lucene::store::IndexOutput*> normsOutputList);
     // merge null_bitmap
     void mergeNullBitmap(std::vector<std::vector<uint32_t>> srcBitmapValues, 
std::vector<lucene::store::IndexOutput *> nullBitmapIndexOutputList);
 
diff --git a/src/core/CLucene/index/MultiReader.cpp 
b/src/core/CLucene/index/MultiReader.cpp
index 3ac68b4af8c..2f8d771cbce 100644
--- a/src/core/CLucene/index/MultiReader.cpp
+++ b/src/core/CLucene/index/MultiReader.cpp
@@ -307,16 +307,18 @@ std::optional<uint64_t> 
MultiReader::sumTotalTermFreq(const TCHAR* field) {
     return std::nullopt;
 }
 
-TermDocs* MultiReader::termDocs(const void* io_ctx) {
+TermDocs* MultiReader::termDocs(bool load_stats, const void* io_ctx) {
     ensureOpen();
        TermDocs* ret =  _CLNEW MultiTermDocs(subReaders, starts);
+  ret->setLoadStats(load_stats);
   ret->setIoContext(io_ctx);
        return ret;
 }
 
-TermPositions* MultiReader::termPositions(const void* io_ctx) {
+TermPositions* MultiReader::termPositions(bool load_stats, const void* io_ctx) 
{
     ensureOpen();
        TermPositions* ret = (TermPositions*)_CLNEW 
MultiTermPositions(subReaders, starts);
+  ret->setLoadStats(load_stats);
   ret->setIoContext(io_ctx);
        return ret;
 }
diff --git a/src/core/CLucene/index/MultiReader.h 
b/src/core/CLucene/index/MultiReader.h
index 362415501c2..1e83bc6c54a 100644
--- a/src/core/CLucene/index/MultiReader.h
+++ b/src/core/CLucene/index/MultiReader.h
@@ -107,8 +107,8 @@ public:
   // Returns the total norm of all terms appeared in all documents in this 
field
   std::optional<uint64_t> sumTotalTermFreq(const TCHAR* field);
 
-       TermDocs* termDocs(const void* io_ctx = nullptr);
-       TermPositions* termPositions(const void* io_ctx = nullptr);
+       TermDocs* termDocs(bool load_stats = false, const void* io_ctx = 
nullptr);
+       TermPositions* termPositions(bool load_stats = false, const void* 
io_ctx = nullptr);
 
        /**
        * @see IndexReader#getFieldNames(IndexReader.FieldOption fldOption)
diff --git a/src/core/CLucene/index/MultiSegmentReader.cpp 
b/src/core/CLucene/index/MultiSegmentReader.cpp
index 910f7c45c67..cedee01b765 100644
--- a/src/core/CLucene/index/MultiSegmentReader.cpp
+++ b/src/core/CLucene/index/MultiSegmentReader.cpp
@@ -391,16 +391,18 @@ std::optional<uint64_t> 
MultiSegmentReader::sumTotalTermFreq(const TCHAR* field)
     return std::nullopt;
 }
 
-TermDocs* MultiSegmentReader::termDocs(const void* io_ctx) {
+TermDocs* MultiSegmentReader::termDocs(bool load_stats, const void* io_ctx) {
     ensureOpen();
        TermDocs* ret =  _CLNEW MultiTermDocs(subReaders, starts);
+       ret->setLoadStats(load_stats);
        ret->setIoContext(io_ctx);
        return ret;
 }
 
-TermPositions* MultiSegmentReader::termPositions(const void* io_ctx) {
+TermPositions* MultiSegmentReader::termPositions(bool load_stats, const void* 
io_ctx) {
     ensureOpen();
        TermPositions* ret = static_cast<TermPositions*>(_CLNEW 
MultiTermPositions(subReaders, starts));
+       ret->setLoadStats(load_stats);
        ret->setIoContext(io_ctx);
        return ret;
 }
@@ -597,6 +599,10 @@ int32_t MultiTermDocs::docFreq() {
        return docFreq;
 }
 
+void MultiTermDocs::setLoadStats(bool load_stats) {
+  load_stats_ = load_stats;
+}
+
 void MultiTermDocs::setIoContext(const void* io_ctx) {
        io_ctx_ = io_ctx;
 }
@@ -623,11 +629,11 @@ int32_t MultiTermDocs::norm() const {
     return current->norm();
 }
 
-void MultiTermDocs::seek(TermEnum* termEnum, bool load_stats){
-       seek(termEnum->term(false), load_stats);
+void MultiTermDocs::seek(TermEnum* termEnum){
+       seek(termEnum->term(false));
 }
 
-void MultiTermDocs::seek( Term* tterm, bool load_stats) {
+void MultiTermDocs::seek( Term* tterm) {
 //Func - Resets the instance for a new search
 //Pre  - tterm != NULL
 //Post - The instance has been reset for a new search
@@ -797,10 +803,10 @@ void MultiTermDocs::close() {
 }
 
 TermDocs* MultiTermDocs::termDocs(IndexReader* reader) {
-       return reader->termDocs(io_ctx_);
+       return reader->termDocs(load_stats_, io_ctx_);
 }
 
-TermDocs* MultiTermDocs::termDocs(const int32_t i, bool local_stats) {
+TermDocs* MultiTermDocs::termDocs(const int32_t i) {
        if (term == NULL)
          return NULL;
        TermDocs* result = (*readerTermDocs)[i];
@@ -809,7 +815,7 @@ TermDocs* MultiTermDocs::termDocs(const int32_t i, bool 
local_stats) {
          readerTermDocs->values[i] = termDocs((*subReaders)[i]);
          result = (*readerTermDocs)[i];
        }
-       result->seek(term, local_stats);
+       result->seek(term);
 
        return result;
 }
@@ -993,7 +999,7 @@ TermDocs* MultiTermPositions::termDocs(IndexReader* reader) 
{
 // rather merely producing a SegmentTermDocs via the reader's termDocs
 // method.
 
-       TermPositions* tp = reader->termPositions(io_ctx_);
+       TermPositions* tp = reader->termPositions(load_stats_, io_ctx_);
        TermDocs* ret = tp->__asTermDocs();
 
        CND_CONDITION(ret != NULL,
diff --git a/src/core/CLucene/index/MultipleTermPositions.cpp 
b/src/core/CLucene/index/MultipleTermPositions.cpp
index b5846516f76..4a4bb0563e6 100644
--- a/src/core/CLucene/index/MultipleTermPositions.cpp
+++ b/src/core/CLucene/index/MultipleTermPositions.cpp
@@ -14,11 +14,11 @@ CL_NS_USE(util)
 
 CL_NS_DEF(index)
 
-void MultipleTermPositions::seek(Term*, bool) {
+void MultipleTermPositions::seek(Term*) {
        _CLTHROWA(CL_ERR_UnsupportedOperation, "Unsupported operation: 
MultipleTermPositions::seek");
 }
 
-void MultipleTermPositions::seek(TermEnum*, bool) {
+void MultipleTermPositions::seek(TermEnum*) {
        _CLTHROWA(CL_ERR_UnsupportedOperation, "Unsupported operation: 
MultipleTermPositions::seek");
 }
 
diff --git a/src/core/CLucene/index/MultipleTermPositions.h 
b/src/core/CLucene/index/MultipleTermPositions.h
index 8ef7be1ac56..d50a4faaeee 100644
--- a/src/core/CLucene/index/MultipleTermPositions.h
+++ b/src/core/CLucene/index/MultipleTermPositions.h
@@ -53,13 +53,13 @@ public:
        * Not implemented.
        * @throws UnsupportedOperationException
        */
-       void seek(Term*, bool);
+       void seek(Term*);
 
        /**
        * Not implemented.
        * @throws UnsupportedOperationException
        */
-       void seek(TermEnum*, bool);
+       void seek(TermEnum*);
 
        /**
        * Not implemented.
diff --git a/src/core/CLucene/index/SDocumentWriter.cpp 
b/src/core/CLucene/index/SDocumentWriter.cpp
index b41290741be..c272a575228 100644
--- a/src/core/CLucene/index/SDocumentWriter.cpp
+++ b/src/core/CLucene/index/SDocumentWriter.cpp
@@ -31,7 +31,7 @@ CL_NS_USE(document)
 CL_NS_DEF(index)
 
 template<typename T>
-const uint8_t SDocumentsWriter<T>::defaultNorm = 
search::Similarity::encodeNorm(1.0f);
+const uint8_t SDocumentsWriter<T>::defaultNorm = 
search::Similarity::encodeNorm(1);
 
 template<typename T>
 const int32_t SDocumentsWriter<T>::BYTE_BLOCK_SHIFT = 15;
@@ -298,8 +298,7 @@ void SDocumentsWriter<T>::ThreadState::writeDocument() {
                 assert(bn != nullptr);
                 assert(bn->upto <= docID);
                 bn->fill(docID);
-                float_t norm = fp->boost * 
_parent->writer->getSimilarity()->lengthNorm(fp->fieldInfo->name, fp->length);
-                bn->add(norm);
+                bn->add(fp->length);
             }
         }
     } catch (CLuceneError &t) {
@@ -982,9 +981,11 @@ void SDocumentsWriter<T>::writeNorms(const std::string 
&segmentName, int32_t tot
             if (fi->isIndexed && !fi->omitNorms) {
                 BufferedNorms *n = norms[fieldIdx];
                 int64_t v;
-                if (n == nullptr)
+                if (n == nullptr) {
+                    normsOut->writeLong(0);
                     v = 0;
-                else {
+                } else {
+                    normsOut->writeLong(n->total_term_count_);
                     v = n->out.getFilePointer();
                     n->out.writeTo(normsOut);
                     n->reset();
@@ -1429,10 +1430,20 @@ void SDocumentsWriter<T>::BufferedNorms::add(float_t 
norm) {
     out.writeByte(b);
     upto++;
 }
+
+template<typename T>
+void SDocumentsWriter<T>::BufferedNorms::add(int32_t norm) {
+    total_term_count_ += norm;
+    uint8_t b = search::Similarity::encodeNorm(norm);
+    out.writeByte(b);
+    upto++;
+}
+
 template<typename T>
 void SDocumentsWriter<T>::BufferedNorms::reset() {
     out.reset();
     upto = 0;
+    total_term_count_ = 0;
 }
 template<typename T>
 void SDocumentsWriter<T>::BufferedNorms::fill(int32_t docID) {
diff --git a/src/core/CLucene/index/SDocumentWriter.h 
b/src/core/CLucene/index/SDocumentWriter.h
index a166703f1d0..5fc8247b7c4 100644
--- a/src/core/CLucene/index/SDocumentWriter.h
+++ b/src/core/CLucene/index/SDocumentWriter.h
@@ -88,8 +88,11 @@ public:
 
         BufferedNorms();
         void add(float_t norm);
+        void add(int32_t norm);
         void reset();
         void fill(int32_t docID);
+
+        int64_t total_term_count_ = 0;
     };
     template<typename T2>
     class BlockPool {
diff --git a/src/core/CLucene/index/SegmentReader.cpp 
b/src/core/CLucene/index/SegmentReader.cpp
index 374322761ed..beb357fc3d5 100644
--- a/src/core/CLucene/index/SegmentReader.cpp
+++ b/src/core/CLucene/index/SegmentReader.cpp
@@ -507,24 +507,26 @@ bool SegmentReader::isDeleted(const int32_t n) {
     return ret;
 }
 
-TermDocs *SegmentReader::termDocs(const void* io_ctx) {
+TermDocs *SegmentReader::termDocs(bool load_stats, const void* io_ctx) {
     //Func - Returns an unpositioned TermDocs enumerator.
     //Pre  - true
     //Post - An unpositioned TermDocs enumerator has been returned
 
     ensureOpen();
     auto* ret = _CLNEW SegmentTermDocs(this);
+    ret->setLoadStats(load_stats);
     ret->setIoContext(io_ctx);
     return ret;
 }
 
-TermPositions *SegmentReader::termPositions(const void* io_ctx) {
+TermPositions *SegmentReader::termPositions(bool load_stats, const void* 
io_ctx) {
     //Func - Returns an unpositioned TermPositions enumerator.
     //Pre  - true
     //Post - An unpositioned TermPositions enumerator has been returned
 
     ensureOpen();
     auto* ret = _CLNEW SegmentTermPositions(this);
+    ret->setLoadStats(load_stats);
     ret->setIoContext(io_ctx);
     return ret;
 }
@@ -902,28 +904,14 @@ void SegmentReader::openNorms(Directory *cfsDir, int32_t 
readBufferSize) {
                 normInput = d->openInput(fileName.c_str());
             }
 
-            _norms[fi->name] = _CLNEW Norm(normInput, singleNormFile, 
fi->number, normSeek, this, segment.c_str());
-
-            // read total norm info into cache
-            std::vector<uint8_t> bytes(_maxDoc);
-            IndexInput *normStream;
-            if (_norms[fi->name]->useSingleNormStream) {
-                normStream = singleNormStream;
-            } else {
-                normStream = _norms[fi->name]->in;
-            }
+            normInput->seek(normSeek);
+            auto total_term_count = normInput->readLong();
+            sum_total_term_freq[*fi->name] = total_term_count;
+            normSeek += sizeof(int64_t);
 
-            ensureOpen();
-            SCOPED_LOCK_MUTEX(_norms[fi->name]->THIS_LOCK);
-            normStream->seek(_norms[fi->name]->normSeek);
-            normStream->readBytes(bytes.data(), _maxDoc);
-            uint64_t sum = 0;
-            for (int doc = 0; doc < _maxDoc; doc++) {
-                sum += Similarity::decodeNorm(bytes[doc]);
-            }
-            sum_total_term_freq[*fi->name] = sum;
+            _norms[fi->name] = _CLNEW Norm(normInput, singleNormFile, 
fi->number, normSeek, this, segment.c_str());
 
-            nextNormSeek += _maxDoc;// increment also if some norms are 
separate
+            nextNormSeek += (_maxDoc + sizeof(int64_t));// increment also if 
some norms are separate
         }
     }
 }
diff --git a/src/core/CLucene/index/SegmentTermDocs.cpp 
b/src/core/CLucene/index/SegmentTermDocs.cpp
index 35ffc8fd615..bd58b1b2cff 100644
--- a/src/core/CLucene/index/SegmentTermDocs.cpp
+++ b/src/core/CLucene/index/SegmentTermDocs.cpp
@@ -37,6 +37,10 @@ TermPositions *SegmentTermDocs::__asTermPositions() {
     return NULL;
 }
 
+void SegmentTermDocs::setLoadStats(bool load_stats) {
+    load_stats_ = load_stats;
+}
+
 void SegmentTermDocs::setIoContext(const void* io_ctx) {
     if (freqStream) {
         freqStream->setIoContext(io_ctx);
@@ -58,13 +62,13 @@ int32_t SegmentTermDocs::docNorm() {
     return 0;
 }
 
-void SegmentTermDocs::seek(Term *term, bool load_stats) {
+void SegmentTermDocs::seek(Term *term) {
     TermInfo *ti = parent->tis->get(term, io_ctx_);
-    seek(ti, term, load_stats);
+    seek(ti, term);
     _CLDELETE(ti);
 }
 
-void SegmentTermDocs::seek(TermEnum *termEnum, bool load_stats) {
+void SegmentTermDocs::seek(TermEnum *termEnum) {
     TermInfo *ti = NULL;
     Term *term = NULL;
 
@@ -79,15 +83,15 @@ void SegmentTermDocs::seek(TermEnum *termEnum, bool 
load_stats) {
         ti = parent->tis->get(term);
     }
 
-    seek(ti, term, load_stats);
+    seek(ti, term);
     _CLDELETE(ti);
 }
-void SegmentTermDocs::seek(const TermInfo *ti, Term *term, bool load_stats) {
+void SegmentTermDocs::seek(const TermInfo *ti, Term *term) {
     count = 0;
     FieldInfo *fi = parent->_fieldInfos->fieldInfo(term->field());
     currentFieldStoresPayloads = (fi != NULL) ? fi->storePayloads : false;
-    buffer_.needLoadStats(load_stats);
-    if (load_stats && fi != NULL && fi->isIndexed && !fi->omitNorms) {
+    buffer_.needLoadStats(load_stats_);
+    if (load_stats_ && fi != NULL && fi->isIndexed && !fi->omitNorms) {
         const TCHAR *curField = fi->name;
         norms = parent->norms(curField);
         buffer_.setAllDocNorms(norms);
@@ -356,7 +360,7 @@ void TermDocsBuffer::refillNorm(int32_t size) {
         auto doc = docs_[i];
         // avoid doc norms not set
         if (doc < maxDoc && all_doc_norms_) {
-            norms_[i] = search::Similarity::decodeNorm(all_doc_norms_[doc]);
+            norms_[i] = all_doc_norms_[doc];
         } else {
             norms_[i] = 0;
         }
diff --git a/src/core/CLucene/index/SegmentTermPositions.cpp 
b/src/core/CLucene/index/SegmentTermPositions.cpp
index 7a17496b71f..edd20a0f22d 100644
--- a/src/core/CLucene/index/SegmentTermPositions.cpp
+++ b/src/core/CLucene/index/SegmentTermPositions.cpp
@@ -27,6 +27,10 @@ SegmentTermPositions::~SegmentTermPositions() {
     close();
 }
 
+void SegmentTermPositions::setLoadStats(bool load_stats) {
+    SegmentTermDocs::setLoadStats(load_stats);
+}
+
 void SegmentTermPositions::setIoContext(const void* io_ctx) {
     SegmentTermDocs::setIoContext(io_ctx);
 }
@@ -38,8 +42,8 @@ TermPositions* SegmentTermPositions::__asTermPositions(){
     return (TermPositions*) this;
 }
 
-void SegmentTermPositions::seek(const TermInfo* ti, Term* term, bool 
local_stats) {
-    SegmentTermDocs::seek(ti, term, local_stats);
+void SegmentTermPositions::seek(const TermInfo* ti, Term* term) {
+    SegmentTermDocs::seek(ti, term);
     if (ti != NULL)
        lazySkipPointer = ti->proxPointer;
     
diff --git a/src/core/CLucene/index/Terms.h b/src/core/CLucene/index/Terms.h
index 771eff51873..57133322314 100644
--- a/src/core/CLucene/index/Terms.h
+++ b/src/core/CLucene/index/Terms.h
@@ -31,12 +31,12 @@ public:
 
        // Sets this to the data for a term.
        // The enumeration is reset to the start of the data for this term.
-       virtual void seek(Term* term, bool load_stats = false) = 0;
+       virtual void seek(Term* term) = 0;
 
        /** Sets this to the data for the current term in a {@link TermEnum}.
        * This may be optimized in some implementations.
        */
-       virtual void seek(TermEnum* termEnum,  bool load_stats = false) = 0;
+       virtual void seek(TermEnum* termEnum) = 0;
 
        // Returns the current document number.  <p> This is invalid until 
{@link
        //      #next()} is called for the first time.
@@ -88,6 +88,7 @@ public:
     */
        virtual TermPositions* __asTermPositions()=0;
 
+       virtual void setLoadStats(bool load_stats) {}
        virtual void setIoContext(const void*) {}
 
        virtual int32_t docFreq() {
@@ -140,6 +141,7 @@ public:
        */
        virtual bool skipTo(Term* target);
 
+       virtual void setLoadStats(bool load_stats) {}
        virtual void setIoContext(const void*) {}
 };
 
diff --git a/src/core/CLucene/index/_MultiSegmentReader.h 
b/src/core/CLucene/index/_MultiSegmentReader.h
index d3dc7c70486..3ace80765fb 100644
--- a/src/core/CLucene/index/_MultiSegmentReader.h
+++ b/src/core/CLucene/index/_MultiSegmentReader.h
@@ -104,8 +104,8 @@ public:
 
        //Returns the document frequency of the current term in the set
        int32_t docFreq(const Term* t=NULL);
-       TermDocs* termDocs(const void* io_ctx = nullptr);
-       TermPositions* termPositions(const void* io_ctx = nullptr);
+       TermDocs* termDocs(bool load_stats = false, const void* io_ctx = 
nullptr);
+       TermPositions* termPositions(bool load_stats = false, const void* 
io_ctx = nullptr);
 
   // Returns the document norm
   int32_t docNorm(const TCHAR* field, int32_t n);
@@ -152,7 +152,7 @@ protected:
   size_t pointer;
 
   TermDocs* current;              // == segTermDocs[pointer]
-  TermDocs* termDocs(const int32_t i, bool local_stats = false); //< internal 
use only
+  TermDocs* termDocs(const int32_t i); //< internal use only
   virtual TermDocs* termDocs(IndexReader* reader);
   void init(CL_NS(util)::ArrayBase<IndexReader*>* subReaders, const int32_t* 
starts);
 public:
@@ -164,8 +164,8 @@ public:
   int32_t freq() const;
   int32_t norm() const;
 
-  void seek(TermEnum* termEnum, bool load_stats = false);
-  void seek(Term* tterm, bool load_stats = false);
+  void seek(TermEnum* termEnum);
+  void seek(Term* tterm);
   bool next();
 
   /** Optimized implementation. */
@@ -183,9 +183,11 @@ public:
   int32_t docFreq() override;
   int32_t docNorm() override;
 
+  void setLoadStats(bool load_stats) override;
   void setIoContext(const void* io_ctx) override;
 
 protected:
+  bool load_stats_ = false;
   const void* io_ctx_ = nullptr;
 };
 
@@ -224,6 +226,7 @@ public:
   void setIoContext(const void*) override;
 
 private:
+  bool load_stats_ = false;
   const void* io_ctx_ = nullptr;
 };
 
diff --git a/src/core/CLucene/index/_SegmentHeader.h 
b/src/core/CLucene/index/_SegmentHeader.h
index e09ed00969e..81657b386d4 100644
--- a/src/core/CLucene/index/_SegmentHeader.h
+++ b/src/core/CLucene/index/_SegmentHeader.h
@@ -114,7 +114,7 @@ private:
 
   // save all doc norms in this term's field
   uint32_t maxDoc = 0;
-  uint8_t* all_doc_norms_;
+  uint8_t* all_doc_norms_ = nullptr;
 
   bool hasProx_ = false;
   bool compatibleRead_ = false;
@@ -211,6 +211,8 @@ protected:
   bool currentFieldStoresPayloads;
   bool hasProx = false;
   IndexVersion indexVersion_ = IndexVersion::kV0;
+  
+  bool load_stats_ = false;
   const void* io_ctx_ = nullptr;
 
 public:
@@ -218,9 +220,9 @@ public:
   SegmentTermDocs( const SegmentReader* Parent);
   virtual ~SegmentTermDocs();
 
-  virtual void seek(Term* term, bool load_stats = false);
-  virtual void seek(TermEnum* termEnum, bool load_stats = false);
-  virtual void seek(const TermInfo* ti,Term* term, bool load_stats = false);
+  virtual void seek(Term* term);
+  virtual void seek(TermEnum* termEnum);
+  virtual void seek(const TermInfo* ti,Term* term);
 
   virtual void close();
   virtual int32_t doc()const;
@@ -241,6 +243,7 @@ public:
 
   virtual TermPositions* __asTermPositions();
 
+  void setLoadStats(bool load_stats) override;
   void setIoContext(const void* io_ctx) override;
 
   int32_t docFreq() override;
@@ -282,10 +285,11 @@ public:
   SegmentTermPositions(const SegmentReader* Parent);
   virtual ~SegmentTermPositions();
 
+  void setLoadStats(bool load_stats) override;
   void setIoContext(const void* io_ctx) override;
 
 private:
-  void seek(const TermInfo* ti, Term* term, bool load_stats = false);
+  void seek(const TermInfo* ti, Term* term);
 
 public:
   void close();
@@ -335,8 +339,8 @@ private:
   virtual TermPositions* __asTermPositions();
 
   //resolve SegmentTermDocs/TermPositions ambiguity
-  void seek(Term* term, bool load_stats = false){ SegmentTermDocs::seek(term, 
load_stats); }
-  void seek(TermEnum* termEnum, bool load_stats = false){ 
SegmentTermDocs::seek(termEnum, load_stats); }
+  void seek(Term* term){ SegmentTermDocs::seek(term); }
+  void seek(TermEnum* termEnum){ SegmentTermDocs::seek(termEnum); }
   int32_t doc() const{ return SegmentTermDocs::doc(); }
   int32_t freq() const{ return SegmentTermDocs::freq(); }
   int32_t norm() const{ return SegmentTermDocs::norm(); }
@@ -526,9 +530,9 @@ public:
   bool isDeleted(const int32_t n);
 
   ///Returns an unpositioned TermDocs enumerator.
-  TermDocs* termDocs(const void* io_ctx = nullptr);
+  TermDocs* termDocs(bool load_stats = false, const void* io_ctx = nullptr);
   ///Returns an unpositioned TermPositions enumerator.
-  TermPositions* termPositions(const void* io_ctx = nullptr);
+  TermPositions* termPositions(bool load_stats = false, const void* io_ctx = nullptr);
 
   ///Returns the number of documents which contain the term t
   int32_t docFreq(const Term* t);
diff --git a/src/core/CLucene/search/Similarity.cpp b/src/core/CLucene/search/Similarity.cpp
index 922d644ec8f..826be432492 100644
--- a/src/core/CLucene/search/Similarity.cpp
+++ b/src/core/CLucene/search/Similarity.cpp
@@ -193,6 +193,57 @@ CL_NS_DEF(search)
 #endif
    }
 
+   int32_t number_of_leading_zeros(uint64_t value) { // Counts the zero bits above the highest set bit of a 64-bit value.
+           if (value == 0) {
+                   return 64; // Zero is answered here, before the builtin or the scan is reached.
+           }
+#if defined(__GNUC__) || defined(__clang__)
+           return __builtin_clzll(value); // GCC/Clang path; value is known to be non-zero at this point.
+#else
+           int32_t count = 0;
+           for (uint64_t mask = 1ULL << 63; mask != 0; mask >>= 1) { // Portable path: walk from bit 63 downwards.
+         if (value & mask) break;
+         ++count;
+           }
+           return count;
+#endif
+   }
+
+   uint32_t long_to_int4(uint64_t i) { // Packs i into a small code: 3 low mantissa bits plus a (shift + 1) field above them.
+           if (i > static_cast<uint64_t>(std::numeric_limits<int64_t>::max())) { // Reject inputs with bit 63 set, i.e. negative when viewed as int64_t.
+                   _CLTHROWA(CL_ERR_IllegalArgument, "Only supports positive values");
+           }
+
+           int32_t numBits = 64 - number_of_leading_zeros(i); // Number of significant bits; 0 when i == 0.
+           if (numBits < 4) {
+                   return static_cast<uint32_t>(i); // Values 0..7 are returned unchanged.
+           } else {
+                   int32_t shift = numBits - 4; // Keep only the top four significant bits.
+                   uint32_t encoded = static_cast<uint32_t>(i >> shift) & 0x07; // Mask off the leading 1 bit of those four.
+                   return encoded | ((shift + 1) << 3); // shift + 1 is at least 1, so this result is always >= 8.
+           }
+   }
+
+   const int32_t MAX_INT32 = std::numeric_limits<int32_t>::max();
+   const uint32_t MAX_INT4 = long_to_int4(static_cast<uint64_t>(MAX_INT32)); // Code produced for the largest int32_t; works out to 231.
+   const int32_t NUM_FREE_VALUES = 255 - static_cast<int>(MAX_INT4); // Byte values left above MAX_INT4; works out to 24.
+
+   uint8_t int_to_byte4(int32_t i) { // Encodes a non-negative int into one byte; throws CL_ERR_IllegalArgument for negative input.
+           if (i < 0) {
+                   _CLTHROWA(CL_ERR_IllegalArgument, "Only supports positive values");
+           }
+
+           if (i < NUM_FREE_VALUES) {
+                   return static_cast<uint8_t>(i); // Values below NUM_FREE_VALUES map to themselves.
+           } else {
+                   uint32_t encoded = long_to_int4(i - NUM_FREE_VALUES); // The remaining range is compressed by long_to_int4.
+                   return static_cast<uint8_t>(NUM_FREE_VALUES + encoded); // Offset so compressed codes start at NUM_FREE_VALUES.
+           }
+   }
+
+   uint8_t Similarity::encodeNorm(int32_t i) { // Static member declared in Similarity.h; delegates to int_to_byte4.
+           return int_to_byte4(i);
+   }
 
    float_t Similarity::idf(Term* term, Searcher* searcher) {
       return idf(searcher->docFreq(term), searcher->maxDoc());
diff --git a/src/core/CLucene/search/Similarity.h b/src/core/CLucene/search/Similarity.h
index 74b7a819d06..472e4758983 100644
--- a/src/core/CLucene/search/Similarity.h
+++ b/src/core/CLucene/search/Similarity.h
@@ -95,6 +95,8 @@ public:
    static uint8_t floatToByte(float_t f);
    static float_t byteToFloat(uint8_t b);
 
+   static uint8_t encodeNorm(int32_t i);
+
    /** Computes a score factor for a phrase.
    *
    * <p>The default implementation sums the {@link #idf(Term,Searcher)} factor
diff --git a/src/test/search/TestSearch.cpp b/src/test/search/TestSearch.cpp
index 14676ca7337..b7b1eb9928b 100644
--- a/src/test/search/TestSearch.cpp
+++ b/src/test/search/TestSearch.cpp
@@ -342,7 +342,7 @@ void testNormEncoding(CuTest *tc) {
        CLUCENE_ASSERT( CL_NS(search)::Similarity::encodeNorm(0)==0 );
        CLUCENE_ASSERT( CL_NS(search)::Similarity::encodeNorm(1)==124 );
        CLUCENE_ASSERT( CL_NS(search)::Similarity::encodeNorm(1)==124 );
-       CLUCENE_ASSERT( CL_NS(search)::Similarity::encodeNorm(7516192768.0 )==255);
+       CLUCENE_ASSERT( CL_NS(search)::Similarity::encodeNorm(7516192768.0f)==255);
 
 
        CLUCENE_ASSERT( CL_NS(search)::Similarity::decodeNorm(124)==1 );


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(doris-thirdparty) branch clucene updated: [fix](inverted index) fix BM25 scoring anomaly in term frequency calculation (#344)

Reply via email to