This is an automated email from the ASF dual-hosted git repository.
kxiao pushed a commit to branch tmp_clucene_hybrid_search
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/tmp_clucene_hybrid_search by
this push:
new 368270c359d [feature](clucene): Introduce extra statistics for
calculating Doris BM25. (#308)
368270c359d is described below
commit 368270c359da9a978f2ba894c251eadf03e9305e
Author: Zephyr Guo <[email protected]>
AuthorDate: Mon Apr 21 17:34:16 2025 +0800
[feature](clucene): Introduce extra statistics for calculating Doris BM25.
(#308)
---
src/core/CLucene/index/DocRange.h | 3 +
src/core/CLucene/index/IndexReader.cpp | 10 +-
src/core/CLucene/index/IndexReader.h | 15 ++-
src/core/CLucene/index/IndexWriter.cpp | 128 ++++++++++++++++++++++-
src/core/CLucene/index/IndexWriter.h | 4 +-
src/core/CLucene/index/MultiReader.cpp | 32 ++++++
src/core/CLucene/index/MultiReader.h | 7 ++
src/core/CLucene/index/MultiSegmentReader.cpp | 69 +++++++++++-
src/core/CLucene/index/MultipleTermPositions.cpp | 14 ++-
src/core/CLucene/index/MultipleTermPositions.h | 12 ++-
src/core/CLucene/index/SegmentMerger.cpp | 2 +-
src/core/CLucene/index/SegmentReader.cpp | 118 ++++++++++++++++++---
src/core/CLucene/index/SegmentTermDocs.cpp | 101 ++++++++++++++++--
src/core/CLucene/index/SegmentTermPositions.cpp | 8 +-
src/core/CLucene/index/Terms.h | 13 ++-
src/core/CLucene/index/_MultiSegmentReader.h | 16 ++-
src/core/CLucene/index/_SegmentHeader.h | 72 +++++++++++--
src/core/CLucene/search/IndexSearcher.cpp | 15 +++
src/core/CLucene/search/IndexSearcher.h | 4 +
src/core/CLucene/search/MultiSearcher.cpp | 26 +++++
src/core/CLucene/search/MultiSearcher.h | 4 +
src/core/CLucene/search/Searchable.h | 8 +-
src/core/CLucene/search/Similarity.cpp | 8 ++
src/core/CLucene/search/Similarity.h | 9 ++
src/core/CLucene/search/query/TermIterator.h | 8 ++
25 files changed, 646 insertions(+), 60 deletions(-)
diff --git a/src/core/CLucene/index/DocRange.h
b/src/core/CLucene/index/DocRange.h
index ef7906a24fb..ab417ce5877 100644
--- a/src/core/CLucene/index/DocRange.h
+++ b/src/core/CLucene/index/DocRange.h
@@ -23,8 +23,11 @@ class DocRange {
uint32_t doc_many_size_ = 0;
uint32_t freq_many_size_ = 0;
+ uint32_t norm_many_size_ = 0;
+
std::vector<uint32_t>* doc_many = nullptr;
std::vector<uint32_t>* freq_many = nullptr;
+ std::vector<uint32_t>* norm_many = nullptr;
std::pair<uint32_t, uint32_t> doc_range;
};
\ No newline at end of file
diff --git a/src/core/CLucene/index/IndexReader.cpp
b/src/core/CLucene/index/IndexReader.cpp
index 5b9f8ad2624..41b055181b0 100644
--- a/src/core/CLucene/index/IndexReader.cpp
+++ b/src/core/CLucene/index/IndexReader.cpp
@@ -251,7 +251,7 @@ CL_NS_DEF(index)
return SegmentInfos::getCurrentSegmentGeneration(directory) != -1;
}
- TermDocs* IndexReader::termDocs(Term* term) {
+ TermDocs* IndexReader::termDocs(Term* term, bool load_stats) {
//Func - Returns an enumeration of all the documents which contain
// term. For each document, the document number, the frequency of
// the term in that document is also provided, for use in search
scoring.
@@ -269,13 +269,13 @@ CL_NS_DEF(index)
ensureOpen();
//Reference an instantiated TermDocs instance
TermDocs* _termDocs = termDocs();
- //Seek all documents containing term
- _termDocs->seek(term);
+ //Seek all documents containing term
+ _termDocs->seek(term, load_stats);
//return the enumaration
return _termDocs;
}
- TermPositions* IndexReader::termPositions(Term* term){
+ TermPositions* IndexReader::termPositions(Term* term, bool load_stats){
//Func - Returns an enumeration of all the documents which contain term.
For each
// document, in addition to the document number and frequency of the
term in
// that document, a list of all of the ordinal positions of the term
in the document
@@ -296,7 +296,7 @@ CL_NS_DEF(index)
//Reference an instantiated termPositions instance
TermPositions* _termPositions = termPositions();
//Seek all documents containing term
- _termPositions->seek(term);
+ _termPositions->seek(term, load_stats);
//return the enumeration
return _termPositions;
}
diff --git a/src/core/CLucene/index/IndexReader.h
b/src/core/CLucene/index/IndexReader.h
index 4307a0d9332..a61a1a2ec57 100644
--- a/src/core/CLucene/index/IndexReader.h
+++ b/src/core/CLucene/index/IndexReader.h
@@ -15,6 +15,8 @@
#include "CLucene/index/IndexVersion.h"
#include "CLucene/index/_FieldInfos.h"
+#include <optional>
+
CL_CLASS_DEF(store,Directory)
CL_CLASS_DEF(store,LuceneLock)
CL_CLASS_DEF(document,Document)
@@ -59,7 +61,6 @@ class CLUCENE_EXPORT IndexReader: public
CL_NS(util)::NamedObject{
bool closed;
protected:
bool hasChanges;
-
/**
* Legacy Constructor for backwards compatibility.
*
@@ -560,6 +561,14 @@ public:
*/
virtual int32_t docFreq(const Term* t) = 0;
+ /** Returns the norm of document whose id is <code>doc</code> in the
<code>field</code>.
+ */
+ virtual int32_t docNorm(const TCHAR* field, int32_t doc) = 0;
+
+ /** Returns the total norm of all terms appeared in all documents
+ */
+ virtual std::optional<uint64_t> sumTotalTermFreq(const TCHAR* field) =
0;
+
/* Returns an unpositioned TermPositions enumerator.
* @throws IOException if there is a low-level IO error
* @memory Caller must clean up
@@ -584,7 +593,7 @@ public:
* @throws IOException if there is a low-level IO error
* @memory Caller must clean up
*/
- TermPositions* termPositions(Term* term);
+ TermPositions* termPositions(Term* term, bool load_stats = false);
/** Returns an unpositioned {@link TermDocs} enumerator.
* @throws IOException if there is a low-level IO error
@@ -602,7 +611,7 @@ public:
* @throws IOException if there is a low-level IO error
* @memory Caller must clean up
*/
- TermDocs* termDocs(Term* term);
+ TermDocs* termDocs(Term* term, bool load_stats = false);
/** Deletes the document numbered <code>docNum</code>. Once a document
is
* deleted it will not appear in TermDocs or TermPostitions enumerations.
diff --git a/src/core/CLucene/index/IndexWriter.cpp
b/src/core/CLucene/index/IndexWriter.cpp
index a3b30848af6..7ede27aec2f 100644
--- a/src/core/CLucene/index/IndexWriter.cpp
+++ b/src/core/CLucene/index/IndexWriter.cpp
@@ -1341,10 +1341,52 @@ void
IndexWriter::indexCompaction(std::vector<lucene::store::Directory *> &src_d
std::vector<lucene::index::IndexWriter *> destIndexWriterList;
std::vector<lucene::store::IndexOutput *> nullBitmapIndexOutputList;
+ std::vector<lucene::store::IndexOutput *> normsOutputList;
+
+ // first level vector index is src_index_id
+ // <TCHAR, ValueArray<uint8_t>> key is field name, value is the norm of
src_doc_id
+ std::vector<map<TCHAR, std::vector<uint8_t>>>
srcFieldNormsMapValues(numIndices);
+
try {
/// merge fields
mergeFields(hasProx);
+ // check if field has norms
+ bool hasNorms = false;
+ {
+ for (size_t i = 0; i < fieldInfos->size(); i++) {
+ //Get the i-th FieldInfo
+ FieldInfo* fi = fieldInfos->fieldInfo(i);
+ // Is this Field indexed and field need norms ?
+ if (fi->isIndexed && !fi->omitNorms) {
+ hasNorms = true;
+ }
+ }
+ }
+
+ if (hasNorms) {
+ for (int srcIndex = 0; srcIndex < numIndices; srcIndex++) {
+ auto reader = readers[srcIndex];
+ for (size_t i = 0; i < fieldInfos->size(); i++) {
+ //Get the i-th FieldInfo
+ FieldInfo* fi = fieldInfos->fieldInfo(i);
+ // Is this Field indexed and field need norms ?
+ if (fi->isIndexed && !fi->omitNorms) {
+ CL_NS(util)::ValueArray<uint8_t> normBuffer;
+ size_t maxDoc = reader->maxDoc();
+ if ( normBuffer.length < maxDoc){
+ normBuffer.resize(maxDoc);
+ memset(normBuffer.values, 0, sizeof(uint8_t) *
maxDoc);
+ }
+ reader->norms(fi->name, normBuffer.values);
+ for (int j = 0; j < normBuffer.length; j++) {
+
srcFieldNormsMapValues[srcIndex][*fi->name].emplace_back(normBuffer.values[j]);
+ }
+ }
+ }
+ }
+ }
+
/// write fields and create files writers
for (int j = 0; j < numDestIndexes; j++) {
auto dest_dir = dest_dirs[j];
@@ -1385,6 +1427,13 @@ void
IndexWriter::indexCompaction(std::vector<lucene::store::Directory *> &src_d
maxSkipLevels = termInfosWriter->maxSkipLevels;
skipListWriterList.push_back(_CLNEW
DefaultSkipListWriter(skipInterval, maxSkipLevels, (int) dest_index_docs[j],
freqOutputList[j], proxOutputList[j]));
+ if (hasNorms) {
+ // create norms output
+ auto* norms_out =
dest_dir->createOutput(Misc::segmentname(segment.c_str(), ".nrm").c_str());
+ norms_out->writeBytes(SegmentMerger::NORMS_HEADER,
SegmentMerger::NORMS_HEADER_length);
+ normsOutputList.push_back(norms_out);
+ }
+
// create null_bitmap index output
auto* null_bitmap_out =
dest_dir->createOutput(NULL_BITMAP_FILE_NAME);
nullBitmapIndexOutputList.push_back(null_bitmap_out);
@@ -1393,6 +1442,11 @@ void
IndexWriter::indexCompaction(std::vector<lucene::store::Directory *> &src_d
/// merge terms
mergeTerms(hasProx);
+ /// merge norms if have
+ if (hasNorms){
+ mergeNorms(dest_index_docs, srcFieldNormsMapValues,
normsOutputList);
+ }
+
/// merge null_bitmap
mergeNullBitmap(srcNullBitmapValues, nullBitmapIndexOutputList);
} catch (CLuceneError &e) {
@@ -1432,7 +1486,14 @@ void
IndexWriter::indexCompaction(std::vector<lucene::store::Directory *> &src_d
r->close();
_CLDELETE(r);
}
- } readers.clear(););
+ } readers.clear(););
+ for (auto* norms_out
+ : normsOutputList) {
+ if (norms_out != nullptr) {
+ norms_out->close();
+ _CLDELETE(norms_out);
+ }
+ } normsOutputList.clear();
for (auto* null_bitmap_out
: nullBitmapIndexOutputList) {
if (null_bitmap_out != nullptr) {
@@ -1889,6 +1950,71 @@ void IndexWriter::mergeTerms(bool hasProx) {
}
}
+void IndexWriter::mergeNorms(std::vector<uint32_t> dest_index_docs,
+ std::vector<std::map<TCHAR,
std::vector<uint8_t>>> srcFieldNormsMapValues,
+ std::vector<lucene::store::IndexOutput *>
normsOutputList) {
+ //Func - Merges the norms for all fields
+ //Pre - fieldInfos != NULL
+ //Post - The norms for all fields have been merged
+ CND_PRECONDITION(fieldInfos != NULL, "fieldInfos is NULL");
+
+ std::vector<std::map<TCHAR, std::vector<uint8_t>>>
destFieldNormsMapValues(numDestIndexes);
+
+ // iterate srcFieldNormsValues to construct destFieldNormsMapValues
+ for (size_t srcIndex = 0; srcIndex < srcFieldNormsMapValues.size();
++srcIndex) {
+ std::map<TCHAR, std::vector<uint8_t>> &srcFieldNormsMap =
srcFieldNormsMapValues[srcIndex];
+ if (srcFieldNormsMap.empty()) {
+ // empty indicates there is no nrm file in this index
+ continue;
+ }
+ // find field has norms
+ for (int j =0; j < fieldInfos->size(); j++) {
+ FieldInfo* fi = fieldInfos->fieldInfo(j);
+ TCHAR fieldName = *fi->name;
+ // Is this Field indexed and field need norms ?
+ if (fi->isIndexed && !fi->omitNorms) {
+ auto& srcFieldNorms = srcFieldNormsMap[fieldName];
+ // construct srcFieldNorms to destFieldNorms
+ for (int srcDocId = 0; srcDocId < srcFieldNorms.size();
srcDocId++) {
+ auto destIdx = _trans_vec[srcIndex][srcDocId].first;
+ auto destDocId = _trans_vec[srcIndex][srcDocId].second;
+ if (destIdx == UINT32_MAX || destDocId == UINT32_MAX) {
+ continue;
+ }
+ auto destDocCount = dest_index_docs[destIdx];
+ auto& destFieldNormsMap = destFieldNormsMapValues[destIdx];
+ if (destFieldNormsMap.find(fieldName) ==
destFieldNormsMap.end()) {
+ destFieldNormsMap[fieldName].resize(destDocCount);
+
std::fill(destFieldNormsMap[fieldName].begin(),destFieldNormsMap[fieldName].end(),
0);
+ }
+ auto& destFieldNorms = destFieldNormsMap[fieldName];
+ destFieldNorms[destDocId] = srcFieldNorms[srcDocId];
+ destFieldNormsMap[fieldName] = destFieldNorms;
+ }
+ }
+ }
+ }
+
+ // construct nrm and write nrm to dest index
+ for (size_t i = 0; i < destFieldNormsMapValues.size(); ++i) {
+ auto& destFieldNormsMap = destFieldNormsMapValues[i];
+ for (int j =0; j < fieldInfos->size(); j++) {
+ FieldInfo* fi = fieldInfos->fieldInfo(j);
+ TCHAR fieldName = *fi->name;
+ auto destDocCount = dest_index_docs[i];
+ if (fi->isIndexed && !fi->omitNorms) {
+ // if not find then norm is zero
+ if (destFieldNormsMap.find(fieldName) ==
destFieldNormsMap.end()) {
+ destFieldNormsMap[fieldName].resize(destDocCount);
+
std::fill(destFieldNormsMap[fieldName].begin(),destFieldNormsMap[fieldName].end(),
0);
+ }
+ auto& destFieldNorms = destFieldNormsMap[fieldName];
+ normsOutputList[i]->writeBytes(destFieldNorms.data(),
destDocCount);
+ }
+ }
+ }
+}
+
void IndexWriter::mergeNullBitmap(std::vector<std::vector<uint32_t>>
srcNullBitmapValues, std::vector<lucene::store::IndexOutput *>
nullBitmapIndexOutputList) {
// first level vector index is dest_index_id
// second level vector index is dest_doc_id
diff --git a/src/core/CLucene/index/IndexWriter.h
b/src/core/CLucene/index/IndexWriter.h
index 7cfb67d2ca7..7765a2362f3 100644
--- a/src/core/CLucene/index/IndexWriter.h
+++ b/src/core/CLucene/index/IndexWriter.h
@@ -324,7 +324,9 @@ public:
// write fields info file
void writeFields(lucene::store::Directory* d, std::string segment);
// merge terms and write files
- void mergeTerms(bool hasProx);
+ void mergeTerms(bool hasProx, IndexVersion indexVersion);
+ // merge norms and write files
+ void mergeNorms(std::vector<uint32_t> dest_index_docs,
std::vector<std::map<TCHAR, std::vector<uint8_t>>> srcFieldNormsMapValues,
std::vector<lucene::store::IndexOutput *> normsOutputList);
// merge null_bitmap
void mergeNullBitmap(std::vector<std::vector<uint32_t>> srcBitmapValues,
std::vector<lucene::store::IndexOutput *> nullBitmapIndexOutputList);
diff --git a/src/core/CLucene/index/MultiReader.cpp
b/src/core/CLucene/index/MultiReader.cpp
index 726b6e3dac5..8535a116507 100644
--- a/src/core/CLucene/index/MultiReader.cpp
+++ b/src/core/CLucene/index/MultiReader.cpp
@@ -271,6 +271,38 @@ int32_t MultiReader::docFreq(const Term* t) {
return total;
}
+int32_t MultiReader::docNorm(const TCHAR* field, int32_t n) {
+ ensureOpen();
+ if (hasNorms(field)) {
+ int32_t i = readerIndex(n);
+ return (*subReaders)[i]->docNorm(field, n - starts[i]);
+ }
+ return 0;
+};
+
+std::optional<uint64_t> MultiReader::sumTotalTermFreq(const TCHAR* field) {
+ ensureOpen();
+
+ if (hasNorms(field)) {
+ int64_t sum = 0;
+ bool hasTotalNorm = false;
+ for (size_t i = 0; i < subReaders->length; i++) {
+ if(!isDeleted(i)) {
+ std::optional<int64_t> totalNorm =
(*subReaders)[i]->sumTotalTermFreq(field);
+ if (totalNorm != std::nullopt) {
+ hasTotalNorm = true;
+ sum += totalNorm.value();
+ }
+ }
+ }
+ if (hasTotalNorm) {
+ return sum;
+ }
+ }
+
+ return std::nullopt;
+}
+
TermDocs* MultiReader::termDocs() {
ensureOpen();
TermDocs* ret = _CLNEW MultiTermDocs(subReaders, starts);
diff --git a/src/core/CLucene/index/MultiReader.h
b/src/core/CLucene/index/MultiReader.h
index 301d1422e2c..aa1bbed0227 100644
--- a/src/core/CLucene/index/MultiReader.h
+++ b/src/core/CLucene/index/MultiReader.h
@@ -100,6 +100,13 @@ public:
//Returns the document frequency of the current term in the set
int32_t docFreq(const Term* t=NULL);
+
+ // Returns the document norm
+ int32_t docNorm(const TCHAR* field, int32_t n);
+
+ // Returns the total norm of all terms appeared in all documents in
this field
+ std::optional<uint64_t> sumTotalTermFreq(const TCHAR* field);
+
TermDocs* termDocs();
TermPositions* termPositions();
diff --git a/src/core/CLucene/index/MultiSegmentReader.cpp
b/src/core/CLucene/index/MultiSegmentReader.cpp
index b4be5f01298..4c5a527663a 100644
--- a/src/core/CLucene/index/MultiSegmentReader.cpp
+++ b/src/core/CLucene/index/MultiSegmentReader.cpp
@@ -355,6 +355,34 @@ int32_t MultiSegmentReader::docFreq(const Term* t) {
return total;
}
+int32_t MultiSegmentReader::docNorm(const TCHAR* field, int32_t n) {
+ if (hasNorms(field)) {
+ int32_t i = readerIndex(n); // find segment
num
+ return (*subReaders)[i]->docNorm(field,n - starts[i]);
+ }
+ return 0;
+}
+
+std::optional<uint64_t> MultiSegmentReader::sumTotalTermFreq(const TCHAR*
field) {
+ if (hasNorms(field)) {
+ int64_t sum = 0;
+ bool hasTotalNorm = false;
+ for (size_t i = 0; i < subReaders->length; i++) {
+ if (!isDeleted(i)) {
+ std::optional<int64_t> totalNorm =
(*subReaders)[i]->sumTotalTermFreq(field);
+ if (totalNorm != std::nullopt) {
+ sum += totalNorm.value();
+ hasTotalNorm = true;
+ }
+ }
+ }
+ if (hasTotalNorm) {
+ return sum;
+ }
+ }
+ return std::nullopt;
+}
+
TermDocs* MultiSegmentReader::termDocs() {
ensureOpen();
TermDocs* ret = _CLNEW MultiTermDocs(subReaders, starts);
@@ -559,6 +587,10 @@ int32_t MultiTermDocs::docFreq() {
return docFreq;
}
+int32_t MultiTermDocs::docNorm() {
+ return current->docNorm();
+}
+
int32_t MultiTermDocs::doc() const {
CND_PRECONDITION(current!=NULL,"current==NULL, check that next() was
called");
// if not found term, current will return INT_MAX, we could not add base,
otherwise it will overflow.
@@ -572,11 +604,16 @@ int32_t MultiTermDocs::freq() const {
return current->freq();
}
-void MultiTermDocs::seek(TermEnum* termEnum){
- seek(termEnum->term(false));
+int32_t MultiTermDocs::norm() const {
+ CND_PRECONDITION(current!=NULL,"current==NULL, check that next() was
called");
+ return current->norm();
}
-void MultiTermDocs::seek( Term* tterm) {
+void MultiTermDocs::seek(TermEnum* termEnum, bool load_stats){
+ seek(termEnum->term(false), load_stats);
+}
+
+void MultiTermDocs::seek( Term* tterm, bool load_stats) {
//Func - Resets the instance for a new search
//Pre - tterm != NULL
//Post - The instance has been reset for a new search
@@ -645,6 +682,28 @@ int32_t MultiTermDocs::read(int32_t* docs, int32_t* freqs,
int32_t length) {
}
}
+int32_t MultiTermDocs::read(int32_t* docs, int32_t* freqs, int32_t* norms,
int32_t length) {
+ while (true) {
+ while (current == NULL) {
+ if (pointer < subReaders->length) { // try next
segment
+ base = starts[pointer];
+ current = termDocs(pointer++);
+ } else {
+ return 0;
+ }
+ }
+ int32_t end = current->read(docs, freqs, norms, length);
+ if (end == 0) { // none left in
segment
+ current = NULL;
+ } else { // got some
+ int32_t b = base; // adjust doc numbers
+ for (int32_t i = 0; i < end; i++)
+ docs[i] += b;
+ return end;
+ }
+ }
+}
+
bool MultiTermDocs::readRange(DocRange* docRange) {
while (true) {
while (current == NULL) {
@@ -727,7 +786,7 @@ TermDocs* MultiTermDocs::termDocs(IndexReader* reader) {
return reader->termDocs();
}
-TermDocs* MultiTermDocs::termDocs(const int32_t i) {
+TermDocs* MultiTermDocs::termDocs(const int32_t i, bool local_stats) {
if (term == NULL)
return NULL;
TermDocs* result = (*readerTermDocs)[i];
@@ -736,7 +795,7 @@ TermDocs* MultiTermDocs::termDocs(const int32_t i) {
readerTermDocs->values[i] = termDocs((*subReaders)[i]);
result = (*readerTermDocs)[i];
}
- result->seek(term);
+ result->seek(term, local_stats);
return result;
}
diff --git a/src/core/CLucene/index/MultipleTermPositions.cpp
b/src/core/CLucene/index/MultipleTermPositions.cpp
index e5bfa5ac24a..b5846516f76 100644
--- a/src/core/CLucene/index/MultipleTermPositions.cpp
+++ b/src/core/CLucene/index/MultipleTermPositions.cpp
@@ -14,11 +14,11 @@ CL_NS_USE(util)
CL_NS_DEF(index)
-void MultipleTermPositions::seek(Term*) {
+void MultipleTermPositions::seek(Term*, bool) {
_CLTHROWA(CL_ERR_UnsupportedOperation, "Unsupported operation:
MultipleTermPositions::seek");
}
-void MultipleTermPositions::seek(TermEnum*) {
+void MultipleTermPositions::seek(TermEnum*, bool) {
_CLTHROWA(CL_ERR_UnsupportedOperation, "Unsupported operation:
MultipleTermPositions::seek");
}
@@ -26,6 +26,10 @@ int32_t MultipleTermPositions::read(int32_t*,
int32_t*,int32_t) {
_CLTHROWA(CL_ERR_UnsupportedOperation, "Unsupported operation:
MultipleTermPositions::read");
}
+int32_t MultipleTermPositions::read(int32_t*, int32_t*, int32_t*, int32_t) {
+ _CLTHROWA(CL_ERR_UnsupportedOperation, "Unsupported operation:
MultipleTermPositions::read");
+}
+
bool MultipleTermPositions::readRange(DocRange* docRange) {
_CLTHROWA(CL_ERR_UnsupportedOperation, "Unsupported operation:
MultipleTermPositions::readRange");
}
@@ -144,6 +148,7 @@ bool MultipleTermPositions::next() {
_posList->clear();
_doc = _termPositionsQueue->peek()->doc();
+ _norm = _termPositionsQueue->peek()->norm();
TermPositions* tp;
do {
@@ -163,7 +168,6 @@ bool MultipleTermPositions::next() {
_posList->sort();
_freq = _posList->size();
-
return true;
}
@@ -192,6 +196,10 @@ int32_t MultipleTermPositions::freq() const {
return _freq;
}
+int32_t MultipleTermPositions::norm() const {
+ return _norm;
+}
+
void MultipleTermPositions::close() {
while (_termPositionsQueue->size() > 0) {
TermPositions* tp = _termPositionsQueue->pop();
diff --git a/src/core/CLucene/index/MultipleTermPositions.h
b/src/core/CLucene/index/MultipleTermPositions.h
index 67d03615f62..8ef7be1ac56 100644
--- a/src/core/CLucene/index/MultipleTermPositions.h
+++ b/src/core/CLucene/index/MultipleTermPositions.h
@@ -21,8 +21,9 @@ private:
class IntQueue;
int32_t _doc;
- int32_t _freq;
- TermPositionsQueue* _termPositionsQueue;
+ int32_t _freq;
+ int32_t _norm;
+ TermPositionsQueue* _termPositionsQueue;
IntQueue* _posList;
public:
@@ -44,25 +45,28 @@ public:
int32_t freq() const;
+ int32_t norm() const;
+
void close();
/**
* Not implemented.
* @throws UnsupportedOperationException
*/
- void seek(Term*);
+ void seek(Term*, bool);
/**
* Not implemented.
* @throws UnsupportedOperationException
*/
- void seek(TermEnum*);
+ void seek(TermEnum*, bool);
/**
* Not implemented.
* @throws UnsupportedOperationException
*/
int32_t read(int32_t*, int32_t*,int32_t);
+ int32_t read(int32_t*, int32_t*, int32_t*, int32_t);
bool readRange(DocRange* docRange) override;
/**
diff --git a/src/core/CLucene/index/SegmentMerger.cpp
b/src/core/CLucene/index/SegmentMerger.cpp
index cc910b02c88..f0988ed4c9c 100644
--- a/src/core/CLucene/index/SegmentMerger.cpp
+++ b/src/core/CLucene/index/SegmentMerger.cpp
@@ -739,7 +739,7 @@ void SegmentMerger::mergeNorms() {
for (size_t i = 0; i < fieldInfos->size(); i++) {
//Get the i-th FieldInfo
FieldInfo* fi = fieldInfos->fieldInfo(i);
- //Is this Field indexed?
+ // Is this Field indexed and field need norms ?
if (fi->isIndexed && !fi->omitNorms){
//Instantiate an IndexOutput to that norm file
if (output == NULL) {
diff --git a/src/core/CLucene/index/SegmentReader.cpp
b/src/core/CLucene/index/SegmentReader.cpp
index 721263664fa..5c5ede1063e 100644
--- a/src/core/CLucene/index/SegmentReader.cpp
+++ b/src/core/CLucene/index/SegmentReader.cpp
@@ -4,6 +4,9 @@
* Distributable under the terms of either the Apache License (Version 2.0) or
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
+#include <assert.h>
+#include <s2/base/integral_types.h>
+
#include "CLucene/_ApiHeader.h"
#include "CLucene/search/Similarity.h"
#include "CLucene/store/FSDirectory.h"
@@ -17,7 +20,6 @@
#include "_SegmentHeader.h"
#include "_SegmentMerger.h"
#include "_TermInfosReader.h"
-#include <assert.h>
CL_NS_USE(util)
CL_NS_USE(store)
@@ -199,8 +201,8 @@ void SegmentReader::initialize(SegmentInfo *si, int32_t
readBufferSize, bool doO
if (_fieldInfos->hasProx()) {
proxStream = cfsDir->openInput((segment + ".prx").c_str(),
readBufferSize);
}
- // we do not need norms, so we don't read it at all.
- //openNorms(cfsDir, readBufferSize);
+
+ openNorms(cfsDir, readBufferSize);
if (doOpenStores && _fieldInfos->hasVectors()) {// open term vector
files only as needed
string vectorsSegment;
@@ -546,6 +548,31 @@ int32_t SegmentReader::docFreq(const Term *t) {
return 0;
}
+int32_t SegmentReader::docNorm(const TCHAR* field, int32_t doc) {
+ //Func - Returns the norm of document whose id is doc in this field
+ //Pre - field has norm file
+ //Post - The norm of document whose id is doc in this field has been
returned, otherwise -1.0f;
+
+ ensureOpen();
+
+ if (hasNorms(field)) {
+ SCOPED_LOCK_MUTEX(THIS_LOCK)
+ uint8_t* field_norms = norms(field);
+ return search::Similarity::decodeNorm(field_norms[doc]);
+ }
+ return 0;
+}
+
+std::optional<uint64_t> SegmentReader::sumTotalTermFreq(const TCHAR* field) {
+ //Func - Returns the sum number of all terms in all docs
+ //Pre - field has norm file;
+ //Post - The sum number of all terms in all docs has been returned,
otherwise -1.0f;
+ if (hasNorms(field)) {
+ return sum_total_term_freq[*field];
+ }
+ return std::nullopt;
+}
+
int32_t SegmentReader::numDocs() {
//Func - Returns the actual number of documents in the segment
//Pre - true
@@ -654,6 +681,53 @@ void SegmentReader::norms(const TCHAR *field, uint8_t
*bytes) {
}
+ {
+ SCOPED_LOCK_MUTEX(norm->THIS_LOCK)
+ if (norm->bytes != NULL) {// can copy from cache
+ memcpy(bytes, norm->bytes, maxDoc());
+ return;
+ }
+
+ // Read from disk. norm.in may be shared across multiple norms and
+ // should only be used in a synchronized context.
+ IndexInput *normStream;
+ if (norm->useSingleNormStream) {
+ normStream = singleNormStream;
+ } else {
+ normStream = norm->in;
+ }
+ normStream->seek(norm->normSeek);
+ normStream->readBytes(bytes, maxDoc());
+ }
+}
+uint8_t* SegmentReader::norms(const TCHAR *field) const {
+ CND_PRECONDITION(field != NULL, "field is NULL");
+ Norm *norm = _norms.get(field);
+ if (norm == NULL) {
+ return NULL;
+ }
+ {
+ SCOPED_LOCK_MUTEX(norm->THIS_LOCK)
+ if (norm->bytes == NULL) {// value not yet read
+ uint8_t *bytes = _CL_NEWARRAY(uint8_t, maxDoc());
+ norms(field, bytes);
+ norm->bytes = bytes;// cache it
+ // it's OK to close the underlying IndexInput as we have cached the
+ // norms and will never read them again.
+ norm->close();
+ }
+
+ return norm->bytes;
+ }
+}
+
+void SegmentReader::norms(const TCHAR *field, uint8_t* bytes) const {
+ CND_PRECONDITION(field != NULL, "field is NULL");
+ Norm *norm = _norms.get(field);
+ if (norm == NULL) {
+ return;
+ }
+
{
SCOPED_LOCK_MUTEX(norm->THIS_LOCK)
if (norm->bytes != NULL) {// can copy from cache
@@ -677,14 +751,15 @@ void SegmentReader::norms(const TCHAR *field, uint8_t
*bytes) {
uint8_t *SegmentReader::createFakeNorms(int32_t size) {
uint8_t *ones = _CL_NEWARRAY(uint8_t, size);
if (size > 0)
- memset(ones, DefaultSimilarity::encodeNorm(1.0f), size);
+ memset(ones, Similarity::encodeNorm(0), size);
return ones;
}
uint8_t *SegmentReader::fakeNorms() {
if (ones == NULL)
- // ones = createFakeNorms(maxDoc());
- ones = createFakeNorms(1);
+ // TODO: this is origin clucene norms
+ ones = createFakeNorms(maxDoc());
+ // ones = createFakeNorms(1);
return ones;
}
// can return NULL if norms aren't stored
@@ -748,12 +823,11 @@ uint8_t *SegmentReader::norms(const TCHAR *field) {
// and returned containing the norms for that field. If the named
field is unknown NULL is returned.
CND_PRECONDITION(field != NULL, "field is NULL");
- // SCOPED_LOCK_MUTEX(THIS_LOCK)
- // ensureOpen();
- // uint8_t *bytes = getNorms(field);
- // if (bytes == NULL)
- // bytes = fakeNorms();
- uint8_t *bytes = fakeNorms();
+ SCOPED_LOCK_MUTEX(THIS_LOCK)
+ ensureOpen();
+ uint8_t *bytes = getNorms(field);
+ if (bytes == NULL)
+ bytes = fakeNorms();
return bytes;
}
@@ -826,6 +900,26 @@ void SegmentReader::openNorms(Directory *cfsDir, int32_t
readBufferSize) {
}
_norms[fi->name] = _CLNEW Norm(normInput, singleNormFile,
fi->number, normSeek, this, segment.c_str());
+
+ // read total norm info into cache
+ uint8_t *bytes = _CL_NEWARRAY(uint8_t, _maxDoc);
+ IndexInput *normStream;
+ if (_norms[fi->name]->useSingleNormStream) {
+ normStream = singleNormStream;
+ } else {
+ normStream = _norms[fi->name]->in;
+ }
+
+ ensureOpen();
+ SCOPED_LOCK_MUTEX(_norms[fi->name]->THIS_LOCK);
+ normStream->seek(_norms[fi->name]->normSeek);
+ normStream->readBytes(bytes, _maxDoc);
+ uint64_t sum = 0;
+ for (int doc = 0; doc < _maxDoc; doc++) {
+ sum += Similarity::decodeNorm(bytes[doc]);
+ }
+ sum_total_term_freq[*fi->name] = sum;
+
nextNormSeek += _maxDoc;// increment also if some norms are
separate
}
}
diff --git a/src/core/CLucene/index/SegmentTermDocs.cpp
b/src/core/CLucene/index/SegmentTermDocs.cpp
index e346dc0ca24..a702f702bee 100644
--- a/src/core/CLucene/index/SegmentTermDocs.cpp
+++ b/src/core/CLucene/index/SegmentTermDocs.cpp
@@ -11,6 +11,7 @@
#include "CLucene/index/CodeMode.h"
#include "CLucene/util/PFORUtil.h"
#include "Term.h"
+#include "CLucene/search/Similarity.h"
#include <assert.h>
#include <memory>
@@ -19,10 +20,10 @@
CL_NS_DEF(index)
SegmentTermDocs::SegmentTermDocs(const SegmentReader *_parent) :
parent(_parent), freqStream(_parent->freqStream->clone()),
- count(0),
df(0), deletedDocs(_parent->deletedDocs), _doc(-1), _freq(0),
skipInterval(_parent->tis->getSkipInterval()),
+ count(0),
df(0), maxDoc(_parent->maxDoc()), deletedDocs(_parent->deletedDocs), _doc(-1),
_freq(0), skipInterval(_parent->tis->getSkipInterval()),
maxSkipLevels(_parent->tis->getMaxSkipLevels()), skipListReader(NULL),
freqBasePointer(0), proxBasePointer(0),
skipPointer(0), haveSkipped(false), pointer(0), pointerMax(0),
indexVersion_(_parent->_fieldInfos->getIndexVersion()),
-
hasProx(_parent->_fieldInfos->hasProx()), buffer_(freqStream, hasProx,
indexVersion_) {
+
hasProx(_parent->_fieldInfos->hasProx()), buffer_(freqStream, hasProx, maxDoc,
indexVersion_) {
CND_CONDITION(_parent != NULL, "Parent is NULL");
memset(docs,0,PFOR_BLOCK_SIZE*sizeof(int32_t));
memset(freqs,0,PFOR_BLOCK_SIZE*sizeof(int32_t));
@@ -40,13 +41,23 @@ int32_t SegmentTermDocs::docFreq() {
return df;
}
-void SegmentTermDocs::seek(Term *term) {
+int32_t SegmentTermDocs::docNorm() {
+ if (_doc < 0 || _doc >= LUCENE_INT32_MAX_SHOULDBE) {
+ return 0;
+ }
+ if (_doc < maxDoc) {
+ return norms[_doc];
+ }
+ return 0;
+}
+
+void SegmentTermDocs::seek(Term *term, bool load_stats) {
TermInfo *ti = parent->tis->get(term);
- seek(ti, term);
+ seek(ti, term, load_stats);
_CLDELETE(ti);
}
-void SegmentTermDocs::seek(TermEnum *termEnum) {
+void SegmentTermDocs::seek(TermEnum *termEnum, bool load_stats) {
TermInfo *ti = NULL;
Term *term = NULL;
@@ -61,13 +72,19 @@ void SegmentTermDocs::seek(TermEnum *termEnum) {
ti = parent->tis->get(term);
}
- seek(ti, term);
+ seek(ti, term, load_stats);
_CLDELETE(ti);
}
-void SegmentTermDocs::seek(const TermInfo *ti, Term *term) {
+void SegmentTermDocs::seek(const TermInfo *ti, Term *term, bool load_stats) {
count = 0;
FieldInfo *fi = parent->_fieldInfos->fieldInfo(term->field());
currentFieldStoresPayloads = (fi != NULL) ? fi->storePayloads : false;
+ buffer_.needLoadStats(load_stats);
+ if (load_stats && fi != NULL && fi->isIndexed && !fi->omitNorms) {
+ const TCHAR *curField = fi->name;
+ norms = parent->norms(curField);
+ buffer_.setAllDocNorms(norms);
+ }
// hasProx = (fi != nullptr) && fi->hasProx;
if (ti == NULL) {
df = 0;
@@ -93,6 +110,9 @@ int32_t SegmentTermDocs::doc() const {
int32_t SegmentTermDocs::freq() const {
return _freq;
}
+int32_t SegmentTermDocs::norm() const {
+ return _norm;
+}
bool SegmentTermDocs::next() {
if (count == df) {
@@ -104,6 +124,7 @@ bool SegmentTermDocs::next() {
if (hasProx) {
_freq = buffer_.getFreq();
}
+ _norm = buffer_.getNorm();
count++;
@@ -125,6 +146,7 @@ int32_t SegmentTermDocs::read(int32_t *docs, int32_t
*freqs, int32_t length) {
_freq = buffer_.getFreq();
freqs[i] = _freq;
}
+ _norm = buffer_.getNorm();
count++;
i++;
@@ -133,6 +155,31 @@ int32_t SegmentTermDocs::read(int32_t *docs, int32_t
*freqs, int32_t length) {
return i;
}
+int32_t SegmentTermDocs::read(int32_t *docs, int32_t *freqs, int32_t *norms,
int32_t length) {
+ int32_t i = 0;
+
+ if (count == df) {
+ return i;
+ }
+
+ while (i < length && count < df) {
+ _doc = buffer_.getDoc();
+ docs[i] = _doc;
+
+ if (hasProx) {
+ _freq = buffer_.getFreq();
+ freqs[i] = _freq;
+ }
+
+ _norm = buffer_.getNorm();
+ norms[i] = _norm;
+
+ count++;
+ i++;
+ }
+
+ return i;
+}
bool SegmentTermDocs::readRange(DocRange* docRange) {
if (count >= df) {
return false;
@@ -189,8 +236,8 @@ bool SegmentTermDocs::skipTo(const int32_t target) {
void TermDocsBuffer::refill() {
cur_doc_ = 0;
cur_freq_ = 0;
-
- if (indexVersion_ == IndexVersion::kV1) {
+ cur_norm_ = 0;
+ if (indexVersion_ >= IndexVersion::kV1) {
size_ = refillV1();
} else {
size_ = refillV0();
@@ -211,8 +258,26 @@ void TermDocsBuffer::readRange(DocRange* docRange) {
docRange->freq_many = &freqs_;
docRange->freq_many_size_ = size;
}
+
+ if (load_stats_) {
+ docRange->norm_many = &norms_;
+ docRange->norm_many_size_ = size;
+ }
+
+
}
+void TermDocsBuffer::setAllDocNorms(uint8_t* norms) {
+ if(load_stats_ && norms) {
+ all_doc_norms_ = norms;
+ }
+}
+
+void TermDocsBuffer::needLoadStats(bool load_stats) {
+ load_stats_ = load_stats;
+}
+
+
int32_t TermDocsBuffer::refillV0() {
if (hasProx_) {
char mode = freqStream_->readByte();
@@ -243,6 +308,7 @@ int32_t TermDocsBuffer::refillV0() {
}
}
}
+ refillNorm(arraySize);
return arraySize;
} else {
uint32_t arraySize = freqStream_->readVInt();
@@ -261,6 +327,7 @@ int32_t TermDocsBuffer::refillV0() {
P4DEC(buf.data(), arraySize, docs_.data());
}
}
+ refillNorm(arraySize);
return arraySize;
}
}
@@ -299,7 +366,23 @@ int32_t TermDocsBuffer::refillV1() {
}
}
}
+ refillNorm(arraySize);
return arraySize;
}
+void TermDocsBuffer::refillNorm(int32_t size) {
+ if (!load_stats_) {
+ return;
+ }
+
+ for (int i = 0 ;i < size; i++) {
+ auto doc = docs_[i];
+ // Guard against an out-of-range doc id or a missing norms array.
+ if (doc < maxDoc && all_doc_norms_) {
+ norms_[i] = search::Similarity::decodeNorm(all_doc_norms_[doc]);
+ } else {
+ norms_[i] = 0;
+ }
+ }
+}
CL_NS_END
diff --git a/src/core/CLucene/index/SegmentTermPositions.cpp
b/src/core/CLucene/index/SegmentTermPositions.cpp
index 1c7db0703c7..e9cacc80218 100644
--- a/src/core/CLucene/index/SegmentTermPositions.cpp
+++ b/src/core/CLucene/index/SegmentTermPositions.cpp
@@ -32,8 +32,8 @@ TermPositions* SegmentTermPositions::__asTermPositions(){
return (TermPositions*) this;
}
-void SegmentTermPositions::seek(const TermInfo* ti, Term* term) {
- SegmentTermDocs::seek(ti, term);
+void SegmentTermPositions::seek(const TermInfo* ti, Term* term, bool
local_stats) {
+ SegmentTermDocs::seek(ti, term, local_stats);
if (ti != NULL)
lazySkipPointer = ti->proxPointer;
@@ -100,6 +100,10 @@ int32_t SegmentTermPositions::read(int32_t* /*docs*/,
int32_t* /*freqs*/, int32_
_CLTHROWA(CL_ERR_UnsupportedOperation,"TermPositions does not support
processing multiple documents in one call. Use TermDocs instead.");
}
+int32_t SegmentTermPositions::read(int32_t* /*docs*/, int32_t* /*freqs*/,
int32_t* /*norms*/, int32_t /*length*/) {
+ _CLTHROWA(CL_ERR_UnsupportedOperation,"TermPositions does not support
processing multiple documents in one call. Use TermDocs instead.");
+}
+
bool SegmentTermPositions::readRange(DocRange* docRange) {
_CLTHROWA(CL_ERR_UnsupportedOperation, "Unsupported operation:
SegmentTermPositions::readDocRange");
}
diff --git a/src/core/CLucene/index/Terms.h b/src/core/CLucene/index/Terms.h
index 620105fd617..a0a4b834956 100644
--- a/src/core/CLucene/index/Terms.h
+++ b/src/core/CLucene/index/Terms.h
@@ -31,12 +31,12 @@ public:
// Sets this to the data for a term.
// The enumeration is reset to the start of the data for this term.
- virtual void seek(Term* term)=0;
+ virtual void seek(Term* term, bool load_stats = false) = 0;
/** Sets this to the data for the current term in a {@link TermEnum}.
* This may be optimized in some implementations.
*/
- virtual void seek(TermEnum* termEnum)=0;
+ virtual void seek(TermEnum* termEnum, bool load_stats = false) = 0;
// Returns the current document number. <p> This is invalid until
{@link
// #next()} is called for the first time.
@@ -46,6 +46,10 @@ public:
// is invalid until {@link #next()} is called for the first time.
virtual int32_t freq() const=0;
+ // Returns the current document norm. <p> This is invalid until {@link
+ // #next()} is called for the first time.
+ virtual int32_t norm() const=0;
+
// Moves to the next pair in the enumeration. <p> Returns true iff
there is
// such a next pair in the enumeration.
virtual bool next() =0;
@@ -58,6 +62,7 @@ public:
// <p>Returns the number of entries read. Zero is only returned when
the
// stream has been exhausted.
virtual int32_t read(int32_t* docs, int32_t* freqs, int32_t length)=0;
+ virtual int32_t read(int32_t* docs, int32_t* freqs, int32_t* norms,
int32_t length)=0;
virtual bool readRange(DocRange* docRange) = 0;
// Skips entries to the first beyond the current whose document number
is
@@ -86,6 +91,10 @@ public:
virtual int32_t docFreq() {
_CLTHROWA(CL_ERR_UnsupportedOperation, "TermDocs::docFreq does
not support this method.");
}
+
+ virtual int32_t docNorm() {
+ return 0;
+ }
};
diff --git a/src/core/CLucene/index/_MultiSegmentReader.h
b/src/core/CLucene/index/_MultiSegmentReader.h
index c5f8deeea23..e1bd40a2224 100644
--- a/src/core/CLucene/index/_MultiSegmentReader.h
+++ b/src/core/CLucene/index/_MultiSegmentReader.h
@@ -104,6 +104,13 @@ public:
//Returns the document frequency of the current term in the set
int32_t docFreq(const Term* t=NULL);
+
+ // Returns the document norm
+ int32_t docNorm(const TCHAR* field, int32_t n);
+
+ // Returns the total norm of all terms appearing in all documents in
this field
+ std::optional<uint64_t> sumTotalTermFreq(const TCHAR* field);
+
TermDocs* termDocs();
TermPositions* termPositions();
@@ -146,7 +153,7 @@ protected:
size_t pointer;
TermDocs* current; // == segTermDocs[pointer]
- TermDocs* termDocs(const int32_t i); //< internal use only
+ TermDocs* termDocs(const int32_t i, bool local_stats = false); //< internal
use only
virtual TermDocs* termDocs(IndexReader* reader);
void init(CL_NS(util)::ArrayBase<IndexReader*>* subReaders, const int32_t*
starts);
public:
@@ -156,13 +163,15 @@ public:
int32_t doc() const;
int32_t freq() const;
+ int32_t norm() const;
- void seek(TermEnum* termEnum);
- void seek(Term* tterm);
+ void seek(TermEnum* termEnum, bool load_stats = false);
+ void seek(Term* tterm, bool load_stats = false);
bool next();
/** Optimized implementation. */
int32_t read(int32_t* docs, int32_t* freqs, int32_t length);
+ int32_t read(int32_t* docs, int32_t* freqs, int32_t* norms , int32_t length);
bool readRange(DocRange* docRange) override;
/* A Possible future optimization could skip entire segments */
@@ -173,6 +182,7 @@ public:
virtual TermPositions* __asTermPositions();
int32_t docFreq() override;
+ int32_t docNorm() override;
};
diff --git a/src/core/CLucene/index/_SegmentHeader.h
b/src/core/CLucene/index/_SegmentHeader.h
index 6bf7d1819b7..fd58d2fbf9e 100644
--- a/src/core/CLucene/index/_SegmentHeader.h
+++ b/src/core/CLucene/index/_SegmentHeader.h
@@ -32,9 +32,11 @@ class SegmentReader;
class TermDocsBuffer {
public:
- TermDocsBuffer(CL_NS(store)::IndexInput* freqStream, bool hasProx,
IndexVersion indexVersion)
+ TermDocsBuffer(CL_NS(store)::IndexInput* freqStream, bool hasProx, uint32_t
maxDoc, IndexVersion indexVersion)
: docs_(PFOR_BLOCK_SIZE + 3),
freqs_(PFOR_BLOCK_SIZE + 3),
+ norms_(PFOR_BLOCK_SIZE + 3),
+ maxDoc(maxDoc),
freqStream_(freqStream),
hasProx_(hasProx),
indexVersion_(indexVersion) {
@@ -43,9 +45,11 @@ public:
~TermDocsBuffer() {
cur_doc_ = 0;
cur_freq_ = 0;
+ cur_norm_ = 0;
docs_.clear();
freqs_.clear();
+ norms_.clear();
freqStream_ = nullptr;
}
@@ -64,12 +68,29 @@ public:
return freqs_[cur_freq_++];
}
+ inline int32_t getNorm() {
+ if (cur_norm_ >= size_) {
+ refill();
+ }
+ if(cur_norm_ >= maxDoc) {
+ return 0;
+ }
+ return norms_[cur_norm_++];
+ }
+
void refill();
void readRange(DocRange* docRange);
+ // Set the per-document norms array; must be called before readRange or refill.
+ void setAllDocNorms(uint8_t* norms);
+
+ // Whether per-document statistics (norms) should be loaded.
+ void needLoadStats(bool load_stats = false);
+
private:
int32_t refillV0();
int32_t refillV1();
+ void refillNorm(int32_t size);
private:
uint32_t size_ = 0;
@@ -80,8 +101,19 @@ private:
uint32_t cur_freq_ = 0;
std::vector<uint32_t> freqs_;
+ //cur doc norm
+ uint32_t cur_norm_ = 0;
+ std::vector<uint32_t> norms_;
+
CL_NS(store)::IndexInput* freqStream_ = nullptr;
+ // need load statistic info
+ bool load_stats_ = false;
+
+ // save all doc norms in this term's field
+ uint32_t maxDoc = 0;
+ uint8_t* all_doc_norms_;
+
bool hasProx_ = false;
IndexVersion indexVersion_ = IndexVersion::kV0;
};
@@ -92,14 +124,19 @@ protected:
CL_NS(store)::IndexInput* freqStream;
int32_t count;
int32_t df;
+ int32_t maxDoc;
+
CL_NS(util)::BitSet* deletedDocs;
int32_t _doc = -1;
int32_t _freq = 0;
+ int32_t _norm = 0;
+
int32_t docs[PFOR_BLOCK_SIZE]; // buffered doc numbers
int32_t freqs[PFOR_BLOCK_SIZE]; // buffered term freqs
int32_t pointer;
int32_t pointerMax;
+ uint8_t* norms;
private:
int32_t skipInterval;
int32_t maxSkipLevels;
@@ -121,18 +158,22 @@ public:
SegmentTermDocs( const SegmentReader* Parent);
virtual ~SegmentTermDocs();
- virtual void seek(Term* term);
- virtual void seek(TermEnum* termEnum);
- virtual void seek(const TermInfo* ti,Term* term);
+ virtual void seek(Term* term, bool load_stats = false);
+ virtual void seek(TermEnum* termEnum, bool load_stats = false);
+ virtual void seek(const TermInfo* ti,Term* term, bool load_stats = false);
virtual void close();
virtual int32_t doc()const;
virtual int32_t freq()const;
+ virtual int32_t norm()const;
virtual bool next();
/** Optimized implementation. */
virtual int32_t read(int32_t* docs, int32_t* freqs, int32_t length);
+
+ virtual int32_t read(int32_t* docs, int32_t* freqs,int32_t* norms, int32_t
length);
+
bool readRange(DocRange* docRange) override;
/** Optimized implementation. */
@@ -142,6 +183,8 @@ public:
int32_t docFreq() override;
+ int32_t docNorm() override;
+
protected:
virtual void skippingDoc(){}
virtual void skipProx(const int64_t /*proxPointer*/, const int32_t
/*payloadLength*/){}
@@ -178,7 +221,7 @@ public:
virtual ~SegmentTermPositions();
private:
- void seek(const TermInfo* ti, Term* term);
+ void seek(const TermInfo* ti, Term* term, bool load_stats = false);
public:
void close();
@@ -193,6 +236,7 @@ protected:
public:
bool next();
int32_t read(int32_t* docs, int32_t* freqs, int32_t length);
+ int32_t read(int32_t* docs, int32_t* freqs, int32_t* norms, int32_t length);
bool readRange(DocRange* docRange) override;
protected:
@@ -227,10 +271,11 @@ private:
virtual TermPositions* __asTermPositions();
//resolve SegmentTermDocs/TermPositions ambiguity
- void seek(Term* term){ SegmentTermDocs::seek(term); }
- void seek(TermEnum* termEnum){ SegmentTermDocs::seek(termEnum); }
+ void seek(Term* term, bool load_stats = false){ SegmentTermDocs::seek(term,
load_stats); }
+ void seek(TermEnum* termEnum, bool load_stats = false){
SegmentTermDocs::seek(termEnum, load_stats); }
int32_t doc() const{ return SegmentTermDocs::doc(); }
int32_t freq() const{ return SegmentTermDocs::freq(); }
+ int32_t norm() const{ return SegmentTermDocs::norm(); }
bool skipTo(const int32_t target){ return SegmentTermDocs::skipTo(target); }
};
@@ -302,6 +347,7 @@ class SegmentReader: public DirectoryIndexReader {
CL_NS(util)::Deletor::Dummy,
Norm > NormsType;
NormsType _norms;
+ std::unordered_map<TCHAR, std::optional<int64_t>> sum_total_term_freq;
uint8_t* ones;
uint8_t* fakeNorms();
@@ -419,13 +465,18 @@ public:
///Returns the number of documents which contain the term t
int32_t docFreq(const Term* t);
+ ///Returns the norm of the document whose id is doc in this field
+ int32_t docNorm(const TCHAR* field, int32_t doc);
+
+ ///Returns the total norm of all terms appearing in all documents in this
field
+ std::optional<uint64_t> sumTotalTermFreq(const TCHAR* field);
+
///Returns the actual number of documents in the segment
int32_t numDocs();
///Returns the number of all the documents in the segment including the
ones that have
///been marked deleted
int32_t maxDoc() const;
-
void setTermInfosIndexDivisor(int32_t indexDivisor);
int32_t getTermInfosIndexDivisor();
@@ -434,6 +485,11 @@ public:
///Returns fake norms if norms aren't available
uint8_t* norms(const TCHAR* field);
+ uint8_t* norms(const TCHAR* field) const;
+
+ ///Returns the bytes array that holds the norms of a named field.
+ void norms(const TCHAR* field, uint8_t* bytes) const;
+
///Reads the Norms for field from disk
void norms(const TCHAR* field, uint8_t* bytes);
diff --git a/src/core/CLucene/search/IndexSearcher.cpp
b/src/core/CLucene/search/IndexSearcher.cpp
index f5b313a3b26..e20d6f44239 100644
--- a/src/core/CLucene/search/IndexSearcher.cpp
+++ b/src/core/CLucene/search/IndexSearcher.cpp
@@ -200,6 +200,21 @@ CL_NS_DEF(search)
return reader->docFreq(term);
}
+ // doc norm
+ int32_t IndexSearcher::docNorm(const TCHAR* field, int32_t doc) const {
+
+ CND_PRECONDITION(reader != NULL, "reader is NULL");
+
+ return reader->docNorm(field, doc);
+ }
+
+ std::optional<uint64_t> IndexSearcher::sumTotalTermFreq(const TCHAR* field)
const {
+
+ CND_PRECONDITION(reader != NULL, "reader is NULL");
+
+ return reader->sumTotalTermFreq(field);
+ }
+
_CL_DEPRECATED( doc(i, document) ) CL_NS(document)::Document*
IndexSearcher::doc(int32_t i){
CL_NS(document)::Document* ret = _CLNEW CL_NS(document)::Document;
if (!doc(i,ret) )
diff --git a/src/core/CLucene/search/IndexSearcher.h
b/src/core/CLucene/search/IndexSearcher.h
index 8f0b2000aff..6969b9ed409 100644
--- a/src/core/CLucene/search/IndexSearcher.h
+++ b/src/core/CLucene/search/IndexSearcher.h
@@ -77,6 +77,10 @@ public:
int32_t docFreq(const CL_NS(index)::Term* term) const;
+ int32_t docNorm(const TCHAR* field, int32_t doc) const;
+
+ std::optional<uint64_t> sumTotalTermFreq(const TCHAR* field) const;
+
bool doc(int32_t i, CL_NS(document)::Document& document);
bool doc(int32_t i, CL_NS(document)::Document* document);
_CL_DEPRECATED( doc(i, document) ) CL_NS(document)::Document*
doc(int32_t i);
diff --git a/src/core/CLucene/search/MultiSearcher.cpp
b/src/core/CLucene/search/MultiSearcher.cpp
index 0f2a6862706..872179ae7f3 100644
--- a/src/core/CLucene/search/MultiSearcher.cpp
+++ b/src/core/CLucene/search/MultiSearcher.cpp
@@ -5,8 +5,10 @@
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#include "CLucene/_ApiHeader.h"
+#include <optional>
#include "CLucene/index/IndexReader.h"
#include "MultiSearcher.h"
+
#include "SearchHeader.h"
#include "Query.h"
#include "_HitQueue.h"
@@ -74,6 +76,30 @@ CL_NS_DEF(search)
return docFreq;
}
+// doc norm
+int32_t MultiSearcher::docNorm(const TCHAR* field, int32_t n) const {
+
+ CND_PRECONDITION(reader != NULL, "reader is NULL");
+ int32_t i = subSearcher(n); // find searcher index
+ return searchables[i]->docNorm(field, n - starts[i]);
+ }
+
+std::optional<uint64_t> MultiSearcher::sumTotalTermFreq(const TCHAR* field)
const {
+ bool fieldHasNorm = false;
+ int64_t sum = 0;
+ for (int32_t i = 0; i < searchablesLen; ++i) {
+ std::optional<int64_t> norm =
searchables[i]->sumTotalTermFreq(field);
+ if (norm != std::nullopt) {
+ fieldHasNorm = true;
+ sum += norm.value();
+ }
+ }
+ if (fieldHasNorm) {
+ return sum;
+ }
+ return std::nullopt;
+ }
+
/** For use by {@link HitCollector} implementations. */
bool MultiSearcher::doc(int32_t n, Document* d) {
int32_t i = subSearcher(n); // find searcher index
diff --git a/src/core/CLucene/search/MultiSearcher.h
b/src/core/CLucene/search/MultiSearcher.h
index 17adba01e8d..1815b0fc575 100644
--- a/src/core/CLucene/search/MultiSearcher.h
+++ b/src/core/CLucene/search/MultiSearcher.h
@@ -40,6 +40,10 @@ CL_NS_DEF(search)
int32_t docFreq(const CL_NS(index)::Term* term) const ;
+ int32_t docNorm(const TCHAR* field, int32_t n) const;
+
+ std::optional<uint64_t> sumTotalTermFreq(const TCHAR* field) const;
+
/** For use by {@link HitCollector} implementations. */
bool doc(int32_t n, CL_NS(document)::Document* document);
diff --git a/src/core/CLucene/search/Searchable.h
b/src/core/CLucene/search/Searchable.h
index cb32a88579e..dabd7d45928 100644
--- a/src/core/CLucene/search/Searchable.h
+++ b/src/core/CLucene/search/Searchable.h
@@ -9,6 +9,7 @@
//#include "CLucene/index/IndexReader.h"
+#include <optional>
CL_CLASS_DEF(index,Term)
//#include "Filter.h"
CL_CLASS_DEF(document,Document)
@@ -67,7 +68,12 @@ CL_NS_DEF(search)
* @see IndexReader#docFreq(Term).
*/
virtual int32_t docFreq(const CL_NS(index)::Term* term) const = 0;
-
+ /** Expert: Returns the norm of the document whose id is <code>doc</code> in
the <code>field</code>.
+ */
+ virtual int32_t docNorm(const TCHAR* field, int32_t doc) const = 0;
+ /** Expert: Returns the total norm of all terms appearing in all
documents in this field
+ */
+ virtual std::optional<uint64_t> sumTotalTermFreq(const TCHAR* field)
const = 0;
/** Expert: Returns one greater than the largest possible document
number.
* Called by search code to compute term weights.
* @see IndexReader#maxDoc().
diff --git a/src/core/CLucene/search/Similarity.cpp
b/src/core/CLucene/search/Similarity.cpp
index b78ce677533..922d644ec8f 100644
--- a/src/core/CLucene/search/Similarity.cpp
+++ b/src/core/CLucene/search/Similarity.cpp
@@ -247,4 +247,12 @@ CL_NS_DEF(search)
return 0.0f;
return overlap / (float_t)maxOverlap;
}
+
+ LengthSimilarity::LengthSimilarity(){
+ }
+ LengthSimilarity::~LengthSimilarity(){
+ }
+ float_t LengthSimilarity::lengthNorm(const TCHAR* /*fieldName*/, int32_t
numTerms) {
+ return numTerms;
+ }
CL_NS_END
diff --git a/src/core/CLucene/search/Similarity.h
b/src/core/CLucene/search/Similarity.h
index 388898aba23..74b7a819d06 100644
--- a/src/core/CLucene/search/Similarity.h
+++ b/src/core/CLucene/search/Similarity.h
@@ -275,5 +275,14 @@ public:
float_t coord(int32_t overlap, int32_t maxOverlap);
};
+/** Expert: Length scoring implementation. */
+class CLUCENE_EXPORT LengthSimilarity: public DefaultSimilarity {
+public:
+ LengthSimilarity();
+ ~LengthSimilarity();
+ /** Implemented as <code>numTerms</code> (identity — no length normalization). */
+ float_t lengthNorm(const TCHAR* fieldName, int32_t numTerms) override;
+};
+
CL_NS_END
#endif
diff --git a/src/core/CLucene/search/query/TermIterator.h
b/src/core/CLucene/search/query/TermIterator.h
index 3eb22a254de..82c5c71027d 100644
--- a/src/core/CLucene/search/query/TermIterator.h
+++ b/src/core/CLucene/search/query/TermIterator.h
@@ -27,6 +27,10 @@ public:
return termDocs_->freq();
}
+ inline int32_t norm() const {
+ return termDocs_->norm();
+ }
+
inline int32_t nextDoc() const {
if (termDocs_->next()) {
return termDocs_->doc();
@@ -45,6 +49,10 @@ public:
return termDocs_->docFreq();
}
+ inline int32_t docNorm() const {
+ return termDocs_->docNorm();
+ }
+
inline bool readRange(DocRange* docRange) const {
return termDocs_->readRange(docRange);
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]