This is an automated email from the ASF dual-hosted git repository.
kxiao pushed a commit to branch tmp_clucene_hybrid_search
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/tmp_clucene_hybrid_search by
this push:
new 368270c359d [feature](clucene): Introduce extra statistics for
calculating Doris BM25. (#308)
368270c359d is described below
commit 368270c359da9a978f2ba894c251eadf03e9305e
Author: Zephyr Guo <[email protected]>
AuthorDate: Mon Apr 21 17:34:16 2025 +0800
[feature](clucene): Introduce extra statistics for calculating Doris BM25.
(#308)
---
src/core/CLucene/index/DocRange.h | 3 +
src/core/CLucene/index/IndexReader.cpp | 10 +-
src/core/CLucene/index/IndexReader.h | 15 ++-
src/core/CLucene/index/IndexWriter.cpp | 128 ++++++++++++++++++++++-
src/core/CLucene/index/IndexWriter.h | 4 +-
src/core/CLucene/index/MultiReader.cpp | 32 ++++++
src/core/CLucene/index/MultiReader.h | 7 ++
src/core/CLucene/index/MultiSegmentReader.cpp | 69 +++++++++++-
src/core/CLucene/index/MultipleTermPositions.cpp | 14 ++-
src/core/CLucene/index/MultipleTermPositions.h | 12 ++-
src/core/CLucene/index/SegmentMerger.cpp | 2 +-
src/core/CLucene/index/SegmentReader.cpp | 118 ++++++++++++++++++---
src/core/CLucene/index/SegmentTermDocs.cpp | 101 ++++++++++++++++--
src/core/CLucene/index/SegmentTermPositions.cpp | 8 +-
src/core/CLucene/index/Terms.h | 13 ++-
src/core/CLucene/index/_MultiSegmentReader.h | 16 ++-
src/core/CLucene/index/_SegmentHeader.h | 72 +++++++++++--
src/core/CLucene/search/IndexSearcher.cpp | 15 +++
src/core/CLucene/search/IndexSearcher.h | 4 +
src/core/CLucene/search/MultiSearcher.cpp | 26 +++++
src/core/CLucene/search/MultiSearcher.h | 4 +
src/core/CLucene/search/Searchable.h | 8 +-
src/core/CLucene/search/Similarity.cpp | 8 ++
src/core/CLucene/search/Similarity.h | 9 ++
src/core/CLucene/search/query/TermIterator.h | 8 ++
25 files changed, 646 insertions(+), 60 deletions(-)
diff --git a/src/core/CLucene/index/DocRange.h
b/src/core/CLucene/index/DocRange.h
index ef7906a24fb..ab417ce5877 100644
--- a/src/core/CLucene/index/DocRange.h
+++ b/src/core/CLucene/index/DocRange.h
@@ -23,8 +23,11 @@ class DocRange {
uint32_t doc_many_size_ = 0;
uint32_t freq_many_size_ = 0;
+ uint32_t norm_many_size_ = 0;
+
std::vector<uint32_t>* doc_many = nullptr;
std::vector<uint32_t>* freq_many = nullptr;
+ std::vector<uint32_t>* norm_many = nullptr;
std::pair<uint32_t, uint32_t> doc_range;
};
\ No newline at end of file
diff --git a/src/core/CLucene/index/IndexReader.cpp
b/src/core/CLucene/index/IndexReader.cpp
index 5b9f8ad2624..41b055181b0 100644
--- a/src/core/CLucene/index/IndexReader.cpp
+++ b/src/core/CLucene/index/IndexReader.cpp
@@ -251,7 +251,7 @@ CL_NS_DEF(index)
return SegmentInfos::getCurrentSegmentGeneration(directory) != -1;
}
- TermDocs* IndexReader::termDocs(Term* term) {
+ TermDocs* IndexReader::termDocs(Term* term, bool load_stats) {
//Func - Returns an enumeration of all the documents which contain
// term. For each document, the document number, the frequency of
// the term in that document is also provided, for use in search
scoring.
@@ -269,13 +269,13 @@ CL_NS_DEF(index)
ensureOpen();
//Reference an instantiated TermDocs instance
TermDocs* _termDocs = termDocs();
- //Seek all documents containing term
- _termDocs->seek(term);
+ //Seek all documents containing term
+ _termDocs->seek(term, load_stats);
//return the enumaration
return _termDocs;
}
- TermPositions* IndexReader::termPositions(Term* term){
+ TermPositions* IndexReader::termPositions(Term* term, bool load_stats){
//Func - Returns an enumeration of all the documents which contain term.
For each
// document, in addition to the document number and frequency of the
term in
// that document, a list of all of the ordinal positions of the term
in the document
@@ -296,7 +296,7 @@ CL_NS_DEF(index)
//Reference an instantiated termPositions instance
TermPositions* _termPositions = termPositions();
//Seek all documents containing term
- _termPositions->seek(term);
+ _termPositions->seek(term, load_stats);
//return the enumeration
return _termPositions;
}
diff --git a/src/core/CLucene/index/IndexReader.h
b/src/core/CLucene/index/IndexReader.h
index 4307a0d9332..a61a1a2ec57 100644
--- a/src/core/CLucene/index/IndexReader.h
+++ b/src/core/CLucene/index/IndexReader.h
@@ -15,6 +15,8 @@
#include "CLucene/index/IndexVersion.h"
#include "CLucene/index/_FieldInfos.h"
+#include <optional>
+
CL_CLASS_DEF(store,Directory)
CL_CLASS_DEF(store,LuceneLock)
CL_CLASS_DEF(document,Document)
@@ -59,7 +61,6 @@ class CLUCENE_EXPORT IndexReader: public
CL_NS(util)::NamedObject{
bool closed;
protected:
bool hasChanges;
-
/**
* Legacy Constructor for backwards compatibility.
*
@@ -560,6 +561,14 @@ public:
*/
virtual int32_t docFreq(const Term* t) = 0;
+ /** Returns the norm of document whose id is <code>doc</code> in the
<code>field</code>.
+ */
+ virtual int32_t docNorm(const TCHAR* field, int32_t doc) = 0;
+
+ /** Returns the total norm of all terms appeared in all documents
+ */
+ virtual std::optional<uint64_t> sumTotalTermFreq(const TCHAR* field) =
0;
+
/* Returns an unpositioned TermPositions enumerator.
* @throws IOException if there is a low-level IO error
* @memory Caller must clean up
@@ -584,7 +593,7 @@ public:
* @throws IOException if there is a low-level IO error
* @memory Caller must clean up
*/
- TermPositions* termPositions(Term* term);
+ TermPositions* termPositions(Term* term, bool load_stats = false);
/** Returns an unpositioned {@link TermDocs} enumerator.
* @throws IOException if there is a low-level IO error
@@ -602,7 +611,7 @@ public:
* @throws IOException if there is a low-level IO error
* @memory Caller must clean up
*/
- TermDocs* termDocs(Term* term);
+ TermDocs* termDocs(Term* term, bool load_stats = false);
/** Deletes the document numbered <code>docNum</code>. Once a document
is
* deleted it will not appear in TermDocs or TermPostitions enumerations.
diff --git a/src/core/CLucene/index/IndexWriter.cpp
b/src/core/CLucene/index/IndexWriter.cpp
index a3b30848af6..7ede27aec2f 100644
--- a/src/core/CLucene/index/IndexWriter.cpp
+++ b/src/core/CLucene/index/IndexWriter.cpp
@@ -1341,10 +1341,52 @@ void
IndexWriter::indexCompaction(std::vector<lucene::store::Directory *> &src_d
std::vector<lucene::index::IndexWriter *> destIndexWriterList;
std::vector<lucene::store::IndexOutput *> nullBitmapIndexOutputList;
+ std::vector<lucene::store::IndexOutput *> normsOutputList;
+
+ // first level vector index is src_index_id
+ // <TCHAR, ValueArray<uint8_t>> key is field name, value is the norm of
src_doc_id
+ std::vector<map<TCHAR, std::vector<uint8_t>>>
srcFieldNormsMapValues(numIndices);
+
try {
/// merge fields
mergeFields(hasProx);
+ // check if field has norms
+ bool hasNorms = false;
+ {
+ for (size_t i = 0; i < fieldInfos->size(); i++) {
+ //Get the i-th FieldInfo
+ FieldInfo* fi = fieldInfos->fieldInfo(i);
+ // Is this Field indexed and field need norms ?
+ if (fi->isIndexed && !fi->omitNorms) {
+ hasNorms = true;
+ }
+ }
+ }
+
+ if (hasNorms) {
+ for (int srcIndex = 0; srcIndex < numIndices; srcIndex++) {
+ auto reader = readers[srcIndex];
+ for (size_t i = 0; i < fieldInfos->size(); i++) {
+ //Get the i-th FieldInfo
+ FieldInfo* fi = fieldInfos->fieldInfo(i);
+ // Is this Field indexed and field need norms ?
+ if (fi->isIndexed && !fi->omitNorms) {
+ CL_NS(util)::ValueArray<uint8_t> normBuffer;
+ size_t maxDoc = reader->maxDoc();
+ if ( normBuffer.length < maxDoc){
+ normBuffer.resize(maxDoc);
+ memset(normBuffer.values, 0, sizeof(uint8_t) *
maxDoc);
+ }
+ reader->norms(fi->name, normBuffer.values);
+ for (int j = 0; j < normBuffer.length; j++) {
+
srcFieldNormsMapValues[srcIndex][*fi->name].emplace_back(normBuffer.values[j]);
+ }
+ }
+ }
+ }
+ }
+
/// write fields and create files writers
for (int j = 0; j < numDestIndexes; j++) {
auto dest_dir = dest_dirs[j];
@@ -1385,6 +1427,13 @@ void
IndexWriter::indexCompaction(std::vector<lucene::store::Directory *> &src_d
maxSkipLevels = termInfosWriter->maxSkipLevels;
skipListWriterList.push_back(_CLNEW
DefaultSkipListWriter(skipInterval, maxSkipLevels, (int) dest_index_docs[j],
freqOutputList[j], proxOutputList[j]));
+ if (hasNorms) {
+ // create norms output
+ auto* norms_out =
dest_dir->createOutput(Misc::segmentname(segment.c_str(), ".nrm").c_str());
+ norms_out->writeBytes(SegmentMerger::NORMS_HEADER,
SegmentMerger::NORMS_HEADER_length);
+ normsOutputList.push_back(norms_out);
+ }
+
// create null_bitmap index output
auto* null_bitmap_out =
dest_dir->createOutput(NULL_BITMAP_FILE_NAME);
nullBitmapIndexOutputList.push_back(null_bitmap_out);
@@ -1393,6 +1442,11 @@ void
IndexWriter::indexCompaction(std::vector<lucene::store::Directory *> &src_d
/// merge terms
mergeTerms(hasProx);
+ /// merge norms if have
+ if (hasNorms){
+ mergeNorms(dest_index_docs, srcFieldNormsMapValues,
normsOutputList);
+ }
+
/// merge null_bitmap
mergeNullBitmap(srcNullBitmapValues, nullBitmapIndexOutputList);
} catch (CLuceneError &e) {
@@ -1432,7 +1486,14 @@ void
IndexWriter::indexCompaction(std::vector<lucene::store::Directory *> &src_d
r->close();
_CLDELETE(r);
}
- } readers.clear(););
+ } readers.clear(););
+ for (auto* norms_out
+ : normsOutputList) {
+ if (norms_out != nullptr) {
+ norms_out->close();
+ _CLDELETE(norms_out);
+ }
+ } normsOutputList.clear();
for (auto* null_bitmap_out
: nullBitmapIndexOutputList) {
if (null_bitmap_out != nullptr) {
@@ -1889,6 +1950,71 @@ void IndexWriter::mergeTerms(bool hasProx) {
}
}
+void IndexWriter::mergeNorms(std::vector<uint32_t> dest_index_docs,
+ std::vector<std::map<TCHAR,
std::vector<uint8_t>>> srcFieldNormsMapValues,
+ std::vector<lucene::store::IndexOutput *>
normsOutputList) {
+ //Func - Merges the norms for all fields
+ //Pre - fieldInfos != NULL
+ //Post - The norms for all fields have been merged
+ CND_PRECONDITION(fieldInfos != NULL, "fieldInfos is NULL");
+
+ std::vector<std::map<TCHAR, std::vector<uint8_t>>>
destFieldNormsMapValues(numDestIndexes);
+
+ // iterate srcFieldNormsValues to construct destFieldNormsMapValues
+ for (size_t srcIndex = 0; srcIndex < srcFieldNormsMapValues.size();
++srcIndex) {
+ std::map<TCHAR, std::vector<uint8_t>> &srcFieldNormsMap =
srcFieldNormsMapValues[srcIndex];
+ if (srcFieldNormsMap.empty()) {
+ // empty indicates there is no nrm file in this index
+ continue;
+ }
+ // find field has norms
+ for (int j =0; j < fieldInfos->size(); j++) {
+ FieldInfo* fi = fieldInfos->fieldInfo(j);
+ TCHAR fieldName = *fi->name;
+ // Is this Field indexed and field need norms ?
+ if (fi->isIndexed && !fi->omitNorms) {
+ auto& srcFieldNorms = srcFieldNormsMap[fieldName];
+ // construct srcFieldNorms to destFieldNorms
+ for (int srcDocId = 0; srcDocId < srcFieldNorms.size();
srcDocId++) {
+ auto destIdx = _trans_vec[srcIndex][srcDocId].first;
+ auto destDocId = _trans_vec[srcIndex][srcDocId].second;
+ if (destIdx == UINT32_MAX || destDocId == UINT32_MAX) {
+ continue;
+ }
+ auto destDocCount = dest_index_docs[destIdx];
+ auto& destFieldNormsMap = destFieldNormsMapValues[destIdx];
+ if (destFieldNormsMap.find(fieldName) ==
destFieldNormsMap.end()) {
+ destFieldNormsMap[fieldName].resize(destDocCount);
+
std::fill(destFieldNormsMap[fieldName].begin(),destFieldNormsMap[fieldName].end(),
0);
+ }
+ auto& destFieldNorms = destFieldNormsMap[fieldName];
+ destFieldNorms[destDocId] = srcFieldNorms[srcDocId];
+ destFieldNormsMap[fieldName] = destFieldNorms;
+ }
+ }
+ }
+ }
+
+ // construct nrm and write nrm to dest index
+ for (size_t i = 0; i < destFieldNormsMapValues.size(); ++i) {
+ auto& destFieldNormsMap = destFieldNormsMapValues[i];
+ for (int j =0; j < fieldInfos->size(); j++) {
+ FieldInfo* fi = fieldInfos->fieldInfo(j);
+ TCHAR fieldName = *fi->name;
+ auto destDocCount = dest_index_docs[i];
+ if (fi->isIndexed && !fi->omitNorms) {
+ // if not find then norm is zero
+ if (destFieldNormsMap.find(fieldName) ==
destFieldNormsMap.end()) {
+ destFieldNormsMap[fieldName].resize(destDocCount);
+
std::fill(destFieldNormsMap[fieldName].begin(),destFieldNormsMap[fieldName].end(),
0);
+ }
+ auto& destFieldNorms = destFieldNormsMap[fieldName];
+ normsOutputList[i]->writeBytes(destFieldNorms.data(),
destDocCount);
+ }
+ }
+ }
+}
+
void IndexWriter::mergeNullBitmap(std::vector<std::vector<uint32_t>>
srcNullBitmapValues, std::vector<lucene::store::IndexOutput *>
nullBitmapIndexOutputList) {
// first level vector index is dest_index_id
// second level vector index is dest_doc_id
diff --git a/src/core/CLucene/index/IndexWriter.h
b/src/core/CLucene/index/IndexWriter.h
index 7cfb67d2ca7..7765a2362f3 100644
--- a/src/core/CLucene/index/IndexWriter.h
+++ b/src/core/CLucene/index/IndexWriter.h
@@ -324,7 +324,9 @@ public:
// write fields info file
void writeFields(lucene::store::Directory* d, std::string segment);
// merge terms and write files
- void mergeTerms(bool hasProx);
+ void mergeTerms(bool hasProx, IndexVersion indexVersion);
+ // merge norms and write files
+ void mergeNorms(std::vector<uint32_t> dest_index_docs,
std::vector<std::map<TCHAR, std::vector<uint8_t>>> srcFieldNormsMapValues,
std::vector<lucene::store::IndexOutput *> normsOutputList);
// merge null_bitmap
void mergeNullBitmap(std::vector<std::vector<uint32_t>> srcBitmapValues,
std::vector<lucene::store::IndexOutput *> nullBitmapIndexOutputList);
diff --git a/src/core/CLucene/index/MultiReader.cpp
b/src/core/CLucene/index/MultiReader.cpp
index 726b6e3dac5..8535a116507 100644
--- a/src/core/CLucene/index/MultiReader.cpp
+++ b/src/core/CLucene/index/MultiReader.cpp
@@ -271,6 +271,38 @@ int32_t MultiReader::docFreq(const Term* t) {
return total;
}
+int32_t MultiReader::docNorm(const TCHAR* field, int32_t n) {
+ ensureOpen();
+ if (hasNorms(field)) {
+ int32_t i = readerIndex(n);
+ return (*subReaders)[i]->docNorm(field, n - starts[i]);
+ }
+ return 0;
+};
+
+std::optional<uint64_t> MultiReader::sumTotalTermFreq(const TCHAR* field) {
+ ensureOpen();
+
+ if (hasNorms(field)) {
+ int64_t sum = 0;
+ bool hasTotalNorm = false;
+ for (size_t i = 0; i < subReaders->length; i++) {
+ if(!isDeleted(i)) {
+ std::optional<int64_t> totalNorm =
(*subReaders)[i]->sumTotalTermFreq(field);
+ if (totalNorm != std::nullopt) {
+ hasTotalNorm = true;
+ sum += totalNorm.value();
+ }
+ }
+ }
+ if (hasTotalNorm) {
+ return sum;
+ }
+ }
+
+ return std::nullopt;
+}
+
TermDocs* MultiReader::termDocs() {
ensureOpen();
TermDocs* ret = _CLNEW MultiTermDocs(subReaders, starts);
diff --git a/src/core/CLucene/index/MultiReader.h
b/src/core/CLucene/index/MultiReader.h
index 301d1422e2c..aa1bbed0227 100644
--- a/src/core/CLucene/index/MultiReader.h
+++ b/src/core/CLucene/index/MultiReader.h
@@ -100,6 +100,13 @@ public:
//Returns the document frequency of the current term in the set
int32_t docFreq(const Term* t=NULL);
+
+ // Returns the document norm
+ int32_t docNorm(const TCHAR* field, int32_t n);
+
+ // Returns the total norm of all terms appeared in all documents in
this field
+ std::optional<uint64_t> sumTotalTermFreq(const TCHAR* field);
+
TermDocs* termDocs();
TermPositions* termPositions();
diff --git a/src/core/CLucene/index/MultiSegmentReader.cpp
b/src/core/CLucene/index/MultiSegmentReader.cpp
index b4be5f01298..4c5a527663a 100644
--- a/src/core/CLucene/index/MultiSegmentReader.cpp
+++ b/src/core/CLucene/index/MultiSegmentReader.cpp
@@ -355,6 +355,34 @@ int32_t MultiSegmentReader::docFreq(const Term* t) {
return total;
}
+int32_t MultiSegmentReader::docNorm(const TCHAR* field, int32_t n) {
+ if (hasNorms(field)) {
+ int32_t i = readerIndex(n); // find segment
num
+ return (*subReaders)[i]->docNorm(field,n - starts[i]);
+ }
+ return 0;
+}
+
+std::optional<uint64_t> MultiSegmentReader::sumTotalTermFreq(const TCHAR*
field) {
+ if (hasNorms(field)) {
+ int64_t sum = 0;
+ bool hasTotalNorm = false;
+ for (size_t i = 0; i < subReaders->length; i++) {
+ if (!isDeleted(i)) {
+ std::optional<int64_t> totalNorm =
(*subReaders)[i]->sumTotalTermFreq(field);
+ if (totalNorm != std::nullopt) {
+ sum += totalNorm.value();
+ hasTotalNorm = true;
+ }
+ }
+ }
+ if (hasTotalNorm) {
+ return sum;
+ }
+ }
+ return std::nullopt;
+}
+
TermDocs* MultiSegmentReader::termDocs() {
ensureOpen();
TermDocs* ret = _CLNEW MultiTermDocs(subReaders, starts);
@@ -559,6 +587,10 @@ int32_t MultiTermDocs::docFreq() {
return docFreq;
}
+int32_t MultiTermDocs::docNorm() {
+ return current->docNorm();
+}
+
int32_t MultiTermDocs::doc() const {
CND_PRECONDITION(current!=NULL,"current==NULL, check that next() was
called");
// if not found term, current will return INT_MAX, we could not add base,
otherwise it will overflow.
@@ -572,11 +604,16 @@ int32_t MultiTermDocs::freq() const {
return current->freq();
}
-void MultiTermDocs::seek(TermEnum* termEnum){
- seek(termEnum->term(false));
+int32_t MultiTermDocs::norm() const {
+ CND_PRECONDITION(current!=NULL,"current==NULL, check that next() was
called");
+ return current->norm();
}
-void MultiTermDocs::seek( Term* tterm) {
+void MultiTermDocs::seek(TermEnum* termEnum, bool load_stats){
+ seek(termEnum->term(false), load_stats);
+}
+
+void MultiTermDocs::seek( Term* tterm, bool load_stats) {
//Func - Resets the instance for a new search
//Pre - tterm != NULL
//Post - The instance has been reset for a new search
@@ -645,6 +682,28 @@ int32_t MultiTermDocs::read(int32_t* docs, int32_t* freqs,
int32_t length) {
}
}
+int32_t MultiTermDocs::read(int32_t* docs, int32_t* freqs, int32_t* norms,
int32_t length) {
+ while (true) {
+ while (current == NULL) {
+ if (pointer < subReaders->length) { // try next
segment
+ base = starts[pointer];
+ current = termDocs(pointer++);
+ } else {
+ return 0;
+ }
+ }
+ int32_t end = current->read(docs, freqs, norms, length);
+ if (end == 0) { // none left in
segment
+ current = NULL;
+ } else { // got some
+ int32_t b = base; // adjust doc numbers
+ for (int32_t i = 0; i < end; i++)
+ docs[i] += b;
+ return end;
+ }
+ }
+}
+
bool MultiTermDocs::readRange(DocRange* docRange) {
while (true) {
while (current == NULL) {
@@ -727,7 +786,7 @@ TermDocs* MultiTermDocs::termDocs(IndexReader* reader) {
return reader->termDocs();
}
-TermDocs* MultiTermDocs::termDocs(const int32_t i) {
+TermDocs* MultiTermDocs::termDocs(const int32_t i, bool local_stats) {
if (term == NULL)
return NULL;
TermDocs* result = (*readerTermDocs)[i];
@@ -736,7 +795,7 @@ TermDocs* MultiTermDocs::termDocs(const int32_t i) {
readerTermDocs->values[i] = termDocs((*subReaders)[i]);
result = (*readerTermDocs)[i];
}
- result->seek(term);
+ result->seek(term, local_stats);
return result;
}
diff --git a/src/core/CLucene/index/MultipleTermPositions.cpp
b/src/core/CLucene/index/MultipleTermPositions.cpp
index e5bfa5ac24a..b5846516f76 100644
--- a/src/core/CLucene/index/MultipleTermPositions.cpp
+++ b/src/core/CLucene/index/MultipleTermPositions.cpp
@@ -14,11 +14,11 @@ CL_NS_USE(util)
CL_NS_DEF(index)
-void MultipleTermPositions::seek(Term*) {
+void MultipleTermPositions::seek(Term*, bool) {
_CLTHROWA(CL_ERR_UnsupportedOperation, "Unsupported operation:
MultipleTermPositions::seek");
}
-void MultipleTermPositions::seek(TermEnum*) {
+void MultipleTermPositions::seek(TermEnum*, bool) {
_CLTHROWA(CL_ERR_UnsupportedOperation, "Unsupported operation:
MultipleTermPositions::seek");
}
@@ -26,6 +26,10 @@ int32_t MultipleTermPositions::read(int32_t*,
int32_t*,int32_t) {
_CLTHROWA(CL_ERR_UnsupportedOperation, "Unsupported operation:
MultipleTermPositions::read");
}
+int32_t MultipleTermPositions::read(int32_t*, int32_t*, int32_t*, int32_t) {
+ _CLTHROWA(CL_ERR_UnsupportedOperation, "Unsupported operation:
MultipleTermPositions::read");
+}
+
bool MultipleTermPositions::readRange(DocRange* docRange) {
_CLTHROWA(CL_ERR_UnsupportedOperation, "Unsupported operation:
MultipleTermPositions::readRange");
}
@@ -144,6 +148,7 @@ bool MultipleTermPositions::next() {
_posList->clear();
_doc = _termPositionsQueue->peek()->doc();
+ _norm = _termPositionsQueue->peek()->norm();
TermPositions* tp;
do {
@@ -163,7 +168,6 @@ bool MultipleTermPositions::next() {
_posList->sort();
_freq = _posList->size();
-
return true;
}
@@ -192,6 +196,10 @@ int32_t MultipleTermPositions::freq() const {
return _freq;
}
+int32_t MultipleTermPositions::norm() const {
+ return _norm;
+}
+
void MultipleTermPositions::close() {
while (_termPositionsQueue->size() > 0) {
TermPositions* tp = _termPositionsQueue->pop();
diff --git a/src/core/CLucene/index/MultipleTermPositions.h
b/src/core/CLucene/index/MultipleTermPositions.h
index 67d03615f62..8ef7be1ac56 100644
--- a/src/core/CLucene/index/MultipleTermPositions.h
+++ b/src/core/CLucene/index/MultipleTermPositions.h
@@ -21,8 +21,9 @@ private:
class IntQueue;
int32_t _doc;
- int32_t _freq;
- TermPositionsQueue* _termPositionsQueue;
+ int32_t _freq;
+ int32_t _norm;
+ TermPositionsQueue* _termPositionsQueue;
IntQueue* _posList;
public:
@@ -44,25 +45,28 @@ public:
int32_t freq() const;
+ int32_t norm() const;
+
void close();
/**
* Not implemented.
* @throws UnsupportedOperationException
*/
- void seek(Term*);
+ void seek(Term*, bool);
/**
* Not implemented.
* @throws UnsupportedOperationException
*/
- void seek(TermEnum*);
+ void seek(TermEnum*, bool);
/**
* Not implemented.
* @throws UnsupportedOperationException
*/
int32_t read(int32_t*, int32_t*,int32_t);
+ int32_t read(int32_t*, int32_t*, int32_t*, int32_t);
bool readRange(DocRange* docRange) override;
/**
diff --git a/src/core/CLucene/index/SegmentMerger.cpp
b/src/core/CLucene/index/SegmentMerger.cpp
index cc910b02c88..f0988ed4c9c 100644
--- a/src/core/CLucene/index/SegmentMerger.cpp
+++ b/src/core/CLucene/index/SegmentMerger.cpp
@@ -739,7 +739,7 @@ void SegmentMerger::mergeNorms() {
for (size_t i = 0; i < fieldInfos->size(); i++) {
//Get the i-th FieldInfo
FieldInfo* fi = fieldInfos->fieldInfo(i);
- //Is this Field indexed?
+ // Is this Field indexed and field need norms ?
if (fi->isIndexed && !fi->omitNorms){
//Instantiate an IndexOutput to that norm file
if (output == NULL) {
diff --git a/src/core/CLucene/index/SegmentReader.cpp
b/src/core/CLucene/index/SegmentReader.cpp
index 721263664fa..5c5ede1063e 100644
--- a/src/core/CLucene/index/SegmentReader.cpp
+++ b/src/core/CLucene/index/SegmentReader.cpp
@@ -4,6 +4,9 @@
* Distributable under the terms of either the Apache License (Version 2.0) or
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
+#include <assert.h>
+#include <s2/base/integral_types.h>
+
#include "CLucene/_ApiHeader.h"
#include "CLucene/search/Similarity.h"
#include "CLucene/store/FSDirectory.h"
@@ -17,7 +20,6 @@
#include "_SegmentHeader.h"
#include "_SegmentMerger.h"
#include "_TermInfosReader.h"
-#include <assert.h>
CL_NS_USE(util)
CL_NS_USE(store)
@@ -199,8 +201,8 @@ void SegmentReader::initialize(SegmentInfo *si, int32_t
readBufferSize, bool doO
if (_fieldInfos->hasProx()) {
proxStream = cfsDir->openInput((segment + ".prx").c_str(),
readBufferSize);
}
- // we do not need norms, so we don't read it at all.
- //openNorms(cfsDir, readBufferSize);
+
+ openNorms(cfsDir, readBufferSize);
if (doOpenStores && _fieldInfos->hasVectors()) {// open term vector
files only as needed
string vectorsSegment;
@@ -546,6 +548,31 @@ int32_t SegmentReader::docFreq(const Term *t) {
return 0;
}
+int32_t SegmentReader::docNorm(const TCHAR* field, int32_t doc) {
+ //Func - Returns the norm of document whose id is doc in this field
+ //Pre - field has norm file
+ //Post - The norm of document whose id is doc in this field has been
returned, otherwise -1.0f;
+
+ ensureOpen();
+
+ if (hasNorms(field)) {
+ SCOPED_LOCK_MUTEX(THIS_LOCK)
+ uint8_t* field_norms = norms(field);
+ return search::Similarity::decodeNorm(field_norms[doc]);
+ }
+ return 0;
+}
+
+std::optional<uint64_t> SegmentReader::sumTotalTermFreq(const TCHAR* field) {
+ //Func - Returns the sum number of all terms in all docs
+ //Pre - field has norm file;
+ //Post - The sum number of all terms in all docs has been returned,
otherwise -1.0f;
+ if (hasNorms(field)) {
+ return sum_total_term_freq[*field];
+ }
+ return std::nullopt;
+}
+
int32_t SegmentReader::numDocs() {
//Func - Returns the actual number of documents in the segment
//Pre - true
@@ -654,6 +681,53 @@ void SegmentReader::norms(const TCHAR *field, uint8_t
*bytes) {
}
+ {
+ SCOPED_LOCK_MUTEX(norm->THIS_LOCK)
+ if (norm->bytes != NULL) {// can copy from cache
+ memcpy(bytes, norm->bytes, maxDoc());
+ return;
+ }
+
+ // Read from disk. norm.in may be shared across multiple norms and
+ // should only be used in a synchronized context.
+ IndexInput *normStream;
+ if (norm->useSingleNormStream) {
+ normStream = singleNormStream;
+ } else {
+ normStream = norm->in;
+ }
+ normStream->seek(norm->normSeek);
+ normStream->readBytes(bytes, maxDoc());
+ }
+}
+uint8_t* SegmentReader::norms(const TCHAR *field) const {
+ CND_PRECONDITION(field != NULL, "field is NULL");
+ Norm *norm = _norms.get(field);
+ if (norm == NULL) {
+ return NULL;
+ }
+ {
+ SCOPED_LOCK_MUTEX(norm->THIS_LOCK)
+ if (norm->bytes == NULL) {// value not yet read
+ uint8_t *bytes = _CL_NEWARRAY(uint8_t, maxDoc());
+ norms(field, bytes);
+ norm->bytes = bytes;// cache it
+ // it's OK to close the underlying IndexInput as we have cached the
+ // norms and will never read them again.
+ norm->close();
+ }
+
+ return norm->bytes;
+ }
+}
+
+void SegmentReader::norms(const TCHAR *field, uint8_t* bytes) const {
+ CND_PRECONDITION(field != NULL, "field is NULL");
+ Norm *norm = _norms.get(field);
+ if (norm == NULL) {
+ return;
+ }
+
{
SCOPED_LOCK_MUTEX(norm->THIS_LOCK)
if (norm->bytes != NULL) {// can copy from cache
@@ -677,14 +751,15 @@ void SegmentReader::norms(const TCHAR *field, uint8_t
*bytes) {
uint8_t *SegmentReader::createFakeNorms(int32_t size) {
uint8_t *ones = _CL_NEWARRAY(uint8_t, size);
if (size > 0)
- memset(ones, DefaultSimilarity::encodeNorm(1.0f), size);
+ memset(ones, Similarity::encodeNorm(0), size);
return ones;
}
uint8_t *SegmentReader::fakeNorms() {
if (ones == NULL)
- // ones = createFakeNorms(maxDoc());
- ones = createFakeNorms(1);
+ // TODO: this is origin clucene norms
+ ones = createFakeNorms(maxDoc());
+ // ones = createFakeNorms(1);
return ones;
}
// can return NULL if norms aren't stored
@@ -748,12 +823,11 @@ uint8_t *SegmentReader::norms(const TCHAR *field) {
// and returned containing the norms for that field. If the named
field is unknown NULL is returned.
CND_PRECONDITION(field != NULL, "field is NULL");
- // SCOPED_LOCK_MUTEX(THIS_LOCK)
- // ensureOpen();
- // uint8_t *bytes = getNorms(field);
- // if (bytes == NULL)
- // bytes = fakeNorms();
- uint8_t *bytes = fakeNorms();
+ SCOPED_LOCK_MUTEX(THIS_LOCK)
+ ensureOpen();
+ uint8_t *bytes = getNorms(field);
+ if (bytes == NULL)
+ bytes = fakeNorms();
return bytes;
}
@@ -826,6 +900,26 @@ void SegmentReader::openNorms(Directory *cfsDir, int32_t
readBufferSize) {
}
_norms[fi->name] = _CLNEW Norm(normInput, singleNormFile,
fi->number, normSeek, this, segment.c_str());
+
+ // read total norm info into cache
+ uint8_t *bytes = _CL_NEWARRAY(uint8_t, _maxDoc);
+ IndexInput *normStream;
+ if (_norms[fi->name]->useSingleNormStream) {
+ normStream = singleNormStream;
+ } else {
+ normStream = _norms[fi->name]->in;
+ }
+
+ ensureOpen();
+ SCOPED_LOCK_MUTEX(_norms[fi->name]->THIS_LOCK);
+ normStream->seek(_norms[fi->name]->normSeek);
+ normStream->readBytes(bytes, _maxDoc);
+ uint64_t sum = 0;
+ for (int doc = 0; doc < _maxDoc; doc++) {
+ sum += Similarity::decodeNorm(bytes[doc]);
+ }
+ sum_total_term_freq[*fi->name] = sum;
+
nextNormSeek += _maxDoc;// increment also if some norms are
separate
}
}
diff --git a/src/core/CLucene/index/SegmentTermDocs.cpp
b/src/core/CLucene/index/SegmentTermDocs.cpp
index e346dc0ca24..a702f702bee 100644
--- a/src/core/CLucene/index/SegmentTermDocs.cpp
+++ b/src/core/CLucene/index/SegmentTermDocs.cpp
@@ -11,6 +11,7 @@
#include "CLucene/index/CodeMode.h"
#include "CLucene/util/PFORUtil.h"
#include "Term.h"
+#include "CLucene/search/Similarity.h"
#include <assert.h>
#include <memory>
@@ -19,10 +20,10 @@
CL_NS_DEF(index)
SegmentTermDocs::SegmentTermDocs(const SegmentReader *_parent) :
parent(_parent), freqStream(_parent->freqStream->clone()),
- count(0),
df(0), deletedDocs(_parent->deletedDocs), _doc(-1), _freq(0),
skipInterval(_parent->tis->getSkipInterval()),
+ count(0),
df(0), maxDoc(_parent->maxDoc()), deletedDocs(_parent->deletedDocs), _doc(-1),
_freq(0), skipInterval(_parent->tis->getSkipInterval()),
maxSkipLevels(_parent->tis->getMaxSkipLevels()), skipListReader(NULL),
freqBasePointer(0), proxBasePointer(0),
skipPointer(0), haveSkipped(false), pointer(0), pointerMax(0),
indexVersion_(_parent->_fieldInfos->getIndexVersion()),
-
hasProx(_parent->_fieldInfos->hasProx()), buffer_(freqStream, hasProx,
indexVersion_) {
+
hasProx(_parent->_fieldInfos->hasProx()), buffer_(freqStream, hasProx, maxDoc,
indexVersion_) {
CND_CONDITION(_parent != NULL, "Parent is NULL");
memset(docs,0,PFOR_BLOCK_SIZE*sizeof(int32_t));
memset(freqs,0,PFOR_BLOCK_SIZE*sizeof(int32_t));
@@ -40,13 +41,23 @@ int32_t SegmentTermDocs::docFreq() {
return df;
}
-void SegmentTermDocs::seek(Term *term) {
+int32_t SegmentTermDocs::docNorm() {
+ if (_doc < 0 || _doc >= LUCENE_INT32_MAX_SHOULDBE) {
+ return 0;
+ }
+ if (_doc < maxDoc) {
+ return norms[_doc];
+ }
+ return 0;
+}
+
+void SegmentTermDocs::seek(Term *term, bool load_stats) {
TermInfo *ti = parent->tis->get(term);
- seek(ti, term);
+ seek(ti, term, load_stats);
_CLDELETE(ti);
}
-void SegmentTermDocs::seek(TermEnum *termEnum) {
+void SegmentTermDocs::seek(TermEnum *termEnum, bool load_stats) {
TermInfo *ti = NULL;
Term *term = NULL;
@@ -61,13 +72,19 @@ void SegmentTermDocs::seek(TermEnum *termEnum) {
ti = parent->tis->get(term);
}
- seek(ti, term);
+ seek(ti, term, load_stats);
_CLDELETE(ti);
}
-void SegmentTermDocs::seek(const TermInfo *ti, Term *term) {
+void SegmentTermDocs::seek(const TermInfo *ti, Term *term, bool load_stats) {
count = 0;
FieldInfo *fi = parent->_fieldInfos->fieldInfo(term->field());
currentFieldStoresPayloads = (fi != NULL) ? fi->storePayloads : false;
+ buffer_.needLoadStats(load_stats);
+ if (load_stats && fi != NULL && fi->isIndexed && !fi->omitNorms) {
+ const TCHAR *curField = fi->name;
+ norms = parent->norms(curField);
+ buffer_.setAllDocNorms(norms);
+ }
// hasProx = (fi != nullptr) && fi->hasProx;
if (ti == NULL) {
df = 0;
@@ -93,6 +110,9 @@ int32_t SegmentTermDocs::doc() const {
int32_t SegmentTermDocs::freq() const {
return _freq;
}
+int32_t SegmentTermDocs::norm() const {
+ return _norm;
+}
bool SegmentTermDocs::next() {
if (count == df) {
@@ -104,6 +124,7 @@ bool SegmentTermDocs::next() {
if (hasProx) {
_freq = buffer_.getFreq();
}
+ _norm = buffer_.getNorm();
count++;
@@ -125,6 +146,7 @@ int32_t SegmentTermDocs::read(int32_t *docs, int32_t
*freqs, int32_t length) {
_freq = buffer_.getFreq();
freqs[i] = _freq;
}
+ _norm = buffer_.getNorm();
count++;
i++;
@@ -133,6 +155,31 @@ int32_t SegmentTermDocs::read(int32_t *docs, int32_t
*freqs, int32_t length) {
return i;
}
+int32_t SegmentTermDocs::read(int32_t *docs, int32_t *freqs, int32_t *norms,
int32_t length) {
+ int32_t i = 0;
+
+ if (count == df) {
+ return i;
+ }
+
+ while (i < length && count < df) {
+ _doc = buffer_.getDoc();
+ docs[i] = _doc;
+
+ if (hasProx) {
+ _freq = buffer_.getFreq();
+ freqs[i] = _freq;
+ }
+
+ _norm = buffer_.getNorm();
+ norms[i] = _norm;
+
+ count++;
+ i++;
+ }
+
+ return i;
+}
bool SegmentTermDocs::readRange(DocRange* docRange) {
if (count >= df) {
return false;
@@ -189,8 +236,8 @@ bool SegmentTermDocs::skipTo(const int32_t target) {
void TermDocsBuffer::refill() {
cur_doc_ = 0;
cur_freq_ = 0;
-
- if (indexVersion_ == IndexVersion::kV1) {
+ cur_norm_ = 0;
+ if (indexVersion_ >= IndexVersion::kV1) {
size_ = refillV1();
} else {
size_ = refillV0();
@@ -211,8 +258,26 @@ void TermDocsBuffer::readRange(DocRange* docRange) {
docRange->freq_many = &freqs_;
docRange->freq_many_size_ = size;
}
+
+ if (load_stats_) {
+ docRange->norm_many = &norms_;
+ docRange->norm_many_size_ = size;
+ }
+
+
}
+void TermDocsBuffer::setAllDocNorms(uint8_t* norms) {
+ if(load_stats_ && norms) {
+ all_doc_norms_ = norms;
+ }
+}
+
+void TermDocsBuffer::needLoadStats(bool load_stats) {
+ load_stats_ = load_stats;
+}
+
+
int32_t TermDocsBuffer::refillV0() {
if (hasProx_) {
char mode = freqStream_->readByte();
@@ -243,6 +308,7 @@ int32_t TermDocsBuffer::refillV0() {
}
}
}
+ refillNorm(arraySize);
return arraySize;
} else {
uint32_t arraySize = freqStream_->readVInt();
@@ -261,6 +327,7 @@ int32_t TermDocsBuffer::refillV0() {
P4DEC(buf.data(), arraySize, docs_.data());
}
}
+ refillNorm(arraySize);
return arraySize;
}
}
@@ -299,7 +366,23 @@ int32_t TermDocsBuffer::refillV1() {
}
}
}
+ refillNorm(arraySize);
return arraySize;
}
+void TermDocsBuffer::refillNorm(int32_t size) {
+ if (!load_stats_) {
+ return;
+ }
+
+ for (int i = 0 ;i < size; i++) {
+ auto doc = docs_[i];
+ // Guard against an out-of-range doc id or a missing norms array.
+ if (doc < maxDoc && all_doc_norms_) {
+ norms_[i] = search::Similarity::decodeNorm(all_doc_norms_[doc]);
+ } else {
+ norms_[i] = 0;
+ }
+ }
+}
CL_NS_END
diff --git a/src/core/CLucene/index/SegmentTermPositions.cpp
b/src/core/CLucene/index/SegmentTermPositions.cpp
index 1c7db0703c7..e9cacc80218 100644
--- a/src/core/CLucene/index/SegmentTermPositions.cpp
+++ b/src/core/CLucene/index/SegmentTermPositions.cpp
@@ -32,8 +32,8 @@ TermPositions* SegmentTermPositions::__asTermPositions(){
return (TermPositions*) this;
}
-void SegmentTermPositions::seek(const TermInfo* ti, Term* term) {
- SegmentTermDocs::seek(ti, term);
+void SegmentTermPositions::seek(const TermInfo* ti, Term* term, bool
local_stats) {
+ SegmentTermDocs::seek(ti, term, local_stats);
if (ti != NULL)
lazySkipPointer = ti->proxPointer;
@@ -100,6 +100,10 @@ int32_t SegmentTermPositions::read(int32_t* /*docs*/,
int32_t* /*freqs*/, int32_
_CLTHROWA(CL_ERR_UnsupportedOperation,"TermPositions does not support
processing multiple documents in one call. Use TermDocs instead.");
}
+int32_t SegmentTermPositions::read(int32_t* /*docs*/, int32_t* /*freqs*/,
int32_t* /*norms*/, int32_t /*length*/) {
+ _CLTHROWA(CL_ERR_UnsupportedOperation,"TermPositions does not support
processing multiple documents in one call. Use TermDocs instead.");
+}
+
bool SegmentTermPositions::readRange(DocRange* docRange) {
_CLTHROWA(CL_ERR_UnsupportedOperation, "Unsupported operation:
SegmentTermPositions::readDocRange");
}
diff --git a/src/core/CLucene/index/Terms.h b/src/core/CLucene/index/Terms.h
index 620105fd617..a0a4b834956 100644
--- a/src/core/CLucene/index/Terms.h
+++ b/src/core/CLucene/index/Terms.h
@@ -31,12 +31,12 @@ public:
// Sets this to the data for a term.
// The enumeration is reset to the start of the data for this term.
- virtual void seek(Term* term)=0;
+ virtual void seek(Term* term, bool load_stats = false) = 0;
/** Sets this to the data for the current term in a {@link TermEnum}.
* This may be optimized in some implementations.
*/
- virtual void seek(TermEnum* termEnum)=0;
+ virtual void seek(TermEnum* termEnum, bool load_stats = false) = 0;
// Returns the current document number. <p> This is invalid until
{@link
// #next()} is called for the first time.
@@ -46,6 +46,10 @@ public:
// is invalid until {@link #next()} is called for the first time.
virtual int32_t freq() const=0;
+ // Returns the current document norm. <p> This is invalid until {@link
+ // #next()} is called for the first time.
+ virtual int32_t norm() const=0;
+
// Moves to the next pair in the enumeration. <p> Returns true iff
there is
// such a next pair in the enumeration.
virtual bool next() =0;
@@ -58,6 +62,7 @@ public:
// <p>Returns the number of entries read. Zero is only returned when
the
// stream has been exhausted.
virtual int32_t read(int32_t* docs, int32_t* freqs, int32_t length)=0;
+ virtual int32_t read(int32_t* docs, int32_t* freqs, int32_t* norms,
int32_t length)=0;
virtual bool readRange(DocRange* docRange) = 0;
// Skips entries to the first beyond the current whose document number
is
@@ -86,6 +91,10 @@ public:
virtual int32_t docFreq() {
_CLTHROWA(CL_ERR_UnsupportedOperation, "TermDocs::docFreq does
not support this method.");
}
+
+ virtual int32_t docNorm() {
+ return 0;
+ }
};
diff --git a/src/core/CLucene/index/_MultiSegmentReader.h
b/src/core/CLucene/index/_MultiSegmentReader.h
index c5f8deeea23..e1bd40a2224 100644
--- a/src/core/CLucene/index/_MultiSegmentReader.h
+++ b/src/core/CLucene/index/_MultiSegmentReader.h
@@ -104,6 +104,13 @@ public:
//Returns the document frequency of the current term in the set
int32_t docFreq(const Term* t=NULL);
+
+ // Returns the document norm
+ int32_t docNorm(const TCHAR* field, int32_t n);
+
+ // Returns the total norm of all terms appearing in all documents in
this field
+ std::optional<uint64_t> sumTotalTermFreq(const TCHAR* field);
+
TermDocs* termDocs();
TermPositions* termPositions();
@@ -146,7 +153,7 @@ protected:
size_t pointer;
TermDocs* current; // == segTermDocs[pointer]
- TermDocs* termDocs(const int32_t i); //< internal use only
+ TermDocs* termDocs(const int32_t i, bool local_stats = false); //< internal
use only
virtual TermDocs* termDocs(IndexReader* reader);
void init(CL_NS(util)::ArrayBase<IndexReader*>* subReaders, const int32_t*
starts);
public:
@@ -156,13 +163,15 @@ public:
int32_t doc() const;
int32_t freq() const;
+ int32_t norm() const;
- void seek(TermEnum* termEnum);
- void seek(Term* tterm);
+ void seek(TermEnum* termEnum, bool load_stats = false);
+ void seek(Term* tterm, bool load_stats = false);
bool next();
/** Optimized implementation. */
int32_t read(int32_t* docs, int32_t* freqs, int32_t length);
+ int32_t read(int32_t* docs, int32_t* freqs, int32_t* norms , int32_t length);
bool readRange(DocRange* docRange) override;
/* A Possible future optimization could skip entire segments */
@@ -173,6 +182,7 @@ public:
virtual TermPositions* __asTermPositions();
int32_t docFreq() override;
+ int32_t docNorm() override;
};
diff --git a/src/core/CLucene/index/_SegmentHeader.h
b/src/core/CLucene/index/_SegmentHeader.h
index 6bf7d1819b7..fd58d2fbf9e 100644
--- a/src/core/CLucene/index/_SegmentHeader.h
+++ b/src/core/CLucene/index/_SegmentHeader.h
@@ -32,9 +32,11 @@ class SegmentReader;
class TermDocsBuffer {
public:
- TermDocsBuffer(CL_NS(store)::IndexInput* freqStream, bool hasProx,
IndexVersion indexVersion)
+ TermDocsBuffer(CL_NS(store)::IndexInput* freqStream, bool hasProx, uint32_t
maxDoc, IndexVersion indexVersion)
: docs_(PFOR_BLOCK_SIZE + 3),
freqs_(PFOR_BLOCK_SIZE + 3),
+ norms_(PFOR_BLOCK_SIZE + 3),
+ maxDoc(maxDoc),
freqStream_(freqStream),
hasProx_(hasProx),
indexVersion_(indexVersion) {
@@ -43,9 +45,11 @@ public:
~TermDocsBuffer() {
cur_doc_ = 0;
cur_freq_ = 0;
+ cur_norm_ = 0;
docs_.clear();
freqs_.clear();
+ norms_.clear();
freqStream_ = nullptr;
}
@@ -64,12 +68,29 @@ public:
return freqs_[cur_freq_++];
}
+ inline int32_t getNorm() {
+ if (cur_norm_ >= size_) {
+ refill();
+ }
+ if(cur_norm_ >= maxDoc) {
+ return 0;
+ }
+ return norms_[cur_norm_++];
+ }
+
void refill();
void readRange(DocRange* docRange);
+ // Set the per-document norms array; must be called before readRange or refill.
+ void setAllDocNorms(uint8_t* norms);
+
+ // Whether per-document statistics (norms) should be loaded.
+ void needLoadStats(bool load_stats = false);
+
private:
int32_t refillV0();
int32_t refillV1();
+ void refillNorm(int32_t size);
private:
uint32_t size_ = 0;
@@ -80,8 +101,19 @@ private:
uint32_t cur_freq_ = 0;
std::vector<uint32_t> freqs_;
+ //cur doc norm
+ uint32_t cur_norm_ = 0;
+ std::vector<uint32_t> norms_;
+
CL_NS(store)::IndexInput* freqStream_ = nullptr;
+ // need load statistic info
+ bool load_stats_ = false;
+
+ // save all doc norms in this term's field
+ uint32_t maxDoc = 0;
+ uint8_t* all_doc_norms_;
+
bool hasProx_ = false;
IndexVersion indexVersion_ = IndexVersion::kV0;
};
@@ -92,14 +124,19 @@ protected:
CL_NS(store)::IndexInput* freqStream;
int32_t count;
int32_t df;
+ int32_t maxDoc;
+
CL_NS(util)::BitSet* deletedDocs;
int32_t _doc = -1;
int32_t _freq = 0;
+ int32_t _norm = 0;
+
int32_t docs[PFOR_BLOCK_SIZE]; // buffered doc numbers
int32_t freqs[PFOR_BLOCK_SIZE]; // buffered term freqs
int32_t pointer;
int32_t pointerMax;
+ uint8_t* norms;
private:
int32_t skipInterval;
int32_t maxSkipLevels;
@@ -121,18 +158,22 @@ public:
SegmentTermDocs( const SegmentReader* Parent);
virtual ~SegmentTermDocs();
- virtual void seek(Term* term);
- virtual void seek(TermEnum* termEnum);
- virtual void seek(const TermInfo* ti,Term* term);
+ virtual void seek(Term* term, bool load_stats = false);
+ virtual void seek(TermEnum* termEnum, bool load_stats = false);
+ virtual void seek(const TermInfo* ti,Term* term, bool load_stats = false);
virtual void close();
virtual int32_t doc()const;
virtual int32_t freq()const;
+ virtual int32_t norm()const;
virtual bool next();
/** Optimized implementation. */
virtual int32_t read(int32_t* docs, int32_t* freqs, int32_t length);
+
+ virtual int32_t read(int32_t* docs, int32_t* freqs,int32_t* norms, int32_t
length);
+
bool readRange(DocRange* docRange) override;
/** Optimized implementation. */
@@ -142,6 +183,8 @@ public:
int32_t docFreq() override;
+ int32_t docNorm() override;
+
protected:
virtual void skippingDoc(){}
virtual void skipProx(const int64_t /*proxPointer*/, const int32_t
/*payloadLength*/){}
@@ -178,7 +221,7 @@ public:
virtual ~SegmentTermPositions();
private:
- void seek(const TermInfo* ti, Term* term);
+ void seek(const TermInfo* ti, Term* term, bool load_stats = false);
public:
void close();
@@ -193,6 +236,7 @@ protected:
public:
bool next();
int32_t read(int32_t* docs, int32_t* freqs, int32_t length);
+ int32_t read(int32_t* docs, int32_t* freqs, int32_t* norms, int32_t length);
bool readRange(DocRange* docRange) override;
protected:
@@ -227,10 +271,11 @@ private:
virtual TermPositions* __asTermPositions();
//resolve SegmentTermDocs/TermPositions ambiguity
- void seek(Term* term){ SegmentTermDocs::seek(term); }
- void seek(TermEnum* termEnum){ SegmentTermDocs::seek(termEnum); }
+ void seek(Term* term, bool load_stats = false){ SegmentTermDocs::seek(term,
load_stats); }
+ void seek(TermEnum* termEnum, bool load_stats = false){
SegmentTermDocs::seek(termEnum, load_stats); }
int32_t doc() const{ return SegmentTermDocs::doc(); }
int32_t freq() const{ return SegmentTermDocs::freq(); }
+ int32_t norm() const{ return SegmentTermDocs::norm(); }
bool skipTo(const int32_t target){ return SegmentTermDocs::skipTo(target); }
};
@@ -302,6 +347,7 @@ class SegmentReader: public DirectoryIndexReader {
CL_NS(util)::Deletor::Dummy,
Norm > NormsType;
NormsType _norms;
+ std::unordered_map<TCHAR, std::optional<int64_t>> sum_total_term_freq;
uint8_t* ones;
uint8_t* fakeNorms();
@@ -419,13 +465,18 @@ public:
///Returns the number of documents which contain the term t
int32_t docFreq(const Term* t);
+ ///Returns the norm of the document whose id is doc in this field
+ int32_t docNorm(const TCHAR* field, int32_t doc);
+
+ ///Returns the total norm of all terms appearing in all documents in this
field
+ std::optional<uint64_t> sumTotalTermFreq(const TCHAR* field);
+
///Returns the actual number of documents in the segment
int32_t numDocs();
///Returns the number of all the documents in the segment including the
ones that have
///been marked deleted
int32_t maxDoc() const;
-
void setTermInfosIndexDivisor(int32_t indexDivisor);
int32_t getTermInfosIndexDivisor();
@@ -434,6 +485,11 @@ public:
///Returns fake norms if norms aren't available
uint8_t* norms(const TCHAR* field);
+ uint8_t* norms(const TCHAR* field) const;
+
+ ///Returns the bytes array that holds the norms of a named field.
+ void norms(const TCHAR* field, uint8_t* bytes) const;
+
///Reads the Norms for field from disk
void norms(const TCHAR* field, uint8_t* bytes);
diff --git a/src/core/CLucene/search/IndexSearcher.cpp
b/src/core/CLucene/search/IndexSearcher.cpp
index f5b313a3b26..e20d6f44239 100644
--- a/src/core/CLucene/search/IndexSearcher.cpp
+++ b/src/core/CLucene/search/IndexSearcher.cpp
@@ -200,6 +200,21 @@ CL_NS_DEF(search)
return reader->docFreq(term);
}
+ // doc norm
+ int32_t IndexSearcher::docNorm(const TCHAR* field, int32_t doc) const {
+
+ CND_PRECONDITION(reader != NULL, "reader is NULL");
+
+ return reader->docNorm(field, doc);
+ }
+
+ std::optional<uint64_t> IndexSearcher::sumTotalTermFreq(const TCHAR* field)
const {
+
+ CND_PRECONDITION(reader != NULL, "reader is NULL");
+
+ return reader->sumTotalTermFreq(field);
+ }
+
_CL_DEPRECATED( doc(i, document) ) CL_NS(document)::Document*
IndexSearcher::doc(int32_t i){
CL_NS(document)::Document* ret = _CLNEW CL_NS(document)::Document;
if (!doc(i,ret) )
diff --git a/src/core/CLucene/search/IndexSearcher.h
b/src/core/CLucene/search/IndexSearcher.h
index 8f0b2000aff..6969b9ed409 100644
--- a/src/core/CLucene/search/IndexSearcher.h
+++ b/src/core/CLucene/search/IndexSearcher.h
@@ -77,6 +77,10 @@ public:
int32_t docFreq(const CL_NS(index)::Term* term) const;
+ int32_t docNorm(const TCHAR* field, int32_t doc) const;
+
+ std::optional<uint64_t> sumTotalTermFreq(const TCHAR* field) const;
+
bool doc(int32_t i, CL_NS(document)::Document& document);
bool doc(int32_t i, CL_NS(document)::Document* document);
_CL_DEPRECATED( doc(i, document) ) CL_NS(document)::Document*
doc(int32_t i);
diff --git a/src/core/CLucene/search/MultiSearcher.cpp
b/src/core/CLucene/search/MultiSearcher.cpp
index 0f2a6862706..872179ae7f3 100644
--- a/src/core/CLucene/search/MultiSearcher.cpp
+++ b/src/core/CLucene/search/MultiSearcher.cpp
@@ -5,8 +5,10 @@
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#include "CLucene/_ApiHeader.h"
+#include <optional>
#include "CLucene/index/IndexReader.h"
#include "MultiSearcher.h"
+
#include "SearchHeader.h"
#include "Query.h"
#include "_HitQueue.h"
@@ -74,6 +76,30 @@ CL_NS_DEF(search)
return docFreq;
}
+// doc norm
+int32_t MultiSearcher::docNorm(const TCHAR* field, int32_t n) const {
+
+ CND_PRECONDITION(reader != NULL, "reader is NULL");
+ int32_t i = subSearcher(n); // find searcher index
+ return searchables[i]->docNorm(field, n - starts[i]);
+ }
+
+std::optional<uint64_t> MultiSearcher::sumTotalTermFreq(const TCHAR* field)
const {
+ bool fieldHasNorm = false;
+ int64_t sum = 0;
+ for (int32_t i = 0; i < searchablesLen; ++i) {
+ std::optional<int64_t> norm =
searchables[i]->sumTotalTermFreq(field);
+ if (norm != std::nullopt) {
+ fieldHasNorm = true;
+ sum += norm.value();
+ }
+ }
+ if (fieldHasNorm) {
+ return sum;
+ }
+ return std::nullopt;
+ }
+
/** For use by {@link HitCollector} implementations. */
bool MultiSearcher::doc(int32_t n, Document* d) {
int32_t i = subSearcher(n); // find searcher index
diff --git a/src/core/CLucene/search/MultiSearcher.h
b/src/core/CLucene/search/MultiSearcher.h
index 17adba01e8d..1815b0fc575 100644
--- a/src/core/CLucene/search/MultiSearcher.h
+++ b/src/core/CLucene/search/MultiSearcher.h
@@ -40,6 +40,10 @@ CL_NS_DEF(search)
int32_t docFreq(const CL_NS(index)::Term* term) const ;
+ int32_t docNorm(const TCHAR* field, int32_t n) const;
+
+ std::optional<uint64_t> sumTotalTermFreq(const TCHAR* field) const;
+
/** For use by {@link HitCollector} implementations. */
bool doc(int32_t n, CL_NS(document)::Document* document);
diff --git a/src/core/CLucene/search/Searchable.h
b/src/core/CLucene/search/Searchable.h
index cb32a88579e..dabd7d45928 100644
--- a/src/core/CLucene/search/Searchable.h
+++ b/src/core/CLucene/search/Searchable.h
@@ -9,6 +9,7 @@
//#include "CLucene/index/IndexReader.h"
+#include <optional>
CL_CLASS_DEF(index,Term)
//#include "Filter.h"
CL_CLASS_DEF(document,Document)
@@ -67,7 +68,12 @@ CL_NS_DEF(search)
* @see IndexReader#docFreq(Term).
*/
virtual int32_t docFreq(const CL_NS(index)::Term* term) const = 0;
-
+ /** Expert: Returns the norm of the document whose id is <code>doc</code> in
the <code>field</code>.
+ */
+ virtual int32_t docNorm(const TCHAR* field, int32_t doc) const = 0;
+ /** Expert: Returns the total norm of all terms appearing in all
documents in this field
+ */
+ virtual std::optional<uint64_t> sumTotalTermFreq(const TCHAR* field)
const = 0;
/** Expert: Returns one greater than the largest possible document
number.
* Called by search code to compute term weights.
* @see IndexReader#maxDoc().
diff --git a/src/core/CLucene/search/Similarity.cpp
b/src/core/CLucene/search/Similarity.cpp
index b78ce677533..922d644ec8f 100644
--- a/src/core/CLucene/search/Similarity.cpp
+++ b/src/core/CLucene/search/Similarity.cpp
@@ -247,4 +247,12 @@ CL_NS_DEF(search)
return 0.0f;
return overlap / (float_t)maxOverlap;
}
+
+ LengthSimilarity::LengthSimilarity(){
+ }
+ LengthSimilarity::~LengthSimilarity(){
+ }
+ float_t LengthSimilarity::lengthNorm(const TCHAR* /*fieldName*/, int32_t
numTerms) {
+ return numTerms;
+ }
CL_NS_END
diff --git a/src/core/CLucene/search/Similarity.h
b/src/core/CLucene/search/Similarity.h
index 388898aba23..74b7a819d06 100644
--- a/src/core/CLucene/search/Similarity.h
+++ b/src/core/CLucene/search/Similarity.h
@@ -275,5 +275,14 @@ public:
float_t coord(int32_t overlap, int32_t maxOverlap);
};
+/** Expert: Length scoring implementation. */
+class CLUCENE_EXPORT LengthSimilarity: public DefaultSimilarity {
+public:
+ LengthSimilarity();
+ ~LengthSimilarity();
+ /** Implemented as <code>numTerms</code> (identity — no length normalization). */
+ float_t lengthNorm(const TCHAR* fieldName, int32_t numTerms) override;
+};
+
CL_NS_END
#endif
diff --git a/src/core/CLucene/search/query/TermIterator.h
b/src/core/CLucene/search/query/TermIterator.h
index 3eb22a254de..82c5c71027d 100644
--- a/src/core/CLucene/search/query/TermIterator.h
+++ b/src/core/CLucene/search/query/TermIterator.h
@@ -27,6 +27,10 @@ public:
return termDocs_->freq();
}
+ inline int32_t norm() const {
+ return termDocs_->norm();
+ }
+
inline int32_t nextDoc() const {
if (termDocs_->next()) {
return termDocs_->doc();
@@ -45,6 +49,10 @@ public:
return termDocs_->docFreq();
}
+ inline int32_t docNorm() const {
+ return termDocs_->docNorm();
+ }
+
inline bool readRange(DocRange* docRange) const {
return termDocs_->readRange(docRange);
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]