This is an automated email from the ASF dual-hosted git repository.
yangsiyu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 0e80bf4fa8b [fix](inverted index) fix BM25 LENGTH_TABLE using
byte4_to_int for correct norm decoding (#59713)
0e80bf4fa8b is described below
commit 0e80bf4fa8b68a3452feffefd285efb7d743a78a
Author: zzzxl <[email protected]>
AuthorDate: Tue Jan 13 12:11:57 2026 +0800
[fix](inverted index) fix BM25 LENGTH_TABLE using byte4_to_int for correct
norm decoding (#59713)
---
be/src/olap/collection_statistics.cpp | 59 ++++++++++++++--------
.../inverted_index/similarity/bm25_similarity.cpp | 2 +-
.../similarity/bm25_similarity_test.cpp | 28 +++++++++-
3 files changed, 65 insertions(+), 24 deletions(-)
diff --git a/be/src/olap/collection_statistics.cpp
b/be/src/olap/collection_statistics.cpp
index 714a19fe6b7..94130a1e6a7 100644
--- a/be/src/olap/collection_statistics.cpp
+++ b/be/src/olap/collection_statistics.cpp
@@ -17,6 +17,7 @@
#include "collection_statistics.h"
+#include <set>
#include <sstream>
#include "common/exception.h"
@@ -26,6 +27,7 @@
#include "olap/rowset/segment_v2/index_reader_helper.h"
#include "olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h"
#include "olap/rowset/segment_v2/inverted_index/util/string_helper.h"
+#include "util/uid_util.h"
#include "vec/exprs/vexpr.h"
#include "vec/exprs/vexpr_context.h"
#include "vec/exprs/vliteral.h"
@@ -64,31 +66,44 @@ Status CollectionStatistics::collect(
}
}
-#ifndef NDEBUG
- std::stringstream ss;
- ss << "term_num_docs: " << _total_num_docs;
- for (const auto& [ws_field_name, num_tokens] : _total_num_tokens) {
- ss << ", [field_name: " << StringHelper::to_string(ws_field_name)
- << ", num_tokens: " << num_tokens;
- auto it = _term_doc_freqs.find(ws_field_name);
- if (it != _term_doc_freqs.end()) {
- ss << ", terms: {";
- bool first = true;
- for (const auto& [term, doc_freq] : it->second) {
- if (!first) {
- ss << ", ";
- }
- ss << StringHelper::to_string(term) << ": " << doc_freq;
- first = false;
+ // Build a single-line log with query_id, tablet_ids, and per-field term
statistics
+ if (VLOG_IS_ON(1)) {
+ std::set<int64_t> tablet_ids;
+ for (const auto& rs_split : rs_splits) {
+ if (rs_split.rs_reader && rs_split.rs_reader->rowset()) {
+
tablet_ids.insert(rs_split.rs_reader->rowset()->rowset_meta()->tablet_id());
}
- ss << "}";
- } else {
- ss << ", (no term stats)";
}
- ss << "]";
+
+ std::ostringstream oss;
+ oss << "CollectionStatistics: query_id=" <<
print_id(state->query_id());
+
+ oss << ", tablet_ids=[";
+ bool first_tablet = true;
+ for (int64_t tid : tablet_ids) {
+ if (!first_tablet) oss << ",";
+ oss << tid;
+ first_tablet = false;
+ }
+ oss << "]";
+
+ oss << ", total_num_docs=" << _total_num_docs;
+
+ for (const auto& [ws_field_name, num_tokens] : _total_num_tokens) {
+ oss << ", {field=" << StringHelper::to_string(ws_field_name)
+ << ", num_tokens=" << num_tokens << ", terms=[";
+
+ bool first_term = true;
+ for (const auto& [term, doc_freq] :
_term_doc_freqs.at(ws_field_name)) {
+ if (!first_term) oss << ", ";
+ oss << "(" << StringHelper::to_string(term) << ":" << doc_freq
<< ")";
+ first_term = false;
+ }
+ oss << "]}";
+ }
+
+ VLOG(1) << oss.str();
}
- LOG(INFO) << "CollectionStatistics: " << ss.str();
-#endif
return Status::OK();
}
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/similarity/bm25_similarity.cpp
b/be/src/olap/rowset/segment_v2/inverted_index/similarity/bm25_similarity.cpp
index a01b467c1e0..88865e140a6 100644
---
a/be/src/olap/rowset/segment_v2/inverted_index/similarity/bm25_similarity.cpp
+++
b/be/src/olap/rowset/segment_v2/inverted_index/similarity/bm25_similarity.cpp
@@ -29,7 +29,7 @@ const int32_t BM25Similarity::NUM_FREE_VALUES = 255 -
static_cast<int>(MAX_INT4)
std::vector<float> BM25Similarity::LENGTH_TABLE = []() {
std::vector<float> table(256);
for (int32_t i = 0; i < 256; i++) {
- table[i] = int_to_byte4(i);
+ table[i] = static_cast<float>(byte4_to_int(static_cast<uint8_t>(i))); // entry i = decoded value of norm byte i (decode, not encode)
}
return table;
}();
diff --git
a/be/test/olap/rowset/segment_v2/inverted_index/similarity/bm25_similarity_test.cpp
b/be/test/olap/rowset/segment_v2/inverted_index/similarity/bm25_similarity_test.cpp
index 34711aaac8b..e6d827a01cd 100644
---
a/be/test/olap/rowset/segment_v2/inverted_index/similarity/bm25_similarity_test.cpp
+++
b/be/test/olap/rowset/segment_v2/inverted_index/similarity/bm25_similarity_test.cpp
@@ -119,7 +119,7 @@ TEST_F(BM25SimilarityTest, ScoreEdgeCasesTest) {
ASSERT_GT(score_high, 0.0f);
float score_max_norm = similarity_->score(1.0f, 255);
- ASSERT_GT(score_max_norm, 0.0f);
+ ASSERT_GE(score_max_norm, 0.0f);
}
TEST_F(BM25SimilarityTest, Int4EncodingTest) {
@@ -231,6 +231,32 @@ TEST_F(BM25SimilarityTest, LengthTableInitializationTest) {
}
}
+TEST_F(BM25SimilarityTest, LengthTableCorrectDecoding) { // verifies every LENGTH_TABLE entry equals the byte4_to_int() decoding of its index
+ for (int i = 0; i < 256; ++i) { // walk all 256 table indices, one per possible uint8_t value
+ float expected =
static_cast<float>(BM25Similarity::byte4_to_int(static_cast<uint8_t>(i)));
+ ASSERT_FLOAT_EQ(BM25Similarity::LENGTH_TABLE[i], expected)
+ << "LENGTH_TABLE[" << i << "] should equal byte4_to_int(" << i
<< ")";
+ }
+
+ std::vector<int32_t> test_doc_lengths = {0, 1, 10, 50, 100, 500, 1000,
5000, 10000};
+ for (int32_t doc_len : test_doc_lengths) { // round trip: encode each length, then decode via the table and via byte4_to_int()
+ uint8_t encoded_norm = BM25Similarity::int_to_byte4(doc_len);
+ float decoded_via_table = BM25Similarity::LENGTH_TABLE[encoded_norm];
+ int32_t decoded_via_func = BM25Similarity::byte4_to_int(encoded_norm);
+
+ ASSERT_FLOAT_EQ(decoded_via_table, // the table lookup and the direct decode must agree
static_cast<float>(decoded_via_func))
+ << "Mismatch for doc_len=" << doc_len << ", encoded_norm=" <<
(int)encoded_norm;
+
+ ASSERT_LE(decoded_via_func, doc_len) // the decoded value must not exceed the length that was encoded
+ << "Decoded value should be <= original for doc_len=" <<
doc_len;
+ }
+
+ for (int i = 0; i < 256; ++i) { // NOTE(review): appears to repeat the check made by the first loop in this test; confirm before removing
+ int32_t correct_value =
BM25Similarity::byte4_to_int(static_cast<uint8_t>(i));
+ ASSERT_FLOAT_EQ(BM25Similarity::LENGTH_TABLE[i],
static_cast<float>(correct_value));
+ }
+}
+
TEST_F(BM25SimilarityTest, DifferentParametersTest) {
mock_stats_->set_mock_idf(1.0f);
mock_stats_->set_mock_avg_dl(1.0f);
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]