github-actions[bot] commented on code in PR #64313:
URL: https://github.com/apache/doris/pull/64313#discussion_r3380307659
##########
cloud/src/recycler/checker.cpp:
##########
@@ -1338,6 +1353,116 @@ int InstanceChecker::do_delete_bitmap_inverted_check() {
return (leaked_delete_bitmaps > 0 || abnormal_delete_bitmaps > 0) ? 1 : 0;
}
+int InstanceChecker::collect_unexpired_job_tmp_rowsets(
+ std::unordered_map<int64_t, std::unordered_set<std::string>>&
tmp_rowsets) {
+ static constexpr int64_t max_unexpired_tmp_rowsets = 1000;
+ auto begin = meta_rowset_tmp_key({instance_id_, 0, 0});
+ auto end = meta_rowset_tmp_key({instance_id_, INT64_MAX, 0});
+ std::unique_ptr<RangeGetIterator> it;
+ int64_t num_scanned = 0;
+ int64_t num_non_job = 0;
+ int64_t num_skipped_non_job_txns = 0;
+ int64_t num_unexpired = 0;
+ int64_t num_expired = 0;
+ int64_t last_txn_id = -1;
+ int64_t current_time =
duration_cast<seconds>(system_clock::now().time_since_epoch()).count();
+
+ while (it == nullptr /* may be not init */ || (it->more() && !stopped())) {
+ std::unique_ptr<Transaction> txn;
+ TxnErrorCode err = txn_kv_->create_txn(&txn);
+ if (err != TxnErrorCode::TXN_OK) {
+ LOG(WARNING) << "failed to create txn";
+ return -1;
+ }
+ err = txn->get(begin, end, &it);
+ if (err != TxnErrorCode::TXN_OK) {
+ LOG(WARNING) << "failed to get tmp rowset kv, err=" << err;
+ return -1;
+ }
+ if (!it->has_next()) {
+ break;
+ }
+ while (it->has_next() && !stopped()) {
+ auto [k, v] = it->next();
+ ++num_scanned;
+
+ std::string_view k1 = k;
+ k1.remove_prefix(1);
+ std::vector<std::tuple<std::variant<int64_t, std::string>, int,
int>> out;
+ if (decode_key(&k1, &out) != 0 || out.size() < 5) {
+ LOG(WARNING) << "malformed tmp rowset key, key=" << hex(k);
+ return -1;
+ }
+ // 0x01 "meta" ${instance_id} "rowset_tmp" ${txn_id} ${tablet_id}
-> RowsetMetaCloudPB
+ auto txn_id = std::get<int64_t>(std::get<0>(out[3]));
+ bool is_first_rowset_of_txn = last_txn_id != txn_id;
+ last_txn_id = txn_id;
+
+ doris::RowsetMetaCloudPB rowset;
+ if (!rowset.ParseFromArray(v.data(), v.size())) {
+ LOG(WARNING) << "malformed tmp rowset meta, key=" << hex(k);
+ return -1;
+ }
+ if (!rowset.has_job_id() || rowset.job_id().empty()) {
+ ++num_non_job;
+ if (is_first_rowset_of_txn) {
+ ++num_skipped_non_job_txns;
+ if (txn_id == INT64_MAX) {
+ begin = end;
+ } else {
+ begin = meta_rowset_tmp_key({instance_id_, txn_id + 1,
0});
+ }
+ it.reset();
+ break;
+ }
+ if (!it->has_next()) {
+ begin = k;
+ begin.push_back('\x00');
+ }
+ continue;
+ }
+
+ // Must use the same threshold as the recycler so that a delete
bitmap is never
+ // reported as leaked while its tmp rowset is still alive from the
recycler's view.
+ // `earlest_ts` is a local sentinel initialized to 0 on purpose:
it keeps the value
+ // below any real expiration so the helper never updates the
recycler's
+ // earliest-ts bvar (the checker must not touch the recycler's
metrics).
+ int64_t earlest_ts = 0;
+ int64_t expiration =
+ calculate_tmp_rowset_expired_time(instance_id_, rowset,
&earlest_ts);
+ if (current_time < expiration) {
+ tmp_rowsets[rowset.tablet_id()].insert(rowset.rowset_id_v2());
+ ++num_unexpired;
+ if (num_unexpired >= max_unexpired_tmp_rowsets) {
+ LOG(WARNING)
+ << "collect unexpired tmp rowsets for delete
bitmap checker reached "
+ << "limit, remaining tmp rowsets will not be
considered and may cause "
+ << "false positives, instance_id=" << instance_id_
+ << ", num_scanned=" << num_scanned << ",
num_non_job=" << num_non_job
+ << ", num_skipped_non_job_txns=" <<
num_skipped_non_job_txns
+ << ", num_unexpired=" << num_unexpired
+ << ", num_expired=" << num_expired
+ << ", limit=" << max_unexpired_tmp_rowsets;
+ return 0;
+ }
Review Comment:
Returning success here lets `do_delete_bitmap_inverted_check()` continue
with an intentionally incomplete `unexpired_tmp_rowsets` map. In a cluster with
more than 1000 unexpired compaction/schema-change tmp rowsets, any live tmp
rowset after the cap is not considered by the later
`belongs_to_unexpired_tmp_rowset` check, so its delete bitmap is reported as
leaked and the checker returns failure even though the rowset is still within
the same retention window used by the recycler. Please either keep collecting
all live tmp rowsets or fail/skip this check when the cap is reached, rather
than producing a known false positive.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]