This is an automated email from the ASF dual-hosted git repository.

gavinchou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 28b3da8e0c6 [fix](cloud) skip stale tablet cache check for STOP_TOKEN (#63520)
28b3da8e0c6 is described below

commit 28b3da8e0c65ba632463fdb9e9f39de69c5afe39
Author: Gavin Chou <[email protected]>
AuthorDate: Sat May 23 15:41:45 2026 +0800

    [fix](cloud) skip stale tablet cache check for STOP_TOKEN (#63520)
    
    Problem Summary: In cloud mode, schema change on MOW tables can fail
    when registering a STOP_TOKEN compaction job. STOP_TOKEN is a lock
    marker and does not compact rowsets, but the meta-service previously
    applied the stale tablet cache check to all compaction types. If another
    BE advanced the tablet compaction counters while the schema change BE
    still held older cached counters, STOP_TOKEN registration could be
    rejected as STALE_TABLET_CACHE and the ALTER task failed. This change
    skips the stale tablet cache check for STOP_TOKEN while preserving the
    check for real compaction jobs.
    
    ### Release note
    
    Fix cloud mode schema change failures caused by STOP_TOKEN registration
    being rejected by stale tablet cache checks.
    
    ### Check List (For Author)
    
    - Test: Unit Test
    - Added `StopTokenSkipsStaleTabletCacheCheck` in
    `cloud/test/meta_service_job_test.cpp`; not run in this session.
    - Behavior changed: Yes. STOP_TOKEN compaction jobs no longer fail stale
    tablet cache validation because they do not read or compact rowsets.
    - Does this need documentation: No
    
    Co-authored-by: Siyang Tang <[email protected]>
---
 cloud/src/meta-service/meta_service_job.cpp | 12 ++++--
 cloud/test/meta_service_job_test.cpp        | 65 +++++++++++++++++++++++++++++
 2 files changed, 74 insertions(+), 3 deletions(-)

diff --git a/cloud/src/meta-service/meta_service_job.cpp b/cloud/src/meta-service/meta_service_job.cpp
index e335c0a9b40..dad84f6a36a 100644
--- a/cloud/src/meta-service/meta_service_job.cpp
+++ b/cloud/src/meta-service/meta_service_job.cpp
@@ -162,9 +162,15 @@ void start_compaction_job(MetaServiceCode& code, std::string& msg, std::stringst
             return;
         }
     }
-
-    if (compaction.base_compaction_cnt() < stats.base_compaction_cnt() ||
-        compaction.cumulative_compaction_cnt() < 
stats.cumulative_compaction_cnt()) {
+    // STOP_TOKEN is a lock marker used by schema change to block concurrent compactions during
+    // delete bitmap recalculation on MOW tables. It does not perform actual compaction, so the
+    // stale tablet cache check (which guards against compacting on outdated rowset metadata) is
+    // not meaningful for it and must be skipped to avoid spurious failures when the BE's cached
+    // compaction counts lag behind the meta-service due to a concurrent compaction completing
+    // on another BE node (see CORE-5964).
+    if (compaction.type() != TabletCompactionJobPB::STOP_TOKEN &&
+        (compaction.base_compaction_cnt() < stats.base_compaction_cnt() ||
+         compaction.cumulative_compaction_cnt() < 
stats.cumulative_compaction_cnt())) {
         code = MetaServiceCode::STALE_TABLET_CACHE;
         SS << "could not perform compaction on expired tablet cache."
            << " req_base_compaction_cnt=" << compaction.base_compaction_cnt()
diff --git a/cloud/test/meta_service_job_test.cpp b/cloud/test/meta_service_job_test.cpp
index d5c837e8711..1926f6c600a 100644
--- a/cloud/test/meta_service_job_test.cpp
+++ b/cloud/test/meta_service_job_test.cpp
@@ -1645,6 +1645,71 @@ void check_job_key(MetaServiceProxy* meta_service, std::string instance_id, int6
     }
 }
 
+// Regression test for CORE-5964: STOP_TOKEN should not be rejected by the stale tablet
+// cache check even when the BE's cached compaction counts lag behind the meta-service.
+// STOP_TOKEN is a lock marker used by schema change (MOW table) to block concurrent
+// compactions during delete bitmap recalculation -- it does not perform actual compaction
+// work, so verifying compaction count freshness is meaningless for it.
+TEST(MetaServiceJobTest, StopTokenSkipsStaleTabletCacheCheck) {
+    auto meta_service = get_meta_service();
+
+    auto sp = SyncPoint::get_instance();
+    DORIS_CLOUD_DEFER {
+        SyncPoint::get_instance()->clear_all_call_backs();
+    };
+    sp->set_call_back("get_instance_id", [&](auto&& args) {
+        auto* ret = try_any_cast_ret<std::string>(args);
+        ret->first = instance_id;
+        ret->second = true;
+    });
+    sp->enable_processing();
+
+    int64_t table_id = 1, index_id = 2, partition_id = 3, tablet_id = 101;
+
+    // Set up tablet index
+    auto index_key = meta_tablet_idx_key({instance_id, tablet_id});
+    TabletIndexPB idx_pb;
+    idx_pb.set_table_id(table_id);
+    idx_pb.set_index_id(index_id);
+    idx_pb.set_partition_id(partition_id);
+    idx_pb.set_tablet_id(tablet_id);
+    std::unique_ptr<Transaction> txn;
+    ASSERT_EQ(meta_service->txn_kv()->create_txn(&txn), TxnErrorCode::TXN_OK);
+    txn->put(index_key, idx_pb.SerializeAsString());
+
+    // Simulate meta-service state where cumulative_compaction_cnt=9 (advanced by another BE)
+    std::string stats_key =
+            stats_tablet_key({instance_id, table_id, index_id, partition_id, 
tablet_id});
+    TabletStatsPB stats;
+    stats.set_base_compaction_cnt(0);
+    stats.set_cumulative_compaction_cnt(9);
+    txn->put(stats_key, stats.SerializeAsString());
+    ASSERT_EQ(txn->commit(), TxnErrorCode::TXN_OK);
+
+    // A regular CUMULATIVE compaction with stale counts (req=8 < actual=9) must be rejected.
+    {
+        StartTabletJobResponse res;
+        start_compaction_job(meta_service.get(), tablet_id, "cumu_job", 
"ip:port",
+                             /*base_cnt=*/0, /*cumu_cnt=*/8, 
TabletCompactionJobPB::CUMULATIVE,
+                             res);
+        ASSERT_EQ(res.status().code(), MetaServiceCode::STALE_TABLET_CACHE)
+                << "CUMULATIVE with stale counts should be rejected";
+    }
+
+    // A STOP_TOKEN with the same stale counts must NOT be rejected (CORE-5964 regression).
+    // The BE's cached cumulative_compaction_cnt=8 lags behind the actual value=9 on the
+    // meta-service side, but STOP_TOKEN registration must still succeed.
+    {
+        StartTabletJobResponse res;
+        start_compaction_job(meta_service.get(), tablet_id, "stop_token_job", 
"ip:port",
+                             /*base_cnt=*/0, /*cumu_cnt=*/8, 
TabletCompactionJobPB::STOP_TOKEN,
+                             res);
+        ASSERT_EQ(res.status().code(), MetaServiceCode::OK)
+                << "STOP_TOKEN with stale counts should NOT be rejected; got: "
+                << res.status().msg();
+    }
+}
+
 TEST(MetaServiceJobTest, DeleteBitmapUpdateLockCompatibilityTest) {
     auto meta_service = get_meta_service();
     auto sp = SyncPoint::get_instance();


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to