github-actions[bot] commented on code in PR #63377:
URL: https://github.com/apache/doris/pull/63377#discussion_r3301029915


##########
cloud/src/recycler/recycler.cpp:
##########
@@ -2906,108 +2905,110 @@ int InstanceRecycler::recycle_tablets(int64_t 
table_id, int64_t index_id,
         }
         int64_t tablet_id = tablet_meta_pb.tablet_id();
 
-        if (!check_lazy_txn_finished(txn_kv_, instance_id_, 
tablet_meta_pb.tablet_id())) {
+        if (config::enable_recycler_check_lazy_txn_finished &&
+            !check_lazy_txn_finished(txn_kv_, instance_id_, 
tablet_meta_pb.tablet_id())) {
             LOG(WARNING) << "lazy txn not finished tablet_id=" << 
tablet_meta_pb.tablet_id();
             return -1;
         }
 
-        tablet_idx_keys.push_back(meta_tablet_idx_key({instance_id_, 
tablet_id}));
-        restore_job_keys.push_back(job_restore_tablet_key({instance_id_, 
tablet_id}));
-        if (is_multi_version) {
-            // The tablet index/inverted index are recycled in 
recycle_versioned_tablet.
-            tablet_compact_stats_keys.push_back(
-                    versioned::tablet_compact_stats_key({instance_id_, 
tablet_id}));
-            tablet_load_stats_keys.push_back(
-                    versioned::tablet_load_stats_key({instance_id_, 
tablet_id}));
-            versioned_meta_tablet_keys.push_back(
-                    versioned::meta_tablet_key({instance_id_, tablet_id}));
-        }
         TEST_SYNC_POINT_RETURN_WITH_VALUE("recycle_tablet::bypass_check", 
false);
         sync_executor.add([this, &num_recycled, tid = tablet_id, range_move = 
use_range_remove,
-                           &metrics_context, k]() mutable -> TabletKeyPair {
+                           &metrics_context, k]() mutable -> TabletInfo {
             if (recycle_tablet(tid, metrics_context) != 0) {
                 LOG_WARNING("failed to recycle tablet")
                         .tag("instance_id", instance_id_)
                         .tag("tablet_id", tid);
                 range_move = false;
-                return {std::string_view(), range_move};
+                return {.tablet_meta_key = std::string_view(),
+                        .tablet_id = tid,
+                        .range_move = range_move};
             }
             ++num_recycled;
             LOG(INFO) << "recycle_tablets scan, key=" << (k.empty() ? 
"(empty)" : hex(k));
-            return {k, range_move};
+            return {.tablet_meta_key = k, .tablet_id = tid, .range_move = 
range_move};
         });
         return 0;
     };
 
     // TODO(AlexYue): Add one ut to cover use_range_remove = false
     auto loop_done = [&, this]() -> int {
+        int ret = 0;
         bool finished = true;
-        auto tablet_keys = sync_executor.when_all(&finished);
+        bool has_empty_key = false;
+        auto tablets_info = sync_executor.when_all(&finished);
         if (!finished) {
             LOG_WARNING("failed to recycle tablet").tag("instance_id", 
instance_id_);
             return -1;
         }
-        if (tablet_keys.empty() && tablet_idx_keys.empty()) return 0;
-        if (!tablet_keys.empty() &&
-            std::ranges::all_of(tablet_keys, [](const auto& k) { return 
k.first.empty(); })) {
-            return -1;
+
+        size_t size_before_erase = tablets_info.size();
+        std::erase_if(tablets_info, [](const TabletInfo& t) { return 
t.tablet_meta_key.empty(); });
+        if (tablets_info.empty()) {
+            return size_before_erase == 0 ? 0 : -1;
+        } else if (size_before_erase != tablets_info.size()) {
+            has_empty_key = true;
         }
+
+        ret = has_empty_key ? -1 : 0;
         // sort the vector using key's order
-        std::sort(tablet_keys.begin(), tablet_keys.end(),
-                  [](const auto& prev, const auto& last) { return prev.first < 
last.first; });
+        std::ranges::sort(tablets_info, [](const auto& prev, const auto& last) 
{
+            return prev.tablet_meta_key < last.tablet_meta_key;
+        });
         bool use_range_remove = true;
-        for (auto& [_, remove] : tablet_keys) {
-            if (!remove) {
-                use_range_remove = remove;
+        for (auto& tablet_info : tablets_info) {
+            if (!has_empty_key && !tablet_info.range_move) {

Review Comment:
   This keeps `use_range_remove` true whenever any task failed (`has_empty_key 
== true`). In a mixed batch, the failed tablet key is erased from 
`tablets_info`; if the remaining successful keys sort around it, the later 
range remove deletes the failed tablet's `meta_tablet_key` even though 
`recycle_tablet()` returned failure and its data/versioned cleanup did not 
complete. For example successes A/C and failed B will call `remove(A, C + 
'\0')`, deleting B's tablet meta and making the failed cleanup non-retryable. 
Please force point deletes whenever `has_empty_key` is true, or return before 
deleting any tablet metadata in that batch.



##########
cloud/test/recycler_test.cpp:
##########
@@ -8694,6 +8680,160 @@ TEST(RecyclerTest, 
recycle_tablet_with_delete_file_failure) {
     }
 }
 
+TEST(RecyclerTest, recycle_tablet_with_delete_file_partial_failure) {
+    // If part of object deletion fails, recycle_tablets should delete KV only 
for tablets whose
+    // objects were deleted successfully, but still return failure for the 
whole round.
+
+    auto* sp = SyncPoint::get_instance();
+    DORIS_CLOUD_DEFER {
+        sp->clear_all_call_backs();
+        sp->clear_trace();
+        sp->disable_processing();
+    };
+
+    auto txn_kv = std::make_shared<MemTxnKv>();
+    ASSERT_EQ(txn_kv->init(), 0);
+
+    InstanceInfoPB instance;
+    instance.set_instance_id(::instance_id);
+    auto obj_info = instance.add_obj_info();
+    obj_info->set_id("recycle_tablet_with_delete_file_failure");
+    obj_info->set_ak(config::test_s3_ak);
+    obj_info->set_sk(config::test_s3_sk);
+    obj_info->set_endpoint(config::test_s3_endpoint);
+    obj_info->set_region(config::test_s3_region);
+    obj_info->set_bucket(config::test_s3_bucket);
+    obj_info->set_prefix("recycle_tablet_with_delete_file_failure");
+
+    InstanceRecycler recycler(txn_kv, instance, thread_group,
+                              std::make_shared<TxnLazyCommitter>(txn_kv));
+    ASSERT_EQ(recycler.init(), 0);
+
+    doris::TabletSchemaCloudPB schema;
+    schema.set_schema_version(1);
+
+    constexpr int64_t table_id = 21000;
+    constexpr int64_t index_id = 21001;
+    constexpr int64_t partition_id = 21002;
+    constexpr int64_t tablet_id = 21003;
+
+    auto accessor = recycler.accessor_map_.begin()->second;
+
+    // Create 5 tablet with its metadata and index keys
+    for (int i = 0; i < 5; ++i) {
+        ASSERT_EQ(create_tablet(txn_kv.get(), table_id, index_id, 
partition_id, tablet_id + i), 0);
+    }
+
+    // Create some committed rowsets for the tablet (in meta_rowset_key)
+    // These will be picked up by recycle_tablet when it scans for rowsets
+    for (int i = 0; i < 5; ++i) {
+        for (int j = 0; j < 5; j++) {
+            create_committed_rowset(txn_kv.get(), accessor.get(),
+                                    "recycle_tablet_with_delete_file_failure", 
tablet_id + i, j,
+                                    index_id, 2, 1, true);
+        }
+    }
+
+    // Create partition version kv (required for lazy txn check)
+    ASSERT_EQ(create_partition_version_kv(txn_kv.get(), table_id, 
partition_id), 0);
+
+    // Inject failure: make delete_directory return -1
+    // This simulates the scenario where accessor fails to delete directory 
from object storage
+    std::atomic<int> delete_directory_call_count {0};
+    size_t partial_cnt = 0;
+    sp->set_call_back("SyncExecutor::Task::bypass_cancel",
+                      [](auto&& args) { *try_any_cast<bool*>(args[0]) = true; 
});
+    sp->set_call_back("MockAccessor::delete_prefix",
+                      [&delete_directory_call_count, &partial_cnt](auto&& 
args) {
+                          if (partial_cnt++ < 2) {
+                              auto* ret = try_any_cast_ret<int>(args);

Review Comment:
   This callback is invoked by multiple `recycle_tablet_pool` tasks 
concurrently, but `partial_cnt` is a plain `size_t`. The `partial_cnt++` is a 
data race and also makes the test nondeterministic about which tablet IDs fail, 
so the partial-failure coverage can be flaky. Please use an atomic counter, 
e.g. `std::atomic<size_t> partial_cnt {0};` with `fetch_add`, and make the 
intended failed/successful key layout deterministic enough to cover an 
interleaved failed key.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to