This is an automated email from the ASF dual-hosted git repository.

kxiao pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.0 by this push:
     new 64d27f3cdc0 [fix](gc tablet) fix get shutdown tablet cost a lot time 
#27693 (#28543)
64d27f3cdc0 is described below

commit 64d27f3cdc0e8fd3433c79950ae2a5bb0e03f6c9
Author: yujun <[email protected]>
AuthorDate: Mon Dec 18 15:59:37 2023 +0800

    [fix](gc tablet) fix get shutdown tablet cost a lot time #27693 (#28543)
---
 be/src/olap/tablet_manager.cpp | 256 +++++++++++++++++++++++------------------
 be/src/olap/tablet_manager.h   |  10 +-
 2 files changed, 148 insertions(+), 118 deletions(-)

diff --git a/be/src/olap/tablet_manager.cpp b/be/src/olap/tablet_manager.cpp
index f6cfb6b6dc4..7aa5c52534f 100644
--- a/be/src/olap/tablet_manager.cpp
+++ b/be/src/olap/tablet_manager.cpp
@@ -638,24 +638,12 @@ TabletSharedPtr 
TabletManager::_get_tablet_unlocked(TTabletId tablet_id, bool in
     TabletSharedPtr tablet;
     tablet = _get_tablet_unlocked(tablet_id);
     if (tablet == nullptr && include_deleted) {
-        {
-            std::shared_lock rdlock(_shutdown_tablets_lock);
-            for (auto& deleted_tablet : _shutdown_tablets) {
-                CHECK(deleted_tablet != nullptr) << "deleted tablet is 
nullptr";
-                if (deleted_tablet->tablet_id() == tablet_id) {
-                    tablet = deleted_tablet;
-                    break;
-                }
-            }
-        }
-        if (tablet == nullptr) {
-            std::shared_lock rdlock(_shutdown_deleting_tablets_lock);
-            for (auto& deleted_tablet : _shutdown_deleting_tablets) {
-                CHECK(deleted_tablet != nullptr) << "deleted tablet is 
nullptr";
-                if (deleted_tablet->tablet_id() == tablet_id) {
-                    tablet = deleted_tablet;
-                    break;
-                }
+        std::shared_lock rdlock(_shutdown_tablets_lock);
+        for (auto& deleted_tablet : _shutdown_tablets) {
+            CHECK(deleted_tablet != nullptr) << "deleted tablet is nullptr";
+            if (deleted_tablet->tablet_id() == tablet_id) {
+                tablet = deleted_tablet;
+                break;
             }
         }
     }
@@ -1062,113 +1050,155 @@ Status 
TabletManager::build_all_report_tablets_info(std::map<TTabletId, TTablet>
 }
 
 Status TabletManager::start_trash_sweep() {
+    std::unique_lock<std::mutex> lock(_gc_tablets_lock, std::defer_lock);
+    if (!lock.try_lock()) {
+        return Status::OK();
+    }
+
     SCOPED_CONSUME_MEM_TRACKER(_mem_tracker);
+    for_each_tablet([](const TabletSharedPtr& tablet) { 
tablet->delete_expired_stale_rowset(); },
+                    filter_all_tablets);
+
+    std::list<TabletSharedPtr>::iterator last_it;
     {
-        for_each_tablet(
-                [](const TabletSharedPtr& tablet) { 
tablet->delete_expired_stale_rowset(); },
-                filter_all_tablets);
+        std::shared_lock rdlock(_shutdown_tablets_lock);
+        last_it = _shutdown_tablets.begin();
+        if (last_it == _shutdown_tablets.end()) {
+            return Status::OK();
+        }
     }
 
-    int32_t clean_num = 0;
-    do {
-#ifndef BE_TEST
-        sleep(1);
-#endif
-        clean_num = 0;
-        // should get write lock here, because it will remove tablet from 
shut_down_tablets
-        // and get tablet will access shut_down_tablets
-        {
-            std::lock_guard<std::shared_mutex> wrlock1(_shutdown_tablets_lock);
-            std::lock_guard<std::shared_mutex> 
wrlock2(_shutdown_deleting_tablets_lock);
-            for (const auto& tablet : _shutdown_tablets) {
-                _shutdown_deleting_tablets.push_back(tablet);
+    auto get_batch_tablets = [this, &last_it](int limit) {
+        std::vector<TabletSharedPtr> batch_tablets;
+        std::lock_guard<std::shared_mutex> wrdlock(_shutdown_tablets_lock);
+        while (last_it != _shutdown_tablets.end() && batch_tablets.size() < 
limit) {
+            // it means current tablet is referenced by other thread
+            if (last_it->use_count() > 1) {
+                last_it++;
+            } else {
+                batch_tablets.push_back(*last_it);
+                last_it = _shutdown_tablets.erase(last_it);
             }
-            _shutdown_tablets.clear();
         }
-        std::lock_guard<std::shared_mutex> 
wrlock(_shutdown_deleting_tablets_lock);
-        auto it = _shutdown_deleting_tablets.begin();
-        while (it != _shutdown_deleting_tablets.end()) {
-            // check if the meta has the tablet info and its state is shutdown
-            if (it->use_count() > 1) {
-                // it means current tablet is referenced by other thread
-                ++it;
-                continue;
-            }
-            TabletMetaSharedPtr tablet_meta(new TabletMeta());
-            Status check_st = TabletMetaManager::get_meta((*it)->data_dir(), 
(*it)->tablet_id(),
-                                                          
(*it)->schema_hash(), tablet_meta);
-            if (check_st.ok()) {
-                if (tablet_meta->tablet_state() != TABLET_SHUTDOWN ||
-                    tablet_meta->tablet_uid() != (*it)->tablet_uid()) {
-                    LOG(WARNING) << "tablet's state changed to normal, skip 
remove dirs"
-                                 << " tablet id = " << tablet_meta->tablet_id()
-                                 << " schema hash = " << 
tablet_meta->schema_hash()
-                                 << " old tablet_uid=" << (*it)->tablet_uid()
-                                 << " cur tablet_uid=" << 
tablet_meta->tablet_uid();
-                    // remove it from list
-                    it = _shutdown_deleting_tablets.erase(it);
-                    continue;
-                }
-                // move data to trash
-                const auto& tablet_path = (*it)->tablet_path();
-                bool exists = false;
-                Status exists_st = 
io::global_local_filesystem()->exists(tablet_path, &exists);
-                if (!exists_st) {
-                    continue;
-                }
-                if (exists) {
-                    // take snapshot of tablet meta
-                    auto meta_file_path = fmt::format("{}/{}.hdr", 
tablet_path, (*it)->tablet_id());
-                    (*it)->tablet_meta()->save(meta_file_path);
-                    LOG(INFO) << "start to move tablet to trash. " << 
tablet_path;
-                    Status rm_st = 
(*it)->data_dir()->move_to_trash(tablet_path);
-                    if (rm_st != Status::OK()) {
-                        LOG(WARNING) << "fail to move dir to trash. " << 
tablet_path;
-                        ++it;
-                        continue;
-                    }
-                }
-                // remove tablet meta
-                TabletMetaManager::remove((*it)->data_dir(), 
(*it)->tablet_id(),
-                                          (*it)->schema_hash());
-                LOG(INFO) << "successfully move tablet to trash. "
-                          << "tablet_id=" << (*it)->tablet_id()
-                          << ", schema_hash=" << (*it)->schema_hash()
-                          << ", tablet_path=" << tablet_path;
-                it = _shutdown_deleting_tablets.erase(it);
-                ++clean_num;
-            } else {
-                // if could not find tablet info in meta store, then check if 
dir existed
-                const auto& tablet_path = (*it)->tablet_path();
-                bool exists = false;
-                Status exists_st = 
io::global_local_filesystem()->exists(tablet_path, &exists);
-                if (!exists_st) {
-                    continue;
-                }
-                if (exists) {
-                    LOG(WARNING) << "errors while load meta from store, skip 
this tablet. "
-                                 << "tablet_id=" << (*it)->tablet_id()
-                                 << ", schema_hash=" << (*it)->schema_hash();
-                    ++it;
+
+        return batch_tablets;
+    };
+
+    std::list<TabletSharedPtr> failed_tablets;
+    // return true if need continue delete
+    auto delete_one_batch = [this, get_batch_tablets, &failed_tablets]() -> 
bool {
+        int limit = 200;
+        for (;;) {
+            auto batch_tablets = get_batch_tablets(limit);
+            for (const auto& tablet : batch_tablets) {
+                if (_move_tablet_to_trash(tablet)) {
+                    limit--;
                 } else {
-                    LOG(INFO) << "could not find tablet dir, skip it and 
remove it from gc-queue. "
-                              << "tablet_id=" << (*it)->tablet_id()
-                              << ", schema_hash=" << (*it)->schema_hash()
-                              << ", tablet_path=" << tablet_path;
-                    it = _shutdown_deleting_tablets.erase(it);
+                    failed_tablets.push_back(tablet);
                 }
             }
-
-            // yield to avoid holding _tablet_map_lock for too long
-            if (clean_num >= 200) {
-                break;
+            if (limit <= 0) {
+                return true;
+            }
+            if (batch_tablets.empty()) {
+                return false;
             }
         }
-        // >= 200 means there may be more tablets need to be handled
-        // So continue
-    } while (clean_num >= 200);
+
+        return false;
+    };
+
+    while (delete_one_batch()) {
+#ifndef BE_TEST
+        sleep(1);
+#endif
+    }
+
+    if (!failed_tablets.empty()) {
+        std::lock_guard<std::shared_mutex> wrlock(_shutdown_tablets_lock);
+        _shutdown_tablets.splice(_shutdown_tablets.end(), failed_tablets);
+    }
+
     return Status::OK();
-} // start_trash_sweep
+}
+
+bool TabletManager::_move_tablet_to_trash(const TabletSharedPtr& tablet) {
+    TabletMetaSharedPtr tablet_meta(new TabletMeta());
+    int64_t get_meta_ts = MonotonicMicros();
+    Status check_st = TabletMetaManager::get_meta(tablet->data_dir(), 
tablet->tablet_id(),
+                                                  tablet->schema_hash(), 
tablet_meta);
+    if (check_st.ok()) {
+        if (tablet_meta->tablet_state() != TABLET_SHUTDOWN ||
+            tablet_meta->tablet_uid() != tablet->tablet_uid()) {
+            LOG(WARNING) << "tablet's state changed to normal, skip remove 
dirs"
+                         << " tablet id = " << tablet_meta->tablet_id()
+                         << " schema hash = " << tablet_meta->schema_hash()
+                         << " old tablet_uid=" << tablet->tablet_uid()
+                         << " cur tablet_uid=" << tablet_meta->tablet_uid();
+            return true;
+        }
+        // move data to trash
+        const auto& tablet_path = tablet->tablet_path();
+        bool exists = false;
+        Status exists_st = io::global_local_filesystem()->exists(tablet_path, 
&exists);
+        if (!exists_st) {
+            return false;
+        }
+        if (exists) {
+            // take snapshot of tablet meta
+            auto meta_file_path = fmt::format("{}/{}.hdr", tablet_path, 
tablet->tablet_id());
+            int64_t save_meta_ts = MonotonicMicros();
+            auto save_st = tablet->tablet_meta()->save(meta_file_path);
+            if (!save_st.ok()) {
+                LOG(WARNING) << "failed to save meta, tablet_id=" << 
tablet_meta->tablet_id()
+                             << ", tablet_uid=" << tablet_meta->tablet_uid()
+                             << ", error=" << save_st;
+                return false;
+            }
+            int64_t now = MonotonicMicros();
+            LOG(INFO) << "start to move tablet to trash. " << tablet_path
+                      << ". rocksdb get meta cost " << (save_meta_ts - 
get_meta_ts)
+                      << " us, rocksdb save meta cost " << (now - 
save_meta_ts) << " us";
+            Status rm_st = tablet->data_dir()->move_to_trash(tablet_path);
+            if (!rm_st.ok()) {
+                LOG(WARNING) << "fail to move dir to trash. " << tablet_path;
+                return false;
+            }
+        }
+        // remove tablet meta
+        auto remove_st = TabletMetaManager::remove(tablet->data_dir(), 
tablet->tablet_id(),
+                                                   tablet->schema_hash());
+        if (!remove_st.ok()) {
+            LOG(WARNING) << "failed to remove meta, tablet_id=" << 
tablet_meta->tablet_id()
+                         << ", tablet_uid=" << tablet_meta->tablet_uid() << ", 
error=" << remove_st;
+            return false;
+        }
+        LOG(INFO) << "successfully move tablet to trash. "
+                  << "tablet_id=" << tablet->tablet_id()
+                  << ", schema_hash=" << tablet->schema_hash() << ", 
tablet_path=" << tablet_path;
+        return true;
+    } else {
+        // if could not find tablet info in meta store, then check if dir 
existed
+        const auto& tablet_path = tablet->tablet_path();
+        bool exists = false;
+        Status exists_st = io::global_local_filesystem()->exists(tablet_path, 
&exists);
+        if (!exists_st) {
+            return false;
+        }
+        if (exists) {
+            LOG(WARNING) << "errors while load meta from store, skip this 
tablet. "
+                         << "tablet_id=" << tablet->tablet_id()
+                         << ", schema_hash=" << tablet->schema_hash();
+            return false;
+        } else {
+            LOG(INFO) << "could not find tablet dir, skip it and remove it 
from gc-queue. "
+                      << "tablet_id=" << tablet->tablet_id()
+                      << ", schema_hash=" << tablet->schema_hash()
+                      << ", tablet_path=" << tablet_path;
+            return true;
+        }
+    }
+}
 
 bool TabletManager::register_clone_tablet(int64_t tablet_id) {
     tablets_shard& shard = _get_tablets_shard(tablet_id);
diff --git a/be/src/olap/tablet_manager.h b/be/src/olap/tablet_manager.h
index aa763502195..e439804adb6 100644
--- a/be/src/olap/tablet_manager.h
+++ b/be/src/olap/tablet_manager.h
@@ -209,6 +209,8 @@ private:
 
     std::shared_mutex& _get_tablets_shard_lock(TTabletId tabletId);
 
+    bool _move_tablet_to_trash(const TabletSharedPtr& tablet);
+
 private:
     DISALLOW_COPY_AND_ASSIGN(TabletManager);
 
@@ -240,11 +242,9 @@ private:
     std::shared_mutex _shutdown_tablets_lock;
     // partition_id => tablet_info
     std::map<int64_t, std::set<TabletInfo>> _partition_tablet_map;
-    std::vector<TabletSharedPtr> _shutdown_tablets;
-
-    // gc thread will move _shutdown_tablets to _shutdown_deleting_tablets
-    std::shared_mutex _shutdown_deleting_tablets_lock;
-    std::list<TabletSharedPtr> _shutdown_deleting_tablets;
+    // the delete tablets. notice only allow function `start_trash_sweep` can 
erase tablets in _shutdown_tablets
+    std::list<TabletSharedPtr> _shutdown_tablets;
+    std::mutex _gc_tablets_lock;
 
     std::mutex _tablet_stat_cache_mutex;
     std::shared_ptr<std::vector<TTabletStat>> _tablet_stat_list_cache =


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to