This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git


The following commit(s) were added to refs/heads/master by this push:
     new b85bb0e  [Bug-Fix] Some deleted tablets are not recycled on BE (#4401)
b85bb0e is described below

commit b85bb0e2e947c891e03fd288758838d9178cc90e
Author: yiguolei <[email protected]>
AuthorDate: Thu Aug 27 12:09:19 2020 +0800

    [Bug-Fix] Some deleted tablets are not recycled on BE (#4401)
---
 be/src/olap/data_dir.cpp               | 61 ++++++++++++++++++++++++++++------
 be/src/olap/data_dir.h                 |  5 +--
 be/src/olap/olap_server.cpp            |  5 ++-
 be/src/olap/tablet_manager.cpp         | 54 ++++++++++++++++++++++++++++++
 be/src/olap/tablet_manager.h           |  9 +++++
 be/src/olap/task/engine_clone_task.cpp |  8 +++++
 be/src/olap/task/engine_clone_task.h   |  2 ++
 7 files changed, 130 insertions(+), 14 deletions(-)

diff --git a/be/src/olap/data_dir.cpp b/be/src/olap/data_dir.cpp
index 524c1cd..863f065 100644
--- a/be/src/olap/data_dir.cpp
+++ b/be/src/olap/data_dir.cpp
@@ -753,6 +753,55 @@ void DataDir::remove_pending_ids(const std::string& id) {
     _pending_path_ids.erase(id);
 }
 
+// try to gc tablet schema hash dirs whose tablet is no longer known to the tablet manager
+void DataDir::perform_path_gc_by_tablet() {
+    std::unique_lock<std::mutex> lck(_check_path_mutex);
+    _cv.wait(lck, [this] { return _stop_bg_worker || 
!_all_tablet_schemahash_paths.empty(); });
+    if (_stop_bg_worker) {
+        return;
+    }
+    LOG(INFO) << "start to path gc by tablet schemahash.";
+    int counter = 0;
+    for (auto& path : _all_tablet_schemahash_paths) {
+        ++counter;
+        if (config::path_gc_check_step > 0 && counter % 
config::path_gc_check_step == 0) {
+            
SleepFor(MonoDelta::FromMilliseconds(config::path_gc_check_step_interval_ms));
+        }
+        TTabletId tablet_id = -1;
+        TSchemaHash schema_hash = -1;
+        bool is_valid = 
_tablet_manager->get_tablet_id_and_schema_hash_from_path(path, &tablet_id,
+                                                                               
  &schema_hash);
+        if (!is_valid) {
+            LOG(WARNING) << "unknown path:" << path;
+            continue;
+        }
+        // should not happen, because the path was already validated as a tablet 
schema hash path in the previous step;
+        // log a warning and skip the path if it does happen
+        if (tablet_id < 1 || schema_hash < 1) {
+            LOG(WARNING) << "invalid tablet id " << tablet_id << " or schema 
hash " << schema_hash
+                << ", path=" << path;
+            continue;
+        }
+        TabletSharedPtr tablet = _tablet_manager->get_tablet(tablet_id, 
schema_hash);
+        if (tablet != nullptr) {
+            // the tablet still exists in the tablet manager, so skip this path
+            continue;
+        }
+        boost::filesystem::path tablet_path(path);
+        boost::filesystem::path data_dir_path = 
+            
tablet_path.parent_path().parent_path().parent_path().parent_path();
+        std::string data_dir_string = data_dir_path.string();
+        DataDir* data_dir = 
StorageEngine::instance()->get_store(data_dir_string);
+        if (data_dir == nullptr) {
+            LOG(WARNING) << "could not find data dir for tablet path " << path;
+            continue;
+        }
+        _tablet_manager->try_delete_unused_tablet_path(data_dir, tablet_id, 
schema_hash, path);
+    }
+    _all_tablet_schemahash_paths.clear();
+    LOG(INFO) << "finished one time path gc by tablet.";
+}
+
 void DataDir::perform_path_gc_by_rowsetid() {
     // init the set of valid path
     // validate the path in data dir
@@ -827,7 +876,6 @@ void DataDir::perform_path_scan() {
             }
             for (const auto& tablet_id : tablet_ids) {
                 std::string tablet_id_path = shard_path + "/" + tablet_id;
-                _all_check_paths.insert(tablet_id_path);
                 std::set<std::string> schema_hashes;
                 ret = FileUtils::list_dirs_files(tablet_id_path, 
&schema_hashes, nullptr,
                                                  Env::Default());
@@ -838,7 +886,7 @@ void DataDir::perform_path_scan() {
                 }
                 for (const auto& schema_hash : schema_hashes) {
                     std::string tablet_schema_hash_path = tablet_id_path + "/" 
+ schema_hash;
-                    _all_check_paths.insert(tablet_schema_hash_path);
+                    
_all_tablet_schemahash_paths.insert(tablet_schema_hash_path);
                     std::set<std::string> rowset_files;
 
                     ret = FileUtils::list_dirs_files(tablet_schema_hash_path, 
nullptr,
@@ -873,15 +921,6 @@ bool DataDir::_check_pending_ids(const std::string& id) {
     return _pending_path_ids.find(id) != _pending_path_ids.end();
 }
 
-void DataDir::_remove_check_paths_no_lock(const std::set<std::string>& paths) {
-    for (const auto& path : paths) {
-        auto path_iter = _all_check_paths.find(path);
-        if (path_iter != _all_check_paths.end()) {
-            _all_check_paths.erase(path_iter);
-        }
-    }
-}
-
 Status DataDir::update_capacity() {
     try {
         boost::filesystem::path path_name(_path);
diff --git a/be/src/olap/data_dir.h b/be/src/olap/data_dir.h
index 5437a48..18cc612 100644
--- a/be/src/olap/data_dir.h
+++ b/be/src/olap/data_dir.h
@@ -106,6 +106,8 @@ public:
 
     void perform_path_gc_by_rowsetid();
 
+    void perform_path_gc_by_tablet();
+
     OLAPStatus remove_old_meta_and_files();
 
     bool convert_old_data_success();
@@ -140,8 +142,6 @@ private:
     OLAPStatus _clean_unfinished_converting_data();
     OLAPStatus _convert_old_tablet();
 
-    void _remove_check_paths_no_lock(const std::set<std::string>& paths);
-
     void _process_garbage_path(const std::string& path);
 
     void _remove_check_paths(const std::set<std::string>& paths);
@@ -184,6 +184,7 @@ private:
     std::mutex _check_path_mutex;
     std::condition_variable _cv;
     std::set<std::string> _all_check_paths;
+    std::set<std::string> _all_tablet_schemahash_paths;
 
     RWMutex _pending_path_mutex;
     std::set<std::string> _pending_path_ids;
diff --git a/be/src/olap/olap_server.cpp b/be/src/olap/olap_server.cpp
index dac000a..8948e7a 100644
--- a/be/src/olap/olap_server.cpp
+++ b/be/src/olap/olap_server.cpp
@@ -371,7 +371,10 @@ void* StorageEngine::_path_gc_thread_callback(void* arg) {
     LOG(INFO) << "try to start path gc thread!";
 
     while (!_stop_bg_worker) {
-        LOG(INFO) << "try to perform path gc!";
+        LOG(INFO) << "try to perform path gc by tablet!";
+        ((DataDir*)arg)->perform_path_gc_by_tablet();
+        
+        LOG(INFO) << "try to perform path gc by rowsetid!";
         // perform path gc by rowset id
         ((DataDir*)arg)->perform_path_gc_by_rowsetid();
 
diff --git a/be/src/olap/tablet_manager.cpp b/be/src/olap/tablet_manager.cpp
index 96af2c3..2290cf2 100644
--- a/be/src/olap/tablet_manager.cpp
+++ b/be/src/olap/tablet_manager.cpp
@@ -635,6 +635,8 @@ TabletSharedPtr TabletManager::get_tablet(TTabletId 
tablet_id, SchemaHash schema
 bool TabletManager::get_tablet_id_and_schema_hash_from_path(
         const string& path, TTabletId* tablet_id, TSchemaHash* schema_hash) {
     static re2::RE2 normal_re("/data/\\d+/(\\d+)/(\\d+)($|/)");
+    // match a tablet schema hash data path; for example, in the path 
/data/1/16791/29998
+    // 1 is the shard id, 16791 is the tablet id and 29998 is the schema hash
     if (RE2::PartialMatch(path, normal_re, tablet_id, schema_hash)) {
         return true;
     }
@@ -812,6 +814,12 @@ OLAPStatus TabletManager::load_tablet_from_meta(DataDir* 
data_dir, TTabletId tab
         return OLAP_ERR_TABLE_CREATE_FROM_HEADER_ERROR;
     }
 
+    // check that the tablet path still exists, since it may have been deleted by 
the gc thread
+    if (!Env::Default()->path_exists(tablet->tablet_path()).ok()) {
+        LOG(WARNING) << "tablet path not exists, create tablet failed, path=" 
<< tablet->tablet_path();
+        return OLAP_ERR_TABLE_ALREADY_DELETED_ERROR;
+    }
+
     if (tablet_meta->tablet_state() == TABLET_SHUTDOWN) {
         LOG(INFO) << "fail to load tablet because it is to be deleted. 
tablet_id=" << tablet_id
                   << " schema_hash=" << schema_hash << ", path=" << 
data_dir->path();
@@ -1086,6 +1094,52 @@ OLAPStatus TabletManager::start_trash_sweep() {
     return OLAP_SUCCESS;
 } // start_trash_sweep
 
+void TabletManager::register_clone_tablet(int64_t tablet_id) {
+    RWMutex& tablet_map_lock = _get_tablet_map_lock(tablet_id);
+    WriteLock wlock(&tablet_map_lock);
+    _tablets_under_clone.insert(tablet_id);
+}
+
+void TabletManager::unregister_clone_tablet(int64_t tablet_id) {
+    RWMutex& tablet_map_lock = _get_tablet_map_lock(tablet_id);
+    WriteLock wlock(&tablet_map_lock);
+    _tablets_under_clone.erase(tablet_id);
+}
+
+void TabletManager::try_delete_unused_tablet_path(DataDir* data_dir, TTabletId 
tablet_id,
+        SchemaHash schema_hash, const string& schema_hash_path) {
+    // acquire the read lock so that no create tablet or load tablet from meta 
task runs at the same time;
+    // create tablet and load tablet tasks should check whether the dir exists
+    RWMutex& tablet_map_lock = _get_tablet_map_lock(tablet_id);
+    ReadLock rlock(&tablet_map_lock);
+
+    // check if meta already exists
+    TabletMetaSharedPtr tablet_meta(new TabletMeta());
+    OLAPStatus check_st = TabletMetaManager::get_meta(data_dir, tablet_id, 
+        schema_hash, tablet_meta);
+    if (check_st == OLAP_SUCCESS) {
+        LOG(INFO) << "tablet meta exist is meta store, skip delete the path " 
<< schema_hash_path;
+        return;
+    }
+
+    if (_tablets_under_clone.count(tablet_id) > 0) {
+        LOG(INFO) << "tablet is under clone, skip delete the path " << 
schema_hash_path;
+        return;
+    }
+
+    // TODO(ygl): may do other checks in the future
+    if (Env::Default()->path_exists(schema_hash_path).ok()) {
+        LOG(INFO) << "start to move tablet to trash. tablet_path = " << 
schema_hash_path;
+        OLAPStatus rm_st = move_to_trash(schema_hash_path, schema_hash_path);
+        if (rm_st != OLAP_SUCCESS) {
+            LOG(WARNING) << "fail to move dir to trash. dir=" << 
schema_hash_path;
+        } else {
+            LOG(INFO) << "move path " << schema_hash_path << " to trash 
successfully";
+        }
+    }
+    return;
+}
+
 bool TabletManager::try_schema_change_lock(TTabletId tablet_id) {
     bool res = false;
     VLOG(3) << "try_schema_change_lock begin. tablet_id=" << tablet_id;
diff --git a/be/src/olap/tablet_manager.h b/be/src/olap/tablet_manager.h
index d556377..ea4de8c 100644
--- a/be/src/olap/tablet_manager.h
+++ b/be/src/olap/tablet_manager.h
@@ -125,6 +125,9 @@ public:
     // Prevent schema change executed concurrently.
     bool try_schema_change_lock(TTabletId tablet_id);
 
+    void try_delete_unused_tablet_path(DataDir* data_dir, TTabletId tablet_id,
+        SchemaHash schema_hash, const string& schema_hash_path);
+
     void update_root_path_info(std::map<std::string, DataDirInfo>* path_map,
                                size_t* tablet_counter);
 
@@ -134,6 +137,9 @@ public:
 
     void  obtain_all_tablets(vector<TabletInfo> &tablets_info);
 
+    void register_clone_tablet(int64_t tablet_id);
+    void unregister_clone_tablet(int64_t tablet_id);
+
 private:
     // Add a tablet pointer to StorageEngine
     // If force, drop the existing tablet add this new one
@@ -221,6 +227,9 @@ private:
     int64_t _last_update_stat_ms;
 
     inline tablet_map_t& _get_tablet_map(TTabletId tablet_id);
+
+    std::set<int64_t> _tablets_under_clone;
+    std::set<int64_t> _tablets_under_restore;
 };
 
 inline RWMutex& TabletManager::_get_tablet_map_lock(TTabletId tabletId) {
diff --git a/be/src/olap/task/engine_clone_task.cpp 
b/be/src/olap/task/engine_clone_task.cpp
index 9f951e3..ba4a6c7 100644
--- a/be/src/olap/task/engine_clone_task.cpp
+++ b/be/src/olap/task/engine_clone_task.cpp
@@ -63,6 +63,14 @@ EngineCloneTask::EngineCloneTask(const TCloneReq& clone_req,
     _master_info(master_info) {}
 
 OLAPStatus EngineCloneTask::execute() {
+    // register the tablet so that it is not deleted by the gc thread during the 
clone process
+    
StorageEngine::instance()->tablet_manager()->register_clone_tablet(_clone_req.tablet_id);
+    OLAPStatus st = _do_clone();
+    
StorageEngine::instance()->tablet_manager()->unregister_clone_tablet(_clone_req.tablet_id);
+    return st;
+}
+
+OLAPStatus EngineCloneTask::_do_clone() {
     AgentStatus status = DORIS_SUCCESS;
     string src_file_path;
     TBackend src_host;
diff --git a/be/src/olap/task/engine_clone_task.h 
b/be/src/olap/task/engine_clone_task.h
index 52d1d1c..57a8e2e 100644
--- a/be/src/olap/task/engine_clone_task.h
+++ b/be/src/olap/task/engine_clone_task.h
@@ -45,6 +45,8 @@ public:
 
 private:
     
+    OLAPStatus _do_clone();
+    
     virtual OLAPStatus _finish_clone(Tablet* tablet, const std::string& 
clone_dir,
                                     int64_t committed_version, bool 
is_incremental_clone);
     


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to