This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git


The following commit(s) were added to refs/heads/master by this push:
     new f772649  [Optimize] Optimize lock when check error storage (#6321)
f772649 is described below

commit f772649535de6db51a6fac4d925103402ca5c26f
Author: Lijia Liu <[email protected]>
AuthorDate: Sat Aug 7 21:30:49 2021 +0800

    [Optimize] Optimize lock when check error storage (#6321)
    
    1. `StorageEngine::_delete_tablets_on_unused_root_path` will try to obtain 
tablet shard write lock in `TabletManager`
    ```
    StorageEngine::_delete_tablets_on_unused_root_path
      TabletManager::drop_tablets_on_error_root_path
        obtain each tablet shard's write lock
    ```
    2. `TabletManager::build_all_report_tablets_info` and other methods will 
obtain tablet shard read lock frequently.
    
    So, `StorageEngine::_delete_tablets_on_unused_root_path` will hold 
`_store_lock` for a long time.
    This will make it difficult for other threads to get write `_store_lock`, 
such as `StorageEngine::get_stores_for_create_tablet`
    
    `drop_tablets_on_error_root_path` is a small probability event, 
`TabletManager::drop_tablets_on_error_root_path` should return when its param 
`tablet_info_vec` is empty
---
 be/src/agent/task_worker_pool.cpp   | 12 +++++++++++-
 be/src/olap/storage_engine.cpp      | 24 +++++++++++++-----------
 be/src/olap/tablet_manager.cpp      | 18 +++++++++++++-----
 be/src/util/doris_metrics.cpp       |  2 ++
 be/src/util/doris_metrics.h         |  1 +
 be/test/util/doris_metrics_test.cpp |  6 ++++++
 6 files changed, 46 insertions(+), 17 deletions(-)

diff --git a/be/src/agent/task_worker_pool.cpp 
b/be/src/agent/task_worker_pool.cpp
index f3c43ae..c9681a9 100644
--- a/be/src/agent/task_worker_pool.cpp
+++ b/be/src/agent/task_worker_pool.cpp
@@ -1185,9 +1185,19 @@ void 
TaskWorkerPool::_report_tablet_worker_thread_callback() {
 
         _is_doing_work = true;
         request.tablets.clear();
+        uint64_t report_version = _s_report_version;
         OLAPStatus build_all_report_tablets_info_status =
                 
StorageEngine::instance()->tablet_manager()->build_all_report_tablets_info(
                         &request.tablets);
+        if (report_version < _s_report_version) {
+            // TODO llj This can only reduce the possibility for report error, 
but can't avoid it.
+            // If FE create a tablet in FE meta and send CREATE task to this 
BE, the tablet may not be included in this
+            // report, and the report version has a small probability that it 
has not been updated in time. When FE
+            // receives this report, it is possible to delete the new tablet.
+            LOG(WARNING) << "report version " << report_version << " change to 
" << _s_report_version;
+            
DorisMetrics::instance()->report_all_tablets_requests_skip->increment(1);
+            continue;
+        }
         if (build_all_report_tablets_info_status != OLAP_SUCCESS) {
             LOG(WARNING) << "build all report tablets info failed. status: "
                          << build_all_report_tablets_info_status;
@@ -1197,7 +1207,7 @@ void 
TaskWorkerPool::_report_tablet_worker_thread_callback() {
                 
std::max(DorisMetrics::instance()->tablet_cumulative_max_compaction_score->value(),
                          
DorisMetrics::instance()->tablet_base_max_compaction_score->value());
         request.__set_tablet_max_compaction_score(max_compaction_score);
-        request.__set_report_version(_s_report_version);
+        request.__set_report_version(report_version);
         _handle_report(request, ReportType::TABLET);
     }
     StorageEngine::instance()->deregister_report_listener(this);
diff --git a/be/src/olap/storage_engine.cpp b/be/src/olap/storage_engine.cpp
index 9d265e8..1f5efbf 100644
--- a/be/src/olap/storage_engine.cpp
+++ b/be/src/olap/storage_engine.cpp
@@ -500,19 +500,21 @@ bool StorageEngine::_delete_tablets_on_unused_root_path() 
{
     uint32_t unused_root_path_num = 0;
     uint32_t total_root_path_num = 0;
 
-    // TODO(yingchun): _store_map is only updated in main and ~StorageEngine, 
maybe we can remove it?
-    std::lock_guard<std::mutex> l(_store_lock);
-    if (_store_map.empty()) {
-        return false;
-    }
+    {
+        // TODO(yingchun): _store_map is only updated in main and 
~StorageEngine, maybe we can remove it?
+        std::lock_guard<std::mutex> l(_store_lock);
+        if (_store_map.empty()) {
+            return false;
+        }
 
-    for (auto& it : _store_map) {
-        ++total_root_path_num;
-        if (it.second->is_used()) {
-            continue;
+        for (auto& it : _store_map) {
+            ++total_root_path_num;
+            if (it.second->is_used()) {
+                continue;
+            }
+            it.second->clear_tablets(&tablet_info_vec);
+            ++unused_root_path_num;
         }
-        it.second->clear_tablets(&tablet_info_vec);
-        ++unused_root_path_num;
     }
 
     if (too_many_disks_are_failed(unused_root_path_num, total_root_path_num)) {
diff --git a/be/src/olap/tablet_manager.cpp b/be/src/olap/tablet_manager.cpp
index 09a9e87..8d3486d 100644
--- a/be/src/olap/tablet_manager.cpp
+++ b/be/src/olap/tablet_manager.cpp
@@ -562,13 +562,21 @@ OLAPStatus TabletManager::_drop_tablet_unlocked(TTabletId 
tablet_id, SchemaHash
 OLAPStatus TabletManager::drop_tablets_on_error_root_path(
         const std::vector<TabletInfo>& tablet_info_vec) {
     OLAPStatus res = OLAP_SUCCESS;
-    for (int32 i = 0; i < _tablets_shards_size; i++) {
+    if (tablet_info_vec.empty()) { // This is a high probability event
+        return res;
+    }
+    std::vector<std::set<size_t>> local_tmp_vector(_tablets_shards_size);
+    for (size_t idx = 0; idx < tablet_info_vec.size(); ++idx) {
+        local_tmp_vector[tablet_info_vec[idx].tablet_id & 
_tablets_shards_mask].insert(idx);
+    }
+    for (int32 i = 0; i < _tablets_shards_size; ++i) {
+        if (local_tmp_vector[i].empty()) {
+            continue;
+        }
         WriteLock wlock(_tablets_shards[i].lock.get());
-        for (const TabletInfo& tablet_info : tablet_info_vec) {
+        for (size_t idx : local_tmp_vector[i]) {
+            const TabletInfo& tablet_info = tablet_info_vec[idx];
             TTabletId tablet_id = tablet_info.tablet_id;
-            if ((tablet_id & _tablets_shards_mask) != i) {
-                continue;
-            }
             TSchemaHash schema_hash = tablet_info.schema_hash;
             VLOG_NOTICE << "drop_tablet begin. tablet_id=" << tablet_id
                         << ", schema_hash=" << schema_hash;
diff --git a/be/src/util/doris_metrics.cpp b/be/src/util/doris_metrics.cpp
index a1d6763..0c93390 100644
--- a/be/src/util/doris_metrics.cpp
+++ b/be/src/util/doris_metrics.cpp
@@ -52,6 +52,7 @@ DEFINE_ENGINE_COUNTER_METRIC(create_tablet_requests_failed, 
create_tablet, faile
 DEFINE_ENGINE_COUNTER_METRIC(drop_tablet_requests_total, drop_tablet, total);
 DEFINE_ENGINE_COUNTER_METRIC(report_all_tablets_requests_total, 
report_all_tablets, total);
 DEFINE_ENGINE_COUNTER_METRIC(report_all_tablets_requests_failed, 
report_all_tablets, failed);
+DEFINE_ENGINE_COUNTER_METRIC(report_all_tablets_requests_skip, 
report_all_tablets, skip)
 DEFINE_ENGINE_COUNTER_METRIC(report_tablet_requests_total, report_tablet, 
total);
 DEFINE_ENGINE_COUNTER_METRIC(report_tablet_requests_failed, report_tablet, 
failed);
 DEFINE_ENGINE_COUNTER_METRIC(report_disk_requests_total, report_disk, total);
@@ -191,6 +192,7 @@ DorisMetrics::DorisMetrics() : 
_metric_registry(_s_registry_name) {
     INT_COUNTER_METRIC_REGISTER(_server_metric_entity, 
drop_tablet_requests_total);
     INT_COUNTER_METRIC_REGISTER(_server_metric_entity, 
report_all_tablets_requests_total);
     INT_COUNTER_METRIC_REGISTER(_server_metric_entity, 
report_all_tablets_requests_failed);
+    INT_COUNTER_METRIC_REGISTER(_server_metric_entity, 
report_all_tablets_requests_skip);
     INT_COUNTER_METRIC_REGISTER(_server_metric_entity, 
report_tablet_requests_total);
     INT_COUNTER_METRIC_REGISTER(_server_metric_entity, 
report_tablet_requests_failed);
     INT_COUNTER_METRIC_REGISTER(_server_metric_entity, 
report_disk_requests_total);
diff --git a/be/src/util/doris_metrics.h b/be/src/util/doris_metrics.h
index f0e711d..f363c73 100644
--- a/be/src/util/doris_metrics.h
+++ b/be/src/util/doris_metrics.h
@@ -64,6 +64,7 @@ public:
     IntCounter* report_all_tablets_requests_failed;
     IntCounter* report_tablet_requests_total;
     IntCounter* report_tablet_requests_failed;
+    IntCounter* report_all_tablets_requests_skip;
     IntCounter* report_disk_requests_total;
     IntCounter* report_disk_requests_failed;
     IntCounter* report_task_requests_total;
diff --git a/be/test/util/doris_metrics_test.cpp 
b/be/test/util/doris_metrics_test.cpp
index 81d3c7e..4305877 100644
--- a/be/test/util/doris_metrics_test.cpp
+++ b/be/test/util/doris_metrics_test.cpp
@@ -123,6 +123,12 @@ TEST_F(DorisMetricsTest, Normal) {
         ASSERT_STREQ("17", metric->to_string().c_str());
     }
     {
+        
DorisMetrics::instance()->report_all_tablets_requests_skip->increment(1);
+        auto metric = 
server_entity->get_metric("report_all_tablets_requests_skip", 
"engine_requests_total");
+        ASSERT_TRUE(metric != nullptr);
+        ASSERT_STREQ("1", metric->to_string().c_str());
+    }
+    {
         DorisMetrics::instance()->report_tablet_requests_total->increment(18);
         auto metric =
                 server_entity->get_metric("report_tablet_requests_total", 
"engine_requests_total");

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to