This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git
The following commit(s) were added to refs/heads/master by this push:
new f772649 [Optimize] Optimize lock when check error storage (#6321)
f772649 is described below
commit f772649535de6db51a6fac4d925103402ca5c26f
Author: Lijia Liu <[email protected]>
AuthorDate: Sat Aug 7 21:30:49 2021 +0800
[Optimize] Optimize lock when check error storage (#6321)
1. `StorageEngine::_delete_tablets_on_unused_root_path` will try to obtain
tablet shard write lock in `TabletManager`
```
StorageEngine::_delete_tablets_on_unused_root_path
TabletManager::drop_tablets_on_error_root_path
obtain each tablet shard's write lock
```
2. `TabletManager::build_all_report_tablets_info` and other methods will
obtain tablet shard read lock frequently.
So, `StorageEngine::_delete_tablets_on_unused_root_path` will hold
`_store_lock` for a long time.
This will make it difficult for other threads to get write `_store_lock`,
such as `StorageEngine::get_stores_for_create_tablet`
`drop_tablets_on_error_root_path` is a small probability event,
`TabletManager::drop_tablets_on_error_root_path` should return when its param
`tablet_info_vec` is empty
---
be/src/agent/task_worker_pool.cpp | 12 +++++++++++-
be/src/olap/storage_engine.cpp | 24 +++++++++++++-----------
be/src/olap/tablet_manager.cpp | 18 +++++++++++++-----
be/src/util/doris_metrics.cpp | 2 ++
be/src/util/doris_metrics.h | 1 +
be/test/util/doris_metrics_test.cpp | 6 ++++++
6 files changed, 46 insertions(+), 17 deletions(-)
diff --git a/be/src/agent/task_worker_pool.cpp
b/be/src/agent/task_worker_pool.cpp
index f3c43ae..c9681a9 100644
--- a/be/src/agent/task_worker_pool.cpp
+++ b/be/src/agent/task_worker_pool.cpp
@@ -1185,9 +1185,19 @@ void
TaskWorkerPool::_report_tablet_worker_thread_callback() {
_is_doing_work = true;
request.tablets.clear();
+ uint64_t report_version = _s_report_version;
OLAPStatus build_all_report_tablets_info_status =
StorageEngine::instance()->tablet_manager()->build_all_report_tablets_info(
&request.tablets);
+ if (report_version < _s_report_version) {
+ // TODO llj This can only reduce the possibility for report error,
but can't avoid it.
+ // If FE create a tablet in FE meta and send CREATE task to this
BE, the tablet may not be included in this
+ // report, and the report version has a small probability that it
has not been updated in time. When FE
+ // receives this report, it is possible to delete the new tablet.
+ LOG(WARNING) << "report version " << report_version << " change to
" << _s_report_version;
+
DorisMetrics::instance()->report_all_tablets_requests_skip->increment(1);
+ continue;
+ }
if (build_all_report_tablets_info_status != OLAP_SUCCESS) {
LOG(WARNING) << "build all report tablets info failed. status: "
<< build_all_report_tablets_info_status;
@@ -1197,7 +1207,7 @@ void
TaskWorkerPool::_report_tablet_worker_thread_callback() {
std::max(DorisMetrics::instance()->tablet_cumulative_max_compaction_score->value(),
DorisMetrics::instance()->tablet_base_max_compaction_score->value());
request.__set_tablet_max_compaction_score(max_compaction_score);
- request.__set_report_version(_s_report_version);
+ request.__set_report_version(report_version);
_handle_report(request, ReportType::TABLET);
}
StorageEngine::instance()->deregister_report_listener(this);
diff --git a/be/src/olap/storage_engine.cpp b/be/src/olap/storage_engine.cpp
index 9d265e8..1f5efbf 100644
--- a/be/src/olap/storage_engine.cpp
+++ b/be/src/olap/storage_engine.cpp
@@ -500,19 +500,21 @@ bool StorageEngine::_delete_tablets_on_unused_root_path()
{
uint32_t unused_root_path_num = 0;
uint32_t total_root_path_num = 0;
- // TODO(yingchun): _store_map is only updated in main and ~StorageEngine,
maybe we can remove it?
- std::lock_guard<std::mutex> l(_store_lock);
- if (_store_map.empty()) {
- return false;
- }
+ {
+ // TODO(yingchun): _store_map is only updated in main and
~StorageEngine, maybe we can remove it?
+ std::lock_guard<std::mutex> l(_store_lock);
+ if (_store_map.empty()) {
+ return false;
+ }
- for (auto& it : _store_map) {
- ++total_root_path_num;
- if (it.second->is_used()) {
- continue;
+ for (auto& it : _store_map) {
+ ++total_root_path_num;
+ if (it.second->is_used()) {
+ continue;
+ }
+ it.second->clear_tablets(&tablet_info_vec);
+ ++unused_root_path_num;
}
- it.second->clear_tablets(&tablet_info_vec);
- ++unused_root_path_num;
}
if (too_many_disks_are_failed(unused_root_path_num, total_root_path_num)) {
diff --git a/be/src/olap/tablet_manager.cpp b/be/src/olap/tablet_manager.cpp
index 09a9e87..8d3486d 100644
--- a/be/src/olap/tablet_manager.cpp
+++ b/be/src/olap/tablet_manager.cpp
@@ -562,13 +562,21 @@ OLAPStatus TabletManager::_drop_tablet_unlocked(TTabletId
tablet_id, SchemaHash
OLAPStatus TabletManager::drop_tablets_on_error_root_path(
const std::vector<TabletInfo>& tablet_info_vec) {
OLAPStatus res = OLAP_SUCCESS;
- for (int32 i = 0; i < _tablets_shards_size; i++) {
+ if (tablet_info_vec.empty()) { // This is a high probability event
+ return res;
+ }
+ std::vector<std::set<size_t>> local_tmp_vector(_tablets_shards_size);
+ for (size_t idx = 0; idx < tablet_info_vec.size(); ++idx) {
+ local_tmp_vector[tablet_info_vec[idx].tablet_id &
_tablets_shards_mask].insert(idx);
+ }
+ for (int32 i = 0; i < _tablets_shards_size; ++i) {
+ if (local_tmp_vector[i].empty()) {
+ continue;
+ }
WriteLock wlock(_tablets_shards[i].lock.get());
- for (const TabletInfo& tablet_info : tablet_info_vec) {
+ for (size_t idx : local_tmp_vector[i]) {
+ const TabletInfo& tablet_info = tablet_info_vec[idx];
TTabletId tablet_id = tablet_info.tablet_id;
- if ((tablet_id & _tablets_shards_mask) != i) {
- continue;
- }
TSchemaHash schema_hash = tablet_info.schema_hash;
VLOG_NOTICE << "drop_tablet begin. tablet_id=" << tablet_id
<< ", schema_hash=" << schema_hash;
diff --git a/be/src/util/doris_metrics.cpp b/be/src/util/doris_metrics.cpp
index a1d6763..0c93390 100644
--- a/be/src/util/doris_metrics.cpp
+++ b/be/src/util/doris_metrics.cpp
@@ -52,6 +52,7 @@ DEFINE_ENGINE_COUNTER_METRIC(create_tablet_requests_failed,
create_tablet, faile
DEFINE_ENGINE_COUNTER_METRIC(drop_tablet_requests_total, drop_tablet, total);
DEFINE_ENGINE_COUNTER_METRIC(report_all_tablets_requests_total,
report_all_tablets, total);
DEFINE_ENGINE_COUNTER_METRIC(report_all_tablets_requests_failed,
report_all_tablets, failed);
+DEFINE_ENGINE_COUNTER_METRIC(report_all_tablets_requests_skip,
report_all_tablets, skip)
DEFINE_ENGINE_COUNTER_METRIC(report_tablet_requests_total, report_tablet,
total);
DEFINE_ENGINE_COUNTER_METRIC(report_tablet_requests_failed, report_tablet,
failed);
DEFINE_ENGINE_COUNTER_METRIC(report_disk_requests_total, report_disk, total);
@@ -191,6 +192,7 @@ DorisMetrics::DorisMetrics() :
_metric_registry(_s_registry_name) {
INT_COUNTER_METRIC_REGISTER(_server_metric_entity,
drop_tablet_requests_total);
INT_COUNTER_METRIC_REGISTER(_server_metric_entity,
report_all_tablets_requests_total);
INT_COUNTER_METRIC_REGISTER(_server_metric_entity,
report_all_tablets_requests_failed);
+ INT_COUNTER_METRIC_REGISTER(_server_metric_entity,
report_all_tablets_requests_skip);
INT_COUNTER_METRIC_REGISTER(_server_metric_entity,
report_tablet_requests_total);
INT_COUNTER_METRIC_REGISTER(_server_metric_entity,
report_tablet_requests_failed);
INT_COUNTER_METRIC_REGISTER(_server_metric_entity,
report_disk_requests_total);
diff --git a/be/src/util/doris_metrics.h b/be/src/util/doris_metrics.h
index f0e711d..f363c73 100644
--- a/be/src/util/doris_metrics.h
+++ b/be/src/util/doris_metrics.h
@@ -64,6 +64,7 @@ public:
IntCounter* report_all_tablets_requests_failed;
IntCounter* report_tablet_requests_total;
IntCounter* report_tablet_requests_failed;
+ IntCounter* report_all_tablets_requests_skip;
IntCounter* report_disk_requests_total;
IntCounter* report_disk_requests_failed;
IntCounter* report_task_requests_total;
diff --git a/be/test/util/doris_metrics_test.cpp
b/be/test/util/doris_metrics_test.cpp
index 81d3c7e..4305877 100644
--- a/be/test/util/doris_metrics_test.cpp
+++ b/be/test/util/doris_metrics_test.cpp
@@ -123,6 +123,12 @@ TEST_F(DorisMetricsTest, Normal) {
ASSERT_STREQ("17", metric->to_string().c_str());
}
{
+
DorisMetrics::instance()->report_all_tablets_requests_skip->increment(1);
+ auto metric =
server_entity->get_metric("report_all_tablets_requests_skip",
"engine_requests_total");
+ ASSERT_TRUE(metric != nullptr);
+ ASSERT_STREQ("1", metric->to_string().c_str());
+ }
+ {
DorisMetrics::instance()->report_tablet_requests_total->increment(18);
auto metric =
server_entity->get_metric("report_tablet_requests_total",
"engine_requests_total");
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]