This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-1.1-lts
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-1.1-lts by this push:
new 0f808ede72 [improvement](publish) fix publish timeout in concurrent
load (#14646)
0f808ede72 is described below
commit 0f808ede729e9d66538568826350d1ccdfd636f0
Author: yixiutt <[email protected]>
AuthorDate: Mon Nov 28 19:22:57 2022 +0800
[improvement](publish) fix publish timeout in concurrent load (#14646)
In concurrent load, publish timeouts happen occasionally. This is
caused by the meta lock being held by another thread, so the publish step
that adds the incremental rowset hangs for several seconds.
StorageEngine::start_delete_unused_rowset holds gc_mutex for a long time,
so add_unused_rowset has to wait for the lock. Compaction's
modify_rowset
or other tablet methods hold meta_lock while calling add_unused_rowset, which
keeps meta_lock occupied for too long and finally makes publish time out.
In this PR, I copy the unused rowsets while holding the lock and delete them
without the lock,
which makes gc_mutex more lightweight so that the meta lock can be acquired
immediately in the publish thread.
My test shows no publish timeouts in concurrent stream load.
---
be/src/common/config.h | 2 +-
be/src/olap/storage_engine.cpp | 32 ++++++++++++++++++++------------
2 files changed, 21 insertions(+), 13 deletions(-)
diff --git a/be/src/common/config.h b/be/src/common/config.h
index a86afb8c27..68c5e6cd7e 100644
--- a/be/src/common/config.h
+++ b/be/src/common/config.h
@@ -193,7 +193,7 @@ CONF_mInt32(default_num_rows_per_column_file_block, "1024");
// pending data policy
CONF_mInt32(pending_data_expire_time_sec, "1800");
// inc_rowset snapshot rs sweep time interval
-CONF_mInt32(tablet_rowset_stale_sweep_time_sec, "1800");
+CONF_mInt32(tablet_rowset_stale_sweep_time_sec, "300");
// garbage sweep policy
CONF_Int32(max_garbage_sweep_interval, "3600");
CONF_Int32(min_garbage_sweep_interval, "180");
diff --git a/be/src/olap/storage_engine.cpp b/be/src/olap/storage_engine.cpp
index 665a394ede..deaa562a4d 100644
--- a/be/src/olap/storage_engine.cpp
+++ b/be/src/olap/storage_engine.cpp
@@ -883,20 +883,28 @@ void StorageEngine::_parse_default_rowset_type() {
}
void StorageEngine::start_delete_unused_rowset() {
- MutexLock lock(&_gc_mutex);
- for (auto it = _unused_rowsets.begin(); it != _unused_rowsets.end();) {
- if (it->second.use_count() != 1) {
- ++it;
- } else if (it->second->need_delete_file()) {
- VLOG_NOTICE << "start to remove rowset:" << it->second->rowset_id()
- << ", version:" << it->second->version().first << "-"
- << it->second->version().second;
- OLAPStatus status = it->second->remove();
- VLOG_NOTICE << "remove rowset:" << it->second->rowset_id()
- << " finished. status:" << status;
- it = _unused_rowsets.erase(it);
+ std::unordered_map<std::string, RowsetSharedPtr> unused_rowsets_copy;
+ {
+ MutexLock lock(&_gc_mutex);
+ for (auto it = _unused_rowsets.begin(); it != _unused_rowsets.end();) {
+ if (it->second.use_count() == 1 && it->second->need_delete_file())
{
+ unused_rowsets_copy[it->first] = it->second;
+ it = _unused_rowsets.erase(it);
+ } else {
+ ++it;
+ }
}
}
+ for (auto it = unused_rowsets_copy.begin(); it !=
unused_rowsets_copy.end(); ++it) {
+ // FIXME(cyx): Currently remote unused rowsets are generated by
compaction gc,
+ // we cannot remove them directly as other BE may need them.
+ VLOG_NOTICE << "start to remove rowset:" << it->second->rowset_id()
+ << ", version:" << it->second->version().first << "-"
+ << it->second->version().second;
+ OLAPStatus status = it->second->remove();
+ VLOG_NOTICE << "remove rowset:" << it->second->rowset_id()
+ << " finished. status:" << status;
+ }
}
void StorageEngine::add_unused_rowset(RowsetSharedPtr rowset) {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]