This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-1.2-lts
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-1.2-lts by this push:
new ad939722af Revert "[fix](replica) Fix inconsistent replica id between
BE and FE in corner case of tablet rebalance (#16889)" (#17386)
ad939722af is described below
commit ad939722af2bbd31b28bb32ebefc5786c02eaeb7
Author: plat1ko <[email protected]>
AuthorDate: Fri Mar 3 17:16:24 2023 +0800
Revert "[fix](replica) Fix inconsistent replica id between BE and FE in
corner case of tablet rebalance (#16889)" (#17386)
This reverts commit 783c7d38658ad5169868a2d8028b23f2f73da07a.
---
be/src/olap/tablet_manager.cpp | 14 +++++++++-----
be/src/olap/tablet_manager.h | 3 +--
be/src/olap/task/engine_clone_task.cpp | 17 ++++-------------
.../java/org/apache/doris/master/ReportHandler.java | 9 ++-------
4 files changed, 16 insertions(+), 27 deletions(-)
diff --git a/be/src/olap/tablet_manager.cpp b/be/src/olap/tablet_manager.cpp
index 36b0eb64a9..9d111dd697 100644
--- a/be/src/olap/tablet_manager.cpp
+++ b/be/src/olap/tablet_manager.cpp
@@ -436,7 +436,8 @@ Status TabletManager::drop_tablet(TTabletId tablet_id,
TReplicaId replica_id,
auto& shard = _get_tablets_shard(tablet_id);
std::lock_guard wrlock(shard.lock);
if (shard.tablets_under_clone.count(tablet_id) > 0) {
- return Status::Aborted("tablet {} is under clone, skip drop task", tablet_id);
+ LOG(INFO) << "tablet " << tablet_id << " is under clone, skip drop task";
+ return Status::Aborted("aborted");
}
SCOPED_CONSUME_MEM_TRACKER(_mem_tracker);
return _drop_tablet_unlocked(tablet_id, replica_id, false,
is_drop_table_or_partition);
@@ -458,9 +459,12 @@ Status TabletManager::_drop_tablet_unlocked(TTabletId
tablet_id, TReplicaId repl
// We should compare replica id to avoid dropping new cloned tablet.
// Iff request replica id is 0, FE may be an older release, then we drop
this tablet as before.
if (to_drop_tablet->replica_id() != replica_id && replica_id != 0) {
- return Status::Aborted("replica_id not match({} vs {})", to_drop_tablet->replica_id(),
- replica_id);
+ LOG(WARNING) << "fail to drop tablet because replica id not match. "
+ << "tablet_id=" << tablet_id << ", replica_id=" << to_drop_tablet->replica_id()
+ << ", request replica_id=" << replica_id;
+ return Status::Aborted("aborted");
}
+
_remove_tablet_from_partition(to_drop_tablet);
tablet_map_t& tablet_map = _get_tablet_map(tablet_id);
tablet_map.erase(tablet_id);
@@ -1039,10 +1043,10 @@ Status TabletManager::start_trash_sweep() {
return Status::OK();
} // start_trash_sweep
-bool TabletManager::register_clone_tablet(int64_t tablet_id) {
+void TabletManager::register_clone_tablet(int64_t tablet_id) {
tablets_shard& shard = _get_tablets_shard(tablet_id);
std::lock_guard<std::shared_mutex> wrlock(shard.lock);
- return shard.tablets_under_clone.insert(tablet_id).second;
+ shard.tablets_under_clone.insert(tablet_id);
}
void TabletManager::unregister_clone_tablet(int64_t tablet_id) {
diff --git a/be/src/olap/tablet_manager.h b/be/src/olap/tablet_manager.h
index b01a5141d5..bbb298cae2 100644
--- a/be/src/olap/tablet_manager.h
+++ b/be/src/olap/tablet_manager.h
@@ -133,8 +133,7 @@ public:
void obtain_specific_quantity_tablets(std::vector<TabletInfo>&
tablets_info, int64_t num);
- // return `true` if register success
- bool register_clone_tablet(int64_t tablet_id);
+ void register_clone_tablet(int64_t tablet_id);
void unregister_clone_tablet(int64_t tablet_id);
void get_tablets_distribution_on_different_disks(
diff --git a/be/src/olap/task/engine_clone_task.cpp
b/be/src/olap/task/engine_clone_task.cpp
index 7550366412..4bb86501b0 100644
--- a/be/src/olap/task/engine_clone_task.cpp
+++ b/be/src/olap/task/engine_clone_task.cpp
@@ -63,9 +63,7 @@ EngineCloneTask::EngineCloneTask(const TCloneReq& clone_req,
const TMasterInfo&
Status EngineCloneTask::execute() {
// register the tablet to avoid it is deleted by gc thread during clone
process
SCOPED_ATTACH_TASK(_mem_tracker);
- if (StorageEngine::instance()->tablet_manager()->register_clone_tablet(_clone_req.tablet_id)) {
- return Status::InternalError("tablet {} is under clone",
_clone_req.tablet_id);
- }
+ StorageEngine::instance()->tablet_manager()->register_clone_tablet(_clone_req.tablet_id);
Status st = _do_clone();
StorageEngine::instance()->tablet_manager()->unregister_clone_tablet(_clone_req.tablet_id);
return st;
@@ -83,13 +81,6 @@ Status EngineCloneTask::_do_clone() {
std::vector<Version> missed_versions;
// try to repair a tablet with missing version
if (tablet != nullptr) {
- if (tablet->replica_id() != _clone_req.replica_id) {
- // `tablet` may be a dropped replica in FE, e.g: BE1 migrates
replica of tablet_1 to BE2,
- // but before BE1 drop this replica, another new replica of
tablet_1 is migrated to BE1.
- // If we allow to clone success on dropped replica, replica id may
never be consistent between FE and BE.
- return Status::InternalError("replica_id not match({} vs {})",
tablet->replica_id(),
- _clone_req.replica_id);
- }
std::shared_lock migration_rlock(tablet->get_migration_lock(),
std::try_to_lock);
if (!migration_rlock.owns_lock()) {
return Status::OLAPInternalError(OLAP_ERR_RWLOCK_ERROR);
@@ -105,7 +96,7 @@ Status EngineCloneTask::_do_clone() {
// completed. Or remote be will just return header not the rowset
files. clone will failed.
if (missed_versions.empty()) {
LOG(INFO) << "missed version size = 0, skip clone and return
success. tablet_id="
- << _clone_req.tablet_id << " replica_id=" <<
_clone_req.replica_id;
+ << _clone_req.tablet_id;
_set_tablet_info(is_new_tablet);
return Status::OK();
}
@@ -113,8 +104,7 @@ Status EngineCloneTask::_do_clone() {
LOG(INFO) << "clone to existed tablet. missed_versions_size=" <<
missed_versions.size()
<< ", allow_incremental_clone=" << allow_incremental_clone
<< ", signature=" << _signature << ", tablet_id=" <<
_clone_req.tablet_id
- << ", committed_version=" << _clone_req.committed_version
- << ", replica_id=" << _clone_req.replica_id;
+ << ", committed_version=" << _clone_req.committed_version;
// try to download missing version from src backend.
// if tablet on src backend does not contains missing version, it will
download all versions,
@@ -122,6 +112,7 @@ Status EngineCloneTask::_do_clone() {
RETURN_IF_ERROR(_make_and_download_snapshots(*(tablet->data_dir()),
local_data_path,
&src_host,
&src_file_path, missed_versions,
&allow_incremental_clone));
+
RETURN_IF_ERROR(_finish_clone(tablet.get(), local_data_path,
_clone_req.committed_version,
allow_incremental_clone));
} else {
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/master/ReportHandler.java
b/fe/fe-core/src/main/java/org/apache/doris/master/ReportHandler.java
index 6741e3bfcf..a38b91ff78 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/master/ReportHandler.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/master/ReportHandler.java
@@ -921,12 +921,6 @@ public class ReportHandler extends Daemon {
throw new MetaNotFoundException("tablet[" + tabletId + "] does
not exist");
}
- // check replica id
- long replicaId = backendTabletInfo.getReplicaId();
- if (replicaId <= 0) {
- throw new MetaNotFoundException("replica id is invalid, tablet
id: " + tabletId);
- }
-
long visibleVersion = partition.getVisibleVersion();
// check replica version
@@ -970,7 +964,8 @@ public class ReportHandler extends Daemon {
} else if (version < partition.getCommittedVersion()) {
lastFailedVersion = partition.getCommittedVersion();
}
- // use replicaId reported by BE to maintain replica meta
consistent between FE and BE
+
+ long replicaId = Env.getCurrentEnv().getNextId();
Replica replica = new Replica(replicaId, backendId, version,
schemaHash,
dataSize, remoteDataSize, rowCount,
ReplicaState.NORMAL,
lastFailedVersion, version);
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]