This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch dev-1.0.0 in repository https://gitbox.apache.org/repos/asf/incubator-doris.git
commit 1681e4f104b599b432e41621d137ac0bcb1d2577 Author: Mingyu Chen <[email protected]> AuthorDate: Fri Mar 11 17:24:20 2022 +0800 [fix](report) fix bug that tablet may already be delete when reporting (#8444) 1. This bug was introduced by #8209. Error in fe.warn.log: ``` java.lang.IllegalStateException: 560278 at com.google.common.base.Preconditions.checkState(Preconditions.java:508) ~[spark-dpp-0.15-SNAPSHOT.jar:0.15-SNAPSHOT] at org.apache.doris.catalog.TabletInvertedIndex.getReplica(TabletInvertedIndex.java:462) ~[palo-fe.jar:0.15-SNAPSHOT] at org.apache.doris.catalog.Catalog.replayBackendReplicasInfo(Catalog.java:6941) ~[palo-fe.jar:0.15-SNAPSHOT] at org.apache.doris.persist.EditLog.loadJournal(EditLog.java:626) [palo-fe.jar:0.15-SNAPSHOT] at org.apache.doris.catalog.Catalog.replayJournal(Catalog.java:2446) [palo-fe.jar:0.15-SNAPSHOT] at org.apache.doris.master.Checkpoint.doCheckpoint(Checkpoint.java:116) [palo-fe.jar:0.15-SNAPSHOT] at org.apache.doris.master.Checkpoint.runAfterCatalogReady(Checkpoint.java:74) [palo-fe.jar:0.15-SNAPSHOT] at org.apache.doris.common.util.MasterDaemon.runOneCycle(MasterDaemon.java:58) [palo-fe.jar:0.15-SNAPSHOT] at org.apache.doris.common.util.Daemon.run(Daemon.java:116) [palo-fe.jar:0.15-SNAPSHOT] ``` Since the reporting of a tablet and the deletion of a tablet are two independent events and are not mutually exclusive, it may happen that the tablet is deleted first and the report is processed later. 2. Change the tablet report info. The version that BE now reports for a tablet is its largest continuous version. E.g., for a tablet with versions [1,2,3,5,7], the reported version will be 3. 
--- be/src/olap/tablet.cpp | 27 ++++++++++++++-------- be/src/olap/tablet.h | 6 +++-- .../java/org/apache/doris/catalog/Catalog.java | 8 +++++++ 3 files changed, 30 insertions(+), 11 deletions(-) diff --git a/be/src/olap/tablet.cpp b/be/src/olap/tablet.cpp index a6f5648..862dccb 100644 --- a/be/src/olap/tablet.cpp +++ b/be/src/olap/tablet.cpp @@ -807,12 +807,12 @@ void Tablet::calc_missed_versions_unlocked(int64_t spec_version, } } -void Tablet::max_continuous_version_from_beginning(Version* version) { +void Tablet::max_continuous_version_from_beginning(Version* version, Version* max_version) { ReadLock rdlock(&_meta_lock); - _max_continuous_version_from_beginning_unlocked(version); + _max_continuous_version_from_beginning_unlocked(version, max_version); } -void Tablet::_max_continuous_version_from_beginning_unlocked(Version* version) const { +void Tablet::_max_continuous_version_from_beginning_unlocked(Version* version, Version* max_version) const { std::vector<Version> existing_versions; for (auto& rs : _tablet_meta->all_rs_metas()) { existing_versions.emplace_back(rs->version()); @@ -833,6 +833,9 @@ void Tablet::_max_continuous_version_from_beginning_unlocked(Version* version) c max_continuous_version = existing_versions[i]; } *version = max_continuous_version; + if (max_version != nullptr) { + *max_version = existing_versions.back(); + } } void Tablet::calculate_cumulative_point() { @@ -1247,9 +1250,15 @@ void Tablet::build_tablet_report_info(TTabletInfo* tablet_info) { tablet_info->row_count = _tablet_meta->num_rows(); tablet_info->data_size = _tablet_meta->tablet_footprint(); - tablet_info->__set_version_miss(false); - auto max_rowset = rowset_with_max_version(); - if (max_rowset == nullptr) { + // Here we need to report to FE if there are any missing versions of tablet. + // We start from the initial version and traverse backwards until we meet a discontinuous version. 
+ Version cversion; + Version max_version; + _max_continuous_version_from_beginning_unlocked(&cversion, &max_version); + tablet_info->__set_version_miss(cversion.second < max_version.second); + // find rowset with max version + auto iter = _rs_version_map.find(max_version); + if (iter == _rs_version_map.end()) { // If the tablet is in running state, it must not be doing schema-change. so if we can not // access its rowsets, it means that the tablet is bad and needs to be reported to the FE // for subsequent repairs (through the cloning task) @@ -1260,10 +1269,10 @@ void Tablet::build_tablet_report_info(TTabletInfo* tablet_info) { // still sets the state to normal when reporting. Note that every task has an timeout, // so if the task corresponding to this change hangs, when the task timeout, FE will know // and perform state modification operations. - } else { - tablet_info->__set_version_miss(check_version_integrity({0, max_rowset->version().second}, true)); } - tablet_info->version = max_rowset->version().second; + + // the report version is the largest continuous version, same logic as in FE side + tablet_info->version = cversion.second; // Useless but it is a required filed in TTabletInfo tablet_info->version_hash = 0; tablet_info->__set_partition_id(_tablet_meta->partition_id()); diff --git a/be/src/olap/tablet.h b/be/src/olap/tablet.h index b11fedf..bf17327 100644 --- a/be/src/olap/tablet.h +++ b/be/src/olap/tablet.h @@ -176,7 +176,8 @@ public: // This function to find max continuous version from the beginning. // For example: If there are 1, 2, 3, 5, 6, 7 versions belongs tablet, then 3 is target. 
- void max_continuous_version_from_beginning(Version* version); + // 3 will be saved in "version", and 7 will be saved in "max_version", if max_version != nullptr + void max_continuous_version_from_beginning(Version* version, Version* max_version = nullptr); // operation for query OLAPStatus split_range(const OlapTuple& start_key_strings, const OlapTuple& end_key_strings, @@ -272,7 +273,8 @@ private: // Returns: // version: the max continuous version from beginning - void _max_continuous_version_from_beginning_unlocked(Version* version) const; + // max_version: the max version of this tablet + void _max_continuous_version_from_beginning_unlocked(Version* version, Version* max_version) const; RowsetSharedPtr _rowset_with_largest_size(); /// Delete stale rowset by version. This method not only delete the version in expired rowset map, /// but also delete the version in rowset meta vector. diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Catalog.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/Catalog.java index e2bf25e..c92c579 100755 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Catalog.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Catalog.java @@ -7001,6 +7001,14 @@ public class Catalog { List<BackendReplicasInfo.ReplicaReportInfo> replicaInfos = backendReplicasInfo.getReplicaReportInfos(); for (BackendReplicasInfo.ReplicaReportInfo info : replicaInfos) { + if (tabletInvertedIndex.getTabletMeta(info.tabletId) == null) { + // The tablet has been deleted. Because the reporting of tablet and + // the deletion of tablet are two independent events, + // and directly do not do mutually exclusive processing, + // so it may appear that the tablet is deleted first, and the reporting information is processed later. + // Here we simply ignore the deleted tablet. 
+ continue; + } Replica replica = tabletInvertedIndex.getReplica(info.tabletId, backendId); if (replica == null) { LOG.warn("failed to find replica of tablet {} on backend {} when replaying backend report info", --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
