This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch dev-1.0.0
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git

commit 1681e4f104b599b432e41621d137ac0bcb1d2577
Author: Mingyu Chen <[email protected]>
AuthorDate: Fri Mar 11 17:24:20 2022 +0800

    [fix](report) fix bug that tablet may already be deleted when reporting 
(#8444)
    
    1.
    This bug was introduced by #8209.
    Error in fe.warn.log:
    ```
    java.lang.IllegalStateException: 560278
            at 
com.google.common.base.Preconditions.checkState(Preconditions.java:508) 
~[spark-dpp-0.15-SNAPSHOT.jar:0.15-SNAPSHOT]
            at 
org.apache.doris.catalog.TabletInvertedIndex.getReplica(TabletInvertedIndex.java:462)
 ~[palo-fe.jar:0.15-SNAPSHOT]
            at 
org.apache.doris.catalog.Catalog.replayBackendReplicasInfo(Catalog.java:6941) 
~[palo-fe.jar:0.15-SNAPSHOT]
            at org.apache.doris.persist.EditLog.loadJournal(EditLog.java:626) 
[palo-fe.jar:0.15-SNAPSHOT]
            at 
org.apache.doris.catalog.Catalog.replayJournal(Catalog.java:2446) 
[palo-fe.jar:0.15-SNAPSHOT]
            at 
org.apache.doris.master.Checkpoint.doCheckpoint(Checkpoint.java:116) 
[palo-fe.jar:0.15-SNAPSHOT]
            at 
org.apache.doris.master.Checkpoint.runAfterCatalogReady(Checkpoint.java:74) 
[palo-fe.jar:0.15-SNAPSHOT]
            at 
org.apache.doris.common.util.MasterDaemon.runOneCycle(MasterDaemon.java:58) 
[palo-fe.jar:0.15-SNAPSHOT]
            at org.apache.doris.common.util.Daemon.run(Daemon.java:116) 
[palo-fe.jar:0.15-SNAPSHOT]
    ```
    
    Since the reporting of a tablet and the deletion of a tablet are two 
independent events
    that are not mutually exclusive, the tablet may be deleted first and its 
report processed afterwards. In that case the replica lookup during replay 
fails with the IllegalStateException shown above.
    
    2.
    Change the tablet report info. The version that BE now reports for a tablet 
is the largest continuous version.
    E.g., for versions [1,2,3,5,7], the reported version of this tablet will be 3.
---
 be/src/olap/tablet.cpp                             | 27 ++++++++++++++--------
 be/src/olap/tablet.h                               |  6 +++--
 .../java/org/apache/doris/catalog/Catalog.java     |  8 +++++++
 3 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/be/src/olap/tablet.cpp b/be/src/olap/tablet.cpp
index a6f5648..862dccb 100644
--- a/be/src/olap/tablet.cpp
+++ b/be/src/olap/tablet.cpp
@@ -807,12 +807,12 @@ void Tablet::calc_missed_versions_unlocked(int64_t 
spec_version,
     }
 }
 
-void Tablet::max_continuous_version_from_beginning(Version* version) {
+void Tablet::max_continuous_version_from_beginning(Version* version, Version* 
max_version) {
     ReadLock rdlock(&_meta_lock);
-    _max_continuous_version_from_beginning_unlocked(version);
+    _max_continuous_version_from_beginning_unlocked(version, max_version);
 }
 
-void Tablet::_max_continuous_version_from_beginning_unlocked(Version* version) 
const {
+void Tablet::_max_continuous_version_from_beginning_unlocked(Version* version, 
Version* max_version) const {
     std::vector<Version> existing_versions;
     for (auto& rs : _tablet_meta->all_rs_metas()) {
         existing_versions.emplace_back(rs->version());
@@ -833,6 +833,9 @@ void 
Tablet::_max_continuous_version_from_beginning_unlocked(Version* version) c
         max_continuous_version = existing_versions[i];
     }
     *version = max_continuous_version;
+    if (max_version != nullptr) {
+        *max_version = existing_versions.back();
+    }
 }
 
 void Tablet::calculate_cumulative_point() {
@@ -1247,9 +1250,15 @@ void Tablet::build_tablet_report_info(TTabletInfo* 
tablet_info) {
     tablet_info->row_count = _tablet_meta->num_rows();
     tablet_info->data_size = _tablet_meta->tablet_footprint();
 
-    tablet_info->__set_version_miss(false);
-    auto max_rowset = rowset_with_max_version();
-    if (max_rowset == nullptr) {
+    // Here we need to report to FE if there are any missing versions of 
tablet.
+    // We start from the initial version and traverse backwards until we meet 
a discontinuous version.
+    Version cversion;
+    Version max_version;
+    _max_continuous_version_from_beginning_unlocked(&cversion, &max_version);
+    tablet_info->__set_version_miss(cversion.second < max_version.second);
+    // find rowset with max version
+    auto iter = _rs_version_map.find(max_version);
+    if (iter == _rs_version_map.end()) {
         // If the tablet is in running state, it must not be doing 
schema-change. so if we can not
         // access its rowsets, it means that the tablet is bad and needs to be 
reported to the FE
         // for subsequent repairs (through the cloning task)
@@ -1260,10 +1269,10 @@ void Tablet::build_tablet_report_info(TTabletInfo* 
tablet_info) {
         // still sets the state to normal when reporting. Note that every task 
has an timeout,
         // so if the task corresponding to this change hangs, when the task 
timeout, FE will know
         // and perform state modification operations.
-    } else {
-        tablet_info->__set_version_miss(check_version_integrity({0, 
max_rowset->version().second}, true));
     }
-    tablet_info->version = max_rowset->version().second;
+
+    // the report version is the largest continuous version, same logic as in 
FE side
+    tablet_info->version = cversion.second;
     // Useless but it is a required filed in TTabletInfo
     tablet_info->version_hash = 0;
     tablet_info->__set_partition_id(_tablet_meta->partition_id());
diff --git a/be/src/olap/tablet.h b/be/src/olap/tablet.h
index b11fedf..bf17327 100644
--- a/be/src/olap/tablet.h
+++ b/be/src/olap/tablet.h
@@ -176,7 +176,8 @@ public:
 
     // This function to find max continuous version from the beginning.
     // For example: If there are 1, 2, 3, 5, 6, 7 versions belongs tablet, 
then 3 is target.
-    void max_continuous_version_from_beginning(Version* version);
+    // 3 will be saved in "version", and 7 will be saved in "max_version", if 
max_version != nullptr
+    void max_continuous_version_from_beginning(Version* version, Version* 
max_version = nullptr);
 
     // operation for query
     OLAPStatus split_range(const OlapTuple& start_key_strings, const 
OlapTuple& end_key_strings,
@@ -272,7 +273,8 @@ private:
 
     // Returns:
     // version: the max continuous version from beginning
-    void _max_continuous_version_from_beginning_unlocked(Version* version) 
const;
+    // max_version: the max version of this tablet
+    void _max_continuous_version_from_beginning_unlocked(Version* version, 
Version* max_version) const;
     RowsetSharedPtr _rowset_with_largest_size();
     /// Delete stale rowset by version. This method not only delete the 
version in expired rowset map,
     /// but also delete the version in rowset meta vector.
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Catalog.java 
b/fe/fe-core/src/main/java/org/apache/doris/catalog/Catalog.java
index e2bf25e..c92c579 100755
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Catalog.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Catalog.java
@@ -7001,6 +7001,14 @@ public class Catalog {
         List<BackendReplicasInfo.ReplicaReportInfo> replicaInfos = 
backendReplicasInfo.getReplicaReportInfos();
 
         for (BackendReplicasInfo.ReplicaReportInfo info : replicaInfos) {
+            if (tabletInvertedIndex.getTabletMeta(info.tabletId) == null) {
+                // The tablet has been deleted. Because the reporting of 
tablet and
+                // the deletion of tablet are two independent events,
+                // and directly do not do mutually exclusive processing,
+                // so it may appear that the tablet is deleted first, and the 
reporting information is processed later.
+                // Here we simply ignore the deleted tablet.
+                continue;
+            }
             Replica replica = tabletInvertedIndex.getReplica(info.tabletId, 
backendId);
             if (replica == null) {
                 LOG.warn("failed to find replica of tablet {} on backend {} 
when replaying backend report info",

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to