This is an automated email from the ASF dual-hosted git repository.
yangzhg pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git
The following commit(s) were added to refs/heads/master by this push:
new abcd56c [Enhance] Support show unrecoverable tablets (#6045)
abcd56c is described below
commit abcd56c6c827dcbddf103870462e91c8497d51c6
Author: Mingyu Chen <[email protected]>
AuthorDate: Tue Jun 22 09:19:12 2021 +0800
[Enhance] Support show unrecoverable tablets (#6045)
* [Enhance] Support show unrecoverable tablets
Unrecoverable tablets are tablets for which none of the replicas are
healthy.
We should be able to find these tablets so that they can be fixed by manual intervention.
And these tablets should not be added to the tablet scheduler.
---
.../main/java/org/apache/doris/catalog/Tablet.java | 9 ++++++--
.../java/org/apache/doris/clone/TabletChecker.java | 5 ++++
.../org/apache/doris/clone/TabletScheduler.java | 24 ++++++++++---------
.../common/proc/IncompleteTabletsProcNode.java | 9 ++++++--
.../apache/doris/common/proc/StatisticProcDir.java | 27 +++++++++++++++-------
.../java/org/apache/doris/httpv2/HttpServer.java | 3 +++
.../org/apache/doris/master/ReportHandler.java | 3 ++-
7 files changed, 56 insertions(+), 24 deletions(-)
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Tablet.java
b/fe/fe-core/src/main/java/org/apache/doris/catalog/Tablet.java
index 361d6ec..b7288db 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Tablet.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Tablet.java
@@ -62,6 +62,7 @@ public class Tablet extends MetaObject implements Writable {
COLOCATE_MISMATCH, // replicas do not all locate in right colocate
backends set.
COLOCATE_REDUNDANT, // replicas match the colocate backends set, but
redundant.
NEED_FURTHER_REPAIR, // one of replicas need a definite repair.
+ UNRECOVERABLE // non of replicas are healthy
}
@SerializedName(value = "id")
@@ -455,7 +456,9 @@ public class Tablet extends MetaObject implements Writable {
// 1. alive replicas are not enough
int aliveBackendsNum = aliveBeIdsInCluster.size();
- if (alive < replicationNum && replicas.size() >= aliveBackendsNum
+ if (alive == 0) {
+ return Pair.create(TabletStatus.UNRECOVERABLE, Priority.VERY_HIGH);
+ } else if (alive < replicationNum && replicas.size() >=
aliveBackendsNum
&& aliveBackendsNum >= replicationNum && replicationNum > 1) {
// there is no enough backend for us to create a new replica, so
we have to delete an existing replica,
// so there can be available backend for us to create a new
replica.
@@ -473,7 +476,9 @@ public class Tablet extends MetaObject implements Writable {
}
// 2. version complete replicas are not enough
- if (aliveAndVersionComplete < (replicationNum / 2) + 1) {
+ if (aliveAndVersionComplete == 0) {
+ return Pair.create(TabletStatus.UNRECOVERABLE, Priority.VERY_HIGH);
+ } else if (aliveAndVersionComplete < (replicationNum / 2) + 1) {
return Pair.create(TabletStatus.VERSION_INCOMPLETE,
TabletSchedCtx.Priority.HIGH);
} else if (aliveAndVersionComplete < replicationNum) {
return Pair.create(TabletStatus.VERSION_INCOMPLETE,
TabletSchedCtx.Priority.NORMAL);
diff --git a/fe/fe-core/src/main/java/org/apache/doris/clone/TabletChecker.java
b/fe/fe-core/src/main/java/org/apache/doris/clone/TabletChecker.java
index 4e375c1..2c9666d 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/clone/TabletChecker.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/clone/TabletChecker.java
@@ -329,6 +329,11 @@ public class TabletChecker extends MasterDaemon {
// Only set last status check time when status is healthy.
tablet.setLastStatusCheckTime(startTime);
continue;
+ } else if (statusWithPrio.first == TabletStatus.UNRECOVERABLE)
{
+ // This tablet is not recoverable, do not set it into
tablet scheduler
+ // all UNRECOVERABLE tablet can be seen from "show proc
'/statistic'"
+ counter.unhealthyTabletNum++;
+ continue;
} else if (isInPrios) {
statusWithPrio.second = TabletSchedCtx.Priority.VERY_HIGH;
prioPartIsHealthy = false;
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/clone/TabletScheduler.java
b/fe/fe-core/src/main/java/org/apache/doris/clone/TabletScheduler.java
index 0794296..9c4b2b4 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/clone/TabletScheduler.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/clone/TabletScheduler.java
@@ -575,17 +575,19 @@ public class TabletScheduler extends MasterDaemon {
case FORCE_REDUNDANT:
handleRedundantReplica(tabletCtx, true);
break;
- case REPLICA_MISSING_IN_CLUSTER:
- handleReplicaClusterMigration(tabletCtx, batchTask);
- break;
- case COLOCATE_MISMATCH:
- handleColocateMismatch(tabletCtx, batchTask);
- break;
- case COLOCATE_REDUNDANT:
- handleColocateRedundant(tabletCtx);
- break;
- default:
- break;
+ case REPLICA_MISSING_IN_CLUSTER:
+ handleReplicaClusterMigration(tabletCtx, batchTask);
+ break;
+ case COLOCATE_MISMATCH:
+ handleColocateMismatch(tabletCtx, batchTask);
+ break;
+ case COLOCATE_REDUNDANT:
+ handleColocateRedundant(tabletCtx);
+ break;
+ case UNRECOVERABLE:
+ throw new SchedException(Status.UNRECOVERABLE, "tablet is
unrecoverable");
+ default:
+ break;
}
} else {
// balance
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/common/proc/IncompleteTabletsProcNode.java
b/fe/fe-core/src/main/java/org/apache/doris/common/proc/IncompleteTabletsProcNode.java
index b278c47..4cdf5de 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/common/proc/IncompleteTabletsProcNode.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/common/proc/IncompleteTabletsProcNode.java
@@ -29,20 +29,23 @@ import java.util.List;
public class IncompleteTabletsProcNode implements ProcNodeInterface {
public static final ImmutableList<String> TITLE_NAMES = new
ImmutableList.Builder<String>()
-
.add("UnhealthyTablets").add("InconsistentTablets").add("CloningTablets")
+
.add("UnhealthyTablets").add("InconsistentTablets").add("CloningTablets").add("BadTablets")
.build();
private static final Joiner JOINER = Joiner.on(",");
Collection<Long> unhealthyTabletIds;
Collection<Long> inconsistentTabletIds;
Collection<Long> cloningTabletIds;
+ Collection<Long> unrecoverableTabletIds;
public IncompleteTabletsProcNode(Collection<Long> unhealthyTabletIds,
Collection<Long> inconsistentTabletIds,
- Collection<Long> cloningTabletIds) {
+ Collection<Long> cloningTabletIds,
+ Collection<Long> unrecoverableTabletIds) {
this.unhealthyTabletIds = unhealthyTabletIds;
this.inconsistentTabletIds = inconsistentTabletIds;
this.cloningTabletIds = cloningTabletIds;
+ this.unrecoverableTabletIds = unrecoverableTabletIds;
}
@Override
@@ -56,9 +59,11 @@ public class IncompleteTabletsProcNode implements
ProcNodeInterface {
String incompleteTablets =
JOINER.join(Arrays.asList(unhealthyTabletIds));
String inconsistentTablets =
JOINER.join(Arrays.asList(inconsistentTabletIds));
String cloningTablets = JOINER.join(Arrays.asList(cloningTabletIds));
+ String unrecoverableTablets =
JOINER.join(Arrays.asList(unrecoverableTabletIds));
row.add(incompleteTablets);
row.add(inconsistentTablets);
row.add(cloningTablets);
+ row.add(unrecoverableTablets);
result.addRow(row);
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/common/proc/StatisticProcDir.java
b/fe/fe-core/src/main/java/org/apache/doris/common/proc/StatisticProcDir.java
index 001f00c..596267c 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/common/proc/StatisticProcDir.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/common/proc/StatisticProcDir.java
@@ -17,10 +17,6 @@
package org.apache.doris.common.proc;
-import com.google.common.base.Preconditions;
-import com.google.common.collect.HashMultimap;
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.Multimap;
import org.apache.doris.catalog.Catalog;
import org.apache.doris.catalog.Database;
import org.apache.doris.catalog.MaterializedIndex;
@@ -38,6 +34,12 @@ import org.apache.doris.common.util.ListComparator;
import org.apache.doris.system.SystemInfoService;
import org.apache.doris.task.AgentTaskQueue;
import org.apache.doris.thrift.TTaskType;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.HashMultimap;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Multimap;
+
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
@@ -49,7 +51,7 @@ public class StatisticProcDir implements ProcDirInterface {
public static final ImmutableList<String> TITLE_NAMES = new
ImmutableList.Builder<String>()
.add("DbId").add("DbName").add("TableNum").add("PartitionNum")
.add("IndexNum").add("TabletNum").add("ReplicaNum").add("UnhealthyTabletNum")
- .add("InconsistentTabletNum").add("CloningTabletNum")
+
.add("InconsistentTabletNum").add("CloningTabletNum").add("BadTabletNum")
.build();
private static final Logger LOG =
LogManager.getLogger(StatisticProcDir.class);
@@ -61,12 +63,15 @@ public class StatisticProcDir implements ProcDirInterface {
Multimap<Long, Long> inconsistentTabletIds;
// db id -> set(tablet id)
Multimap<Long, Long> cloningTabletIds;
+ // db id -> set(tablet id)
+ Multimap<Long, Long> unrecoverableTabletIds;
public StatisticProcDir(Catalog catalog) {
this.catalog = catalog;
unhealthyTabletIds = HashMultimap.create();
inconsistentTabletIds = HashMultimap.create();
cloningTabletIds = HashMultimap.create();
+ unrecoverableTabletIds = HashMultimap.create();
}
@Override
@@ -140,8 +145,11 @@ public class StatisticProcDir implements ProcDirInterface {
// here we treat REDUNDANT as HEALTHY, for
user friendly.
if (res.first != TabletStatus.HEALTHY &&
res.first != TabletStatus.REDUNDANT
- && res.first !=
TabletStatus.COLOCATE_REDUNDANT && res.first !=
TabletStatus.NEED_FURTHER_REPAIR) {
+ && res.first !=
TabletStatus.COLOCATE_REDUNDANT && res.first != TabletStatus.NEED_FURTHER_REPAIR
+ && res.first !=
TabletStatus.UNRECOVERABLE) {
unhealthyTabletIds.put(dbId,
tablet.getId());
+ } else if (res.first ==
TabletStatus.UNRECOVERABLE) {
+ unrecoverableTabletIds.put(dbId,
tablet.getId());
}
if (!tablet.isConsistent()) {
@@ -166,6 +174,7 @@ public class StatisticProcDir implements ProcDirInterface {
oneLine.add(unhealthyTabletIds.get(dbId).size());
oneLine.add(inconsistentTabletIds.get(dbId).size());
oneLine.add(cloningTabletIds.get(dbId).size());
+ oneLine.add(unrecoverableTabletIds.get(dbId).size());
lines.add(oneLine);
@@ -195,6 +204,7 @@ public class StatisticProcDir implements ProcDirInterface {
finalLine.add(unhealthyTabletIds.size());
finalLine.add(inconsistentTabletIds.size());
finalLine.add(cloningTabletIds.size());
+ finalLine.add(unrecoverableTabletIds.size());
lines.add(finalLine);
// add result
@@ -224,7 +234,8 @@ public class StatisticProcDir implements ProcDirInterface {
}
return new IncompleteTabletsProcNode(unhealthyTabletIds.get(dbId),
- inconsistentTabletIds.get(dbId),
- cloningTabletIds.get(dbId));
+ inconsistentTabletIds.get(dbId),
+ cloningTabletIds.get(dbId),
+ unrecoverableTabletIds.get(dbId));
}
}
diff --git a/fe/fe-core/src/main/java/org/apache/doris/httpv2/HttpServer.java
b/fe/fe-core/src/main/java/org/apache/doris/httpv2/HttpServer.java
index 0e57825..8f062ea 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/httpv2/HttpServer.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/httpv2/HttpServer.java
@@ -65,6 +65,9 @@ public class HttpServer extends SpringBootServletInitializer {
properties.put("spring.http.encoding.force", true);
properties.put("spring.servlet.multipart.max-file-size",
this.maxFileSize);
properties.put("spring.servlet.multipart.max-request-size",
this.maxRequestSize);
+ // This is to disable the spring-boot-devtools restart feature.
+ // To avoid some unexpected behavior.
+ System.setProperty("spring.devtools.restart.enabled", "false");
properties.put("logging.config", dorisHome + "/conf/" +
SpringLog4j2Config.SPRING_LOG_XML_FILE);
new SpringApplicationBuilder()
.sources(HttpServer.class)
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/master/ReportHandler.java
b/fe/fe-core/src/main/java/org/apache/doris/master/ReportHandler.java
index 1782b79..31a2dc4 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/master/ReportHandler.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/master/ReportHandler.java
@@ -1014,7 +1014,8 @@ public class ReportHandler extends Daemon {
db.getClusterName(), visibleVersion, visibleVersionHash,
replicationNum, aliveBeIdsInCluster);
- if (status.first == TabletStatus.VERSION_INCOMPLETE ||
status.first == TabletStatus.REPLICA_MISSING) {
+ if (status.first == TabletStatus.VERSION_INCOMPLETE ||
status.first == TabletStatus.REPLICA_MISSING
+ || status.first == TabletStatus.UNRECOVERABLE) {
long lastFailedVersion = -1L;
long lastFailedVersionHash = 0L;
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]