This is an automated email from the ASF dual-hosted git repository.
dlmarion pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/accumulo.git
The following commit(s) were added to refs/heads/main by this push:
new 8e9f6c2b96 Emit tablets needing recovery from TabletGroupWatcher
(#6360)
8e9f6c2b96 is described below
commit 8e9f6c2b96eec28bd675ec7e77f789932a2cb00c
Author: Dave Marion <[email protected]>
AuthorDate: Tue May 5 07:37:18 2026 -0400
Emit tablets needing recovery from TabletGroupWatcher (#6360)
This change emits a metric from each of the root, meta, and user
TabletGroupWatcher threads reporting the number of tablets
that the thread sees as needing recovery.
---
.../org/apache/accumulo/core/metrics/Metric.java | 9 +++++++++
.../accumulo/manager/TabletGroupWatcher.java | 23 ++++++++++++++++++++--
2 files changed, 30 insertions(+), 2 deletions(-)
diff --git a/core/src/main/java/org/apache/accumulo/core/metrics/Metric.java
b/core/src/main/java/org/apache/accumulo/core/metrics/Metric.java
index ce55be21d9..46cfadd347 100644
--- a/core/src/main/java/org/apache/accumulo/core/metrics/Metric.java
+++ b/core/src/main/java/org/apache/accumulo/core/metrics/Metric.java
@@ -372,12 +372,21 @@ public enum Metric {
MANAGER_ROOT_TGW_ERRORS("accumulo.tabletmgmt.root.errors", MetricType.GAUGE,
"Error count encountered by the TabletGroupWatcher for the ROOT data
level.",
MetricDocSection.MANAGER, "Root Tablet Watcher Errors", null, NUMBER),
+ MANAGER_ROOT_TGW_RECOVERY("accumulo.tabletmgmt.root.recovery",
MetricType.GAUGE,
+ "Recovery count encountered by the TabletGroupWatcher for the ROOT data
level.",
+ MetricDocSection.MANAGER, "Root Tablet Watcher Recoveries", null,
NUMBER),
MANAGER_META_TGW_ERRORS("accumulo.tabletmgmt.meta.errors", MetricType.GAUGE,
"Error count encountered by the TabletGroupWatcher for the META data
level.",
MetricDocSection.MANAGER, "Meta Tablet Watcher Errors", null, NUMBER),
+ MANAGER_META_TGW_RECOVERY("accumulo.tabletmgmt.meta.recovery",
MetricType.GAUGE,
+ "Recovery count encountered by the TabletGroupWatcher for the META data
level.",
+ MetricDocSection.MANAGER, "Meta Tablet Watcher Recoveries", null,
NUMBER),
MANAGER_USER_TGW_ERRORS("accumulo.tabletmgmt.user.errors", MetricType.GAUGE,
"Error count encountered by the TabletGroupWatcher for the USER data
level.",
MetricDocSection.MANAGER, "User Tablet Watcher Errors", null, NUMBER),
+ MANAGER_USER_TGW_RECOVERY("accumulo.tabletmgmt.user.recovery",
MetricType.GAUGE,
+ "Recovery count encountered by the TabletGroupWatcher for the USER data
level.",
+ MetricDocSection.MANAGER, "User Tablet Watcher Recoveries", null,
NUMBER),
MANAGER_GOAL_STATE("accumulo.manager.goal.state", MetricType.GAUGE,
"Manager goal state: -1=unknown, 0=CLEAN_STOP, 1=SAFE_MODE, 2=NORMAL.",
MetricDocSection.MANAGER, "Manager Goal State", null, NUMBER),
diff --git
a/server/manager/src/main/java/org/apache/accumulo/manager/TabletGroupWatcher.java
b/server/manager/src/main/java/org/apache/accumulo/manager/TabletGroupWatcher.java
index c1192a7421..c69bc2d0d5 100644
---
a/server/manager/src/main/java/org/apache/accumulo/manager/TabletGroupWatcher.java
+++
b/server/manager/src/main/java/org/apache/accumulo/manager/TabletGroupWatcher.java
@@ -27,8 +27,11 @@ import static
org.apache.accumulo.core.metrics.Metric.COMPACTION_META_SVC_ERRORS
import static
org.apache.accumulo.core.metrics.Metric.COMPACTION_ROOT_SVC_ERRORS;
import static
org.apache.accumulo.core.metrics.Metric.COMPACTION_USER_SVC_ERRORS;
import static org.apache.accumulo.core.metrics.Metric.MANAGER_META_TGW_ERRORS;
+import static
org.apache.accumulo.core.metrics.Metric.MANAGER_META_TGW_RECOVERY;
import static org.apache.accumulo.core.metrics.Metric.MANAGER_ROOT_TGW_ERRORS;
+import static
org.apache.accumulo.core.metrics.Metric.MANAGER_ROOT_TGW_RECOVERY;
import static org.apache.accumulo.core.metrics.Metric.MANAGER_USER_TGW_ERRORS;
+import static
org.apache.accumulo.core.metrics.Metric.MANAGER_USER_TGW_RECOVERY;
import java.io.IOException;
import java.time.Duration;
@@ -140,6 +143,7 @@ abstract class TabletGroupWatcher extends
AccumuloDaemonThread {
private static class TabletGroupWatcherMetrics implements MetricsProducer {
private final AtomicLong errorsGauge = new AtomicLong(0);
+ private final AtomicLong recoveryGauge = new AtomicLong(0);
private final AtomicInteger compactionConfigurationError = new
AtomicInteger(0);
private final Ample.DataLevel level;
@@ -151,6 +155,10 @@ abstract class TabletGroupWatcher extends
AccumuloDaemonThread {
errorsGauge.incrementAndGet();
}
+ public void setTabletGroupWatcherRecovery(long recoveries) {
+ recoveryGauge.set(recoveries);
+ }
+
public void setCompactionServiceConfigurationError() {
this.compactionConfigurationError.set(1);
}
@@ -163,24 +171,30 @@ abstract class TabletGroupWatcher extends
AccumuloDaemonThread {
public void registerMetrics(MeterRegistry registry) {
Metric errorMetric;
+ Metric recoveryMetric;
Metric svcCfgErrorMetric;
switch (level) {
case USER -> {
errorMetric = MANAGER_USER_TGW_ERRORS;
+ recoveryMetric = MANAGER_USER_TGW_RECOVERY;
svcCfgErrorMetric = COMPACTION_USER_SVC_ERRORS;
}
case METADATA -> {
errorMetric = MANAGER_META_TGW_ERRORS;
+ recoveryMetric = MANAGER_META_TGW_RECOVERY;
svcCfgErrorMetric = COMPACTION_META_SVC_ERRORS;
}
case ROOT -> {
errorMetric = MANAGER_ROOT_TGW_ERRORS;
+ recoveryMetric = MANAGER_ROOT_TGW_RECOVERY;
svcCfgErrorMetric = COMPACTION_ROOT_SVC_ERRORS;
}
default -> throw new IllegalStateException("Unknown level " + level);
}
Gauge.builder(errorMetric.getName(), errorsGauge, AtomicLong::get)
.description(errorMetric.getDescription()).register(registry);
+ Gauge.builder(recoveryMetric.getName(), recoveryGauge, AtomicLong::get)
+ .description(recoveryMetric.getDescription()).register(registry);
Gauge.builder(svcCfgErrorMetric.getName(), compactionConfigurationError,
AtomicInteger::get)
.description(svcCfgErrorMetric.getDescription()).register(registry);
@@ -540,6 +554,8 @@ abstract class TabletGroupWatcher extends
AccumuloDaemonThread {
Set<TServerInstance> filteredServersToShutdown =
new HashSet<>(tableMgmtParams.getServersToShutdown());
+ long tabletsNeedingRecovery = 0;
+
while (iter.hasNext() && !manager.isShutdownRequested()) {
final TabletManagement mti = iter.next();
if (mti == null) {
@@ -616,8 +632,8 @@ abstract class TabletGroupWatcher extends
AccumuloDaemonThread {
final Set<ManagementAction> actions = mti.getActions();
if (actions.contains(ManagementAction.NEEDS_RECOVERY) && goal !=
TabletGoalState.HOSTED) {
- LOG.warn("Tablet has wals, but goal is not hosted. Tablet: {},
goal:{}", tm.getExtent(),
- goal);
+ LOG.warn("Tablet has wals, but goal is not hosted. This is an error.
Tablet: {}, goal:{}",
+ tm.getExtent(), goal);
}
if (actions.contains(ManagementAction.NEEDS_VOLUME_REPLACEMENT)) {
@@ -717,6 +733,7 @@ abstract class TabletGroupWatcher extends
AccumuloDaemonThread {
&& recoverySession.recoverLogs(tm.getLogs())) {
LOG.debug("Not hosting {} as it needs recovery, logs: {}",
tm.getExtent(),
tm.getLogs().size());
+ tabletsNeedingRecovery++;
continue;
}
switch (state) {
@@ -769,6 +786,8 @@ abstract class TabletGroupWatcher extends
AccumuloDaemonThread {
}
}
+ this.metrics.setTabletGroupWatcherRecovery(tabletsNeedingRecovery);
+
flushChanges(tLists);
if (isFullScan) {