This is an automated email from the ASF dual-hosted git repository.

dlmarion pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/accumulo.git


The following commit(s) were added to refs/heads/main by this push:
     new 8e9f6c2b96 Emit tablets needing recovery from TabletGroupWatcher 
(#6360)
8e9f6c2b96 is described below

commit 8e9f6c2b96eec28bd675ec7e77f789932a2cb00c
Author: Dave Marion <[email protected]>
AuthorDate: Tue May 5 07:37:18 2026 -0400

    Emit tablets needing recovery from TabletGroupWatcher (#6360)
    
    This change emits a gauge metric from each of the root, meta, and
    user TabletGroupWatcher threads reporting the number of tablets
    that the thread sees as needing recovery.
---
 .../org/apache/accumulo/core/metrics/Metric.java   |  9 +++++++++
 .../accumulo/manager/TabletGroupWatcher.java       | 23 ++++++++++++++++++++--
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/core/src/main/java/org/apache/accumulo/core/metrics/Metric.java 
b/core/src/main/java/org/apache/accumulo/core/metrics/Metric.java
index ce55be21d9..46cfadd347 100644
--- a/core/src/main/java/org/apache/accumulo/core/metrics/Metric.java
+++ b/core/src/main/java/org/apache/accumulo/core/metrics/Metric.java
@@ -372,12 +372,21 @@ public enum Metric {
   MANAGER_ROOT_TGW_ERRORS("accumulo.tabletmgmt.root.errors", MetricType.GAUGE,
       "Error count encountered by the TabletGroupWatcher for the ROOT data 
level.",
       MetricDocSection.MANAGER, "Root Tablet Watcher Errors", null, NUMBER),
+  MANAGER_ROOT_TGW_RECOVERY("accumulo.tabletmgmt.root.recovery", 
MetricType.GAUGE,
+      "Recovery count encountered by the TabletGroupWatcher for the ROOT data 
level.",
+      MetricDocSection.MANAGER, "Root Tablet Watcher Recoveries", null, 
NUMBER),
   MANAGER_META_TGW_ERRORS("accumulo.tabletmgmt.meta.errors", MetricType.GAUGE,
       "Error count encountered by the TabletGroupWatcher for the META data 
level.",
       MetricDocSection.MANAGER, "Meta Tablet Watcher Errors", null, NUMBER),
+  MANAGER_META_TGW_RECOVERY("accumulo.tabletmgmt.meta.recovery", 
MetricType.GAUGE,
+      "Recovery count encountered by the TabletGroupWatcher for the META data 
level.",
+      MetricDocSection.MANAGER, "Meta Tablet Watcher Recoveries", null, 
NUMBER),
   MANAGER_USER_TGW_ERRORS("accumulo.tabletmgmt.user.errors", MetricType.GAUGE,
       "Error count encountered by the TabletGroupWatcher for the USER data 
level.",
       MetricDocSection.MANAGER, "User Tablet Watcher Errors", null, NUMBER),
+  MANAGER_USER_TGW_RECOVERY("accumulo.tabletmgmt.user.recovery", 
MetricType.GAUGE,
+      "Recovery count encountered by the TabletGroupWatcher for the USER data 
level.",
+      MetricDocSection.MANAGER, "User Tablet Watcher Recoveries", null, 
NUMBER),
   MANAGER_GOAL_STATE("accumulo.manager.goal.state", MetricType.GAUGE,
       "Manager goal state: -1=unknown, 0=CLEAN_STOP, 1=SAFE_MODE, 2=NORMAL.",
       MetricDocSection.MANAGER, "Manager Goal State", null, NUMBER),
diff --git 
a/server/manager/src/main/java/org/apache/accumulo/manager/TabletGroupWatcher.java
 
b/server/manager/src/main/java/org/apache/accumulo/manager/TabletGroupWatcher.java
index c1192a7421..c69bc2d0d5 100644
--- 
a/server/manager/src/main/java/org/apache/accumulo/manager/TabletGroupWatcher.java
+++ 
b/server/manager/src/main/java/org/apache/accumulo/manager/TabletGroupWatcher.java
@@ -27,8 +27,11 @@ import static 
org.apache.accumulo.core.metrics.Metric.COMPACTION_META_SVC_ERRORS
 import static 
org.apache.accumulo.core.metrics.Metric.COMPACTION_ROOT_SVC_ERRORS;
 import static 
org.apache.accumulo.core.metrics.Metric.COMPACTION_USER_SVC_ERRORS;
 import static org.apache.accumulo.core.metrics.Metric.MANAGER_META_TGW_ERRORS;
+import static 
org.apache.accumulo.core.metrics.Metric.MANAGER_META_TGW_RECOVERY;
 import static org.apache.accumulo.core.metrics.Metric.MANAGER_ROOT_TGW_ERRORS;
+import static 
org.apache.accumulo.core.metrics.Metric.MANAGER_ROOT_TGW_RECOVERY;
 import static org.apache.accumulo.core.metrics.Metric.MANAGER_USER_TGW_ERRORS;
+import static 
org.apache.accumulo.core.metrics.Metric.MANAGER_USER_TGW_RECOVERY;
 
 import java.io.IOException;
 import java.time.Duration;
@@ -140,6 +143,7 @@ abstract class TabletGroupWatcher extends 
AccumuloDaemonThread {
 
   private static class TabletGroupWatcherMetrics implements MetricsProducer {
     private final AtomicLong errorsGauge = new AtomicLong(0);
+    private final AtomicLong recoveryGauge = new AtomicLong(0);
     private final AtomicInteger compactionConfigurationError = new 
AtomicInteger(0);
     private final Ample.DataLevel level;
 
@@ -151,6 +155,10 @@ abstract class TabletGroupWatcher extends 
AccumuloDaemonThread {
       errorsGauge.incrementAndGet();
     }
 
+    public void setTabletGroupWatcherRecovery(long recoveries) {
+      recoveryGauge.set(recoveries);
+    }
+
     public void setCompactionServiceConfigurationError() {
       this.compactionConfigurationError.set(1);
     }
@@ -163,24 +171,30 @@ abstract class TabletGroupWatcher extends 
AccumuloDaemonThread {
     public void registerMetrics(MeterRegistry registry) {
 
       Metric errorMetric;
+      Metric recoveryMetric;
       Metric svcCfgErrorMetric;
       switch (level) {
         case USER -> {
           errorMetric = MANAGER_USER_TGW_ERRORS;
+          recoveryMetric = MANAGER_USER_TGW_RECOVERY;
           svcCfgErrorMetric = COMPACTION_USER_SVC_ERRORS;
         }
         case METADATA -> {
           errorMetric = MANAGER_META_TGW_ERRORS;
+          recoveryMetric = MANAGER_META_TGW_RECOVERY;
           svcCfgErrorMetric = COMPACTION_META_SVC_ERRORS;
         }
         case ROOT -> {
           errorMetric = MANAGER_ROOT_TGW_ERRORS;
+          recoveryMetric = MANAGER_ROOT_TGW_RECOVERY;
           svcCfgErrorMetric = COMPACTION_ROOT_SVC_ERRORS;
         }
         default -> throw new IllegalStateException("Unknown level " + level);
       }
       Gauge.builder(errorMetric.getName(), errorsGauge, AtomicLong::get)
           .description(errorMetric.getDescription()).register(registry);
+      Gauge.builder(recoveryMetric.getName(), recoveryGauge, AtomicLong::get)
+          .description(recoveryMetric.getDescription()).register(registry);
       Gauge.builder(svcCfgErrorMetric.getName(), compactionConfigurationError, 
AtomicInteger::get)
           .description(svcCfgErrorMetric.getDescription()).register(registry);
 
@@ -540,6 +554,8 @@ abstract class TabletGroupWatcher extends 
AccumuloDaemonThread {
     Set<TServerInstance> filteredServersToShutdown =
         new HashSet<>(tableMgmtParams.getServersToShutdown());
 
+    long tabletsNeedingRecovery = 0;
+
     while (iter.hasNext() && !manager.isShutdownRequested()) {
       final TabletManagement mti = iter.next();
       if (mti == null) {
@@ -616,8 +632,8 @@ abstract class TabletGroupWatcher extends 
AccumuloDaemonThread {
       final Set<ManagementAction> actions = mti.getActions();
 
       if (actions.contains(ManagementAction.NEEDS_RECOVERY) && goal != 
TabletGoalState.HOSTED) {
-        LOG.warn("Tablet has wals, but goal is not hosted. Tablet: {}, 
goal:{}", tm.getExtent(),
-            goal);
+        LOG.warn("Tablet has wals, but goal is not hosted. This is an error. 
Tablet: {}, goal:{}",
+            tm.getExtent(), goal);
       }
 
       if (actions.contains(ManagementAction.NEEDS_VOLUME_REPLACEMENT)) {
@@ -717,6 +733,7 @@ abstract class TabletGroupWatcher extends 
AccumuloDaemonThread {
               && recoverySession.recoverLogs(tm.getLogs())) {
             LOG.debug("Not hosting {} as it needs recovery, logs: {}", 
tm.getExtent(),
                 tm.getLogs().size());
+            tabletsNeedingRecovery++;
             continue;
           }
           switch (state) {
@@ -769,6 +786,8 @@ abstract class TabletGroupWatcher extends 
AccumuloDaemonThread {
       }
     }
 
+    this.metrics.setTabletGroupWatcherRecovery(tabletsNeedingRecovery);
+
     flushChanges(tLists);
 
     if (isFullScan) {

Reply via email to