cshannon commented on code in PR #4771:
URL: https://github.com/apache/accumulo/pull/4771#discussion_r1694001849


##########
server/manager/src/main/java/org/apache/accumulo/manager/metrics/fate/FateMetrics.java:
##########
@@ -126,44 +111,56 @@ private void update() {
 
   @Override
   public void registerMetrics(final MeterRegistry registry) {
+    var type = fateStore.type().name().toLowerCase();
+    var instanceTypeTag = Tag.of("instanceType", type);
+
     registry.gauge(METRICS_FATE_OPS, totalCurrentOpsGauge);
-    registry.gauge(METRICS_FATE_OPS_ACTIVITY, totalOpsGauge);
-    registry.gauge(METRICS_FATE_ERRORS, List.of(Tag.of("type", 
"zk.connection")), fateErrorsGauge);
-    registry.gauge(METRICS_FATE_TX,
-        List.of(Tag.of("state", 
ReadOnlyFateStore.TStatus.NEW.name().toLowerCase())), newTxGauge);
+
+    registry.gauge(METRICS_FATE_TX, List
+        .of(Tag.of("state", 
ReadOnlyFateStore.TStatus.NEW.name().toLowerCase()), instanceTypeTag),
+        newTxGauge);
     registry.gauge(METRICS_FATE_TX,
-        List.of(Tag.of("state", 
ReadOnlyFateStore.TStatus.SUBMITTED.name().toLowerCase())),
+        List.of(Tag.of("state", 
ReadOnlyFateStore.TStatus.SUBMITTED.name().toLowerCase()),
+            instanceTypeTag),
         submittedTxGauge);
     registry.gauge(METRICS_FATE_TX,
-        List.of(Tag.of("state", 
ReadOnlyFateStore.TStatus.IN_PROGRESS.name().toLowerCase())),
+        List.of(Tag.of("state", 
ReadOnlyFateStore.TStatus.IN_PROGRESS.name().toLowerCase()),
+            instanceTypeTag),
         inProgressTxGauge);
     registry.gauge(METRICS_FATE_TX,
-        List.of(Tag.of("state", 
ReadOnlyFateStore.TStatus.FAILED_IN_PROGRESS.name().toLowerCase())),
+        List.of(Tag.of("state", 
ReadOnlyFateStore.TStatus.FAILED_IN_PROGRESS.name().toLowerCase()),
+            instanceTypeTag),
         failedInProgressTxGauge);
     registry.gauge(METRICS_FATE_TX,
-        List.of(Tag.of("state", 
ReadOnlyFateStore.TStatus.FAILED.name().toLowerCase())),
+        List.of(Tag.of("state", 
ReadOnlyFateStore.TStatus.FAILED.name().toLowerCase()),
+            instanceTypeTag),
         failedTxGauge);
     registry.gauge(METRICS_FATE_TX,
-        List.of(Tag.of("state", 
ReadOnlyFateStore.TStatus.SUCCESSFUL.name().toLowerCase())),
+        List.of(Tag.of("state", 
ReadOnlyFateStore.TStatus.SUCCESSFUL.name().toLowerCase()),
+            instanceTypeTag),
         successfulTxGauge);
     registry.gauge(METRICS_FATE_TX,
-        List.of(Tag.of("state", 
ReadOnlyFateStore.TStatus.UNKNOWN.name().toLowerCase())),
+        List.of(Tag.of("state", 
ReadOnlyFateStore.TStatus.UNKNOWN.name().toLowerCase()),
+            instanceTypeTag),
         unknownTxGauge);
 
-    update();
-
     // get fate status is read only operation - no reason to be nice on 
shutdown.
-    ScheduledExecutorService scheduler =
-        ThreadPools.getServerThreadPools().createScheduledExecutorService(1, 
"fateMetricsPoller");
+    ScheduledExecutorService scheduler = ThreadPools.getServerThreadPools()
+        .createScheduledExecutorService(1, type + "FateMetricsPoller");
     Runtime.getRuntime().addShutdownHook(new Thread(scheduler::shutdownNow));
 
+    // Only update as part of the scheduler thread.
+    // We have to call update() in a new thread because this method to
+    // register metrics is called on start up in the Manager before it's 
finished
+    // initializing, so we can't scan the User fate store until after startup 
is done.
+    // If we called update() here in this method directly we would get stuck 
forever.

Review Comment:
   Yeah, it ended up deadlocked and stuck, which was the root cause of the 
issue described [here](https://github.com/apache/accumulo/pull/4696#issue-2370686608).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to