keith-turner commented on code in PR #4771:
URL: https://github.com/apache/accumulo/pull/4771#discussion_r1694001093
##########
test/src/main/java/org/apache/accumulo/test/metrics/MetricsIT.java:
##########
@@ -287,4 +287,28 @@ public void metricTags() throws Exception {
});
}
}
+
+ @Test
+ public void fateMetrics() throws Exception {
+ doWorkToGenerateMetrics();
+ cluster.stop();
+
+ List<String> statsDMetrics;
+
+ while (!(statsDMetrics = sink.getLines()).isEmpty()) {
+ statsDMetrics.stream().filter(line ->
line.startsWith("accumulo.fate.tx"))
+ .map(TestStatsDSink::parseStatsDMetric).forEach(a -> {
+ var t = a.getTags();
+ log.debug("METRICS, received from statsd - name: '{}' num tags:
{}, tags: {} = {}",
+ a.getName(), t.size(), t, a.getValue());
+
+ // Verify the fate metrics contain state and instanceType
+ // Checking the value would be hard to test because the metrics
are updated on a timer
+ // and fate transactions get cleaned up when finished so the
current state is a bit
+ // non-deterministic
+ assertNotNull(a.getTags().get("state"));
+ assertNotNull(a.getTags().get("instanceType"));
Review Comment:
Could these assertions also verify that the tag values fall within the known
set of valid values?
##########
server/manager/src/main/java/org/apache/accumulo/manager/metrics/fate/FateMetrics.java:
##########
@@ -126,44 +111,56 @@ private void update() {
@Override
public void registerMetrics(final MeterRegistry registry) {
+ var type = fateStore.type().name().toLowerCase();
+ var instanceTypeTag = Tag.of("instanceType", type);
+
registry.gauge(METRICS_FATE_OPS, totalCurrentOpsGauge);
- registry.gauge(METRICS_FATE_OPS_ACTIVITY, totalOpsGauge);
- registry.gauge(METRICS_FATE_ERRORS, List.of(Tag.of("type",
"zk.connection")), fateErrorsGauge);
- registry.gauge(METRICS_FATE_TX,
- List.of(Tag.of("state",
ReadOnlyFateStore.TStatus.NEW.name().toLowerCase())), newTxGauge);
+
+ registry.gauge(METRICS_FATE_TX, List
+ .of(Tag.of("state",
ReadOnlyFateStore.TStatus.NEW.name().toLowerCase()), instanceTypeTag),
+ newTxGauge);
registry.gauge(METRICS_FATE_TX,
- List.of(Tag.of("state",
ReadOnlyFateStore.TStatus.SUBMITTED.name().toLowerCase())),
+ List.of(Tag.of("state",
ReadOnlyFateStore.TStatus.SUBMITTED.name().toLowerCase()),
+ instanceTypeTag),
submittedTxGauge);
registry.gauge(METRICS_FATE_TX,
- List.of(Tag.of("state",
ReadOnlyFateStore.TStatus.IN_PROGRESS.name().toLowerCase())),
+ List.of(Tag.of("state",
ReadOnlyFateStore.TStatus.IN_PROGRESS.name().toLowerCase()),
+ instanceTypeTag),
inProgressTxGauge);
registry.gauge(METRICS_FATE_TX,
- List.of(Tag.of("state",
ReadOnlyFateStore.TStatus.FAILED_IN_PROGRESS.name().toLowerCase())),
+ List.of(Tag.of("state",
ReadOnlyFateStore.TStatus.FAILED_IN_PROGRESS.name().toLowerCase()),
+ instanceTypeTag),
failedInProgressTxGauge);
registry.gauge(METRICS_FATE_TX,
- List.of(Tag.of("state",
ReadOnlyFateStore.TStatus.FAILED.name().toLowerCase())),
+ List.of(Tag.of("state",
ReadOnlyFateStore.TStatus.FAILED.name().toLowerCase()),
+ instanceTypeTag),
failedTxGauge);
registry.gauge(METRICS_FATE_TX,
- List.of(Tag.of("state",
ReadOnlyFateStore.TStatus.SUCCESSFUL.name().toLowerCase())),
+ List.of(Tag.of("state",
ReadOnlyFateStore.TStatus.SUCCESSFUL.name().toLowerCase()),
+ instanceTypeTag),
successfulTxGauge);
registry.gauge(METRICS_FATE_TX,
- List.of(Tag.of("state",
ReadOnlyFateStore.TStatus.UNKNOWN.name().toLowerCase())),
+ List.of(Tag.of("state",
ReadOnlyFateStore.TStatus.UNKNOWN.name().toLowerCase()),
+ instanceTypeTag),
unknownTxGauge);
- update();
-
// get fate status is read only operation - no reason to be nice on
shutdown.
- ScheduledExecutorService scheduler =
- ThreadPools.getServerThreadPools().createScheduledExecutorService(1,
"fateMetricsPoller");
+ ScheduledExecutorService scheduler = ThreadPools.getServerThreadPools()
+ .createScheduledExecutorService(1, type + "FateMetricsPoller");
Runtime.getRuntime().addShutdownHook(new Thread(scheduler::shutdownNow));
+ // Only update as part of the scheduler thread.
+ // We have to call update() in a new thread because this method to
+ // register metrics is called on start up in the Manager before it's
finished
+ // initializing, so we can't scan the User fate store until after startup
is done.
+ // If we called update() here in this method directly we would get stuck
forever.
Review Comment:
Was calling update() here causing the manager to get stuck and not do its job?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]