kfaraz commented on code in PR #14517:
URL: https://github.com/apache/druid/pull/14517#discussion_r1252559077
##########
server/src/main/java/org/apache/druid/client/HttpServerInventoryView.java:
##########
@@ -443,61 +454,57 @@ public Map<String, Object> getDebugInfo()
return result;
}
- private void scheduleSyncMonitoring()
- {
- executor.scheduleAtFixedRate(
- () -> {
- log.debug("Running the Sync Monitoring.");
-
- try {
- syncMonitoring();
- }
- catch (Exception ex) {
- if (ex instanceof InterruptedException) {
- Thread.currentThread().interrupt();
- } else {
- log.makeAlert(ex, "Exception in sync monitoring.").emit();
- }
- }
- },
- 1,
- 5,
- TimeUnit.MINUTES
- );
- }
-
@VisibleForTesting
- void syncMonitoring()
+ void checkAndResetUnhealthyServers()
{
// Ensure that the collection is not being modified during iteration.
Iterate over a copy
final Set<Map.Entry<String, DruidServerHolder>> serverEntrySet =
ImmutableSet.copyOf(servers.entrySet());
for (Map.Entry<String, DruidServerHolder> e : serverEntrySet) {
DruidServerHolder serverHolder = e.getValue();
- if (!serverHolder.syncer.isOK()) {
+ if (serverHolder.syncer.needsReset()) {
synchronized (servers) {
- // check again that server is still there and only then reset.
+ // Reset only if the server is still present in the map
if (servers.containsKey(e.getKey())) {
- log.makeAlert(
- "Server[%s] is not syncing properly. Current state is [%s].
Resetting it.",
+ log.warn(
+ "Resetting server[%s] with state[%s] as it is not syncing
properly.",
serverHolder.druidServer.getName(),
serverHolder.syncer.getDebugInfo()
- ).emit();
+ );
serverRemoved(serverHolder.druidServer);
- serverAdded(new DruidServer(
- serverHolder.druidServer.getName(),
- serverHolder.druidServer.getHostAndPort(),
- serverHolder.druidServer.getHostAndTlsPort(),
- serverHolder.druidServer.getMaxSize(),
- serverHolder.druidServer.getType(),
- serverHolder.druidServer.getTier(),
- serverHolder.druidServer.getPriority()
- ));
+ serverAdded(serverHolder.druidServer.copyWithoutSegments());
}
}
}
}
}
+ private void emitServerStatusMetrics()
+ {
+ final ServiceMetricEvent.Builder eventBuilder =
ServiceMetricEvent.builder();
+ try {
+ final Map<String, DruidServerHolder> serversCopy =
ImmutableMap.copyOf(servers);
+ serversCopy.forEach((serverName, serverHolder) -> {
+ final DruidServer server = serverHolder.druidServer;
+ eventBuilder.setDimension("tier", server.getTier());
+ eventBuilder.setDimension("server", serverName);
+
+ final boolean isSynced = serverHolder.syncer.isSyncedSuccessfully();
+ serviceEmitter.emit(
+ eventBuilder.build("segment/serverview/sync/healthy", isSynced ? 1
: 0)
+ );
+ final long unstableTimeMillis =
serverHolder.syncer.getUnstableTimeMillis();
+ if (unstableTimeMillis > 0) {
+ serviceEmitter.emit(
+ eventBuilder.build("segment/serverview/sync/unstableTime",
unstableTimeMillis)
+ );
+ }
Review Comment:
Yeah, this metric is mostly going to be 0 under normal operation, so I
thought to just skip it.
I guess alerts could still be set on the `healthy` metric but yeah those
alerts would have to work on a window whereas with `unstableTime`, the window
aggregate is already captured in the metric itself.
Let me know if you still feel that we should emit a zero metric, I can make
the changes.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]