This is an automated email from the ASF dual-hosted git repository.
ethanli pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/storm.git
The following commit(s) were added to refs/heads/master by this push:
new 15d5872 STORM-3618 add meter to track scheduling errors
new 3eca57d Merge pull request #3246 from agresch/agresch_storm_3618
15d5872 is described below
commit 15d58729ef14c45c85d19faa5d409bb8ceae5006
Author: Aaron Gresch <[email protected]>
AuthorDate: Wed Apr 8 15:18:56 2020 -0500
STORM-3618 add meter to track scheduling errors
---
docs/ClusterMetrics.md | 3 ++-
.../org/apache/storm/scheduler/resource/ResourceAwareScheduler.java | 3 +++
2 files changed, 5 insertions(+), 1 deletion(-)
diff --git a/docs/ClusterMetrics.md b/docs/ClusterMetrics.md
index 7760f51..c7e9b69 100644
--- a/docs/ClusterMetrics.md
+++ b/docs/ClusterMetrics.md
@@ -58,6 +58,7 @@ These are metrics that are specific to a nimbus instance. In
many instances onl
|-------------|------|-------------|
| nimbus:files-upload-duration-ms | timer | Time it takes to upload a file from start to finish (Not Blobs, but this may change) |
| nimbus:longest-scheduling-time-ms | gauge | Longest time ever taken so far to schedule. This includes the current scheduling run, which is intended to detect if scheduling is stuck for some reason. |
+| nimbus:mkAssignments-Errors | meter | tracks exceptions from mkAssignments |
| nimbus:num-activate-calls | meter | calls to the activate thrift method. |
| nimbus:num-added-executors-per-scheduling | histogram | number of executors added after a scheduling run. |
| nimbus:num-added-slots-per-scheduling | histogram | number of slots added after a scheduling run. |
@@ -102,7 +103,7 @@ These are metrics that are specific to a nimbus instance.
In many instances onl
| nimbus:num-uploadChunk-calls | meter | calls to uploadChunk thrift method. |
| nimbus:num-uploadNewCredentials-calls | meter | calls to uploadNewCredentials thrift method. |
| nimbus:process-worker-metric-calls | meter | calls to processWorkerMetrics thrift method. |
-| nimbus:mkAssignments-Errors | meter | tracks exceptions from mkAssignments |
+| nimbus:scheduler-internal-errors | meter | tracks internal scheduling errors |
| nimbus:topology-scheduling-duration-ms | timer | time it takes to do a scheduling run. |
| nimbus:total-available-memory-non-negative | gauge | available memory on the cluster MB |
| nimbuses:uptime-secs | histogram | uptime of nimbuses |
diff --git a/storm-server/src/main/java/org/apache/storm/scheduler/resource/ResourceAwareScheduler.java b/storm-server/src/main/java/org/apache/storm/scheduler/resource/ResourceAwareScheduler.java
index f7e34ec..a26246a 100644
--- a/storm-server/src/main/java/org/apache/storm/scheduler/resource/ResourceAwareScheduler.java
+++ b/storm-server/src/main/java/org/apache/storm/scheduler/resource/ResourceAwareScheduler.java
@@ -58,6 +58,7 @@ public class ResourceAwareScheduler implements IScheduler {
private int schedulingTimeoutSeconds;
private ExecutorService backgroundScheduling;
private Meter schedulingTimeoutMeter;
+ private Meter internalErrorMeter;
private static void markFailedTopology(User u, Cluster c, TopologyDetails
td, String message) {
markFailedTopology(u, c, td, message, null);
@@ -78,6 +79,7 @@ public class ResourceAwareScheduler implements IScheduler {
public void prepare(Map<String, Object> conf, StormMetricsRegistry
metricsRegistry) {
this.conf = conf;
schedulingTimeoutMeter =
metricsRegistry.registerMeter("nimbus:num-scheduling-timeouts");
+ internalErrorMeter =
metricsRegistry.registerMeter("nimbus:scheduler-internal-errors");
schedulingPriorityStrategy = ReflectionUtils.newInstance(
(String)
conf.get(DaemonConfig.RESOURCE_AWARE_SCHEDULER_PRIORITY_STRATEGY));
configLoader = ConfigLoaderFactoryService.createConfigLoader(conf);
@@ -235,6 +237,7 @@ public class ResourceAwareScheduler implements IScheduler {
}
}
} catch (Exception ex) {
+ internalErrorMeter.mark();
markFailedTopology(topologySubmitter, cluster, td,
"Internal Error - Exception thrown when scheduling.
Please check logs for details", ex);
return;