This is an automated email from the ASF dual-hosted git repository.
feiwang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/celeborn.git
The following commit(s) were added to refs/heads/main by this push:
new 80bdb4680 [CELEBORN-1892] Adding register with master fail count
metric for worker
80bdb4680 is described below
commit 80bdb46801cf5cee3c5a9ea6542c53a78a89bef5
Author: Sanskar Modi <[email protected]>
AuthorDate: Wed Jun 11 11:04:59 2025 -0700
[CELEBORN-1892] Adding register with master fail count metric for worker
### What changes were proposed in this pull request?
Add a worker metric that counts failed attempts to register with the master.
### Why are the changes needed?
This makes it possible to monitor workers that are unable to register with the
master, for example when wrong endpoints are passed or the master becomes unavailable.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
Local setup
<img width="724" alt="Screenshot 2025-06-04 at 10 44 56 AM"
src="https://github.com/user-attachments/assets/1f84557b-5df8-422f-b602-bb5316a72a0e"
/>
Closes #3308 from s0nskar/worker_register_metric.
Authored-by: Sanskar Modi <[email protected]>
Signed-off-by: Wang, Fei <[email protected]>
---
assets/grafana/celeborn-dashboard.json | 93 ++++++++++++++++++++++
docs/monitoring.md | 3 +-
.../celeborn/service/deploy/worker/Worker.scala | 4 +-
.../service/deploy/worker/WorkerSource.scala | 3 +
4 files changed, 100 insertions(+), 3 deletions(-)
diff --git a/assets/grafana/celeborn-dashboard.json
b/assets/grafana/celeborn-dashboard.json
index d98edb8b6..93b28996c 100644
--- a/assets/grafana/celeborn-dashboard.json
+++ b/assets/grafana/celeborn-dashboard.json
@@ -3319,6 +3319,99 @@
"title": "metrics_IsDecommissioningWorker_Value",
"type": "timeseries"
},
+ {
+ "datasource": {
+ "default": false,
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 70
+ },
+ "id": 241,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "expr":
"metrics_RegisterWithMasterFailCount_Count{instance=~\"${instance}\"}",
+ "legendFormat": "${baseLegend}",
+ "refId": "A"
+ }
+ ],
+ "title": "metrics_RegisterWithMasterFailCount_Count",
+ "type": "timeseries"
+ },
{
"datasource": {
"type": "prometheus",
diff --git a/docs/monitoring.md b/docs/monitoring.md
index d2f8ced8f..e7d278fab 100644
--- a/docs/monitoring.md
+++ b/docs/monitoring.md
@@ -242,11 +242,12 @@ These metrics are exposed by Celeborn worker.
| WorkerConsumeSpeed | The speed of worker consumption
for congestion control.
|
| IsDecommissioningWorker | 1 means worker decommissioning,
0 means not decommissioning.
|
| UnreleasedShuffleCount | Unreleased shuffle count when
worker is decommissioning.
|
- | UnreleasedPartitionLocationCount | Unreleased partition location
counit when worker is shutting down.
|
+ | UnreleasedPartitionLocationCount | Unreleased partition location
count when worker is shutting down.
|
| MemoryStorageFileCount | The count of files in Memory
Storage of a worker.
|
| MemoryFileStorageSize | The total amount of memory used
by Memory Storage.
|
| EvictedFileCount | The count of files evicted from
Memory Storage to Disk
|
| DirectMemoryUsageRatio | Ratio of direct memory used and
max direct memory.
|
+ | RegisterWithMasterFailCount | The count of failures in
register with master request.
|
| push_usedHeapMemory |
|
| push_usedDirectMemory |
|
| push_numHeapArenas |
|
diff --git
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
index 0a1e14ff4..79928df44 100644
---
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
+++
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
@@ -52,7 +52,6 @@ import org.apache.celeborn.common.util.{CelebornExitKind,
CollectionUtils, JavaU
// Can Remove this if celeborn don't support scala211 in future
import org.apache.celeborn.common.util.FunctionConverter._
import org.apache.celeborn.server.common.{HttpService, Service}
-import
org.apache.celeborn.service.deploy.worker.WorkerSource.ACTIVE_CONNECTION_COUNT
import
org.apache.celeborn.service.deploy.worker.congestcontrol.CongestionController
import org.apache.celeborn.service.deploy.worker.memory.{ChannelsLimiter,
MemoryManager}
import
org.apache.celeborn.service.deploy.worker.memory.MemoryManager.ServingState
@@ -477,7 +476,7 @@ private[celeborn] class Worker(
case (ServingState.PUSH_AND_REPLICATE_PAUSED, _, _) => true
case (ServingState.PUSH_PAUSED, _, _) => true
case (_, _, Some(activeConnectionMax)) =>
- workerSource.getCounterCount(ACTIVE_CONNECTION_COUNT) >=
activeConnectionMax
+ workerSource.getCounterCount(WorkerSource.ACTIVE_CONNECTION_COUNT) >=
activeConnectionMax
case _ => false
}
}
@@ -672,6 +671,7 @@ private[celeborn] class Worker(
logWarning(
s"Register worker to master failed, will retry after
${Utils.msDurationToString(interval)}",
throwable)
+
workerSource.incCounter(WorkerSource.REGISTER_WITH_MASTER_FAIL_COUNT)
exception = throwable
null
}
diff --git
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
index 01fd138b5..c65f86b11 100644
---
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
+++
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
@@ -59,6 +59,7 @@ class WorkerSource(conf: CelebornConf) extends
AbstractSource(conf, Role.WORKER)
addCounter(SEGMENT_START_FAIL_COUNT)
addCounter(SLOTS_ALLOCATED)
+ addCounter(REGISTER_WITH_MASTER_FAIL_COUNT)
// add timers
addTimer(COMMIT_FILES_TIME)
@@ -142,6 +143,8 @@ object WorkerSource {
val RUNNING_APPLICATION_COUNT = "RunningApplicationCount"
+ val REGISTER_WITH_MASTER_FAIL_COUNT = "RegisterWithMasterFailCount"
+
// fetch data
val OPEN_STREAM_TIME = "OpenStreamTime"
val FETCH_CHUNK_TIME = "FetchChunkTime"