This is an automated email from the ASF dual-hosted git repository.

feiwang pushed a commit to branch branch-0.6
in repository https://gitbox.apache.org/repos/asf/celeborn.git


The following commit(s) were added to refs/heads/branch-0.6 by this push:
     new 10fbace4a [CELEBORN-1892] Adding register with master fail count 
metric for worker
10fbace4a is described below

commit 10fbace4afd48178361e3958dbe9f468f983ce2c
Author: Sanskar Modi <[email protected]>
AuthorDate: Wed Jun 11 11:04:59 2025 -0700

    [CELEBORN-1892] Adding register with master fail count metric for worker
    
    ### What changes were proposed in this pull request?
    
    Add a worker metric that counts failed register-with-master requests.
    
    ### Why are the changes needed?
    
    This makes it possible to monitor and alert when workers are unable to 
register with the master, for example when wrong endpoints are configured or the master becomes unavailable.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No
    
    ### How was this patch tested?
    Verified on a local setup (see the screenshot below).
    
    <img width="724" alt="Screenshot 2025-06-04 at 10 44 56 AM" 
src="https://github.com/user-attachments/assets/1f84557b-5df8-422f-b602-bb5316a72a0e"
 />
    
    Closes #3308 from s0nskar/worker_register_metric.
    
    Authored-by: Sanskar Modi <[email protected]>
    Signed-off-by: Wang, Fei <[email protected]>
    (cherry picked from commit 80bdb46801cf5cee3c5a9ea6542c53a78a89bef5)
    Signed-off-by: Wang, Fei <[email protected]>
---
 assets/grafana/celeborn-dashboard.json             | 93 ++++++++++++++++++++++
 docs/monitoring.md                                 |  3 +-
 .../celeborn/service/deploy/worker/Worker.scala    |  4 +-
 .../service/deploy/worker/WorkerSource.scala       |  3 +
 4 files changed, 100 insertions(+), 3 deletions(-)

diff --git a/assets/grafana/celeborn-dashboard.json 
b/assets/grafana/celeborn-dashboard.json
index d98edb8b6..93b28996c 100644
--- a/assets/grafana/celeborn-dashboard.json
+++ b/assets/grafana/celeborn-dashboard.json
@@ -3319,6 +3319,99 @@
           "title": "metrics_IsDecommissioningWorker_Value",
           "type": "timeseries"
         },
+        {
+          "datasource": {
+            "default": false,
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "custom": {
+                "axisBorderShow": false,
+                "axisCenteredZero": false,
+                "axisColorMode": "text",
+                "axisLabel": "",
+                "axisPlacement": "auto",
+                "barAlignment": 0,
+                "barWidthFactor": 0.6,
+                "drawStyle": "line",
+                "fillOpacity": 0,
+                "gradientMode": "none",
+                "hideFrom": {
+                  "legend": false,
+                  "tooltip": false,
+                  "viz": false
+                },
+                "insertNulls": false,
+                "lineInterpolation": "linear",
+                "lineWidth": 1,
+                "pointSize": 5,
+                "scaleDistribution": {
+                  "type": "linear"
+                },
+                "showPoints": "auto",
+                "spanNulls": false,
+                "stacking": {
+                  "group": "A",
+                  "mode": "none"
+                },
+                "thresholdsStyle": {
+                  "mode": "off"
+                }
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green"
+                  },
+                  {
+                    "color": "red",
+                    "value": 80
+                  }
+                ]
+              }
+            },
+            "overrides": []
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 70
+          },
+          "id": 241,
+          "options": {
+            "legend": {
+              "calcs": [],
+              "displayMode": "list",
+              "placement": "bottom",
+              "showLegend": true
+            },
+            "tooltip": {
+              "mode": "single",
+              "sort": "none"
+            }
+          },
+          "targets": [
+            {
+              "datasource": {
+                "type": "prometheus",
+                "uid": "${DS_PROMETHEUS}"
+              },
+              "expr": 
"metrics_RegisterWithMasterFailCount_Count{instance=~\"${instance}\"}",
+              "legendFormat": "${baseLegend}",
+              "refId": "A"
+            }
+          ],
+          "title": "metrics_RegisterWithMasterFailCount_Count",
+          "type": "timeseries"
+        },
         {
           "datasource": {
             "type": "prometheus",
diff --git a/docs/monitoring.md b/docs/monitoring.md
index d2f8ced8f..e7d278fab 100644
--- a/docs/monitoring.md
+++ b/docs/monitoring.md
@@ -242,11 +242,12 @@ These metrics are exposed by Celeborn worker.
     | WorkerConsumeSpeed                     | The speed of worker consumption 
for congestion control.                                                         
|
     | IsDecommissioningWorker                | 1 means worker decommissioning, 
0 means not decommissioning.                                                    
|
     | UnreleasedShuffleCount                 | Unreleased shuffle count when 
worker is decommissioning.                                                      
  |
-    | UnreleasedPartitionLocationCount       | Unreleased partition location 
counit when worker is shutting down.                                            
  |
+    | UnreleasedPartitionLocationCount       | Unreleased partition location 
count when worker is shutting down.                                             
  |
     | MemoryStorageFileCount                 | The count of files in Memory 
Storage of a worker.                                                            
   |
     | MemoryFileStorageSize                  | The total amount of memory used 
by Memory Storage.                                                              
|
     | EvictedFileCount                       | The count of files evicted from 
Memory Storage to Disk                                                          
|
     | DirectMemoryUsageRatio                 | Ratio of direct memory used and 
max direct memory.                                                              
|
+    | RegisterWithMasterFailCount            | The count of failures in 
register with master request.                                                   
       |
     | push_usedHeapMemory                    |                                 
                                                                                
|
     | push_usedDirectMemory                  |                                 
                                                                                
|
     | push_numHeapArenas                     |                                 
                                                                                
|
diff --git 
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala 
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
index 0a1e14ff4..79928df44 100644
--- 
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
+++ 
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
@@ -52,7 +52,6 @@ import org.apache.celeborn.common.util.{CelebornExitKind, 
CollectionUtils, JavaU
 // Can Remove this if celeborn don't support scala211 in future
 import org.apache.celeborn.common.util.FunctionConverter._
 import org.apache.celeborn.server.common.{HttpService, Service}
-import 
org.apache.celeborn.service.deploy.worker.WorkerSource.ACTIVE_CONNECTION_COUNT
 import 
org.apache.celeborn.service.deploy.worker.congestcontrol.CongestionController
 import org.apache.celeborn.service.deploy.worker.memory.{ChannelsLimiter, 
MemoryManager}
 import 
org.apache.celeborn.service.deploy.worker.memory.MemoryManager.ServingState
@@ -477,7 +476,7 @@ private[celeborn] class Worker(
       case (ServingState.PUSH_AND_REPLICATE_PAUSED, _, _) => true
       case (ServingState.PUSH_PAUSED, _, _) => true
       case (_, _, Some(activeConnectionMax)) =>
-        workerSource.getCounterCount(ACTIVE_CONNECTION_COUNT) >= 
activeConnectionMax
+        workerSource.getCounterCount(WorkerSource.ACTIVE_CONNECTION_COUNT) >= 
activeConnectionMax
       case _ => false
     }
   }
@@ -672,6 +671,7 @@ private[celeborn] class Worker(
             logWarning(
               s"Register worker to master failed, will retry after 
${Utils.msDurationToString(interval)}",
               throwable)
+            
workerSource.incCounter(WorkerSource.REGISTER_WITH_MASTER_FAIL_COUNT)
             exception = throwable
             null
         }
diff --git 
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
 
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
index 01fd138b5..c65f86b11 100644
--- 
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
+++ 
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
@@ -59,6 +59,7 @@ class WorkerSource(conf: CelebornConf) extends 
AbstractSource(conf, Role.WORKER)
   addCounter(SEGMENT_START_FAIL_COUNT)
 
   addCounter(SLOTS_ALLOCATED)
+  addCounter(REGISTER_WITH_MASTER_FAIL_COUNT)
 
   // add timers
   addTimer(COMMIT_FILES_TIME)
@@ -142,6 +143,8 @@ object WorkerSource {
 
   val RUNNING_APPLICATION_COUNT = "RunningApplicationCount"
 
+  val REGISTER_WITH_MASTER_FAIL_COUNT = "RegisterWithMasterFailCount"
+
   // fetch data
   val OPEN_STREAM_TIME = "OpenStreamTime"
   val FETCH_CHUNK_TIME = "FetchChunkTime"

Reply via email to