This is an automated email from the ASF dual-hosted git repository.

feiwang pushed a commit to branch branch-0.6
in repository https://gitbox.apache.org/repos/asf/celeborn.git


The following commit(s) were added to refs/heads/branch-0.6 by this push:
     new 7ce01117f [CELEBORN-2024] Publish commit files fail count metrics
7ce01117f is described below

commit 7ce01117f2661aa406244571ba6df5870a9bff0f
Author: Sanskar Modi <[email protected]>
AuthorDate: Tue Jun 17 11:52:45 2025 -0700

    [CELEBORN-2024] Publish commit files fail count metrics
    
    <!--
    Thanks for sending a pull request!  Here are some tips for you:
      - Make sure the PR title starts w/ a JIRA ticket, e.g. '[CELEBORN-XXXX] Your PR title ...'.
      - Be sure to keep the PR description updated to reflect all changes.
      - Please write your PR title to summarize what this PR proposes.
      - If possible, provide a concise example to reproduce the issue for a 
faster review.
    -->
    
    ### What changes were proposed in this pull request?
    Added a commit files request fail count metric.
    
    ### Why are the changes needed?
    To monitor and tune the configurations around the commit files workflow.
    
    ### Does this PR introduce _any_ user-facing change?
    No
    
    ### How was this patch tested?
    Local setup
    
    <img width="739" alt="Screenshot 2025-06-04 at 10 51 06 AM" 
src="https://github.com/user-attachments/assets/d6256028-d8b7-4a81-90b1-3dcbf61adeba"
 />
    
    Closes #3307 from s0nskar/commit_metric.
    
    Authored-by: Sanskar Modi <[email protected]>
    Signed-off-by: Wang, Fei <[email protected]>
    (cherry picked from commit 2a2c6e4687f8dacbcacd63e01c7a8c515d1dc20b)
    Signed-off-by: Wang, Fei <[email protected]>
---
 assets/grafana/celeborn-dashboard.json             | 89 ++++++++++++++++++++++
 docs/monitoring.md                                 |  1 +
 .../service/deploy/worker/Controller.scala         |  4 +
 .../service/deploy/worker/WorkerSource.scala       |  3 +
 4 files changed, 97 insertions(+)

diff --git a/assets/grafana/celeborn-dashboard.json 
b/assets/grafana/celeborn-dashboard.json
index 9acf842b4..261948b91 100644
--- a/assets/grafana/celeborn-dashboard.json
+++ b/assets/grafana/celeborn-dashboard.json
@@ -7002,6 +7002,95 @@
           ],
           "title": "metrics_CommitFilesTime_Max",
           "type": "timeseries"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "custom": {
+                "axisCenteredZero": false,
+                "axisColorMode": "text",
+                "axisLabel": "",
+                "axisPlacement": "auto",
+                "barAlignment": 0,
+                "drawStyle": "line",
+                "fillOpacity": 0,
+                "gradientMode": "none",
+                "hideFrom": {
+                  "legend": false,
+                  "tooltip": false,
+                  "viz": false
+                },
+                "lineInterpolation": "linear",
+                "lineWidth": 1,
+                "pointSize": 5,
+                "scaleDistribution": {
+                  "type": "linear"
+                },
+                "showPoints": "auto",
+                "spanNulls": false,
+                "stacking": {
+                  "group": "A",
+                  "mode": "none"
+                },
+                "thresholdsStyle": {
+                  "mode": "off"
+                }
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green"
+                  },
+                  {
+                    "color": "red",
+                    "value": 80
+                  }
+                ]
+              }
+            },
+            "overrides": []
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 0,
+            "y": 71
+          },
+          "id": 242,
+          "options": {
+            "legend": {
+              "calcs": [],
+              "displayMode": "list",
+              "placement": "bottom",
+              "showLegend": true
+            },
+            "tooltip": {
+              "mode": "single",
+              "sort": "none"
+            }
+          },
+          "targets": [
+            {
+              "datasource": {
+                "type": "prometheus",
+                "uid": "${DS_PROMETHEUS}"
+              },
+              "expr": "metrics_CommitFilesFailCount_Count{role=\"Worker\", instance=~\"${instance}\"}",
+              "legendFormat": "${baseLegend}",
+              "refId": "A"
+            }
+          ],
+          "title": "metrics_CommitFilesFailCount_Count",
+          "type": "timeseries"
         }
       ],
       "title": "FlushDataRelatives",
diff --git a/docs/monitoring.md b/docs/monitoring.md
index e7d278fab..61fa2670a 100644
--- a/docs/monitoring.md
+++ b/docs/monitoring.md
@@ -214,6 +214,7 @@ These metrics are exposed by Celeborn worker.
     | TakeBufferTime                         | The time for a worker to take 
out a buffer from a disk flusher.                                               
  |
     | FlushDataTime                          | The time for a worker to write 
a buffer which is 256KB by default to storage.                                  
 |
     | CommitFilesTime                        | The time for a worker to flush 
buffers and close files related to specified shuffle.                           
 |
+    | CommitFilesFailCount                   | The count of commit files 
request failed in current worker.                                               
      |
     | SlotsAllocated                         | Slots allocated in last hour.   
                                                                                
|
     | ActiveSlotsCount                       | The number of slots currently 
being used in a worker.                                                         
  |
     | ReserveSlotsTime                       | ReserveSlots means acquire a 
disk buffer and record partition location.                                      
   |
diff --git 
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Controller.scala
 
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Controller.scala
index a6870c836..64624e9ad 100644
--- 
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Controller.scala
+++ 
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Controller.scala
@@ -645,6 +645,8 @@ private[deploy] class Controller(
 
                 commitInfo.status = CommitInfo.COMMIT_FINISHED
               }
+
+              workerSource.incCounter(WorkerSource.COMMIT_FILES_FAIL_COUNT)
             } else {
               // finish, cancel timeout job first.
               timeout.cancel()
@@ -784,6 +786,8 @@ private[deploy] class Controller(
                 commitInfo.response = replyResponse
                 context.reply(replyResponse)
                 epochIterator.remove()
+
+                workerSource.incCounter(WorkerSource.COMMIT_FILES_FAIL_COUNT)
               }
             }
           }
diff --git 
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
 
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
index c65f86b11..56a8ea94c 100644
--- 
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
+++ 
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
@@ -61,6 +61,8 @@ class WorkerSource(conf: CelebornConf) extends 
AbstractSource(conf, Role.WORKER)
   addCounter(SLOTS_ALLOCATED)
   addCounter(REGISTER_WITH_MASTER_FAIL_COUNT)
 
+  addCounter(COMMIT_FILES_FAIL_COUNT)
+
   // add timers
   addTimer(COMMIT_FILES_TIME)
   addTimer(RESERVE_SLOTS_TIME)
@@ -191,6 +193,7 @@ object WorkerSource {
   val TAKE_BUFFER_TIME = "TakeBufferTime"
   val FLUSH_DATA_TIME = "FlushDataTime"
   val COMMIT_FILES_TIME = "CommitFilesTime"
+  val COMMIT_FILES_FAIL_COUNT = "CommitFilesFailCount"
   val FLUSH_WORKING_QUEUE_SIZE = "FlushWorkingQueueSize"
 
   // slots

Reply via email to