This is an automated email from the ASF dual-hosted git repository.

ethanfeng pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/celeborn.git


The following commit(s) were added to refs/heads/main by this push:
     new ca60613f2 [CELEBORN-1817] add committed file size metrics
ca60613f2 is described below

commit ca60613f2fcf7dfa89d6499f1f1d0a9c454bcd5a
Author: Nan <[email protected]>
AuthorDate: Tue Jan 7 10:17:45 2025 +0800

    [CELEBORN-1817] add committed file size metrics
    
    ### What changes were proposed in this pull request?
    
    this PR adds the file size metrics for workers
    
    ### Why are the changes needed?
    
    the reason for us to add this metric is that we observed that, likely due 
to the delayed processing of split messages, we have jobs writing 40-50g files 
even though the split threshold is 10g (we use soft split)
    
    we want to have this metric to monitor the severity of the issue
    
    ### Does this PR introduce _any_ user-facing change?
    
    yes, it adds one new worker metric (PartitionFileSizeBytes)
    
    ### How was this patch tested?
    
    (ignore the dashboard title, it's a dummy one)
    
    
![image](https://github.com/user-attachments/assets/d88c15e6-d740-4def-94d5-03666bbb38ca)
    
    Closes #3047 from CodingCat/committed_file_size.
    
    Authored-by: Nan <[email protected]>
    Signed-off-by: mingji <[email protected]>
---
 assets/grafana/celeborn-dashboard.json             | 306 ++++++++++++++++++++-
 docs/monitoring.md                                 |   1 +
 .../deploy/worker/storage/PartitionDataWriter.java |   2 +
 .../service/deploy/worker/WorkerSource.scala       |   2 +
 4 files changed, 305 insertions(+), 6 deletions(-)

diff --git a/assets/grafana/celeborn-dashboard.json 
b/assets/grafana/celeborn-dashboard.json
index cb1c1ebb2..f06802f93 100644
--- a/assets/grafana/celeborn-dashboard.json
+++ b/assets/grafana/celeborn-dashboard.json
@@ -2951,11 +2951,305 @@
           ],
           "title": "metrics_IsDecommissioningWorker_Value",
           "type": "timeseries"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "description": "size of partition files in bytes",
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "custom": {
+                "axisBorderShow": false,
+                "axisCenteredZero": false,
+                "axisColorMode": "text",
+                "axisLabel": "",
+                "axisPlacement": "auto",
+                "barAlignment": 0,
+                "barWidthFactor": 0.6,
+                "drawStyle": "line",
+                "fillOpacity": 0,
+                "gradientMode": "none",
+                "hideFrom": {
+                  "legend": false,
+                  "tooltip": false,
+                  "viz": false
+                },
+                "insertNulls": false,
+                "lineInterpolation": "linear",
+                "lineWidth": 1,
+                "pointSize": 5,
+                "scaleDistribution": {
+                  "type": "linear"
+                },
+                "showPoints": "auto",
+                "spanNulls": false,
+                "stacking": {
+                  "group": "A",
+                  "mode": "none"
+                },
+                "thresholdsStyle": {
+                  "mode": "off"
+                }
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "red",
+                    "value": 80
+                  }
+                ]
+              }
+            },
+            "overrides": []
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 110
+          },
+          "id": 235,
+          "options": {
+            "legend": {
+              "calcs": [],
+              "displayMode": "list",
+              "placement": "bottom",
+              "showLegend": true
+            },
+            "tooltip": {
+              "mode": "single",
+              "sort": "none"
+            }
+          },
+          "targets": [
+            {
+              "datasource": {
+                "type": "prometheus",
+                "uid": "${DS_PROMETHEUS}"
+              },
+              "editorMode": "code",
+              "expr": "metrics_PartitionFileSizeBytes_Mean{role=\"Worker\", 
instance=~\"${instance}\"}",
+              "hide": false,
+              "instant": false,
+              "legendFormat": "${baseLegend}",
+              "range": true,
+              "refId": "A"
+            }
+          ],
+          "title": "metrics_ PartitionFileSizeBytes_Mean",
+          "type": "timeseries"
         }
       ],
       "title": "Worker",
       "type": "row"
     },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "description": "p99 size of partition files in bytes",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 70
+      },
+      "id": 236,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "metrics_PartitionFileSizeBytes_P99{role=\"Worker\", 
instance=~\"${instance}\"}",
+          "hide": false,
+          "instant": false,
+          "legendFormat": "${baseLegend}",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "metrics_ PartitionFileSizeBytes_P99",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "description": "max size of partition files in bytes",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 110
+      },
+      "id": 237,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "metrics_PartitionFileSizeBytes_max{role=\"Worker\", 
instance=~\"${instance}\"}",
+          "hide": false,
+          "instant": false,
+          "legendFormat": "${baseLegend}",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "metrics_ PartitionFileSizeBytes_MAX",
+      "type": "timeseries"
+    },
     {
       "collapsed": true,
       "gridPos": {
@@ -5091,7 +5385,7 @@
               "fullMetaSearch": false,
               "includeNullMetadata": true,
               "instant": false,
-              "legendFormat": "__auto",
+              "legendFormat": "${baseLegend}",
               "range": true,
               "refId": "A",
               "useBackend": false
@@ -5191,7 +5485,7 @@
               "fullMetaSearch": false,
               "includeNullMetadata": true,
               "instant": false,
-              "legendFormat": "__auto",
+              "legendFormat": "${baseLegend}",
               "range": true,
               "refId": "A",
               "useBackend": false
@@ -5291,7 +5585,7 @@
               "fullMetaSearch": false,
               "includeNullMetadata": true,
               "instant": false,
-              "legendFormat": "__auto",
+              "legendFormat": "${baseLegend}",
               "range": true,
               "refId": "A",
               "useBackend": false
@@ -5390,7 +5684,7 @@
               "fullMetaSearch": false,
               "includeNullMetadata": true,
               "instant": false,
-              "legendFormat": "__auto",
+              "legendFormat": "${baseLegend}",
               "range": true,
               "refId": "A",
               "useBackend": false
@@ -5490,7 +5784,7 @@
               "fullMetaSearch": false,
               "includeNullMetadata": true,
               "instant": false,
-              "legendFormat": "__auto",
+              "legendFormat": "${baseLegend}",
               "range": true,
               "refId": "A",
               "useBackend": false
@@ -5590,7 +5884,7 @@
               "fullMetaSearch": false,
               "includeNullMetadata": true,
               "instant": false,
-              "legendFormat": "__auto",
+              "legendFormat": "${baseLegend}",
               "range": true,
               "refId": "A",
               "useBackend": false
diff --git a/docs/monitoring.md b/docs/monitoring.md
index b2424fec7..6a1adbb45 100644
--- a/docs/monitoring.md
+++ b/docs/monitoring.md
@@ -206,6 +206,7 @@ These metrics are exposed by Celeborn worker.
     | PausePushDataAndReplicateTime          | The time for a worker to stop 
receiving pushData from clients and other workers because of back pressure.     
  |
     | PausePushData                          | The count for a worker to stop 
receiving pushData from clients because of back pressure.                       
 |
     | PausePushDataAndReplicate              | The count for a worker to stop 
receiving pushData from clients and other workers because of back pressure.     
 |
+    | PartitionFileSizeBytes                 | The size of partition files 
committed in current worker.                                                    
    |
     | TakeBufferTime                         | The time for a worker to take 
out a buffer from a disk flusher.                                               
  |
     | FlushDataTime                          | The time for a worker to write 
a buffer which is 256KB by default to storage.                                  
 |
     | CommitFilesTime                        | The time for a worker to flush 
buffers and close files related to specified shuffle.                           
 |
diff --git 
a/worker/src/main/java/org/apache/celeborn/service/deploy/worker/storage/PartitionDataWriter.java
 
b/worker/src/main/java/org/apache/celeborn/service/deploy/worker/storage/PartitionDataWriter.java
index c711752fb..8eae732a1 100644
--- 
a/worker/src/main/java/org/apache/celeborn/service/deploy/worker/storage/PartitionDataWriter.java
+++ 
b/worker/src/main/java/org/apache/celeborn/service/deploy/worker/storage/PartitionDataWriter.java
@@ -549,8 +549,10 @@ public abstract class PartitionDataWriter implements 
DeviceObserver {
       }
     }
     if (diskFileInfo != null) {
+      source.updateHistogram(WorkerSource.PARTITION_FILE_SIZE(), 
diskFileInfo.getFileLength());
       return diskFileInfo.getFileLength();
     } else {
+      source.updateHistogram(WorkerSource.PARTITION_FILE_SIZE(), 
memoryFileInfo.getFileLength());
       return memoryFileInfo.getFileLength();
     }
   }
diff --git 
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
 
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
index 891988a5b..b8e72dd80 100644
--- 
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
+++ 
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
@@ -84,6 +84,7 @@ class WorkerSource(conf: CelebornConf) extends 
AbstractSource(conf, Role.WORKER)
   addTimer(CLEAN_EXPIRED_SHUFFLE_KEYS_TIME)
 
   addHistogram(FETCH_CHUNK_TRANSFER_SIZE)
+  addHistogram(PARTITION_FILE_SIZE)
 
   def getCounterCount(metricsName: String): Long = {
     val metricNameWithLabel = metricNameWithCustomizedLabels(metricsName, 
Map.empty)
@@ -215,6 +216,7 @@ object WorkerSource {
   val DEVICE_OS_TOTAL_CAPACITY = "DeviceOSTotalBytes"
   val DEVICE_CELEBORN_FREE_CAPACITY = "DeviceCelebornFreeBytes"
   val DEVICE_CELEBORN_TOTAL_CAPACITY = "DeviceCelebornTotalBytes"
+  val PARTITION_FILE_SIZE = "PartitionFileSizeBytes"
 
   // congestion control
   val POTENTIAL_CONSUME_SPEED = "PotentialConsumeSpeed"

Reply via email to