This is an automated email from the ASF dual-hosted git repository.
feiwang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/celeborn.git
The following commit(s) were added to refs/heads/main by this push:
new 9ba54b39e [CELEBORN-1968] Publish metric for unreleased partition
location count when worker was gracefully shutdown
9ba54b39e is described below
commit 9ba54b39e2f55382fc51e03ea897353859957cef
Author: Sanskar Modi <[email protected]>
AuthorDate: Mon May 12 04:34:44 2025 -0700
[CELEBORN-1968] Publish metric for unreleased partition location count when
worker was gracefully shutdown
### What changes were proposed in this pull request?
Adding a worker metrics for publish unreleased partition location count
when worker was gracefully shutdown.
<img width="742" alt="Screenshot 2025-04-16 at 1 19 18 AM"
src="https://github.com/user-attachments/assets/159f744a-cd76-45a2-9387-930f27dd72be"
/>
### Why are the changes needed?
Similar to https://github.com/apache/celeborn/pull/2711, Currently celeborn
don't publish the count of unreleased partition location when worker is
gracefully exit. This can be useful for monitoring and configuring the
gracefulShutdownTimeout.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
NA
Closes #3213 from s0nskar/unrelease_partition_location.
Lead-authored-by: Sanskar Modi <[email protected]>
Co-authored-by: Wang, Fei <[email protected]>
Signed-off-by: Wang, Fei <[email protected]>
---
assets/grafana/celeborn-dashboard.json | 96 ++++++++++++++++++++++
docs/monitoring.md | 1 +
.../celeborn/service/deploy/worker/Worker.scala | 9 ++
.../service/deploy/worker/WorkerSource.scala | 3 +
4 files changed, 109 insertions(+)
diff --git a/assets/grafana/celeborn-dashboard.json
b/assets/grafana/celeborn-dashboard.json
index 2962deadf..8e98e6402 100644
--- a/assets/grafana/celeborn-dashboard.json
+++ b/assets/grafana/celeborn-dashboard.json
@@ -3236,6 +3236,102 @@
],
"title": "metrics_ PartitionFileSizeBytes_Mean",
"type": "timeseries"
+ },
+ {
+ "datasource": {
+ "default": false,
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 70
+ },
+ "id": 238,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "builder",
+ "expr":
"metrics_UnreleasedPartitionLocationCount_Value{role=\"Worker\",
instance=~\"${instance}\"}",
+ "instant": false,
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "metrics_UnreleasedPartitionLocationCount_Value",
+ "type": "timeseries"
}
],
"title": "Worker",
diff --git a/docs/monitoring.md b/docs/monitoring.md
index 377d00aa1..95cefcf92 100644
--- a/docs/monitoring.md
+++ b/docs/monitoring.md
@@ -240,6 +240,7 @@ These metrics are exposed by Celeborn worker.
| WorkerConsumeSpeed | The speed of worker consumption
for congestion control.
|
| IsDecommissioningWorker | 1 means worker decommissioning,
0 means not decommissioning.
|
| UnreleasedShuffleCount | Unreleased shuffle count when
worker is decommissioning.
|
+ | UnreleasedPartitionLocationCount | Unreleased partition location
counit when worker is shutting down.
|
| MemoryStorageFileCount | The count of files in Memory
Storage of a worker.
|
| MemoryFileStorageSize | The total amount of memory used
by Memory Storage.
|
| EvictedFileCount | The count of files evicted from
Memory Storage to Disk
|
diff --git
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
index edfb21d21..502740cd7 100644
---
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
+++
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
@@ -455,6 +455,15 @@ private[celeborn] class Worker(
0
}
}
+ // Unreleased partition location count when worker is restarting
+ workerSource.addGauge(WorkerSource.UNRELEASED_PARTITION_LOCATION_COUNT) { ()
=>
+ if (shutdown.get()) {
+ partitionLocationInfo.primaryPartitionLocations.size() +
+ partitionLocationInfo.replicaPartitionLocations.size()
+ } else {
+ 0
+ }
+ }
workerSource.addGauge(WorkerSource.CLEAN_TASK_QUEUE_SIZE) { () =>
cleanTaskQueue.size()
}
diff --git
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
index b8e72dd80..c4a82225c 100644
---
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
+++
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
@@ -231,6 +231,9 @@ object WorkerSource {
val IS_DECOMMISSIONING_WORKER = "IsDecommissioningWorker"
val UNRELEASED_SHUFFLE_COUNT = "UnreleasedShuffleCount"
+ // graceful
+ val UNRELEASED_PARTITION_LOCATION_COUNT = "UnreleasedPartitionLocationCount"
+
// clean
val CLEAN_TASK_QUEUE_SIZE = "CleanTaskQueueSize"
val CLEAN_EXPIRED_SHUFFLE_KEYS_TIME = "CleanExpiredShuffleKeysTime"