This is an automated email from the ASF dual-hosted git repository.
ethanfeng pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/celeborn.git
The following commit(s) were added to refs/heads/main by this push:
new c788c3802 [CELEBORN-1328] Introduce ActiveSlotsCount metric to monitor
the number of active slots
c788c3802 is described below
commit c788c380256b011afa5184f7ddc8affe79c9d9e4
Author: CodingCat <[email protected]>
AuthorDate: Mon Apr 8 11:08:05 2024 +0800
[CELEBORN-1328] Introduce ActiveSlotsCount metric to monitor the number of
active slots
### What changes were proposed in this pull request?
Introduce the `ActiveSlotsCount` metric to represent the current disk
resource demand in the cluster.
### Why are the changes needed?
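It is recommended to introduce the `ActiveSlotsCount` metric to represent the
current disk resource demand in the cluster, because the existing
`SlotsAllocated` metric keeps increasing and therefore does not reflect it.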
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Verified in our test cluster: the value of activeSlots increases and then
drops back to 0 after the application finishes, whereas slotsAllocated keeps
increasing the whole time.

Closes #2386 from CodingCat/slots_decrease.
Lead-authored-by: CodingCat <[email protected]>
Co-authored-by: Nan Zhu <[email protected]>
Co-authored-by: Fei Wang <[email protected]>
Signed-off-by: mingji <[email protected]>
---
assets/grafana/celeborn-dashboard.json | 90 ++++++++++++++++++++++
.../apache/celeborn/common/meta/DeviceInfo.scala | 2 +-
docs/monitoring.md | 2 +
.../celeborn/service/deploy/worker/Worker.scala | 3 +
.../service/deploy/worker/WorkerSource.scala | 1 +
5 files changed, 97 insertions(+), 1 deletion(-)
diff --git a/assets/grafana/celeborn-dashboard.json
b/assets/grafana/celeborn-dashboard.json
index cf327a9dd..a8fe453dd 100644
--- a/assets/grafana/celeborn-dashboard.json
+++ b/assets/grafana/celeborn-dashboard.json
@@ -1281,6 +1281,96 @@
"title": "metrics_SlotsAllocated_increase_1h",
"type": "timeseries"
},
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 290
+ },
+ "id": 48,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "expr": "metrics_ActiveSlotsCount_Value",
+ "legendFormat": "${baseLegend}",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "metrics_ActiveSlotsCount_Value",
+ "type": "timeseries"
+ },
{
"datasource": {
"type": "prometheus",
diff --git
a/common/src/main/scala/org/apache/celeborn/common/meta/DeviceInfo.scala
b/common/src/main/scala/org/apache/celeborn/common/meta/DeviceInfo.scala
index e3e547407..b0438bd1b 100644
--- a/common/src/main/scala/org/apache/celeborn/common/meta/DeviceInfo.scala
+++ b/common/src/main/scala/org/apache/celeborn/common/meta/DeviceInfo.scala
@@ -141,7 +141,7 @@ class DiskInfo(
shuffleAllocations.put(shuffleKey, shuffleAllocated - slots)
applicationAllocations.put(applicationId, applicationAllocated - slots)
}
- activeSlots = activeSlots - Math.min(shuffleAllocated, slots)
+ activeSlots -= Math.min(shuffleAllocated, slots)
}
def releaseSlots(shuffleKey: String): Unit = this.synchronized {
diff --git a/docs/monitoring.md b/docs/monitoring.md
index 11d03be6b..042c6dbb1 100644
--- a/docs/monitoring.md
+++ b/docs/monitoring.md
@@ -199,6 +199,8 @@ These metrics are exposed by Celeborn worker.
- CommitFilesTime
- The time for a worker to flush buffers and close files related to
specified shuffle.
- SlotsAllocated
+ - ActiveSlotsCount
+ - The number of slots currently being used in a worker
- ReserveSlotsTime
- ActiveConnectionCount
- NettyMemory
diff --git
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
index 88ad05c4e..0da4b5513 100644
---
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
+++
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
@@ -396,6 +396,9 @@ private[celeborn] class Worker(
workerSource.addGauge(WorkerSource.PAUSE_PUSH_DATA_AND_REPLICATE_COUNT) { ()
=>
memoryManager.getPausePushDataAndReplicateCounter
}
+ workerSource.addGauge(WorkerSource.ACTIVE_SLOTS_COUNT) { () =>
+ workerInfo.usedSlots()
+ }
private def highWorkload: Boolean = {
(memoryManager.currentServingState, conf.workerActiveConnectionMax) match {
diff --git
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
index 034be2d55..6be2a9256 100644
---
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
+++
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
@@ -170,6 +170,7 @@ object WorkerSource {
// slots
val SLOTS_ALLOCATED = "SlotsAllocated"
+ val ACTIVE_SLOTS_COUNT = "ActiveSlotsCount"
val RESERVE_SLOTS_TIME = "ReserveSlotsTime"
// connection