This is an automated email from the ASF dual-hosted git repository.
ethanfeng pushed a commit to branch branch-0.3
in repository https://gitbox.apache.org/repos/asf/incubator-celeborn.git
The following commit(s) were added to refs/heads/branch-0.3 by this push:
new d9ded4c65 [CELEBORN-1035] Expose RunningApplicationCount,
PartitionWritten and PartitionFileCount metric by Celeborn master
d9ded4c65 is described below
commit d9ded4c652643b6ba8de10a80164a628d5603cd9
Author: SteNicholas <[email protected]>
AuthorDate: Thu Oct 19 22:07:17 2023 +0800
[CELEBORN-1035] Expose RunningApplicationCount, PartitionWritten and
PartitionFileCount metric by Celeborn master
### What changes were proposed in this pull request?
The meta manager records `appHeartbeatTime`, `partitionTotalWritten` and
`partitionTotalFileCount`, which are useful for monitoring application
heartbeats and shuffle partitions. The `RunningApplicationCount`,
`PartitionWritten` and `PartitionFileCount` metrics are now exposed by the
Celeborn master to monitor applications and shuffle partitions.
### Why are the changes needed?
`Master` exposes `RunningApplicationCount`, `PartitionWritten` and
`PartitionFileCount` metrics.
### Does this PR introduce _any_ user-facing change?
None.
### How was this patch tested?
Internal tests.
Closes #1976 from SteNicholas/CELEBORN-1035.
Authored-by: SteNicholas <[email protected]>
Signed-off-by: mingji <[email protected]>
---
METRICS.md | 3 +
assets/grafana/celeborn-dashboard.json | 274 +++++++++++++++++++++
docs/monitoring.md | 5 +
.../celeborn/service/deploy/master/Master.scala | 26 +-
.../service/deploy/master/MasterSource.scala | 6 +
5 files changed, 312 insertions(+), 2 deletions(-)
diff --git a/METRICS.md b/METRICS.md
index b9b76e3c8..7b5081a9d 100644
--- a/METRICS.md
+++ b/METRICS.md
@@ -65,8 +65,11 @@ Here is an example of grafana dashboard importing.
|:--------------------------------------:|:-----------------:|:---------------------------------------------------------------------------------------------------------------:|
| WorkerCount | master |
The count of active workers.
|
| ExcludedWorkerCount | master |
The count of workers in excluded list.
|
+| RunningApplicationCount | master |
The count of running applications in the cluster.
|
| OfferSlotsTime | master |
The time of offer slots.
|
| PartitionSize | master | The
estimated partition size of the last 20 flush windows, whose length is 15
seconds by default. |
+| PartitionWritten | master |
The active shuffle size.
|
+| PartitionFileCount | master |
The active shuffle partition count.
|
| RegisteredShuffleCount | master and worker |
The count of registered shuffles.
|
| CommitFilesTime | worker |
CommitFiles means flush and close a shuffle partition file.
|
| ReserveSlotsTime | worker |
ReserveSlots means acquire a disk buffer and record partition location.
|
diff --git a/assets/grafana/celeborn-dashboard.json
b/assets/grafana/celeborn-dashboard.json
index 146b33cc5..cb676a140 100644
--- a/assets/grafana/celeborn-dashboard.json
+++ b/assets/grafana/celeborn-dashboard.json
@@ -164,6 +164,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
+ "description": "The count of registered shuffle.",
"fieldConfig": {
"defaults": {
"color": {
@@ -247,6 +248,96 @@
],
"title": "metrics_RegisteredShuffleCount_Value",
"type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "The count of running applications.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 12,
+ "y": 1
+ },
+ "id": 95,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "expr": "metrics_RunningApplicationCount_Value",
+ "refId": "A"
+ }
+ ],
+ "title": "metrics_RunningApplicationCount_Value",
+ "type": "timeseries"
}
],
"title": "Overall",
@@ -446,6 +537,189 @@
"title": "metrics_PartitionSize_Value",
"type": "timeseries"
},
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "The active shuffle size.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "bytes"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 27
+ },
+ "id": 122,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr": "metrics_PartitionWritten_Value",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "metrics_PartitionWritten_Value",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "The active shuffle partition count.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 27
+ },
+ "id": 124,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr": "metrics_PartitionFileCount_Value",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "metrics_PartitionFileCount_Value",
+ "type": "timeseries"
+ },
{
"datasource": {
"type": "prometheus",
diff --git a/docs/monitoring.md b/docs/monitoring.md
index dd991877d..5999d7f83 100644
--- a/docs/monitoring.md
+++ b/docs/monitoring.md
@@ -97,9 +97,14 @@ These metrics are exposed by Celeborn master.
- LostWorkers
- ExcludedWorkerCount
- RegisteredShuffleCount
+ - RunningApplicationCount
- IsActiveMaster
- PartitionSize
- The size of estimated shuffle partition.
+ - PartitionWritten
+ - The active shuffle size.
+ - PartitionFileCount
+ - The active shuffle partition count.
- OfferSlotsTime
- The time for masters to handle `RequestSlots` request when
registering shuffle.
diff --git
a/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala
b/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala
index b134781b8..73957e6e1 100644
---
a/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala
+++
b/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala
@@ -20,11 +20,10 @@ package org.apache.celeborn.service.deploy.master
import java.io.IOException
import java.net.BindException
import java.util
-import java.util.Collections
import java.util.concurrent.{ConcurrentHashMap, ScheduledFuture, TimeUnit}
+import java.util.function.ToLongFunction
import scala.collection.JavaConverters._
-import scala.collection.mutable
import scala.util.Random
import org.apache.hadoop.fs.{FileSystem, Path}
@@ -161,7 +160,30 @@ private[celeborn] class Master(
}
masterSource.addGauge(MasterSource.WORKER_COUNT) { () =>
statusSystem.workers.size }
masterSource.addGauge(MasterSource.LOST_WORKER_COUNT) { () =>
statusSystem.lostWorkers.size }
+ masterSource.addGauge(MasterSource.RUNNING_APPLICATION_COUNT) { () =>
+ statusSystem.appHeartbeatTime.size
+ }
masterSource.addGauge(MasterSource.PARTITION_SIZE) { () =>
statusSystem.estimatedPartitionSize }
+ masterSource.addGauge(MasterSource.PARTITION_WRITTEN) { () =>
+ statusSystem.workers.parallelStream()
+ .mapToLong(new ToLongFunction[WorkerInfo]() {
+ override def applyAsLong(value: WorkerInfo): Long =
+ value.userResourceConsumption.values().parallelStream()
+ .mapToLong(new ToLongFunction[ResourceConsumption]() {
+ override def applyAsLong(value: ResourceConsumption): Long =
value.diskBytesWritten
+ }).sum()
+ }).sum()
+ }
+ masterSource.addGauge(MasterSource.PARTITION_FILE_COUNT) { () =>
+ statusSystem.workers.parallelStream()
+ .mapToLong(new ToLongFunction[WorkerInfo]() {
+ override def applyAsLong(value: WorkerInfo): Long =
+ value.userResourceConsumption.values().parallelStream()
+ .mapToLong(new ToLongFunction[ResourceConsumption]() {
+ override def applyAsLong(value: ResourceConsumption): Long =
value.diskFileCount
+ }).sum()
+ }).sum()
+ }
masterSource.addGauge(MasterSource.IS_ACTIVE_MASTER) { () => isMasterActive }
metricsSystem.registerSource(resourceConsumptionSource)
diff --git
a/master/src/main/scala/org/apache/celeborn/service/deploy/master/MasterSource.scala
b/master/src/main/scala/org/apache/celeborn/service/deploy/master/MasterSource.scala
index cd3f1f0ea..05c14b7df 100644
---
a/master/src/main/scala/org/apache/celeborn/service/deploy/master/MasterSource.scala
+++
b/master/src/main/scala/org/apache/celeborn/service/deploy/master/MasterSource.scala
@@ -41,9 +41,15 @@ object MasterSource {
val REGISTERED_SHUFFLE_COUNT = "RegisteredShuffleCount"
+ val RUNNING_APPLICATION_COUNT = "RunningApplicationCount"
+
val IS_ACTIVE_MASTER = "IsActiveMaster"
val PARTITION_SIZE = "PartitionSize"
+ val PARTITION_WRITTEN = "PartitionWritten"
+
+ val PARTITION_FILE_COUNT = "PartitionFileCount"
+
val OFFER_SLOTS_TIME = "OfferSlotsTime"
}