This is an automated email from the ASF dual-hosted git repository.
ethanfeng pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-celeborn.git
The following commit(s) were added to refs/heads/main by this push:
new 29e930488 [CELEBORN-1100] Introduce ChunkStreamCount,
OpenStreamFailCount metrics about opening stream of FetchHandler
29e930488 is described below
commit 29e930488bb6a2bf805d871115854b24d504e2d7
Author: SteNicholas <[email protected]>
AuthorDate: Fri Jan 5 17:05:35 2024 +0800
[CELEBORN-1100] Introduce ChunkStreamCount, OpenStreamFailCount metrics
about opening stream of FetchHandler
### What changes were proposed in this pull request?
Introduces `ChunkStreamCount`, `OpenStreamFailCount` metrics about opening
stream of `FetchHandler`:
- `WorkerSource` adds `ChunkStreamCount`, `OpenStreamFailCount` metrics.
- Corrects the grafana dashboard of `celeborn-dashboard.json`.
`celeborn-dashboard.json` has been verified via [Celeborn
Dashboard](https://stenicholas.grafana.net/d/U_qgru_7z/celeborn?orgId=1&refresh=5s).
For example:
1. `"expr": "metrics_RunningApplicationCount_Value"`
2. Moves the panel position of `FetchChunkFailCount` to
`FetchRelatives` instead of `PushRelatives`.
3. Updates the `gridPos` of some panels.
### Why are the changes needed?
There are no metrics about opening streams in `FetchHandler` for the
Celeborn Worker.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
[Celeborn
Dashboard](https://stenicholas.grafana.net/d/U_qgru_7z/celeborn?orgId=1&refresh=5s)
Closes #2212 from SteNicholas/CELEBORN-1100.
Authored-by: SteNicholas <[email protected]>
Signed-off-by: mingji <[email protected]>
---
METRICS.md | 4 +-
assets/grafana/celeborn-dashboard.json | 532 ++++++++++++++-------
docs/monitoring.md | 5 +-
.../deploy/worker/storage/ChunkStreamManager.java | 3 +-
.../deploy/worker/storage/CreditStreamManager.java | 5 -
.../service/deploy/worker/FetchHandler.scala | 4 +
.../service/deploy/worker/WorkerSource.scala | 20 +-
.../worker/storage/ChunkStreamManagerSuiteJ.java | 14 +-
.../worker/storage/CreditStreamManagerSuiteJ.java | 12 +-
9 files changed, 393 insertions(+), 206 deletions(-)
diff --git a/METRICS.md b/METRICS.md
index 986042342..518d83189 100644
--- a/METRICS.md
+++ b/METRICS.md
@@ -93,9 +93,11 @@ Here is an example of Grafana dashboard importing.
| FlushDataTime | worker |
FlushData means flush a disk buffer to disk.
|
| OpenStreamTime | worker |
OpenStream means read a shuffle file and send client about chunks size and
stream index. |
| FetchChunkTime | worker |
FetchChunk means read a chunk from a shuffle file and send to client.
|
+| ChunkStreamCount | worker |
The stream count for reduce partition reading streams in current worker.
|
+| OpenStreamFailCount | worker |
The count of opening stream failed in current worker.
|
+| FetchChunkFailCount | worker |
The count of fetching chunk failed in current worker.
|
| PrimaryPushDataTime | worker |
PrimaryPushData means handle pushdata of primary partition location.
|
| ReplicaPushDataTime | worker |
ReplicaPushData means handle pushdata of replica partition location.
|
-| FetchChunkFailCount | worker |
The count of fetching chunk failed in current worker.
|
| WriteDataFailCount | worker |
The count of writing PushData or PushMergedData failed in current worker.
|
| ReplicateDataFailCount | worker |
The count of replicating PushData or PushMergedData failed in current worker.
|
| ReplicateDataWriteFailCount | worker | The count
of replicating PushData or PushMergedData failed caused by write failure in
peer worker. |
diff --git a/assets/grafana/celeborn-dashboard.json
b/assets/grafana/celeborn-dashboard.json
index 8c9780d73..575c2aa15 100644
--- a/assets/grafana/celeborn-dashboard.json
+++ b/assets/grafana/celeborn-dashboard.json
@@ -115,8 +115,7 @@
"mode": "absolute",
"steps": [
{
- "color": "green",
- "value": null
+ "color": "green"
},
{
"color": "red",
@@ -206,8 +205,7 @@
"mode": "absolute",
"steps": [
{
- "color": "green",
- "value": null
+ "color": "green"
},
{
"color": "red",
@@ -297,8 +295,7 @@
"mode": "absolute",
"steps": [
{
- "color": "green",
- "value": null
+ "color": "green"
},
{
"color": "red",
@@ -312,8 +309,8 @@
"gridPos": {
"h": 9,
"w": 12,
- "x": 12,
- "y": 1
+ "x": 0,
+ "y": 10
},
"id": 95,
"options": {
@@ -334,7 +331,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "metrics_ApplicationCount_Value",
+ "expr": "metrics_RunningApplicationCount_Value",
"legendFormat": "${baseLegend}",
"refId": "A"
}
@@ -418,7 +415,7 @@
"h": 8,
"w": 12,
"x": 0,
- "y": 27
+ "y": 20
},
"id": 121,
"options": {
@@ -511,7 +508,7 @@
"h": 8,
"w": 12,
"x": 12,
- "y": 27
+ "y": 20
},
"id": 120,
"options": {
@@ -603,8 +600,8 @@
"gridPos": {
"h": 8,
"w": 12,
- "x": 12,
- "y": 27
+ "x": 0,
+ "y": 28
},
"id": 122,
"options": {
@@ -696,7 +693,7 @@
"h": 8,
"w": 12,
"x": 12,
- "y": 27
+ "y": 28
},
"id": 124,
"options": {
@@ -788,7 +785,7 @@
"h": 8,
"w": 12,
"x": 0,
- "y": 35
+ "y": 36
},
"id": 100,
"options": {
@@ -878,7 +875,7 @@
"h": 8,
"w": 12,
"x": 12,
- "y": 35
+ "y": 36
},
"id": 102,
"options": {
@@ -968,7 +965,7 @@
"h": 8,
"w": 12,
"x": 0,
- "y": 43
+ "y": 44
},
"id": 36,
"options": {
@@ -991,6 +988,7 @@
},
"expr": "metrics_ExcludedWorkerCount_Value",
"legendFormat": "${baseLegend}",
+ "range": true,
"refId": "A"
}
],
@@ -1057,7 +1055,7 @@
"h": 8,
"w": 12,
"x": 12,
- "y": 43
+ "y": 44
},
"id": 117,
"options": {
@@ -1147,8 +1145,7 @@
"mode": "absolute",
"steps": [
{
- "color": "green",
- "value": null
+ "color": "green"
},
{
"color": "red",
@@ -1237,8 +1234,7 @@
"mode": "absolute",
"steps": [
{
- "color": "green",
- "value": null
+ "color": "green"
},
{
"color": "red",
@@ -1328,8 +1324,7 @@
"mode": "absolute",
"steps": [
{
- "color": "green",
- "value": null
+ "color": "green"
},
{
"color": "red",
@@ -1419,8 +1414,7 @@
"mode": "absolute",
"steps": [
{
- "color": "green",
- "value": null
+ "color": "green"
},
{
"color": "red",
@@ -1458,6 +1452,7 @@
},
"expr": "metrics_PausePushData_Value",
"legendFormat": "${baseLegend}",
+ "range": true,
"refId": "A"
}
],
@@ -1509,8 +1504,7 @@
"mode": "absolute",
"steps": [
{
- "color": "green",
- "value": null
+ "color": "green"
},
{
"color": "red",
@@ -1548,6 +1542,7 @@
},
"expr": "metrics_PausePushDataAndReplicate_Value",
"legendFormat": "${baseLegend}",
+ "range": true,
"refId": "A"
}
],
@@ -1599,8 +1594,7 @@
"mode": "absolute",
"steps": [
{
- "color": "green",
- "value": null
+ "color": "green"
},
{
"color": "red",
@@ -1616,7 +1610,7 @@
"h": 8,
"w": 12,
"x": 0,
- "y": 63
+ "y": 30
},
"id": 182,
"options": {
@@ -1639,10 +1633,11 @@
},
"expr": "metrics_PausePushDataTime_Value",
"legendFormat": "${baseLegend}",
+ "range": true,
"refId": "A"
}
],
- "title": "Pause Push Data Time Count",
+ "title": "metrics_PausePushDataTime_Value",
"type": "timeseries"
},
{
@@ -1690,8 +1685,7 @@
"mode": "absolute",
"steps": [
{
- "color": "green",
- "value": null
+ "color": "green"
},
{
"color": "red",
@@ -1705,7 +1699,7 @@
"gridPos": {
"h": 8,
"w": 12,
- "x": 0,
+ "x": 12,
"y": 30
},
"id": 179,
@@ -1727,12 +1721,13 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "metrics_ActiveConnectionCount_Count{}",
+ "expr": "metrics_ActiveConnectionCount_Count",
"legendFormat": "${baseLegend}",
+ "range": true,
"refId": "A"
}
],
- "title": "ActiveConnectionCount",
+ "title": "metrics_ActiveConnectionCount_Count",
"type": "timeseries"
},
{
@@ -1781,8 +1776,7 @@
"mode": "absolute",
"steps": [
{
- "color": "green",
- "value": null
+ "color": "green"
},
{
"color": "red",
@@ -1797,8 +1791,8 @@
"gridPos": {
"h": 8,
"w": 12,
- "x": 12,
- "y": 30
+ "x": 0,
+ "y": 38
},
"id": 181,
"options": {
@@ -1876,8 +1870,7 @@
"mode": "absolute",
"steps": [
{
- "color": "green",
- "value": null
+ "color": "green"
},
{
"color": "red",
@@ -1892,9 +1885,9 @@
"h": 8,
"w": 12,
"x": 12,
- "y": 30
+ "y": 38
},
- "id": 181,
+ "id": 183,
"options": {
"legend": {
"calcs": [],
@@ -1934,7 +1927,7 @@
"h": 1,
"w": 24,
"x": 0,
- "y": 3
+ "y": 46
},
"id": 134,
"panels": [
@@ -1999,7 +1992,7 @@
"h": 9,
"w": 12,
"x": 0,
- "y": 55
+ "y": 47
},
"id": 68,
"options": {
@@ -2089,7 +2082,7 @@
"h": 9,
"w": 12,
"x": 12,
- "y": 55
+ "y": 47
},
"id": 70,
"options": {
@@ -2179,7 +2172,7 @@
"h": 9,
"w": 12,
"x": 0,
- "y": 64
+ "y": 56
},
"id": 72,
"options": {
@@ -2269,7 +2262,7 @@
"h": 9,
"w": 12,
"x": 12,
- "y": 64
+ "y": 56
},
"id": 74,
"options": {
@@ -2358,9 +2351,9 @@
"h": 8,
"w": 12,
"x": 0,
- "y": 73
+ "y": 65
},
- "id": 75,
+ "id": 76,
"options": {
"legend": {
"calcs": [],
@@ -2380,13 +2373,104 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_FetchChunkFailCount_Count",
+ "expr": "metrics_WriteDataFailCount_Count",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
}
],
- "title": "metrics_FetchChunkFailCount_Count",
+ "title": "metrics_WriteDataFailCount_Count",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 65
+ },
+ "id": 129,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr": "metrics_ReplicateDataWriteFailCount_Count",
+ "legendFormat": "${baseLegend}",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "metrics_ReplicateDataWriteFailCount_Count",
"type": "timeseries"
},
{
@@ -2451,7 +2535,7 @@
"x": 0,
"y": 73
},
- "id": 76,
+ "id": 128,
"options": {
"legend": {
"calcs": [],
@@ -2471,13 +2555,13 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_WriteDataFailCount_Count",
+ "expr": "metrics_ReplicateDataFailCount_Count",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
}
],
- "title": "metrics_WriteDataFailCount_Count",
+ "title": "metrics_ReplicateDataFailCount_Count",
"type": "timeseries"
},
{
@@ -2542,7 +2626,7 @@
"x": 12,
"y": 73
},
- "id": 129,
+ "id": 131,
"options": {
"legend": {
"calcs": [],
@@ -2562,13 +2646,13 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_ReplicateDataWriteFailCount_Count",
+ "expr": "metrics_ReplicateDataTimeoutCount_Count",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
}
],
- "title": "metrics_ReplicateDataWriteFailCount_Count",
+ "title": "metrics_ReplicateDataTimeoutCount_Count",
"type": "timeseries"
},
{
@@ -2633,7 +2717,7 @@
"x": 0,
"y": 81
},
- "id": 128,
+ "id": 132,
"options": {
"legend": {
"calcs": [],
@@ -2653,13 +2737,13 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_ReplicateDataFailCount_Count",
+ "expr": "metrics_ReplicateDataConnectionExceptionCount_Count",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
}
],
- "title": "metrics_ReplicateDataFailCount_Count",
+ "title": "metrics_ReplicateDataConnectionExceptionCount_Count",
"type": "timeseries"
},
{
@@ -2724,7 +2808,7 @@
"x": 12,
"y": 81
},
- "id": 131,
+ "id": 130,
"options": {
"legend": {
"calcs": [],
@@ -2744,15 +2828,29 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_ReplicateDataTimeoutCount_Count",
+ "expr": "metrics_ReplicateDataCreateConnectionFailCount_Count",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
}
],
- "title": "metrics_ReplicateDataTimeoutCount_Count",
+ "title": "metrics_ReplicateDataCreateConnectionFailCount_Count",
"type": "timeseries"
- },
+ }
+ ],
+ "title": "PushRelatives",
+ "type": "row"
+ },
+ {
+ "collapsed": true,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 89
+ },
+ "id": 12,
+ "panels": [
{
"datasource": {
"type": "prometheus",
@@ -2805,7 +2903,8 @@
"value": 80
}
]
- }
+ },
+ "unit": "ms"
},
"overrides": []
},
@@ -2813,9 +2912,9 @@
"h": 8,
"w": 12,
"x": 0,
- "y": 89
+ "y": 90
},
- "id": 132,
+ "id": 66,
"options": {
"legend": {
"calcs": [],
@@ -2834,14 +2933,12 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "editorMode": "code",
- "expr": "metrics_ReplicateDataConnectionExceptionCount_Count",
+ "expr": "metrics_OpenStreamTime_Mean",
"legendFormat": "${baseLegend}",
- "range": true,
"refId": "A"
}
],
- "title": "metrics_ReplicateDataConnectionExceptionCount_Count",
+ "title": "metrics_OpenStreamTime_Mean",
"type": "timeseries"
},
{
@@ -2896,7 +2993,8 @@
"value": 80
}
]
- }
+ },
+ "unit": "ms"
},
"overrides": []
},
@@ -2904,9 +3002,9 @@
"h": 8,
"w": 12,
"x": 12,
- "y": 89
+ "y": 90
},
- "id": 130,
+ "id": 96,
"options": {
"legend": {
"calcs": [],
@@ -2925,30 +3023,14 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "editorMode": "code",
- "expr": "metrics_ReplicateDataCreateConnectionFailCount_Count",
+ "expr": "metrics_OpenStreamTime_Max",
"legendFormat": "${baseLegend}",
- "range": true,
"refId": "A"
}
],
- "title": "metrics_ReplicateDataCreateConnectionFailCount_Count",
+ "title": "metrics_OpenStreamTime_Max",
"type": "timeseries"
- }
- ],
- "title": "PushRelatives",
- "type": "row"
- },
- {
- "collapsed": true,
- "gridPos": {
- "h": 1,
- "w": 24,
- "x": 0,
- "y": 4
- },
- "id": 12,
- "panels": [
+ },
{
"datasource": {
"type": "prometheus",
@@ -3010,9 +3092,9 @@
"h": 8,
"w": 12,
"x": 0,
- "y": 56
+ "y": 98
},
- "id": 66,
+ "id": 17,
"options": {
"legend": {
"calcs": [],
@@ -3031,12 +3113,12 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "metrics_OpenStreamTime_Mean",
+ "expr": "metrics_FetchChunkTime_Mean",
"legendFormat": "${baseLegend}",
"refId": "A"
}
],
- "title": "metrics_OpenStreamTime_Mean",
+ "title": "metrics_FetchChunkTime_Mean",
"type": "timeseries"
},
{
@@ -3100,9 +3182,9 @@
"h": 8,
"w": 12,
"x": 12,
- "y": 56
+ "y": 98
},
- "id": 96,
+ "id": 18,
"options": {
"legend": {
"calcs": [],
@@ -3121,12 +3203,12 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "metrics_OpenStreamTime_Max",
+ "expr": "metrics_FetchChunkTime_Max",
"legendFormat": "${baseLegend}",
"refId": "A"
}
],
- "title": "metrics_OpenStreamTime_Max",
+ "title": "metrics_FetchChunkTime_Max",
"type": "timeseries"
},
{
@@ -3181,8 +3263,7 @@
"value": 80
}
]
- },
- "unit": "ms"
+ }
},
"overrides": []
},
@@ -3190,9 +3271,9 @@
"h": 8,
"w": 12,
"x": 0,
- "y": 64
+ "y": 106
},
- "id": 17,
+ "id": 73,
"options": {
"legend": {
"calcs": [],
@@ -3211,12 +3292,14 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "metrics_FetchChunkTime_Mean",
+ "editorMode": "code",
+ "expr": "metrics_ChunkStreamCount_Value",
"legendFormat": "${baseLegend}",
+ "range": true,
"refId": "A"
}
],
- "title": "metrics_FetchChunkTime_Mean",
+ "title": "metrics_ChunkStreamCount_Value",
"type": "timeseries"
},
{
@@ -3271,8 +3354,7 @@
"value": 80
}
]
- },
- "unit": "ms"
+ }
},
"overrides": []
},
@@ -3280,9 +3362,9 @@
"h": 8,
"w": 12,
"x": 12,
- "y": 64
+ "y": 106
},
- "id": 18,
+ "id": 77,
"options": {
"legend": {
"calcs": [],
@@ -3301,12 +3383,105 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "metrics_FetchChunkTime_Max",
+ "editorMode": "code",
+ "expr": "metrics_OpenStreamFailCount_Count",
"legendFormat": "${baseLegend}",
+ "range": true,
"refId": "A"
}
],
- "title": "metrics_FetchChunkTime_Max",
+ "title": "metrics_OpenStreamFailCount_Count",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 114
+ },
+ "id": 75,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr": "metrics_FetchChunkFailCount_Count",
+ "legendFormat": "${baseLegend}",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "metrics_FetchChunkFailCount_Count",
"type": "timeseries"
}
],
@@ -3319,7 +3494,7 @@
"h": 1,
"w": 24,
"x": 0,
- "y": 5
+ "y": 122
},
"id": 10,
"panels": [
@@ -3384,7 +3559,7 @@
"h": 8,
"w": 12,
"x": 0,
- "y": 57
+ "y": 123
},
"id": 78,
"options": {
@@ -3474,7 +3649,7 @@
"h": 8,
"w": 12,
"x": 12,
- "y": 57
+ "y": 123
},
"id": 80,
"options": {
@@ -3564,7 +3739,7 @@
"h": 9,
"w": 12,
"x": 0,
- "y": 65
+ "y": 131
},
"id": 4,
"options": {
@@ -3654,7 +3829,7 @@
"h": 9,
"w": 12,
"x": 12,
- "y": 65
+ "y": 131
},
"id": 6,
"options": {
@@ -3744,7 +3919,7 @@
"h": 8,
"w": 12,
"x": 0,
- "y": 74
+ "y": 140
},
"id": 56,
"options": {
@@ -3834,7 +4009,7 @@
"h": 8,
"w": 12,
"x": 12,
- "y": 74
+ "y": 140
},
"id": 58,
"options": {
@@ -3873,7 +4048,7 @@
"h": 1,
"w": 24,
"x": 0,
- "y": 6
+ "y": 148
},
"id": 8,
"panels": [
@@ -3938,7 +4113,7 @@
"h": 8,
"w": 12,
"x": 0,
- "y": 31
+ "y": 149
},
"id": 19,
"options": {
@@ -3961,6 +4136,7 @@
},
"expr": "metrics_NettyMemory_Value",
"legendFormat": "${baseLegend}",
+ "range": true,
"refId": "A"
}
],
@@ -4028,7 +4204,7 @@
"h": 8,
"w": 12,
"x": 12,
- "y": 31
+ "y": 149
},
"id": 20,
"options": {
@@ -4051,6 +4227,7 @@
},
"expr": "metrics_DiskBuffer_Value",
"legendFormat": "${baseLegend}",
+ "range": true,
"refId": "A"
}
],
@@ -4118,7 +4295,7 @@
"h": 8,
"w": 12,
"x": 0,
- "y": 39
+ "y": 157
},
"id": 165,
"options": {
@@ -4210,7 +4387,7 @@
"h": 8,
"w": 12,
"x": 12,
- "y": 39
+ "y": 157
},
"id": 166,
"options": {
@@ -4302,7 +4479,7 @@
"h": 8,
"w": 12,
"x": 0,
- "y": 47
+ "y": 165
},
"id": 167,
"options": {
@@ -4394,7 +4571,7 @@
"h": 8,
"w": 12,
"x": 12,
- "y": 47
+ "y": 165
},
"id": 168,
"options": {
@@ -4486,7 +4663,7 @@
"h": 8,
"w": 12,
"x": 0,
- "y": 55
+ "y": 173
},
"id": 169,
"options": {
@@ -4578,7 +4755,7 @@
"h": 8,
"w": 12,
"x": 12,
- "y": 55
+ "y": 173
},
"id": 170,
"options": {
@@ -4669,7 +4846,7 @@
"h": 8,
"w": 12,
"x": 0,
- "y": 63
+ "y": 181
},
"id": 108,
"options": {
@@ -4761,7 +4938,7 @@
"h": 8,
"w": 12,
"x": 12,
- "y": 63
+ "y": 181
},
"id": 104,
"options": {
@@ -4852,7 +5029,7 @@
"h": 8,
"w": 12,
"x": 0,
- "y": 71
+ "y": 189
},
"id": 106,
"options": {
@@ -4893,7 +5070,7 @@
"h": 1,
"w": 24,
"x": 0,
- "y": 7
+ "y": 197
},
"id": 50,
"panels": [
@@ -4958,7 +5135,7 @@
"h": 8,
"w": 12,
"x": 0,
- "y": 16
+ "y": 198
},
"id": 44,
"options": {
@@ -5048,7 +5225,7 @@
"h": 8,
"w": 12,
"x": 12,
- "y": 16
+ "y": 198
},
"id": 46,
"options": {
@@ -5137,7 +5314,7 @@
"h": 8,
"w": 12,
"x": 0,
- "y": 24
+ "y": 206
},
"id": 48,
"options": {
@@ -5160,6 +5337,7 @@
},
"expr": "metrics_SortingFiles_Value",
"legendFormat": "${baseLegend}",
+ "range": true,
"refId": "A"
}
],
@@ -5226,7 +5404,7 @@
"h": 8,
"w": 12,
"x": 12,
- "y": 24
+ "y": 206
},
"id": 180,
"options": {
@@ -5249,6 +5427,7 @@
},
"expr": "metrics_SortedFiles_Value",
"legendFormat": "${baseLegend}",
+ "range": true,
"refId": "A"
}
],
@@ -5316,7 +5495,7 @@
"h": 8,
"w": 12,
"x": 0,
- "y": 32
+ "y": 214
},
"id": 88,
"options": {
@@ -5339,6 +5518,7 @@
},
"expr": "metrics_SortMemory_Value",
"legendFormat": "${baseLegend}",
+ "range": true,
"refId": "A"
}
],
@@ -5405,7 +5585,7 @@
"h": 8,
"w": 12,
"x": 12,
- "y": 32
+ "y": 214
},
"id": 135,
"options": {
@@ -5446,7 +5626,7 @@
"h": 1,
"w": 24,
"x": 0,
- "y": 8
+ "y": 222
},
"id": 157,
"panels": [
@@ -5512,7 +5692,7 @@
"h": 8,
"w": 12,
"x": 0,
- "y": 17
+ "y": 223
},
"id": 159,
"options": {
@@ -5605,7 +5785,7 @@
"h": 8,
"w": 12,
"x": 12,
- "y": 17
+ "y": 223
},
"id": 160,
"options": {
@@ -5698,7 +5878,7 @@
"h": 8,
"w": 12,
"x": 0,
- "y": 25
+ "y": 231
},
"id": 161,
"options": {
@@ -5739,7 +5919,7 @@
"h": 1,
"w": 24,
"x": 0,
- "y": 9
+ "y": 239
},
"id": 137,
"panels": [
@@ -5804,7 +5984,7 @@
"h": 9,
"w": 12,
"x": 0,
- "y": 82
+ "y": 240
},
"id": 139,
"options": {
@@ -5896,7 +6076,7 @@
"h": 9,
"w": 12,
"x": 12,
- "y": 82
+ "y": 240
},
"id": 141,
"options": {
@@ -5988,7 +6168,7 @@
"h": 9,
"w": 12,
"x": 0,
- "y": 91
+ "y": 249
},
"id": 142,
"options": {
@@ -6080,7 +6260,7 @@
"h": 9,
"w": 12,
"x": 12,
- "y": 91
+ "y": 249
},
"id": 143,
"options": {
@@ -6172,7 +6352,7 @@
"h": 9,
"w": 12,
"x": 0,
- "y": 100
+ "y": 258
},
"id": 144,
"options": {
@@ -6264,7 +6444,7 @@
"h": 9,
"w": 12,
"x": 12,
- "y": 100
+ "y": 258
},
"id": 145,
"options": {
@@ -6356,7 +6536,7 @@
"h": 9,
"w": 12,
"x": 0,
- "y": 109
+ "y": 267
},
"id": 146,
"options": {
@@ -6448,7 +6628,7 @@
"h": 9,
"w": 12,
"x": 12,
- "y": 109
+ "y": 267
},
"id": 147,
"options": {
@@ -6540,7 +6720,7 @@
"h": 9,
"w": 12,
"x": 0,
- "y": 118
+ "y": 276
},
"id": 148,
"options": {
@@ -6632,7 +6812,7 @@
"h": 9,
"w": 12,
"x": 12,
- "y": 118
+ "y": 276
},
"id": 149,
"options": {
@@ -6724,7 +6904,7 @@
"h": 9,
"w": 12,
"x": 0,
- "y": 127
+ "y": 285
},
"id": 150,
"options": {
@@ -6816,7 +6996,7 @@
"h": 9,
"w": 12,
"x": 12,
- "y": 127
+ "y": 285
},
"id": 151,
"options": {
@@ -6907,7 +7087,7 @@
"h": 8,
"w": 12,
"x": 0,
- "y": 136
+ "y": 294
},
"id": 153,
"options": {
@@ -6998,7 +7178,7 @@
"h": 8,
"w": 12,
"x": 12,
- "y": 136
+ "y": 294
},
"id": 154,
"options": {
@@ -7089,7 +7269,7 @@
"h": 8,
"w": 12,
"x": 0,
- "y": 144
+ "y": 302
},
"id": 155,
"options": {
@@ -7130,7 +7310,7 @@
"h": 1,
"w": 24,
"x": 0,
- "y": 10
+ "y": 310
},
"id": 110,
"panels": [
@@ -7194,7 +7374,7 @@
"h": 8,
"w": 12,
"x": 0,
- "y": 97
+ "y": 311
},
"id": 112,
"options": {
@@ -7285,7 +7465,7 @@
"h": 8,
"w": 12,
"x": 12,
- "y": 97
+ "y": 311
},
"id": 116,
"options": {
@@ -7326,7 +7506,7 @@
"h": 1,
"w": 24,
"x": 0,
- "y": 11
+ "y": 319
},
"id": 123,
"panels": [
@@ -7391,7 +7571,7 @@
"h": 8,
"w": 12,
"x": 0,
- "y": 36
+ "y": 320
},
"id": 125,
"options": {
@@ -7484,7 +7664,7 @@
"h": 8,
"w": 12,
"x": 12,
- "y": 36
+ "y": 320
},
"id": 126,
"options": {
@@ -7577,7 +7757,7 @@
"h": 8,
"w": 12,
"x": 0,
- "y": 44
+ "y": 328
},
"id": 163,
"options": {
@@ -7670,7 +7850,7 @@
"h": 8,
"w": 12,
"x": 12,
- "y": 44
+ "y": 328
},
"id": 162,
"options": {
@@ -7763,7 +7943,7 @@
"h": 8,
"w": 12,
"x": 0,
- "y": 52
+ "y": 336
},
"id": 127,
"options": {
@@ -7804,7 +7984,7 @@
"h": 1,
"w": 24,
"x": 0,
- "y": 12
+ "y": 344
},
"id": 172,
"panels": [
@@ -7869,7 +8049,7 @@
"h": 9,
"w": 12,
"x": 0,
- "y": 37
+ "y": 345
},
"id": 174,
"options": {
@@ -7962,7 +8142,7 @@
"h": 9,
"w": 12,
"x": 12,
- "y": 37
+ "y": 345
},
"id": 176,
"options": {
@@ -8054,7 +8234,7 @@
"h": 9,
"w": 12,
"x": 0,
- "y": 46
+ "y": 354
},
"id": 175,
"options": {
@@ -8147,7 +8327,7 @@
"h": 9,
"w": 12,
"x": 12,
- "y": 46
+ "y": 354
},
"id": 177,
"options": {
diff --git a/docs/monitoring.md b/docs/monitoring.md
index 819658007..531b703e3 100644
--- a/docs/monitoring.md
+++ b/docs/monitoring.md
@@ -140,11 +140,14 @@ These metrics are exposed by Celeborn worker.
- The time for a worker to process openStream RPC and return
StreamHandle.
- FetchChunkTime
- The time for a worker to fetch a chunk which is 8MB by default from
a reduced partition.
+ - ChunkStreamCount
+ - Stream count for reduce partition reading streams.
+ - OpenStreamFailCount
+ - FetchChunkFailCount
- PrimaryPushDataTime
- The time for a worker to handle a pushData RPC sent from a celeborn
client.
- ReplicaPushDataTime
- The time for a worker to handle a pushData RPC sent from a celeborn
worker by replicating.
- - FetchChunkFailCount
- WriteDataFailCount
- ReplicateDataFailCount
- ReplicateDataWriteFailCount
diff --git
a/worker/src/main/java/org/apache/celeborn/service/deploy/worker/storage/ChunkStreamManager.java
b/worker/src/main/java/org/apache/celeborn/service/deploy/worker/storage/ChunkStreamManager.java
index f488399d5..731754345 100644
---
a/worker/src/main/java/org/apache/celeborn/service/deploy/worker/storage/ChunkStreamManager.java
+++
b/worker/src/main/java/org/apache/celeborn/service/deploy/worker/storage/ChunkStreamManager.java
@@ -188,8 +188,7 @@ public class ChunkStreamManager {
return new Tuple2<>(state.shuffleKey, state.fileName);
}
- @VisibleForTesting
- public int numStreamStates() {
+ public int getStreamsCount() {
return streams.size();
}
diff --git
a/worker/src/main/java/org/apache/celeborn/service/deploy/worker/storage/CreditStreamManager.java
b/worker/src/main/java/org/apache/celeborn/service/deploy/worker/storage/CreditStreamManager.java
index 49bf4c633..22b2f10a4 100644
---
a/worker/src/main/java/org/apache/celeborn/service/deploy/worker/storage/CreditStreamManager.java
+++
b/worker/src/main/java/org/apache/celeborn/service/deploy/worker/storage/CreditStreamManager.java
@@ -175,11 +175,6 @@ public class CreditStreamManager {
startRecycleThread(); // lazy start thread
}
- @VisibleForTesting
- public int numStreamStates() {
- return streams.size();
- }
-
@VisibleForTesting
public int numRecycleStreams() {
return recycleStreamIds.size();
diff --git
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/FetchHandler.scala
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/FetchHandler.scala
index 52ecc4a88..c7b657ba4 100644
---
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/FetchHandler.scala
+++
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/FetchHandler.scala
@@ -60,6 +60,9 @@ class FetchHandler(
var registered: AtomicBoolean = new AtomicBoolean(false)
def init(worker: Worker): Unit = {
+ workerSource.addGauge(WorkerSource.CHUNK_STREAM_COUNT) { () =>
+ chunkStreamManager.getStreamsCount
+ }
workerSource.addGauge(WorkerSource.CREDIT_STREAM_COUNT) { () =>
creditStreamManager.getStreamsCount
@@ -281,6 +284,7 @@ class FetchHandler(
}
} catch {
case e: IOException =>
+ workerSource.incCounter(WorkerSource.OPEN_STREAM_FAIL_COUNT)
handleRpcIOException(client, rpcRequestId, shuffleKey, fileName, e,
callback)
} finally {
workerSource.stopTimer(WorkerSource.OPEN_STREAM_TIME, shuffleKey)
diff --git
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
index b363f3865..e65b83fa6 100644
---
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
+++
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
@@ -26,6 +26,7 @@ class WorkerSource(conf: CelebornConf) extends
AbstractSource(conf, MetricsSyste
import WorkerSource._
// add counters
+ addCounter(OPEN_STREAM_FAIL_COUNT)
addCounter(FETCH_CHUNK_FAIL_COUNT)
addCounter(WRITE_DATA_FAIL_COUNT)
addCounter(REPLICATE_DATA_FAIL_COUNT)
@@ -69,13 +70,15 @@ class WorkerSource(conf: CelebornConf) extends
AbstractSource(conf, MetricsSyste
}
object WorkerSource {
- val COMMIT_FILES_TIME = "CommitFilesTime"
- val RESERVE_SLOTS_TIME = "ReserveSlotsTime"
- val FLUSH_DATA_TIME = "FlushDataTime"
- val OPEN_STREAM_TIME = "OpenStreamTime"
+ val REGISTERED_SHUFFLE_COUNT = "RegisteredShuffleCount"
+
+ val RUNNING_APPLICATION_COUNT = "RunningApplicationCount"
// fetch data
+ val OPEN_STREAM_TIME = "OpenStreamTime"
val FETCH_CHUNK_TIME = "FetchChunkTime"
+ val CHUNK_STREAM_COUNT = "ChunkStreamCount"
+ val OPEN_STREAM_FAIL_COUNT = "OpenStreamFailCount"
val FETCH_CHUNK_FAIL_COUNT = "FetchChunkFailCount"
// push data
@@ -100,13 +103,12 @@ object WorkerSource {
// flush
val TAKE_BUFFER_TIME = "TakeBufferTime"
-
- val REGISTERED_SHUFFLE_COUNT = "RegisteredShuffleCount"
-
- val RUNNING_APPLICATION_COUNT = "RunningApplicationCount"
+ val FLUSH_DATA_TIME = "FlushDataTime"
+ val COMMIT_FILES_TIME = "CommitFilesTime"
// slots
val SLOTS_ALLOCATED = "SlotsAllocated"
+ val RESERVE_SLOTS_TIME = "ReserveSlotsTime"
// connection
val ACTIVE_CONNECTION_COUNT = "ActiveConnectionCount"
@@ -124,6 +126,8 @@ object WorkerSource {
val BUFFER_STREAM_READ_BUFFER = "BufferStreamReadBuffer"
val READ_BUFFER_DISPATCHER_REQUESTS_LENGTH =
"ReadBufferDispatcherRequestsLength"
val READ_BUFFER_ALLOCATED_COUNT = "ReadBufferAllocatedCount"
+
+ // credit
val CREDIT_STREAM_COUNT = "CreditStreamCount"
val ACTIVE_MAP_PARTITION_COUNT = "ActiveMapPartitionCount"
diff --git
a/worker/src/test/java/org/apache/celeborn/service/deploy/worker/storage/ChunkStreamManagerSuiteJ.java
b/worker/src/test/java/org/apache/celeborn/service/deploy/worker/storage/ChunkStreamManagerSuiteJ.java
index f77bd4f6b..515b176d5 100644
---
a/worker/src/test/java/org/apache/celeborn/service/deploy/worker/storage/ChunkStreamManagerSuiteJ.java
+++
b/worker/src/test/java/org/apache/celeborn/service/deploy/worker/storage/ChunkStreamManagerSuiteJ.java
@@ -43,23 +43,23 @@ public class ChunkStreamManagerSuiteJ {
manager.registerStream("shuffleKey1", buffers2, "shuffleFile1", null);
manager.registerStream("shuffleKey2", buffers3, "shuffleFile2", null);
long stream3 = manager.registerStream("shuffleKey3", buffers4,
"shuffleFile3", null);
- Assert.assertEquals(4, manager.numStreamStates());
- Assert.assertEquals(manager.numStreamStates(), manager.numShuffleSteams());
+ Assert.assertEquals(4, manager.getStreamsCount());
+ Assert.assertEquals(manager.getStreamsCount(), manager.numShuffleSteams());
manager.cleanupExpiredShuffleKey(new
HashSet<>(Arrays.asList("shuffleKey1", "shuffleKey2")));
manager.cleanupExpiredShuffleKey(new
HashSet<>(Arrays.asList("none_exit_shuffleKey")));
- Assert.assertEquals(1, manager.numStreamStates());
- Assert.assertEquals(manager.numStreamStates(), manager.numShuffleSteams());
+ Assert.assertEquals(1, manager.getStreamsCount());
+ Assert.assertEquals(manager.getStreamsCount(), manager.numShuffleSteams());
// stream removed when buffer fully read
manager.streams.remove(stream3);
manager.shuffleStreamIds.get("shuffleKey3").remove(stream3);
- Assert.assertEquals(0, manager.numStreamStates());
- Assert.assertEquals(manager.numStreamStates(), manager.numShuffleSteams());
+ Assert.assertEquals(0, manager.getStreamsCount());
+ Assert.assertEquals(manager.getStreamsCount(), manager.numShuffleSteams());
// cleanup shuffleKey3
manager.cleanupExpiredShuffleKey(new
HashSet<>(Arrays.asList("shuffleKey3")));
- Assert.assertEquals(manager.numStreamStates(), manager.numShuffleSteams());
+ Assert.assertEquals(manager.getStreamsCount(), manager.numShuffleSteams());
}
}
diff --git
a/worker/src/test/java/org/apache/celeborn/service/deploy/worker/storage/CreditStreamManagerSuiteJ.java
b/worker/src/test/java/org/apache/celeborn/service/deploy/worker/storage/CreditStreamManagerSuiteJ.java
index bafe180f8..ad9b1e311 100644
---
a/worker/src/test/java/org/apache/celeborn/service/deploy/worker/storage/CreditStreamManagerSuiteJ.java
+++
b/worker/src/test/java/org/apache/celeborn/service/deploy/worker/storage/CreditStreamManagerSuiteJ.java
@@ -85,12 +85,12 @@ public class CreditStreamManagerSuiteJ {
long registerStream1 =
creditStreamManager.registerStream(streamIdConsumer, channel, 0, 1, 1,
diskFileInfo);
Assert.assertTrue(registerStream1 > 0);
- Assert.assertEquals(1, creditStreamManager.numStreamStates());
+ Assert.assertEquals(1, creditStreamManager.getStreamsCount());
long registerStream2 =
creditStreamManager.registerStream(streamIdConsumer, channel, 0, 1, 1,
diskFileInfo);
Assert.assertNotEquals(registerStream1, registerStream2);
- Assert.assertEquals(2, creditStreamManager.numStreamStates());
+ Assert.assertEquals(2, creditStreamManager.getStreamsCount());
creditStreamManager.registerStream(streamIdConsumer, channel, 0, 1, 1,
diskFileInfo);
creditStreamManager.registerStream(streamIdConsumer, channel, 0, 1, 1,
diskFileInfo);
@@ -103,7 +103,7 @@ public class CreditStreamManagerSuiteJ {
mapPartitionData1.getStreamReader(registerStream1).recycle();
- timeOutOrMeetCondition(() -> creditStreamManager.numStreamStates() == 3);
+ timeOutOrMeetCondition(() -> creditStreamManager.getStreamsCount() == 3);
Assert.assertEquals(creditStreamManager.numRecycleStreams(), 0);
// registerStream2 can't be cleaned as registerStream2 is not finished
@@ -113,14 +113,14 @@ public class CreditStreamManagerSuiteJ {
creditStreamManager.cleanResource(registerStream2);
Assert.assertEquals(creditStreamManager.numRecycleStreams(), 1);
- Assert.assertEquals(3, creditStreamManager.numStreamStates());
+ Assert.assertEquals(3, creditStreamManager.getStreamsCount());
// recycle all channel
numInFlightRequests.decrementAndGet();
creditStreamManager.connectionTerminated(channel);
- timeOutOrMeetCondition(() -> creditStreamManager.numStreamStates() == 0);
+ timeOutOrMeetCondition(() -> creditStreamManager.getStreamsCount() == 0);
// when cpu is busy, even through that timeOutOrMeetCondition is true,
- // creditStreamManager.numStreamStates are still not be removed
+ // creditStreamManager.getStreamsCount are still not be removed
Assert.assertTrue(creditStreamManager.numRecycleStreams() >= 0);
}