This is an automated email from the ASF dual-hosted git repository.
rexxiong pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/celeborn.git
The following commit(s) were added to refs/heads/main by this push:
new f886751e8 [CELEBORN-1812] Distinguish sorting-file from sort-tasks
waiting to be submitted
f886751e8 is described below
commit f886751e809ebeabfc88f0af084ad72611060ecf
Author: wuziyi <[email protected]>
AuthorDate: Sat Jan 4 10:27:53 2025 +0800
[CELEBORN-1812] Distinguish sorting-file from sort-tasks waiting to be
submitted
### What changes were proposed in this pull request?
Current implementation uses `
shuffleSortTaskDeque.size()` as current sorting file count.This value might
be more appropriately described as the sort tasks waiting to be submitted to
`fileSorterExecutors`. And the actual current sorting file number ( doing some
disk-io operation etc) should be get from `sortingShuffleFiles`.
### Why are the changes needed?
Add metrics to monitor current sorting files which is making disk-io
operations.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?

Closes #3040 from Z1Wu/fix/sorting_file_metrics.
Authored-by: wuziyi <[email protected]>
Signed-off-by: Shuang <[email protected]>
---
assets/grafana/celeborn-dashboard.json | 1152 +++++++++++---------
docs/monitoring.md | 1 +
.../worker/storage/PartitionFilesSorter.java | 6 +-
.../celeborn/service/deploy/worker/Worker.scala | 3 +
.../service/deploy/worker/WorkerSource.scala | 1 +
5 files changed, 635 insertions(+), 528 deletions(-)
diff --git a/assets/grafana/celeborn-dashboard.json
b/assets/grafana/celeborn-dashboard.json
index fb1f00577..cb1c1ebb2 100644
--- a/assets/grafana/celeborn-dashboard.json
+++ b/assets/grafana/celeborn-dashboard.json
@@ -9390,552 +9390,650 @@
"y": 7
},
"id": 50,
- "panels": [
- {
- "datasource": {
- "type": "prometheus",
- "uid": "${DS_PROMETHEUS}"
- },
- "fieldConfig": {
- "defaults": {
- "color": {
- "mode": "palette-classic"
- },
- "custom": {
- "axisCenteredZero": false,
- "axisColorMode": "text",
- "axisLabel": "",
- "axisPlacement": "auto",
- "barAlignment": 0,
- "drawStyle": "line",
- "fillOpacity": 0,
- "gradientMode": "none",
- "hideFrom": {
- "legend": false,
- "tooltip": false,
- "viz": false
- },
- "lineInterpolation": "linear",
- "lineWidth": 1,
- "pointSize": 5,
- "scaleDistribution": {
- "type": "linear"
- },
- "showPoints": "auto",
- "spanNulls": false,
- "stacking": {
- "group": "A",
- "mode": "none"
- },
- "thresholdsStyle": {
- "mode": "off"
- }
- },
- "mappings": [],
- "thresholds": {
- "mode": "absolute",
- "steps": [
- {
- "color": "green"
- },
- {
- "color": "red",
- "value": 80
- }
- ]
- },
- "unit": "ms"
- },
- "overrides": []
- },
- "gridPos": {
- "h": 8,
- "w": 12,
- "x": 0,
- "y": 933
- },
- "id": 44,
- "options": {
- "legend": {
- "calcs": [],
- "displayMode": "list",
- "placement": "bottom",
- "showLegend": true
- },
- "tooltip": {
- "mode": "single",
- "sort": "none"
- }
- },
- "targets": [
- {
- "datasource": {
- "type": "prometheus",
- "uid": "${DS_PROMETHEUS}"
- },
- "expr": "metrics_SortTime_Mean{instance=~\"${instance}\"}",
- "legendFormat": "${baseLegend}",
- "refId": "A"
- }
- ],
- "title": "metrics_SortTime_Mean",
- "type": "timeseries"
- },
- {
- "datasource": {
- "type": "prometheus",
- "uid": "${DS_PROMETHEUS}"
- },
- "fieldConfig": {
- "defaults": {
- "color": {
- "mode": "palette-classic"
- },
- "custom": {
- "axisCenteredZero": false,
- "axisColorMode": "text",
- "axisLabel": "",
- "axisPlacement": "auto",
- "barAlignment": 0,
- "drawStyle": "line",
- "fillOpacity": 0,
- "gradientMode": "none",
- "hideFrom": {
- "legend": false,
- "tooltip": false,
- "viz": false
- },
- "lineInterpolation": "linear",
- "lineWidth": 1,
- "pointSize": 5,
- "scaleDistribution": {
- "type": "linear"
- },
- "showPoints": "auto",
- "spanNulls": false,
- "stacking": {
- "group": "A",
- "mode": "none"
- },
- "thresholdsStyle": {
- "mode": "off"
- }
- },
- "mappings": [],
- "thresholds": {
- "mode": "absolute",
- "steps": [
- {
- "color": "green"
- },
- {
- "color": "red",
- "value": 80
- }
- ]
- },
- "unit": "ms"
- },
- "overrides": []
- },
- "gridPos": {
- "h": 8,
- "w": 12,
- "x": 12,
- "y": 933
- },
- "id": 46,
- "options": {
- "legend": {
- "calcs": [],
- "displayMode": "list",
- "placement": "bottom",
- "showLegend": true
- },
- "tooltip": {
- "mode": "single",
- "sort": "none"
- }
- },
- "targets": [
- {
- "datasource": {
- "type": "prometheus",
- "uid": "${DS_PROMETHEUS}"
- },
- "expr": "metrics_SortTime_Max{instance=~\"${instance}\"}",
- "legendFormat": "${baseLegend}",
- "refId": "A"
- }
- ],
- "title": "metrics_SortTime_Max",
- "type": "timeseries"
- },
- {
- "datasource": {
- "type": "prometheus",
- "uid": "${DS_PROMETHEUS}"
- },
- "fieldConfig": {
- "defaults": {
- "color": {
- "mode": "palette-classic"
- },
- "custom": {
- "axisCenteredZero": false,
- "axisColorMode": "text",
- "axisLabel": "",
- "axisPlacement": "auto",
- "barAlignment": 0,
- "drawStyle": "line",
- "fillOpacity": 0,
- "gradientMode": "none",
- "hideFrom": {
- "legend": false,
- "tooltip": false,
- "viz": false
- },
- "lineInterpolation": "linear",
- "lineWidth": 1,
- "pointSize": 5,
- "scaleDistribution": {
- "type": "linear"
- },
- "showPoints": "auto",
- "spanNulls": false,
- "stacking": {
- "group": "A",
- "mode": "none"
- },
- "thresholdsStyle": {
- "mode": "off"
- }
- },
- "mappings": [],
- "thresholds": {
- "mode": "absolute",
- "steps": [
- {
- "color": "green"
- },
- {
- "color": "red",
- "value": 80
- }
- ]
+ "panels": [],
+ "title": "SortRelatives",
+ "type": "row"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
}
- },
- "overrides": []
- },
- "gridPos": {
- "h": 8,
- "w": 12,
- "x": 0,
- "y": 941
- },
- "id": 192,
- "options": {
- "legend": {
- "calcs": [],
- "displayMode": "list",
- "placement": "bottom",
- "showLegend": true
- },
- "tooltip": {
- "mode": "single",
- "sort": "none"
- }
+ ]
},
- "targets": [
- {
- "datasource": {
- "type": "prometheus",
- "uid": "${DS_PROMETHEUS}"
- },
- "expr": "metrics_SortingFiles_Value{instance=~\"${instance}\"}",
- "legendFormat": "${baseLegend}",
- "range": true,
- "refId": "A"
- }
- ],
- "title": "metrics_SortingFiles_Value",
- "type": "timeseries"
+ "unit": "ms"
},
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 8
+ },
+ "id": 44,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
{
"datasource": {
- "type": "prometheus",
- "uid": "${DS_PROMETHEUS}"
- },
- "fieldConfig": {
- "defaults": {
- "color": {
- "mode": "palette-classic"
- },
- "custom": {
- "axisCenteredZero": false,
- "axisColorMode": "text",
- "axisLabel": "",
- "axisPlacement": "auto",
- "barAlignment": 0,
- "drawStyle": "line",
- "fillOpacity": 0,
- "gradientMode": "none",
- "hideFrom": {
- "legend": false,
- "tooltip": false,
- "viz": false
- },
- "lineInterpolation": "linear",
- "lineWidth": 1,
- "pointSize": 5,
- "scaleDistribution": {
- "type": "linear"
- },
- "showPoints": "auto",
- "spanNulls": false,
- "stacking": {
- "group": "A",
- "mode": "none"
- },
- "thresholdsStyle": {
- "mode": "off"
- }
- },
- "mappings": [],
- "thresholds": {
- "mode": "absolute",
- "steps": [
- {
- "color": "green"
- },
- {
- "color": "red",
- "value": 80
- }
- ]
- }
- },
- "overrides": []
- },
- "gridPos": {
- "h": 8,
- "w": 12,
- "x": 12,
- "y": 941
- },
- "id": 180,
- "options": {
- "legend": {
- "calcs": [],
- "displayMode": "list",
- "placement": "bottom",
- "showLegend": true
- },
- "tooltip": {
- "mode": "single",
- "sort": "none"
- }
- },
- "targets": [
- {
- "datasource": {
- "type": "prometheus",
- "uid": "${DS_PROMETHEUS}"
- },
- "expr": "metrics_SortedFiles_Value{instance=~\"${instance}\"}",
- "legendFormat": "${baseLegend}",
- "range": true,
- "refId": "A"
- }
- ],
- "title": "metrics_SortedFiles_Value",
- "type": "timeseries"
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "expr": "metrics_SortTime_Mean{instance=~\"${instance}\"}",
+ "legendFormat": "${baseLegend}",
+ "refId": "A"
+ }
+ ],
+ "title": "metrics_SortTime_Mean",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "ms"
},
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 8
+ },
+ "id": 46,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "fieldConfig": {
- "defaults": {
- "color": {
- "mode": "palette-classic"
- },
- "custom": {
- "axisCenteredZero": false,
- "axisColorMode": "text",
- "axisLabel": "",
- "axisPlacement": "auto",
- "barAlignment": 0,
- "drawStyle": "line",
- "fillOpacity": 0,
- "gradientMode": "none",
- "hideFrom": {
- "legend": false,
- "tooltip": false,
- "viz": false
- },
- "lineInterpolation": "linear",
- "lineWidth": 1,
- "pointSize": 5,
- "scaleDistribution": {
- "type": "linear"
- },
- "showPoints": "auto",
- "spanNulls": false,
- "stacking": {
- "group": "A",
- "mode": "none"
- },
- "thresholdsStyle": {
- "mode": "off"
- }
- },
- "mappings": [],
- "thresholds": {
- "mode": "absolute",
- "steps": [
- {
- "color": "green"
- },
- {
- "color": "red",
- "value": 80
- }
- ]
- },
- "unit": "decbytes"
- },
- "overrides": []
+ "expr": "metrics_SortTime_Max{instance=~\"${instance}\"}",
+ "legendFormat": "${baseLegend}",
+ "refId": "A"
+ }
+ ],
+ "title": "metrics_SortTime_Max",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 16
+ },
+ "id": 192,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
},
- "gridPos": {
- "h": 8,
- "w": 12,
- "x": 0,
- "y": 949
+ "expr": "metrics_SortingFiles_Value{instance=~\"${instance}\"}",
+ "legendFormat": "${baseLegend}",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "metrics_SortingFiles_Value",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 16
+ },
+ "id": 180,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
},
- "id": 88,
- "options": {
- "legend": {
- "calcs": [],
- "displayMode": "list",
- "placement": "bottom",
- "showLegend": true
- },
- "tooltip": {
- "mode": "single",
- "sort": "none"
- }
+ "expr": "metrics_SortedFiles_Value{instance=~\"${instance}\"}",
+ "legendFormat": "${baseLegend}",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "metrics_SortedFiles_Value",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
},
- "targets": [
- {
- "datasource": {
- "type": "prometheus",
- "uid": "${DS_PROMETHEUS}"
- },
- "expr": "metrics_SortMemory_Value{instance=~\"${instance}\"}",
- "legendFormat": "${baseLegend}",
- "range": true,
- "refId": "A"
- }
- ],
- "title": "metrics_SortMemory_Value",
- "type": "timeseries"
+ "unit": "decbytes"
},
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 24
+ },
+ "id": 88,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "fieldConfig": {
- "defaults": {
- "color": {
- "mode": "palette-classic"
- },
- "custom": {
- "axisCenteredZero": false,
- "axisColorMode": "text",
- "axisLabel": "",
- "axisPlacement": "auto",
- "barAlignment": 0,
- "drawStyle": "line",
- "fillOpacity": 0,
- "gradientMode": "none",
- "hideFrom": {
- "legend": false,
- "tooltip": false,
- "viz": false
- },
- "lineInterpolation": "linear",
- "lineWidth": 1,
- "pointSize": 5,
- "scaleDistribution": {
- "type": "linear"
- },
- "showPoints": "auto",
- "spanNulls": false,
- "stacking": {
- "group": "A",
- "mode": "none"
- },
- "thresholdsStyle": {
- "mode": "off"
- }
- },
- "mappings": [],
- "thresholds": {
- "mode": "absolute",
- "steps": [
- {
- "color": "green"
- },
- {
- "color": "red",
- "value": 80
- }
- ]
+ "expr": "metrics_SortMemory_Value{instance=~\"${instance}\"}",
+ "legendFormat": "${baseLegend}",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "metrics_SortMemory_Value",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 80
}
- },
- "overrides": []
- },
- "gridPos": {
- "h": 8,
- "w": 12,
- "x": 12,
- "y": 949
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 24
+ },
+ "id": 135,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
},
- "id": 135,
- "options": {
- "legend": {
- "calcs": [],
- "displayMode": "list",
- "placement": "bottom",
- "showLegend": true
- },
- "tooltip": {
- "mode": "single",
- "sort": "none"
- }
+ "editorMode": "code",
+ "expr": "metrics_SortedFileSize_Value{instance=~\"${instance}\"}",
+ "legendFormat": "${baseLegend}",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "metrics_SortedFileSize_Value",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 32
+ },
+ "id": 234,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
},
- "targets": [
- {
- "datasource": {
- "type": "prometheus",
- "uid": "${DS_PROMETHEUS}"
- },
- "editorMode": "code",
- "expr":
"metrics_SortedFileSize_Value{instance=~\"${instance}\"}",
- "legendFormat": "${baseLegend}",
- "range": true,
- "refId": "A"
- }
- ],
- "title": "metrics_SortedFileSize_Value",
- "type": "timeseries"
+ "editorMode": "code",
+ "expr": "metrics_PendingSortTasks_Value{instance=~\"${instance}\"}",
+ "legendFormat": "${baseLegend}",
+ "range": true,
+ "refId": "A"
}
],
- "title": "SortRelatives",
- "type": "row"
+ "title": "metrics_PendingSortTasks_Value",
+ "type": "timeseries"
},
{
"collapsed": true,
@@ -9943,7 +10041,7 @@
"h": 1,
"w": 24,
"x": 0,
- "y": 8
+ "y": 40
},
"id": 157,
"panels": [
@@ -10236,7 +10334,7 @@
"h": 1,
"w": 24,
"x": 0,
- "y": 9
+ "y": 41
},
"id": 137,
"panels": [
@@ -12086,7 +12184,7 @@
"h": 1,
"w": 24,
"x": 0,
- "y": 10
+ "y": 42
},
"id": 110,
"panels": [
@@ -12282,7 +12380,7 @@
"h": 1,
"w": 24,
"x": 0,
- "y": 11
+ "y": 43
},
"id": 123,
"panels": [
@@ -12760,7 +12858,7 @@
"h": 1,
"w": 24,
"x": 0,
- "y": 12
+ "y": 44
},
"id": 172,
"panels": [
@@ -13144,7 +13242,7 @@
"h": 1,
"w": 24,
"x": 0,
- "y": 13
+ "y": 45
},
"id": 226,
"panels": [
diff --git a/docs/monitoring.md b/docs/monitoring.md
index 553672912..b2424fec7 100644
--- a/docs/monitoring.md
+++ b/docs/monitoring.md
@@ -217,6 +217,7 @@ These metrics are exposed by Celeborn worker.
| SortTime | The time for a worker to sort a
shuffle file.
|
| SortMemory | The memory used by sorting
shuffle files.
|
| SortingFiles | The count of sorting shuffle
files.
|
+ | PendingSortTaks | The count of sort tasks waiting
to be submitted to FileSorterExecutors.
|
| SortedFiles | The count of sorted shuffle
files.
|
| SortedFileSize | The count of sorted shuffle
files 's total size.
|
| DiskBuffer | The memory occupied by pushData
and pushMergedData which should be written to disk.
|
diff --git
a/worker/src/main/java/org/apache/celeborn/service/deploy/worker/storage/PartitionFilesSorter.java
b/worker/src/main/java/org/apache/celeborn/service/deploy/worker/storage/PartitionFilesSorter.java
index dfc2f3e99..3385874c7 100644
---
a/worker/src/main/java/org/apache/celeborn/service/deploy/worker/storage/PartitionFilesSorter.java
+++
b/worker/src/main/java/org/apache/celeborn/service/deploy/worker/storage/PartitionFilesSorter.java
@@ -177,10 +177,14 @@ public class PartitionFilesSorter extends
ShuffleRecoverHelper {
});
}
- public int getSortingCount() {
+ public int getPendingSortTaskCount() {
return shuffleSortTaskDeque.size();
}
+ public int getSortingCount() {
+ return
sortingShuffleFiles.values().stream().map(Set::size).reduce(Integer::sum).orElse(0);
+ }
+
public int getSortedCount() {
return sortedFileCount.get();
}
diff --git
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
index 6afcc23a5..ac9dc6328 100644
---
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
+++
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
@@ -370,6 +370,9 @@ private[celeborn] class Worker(
workerSource.addGauge(WorkerSource.SORT_MEMORY) { () =>
memoryManager.getSortMemoryCounter.get()
}
+ workerSource.addGauge(WorkerSource.PENDING_SORT_TASKS) { () =>
+ partitionsSorter.getPendingSortTaskCount
+ }
workerSource.addGauge(WorkerSource.SORTING_FILES) { () =>
partitionsSorter.getSortingCount
}
diff --git
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
index 8275e3595..891988a5b 100644
---
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
+++
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
@@ -193,6 +193,7 @@ object WorkerSource {
val SORT_TIME = "SortTime"
val SORT_MEMORY = "SortMemory"
val SORTING_FILES = "SortingFiles"
+ val PENDING_SORT_TASKS = "PendingSortTasks"
val SORTED_FILES = "SortedFiles"
val SORTED_FILE_SIZE = "SortedFileSize"
val DISK_BUFFER = "DiskBuffer"