This is an automated email from the ASF dual-hosted git repository.
ethanfeng pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/celeborn.git
The following commit(s) were added to refs/heads/main by this push:
new c3d33daab [CELEBORN-1627] Introduce `instance` variable for celeborn
dashboard to filter metrics
c3d33daab is described below
commit c3d33daabc6360f9ac7a8c397ae57d71fab094e4
Author: Wang, Fei <[email protected]>
AuthorDate: Wed Oct 9 14:47:03 2024 +0800
[CELEBORN-1627] Introduce `instance` variable for celeborn dashboard to
filter metrics
### What changes were proposed in this pull request?
1. add `instanceLabel` in metrics source, so that `instance` is `FQDN:port` rather than `ip:port`
(previously it was `ip:port` even with `celeborn.network.bind.preferIpAddress=false`)
2. add variable `instance` with `label_values(metrics_JVMCPUTime_Value,
instance)` same as `celeborn-jvm-dashboard.json`
3. add filter `instance=~"${instance}"` to every metric expression
4. add missing `legendFormat` for memory file storage metrics expressions
### Why are the changes needed?
There can be many Celeborn instances in a production deployment, so it is
better to be able to filter the dashboard by instance.
### Does this PR introduce _any_ user-facing change?
Yes. It introduces a new dashboard variable, `instance`.
However, the default value of `instance` is `ALL`, so the behavior is the same as before.
### How was this patch tested?
Config: `celeborn.network.bind.preferIpAddress=false`
<img width="1141" alt="image"
src="https://github.com/user-attachments/assets/c3161069-790a-4cb2-8654-6d52cf8e5fb0">
<img width="944" alt="image"
src="https://github.com/user-attachments/assets/293b8bd4-252a-459c-aa86-5f4aa75eb594">
<img width="939" alt="image"
src="https://github.com/user-attachments/assets/1e1b28af-dd71-4c5b-8285-57473a6c9650">
For JVM metrics, before it was ip:port, and now it is FQDN:port.
<img width="947" alt="image"
src="https://github.com/user-attachments/assets/fe00762f-605d-4b5e-b0a4-c586bdc0ec1a">
Closes #2777 from turboFei/legend_base.
Authored-by: Wang, Fei <[email protected]>
Signed-off-by: mingji <[email protected]>
---
assets/grafana/celeborn-dashboard.json | 241 ++++++++++++---------
.../common/metrics/source/AbstractSource.scala | 9 +-
.../celeborn/common/metrics/source/Role.scala | 23 ++
.../metrics/source/CelebornSourceSuite.scala | 27 ++-
.../celeborn/service/deploy/master/Master.scala | 12 +-
.../service/deploy/master/MasterSource.scala | 5 +-
.../celeborn/common/metrics/MetricsSystem.scala | 3 -
.../celeborn/service/deploy/worker/Worker.scala | 12 +-
.../service/deploy/worker/WorkerSource.scala | 5 +-
9 files changed, 200 insertions(+), 137 deletions(-)
diff --git a/assets/grafana/celeborn-dashboard.json
b/assets/grafana/celeborn-dashboard.json
index 553bd44b0..1592ec2de 100644
--- a/assets/grafana/celeborn-dashboard.json
+++ b/assets/grafana/celeborn-dashboard.json
@@ -155,7 +155,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "metrics_RegisteredShuffleCount_Value",
+ "expr":
"metrics_RegisteredShuffleCount_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"refId": "A"
}
@@ -249,7 +249,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "metrics_WorkerCount_Value",
+ "expr": "metrics_WorkerCount_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"refId": "A"
}
@@ -344,7 +344,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr":
"metrics_DeviceCelebornTotalBytes_Value{role=\"Master\"}",
+ "expr": "metrics_DeviceCelebornTotalBytes_Value{role=\"Master\",
instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"refId": "A"
}
@@ -439,7 +439,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "metrics_DeviceCelebornFreeBytes_Value{role=\"Master\"}",
+ "expr": "metrics_DeviceCelebornFreeBytes_Value{role=\"Master\",
instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"refId": "A"
}
@@ -533,7 +533,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "metrics_RunningApplicationCount_Value",
+ "expr":
"metrics_RunningApplicationCount_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"refId": "A"
}
@@ -643,7 +643,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_IsActiveMaster_Value",
+ "expr":
"metrics_IsActiveMaster_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -740,7 +740,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_PartitionSize_Value",
+ "expr": "metrics_PartitionSize_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -835,7 +835,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "metrics_ShutdownWorkerCount_Value",
+ "expr":
"metrics_ShutdownWorkerCount_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -931,7 +931,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_ActiveShuffleFileCount_Value{role=\"Master\"}",
+ "expr": "metrics_ActiveShuffleFileCount_Value{role=\"Master\",
instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -1028,7 +1028,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_ActiveShuffleSize_Value{role=\"Master\"}",
+ "expr": "metrics_ActiveShuffleSize_Value{role=\"Master\",
instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -1123,7 +1123,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "metrics_OfferSlotsTime_Max",
+ "expr": "metrics_OfferSlotsTime_Max{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"refId": "A"
}
@@ -1216,7 +1216,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "metrics_OfferSlotsTime_Mean",
+ "expr": "metrics_OfferSlotsTime_Mean{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"refId": "A"
}
@@ -1310,7 +1310,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_ExcludedWorkerCount_Value",
+ "expr":
"metrics_ExcludedWorkerCount_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -1405,7 +1405,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_AvailableWorkerCount_Value",
+ "expr":
"metrics_AvailableWorkerCount_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -1499,7 +1499,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "metrics_LostWorkerCount_Value",
+ "expr":
"metrics_LostWorkerCount_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -1606,7 +1606,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "increase(metrics_SlotsAllocated_Count[1h])",
+ "expr":
"increase(metrics_SlotsAllocated_Count{instance=~\"${instance}\"}[1h])",
"legendFormat": "${baseLegend}",
"refId": "A"
}
@@ -1699,7 +1699,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "metrics_ReserveSlotsTime_Mean",
+ "expr":
"metrics_ReserveSlotsTime_Mean{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"refId": "A"
}
@@ -1793,7 +1793,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_ReserveSlotsTime_Max",
+ "expr":
"metrics_ReserveSlotsTime_Max{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -1886,7 +1886,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "metrics_PausePushData_Value",
+ "expr": "metrics_PausePushData_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -1979,7 +1979,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "metrics_PausePushDataAndReplicate_Value",
+ "expr":
"metrics_PausePushDataAndReplicate_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -2073,7 +2073,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "metrics_PausePushDataTime_Value",
+ "expr":
"metrics_PausePushDataTime_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -2167,7 +2167,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "metrics_PausePushDataAndReplicateTime_Value",
+ "expr":
"metrics_PausePushDataAndReplicateTime_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -2263,7 +2263,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "builder",
- "expr": "metrics_ActiveShuffleSize_Value{role=\"Worker\"}",
+ "expr": "metrics_ActiveShuffleSize_Value{role=\"Worker\",
instance=~\"${instance}\"}",
"instant": false,
"legendFormat": "${baseLegend}",
"range": true,
@@ -2359,7 +2359,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "builder",
- "expr": "metrics_ActiveShuffleFileCount_Value{role=\"Worker\"}",
+ "expr": "metrics_ActiveShuffleFileCount_Value{role=\"Worker\",
instance=~\"${instance}\"}",
"instant": false,
"legendFormat": "${baseLegend}",
"range": true,
@@ -2453,7 +2453,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "metrics_ActiveConnectionCount_Count",
+ "expr":
"metrics_ActiveConnectionCount_Count{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -2546,7 +2546,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "metrics_ActiveSlotsCount_Value",
+ "expr":
"metrics_ActiveSlotsCount_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -2641,7 +2641,7 @@
},
"disableTextWrap": false,
"editorMode": "builder",
- "expr": "metrics_FlushWorkingQueueSize_Value",
+ "expr":
"metrics_FlushWorkingQueueSize_Value{instance=~\"${instance}\"}",
"fullMetaSearch": false,
"includeNullMetadata": true,
"instant": false,
@@ -2841,7 +2841,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "metrics_PrimaryPushDataTime_Mean",
+ "expr":
"metrics_PrimaryPushDataTime_Mean{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"refId": "A"
}
@@ -2931,7 +2931,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "metrics_PrimaryPushDataTime_Max",
+ "expr":
"metrics_PrimaryPushDataTime_Max{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"refId": "A"
}
@@ -3021,7 +3021,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "metrics_ReplicaPushDataTime_Mean",
+ "expr":
"metrics_ReplicaPushDataTime_Mean{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"refId": "A"
}
@@ -3111,7 +3111,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "metrics_ReplicaPushDataTime_Max",
+ "expr":
"metrics_ReplicaPushDataTime_Max{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"refId": "A"
}
@@ -3201,7 +3201,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_WriteDataSuccessCount_Count",
+ "expr":
"metrics_WriteDataSuccessCount_Count{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -3292,7 +3292,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_WriteDataFailCount_Count",
+ "expr":
"metrics_WriteDataFailCount_Count{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -3383,7 +3383,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_ReplicateDataFailCount_Count",
+ "expr":
"metrics_ReplicateDataFailCount_Count{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -3474,7 +3474,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_ReplicateDataWriteFailCount_Count",
+ "expr":
"metrics_ReplicateDataWriteFailCount_Count{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -3565,7 +3565,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_ReplicateDataCreateConnectionFailCount_Count",
+ "expr":
"metrics_ReplicateDataCreateConnectionFailCount_Count{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -3656,7 +3656,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_ReplicateDataConnectionExceptionCount_Count",
+ "expr":
"metrics_ReplicateDataConnectionExceptionCount_Count{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -3747,7 +3747,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_ReplicateDataTimeoutCount_Count",
+ "expr":
"metrics_ReplicateDataTimeoutCount_Count{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -3838,7 +3838,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_ReplicateDataFailNonCriticalCauseCount_Count",
+ "expr":
"metrics_ReplicateDataFailNonCriticalCauseCount_Count{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -3929,7 +3929,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_WriteDataHardSplitCount_Count",
+ "expr":
"metrics_WriteDataHardSplitCount_Count{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -4034,7 +4034,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "metrics_OpenStreamTime_Mean",
+ "expr": "metrics_OpenStreamTime_Mean{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"refId": "A"
}
@@ -4124,7 +4124,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "metrics_OpenStreamTime_Max",
+ "expr": "metrics_OpenStreamTime_Max{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"refId": "A"
}
@@ -4214,7 +4214,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "metrics_FetchChunkTime_Mean",
+ "expr": "metrics_FetchChunkTime_Mean{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"refId": "A"
}
@@ -4304,7 +4304,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "metrics_FetchChunkTime_Max",
+ "expr": "metrics_FetchChunkTime_Max{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"refId": "A"
}
@@ -4394,7 +4394,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_OpenStreamSuccessCount_Count",
+ "expr":
"metrics_OpenStreamSuccessCount_Count{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -4485,7 +4485,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_OpenStreamFailCount_Count",
+ "expr":
"metrics_OpenStreamFailCount_Count{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -4576,7 +4576,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_FetchChunkSuccessCount_Count",
+ "expr":
"metrics_FetchChunkSuccessCount_Count{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -4667,7 +4667,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_FetchChunkFailCount_Count",
+ "expr":
"metrics_FetchChunkFailCount_Count{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -4758,7 +4758,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_ActiveChunkStreamCount_Value",
+ "expr":
"metrics_ActiveChunkStreamCount_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -4863,7 +4863,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "metrics_TakeBufferTime_Mean",
+ "expr": "metrics_TakeBufferTime_Mean{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"refId": "A"
}
@@ -4953,7 +4953,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "metrics_TakeBufferTime_Max",
+ "expr": "metrics_TakeBufferTime_Max{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"refId": "A"
}
@@ -5043,7 +5043,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "metrics_FlushDataTime_Mean",
+ "expr": "metrics_FlushDataTime_Mean{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"refId": "A"
}
@@ -5133,7 +5133,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "metrics_FlushDataTime_Max",
+ "expr": "metrics_FlushDataTime_Max{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"refId": "A"
}
@@ -5223,7 +5223,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "metrics_CommitFilesTime_Mean",
+ "expr":
"metrics_CommitFilesTime_Mean{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"refId": "A"
}
@@ -5313,7 +5313,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "metrics_CommitFilesTime_Max",
+ "expr": "metrics_CommitFilesTime_Max{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"refId": "A"
}
@@ -5421,7 +5421,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_NettyMemory_Value",
+ "expr": "metrics_NettyMemory_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -5516,7 +5516,8 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "builder",
- "expr": "metrics_DirectMemoryUsageRatio_Value",
+ "expr":
"metrics_DirectMemoryUsageRatio_Value{instance=~\"${instance}\"}",
+ "legendFormat": "${baseLegend}",
"instant": false,
"range": true,
"refId": "A"
@@ -5611,7 +5612,8 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "builder",
- "expr": "metrics_MemoryFileStorageSize_Value",
+ "expr":
"metrics_MemoryFileStorageSize_Value{instance=~\"${instance}\"}",
+ "legendFormat": "${baseLegend}",
"instant": false,
"range": true,
"refId": "A"
@@ -5705,7 +5707,8 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "builder",
- "expr": "metrics_MemoryStorageFileCount_Value",
+ "expr":
"metrics_MemoryStorageFileCount_Value{instance=~\"${instance}\"}",
+ "legendFormat": "${baseLegend}",
"instant": false,
"range": true,
"refId": "A"
@@ -5796,7 +5799,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "metrics_DiskBuffer_Value",
+ "expr": "metrics_DiskBuffer_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -5887,7 +5890,8 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "builder",
- "expr": "metrics_EvictedFileCount_Value",
+ "expr":
"metrics_EvictedFileCount_Value{instance=~\"${instance}\"}",
+ "legendFormat": "${baseLegend}",
"instant": false,
"range": true,
"refId": "A"
@@ -5979,7 +5983,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_push_usedHeapMemory_Value",
+ "expr":
"metrics_push_usedHeapMemory_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -6071,7 +6075,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_push_usedDirectMemory_Value",
+ "expr":
"metrics_push_usedDirectMemory_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -6163,7 +6167,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_fetch_usedHeapMemory_Value",
+ "expr":
"metrics_fetch_usedHeapMemory_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -6255,7 +6259,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_fetch_usedDirectMemory_Value",
+ "expr":
"metrics_fetch_usedDirectMemory_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -6347,7 +6351,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_replicate_usedHeapMemory_Value",
+ "expr":
"metrics_replicate_usedHeapMemory_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -6439,7 +6443,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_replicate_usedDirectMemory_Value",
+ "expr":
"metrics_replicate_usedDirectMemory_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -6530,7 +6534,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "builder",
- "expr": "metrics_ReadBufferAllocatedCount_Value",
+ "expr":
"metrics_ReadBufferAllocatedCount_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -6622,7 +6626,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "builder",
- "expr": "metrics_BufferStreamReadBuffer_Value",
+ "expr":
"metrics_BufferStreamReadBuffer_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -6713,7 +6717,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "builder",
- "expr": "metrics_ReadBufferDispatcherRequestsLength_Value",
+ "expr":
"metrics_ReadBufferDispatcherRequestsLength_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -6818,7 +6822,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "metrics_SortTime_Mean",
+ "expr": "metrics_SortTime_Mean{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"refId": "A"
}
@@ -6908,7 +6912,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "metrics_SortTime_Max",
+ "expr": "metrics_SortTime_Max{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"refId": "A"
}
@@ -6997,7 +7001,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "metrics_SortingFiles_Value",
+ "expr": "metrics_SortingFiles_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -7087,7 +7091,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "metrics_SortedFiles_Value",
+ "expr": "metrics_SortedFiles_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -7178,7 +7182,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "metrics_SortMemory_Value",
+ "expr": "metrics_SortMemory_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -7269,7 +7273,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_SortedFileSize_Value",
+ "expr":
"metrics_SortedFileSize_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -7376,7 +7380,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_PotentialConsumeSpeed_Value",
+ "expr":
"metrics_PotentialConsumeSpeed_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -7469,7 +7473,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_WorkerConsumeSpeed_Value",
+ "expr":
"metrics_WorkerConsumeSpeed_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -7562,7 +7566,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_UserProduceSpeed_Value",
+ "expr":
"metrics_UserProduceSpeed_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -7668,7 +7672,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_PrimaryPushDataHandshakeTime_Mean",
+ "expr":
"metrics_PrimaryPushDataHandshakeTime_Mean{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -7760,7 +7764,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_PrimaryPushDataHandshakeTime_Max",
+ "expr":
"metrics_PrimaryPushDataHandshakeTime_Max{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -7852,7 +7856,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_ReplicaPushDataHandshakeTime_Mean",
+ "expr":
"metrics_ReplicaPushDataHandshakeTime_Mean{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -7944,7 +7948,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_ReplicaPushDataHandshakeTime_Max",
+ "expr":
"metrics_ReplicaPushDataHandshakeTime_Max{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -8036,7 +8040,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_PrimaryRegionStartTime_Mean",
+ "expr":
"metrics_PrimaryRegionStartTime_Mean{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -8128,7 +8132,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_PrimaryRegionStartTime_Max",
+ "expr":
"metrics_PrimaryRegionStartTime_Max{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -8220,7 +8224,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_ReplicaRegionStartTime_Mean",
+ "expr":
"metrics_ReplicaRegionStartTime_Mean{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -8312,7 +8316,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_ReplicaRegionStartTime_Max",
+ "expr":
"metrics_ReplicaRegionStartTime_Max{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -8404,7 +8408,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_PrimaryRegionFinishTime_Mean",
+ "expr":
"metrics_PrimaryRegionFinishTime_Mean{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -8496,7 +8500,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_PrimaryRegionFinishTime_Max",
+ "expr":
"metrics_PrimaryRegionFinishTime_Max{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -8588,7 +8592,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_ReplicaRegionFinishTime_Mean",
+ "expr":
"metrics_ReplicaRegionFinishTime_Mean{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -8680,7 +8684,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_ReplicaRegionFinishTime_Max",
+ "expr":
"metrics_ReplicaRegionFinishTime_Max{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -8771,7 +8775,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_PushDataHandshakeFailCount_Count",
+ "expr":
"metrics_PushDataHandshakeFailCount_Count{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -8862,7 +8866,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_RegionStartFailCount_Count",
+ "expr":
"metrics_RegionStartFailCount_Count{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -8953,7 +8957,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_RegionStartFailCount_Count",
+ "expr":
"metrics_RegionStartFailCount_Count{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -9517,7 +9521,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "builder",
- "expr": "metrics_ActiveCreditStreamCount_Value",
+ "expr":
"metrics_ActiveCreditStreamCount_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -9608,7 +9612,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "builder",
- "expr": "metrics_ActiveMapPartitionCount_Value",
+ "expr":
"metrics_ActiveMapPartitionCount_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -9714,7 +9718,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_JVMCPUTime_Value",
+ "expr": "metrics_JVMCPUTime_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -9807,7 +9811,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_LastMinuteSystemLoad_Value",
+ "expr":
"metrics_LastMinuteSystemLoad_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -9900,7 +9904,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_DeviceOSFreeBytes_Value",
+ "expr":
"metrics_DeviceOSFreeBytes_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -9993,7 +9997,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_DeviceCelebornFreeBytes_Value",
+ "expr":
"metrics_DeviceCelebornFreeBytes_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -10086,7 +10090,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_AvailableProcessors_Value",
+ "expr":
"metrics_AvailableProcessors_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -10192,7 +10196,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_diskFileCount_Value",
+ "expr": "metrics_diskFileCount_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -10285,7 +10289,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_diskBytesWritten_Value",
+ "expr":
"metrics_diskBytesWritten_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -10377,7 +10381,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_hdfsFileCount_Value",
+ "expr": "metrics_hdfsFileCount_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -10470,7 +10474,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_hdfsBytesWritten_Value",
+ "expr":
"metrics_hdfsBytesWritten_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
@@ -10490,6 +10494,31 @@
"tags": [],
"templating": {
"list": [
+ {
+ "current": {},
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "definition": "label_values(metrics_JVMCPUTime_Value, instance)",
+ "hide": 0,
+ "includeAll": true,
+ "label": "instance",
+ "mapping": "",
+ "mappingOnLegend": true,
+ "multi": true,
+ "name": "instance",
+ "options": [],
+ "query": {
+ "query": "label_values(metrics_JVMCPUTime_Value, instance)",
+ "refId": "StandardVariableQuery"
+ },
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "sort": 1,
+ "type": "query"
+ },
{
"current": {
"selected": false,
@@ -10527,4 +10556,4 @@
"uid": "U_qgru_7z",
"version": 2,
"weekStart": ""
-}
\ No newline at end of file
+}
diff --git
a/common/src/main/scala/org/apache/celeborn/common/metrics/source/AbstractSource.scala
b/common/src/main/scala/org/apache/celeborn/common/metrics/source/AbstractSource.scala
index 58aa71bee..95562c911 100644
---
a/common/src/main/scala/org/apache/celeborn/common/metrics/source/AbstractSource.scala
+++
b/common/src/main/scala/org/apache/celeborn/common/metrics/source/AbstractSource.scala
@@ -64,7 +64,14 @@ abstract class AbstractSource(conf: CelebornConf, role:
String)
ThreadUtils.newDaemonSingleThreadScheduledExecutor("worker-metrics-cleaner")
val roleLabel: (String, String) = "role" -> role
- val staticLabels: Map[String, String] = conf.metricsExtraLabels + roleLabel
+ val instanceLabel: Map[String, String] = role match {
+ case Role.MASTER =>
+ Map("instance" -> s"${Utils.localHostName(conf)}:${conf.masterHttpPort}")
+ case Role.WORKER =>
+ Map("instance" -> s"${Utils.localHostName(conf)}:${conf.workerHttpPort}")
+ case _ => Map.empty
+ }
+ val staticLabels: Map[String, String] = conf.metricsExtraLabels + roleLabel
++ instanceLabel
val staticLabelsString: String = MetricLabels.labelString(staticLabels)
val applicationLabel = "applicationId"
diff --git
a/common/src/main/scala/org/apache/celeborn/common/metrics/source/Role.scala
b/common/src/main/scala/org/apache/celeborn/common/metrics/source/Role.scala
new file mode 100644
index 000000000..50b509643
--- /dev/null
+++ b/common/src/main/scala/org/apache/celeborn/common/metrics/source/Role.scala
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.celeborn.common.metrics.source
+
+object Role {
+ val MASTER = "master"
+ val WORKER = "worker"
+}
diff --git a/common/src/test/scala/org/apache/celeborn/common/metrics/source/CelebornSourceSuite.scala b/common/src/test/scala/org/apache/celeborn/common/metrics/source/CelebornSourceSuite.scala
index 89330bee9..d6eeb2358 100644
--- a/common/src/test/scala/org/apache/celeborn/common/metrics/source/CelebornSourceSuite.scala
+++ b/common/src/test/scala/org/apache/celeborn/common/metrics/source/CelebornSourceSuite.scala
@@ -24,11 +24,15 @@ class CelebornSourceSuite extends CelebornFunSuite {
test("test getMetrics with customized label") {
val conf = new CelebornConf()
- createAbstractSourceAndCheck(conf, "")
+ createAbstractSourceAndCheck(conf, "", Role.MASTER)
+ createAbstractSourceAndCheck(conf, "", Role.WORKER)
}
- def createAbstractSourceAndCheck(conf: CelebornConf, extraLabels: String): Unit = {
- val mockSource = new AbstractSource(conf, "mock") {
+ def createAbstractSourceAndCheck(
+ conf: CelebornConf,
+ extraLabels: String,
+ role: String = "mock"): Unit = {
+ val mockSource = new AbstractSource(conf, role) {
override def sourceName: String = "mockSource"
}
val user1 = Map("user" -> "user1")
@@ -55,12 +59,17 @@ class CelebornSourceSuite extends CelebornFunSuite {
if (extraLabels.nonEmpty) {
extraLabelsStr = extraLabels + ","
}
- val exp1 = s"""metrics_Gauge1_Value{${extraLabelsStr}role="mock"} 1000"""
- val exp2 = s"""metrics_Gauge2_Value{${extraLabelsStr}role="mock",user="user1"} 2000"""
- val exp3 = s"""metrics_Counter1_Count{${extraLabelsStr}role="mock"} 3000"""
- val exp4 = s"""metrics_Counter2_Count{${extraLabelsStr}role="mock",user="user2"} 4000"""
- val exp5 = s"""metrics_Timer1_Count{${extraLabelsStr}role="mock"} 1"""
- val exp6 = s"""metrics_Timer2_Count{${extraLabelsStr}role="mock",user="user3"} 1"""
+ val instanceLabelStr =
+ mockSource.instanceLabel.map(kv => s"""${kv._1}="${kv._2}",""").mkString(",")
+ val exp1 = s"""metrics_Gauge1_Value{${extraLabelsStr}${instanceLabelStr}role="$role"} 1000"""
+ val exp2 =
+ s"""metrics_Gauge2_Value{${extraLabelsStr}${instanceLabelStr}role="$role",user="user1"} 2000"""
+ val exp3 = s"""metrics_Counter1_Count{${extraLabelsStr}${instanceLabelStr}role="$role"} 3000"""
+ val exp4 =
+ s"""metrics_Counter2_Count{${extraLabelsStr}${instanceLabelStr}role="$role",user="user2"} 4000"""
+ val exp5 = s"""metrics_Timer1_Count{${extraLabelsStr}${instanceLabelStr}role="$role"} 1"""
+ val exp6 =
+ s"""metrics_Timer2_Count{${extraLabelsStr}${instanceLabelStr}role="$role",user="user3"} 1"""
assert(res.contains(exp1))
assert(res.contains(exp2))
diff --git a/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala b/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala
index 544c9656b..d0623a20b 100644
--- a/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala
+++ b/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala
@@ -39,7 +39,7 @@ import org.apache.celeborn.common.identity.UserIdentifier
import org.apache.celeborn.common.internal.Logging
import org.apache.celeborn.common.meta.{DiskInfo, WorkerInfo, WorkerStatus}
import org.apache.celeborn.common.metrics.MetricsSystem
-import org.apache.celeborn.common.metrics.source.{JVMCPUSource, JVMSource, ResourceConsumptionSource, SystemMiscSource, ThreadPoolSource}
+import org.apache.celeborn.common.metrics.source.{JVMCPUSource, JVMSource, ResourceConsumptionSource, Role, SystemMiscSource, ThreadPoolSource}
import org.apache.celeborn.common.network.CelebornRackResolver
import org.apache.celeborn.common.network.protocol.TransportMessage
import org.apache.celeborn.common.protocol._
@@ -67,15 +67,15 @@ private[celeborn] class Master(
MetricsSystem.createMetricsSystem(serviceName, conf)
// init and register master metrics
private val resourceConsumptionSource =
- new ResourceConsumptionSource(conf, MetricsSystem.ROLE_MASTER)
- private val threadPoolSource = ThreadPoolSource(conf, MetricsSystem.ROLE_MASTER)
+ new ResourceConsumptionSource(conf, Role.MASTER)
+ private val threadPoolSource = ThreadPoolSource(conf, Role.MASTER)
private val masterSource = new MasterSource(conf)
metricsSystem.registerSource(resourceConsumptionSource)
metricsSystem.registerSource(masterSource)
metricsSystem.registerSource(threadPoolSource)
- metricsSystem.registerSource(new JVMSource(conf, MetricsSystem.ROLE_MASTER))
- metricsSystem.registerSource(new JVMCPUSource(conf, MetricsSystem.ROLE_MASTER))
- metricsSystem.registerSource(new SystemMiscSource(conf, MetricsSystem.ROLE_MASTER))
+ metricsSystem.registerSource(new JVMSource(conf, Role.MASTER))
+ metricsSystem.registerSource(new JVMCPUSource(conf, Role.MASTER))
+ metricsSystem.registerSource(new SystemMiscSource(conf, Role.MASTER))
private val bindPreferIP: Boolean = conf.bindPreferIP
private val authEnabled = conf.authEnabled
diff --git a/master/src/main/scala/org/apache/celeborn/service/deploy/master/MasterSource.scala b/master/src/main/scala/org/apache/celeborn/service/deploy/master/MasterSource.scala
index 970fa4f5b..b2e725244 100644
--- a/master/src/main/scala/org/apache/celeborn/service/deploy/master/MasterSource.scala
+++ b/master/src/main/scala/org/apache/celeborn/service/deploy/master/MasterSource.scala
@@ -18,10 +18,9 @@
package org.apache.celeborn.service.deploy.master
import org.apache.celeborn.common.CelebornConf
-import org.apache.celeborn.common.metrics.MetricsSystem
-import org.apache.celeborn.common.metrics.source.AbstractSource
+import org.apache.celeborn.common.metrics.source.{AbstractSource, Role}
-class MasterSource(conf: CelebornConf) extends AbstractSource(conf, MetricsSystem.ROLE_MASTER) {
+class MasterSource(conf: CelebornConf) extends AbstractSource(conf, Role.MASTER) {
override val sourceName = "master"
import MasterSource._
diff --git a/service/src/main/scala/org/apache/celeborn/common/metrics/MetricsSystem.scala b/service/src/main/scala/org/apache/celeborn/common/metrics/MetricsSystem.scala
index ab919bb14..3baab6ebf 100644
--- a/service/src/main/scala/org/apache/celeborn/common/metrics/MetricsSystem.scala
+++ b/service/src/main/scala/org/apache/celeborn/common/metrics/MetricsSystem.scala
@@ -177,9 +177,6 @@ object MetricsSystem {
val SINK_REGEX: Regex = "^sink\\.(.+)\\.(.+)".r
val SOURCE_REGEX: Regex =
"^org.apache.celeborn.common.metrics.source\\.(.+)\\.(.+)".r
- val ROLE_WORKER = "Worker"
- val ROLE_MASTER = "Master"
-
private[this] val MINIMAL_POLL_UNIT = TimeUnit.SECONDS
private[this] val MINIMAL_POLL_PERIOD = 1
diff --git a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
index e3cbd8642..d0371e2bd 100644
--- a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
+++ b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
@@ -37,7 +37,7 @@ import org.apache.celeborn.common.identity.UserIdentifier
import org.apache.celeborn.common.internal.Logging
import org.apache.celeborn.common.meta.{DiskInfo, WorkerInfo, WorkerPartitionLocationInfo}
import org.apache.celeborn.common.metrics.MetricsSystem
-import org.apache.celeborn.common.metrics.source.{JVMCPUSource, JVMSource, ResourceConsumptionSource, SystemMiscSource, ThreadPoolSource}
+import org.apache.celeborn.common.metrics.source.{JVMCPUSource, JVMSource, ResourceConsumptionSource, Role, SystemMiscSource, ThreadPoolSource}
import org.apache.celeborn.common.network.{CelebornRackResolver, TransportContext}
import org.apache.celeborn.common.network.sasl.SaslServerBootstrap
import org.apache.celeborn.common.network.server.TransportServerBootstrap
@@ -73,14 +73,14 @@ private[celeborn] class Worker(
MetricsSystem.createMetricsSystem(serviceName, conf)
val workerSource = new WorkerSource(conf)
private val resourceConsumptionSource =
- new ResourceConsumptionSource(conf, MetricsSystem.ROLE_WORKER)
- private val threadPoolSource = ThreadPoolSource(conf, MetricsSystem.ROLE_WORKER)
+ new ResourceConsumptionSource(conf, Role.WORKER)
+ private val threadPoolSource = ThreadPoolSource(conf, Role.WORKER)
metricsSystem.registerSource(workerSource)
metricsSystem.registerSource(threadPoolSource)
metricsSystem.registerSource(resourceConsumptionSource)
- metricsSystem.registerSource(new JVMSource(conf, MetricsSystem.ROLE_WORKER))
- metricsSystem.registerSource(new JVMCPUSource(conf, MetricsSystem.ROLE_WORKER))
- metricsSystem.registerSource(new SystemMiscSource(conf, MetricsSystem.ROLE_WORKER))
+ metricsSystem.registerSource(new JVMSource(conf, Role.WORKER))
+ metricsSystem.registerSource(new JVMCPUSource(conf, Role.WORKER))
+ metricsSystem.registerSource(new SystemMiscSource(conf, Role.WORKER))
private val topResourceConsumptionCount =
conf.metricsWorkerAppTopResourceConsumptionCount
private val topApplicationUserIdentifiers =
diff --git a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
index 15096fadc..26532a6bf 100644
--- a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
+++ b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
@@ -25,12 +25,11 @@ import scala.collection.JavaConverters._
import com.google.common.collect.Sets
import org.apache.celeborn.common.CelebornConf
-import org.apache.celeborn.common.metrics.MetricsSystem
-import org.apache.celeborn.common.metrics.source.AbstractSource
+import org.apache.celeborn.common.metrics.source.{AbstractSource, Role}
import org.apache.celeborn.common.network.client.TransportClient
import org.apache.celeborn.common.util.{JavaUtils, Utils}
-class WorkerSource(conf: CelebornConf) extends AbstractSource(conf, MetricsSystem.ROLE_WORKER) {
+class WorkerSource(conf: CelebornConf) extends AbstractSource(conf, Role.WORKER) {
override val sourceName = "worker"
val appActiveConnections: ConcurrentHashMap[String, util.Set[String]] =