sreejasahithi commented on code in PR #10398:
URL: https://github.com/apache/ozone/pull/10398#discussion_r3332340615
##########
hadoop-ozone/dist/src/main/compose/common/grafana/dashboards/Ozone - Container Balancer Metrics.json:
##########
@@ -0,0 +1,1379 @@
+{
+ "annotations": [
+ {
+ "kind": "AnnotationQuery",
+ "spec": {
+ "builtIn": true,
+ "enable": true,
+ "hide": true,
+ "iconColor": "",
+ "name": "Annotations & Alerts",
+ "query": {
+ "group": "grafana",
+ "kind": "DataQuery",
+ "spec": {},
+ "version": "v0"
+ }
+ }
+ }
+ ],
+ "cursorSync": "Crosshair",
+ "description": "Comprehensive tracking of Ozone cluster balancing
operations. Monitors real-time DataNode capacity convergence, current iteration
health (Scheduled vs Completed), and lifetime data movement metrics.",
+ "editable": true,
+ "elements": {
+ "panel-1": {
+ "kind": "Panel",
+ "spec": {
+ "data": {
+ "kind": "QueryGroup",
+ "spec": {
+ "queries": [
+ {
+ "kind": "PanelQuery",
+ "spec": {
+ "hidden": false,
+ "query": {
+ "datasource": {
+ "name": "${datasource}"
+ },
+ "group": "prometheus",
+ "kind": "DataQuery",
+ "spec": {
+ "editorMode": "code",
+ "expr":
"sum(container_balancer_metrics_num_datanodes_unbalanced)",
+ "legendFormat": "Unbalanced DataNodes",
+ "range": false
+ },
+ "version": "v0"
+ },
+ "refId": "A"
+ }
+ }
+ ],
+ "queryOptions": {},
+ "transformations": []
+ }
+ },
+ "description": "Tracks the total number of DataNodes whose capacity
usage falls outside the configured cluster balance threshold. A healthy, fully
balanced cluster should ideally maintain a value of 0.",
+ "id": 1,
+ "links": [],
+ "title": "Unbalanced DataNodes",
+ "vizConfig": {
+ "group": "stat",
+ "kind": "VizConfig",
+ "spec": {
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": 0
+ },
+ {
+ "color": "orange",
+ "value": 1
+ },
+ {
+ "color": "red",
+ "value": 5
+ }
+ ]
+ },
+ "unit": "none"
+ },
+ "overrides": []
+ },
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "percentChangeColorMode": "standard",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showPercentChange": false,
+ "textMode": "auto",
+ "wideLayout": true
+ }
+ },
+ "version": "13.0.1+security-01"
+ }
+ }
+ },
+ "panel-2": {
+ "kind": "Panel",
+ "spec": {
+ "data": {
+ "kind": "QueryGroup",
+ "spec": {
+ "queries": [
+ {
+ "kind": "PanelQuery",
+ "spec": {
+ "hidden": false,
+ "query": {
+ "datasource": {
+ "name": "${datasource}"
+ },
+ "group": "prometheus",
+ "kind": "DataQuery",
+ "spec": {
+ "editorMode": "code",
+ "expr":
"sum(container_balancer_metrics_data_size_unbalanced_gb * 1024 * 1024 * 1024)",
+ "legendFormat": "Total Unbalanced Data Size",
+ "range": true
+ },
+ "version": "v0"
+ },
+ "refId": "A"
+ }
+ }
+ ],
+ "queryOptions": {},
+ "transformations": []
+ }
+ },
+ "description": "Represents the total volume of data in gigabytes
currently residing on over-utilized nodes that must be shifted to
under-utilized nodes to satisfy your configured container balancing
thresholds.",
+ "id": 2,
+ "links": [],
+ "title": "Cluster Unbalanced Data Size Over Time",
+ "vizConfig": {
+ "group": "timeseries",
+ "kind": "VizConfig",
+ "spec": {
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "left",
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "smooth",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "showValues": false,
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": 0
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": []
+ },
+ "options": {
+ "annotations": {
+ "clustering": -1,
+ "multiLane": false
+ },
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "hideZeros": false,
+ "mode": "single",
+ "sort": "none"
+ }
+ }
+ },
+ "version": "13.0.1+security-01"
+ }
+ }
+ },
+ "panel-3": {
+ "kind": "Panel",
+ "spec": {
+ "data": {
+ "kind": "QueryGroup",
+ "spec": {
+ "queries": [
+ {
+ "kind": "PanelQuery",
+ "spec": {
+ "hidden": false,
+ "query": {
+ "datasource": {
+ "name": "${datasource}"
+ },
+ "group": "prometheus",
+ "kind": "DataQuery",
+ "spec": {
+ "editorMode": "code",
+ "expr":
"sum(increase(container_balancer_metrics_data_size_moved_gb[$__range]))",
+ "legendFormat": "Moved Data Size (GB)",
+ "range": true
Review Comment:
Should this be
`sum(container_balancer_metrics_data_size_moved_gb_in_latest_iteration)`
instead, since the title of this panel is 'Size Moved (Latest)'?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]