navinko commented on code in PR #10398:
URL: https://github.com/apache/ozone/pull/10398#discussion_r3337128515
##########
hadoop-ozone/dist/src/main/compose/common/grafana/dashboards/Ozone - Container
Balancer Metrics.json:
##########
@@ -0,0 +1,1379 @@
+{
+ "annotations": [
+ {
+ "kind": "AnnotationQuery",
+ "spec": {
+ "builtIn": true,
+ "enable": true,
+ "hide": true,
+ "iconColor": "",
+ "name": "Annotations & Alerts",
+ "query": {
+ "group": "grafana",
+ "kind": "DataQuery",
+ "spec": {},
+ "version": "v0"
+ }
+ }
+ }
+ ],
+ "cursorSync": "Crosshair",
+ "description": "Comprehensive tracking of Ozone cluster balancing
operations. Monitors real-time DataNode capacity convergence, current iteration
health (Scheduled vs Completed), and lifetime data movement metrics.",
+ "editable": true,
+ "elements": {
+ "panel-1": {
+ "kind": "Panel",
+ "spec": {
+ "data": {
+ "kind": "QueryGroup",
+ "spec": {
+ "queries": [
+ {
+ "kind": "PanelQuery",
+ "spec": {
+ "hidden": false,
+ "query": {
+ "datasource": {
+ "name": "${datasource}"
+ },
+ "group": "prometheus",
+ "kind": "DataQuery",
+ "spec": {
+ "editorMode": "code",
+ "expr":
"sum(container_balancer_metrics_num_datanodes_unbalanced)",
+ "legendFormat": "Unbalanced DataNodes",
+ "range": false
+ },
+ "version": "v0"
+ },
+ "refId": "A"
+ }
+ }
+ ],
+ "queryOptions": {},
+ "transformations": []
+ }
+ },
+ "description": "Tracks the total number of DataNodes whose capacity
usage falls outside the configured cluster balance threshold. A healthy, fully
balanced cluster should ideally maintain a value of 0.",
+ "id": 1,
+ "links": [],
+ "title": "Unbalanced DataNodes",
+ "vizConfig": {
+ "group": "stat",
+ "kind": "VizConfig",
+ "spec": {
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": 0
+ },
+ {
+ "color": "orange",
+ "value": 1
+ },
+ {
+ "color": "red",
+ "value": 5
+ }
+ ]
+ },
+ "unit": "none"
+ },
+ "overrides": []
+ },
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "percentChangeColorMode": "standard",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showPercentChange": false,
+ "textMode": "auto",
+ "wideLayout": true
+ }
+ },
+ "version": "13.0.1+security-01"
+ }
+ }
+ },
+ "panel-2": {
+ "kind": "Panel",
+ "spec": {
+ "data": {
+ "kind": "QueryGroup",
+ "spec": {
+ "queries": [
+ {
+ "kind": "PanelQuery",
+ "spec": {
+ "hidden": false,
+ "query": {
+ "datasource": {
+ "name": "${datasource}"
+ },
+ "group": "prometheus",
+ "kind": "DataQuery",
+ "spec": {
+ "editorMode": "code",
+ "expr":
"sum(container_balancer_metrics_data_size_unbalanced_gb * 1024 * 1024 * 1024)",
+ "legendFormat": "Total Unbalanced Data Size",
+ "range": true
+ },
+ "version": "v0"
+ },
+ "refId": "A"
+ }
+ }
+ ],
+ "queryOptions": {},
+ "transformations": []
+ }
+ },
+ "description": "Represents the total volume of data in gigabytes
currently residing on over-utilized nodes that must be shifted to
under-utilized nodes to satisfy your configured container balancing
thresholds.",
+ "id": 2,
+ "links": [],
+ "title": "Cluster Unbalanced Data Size Over Time",
+ "vizConfig": {
+ "group": "timeseries",
+ "kind": "VizConfig",
+ "spec": {
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "left",
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "smooth",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "showValues": false,
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": 0
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": []
+ },
+ "options": {
+ "annotations": {
+ "clustering": -1,
+ "multiLane": false
+ },
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "hideZeros": false,
+ "mode": "single",
+ "sort": "none"
+ }
+ }
+ },
+ "version": "13.0.1+security-01"
+ }
+ }
+ },
+ "panel-3": {
+ "kind": "Panel",
+ "spec": {
+ "data": {
+ "kind": "QueryGroup",
+ "spec": {
+ "queries": [
+ {
+ "kind": "PanelQuery",
+ "spec": {
+ "hidden": false,
+ "query": {
+ "datasource": {
+ "name": "${datasource}"
+ },
+ "group": "prometheus",
+ "kind": "DataQuery",
+ "spec": {
+ "editorMode": "code",
+ "expr":
"sum(increase(container_balancer_metrics_data_size_moved_gb[$__range]))",
+ "legendFormat": "Moved Data Size (GB)",
+ "range": true
Review Comment:
Thanks for catching this!
For the latest-run stats, I was trying to get the delta using the total data
size moved metric "container_balancer_metrics_data_size_moved_gb".
- I was trying to apply rate with the aggregate function sum(), which makes the
query invalid; to fix that, it needs to be wrapped in the increase function.
I realised this is not even required, since we already have another metric for
the latest iteration:
"container_balancer_metrics_data_size_moved_gb_in_latest_iteration".
Fixed this now:
<img width="300" height="150" alt="image"
src="https://github.com/user-attachments/assets/7a076829-e25e-4548-b105-199f59384c1c"
/>
<img width="300" height="150" alt="image"
src="https://github.com/user-attachments/assets/b4a61675-ac7e-4c21-9bb0-c08e1fe99468"
/>
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]