This is an automated email from the ASF dual-hosted git repository.
wusheng pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/skywalking-banyandb.git
The following commit(s) were added to refs/heads/main by this push:
new 75b06389 Add metrics section in ob doc (#543)
75b06389 is described below
commit 75b0638993b37caf4742e78861cea08b2bec1b10
Author: Gao Hongtao <[email protected]>
AuthorDate: Tue Sep 24 17:45:21 2024 +0800
Add metrics section in ob doc (#543)
---
docs/operation/grafana-cluster.json | 2513 +++++++++++++++++++++++++++++++++++
docs/operation/observability.md | 238 +++-
2 files changed, 2750 insertions(+), 1 deletion(-)
diff --git a/docs/operation/grafana-cluster.json
b/docs/operation/grafana-cluster.json
new file mode 100644
index 00000000..18606b8d
--- /dev/null
+++ b/docs/operation/grafana-cluster.json
@@ -0,0 +1,2513 @@
+{
+ "__inputs": [
+ {
+ "name": "DS_PROMETHEUS",
+ "label": "Prometheus",
+ "description": "",
+ "type": "datasource",
+ "pluginId": "prometheus",
+ "pluginName": "Prometheus"
+ }
+ ],
+ "__elements": {},
+ "__requires": [
+ {
+ "type": "grafana",
+ "id": "grafana",
+ "name": "Grafana",
+ "version": "11.2.0"
+ },
+ {
+ "type": "datasource",
+ "id": "prometheus",
+ "name": "Prometheus",
+ "version": "1.0.0"
+ },
+ {
+ "type": "panel",
+ "id": "stat",
+ "name": "Stat",
+ "version": ""
+ },
+ {
+ "type": "panel",
+ "id": "timeseries",
+ "name": "Time series",
+ "version": ""
+ }
+ ],
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": {
+ "type": "grafana",
+ "uid": "-- Grafana --"
+ },
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": true,
+ "fiscalYearStartMonth": 0,
+ "graphTooltip": 0,
+ "id": null,
+ "links": [],
+ "panels": [
+ {
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 0
+ },
+ "id": 6,
+ "panels": [],
+ "title": "Stats ($instance)",
+ "type": "row"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "The write rate is the number of write operations per
second. It is calculated by summing the total number of written operations for
measures and streams.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "wps"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 5,
+ "w": 8,
+ "x": 0,
+ "y": 1
+ },
+ "id": 8,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "percentChangeColorMode": "standard",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showPercentChange": false,
+ "textMode": "auto",
+ "wideLayout": true
+ },
+ "pluginVersion": "11.2.0",
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr": "sum(rate(banyandb_measure_total_written{job=~\"$job\",
instance=~\"$instance\"}[$__rate_interval])) +
sum(rate(banyandb_stream_tst_total_written{job=~\"$job\",
instance=~\"$instance\"}[$__rate_interval]))",
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "Write Rate",
+ "type": "stat"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "The total memory is the total memory available on the
system.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "bytes"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 5,
+ "w": 8,
+ "x": 8,
+ "y": 1
+ },
+ "id": 5,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "percentChangeColorMode": "standard",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showPercentChange": false,
+ "textMode": "auto",
+ "wideLayout": true
+ },
+ "pluginVersion": "11.2.0",
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr": "sum(banyandb_system_memory_state{job=~\"$job\",
instance=~\"$instance\",kind=\"total\"})",
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "Total Memory",
+ "type": "stat"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "The disk usage is the total amount of disk space used, in
bytes. If it approaches the disk capacity, it may indicate that the disk is almost full.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "bytes"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 5,
+ "w": 8,
+ "x": 16,
+ "y": 1
+ },
+ "id": 7,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "percentChangeColorMode": "standard",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showPercentChange": false,
+ "textMode": "auto",
+ "wideLayout": true
+ },
+ "pluginVersion": "11.2.0",
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr": "sum(banyandb_system_disk{job=~\"$job\",
instance=~\"$instance\",kind=\"used\"})",
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "Disk Usage",
+ "type": "stat"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "The query rate is the number of query operations per
second. It is the query rate on the liaison server.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "rps"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 5,
+ "w": 8,
+ "x": 0,
+ "y": 6
+ },
+ "id": 9,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "percentChangeColorMode": "standard",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showPercentChange": false,
+ "textMode": "auto",
+ "wideLayout": true
+ },
+ "pluginVersion": "11.2.0",
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr": "sum(rate(banyandb_liaison_grpc_total_started{job=~\"$job\",
instance=~\"$instance\", method=\"query\"}[$__rate_interval]))",
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "Query Rate",
+ "type": "stat"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "The total CPU is the total number of CPUs available on
the system.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 5,
+ "w": 8,
+ "x": 8,
+ "y": 6
+ },
+ "id": 2,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "percentChangeColorMode": "standard",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showPercentChange": false,
+ "textMode": "auto",
+ "wideLayout": true
+ },
+ "pluginVersion": "11.2.0",
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr": "sum(banyandb_system_cpu_num{job=~\"$job\",
instance=~\"$instance\"})",
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "Total CPU",
+ "type": "stat"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "The write and query errors rate is the number of write
and query errors per minute. It is calculated by summing the total number of
write and query errors from liaison and data servers.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 1
+ }
+ ]
+ },
+ "unit": "cpm"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 5,
+ "w": 8,
+ "x": 16,
+ "y": 6
+ },
+ "id": 27,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "percentChangeColorMode": "standard",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showPercentChange": false,
+ "textMode": "auto",
+ "wideLayout": true
+ },
+ "pluginVersion": "11.2.0",
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr":
"sum(rate(banyandb_liaison_grpc_total_err{job=~\"$job\",instance=~\"$instance\",method=\"query\"}[$__rate_interval])*60)
+
sum(rate(banyandb_liaison_grpc_total_stream_msg_sent_err{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*60)
+
sum(rate(banyandb_liaison_grpc_total_stream_msg_received_err{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*60)
+
sum(rate(banyandb_queue_sub_total_msg_sent_err{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*60)",
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "Write and Query Errors Rate",
+ "type": "stat"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "The etcd operation rate is the number of etcd operations
per second. It is calculated by summing the total number of etcd operations.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "cps"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 5,
+ "w": 8,
+ "x": 0,
+ "y": 11
+ },
+ "id": 28,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "percentChangeColorMode": "standard",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showPercentChange": false,
+ "textMode": "auto",
+ "wideLayout": true
+ },
+ "pluginVersion": "11.2.0",
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr":
"sum(rate(banyandb_liaison_grpc_total_registry_started{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))
+
sum(rate(banyandb_liaison_grpc_total_started{job=~\"$job\",instance=~\"$instance\",method!=\"query\"}[$__rate_interval]))",
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "Etcd Operation Rate",
+ "type": "stat"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "The active instances panel shows the number of active
instances in the BanyanDB cluster.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 5,
+ "w": 16,
+ "x": 8,
+ "y": 11
+ },
+ "id": 10,
+ "options": {
+ "legend": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "displayMode": "table",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "pluginVersion": "11.2.0",
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr": "sum(min_over_time(up{job=~\"$job\",
instance=~\"$instance\"}[$__rate_interval])) by (job)",
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "Active Instances",
+ "type": "timeseries"
+ },
+ {
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 16
+ },
+ "id": 4,
+ "panels": [],
+ "title": "Resource Usage ($instance)",
+ "type": "row"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "The CPU usage is the percentage of CPU used. If the CPU
usage is over 80%, it may indicate that the CPU is overloaded.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineStyle": {
+ "fill": "solid"
+ },
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "percentunit"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 11,
+ "x": 0,
+ "y": 17
+ },
+ "id": 1,
+ "options": {
+ "legend": {
+ "calcs": [
+ "lastNotNull",
+ "mean",
+ "max"
+ ],
+ "displayMode": "table",
+ "placement": "bottom",
+ "showLegend": true,
+ "sortBy": "Last *",
+ "sortDesc": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "pluginVersion": "11.2.0",
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "exemplar": false,
+ "expr": "max(rate(process_cpu_seconds_total{job=~\"$job\",
instance=~\"$instance\"}[$__rate_interval]) /
banyandb_system_cpu_num{job=~\"$job\", instance=~\"$instance\"}) by (job)",
+ "instant": false,
+ "interval": "",
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "CPU Usage",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "The RSS memory usage is the percentage of resident
memory used. If the memory usage is over 80%, it may indicate that the memory
is almost full.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "percentunit"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 13,
+ "x": 11,
+ "y": 17
+ },
+ "id": 3,
+ "options": {
+ "legend": {
+ "calcs": [
+ "lastNotNull",
+ "mean",
+ "max"
+ ],
+ "displayMode": "table",
+ "placement": "bottom",
+ "showLegend": true,
+ "sortBy": "Last *",
+ "sortDesc": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr":
"max(max_over_time(process_resident_memory_bytes{job=~\"$job\",
instance=~\"$instance\"}[$__rate_interval]) /
sum(banyandb_system_memory_state{job=~\"$job\", instance=~\"$instance\",
kind=\"total\"}) by (job,instance)) by(job)",
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "RSS Memory Usage ($instance)",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "The disk usage is the percentage of disk space used. If
the disk usage is over 80%, it may indicate that the disk is almost full.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "percentunit"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 11,
+ "x": 0,
+ "y": 25
+ },
+ "id": 11,
+ "options": {
+ "legend": {
+ "calcs": [
+ "lastNotNull",
+ "mean",
+ "max"
+ ],
+ "displayMode": "table",
+ "placement": "bottom",
+ "showLegend": true,
+ "sortBy": "Last *",
+ "sortDesc": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr": "max(sum(banyandb_system_disk{job=~\"$job\",
instance=~\"$instance\", kind=\"used\"}) /
sum(banyandb_system_disk{job=~\"$job\", instance=~\"$instance\",
kind=\"total\"})) by (job)",
+ "instant": false,
+ "interval": "",
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "Disk Usage ($instance)",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "The network usage is the number of bytes sent and
received per second.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "binBps"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 13,
+ "x": 11,
+ "y": 25
+ },
+ "id": 12,
+ "options": {
+ "legend": {
+ "calcs": [
+ "lastNotNull",
+ "mean",
+ "max"
+ ],
+ "displayMode": "table",
+ "placement": "bottom",
+ "showLegend": true,
+ "sortBy": "Last *",
+ "sortDesc": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr":
"sum(rate(banyandb_system_net_state{job=~\"$job\",instance=~\"$instance\",kind=\"bytes_recv\"}[$__rate_interval]))
by (name)",
+ "instant": false,
+ "legendFormat": "{{name}}-recv",
+ "range": true,
+ "refId": "A"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr":
"sum(rate(banyandb_system_net_state{job=~\"$job\",instance=~\"$instance\",kind=\"bytes_sent\"}[$__rate_interval]))
by (name)",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "{{name}}-sent",
+ "range": true,
+ "refId": "B"
+ }
+ ],
+ "title": "Network usage ($instance)",
+ "type": "timeseries"
+ },
+ {
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 33
+ },
+ "id": 13,
+ "panels": [],
+ "title": "Storage ($instance)",
+ "type": "row"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "The write rate is the number of write operations per
second. It is calculated by summing the total number of written operations for
measures and streams. It's grouped by the `group` tag.\n\nYou can view the
write rate of different instances to find out the hot instance.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 8,
+ "x": 0,
+ "y": 34
+ },
+ "id": 14,
+ "options": {
+ "legend": {
+ "calcs": [
+ "lastNotNull",
+ "mean",
+ "max"
+ ],
+ "displayMode": "table",
+ "placement": "bottom",
+ "showLegend": true,
+ "sortBy": "Last *",
+ "sortDesc": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr": "sum(rate(banyandb_measure_total_written{job=~\"$job\",
instance=~\"$instance\"}[$__rate_interval])) by (group)",
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr": "sum(rate(banyandb_stream_tst_total_written{job=~\"$job\",
instance=~\"$instance\"}[$__rate_interval])) by (group)",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "B"
+ }
+ ],
+ "title": "Write Rate",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "The query latency is the average query latency in
seconds. It is calculated by summing the total query latency and dividing by
the total number of queries.\n\nYou can view the query latency of different
instances to find out the instance with high query latency. Because BanyanDB
queries all instances, the query latency of the instance with high
query latency will affect the overall query latency.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 5
+ }
+ ]
+ },
+ "unit": "s"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 8,
+ "x": 8,
+ "y": 34
+ },
+ "id": 15,
+ "options": {
+ "legend": {
+ "calcs": [
+ "lastNotNull",
+ "mean",
+ "max"
+ ],
+ "displayMode": "table",
+ "placement": "bottom",
+ "showLegend": true,
+ "sortBy": "Last *",
+ "sortDesc": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr": "sum(rate(banyandb_liaison_grpc_total_latency{job=~\"$job\",
instance=~\"$instance\",method=\"query\"}[$__rate_interval])) by( group) /
sum(rate(banyandb_liaison_grpc_total_started{job=~\"$job\",
instance=~\"$instance\",method=\"query\"}[$__rate_interval])) by (group)",
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "Query Latency",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "The total data is the total number of data points stored
in BanyanDB. It's grouped by the `group` tag.\n\nYou can view the total data of
different instances to find out the instance with the most data points. If the
difference between the total data of different instances is too large, it may
indicate that the data is not evenly distributed.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 8,
+ "x": 16,
+ "y": 34
+ },
+ "id": 16,
+ "options": {
+ "legend": {
+ "calcs": [
+ "lastNotNull",
+ "mean",
+ "max"
+ ],
+ "displayMode": "table",
+ "placement": "bottom",
+ "showLegend": true,
+ "sortBy": "Last *",
+ "sortDesc": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr":
"sum(banyandb_measure_total_file_elements{job=~\"$job\",instance=~\"$instance\"})by(group)",
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr":
"sum(banyandb_stream_tst_total_file_elements{job=~\"$job\",instance=~\"$instance\"})by(group)",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "B"
+ }
+ ],
+ "title": "Total Data",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "The merge file rate is the number of merge file
operations per minute. It is calculated by summing the total number of merge
file operations. It's grouped by the `group` tag.\n\nIf the value surges, it
may indicate that too many small files are being merged. It may bring the following
problems:\n\n- Increase the disk I/O\n- Slow down the query performance\n-
Increase the CPU usage",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "rotrpm"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 8,
+ "x": 0,
+ "y": 40
+ },
+ "id": 17,
+ "options": {
+ "legend": {
+ "calcs": [
+ "lastNotNull",
+ "mean",
+ "max"
+ ],
+ "displayMode": "table",
+ "placement": "bottom",
+ "showLegend": true,
+ "sortBy": "Last *",
+ "sortDesc": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr":
"sum(rate(banyandb_measure_total_merge_loop_started{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))by(group)
* 60",
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr":
"sum(rate(banyandb_stream_tst_total_merge_loop_started{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))by(group)
* 60",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "B"
+ }
+ ],
+ "title": "Merge File Rate",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "The merge file latency is the average merge file latency
in seconds. It is calculated by summing the total merge file latency and
dividing by the total number of merge file operations. It's grouped by the
`group` tag.\n\nIf the value surges, it may indicate that the merge file
operation is slow. It may be caused by the high disk I/O and other resource
usage. It may bring the following problems:\n\n- Slow down the query performance\n-
Increase the CPU usage\n- Increase the disk I/O",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "s"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 8,
+ "x": 8,
+ "y": 40
+ },
+ "id": 18,
+ "options": {
+ "legend": {
+ "calcs": [
+ "lastNotNull",
+ "mean",
+ "max"
+ ],
+ "displayMode": "table",
+ "placement": "bottom",
+ "showLegend": true,
+ "sortBy": "Last *",
+ "sortDesc": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr":
"sum(rate(banyandb_measure_total_merge_latency{job=~\"$job\",
instance=~\"$instance\",type=\"file\"}[$__rate_interval]))by(group) /
sum(rate(banyandb_measure_total_merge_loop_started{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))by(group)",
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr":
"sum(rate(banyandb_stream_tst_total_merge_latency{job=~\"$job\",
instance=~\"$instance\",type=\"file\"}[$__rate_interval]))by(group) /
sum(rate(banyandb_stream_tst_total_merge_loop_started{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))by(group)",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "B"
+ }
+ ],
+ "title": "Merge File Latency",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "The merge file partitions is the average number of
partitions merged per merge file operation. It is calculated by summing the
total number of partitions merged and dividing by the total number of merge
file operations. It's grouped by the `group` tag.\n\nIf the value surges, it
may indicate that too many partitions are being merged. This may be because the
number of partitions is too large, which indicates that the server is under a
high write load.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 8,
+ "x": 16,
+ "y": 40
+ },
+ "id": 20,
+ "options": {
+ "legend": {
+ "calcs": [
+ "lastNotNull",
+ "mean",
+ "max"
+ ],
+ "displayMode": "table",
+ "placement": "bottom",
+ "showLegend": true,
+ "sortBy": "Last *",
+ "sortDesc": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr": "sum(rate(banyandb_measure_total_merged_parts{job=~\"$job\",
instance=~\"$instance\",type=\"file\"}[$__rate_interval]))by(group) /
sum(rate(banyandb_measure_total_merge_loop_started{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))by(group)",
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr":
"sum(rate(banyandb_stream_tst_total_merged_parts{job=~\"$job\",
instance=~\"$instance\",type=\"file\"}[$__rate_interval]))by(group) /
sum(rate(banyandb_stream_tst_total_merge_loop_started{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))by(group)",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "B"
+ }
+ ],
+ "title": "Merge File Partitions",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "The series write rate is the number of series write
operations per second. It is calculated by summing the total number of series
write operations for measures and streams. It's grouped by the `group`
tag.\n\nIf the value surges, it may indicate that the old series are being
updated frequently by the new series. It may be caused by the high cardinality
of the series and bring following problems:\n\n- Increase the series inverted
index size\n- Slow down the query per [...]
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "wps"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 7,
+ "w": 8,
+ "x": 0,
+ "y": 48
+ },
+ "id": 21,
+ "options": {
+ "legend": {
+ "calcs": [
+ "lastNotNull",
+ "mean",
+ "max"
+ ],
+ "displayMode": "table",
+ "placement": "bottom",
+ "showLegend": true,
+ "sortBy": "Last *",
+ "sortDesc": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr":
"sum(rate(banyandb_measure_inverted_index_total_updates{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))
by (group)",
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr":
"sum(rate(banyandb_stream_storage_inverted_index_total_updates{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))
by (group)",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "B"
+ }
+ ],
+ "title": "Series Write Rate",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "The series term search rate is the number of series term
search operations per second. It is calculated by summing the total number of
series term search operations for measures and streams. It's grouped by the
`group` tag.\n\nIf the value is too large, it may indicate that reading
operation fetch too many series. It may be caused by the high cardinality of
the series and bring following problems:\n\n- Slow down the query
performance\n- Increase the CPU usage\n- Inc [...]
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "rps"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 7,
+ "w": 8,
+ "x": 8,
+ "y": 48
+ },
+ "id": 25,
+ "options": {
+ "legend": {
+ "calcs": [
+ "lastNotNull",
+ "mean",
+ "max"
+ ],
+ "displayMode": "table",
+ "placement": "bottom",
+ "showLegend": true,
+ "sortBy": "Last *",
+ "sortDesc": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr":
"sum(rate(banyandb_stream_storage_inverted_index_total_term_searchers_started{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))
by (group)",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "B"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr":
"sum(rate(banyandb_measure_inverted_index_total_term_searchers_started{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))
by (group)",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "Series Term Search Rate",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "The total series is the total number of series stored in
BanyanDB. It's grouped by the `group` tag.\n\nIf the value is too large, it may
indicate a high cardinality of the series. It may bring the following
problems:\n\n- Increase the series inverted index size\n- Slow down the query
performance",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 7,
+ "w": 8,
+ "x": 16,
+ "y": 48
+ },
+ "id": 22,
+ "options": {
+ "legend": {
+ "calcs": [
+ "lastNotNull",
+ "mean",
+ "max"
+ ],
+ "displayMode": "table",
+ "placement": "bottom",
+ "showLegend": true,
+ "sortBy": "Last *",
+ "sortDesc": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr":
"sum(banyandb_measure_inverted_index_total_doc_count{job=~\"$job\",instance=~\"$instance\"})
by (group)",
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr":
"sum(banyandb_stream_storage_inverted_index_total_doc_count{job=~\"$job\",instance=~\"$instance\"})
by (group)",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "B"
+ }
+ ],
+ "title": "Total Series",
+ "type": "timeseries"
+ },
+ {
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 55
+ },
+ "id": 23,
+ "panels": [],
+ "title": "Stream Inverted Index ($instance)",
+ "type": "row"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "The write rate is the number of write operations per
second. It is calculated by summing the total number of written operations for
streams. It's grouped by the `group` tag.\n\nIf the value is too large, it may
indicate that too many data points are being indexed and bring following
problems:\n\n- Increase the inverted index size\n- Slow down the query
performance\n- Increase the CPU usage\n- Increase the memory usage",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "wps"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 7,
+ "w": 8,
+ "x": 0,
+ "y": 56
+ },
+ "id": 24,
+ "options": {
+ "legend": {
+ "calcs": [
+ "lastNotNull",
+ "mean",
+ "max"
+ ],
+ "displayMode": "table",
+ "placement": "bottom",
+ "showLegend": true,
+ "sortBy": "Last *",
+ "sortDesc": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr":
"sum(rate(banyandb_stream_tst_inverted_index_total_updates{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))
by (group)",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "B"
+ }
+ ],
+ "title": "Write Rate",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "The term search rate is the number of term search
operations per second. It is calculated by summing the total number of term
search operations for streams. It's grouped by the `group` tag.\n\nIf the value
is too large, it may indicate that read operations fetch too many data
points. It may bring following problems:\n\n- Slow down the query
performance\n- Increase the CPU usage\n- Increase the memory usage",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "rps"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 7,
+ "w": 8,
+ "x": 8,
+ "y": 56
+ },
+ "id": 19,
+ "options": {
+ "legend": {
+ "calcs": [
+ "lastNotNull",
+ "mean",
+ "max"
+ ],
+ "displayMode": "table",
+ "placement": "bottom",
+ "showLegend": true,
+ "sortBy": "Last *",
+ "sortDesc": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr":
"sum(rate(banyandb_stream_tst_inverted_index_total_term_searchers_started{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))
by (group)",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "B"
+ }
+ ],
+ "title": "Term Search Rate",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "The total documents is the total number of documents
stored in the stream inverted index. It's grouped by the `group` tag.\n\nIf the
value is too large, it may indicate that too many data points are being indexed
and bring following problems:\n\n- Increase the inverted index size\n- Slow
down the query performance\n- Increase the CPU usage\n- Increase the memory
usage",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 7,
+ "w": 8,
+ "x": 16,
+ "y": 56
+ },
+ "id": 26,
+ "options": {
+ "legend": {
+ "calcs": [
+ "lastNotNull",
+ "mean",
+ "max"
+ ],
+ "displayMode": "table",
+ "placement": "bottom",
+ "showLegend": true,
+ "sortBy": "Last *",
+ "sortDesc": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr":
"sum(banyandb_stream_tst_inverted_index_total_doc_count{job=~\"$job\",instance=~\"$instance\"})
by (group)",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "B"
+ }
+ ],
+ "title": "Total Documents",
+ "type": "timeseries"
+ }
+ ],
+ "schemaVersion": 39,
+ "tags": [],
+ "templating": {
+ "list": [
+ {
+ "current": {},
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "definition": "label_values(banyandb_system_up_time,job)",
+ "hide": 0,
+ "includeAll": false,
+ "multi": false,
+ "name": "job",
+ "options": [],
+ "query": {
+ "qryType": 1,
+ "query": "label_values(banyandb_system_up_time,job)",
+ "refId": "PrometheusVariableQueryEditor-VariableQuery"
+ },
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "sort": 0,
+ "type": "query"
+ },
+ {
+ "allValue": ".*",
+ "current": {},
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "definition":
"label_values(banyandb_system_up_time{job=\"$job\"},instance)",
+ "hide": 1,
+ "includeAll": true,
+ "multi": true,
+ "name": "instance",
+ "options": [],
+ "query": {
+ "qryType": 1,
+ "query":
"label_values(banyandb_system_up_time{job=\"$job\"},instance)",
+ "refId": "PrometheusVariableQueryEditor-VariableQuery"
+ },
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "sort": 0,
+ "type": "query"
+ }
+ ]
+ },
+ "time": {
+ "from": "now-1h",
+ "to": "now"
+ },
+ "timepicker": {},
+ "timezone": "browser",
+ "title": "BanyanDB Cluster",
+ "uid": "ddy81kbj931mof",
+ "version": 19,
+ "weekStart": ""
+}
\ No newline at end of file
diff --git a/docs/operation/observability.md b/docs/operation/observability.md
index 7119b4cf..2c6b330e 100644
--- a/docs/operation/observability.md
+++ b/docs/operation/observability.md
@@ -24,6 +24,238 @@ When query tracing is enabled, the slow query log won't be
generated.
## Metrics
+BanyanDB exposes metrics for monitoring and analysis. In this section, the
metric expressions use variables such as `$job` and `$instance`. `$job` is the
name of the BanyanDB collection job, and `$instance` is the name of the
BanyanDB instance.
+
+`$__rate_interval` is a Grafana variable that represents the rate interval. It
is used as the range window when calculating the rate of metrics.
+
+### Stats
+
+`Stats` metrics are used to monitor the overall status of BanyanDB. The
following metrics are available:
+
+#### Write Rate
+
+The write rate is the number of write operations per second. It is calculated
by summing the total number of written operations for measures and streams.
+
+**Expression**: `sum(rate(banyandb_measure_total_written{job=~\"$job\",
instance=~\"$instance\"}[$__rate_interval])) +
sum(rate(banyandb_stream_tst_total_written{job=~\"$job\",
instance=~\"$instance\"}[$__rate_interval]))`
+
+#### Total Memory
+
+The total memory is the total memory available on the system.
+
+**Expression**: `sum(banyandb_system_memory_state{job=~\"$job\",
instance=~\"$instance\",kind=\"total\"})`
+
+#### Disk Usage
+
+The disk usage is the percentage of disk space used. If the disk usage is over
80%, it may indicate that the disk is almost full.
+
+**Expression**: `sum(banyandb_system_disk{job=~\"$job\",
instance=~\"$instance\",kind=\"used\"})`
+
+#### Query Rate
+
+The query rate is the number of query operations per second. It is the query
rate on the liaison server.
+
+**Expression**: `sum(rate(banyandb_liaison_grpc_total_started{job=~\"$job\",
instance=~\"$instance\", method=\"query\"}[$__rate_interval]))`
+
+#### Total CPU
+
+The total CPU is the total number of CPUs available on the system.
+
+**Expression**: `sum(banyandb_system_cpu_num{job=~\"$job\",
instance=~\"$instance\"})`
+
+#### Write and Query Errors Rate
+
+The write and query errors rate is the number of write and query errors per
minute. It is calculated by summing the total number of write and query errors
from liaison and data servers.
+
+**Expression**:
`sum(rate(banyandb_liaison_grpc_total_err{job=~\"$job\",instance=~\"$instance\",method=\"query\"}[$__rate_interval])*60)
+
sum(rate(banyandb_liaison_grpc_total_stream_msg_sent_err{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*60)
+
sum(rate(banyandb_liaison_grpc_total_stream_msg_received_err{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*60)
+
sum(rate(banyandb_queue_sub_total_msg_sent_err{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*60)`
+
+#### Etcd Operation Rate
+
+The etcd operation rate is the number of etcd operations per second. It is
calculated by summing the total number of etcd operations.
+
+**Expression**:
`sum(rate(banyandb_liaison_grpc_total_registry_started{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))
+
sum(rate(banyandb_liaison_grpc_total_started{job=~\"$job\",instance=~\"$instance\",method!=\"query\"}[$__rate_interval]))`
+
+#### Active Instances
+
+The active instances is the number of active instances in the BanyanDB cluster.
+
+**Expression**: `sum(min_over_time(up{job=~\"$job\",
instance=~\"$instance\"}[$__rate_interval])) by (job)`
+
+### Resource Usage
+
+`Resource Usage` metrics are used to monitor the resource usage of BanyanDB on
the node. The following metrics are available:
+
+#### CPU Usage
+
+The CPU usage is the percentage of CPU used. If the CPU usage is over 80%, it
may indicate that the CPU is overloaded.
+
+**Expression**: `max(rate(process_cpu_seconds_total{job=~\"$job\",
instance=~\"$instance\"}[$__rate_interval]) /
banyandb_system_cpu_num{job=~\"$job\", instance=~\"$instance\"}) by (job)`
+
+#### RSS memory usage
+
+The RSS memory usage is the percentage of resident memory used. If the memory
usage is over 80%, it may indicate that the memory is almost full.
+
+**Expression**:
`max(max_over_time(process_resident_memory_bytes{job=~\"$job\",
instance=~\"$instance\"}[$__rate_interval]) /
sum(banyandb_system_memory_state{job=~\"$job\", instance=~\"$instance\",
kind=\"total\"}) by (job,instance)) by(job)`
+
+#### Disk Usage
+
+The disk usage is the percentage of disk space used. If the disk usage is over
80%, it may indicate that the disk is almost full.
+
+**Expression**: `max(sum(banyandb_system_disk{job=~\"$job\",
instance=~\"$instance\", kind=\"used\"}) /
sum(banyandb_system_disk{job=~\"$job\", instance=~\"$instance\",
kind=\"total\"})) by (job)`
+
+#### Network Usage
+
+The network usage is the number of bytes sent and received per second.
+
+**Expression1**:
`sum(rate(banyandb_system_net_state{job=~\"$job\",instance=~\"$instance\",kind=\"bytes_recv\"}[$__rate_interval]))
by (name)`
+
+**Expression2**:
`sum(rate(banyandb_system_net_state{job=~\"$job\",instance=~\"$instance\",kind=\"bytes_sent\"}[$__rate_interval]))
by (name)`
+
+### Storage
+
+`Storage` metrics are used to monitor the storage status of BanyanDB. The
following metrics are available:
+
+#### Write Rate
+
+The write rate is the number of write operations per second. It is calculated
by summing the total number of written operations for measures and streams.
It's grouped by the `group` tag.
+
+You can view the write rate of different instances to find out the hot instances.
+
+**Expression**: `sum(rate(banyandb_measure_total_written{job=~\"$job\",
instance=~\"$instance\"}[$__rate_interval])) by (group) +
sum(rate(banyandb_stream_tst_total_written{job=~\"$job\",
instance=~\"$instance\"}[$__rate_interval])) by (group)`
+
+#### Query Latency
+
+The query latency is the average query latency in seconds. It is calculated by
summing the total query latency and dividing by the total number of queries.
+
+You can view the query latency of different instances to find out the
instances with high query latency. Because BanyanDB queries all instances to
serve a request, an instance with high query latency will affect the overall
query latency.
+
+**Expression**: `sum(rate(banyandb_liaison_grpc_total_latency{job=~\"$job\",
instance=~\"$instance\",method=\"query\"}[$__rate_interval])) by( group) /
sum(rate(banyandb_liaison_grpc_total_started{job=~\"$job\",
instance=~\"$instance\",method=\"query\"}[$__rate_interval])) by (group)`
+
+#### Total Data
+
+The total data is the total number of data points stored in BanyanDB. It's
grouped by the `group` tag.
+
+You can view the total data of different instance to find out the instance
with high data points. If the difference between the total data of different
instances is too large, it may indicate that the data is not evenly distributed.
+
+**Expression1**:
`sum(banyandb_measure_total_file_elements{job=~\"$job\",instance=~\"$instance\"})by(group)`
+**Expression2**:
`sum(banyandb_stream_tst_total_file_elements{job=~\"$job\",instance=~\"$instance\"})by(group)`
+
+#### Merge File Rate
+
+The merge file rate is the number of merge file operations per minute. It is
calculated by summing the total number of merge file operations. It's grouped
by the `group` tag.
+
+If the value surges, it may indicate that too many small files are being
merged. It may bring following problems:
+
+- Increase the disk I/O
+- Slow down the query performance
+- Increase the CPU usage
+
+**Expression1**:
`sum(rate(banyandb_measure_total_merge_loop_started{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))by(group)
* 60`
+**Expression2**:
`sum(rate(banyandb_stream_tst_total_merge_loop_started{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))by(group)
* 60`
+
+#### Merge File Latency
+
+The merge file latency is the average merge file latency in seconds. It is
calculated by summing the total merge file latency and dividing by the total
number of merge file operations. It's grouped by the `group` tag.
+
+If the value surges, it may indicate that the merge file operation is slow. It
may be caused by the high disk I/O and other resource usage. It may bring
following problems:
+
+- Slow down the query performance
+- Increase the CPU usage
+- Increase the memory usage
+
+**Expression1**: `sum(rate(banyandb_measure_total_merge_latency{job=~\"$job\",
instance=~\"$instance\",type=\"file\"}[$__rate_interval]))by(group) /
sum(rate(banyandb_measure_total_merge_loop_started{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))by(group)`
+**Expression2**:
`sum(rate(banyandb_stream_tst_total_merge_latency{job=~\"$job\",
instance=~\"$instance\",type=\"file\"}[$__rate_interval]))by(group) /
sum(rate(banyandb_stream_tst_total_merge_loop_started{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))by(group)`
+
+#### Merge File Partitions
+
+The merge file partitions is the average number of partitions merged per merge
file operation. It is calculated by summing the total number of partitions
merged and dividing by the total number of merge file operations. It's grouped
by the `group` tag.
+
+If the value surges, it may indicate that too many partitions are being
merged. This may be because the number of partitions is too large, which
indicates that the server is under a high write load.
+
+**Expression1**: `sum(rate(banyandb_measure_total_merged_parts{job=~\"$job\",
instance=~\"$instance\",type=\"file\"}[$__rate_interval]))by(group) /
sum(rate(banyandb_measure_total_merge_loop_started{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))by(group)`
+
+**Expression2**:
`sum(rate(banyandb_stream_tst_total_merged_parts{job=~\"$job\",
instance=~\"$instance\",type=\"file\"}[$__rate_interval]))by(group) /
sum(rate(banyandb_stream_tst_total_merge_loop_started{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))by(group)`
+
+#### Series Write Rate
+
+The series write rate is the number of series write operations per second. It
is calculated by summing the total number of series write operations for
measures and streams. It's grouped by the `group` tag.
+
+If the value surges, it may indicate that the old series are being updated
frequently by the new series. It may be caused by the high cardinality of the
series and bring following problems:
+
+- Increase the series inverted index size
+- Slow down the query performance
+
+**Expression1**:
`sum(rate(banyandb_measure_inverted_index_total_updates{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))
by (group)`
+**Expression2**:
`sum(rate(banyandb_stream_storage_inverted_index_total_updates{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))
by (group)`
+
+#### Series Term Search Rate
+
+The series term search rate is the number of series term search operations per
second. It is calculated by summing the total number of series term search
operations for measures and streams. It's grouped by the `group` tag.
+
+If the value is too large, it may indicate that read operations fetch too
many series. It may be caused by the high cardinality of the series and bring
the following problems:
+
+- Slow down the query performance
+- Increase the CPU usage
+- Increase the memory usage
+
+**Expression1**:
`sum(rate(banyandb_stream_storage_inverted_index_total_term_searchers_started{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))
by (group)`
+**Expression2**:
`sum(rate(banyandb_measure_inverted_index_total_term_searchers_started{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))
by (group)`
+
+#### Total Series
+
+The total series is the total number of series stored in BanyanDB. It's
grouped by the `group` tag.
+
+If the value is too large, it may indicate a high cardinality of the series.
It may bring the following problems:
+
+- Increase the series inverted index size
+- Slow down the query performance
+
+**Expression1**:
`sum(banyandb_measure_inverted_index_total_doc_count{job=~\"$job\",instance=~\"$instance\"})
by (group)`
+**Expression2**:
`sum(banyandb_stream_storage_inverted_index_total_doc_count{job=~\"$job\",instance=~\"$instance\"})
by (group)`
+
+### Stream Inverted Index
+
+`Stream Inverted Index` metrics are used to monitor the stream inverted index
status of BanyanDB. The following metrics are available:
+
+#### Stream Inverted Index Write Rate
+
+The write rate is the number of write operations per second. It is calculated
by summing the total number of written operations for streams. It's grouped by
the `group` tag.
+
+If the value is too large, it may indicate that too many data points are being
indexed and bring following problems:
+
+- Increase the inverted index size
+- Slow down the query performance
+- Increase the CPU usage
+- Increase the memory usage
+
+**Expression**:
`sum(rate(banyandb_stream_tst_inverted_index_total_updates{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))
by (group)`
+
+#### Term Search Rate
+
+The term search rate is the number of term search operations per second. It is
calculated by summing the total number of term search operations for streams.
It's grouped by the `group` tag.
+
+If the value is too large, it may indicate that read operations fetch too
many data points. It may bring the following problems:
+
+- Slow down the query performance
+- Increase the CPU usage
+- Increase the memory usage
+
+**Expression**:
`sum(rate(banyandb_stream_tst_inverted_index_total_term_searchers_started{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))
by (group)`
+
+#### Total Documents
+
+The total documents is the total number of documents stored in the stream
inverted index. It's grouped by the `group` tag.
+
+If the value is too large, it may indicate that too many data points are being
indexed and bring following problems:
+
+- Increase the inverted index size
+- Slow down the query performance
+- Increase the CPU usage
+- Increase the memory usage
+
+**Expression**:
`sum(banyandb_stream_tst_inverted_index_total_doc_count{job=~\"$job\",instance=~\"$instance\"})
by (group)`
+
+## Metrics Providers
+
BanyanDB has built-in support for metrics collection. Currently, there are two
supported metrics provider: `prometheus` and `native`. These can be enabled
through `observability-modes` flag, allowing you to activate one or both of
them.
### Prometheus
@@ -32,7 +264,11 @@ Prometheus is auto enabled at run time, if no flag is
passed or if `promethus` i
When the Prometheus metrics provider is enabled, the metrics server listens on
port `2121`. This allows Prometheus to scrape metrics data from BanyanDB for
monitoring and analysis.
-### Self-observability
+#### Grafana Dashboard
+
+Check out the [BanyanDB Cluster Dashboard](grafana-cluster.json) for
monitoring BanyanDB metrics.
+
+### Native
If the `observability-modes` flag is set to `native`, the self-observability
metrics provider will be enabled. The some of metrics will be displayed in the
dashboard of [banyandb-ui](http://localhost:17913/)