Copilot commented on code in PR #1178:
URL:
https://github.com/apache/skywalking-banyandb/pull/1178#discussion_r3413798850
##########
test/cases/lifecycle/lifecycle.go:
##########
@@ -252,16 +252,29 @@ func verifyMigrationMetrics(reg
observability.MetricsRegistry) {
body := rec.Body.String()
// Last-run metrics (banyandb_lifecycle_last_run_timestamp_seconds +
// banyandb_lifecycle_last_run_success) are stamped by the deferred
- // recordLastRun() at the end of action(). A successful cycle must set
+ // recordLastRun() at the end of action() with the (remote_node,
+ // remote_role, remote_tier, group) label set, sourced from the
+ // cycle's last processed group. A successful cycle must set
// success=1 with a non-zero epoch; an empty value would mean the
// gauges were never registered (PreRun not run) or the action never
// reached the defer. Prometheus emits floats in scientific notation
// for large values like epoch seconds (e.g. 1.781007822e+09), so the
- // assertion accepts either fixed or scientific form.
-
gomega.Expect(body).To(gomega.MatchRegexp(`(?m)^banyandb_lifecycle_last_run_timestamp_seconds
(?:[1-9]\d{9}|[1-9]\.\d+e\+0?[89])`),
- "banyandb_lifecycle_last_run_timestamp_seconds must be set to a
non-zero epoch, got:\n"+body)
-
gomega.Expect(body).To(gomega.MatchRegexp(`(?m)^banyandb_lifecycle_last_run_success
1$`),
- "banyandb_lifecycle_last_run_success must be 1 after a
successful cycle, got:\n"+body)
+ // assertion accepts either fixed or scientific form. The metric
+ // names now carry labels — Prometheus' exposition format sorts label
+ // names alphabetically (group, remote_node, remote_role, remote_tier),
+ // and the regex requires all four to be present so a regression to
+ // the unlabeld form fails the regex. The label block is captured as
+ // a single `[^}]*` then each required label is asserted with a
Review Comment:
Typo in comment: "unlabeld" -> "unlabeled".
##########
test/cases/lifecycle/lifecycle.go:
##########
@@ -652,8 +665,33 @@ func crossSegmentTimestamps() (single, left, right
time.Time) {
// at runtime — no extra CLI flags needed beyond what the test setup already
// passes via SharedContext.MetadataFlags. See deriveSelfIdentity in
// banyand/backup/lifecycle/steps.go for the resolution rules.
+// runLifecycleMigration runs a single hot->warm lifecycle migration, pointing
+// the lifecycle service at the co-located data node. Returns the
MetricsRegistry
+// the lifecycle service registered its metrics with so the test can scrape
them.
+//
+// The integration test cluster has the data node bound to "localhost"
+// (pkg/test/setup/setup.go:host = "localhost") and its GrpcAddress
+// registered as `localhost:<port>`. The lifecycle CLI's resolveSelfIdentity
+// matches selfPodHost against the host portion of the registered
+// GrpcAddress, so we set POD_NAME=localhost for the duration of the
+// call (and restore the prior value on exit) so selfPodHostname()
+// returns "localhost" and matches the data node. In production this
+// is set by the K8s downward API to the lifecycle pod's actual pod
+// name (e.g. demo-banyandb-data-hot-0); the integration test uses
+// "localhost" because the data node's bind address is the loopback.
Review Comment:
This comment block is duplicated and still references deriveSelfIdentity
even though the implementation was renamed to resolveSelfIdentity and now keys
off POD_NAME. Keeping both copies makes the test harder to follow and risks it
drifting out of sync again.
##########
banyand/backup/lifecycle/steps.go:
##########
@@ -302,33 +329,59 @@ func parseGroup(
}
nsl, err := pub.ParseLabelSelector(nst.NodeSelector)
if err != nil {
- return nil, errors.WithMessagef(err, "failed to parse node
selector %s", nst.NodeSelector)
+ return nil, "", "", "", errors.WithMessagef(err, "failed to
parse node selector %s", nst.NodeSelector)
}
nodeSel := node.NewRoundRobinSelector("", metadata)
if ok, _ := nodeSel.OnInit([]schema.Kind{schema.KindGroup}); !ok {
- return nil, fmt.Errorf("failed to initialize node selector for
group %s", g.Metadata.Name)
+ return nil, "", "", "", fmt.Errorf("failed to initialize node
selector for group %s", g.Metadata.Name)
}
client := pub.NewWithoutMetadata(omr) //nolint:contextcheck // health
check goroutine uses context.Background()
// Stamp the lifecycle's self identity onto the publisher so the wire
// SenderNode / SenderRole / SenderTier fields and the parallel
- // banyandb_lifecycle_migration_* labels are populated. The three
- // values are derived from already-known inputs (the co-located data
- // node's gRPC address and the cluster's data-node registry) so the
- // fix needs no new CLI flags:
- // - SenderNode = the data node whose GrpcAddress matches the
- // lifecycle's --grpc-addr (i.e. the co-located data node). Its
- // Metadata.Name is the BanyanDB NodeID the receiver records as
- // remote_node.
- // - SenderRole = "lifecycle" (no Role enum entry; matches the
- // liaison's hard-coded "liaison" pattern in
pkg/cmdsetup/liaison.go).
- // - SenderTier = the matched data node's `type` label
- // (hot/warm/cold), which becomes the receiver's remote_tier.
- // Falls back to the lifecycle's own --node-labels when the co-located
- // data node isn't in the registry yet (cold start), and to empty
- // when neither is available — preserving the pre-fix behavior.
- senderNode, senderTier := deriveSelfIdentity(coLocatedDataNodeAddr,
nodeLabels, nodes)
- if senderNode != "" || senderTier != "" {
- client.SetSelfNode(senderNode, "lifecycle", senderTier)
+ // banyandb_lifecycle_migration_* labels are populated. The
+ // resolveSelfIdentity algorithm matches the lifecycle's own pod
+ // hostname (POD_NAME -> os.Hostname(), same precedence as
+ // nativeNodeContext at service.go:160-165) against the
+ // data-node registry's GrpcAddress with loopback-alias and
+ // port-strip normalization. The first matching registry entry is
+ // the co-located data pod; its Metadata.Name is the BanyanDB
+ // NodeID the receiver records as remote_node, and its
+ // Labels["type"] is the receiver's remote_tier. SenderRole is
+ // hard-coded to "lifecycle" to mirror the liaison's
+ // "liaison" pattern at pkg/cmdsetup/liaison.go:170-171.
+ //
+ // The (senderNode, "lifecycle", senderTier) tuple returned here is
+ // consumed by three downstream emissions, all sharing the same
+ // (remote_node, remote_role, remote_tier) label form: (a) the wire
+ // SenderNode/Role/Tier fields on every SendRequest
+ // (banyand/queue/queue.go:62-68), (b) the per-message
+ // banyandb_lifecycle_migration_* family emitted by the
+ // lifecycle-tier pub (file-sync:
banyand/queue/pub/chunked_sync.go:67-82;
+ // batch-write: banyand/queue/pub/batch.go:215, 271, 291-292, 421,
+ // 471-472, 488, 511, 520, 532), and (c) the cycle-level
+ // banyandb_lifecycle_cycles_total + last_run_* metrics stamped by
+ // the caller (process*Group). The two families describe different
+ // sides (sender vs destination) and are not cross-joinable — see
+ // the struct comment in service.go and CHANGES.md for the
+ // asymmetry.
+ selfHost := selfPodHostname()
+ senderNode, senderTier, resolvedOK := resolveSelfIdentity(selfHost,
nodes)
+ if resolvedOK {
+ senderRole = "lifecycle"
+ client.SetSelfNode(senderNode, senderRole, senderTier)
+ // Info log so operators can see which identity the agent
Review Comment:
Hard-coding the sender role string here duplicates lifecycleRoleName
(service.go) and makes it easier for the role label to drift between wire
stamping, metrics labels, and native node context. Using the package constant
keeps the sender role consistent across the lifecycle service.
##########
docs/operation/grafana-fodc-migration.json:
##########
@@ -0,0 +1,2322 @@
+{
+ "__inputs": [
+ {
+ "name": "DS_PROMETHEUS",
+ "label": "Prometheus",
+ "description": "",
+ "type": "datasource",
+ "pluginId": "prometheus",
+ "pluginName": "Prometheus"
+ }
+ ],
+ "__elements": {},
+ "__requires": [
+ {
+ "type": "grafana",
+ "id": "grafana",
+ "name": "Grafana",
+ "version": "11.2.0"
+ },
+ {
+ "type": "datasource",
+ "id": "prometheus",
+ "name": "Prometheus",
+ "version": "1.0.0"
+ },
+ {
+ "type": "panel",
+ "id": "stat",
+ "name": "Stat",
+ "version": ""
+ },
+ {
+ "type": "panel",
+ "id": "table",
+ "name": "Table",
+ "version": ""
+ },
+ {
+ "type": "panel",
+ "id": "timeseries",
+ "name": "Time series",
+ "version": ""
+ }
+ ],
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": {
+ "type": "grafana",
+ "uid": "-- Grafana --"
+ },
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": true,
+ "fiscalYearStartMonth": 0,
+ "graphTooltip": 0,
+ "id": null,
+ "links": [],
+ "panels": [
+ {
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 0
+ },
+ "id": 100,
+ "panels": [],
+ "title": "Migration Health Overview",
+ "type": "row"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "Time since the most recent lifecycle cycle completed
across all source pods/groups. Daily-batch SLA: red if older than 26h.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 93600
+ }
+ ]
+ },
+ "unit": "s"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 4,
+ "x": 0,
+ "y": 1
+ },
+ "id": 101,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "percentChangeColorMode": "standard",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showPercentChange": false,
+ "textMode": "auto",
+ "wideLayout": true
+ },
+ "pluginVersion": "11.2.0",
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "exemplar": false,
+ "expr": "time() -
max(banyandb_lifecycle_last_run_timestamp_seconds{job=~\"$job\",
group=~\"$group\"})",
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "Last Migration (age)",
+ "type": "stat"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "Lifecycle migration cycles completed in the last 24h
(sum of increase over banyandb_lifecycle_cycles_total).",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 4,
+ "x": 4,
+ "y": 1
+ },
+ "id": 102,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "percentChangeColorMode": "standard",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showPercentChange": false,
+ "textMode": "auto",
+ "wideLayout": true
+ },
+ "pluginVersion": "11.2.0",
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "exemplar": false,
+ "expr": "sum(increase(banyandb_lifecycle_cycles_total{job=~\"$job\",
group=~\"$group\"}[24h]))",
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "Cycles (24h)",
+ "type": "stat"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "Number of source pods whose most recent migration cycle
reported failure (last_run_success==0). Red if any.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 1
+ }
+ ]
+ },
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 4,
+ "x": 8,
+ "y": 1
+ },
+ "id": 103,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "percentChangeColorMode": "standard",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showPercentChange": false,
+ "textMode": "auto",
+ "wideLayout": true
+ },
+ "pluginVersion": "11.2.0",
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "exemplar": false,
+ "expr": "count(banyandb_lifecycle_last_run_success{job=~\"$job\"} ==
0) or vector(0)",
+ "instant": false,
Review Comment:
The "Last-run Failures" stat ignores the dashboard's $group selection. Since
last_run_* are now labeled by group, this panel will keep counting failures
from other groups even when a group is selected.
##########
docs/operation/grafana-fodc-migration.json:
##########
@@ -0,0 +1,2322 @@
+{
+ "__inputs": [
+ {
+ "name": "DS_PROMETHEUS",
+ "label": "Prometheus",
+ "description": "",
+ "type": "datasource",
+ "pluginId": "prometheus",
+ "pluginName": "Prometheus"
+ }
+ ],
+ "__elements": {},
+ "__requires": [
+ {
+ "type": "grafana",
+ "id": "grafana",
+ "name": "Grafana",
+ "version": "11.2.0"
+ },
+ {
+ "type": "datasource",
+ "id": "prometheus",
+ "name": "Prometheus",
+ "version": "1.0.0"
+ },
+ {
+ "type": "panel",
+ "id": "stat",
+ "name": "Stat",
+ "version": ""
+ },
+ {
+ "type": "panel",
+ "id": "table",
+ "name": "Table",
+ "version": ""
+ },
+ {
+ "type": "panel",
+ "id": "timeseries",
+ "name": "Time series",
+ "version": ""
+ }
+ ],
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": {
+ "type": "grafana",
+ "uid": "-- Grafana --"
+ },
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": true,
+ "fiscalYearStartMonth": 0,
+ "graphTooltip": 0,
+ "id": null,
+ "links": [],
+ "panels": [
+ {
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 0
+ },
+ "id": 100,
+ "panels": [],
+ "title": "Migration Health Overview",
+ "type": "row"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "Time since the most recent lifecycle cycle completed
across all source pods/groups. Daily-batch SLA: red if older than 26h.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 93600
+ }
+ ]
+ },
+ "unit": "s"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 4,
+ "x": 0,
+ "y": 1
+ },
+ "id": 101,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "percentChangeColorMode": "standard",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showPercentChange": false,
+ "textMode": "auto",
+ "wideLayout": true
+ },
+ "pluginVersion": "11.2.0",
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "exemplar": false,
+ "expr": "time() -
max(banyandb_lifecycle_last_run_timestamp_seconds{job=~\"$job\",
group=~\"$group\"})",
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "Last Migration (age)",
+ "type": "stat"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "Lifecycle migration cycles completed in the last 24h
(sum of increase over banyandb_lifecycle_cycles_total).",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 4,
+ "x": 4,
+ "y": 1
+ },
+ "id": 102,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "percentChangeColorMode": "standard",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showPercentChange": false,
+ "textMode": "auto",
+ "wideLayout": true
+ },
+ "pluginVersion": "11.2.0",
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "exemplar": false,
+ "expr": "sum(increase(banyandb_lifecycle_cycles_total{job=~\"$job\",
group=~\"$group\"}[24h]))",
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "Cycles (24h)",
+ "type": "stat"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "Number of source pods whose most recent migration cycle
reported failure (last_run_success==0). Red if any.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 1
+ }
+ ]
+ },
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 4,
+ "x": 8,
+ "y": 1
+ },
+ "id": 103,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "percentChangeColorMode": "standard",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showPercentChange": false,
+ "textMode": "auto",
+ "wideLayout": true
+ },
+ "pluginVersion": "11.2.0",
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "exemplar": false,
+ "expr": "count(banyandb_lifecycle_last_run_success{job=~\"$job\"} ==
0) or vector(0)",
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "Last-run Failures",
+ "type": "stat"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "Total bytes sent by the lifecycle publisher in the last
24h (file-sync sent_bytes).",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "bytes"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 4,
+ "x": 12,
+ "y": 1
+ },
+ "id": 104,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "percentChangeColorMode": "standard",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showPercentChange": false,
+ "textMode": "auto",
+ "wideLayout": true
+ },
+ "pluginVersion": "11.2.0",
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "exemplar": false,
+ "expr":
"sum(increase(banyandb_lifecycle_migration_sent_bytes{job=~\"$job\",
group=~\"$group\"}[24h]))",
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "Data Migrated (24h)",
+ "type": "stat"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "Total lifecycle migration errors in the last 24h. Red if
any.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 1
+ }
+ ]
+ },
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 4,
+ "x": 16,
+ "y": 1
+ },
+ "id": 105,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "percentChangeColorMode": "standard",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showPercentChange": false,
+ "textMode": "auto",
+ "wideLayout": true
+ },
+ "pluginVersion": "11.2.0",
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "exemplar": false,
+ "expr":
"sum(increase(banyandb_lifecycle_migration_total_err{job=~\"$job\"}[24h])) or
vector(0)",
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "Migration Errors (24h)",
+ "type": "stat"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "Distinct source pods currently emitting lifecycle
migration metrics.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 4,
+ "x": 20,
+ "y": 1
+ },
+ "id": 106,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "percentChangeColorMode": "standard",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showPercentChange": false,
+ "textMode": "auto",
+ "wideLayout": true
+ },
+ "pluginVersion": "11.2.0",
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "exemplar": false,
+ "expr": "count(count by
(pod_name)(banyandb_lifecycle_migration_total_finished{job=~\"$job\"}))",
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "Active Source Pods",
+ "type": "stat"
+ },
+ {
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 5
+ },
+ "id": 110,
+ "panels": [],
+ "title": "Cycle Status",
+ "type": "row"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "Completed migration cycles per source pod and group
(banyandb_lifecycle_cycles_total). This counter is attributed to the SOURCE
node only \u2014 its remote_* labels carry the lifecycle node's own (sender)
identity, not the migration destination, so there is no destination column
here. A single cycle may fan out to multiple destination tiers; see the Flows
table for the per-flow source\u2192dest breakdown.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "custom": {
+ "align": "auto",
+ "cellOptions": {
+ "type": "auto"
+ },
+ "inspect": false
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 6
+ },
+ "id": 111,
+ "options": {
+ "showHeader": true,
+ "cellHeight": "sm",
+ "footer": {
+ "show": false,
+ "reducer": [
+ "sum"
+ ],
+ "countRows": false,
+ "fields": ""
+ }
+ },
+ "pluginVersion": "11.2.0",
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "exemplar": false,
+ "expr": "sum by (node_type, pod_name,
group)(banyandb_lifecycle_cycles_total{job=~\"$job\", group=~\"$group\"})",
+ "format": "table",
+ "instant": true,
+ "legendFormat": "__auto",
+ "range": false,
+ "refId": "A"
+ }
+ ],
+ "transformations": [
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+ "Time": true,
+ "job": true,
+ "instance": true,
+ "pod": true,
+ "container_name": true,
+ "node_role": true,
+ "remote_role": true,
+ "remote_tier": true,
+ "remote_node": true,
+ "__name__": true
+ },
+ "renameByName": {
+ "node_type": "Src tier",
+ "pod_name": "Src pod",
+ "group": "Group",
+ "Value": "Cycles"
+ },
+ "indexByName": {
+ "node_type": 0,
+ "pod_name": 1,
+ "group": 2,
+ "Value": 3
+ }
+ }
+ }
+ ],
+ "title": "Cycle Ledger (per source pod \u00d7 group)",
+ "type": "table"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "Per source pod: age of the last completed cycle and its
success flag.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "custom": {
+ "align": "auto",
+ "cellOptions": {
+ "type": "auto"
+ },
+ "inspect": false
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ }
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Last run age"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "s"
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Success"
+ },
+ "properties": [
+ {
+ "id": "mappings",
+ "value": [
+ {
+ "type": "value",
+ "options": {
+ "1": {
+ "text": "OK",
+ "color": "green",
+ "index": 0
+ },
+ "0": {
+ "text": "FAIL",
+ "color": "red",
+ "index": 1
+ }
+ }
+ }
+ ]
+ },
+ {
+ "id": "custom.cellOptions",
+ "value": {
+ "type": "color-background",
+ "mode": "basic"
+ }
+ }
+ ]
+ }
+ ]
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 6
+ },
+ "id": 112,
+ "options": {
+ "showHeader": true,
+ "cellHeight": "sm",
+ "footer": {
+ "show": false,
+ "reducer": [
+ "sum"
+ ],
+ "countRows": false,
+ "fields": ""
+ }
+ },
+ "pluginVersion": "11.2.0",
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "exemplar": false,
+ "expr": "time() -
banyandb_lifecycle_last_run_timestamp_seconds{job=~\"$job\"}",
+ "instant": true,
+ "legendFormat": "__auto",
+ "range": false,
+ "refId": "A",
+ "format": "table"
Review Comment:
"Last Run per Source Pod" query A is missing the group matcher even though
banyandb_lifecycle_last_run_timestamp_seconds is now labeled by group. Without
filtering, the table can show pods whose last-run label tuple belongs to a
different group than the selected $group.
##########
CHANGES.md:
##########
@@ -7,8 +7,10 @@ Release Notes.
### Features
- Add two catalogs to the queue batch-write metrics so traffic is comparable
on both ends: a per-batch-stream **batch catalog**
(`total_batch_started`/`total_batch_finished`/`total_batch_latency`, buckets to
~300s) on `queue_pub`/`queue_sub` and the `lifecycle_migration` mirror, and a
per-message **message catalog**
(`total_message_started`/`total_message_finished`) on `queue_sub` (the
publisher's existing `total_*` already counts per message). All existing
`total_*` series are unchanged. Note: pub `total_batch_finished` means the
batch stream's final response was received, not that every message was acked —
true per-message pub acks are deferred to a follow-up that adds an
`accepted_count` field to `cluster.v1.SendResponse`.
- Redesign the queue (`queue_pub`/`queue_sub`) metrics around a uniform model:
keep only `total_started`, `total_finished`, `total_latency` (now a histogram)
and `total_err`, plus file-sync-only `sent_bytes` (pub) / `received_bytes`
(sub). Replace the `topic` label with `operation`
(`batch-write`/`file-sync`/`query`/`control`) and `group`, add an `error_type`
label on `total_err`, and add remote-endpoint labels
(`remote_node`/`remote_role`/`remote_tier`) so the liaison↔data (hot/warm/cold)
call graph can be reconstructed against `/cluster/topology` (`remote_node`
equals the node `metadata.name`). The sender's identity travels on the wire via
new `cluster.v1.SendRequest` (`group`,
`sender_node`/`sender_role`/`sender_tier`) and `SyncMetadata` (`sender_*`)
fields; pub-side `remote_role`/`remote_tier` are resolved from the connection
registry. Pre-marshaled (`[]byte`) payloads on the secondary-index sync path
(measure/stream series-index, stream local-index, trace sidx-series) carry
their business group explicitly on the bus message
(`bus.NewMessageWithNodeAndGroup`), so those `batch-write` metrics are labeled
by `group` instead of falling back to empty. [Breaking Change] The previous
`queue_*` metric and label names (`*_total_msg_*`, `queue_pub_send_*`,
`*_inflight_*`/retry/backoff gauges, `chunked_sync_*`/`chunk_reorder_*`, and
the `topic` label) are removed; update any dashboards/alerts accordingly.
-- Stamp the lifecycle's tier-migration publisher's identity onto the wire so
the receiving data node records a non-empty
`remote_node`/`remote_role`/`remote_tier` on its
`banyandb_queue_sub_total_finished` series. The lifecycle's `parseGroup`
resolves the lifecycle's self identity by matching its `--grpc-addr` (the
co-located data node's gRPC address) against the data-node registry —
`Metadata.Name` becomes `remote_node`, `Labels["type"]` becomes `remote_tier` —
and calls `SetSelfNode(senderNode, "lifecycle", senderTier)` on the migration
publisher. The label-superset match against `--node-labels` and a
`Labels["type"]`-only match are the cold-start and lifecycle-only-knows-tier
fallbacks. Mirrors the liaison's existing `SetSelfNode(node.NodeID, "liaison",
liaisonTier)` call in `pkg/cmdsetup/liaison.go:170-171`; no new CLI flags are
introduced.
+- Stamp the lifecycle's tier-migration publisher's identity onto the wire so
the receiving data node records a non-empty
`remote_node`/`remote_role`/`remote_tier` on its
`banyandb_queue_sub_total_finished` series. The lifecycle's `parseGroup`
resolves the lifecycle's self identity by matching the lifecycle pod's hostname
(POD_NAME via the K8s downward API, falling back to `os.Hostname()` — same
precedence as `nativeNodeContext` at `banyand/backup/lifecycle/service.go`)
against the data-node registry's `GrpcAddress` (host-portion match with
loopback-alias and IP-literal normalization, via `hostMatches` at
`banyand/backup/lifecycle/steps.go`) — `Metadata.Name` becomes `remote_node`,
`Labels["type"]` becomes `remote_tier` — and calls `SetSelfNode(senderNode,
"lifecycle", senderTier)` on the migration publisher. The previous
`--grpc-addr` address-match (Pass 1) and `--node-labels` superset-match (Pass
2-3) fallbacks are replaced by this single host-based match because they failed
on the production cluster where the data pod's `GrpcAddress` is a
headless-service FQDN but the lifecycle's `--grpc-addr` is the loopback.
Mirrors the liaison's existing `SetSelfNode(node.NodeID, "liaison",
liaisonTier)` call in `pkg/cmdsetup/liaison.go:170-171`; no new CLI flags are
introduced.
- Add `banyandb_lifecycle_last_run_timestamp_seconds` and
`banyandb_lifecycle_last_run_success` gauges to the lifecycle service for
at-a-glance health monitoring. `last_run_timestamp_seconds` records the
wall-clock epoch (in seconds) of the most recent migration cycle;
`last_run_success` is `1` on a nil error and `0` otherwise. Both are stamped by
a `defer` at the end of `action()` so every return path (success, error,
recovered panic) updates the pair atomically — dashboards can pin an "is the
lifecycle healthy" panel on the `last_run_success` value and a "when did it
last run" panel on the timestamp. Nil-gauge safe (no panic with a
`BypassRegistry` metrics backend).
+- Refactor the lifecycle cycle-level metrics
(`banyandb_lifecycle_cycles_total`,
`banyandb_lifecycle_last_run_timestamp_seconds`,
`banyandb_lifecycle_last_run_success`) to carry labels `remote_node`,
`remote_role`, `remote_tier`, `group`. The label form mirrors the per-message
`banyandb_lifecycle_migration_*` family emitted by the queue/pub lifecycle
publisher, but the two families describe DIFFERENT things: the cycle-level
series describe the SENDER (the lifecycle pod's co-located data pod, captured
at the cycle's last-seen group) while the per-message pub series describe the
DESTINATION (the node each chunk is sent to, resolved from `getNodeInfo`). They
share the same label form so dashboard matchers and regexes apply to both, but
they are NOT cross-joinable on `(group, node, role, tier)` because the `node`
label carries different meaning in each family. The cycle-level Inc is in
`recordCycleGroup` (per group processed); the cycle-level gauges are stamped
atomically at cycle end in
`recordLastRun` (which first Deletes the previous-tuple series to avoid
stale labeled series shadowing the new stamp, then Sets the new tuple with the
cycle's start time and the success flag). On the empty-cycle path both gauges
are stamped with all-empty labels. [Breaking Change] Update any alert/panel
that pinned the unlabeld form (e.g. `banyandb_lifecycle_last_run_success == 1`)
to use the labeled form (`banyandb_lifecycle_last_run_success{group!=""}`).
+- Remove `banyandb_lifecycle_self_identity_resolution_total`. The
regression-detection role moves to the now-labeled
`banyandb_lifecycle_cycles_total{remote_node!=""}` (an empty `remote_node`
series means the registry match failed for every group, the bug the old counter
caught), plus the existing receiver-side count of empty `remote_node` on
lifecycle `banyandb_queue_sub_total_finished` series. The wire-level
`cluster.v1.SendRequest` sender-identity fields are unchanged.
Review Comment:
Typo in release notes: "unlabeld" -> "unlabeled".
##########
docs/operation/grafana-fodc-migration.json:
##########
@@ -0,0 +1,2322 @@
+{
+ "__inputs": [
+ {
+ "name": "DS_PROMETHEUS",
+ "label": "Prometheus",
+ "description": "",
+ "type": "datasource",
+ "pluginId": "prometheus",
+ "pluginName": "Prometheus"
+ }
+ ],
+ "__elements": {},
+ "__requires": [
+ {
+ "type": "grafana",
+ "id": "grafana",
+ "name": "Grafana",
+ "version": "11.2.0"
+ },
+ {
+ "type": "datasource",
+ "id": "prometheus",
+ "name": "Prometheus",
+ "version": "1.0.0"
+ },
+ {
+ "type": "panel",
+ "id": "stat",
+ "name": "Stat",
+ "version": ""
+ },
+ {
+ "type": "panel",
+ "id": "table",
+ "name": "Table",
+ "version": ""
+ },
+ {
+ "type": "panel",
+ "id": "timeseries",
+ "name": "Time series",
+ "version": ""
+ }
+ ],
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": {
+ "type": "grafana",
+ "uid": "-- Grafana --"
+ },
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": true,
+ "fiscalYearStartMonth": 0,
+ "graphTooltip": 0,
+ "id": null,
+ "links": [],
+ "panels": [
+ {
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 0
+ },
+ "id": 100,
+ "panels": [],
+ "title": "Migration Health Overview",
+ "type": "row"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "Time since the most recent lifecycle cycle completed across all source pods/groups. Daily-batch SLA: red if older than 26h.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 93600
+ }
+ ]
+ },
+ "unit": "s"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 4,
+ "x": 0,
+ "y": 1
+ },
+ "id": 101,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "percentChangeColorMode": "standard",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showPercentChange": false,
+ "textMode": "auto",
+ "wideLayout": true
+ },
+ "pluginVersion": "11.2.0",
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "exemplar": false,
+ "expr": "time() - max(banyandb_lifecycle_last_run_timestamp_seconds{job=~\"$job\", group=~\"$group\"})",
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "Last Migration (age)",
+ "type": "stat"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "Lifecycle migration cycles completed in the last 24h (sum of increase over banyandb_lifecycle_cycles_total).",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 4,
+ "x": 4,
+ "y": 1
+ },
+ "id": 102,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "percentChangeColorMode": "standard",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showPercentChange": false,
+ "textMode": "auto",
+ "wideLayout": true
+ },
+ "pluginVersion": "11.2.0",
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "exemplar": false,
+ "expr": "sum(increase(banyandb_lifecycle_cycles_total{job=~\"$job\", group=~\"$group\"}[24h]))",
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "Cycles (24h)",
+ "type": "stat"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "Number of source pods whose most recent migration cycle reported failure (last_run_success==0). Red if any.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 1
+ }
+ ]
+ },
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 4,
+ "x": 8,
+ "y": 1
+ },
+ "id": 103,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "percentChangeColorMode": "standard",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showPercentChange": false,
+ "textMode": "auto",
+ "wideLayout": true
+ },
+ "pluginVersion": "11.2.0",
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "exemplar": false,
+ "expr": "count(banyandb_lifecycle_last_run_success{job=~\"$job\"} == 0) or vector(0)",
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "Last-run Failures",
+ "type": "stat"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "Total bytes sent by the lifecycle publisher in the last 24h (file-sync sent_bytes).",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "bytes"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 4,
+ "x": 12,
+ "y": 1
+ },
+ "id": 104,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "percentChangeColorMode": "standard",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showPercentChange": false,
+ "textMode": "auto",
+ "wideLayout": true
+ },
+ "pluginVersion": "11.2.0",
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "exemplar": false,
+ "expr": "sum(increase(banyandb_lifecycle_migration_sent_bytes{job=~\"$job\", group=~\"$group\"}[24h]))",
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "Data Migrated (24h)",
+ "type": "stat"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "Total lifecycle migration errors in the last 24h. Red if any.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 1
+ }
+ ]
+ },
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 4,
+ "x": 16,
+ "y": 1
+ },
+ "id": 105,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "percentChangeColorMode": "standard",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showPercentChange": false,
+ "textMode": "auto",
+ "wideLayout": true
+ },
+ "pluginVersion": "11.2.0",
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "exemplar": false,
+ "expr": "sum(increase(banyandb_lifecycle_migration_total_err{job=~\"$job\"}[24h])) or vector(0)",
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "Migration Errors (24h)",
+ "type": "stat"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "Distinct source pods currently emitting lifecycle migration metrics.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 4,
+ "x": 20,
+ "y": 1
+ },
+ "id": 106,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "percentChangeColorMode": "standard",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showPercentChange": false,
+ "textMode": "auto",
+ "wideLayout": true
+ },
+ "pluginVersion": "11.2.0",
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "exemplar": false,
+ "expr": "count(count by (pod_name)(banyandb_lifecycle_migration_total_finished{job=~\"$job\"}))",
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "Active Source Pods",
+ "type": "stat"
+ },
+ {
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 5
+ },
+ "id": 110,
+ "panels": [],
+ "title": "Cycle Status",
+ "type": "row"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "Completed migration cycles per source pod and group (banyandb_lifecycle_cycles_total). This counter is attributed to the SOURCE node only \u2014 its remote_* labels carry the lifecycle node's own (sender) identity, not the migration destination, so there is no destination column here. A single cycle may fan out to multiple destination tiers; see the Flows table for the per-flow source\u2192dest breakdown.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "custom": {
+ "align": "auto",
+ "cellOptions": {
+ "type": "auto"
+ },
+ "inspect": false
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 6
+ },
+ "id": 111,
+ "options": {
+ "showHeader": true,
+ "cellHeight": "sm",
+ "footer": {
+ "show": false,
+ "reducer": [
+ "sum"
+ ],
+ "countRows": false,
+ "fields": ""
+ }
+ },
+ "pluginVersion": "11.2.0",
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "exemplar": false,
+ "expr": "sum by (node_type, pod_name, group)(banyandb_lifecycle_cycles_total{job=~\"$job\", group=~\"$group\"})",
+ "format": "table",
+ "instant": true,
+ "legendFormat": "__auto",
+ "range": false,
+ "refId": "A"
+ }
+ ],
+ "transformations": [
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+ "Time": true,
+ "job": true,
+ "instance": true,
+ "pod": true,
+ "container_name": true,
+ "node_role": true,
+ "remote_role": true,
+ "remote_tier": true,
+ "remote_node": true,
+ "__name__": true
+ },
+ "renameByName": {
+ "node_type": "Src tier",
+ "pod_name": "Src pod",
+ "group": "Group",
+ "Value": "Cycles"
+ },
+ "indexByName": {
+ "node_type": 0,
+ "pod_name": 1,
+ "group": 2,
+ "Value": 3
+ }
+ }
+ }
+ ],
+ "title": "Cycle Ledger (per source pod \u00d7 group)",
+ "type": "table"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "Per source pod: age of the last completed cycle and its success flag.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "custom": {
+ "align": "auto",
+ "cellOptions": {
+ "type": "auto"
+ },
+ "inspect": false
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ }
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Last run age"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "s"
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Success"
+ },
+ "properties": [
+ {
+ "id": "mappings",
+ "value": [
+ {
+ "type": "value",
+ "options": {
+ "1": {
+ "text": "OK",
+ "color": "green",
+ "index": 0
+ },
+ "0": {
+ "text": "FAIL",
+ "color": "red",
+ "index": 1
+ }
+ }
+ }
+ ]
+ },
+ {
+ "id": "custom.cellOptions",
+ "value": {
+ "type": "color-background",
+ "mode": "basic"
+ }
+ }
+ ]
+ }
+ ]
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 6
+ },
+ "id": 112,
+ "options": {
+ "showHeader": true,
+ "cellHeight": "sm",
+ "footer": {
+ "show": false,
+ "reducer": [
+ "sum"
+ ],
+ "countRows": false,
+ "fields": ""
+ }
+ },
+ "pluginVersion": "11.2.0",
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "exemplar": false,
+ "expr": "time() - banyandb_lifecycle_last_run_timestamp_seconds{job=~\"$job\"}",
+ "instant": true,
+ "legendFormat": "__auto",
+ "range": false,
+ "refId": "A",
+ "format": "table"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "exemplar": false,
+ "expr": "sum by (pod_name)(banyandb_lifecycle_last_run_success{job=~\"$job\"})",
+ "instant": true,
+ "legendFormat": "__auto",
+ "range": false,
+ "refId": "B",
+ "format": "table"
+ }
Review Comment:
"Last Run per Source Pod" query B sums last_run_success across all groups
and also ignores $group. last_run_success is a 0/1 gauge; using max (and
filtering by $group) is safer if multiple labeled series exist temporarily
(e.g. during scrape gaps) and matches the panel intent of showing a single
success flag per pod.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]