This is an automated email from the ASF dual-hosted git repository. hanahmily pushed a commit to branch fix/lifecycle-self-identity-resolution in repository https://gitbox.apache.org/repos/asf/skywalking-banyandb.git
commit 0dc28294db15e7a740763a28bf9e1ee8034a0fb6 Author: Hongtao Gao <[email protected]> AuthorDate: Fri Jun 12 13:50:34 2026 +0000 refactor(lifecycle): add emittedLastRunSet bool to correctly Delete empty-tuple predecessor Follow-up to 65775875. Addresses codex round-4 issue 1: after an empty cycle stamps the gauges with all-empty labels, the next non-empty cycle's hasPrev check (any non-empty emittedLastRun* field) would return false for the all-empty predecessor and skip the Delete, leaving the empty series to shadow the new stamp. - banyand/backup/lifecycle/service.go: add emittedLastRunSet bool to lifecycleService. recordLastRun uses this flag instead of the any-non-empty-field check to decide whether to Delete the previous tuple. Set unconditionally to true after every Set, even when the new tuple is all-empty (so the next non-empty cycle knows to Delete the empty-cycle series). - banyand/backup/lifecycle/metrics_test.go: add TestRecordLastRunEmptyThenNonEmptyCycle. Cycle A: empty cycle stamps all-empty labels and sets emittedLastRunSet=true. Cycle B: non-empty cycle must Delete the all-empty tuple from cycle A before stamping the new tuple. Without the bool flag the test would fail (the hasPrev check would return false on the all-empty predecessor). - banyand/backup/lifecycle/steps.go: comment at line 100 no longer claims resolveSelfIdentity matches against NodeID; it only matches GrpcAddress. Wording tightened to match steps.go:112-124. --- banyand/backup/lifecycle/metrics_test.go | 48 +++++++++++++++++ banyand/backup/lifecycle/service.go | 92 ++++++++++++-------------------- banyand/backup/lifecycle/steps.go | 14 ++--- 3 files changed, 88 insertions(+), 66 deletions(-) diff --git a/banyand/backup/lifecycle/metrics_test.go b/banyand/backup/lifecycle/metrics_test.go index ec44fed11..05c29440d 100644 --- a/banyand/backup/lifecycle/metrics_test.go +++ b/banyand/backup/lifecycle/metrics_test.go @@ -315,6 +315,54 @@ func TestRecordLastRunTwoCycleReplaceStaleSeries(t *testing.T) { require.Equal(t, 1, okGauge.deleted) } +// TestRecordLastRunEmptyThenNonEmptyCycle is the regression test for +// the empty-tuple state collision: an empty cycle (no recordCycleGroup +// ran; the action-start reset left lastRun* all empty) stamps the +// gauges with all-empty labels, and a subsequent non-empty cycle must +// still Delete the all-empty-labels series before stamping its own +// tuple. Without the emittedLastRunSet bool flag, the hasPrev check +// "any emittedLastRun* field is non-empty" would return false for an +// all-empty predecessor and skip the Delete, leaving the empty series +// to shadow the new stamp. +func TestRecordLastRunEmptyThenNonEmptyCycle(t *testing.T) { + tsGauge, okGauge := &recordingGauge{}, &recordingGauge{} + l := &lifecycleService{ + lastRunTimestamp: tsGauge, + lastRunSuccess: okGauge, + } + + // Cycle A: empty cycle (no recordCycleGroup; all lastRun* are + // empty strings from action()'s prelude reset). recordLastRun + // stamps the all-empty tuple and sets emittedLastRunSet = true. + l.recordLastRun(time.Unix(1717929600, 0), nil) + require.Equal(t, 1, tsGauge.called) + require.Equal(t, 0, tsGauge.deleted, + "first-ever recordLastRun must NOT Delete (no previous tuple)") + require.Equal(t, []string{"", "", "", ""}, tsGauge.lastLabels, + "empty cycle must stamp all-empty labels") + require.True(t, l.emittedLastRunSet, + "emittedLastRunSet must be true after a successful Set, even with all-empty labels") + require.Equal(t, "", l.emittedLastRunGroup) + + // Cycle B: non-empty cycle. recordLastRun must Delete the + // all-empty tuple from cycle A before stamping the new tuple. + l.lastRunGroup = "metrics-day" + l.lastRunNode = "data-hot-0:17912" + l.lastRunRole = "lifecycle" + l.lastRunTier = "hot" + l.recordLastRun(time.Unix(1717929700, 0), nil) + + require.Equal(t, 1, tsGauge.deleted, + "second recordLastRun must Delete the previous all-empty tuple (the empty-cycle predecessor)") + require.Equal(t, []string{"", "", "", ""}, tsGauge.deletedLabel[0], + "Delete must target the all-empty tuple from cycle A") + require.Equal(t, 2, tsGauge.called) + require.Equal(t, + []string{"data-hot-0:17912", "lifecycle", "hot", "metrics-day"}, + tsGauge.lastLabels, + "cycle B must stamp the new tuple after deleting the empty one") +} + // TestRecordLastRunFailure stamps the gauges with success=0 when the action // returned an error. The timestamp is still set — operators want to know // "when did the last attempt happen, and did it succeed?". diff --git a/banyand/backup/lifecycle/service.go b/banyand/backup/lifecycle/service.go index a1da3e8b0..19338f1b1 100644 --- a/banyand/backup/lifecycle/service.go +++ b/banyand/backup/lifecycle/service.go @@ -89,78 +89,50 @@ const ( type lifecycleService struct { databasev1.UnimplementedClusterStateServiceServer databasev1.UnimplementedNodeQueryServiceServer - metadata metadata.Repo - omr observability.MetricsRegistry - pm protector.Memory - // cycleLabels is the label set shared by the three cycle-level - // metrics (cyclesTotal, lastRunTimestamp, lastRunSuccess): the - // SENDER identity of the lifecycle pod (remote_node = the - // co-located data pod's BanyanDB NodeID, remote_role = "lifecycle", - // remote_tier = the data pod's tier label) plus the GROUP being - // processed. The label form mirrors the parallel - // banyandb_lifecycle_migration_* family emitted by the queue/pub - // lifecycle publisher, but the cycle-level series describe the - // SENDER side (one tuple per cycle, the cycle's last-seen group) - // while the per-message pub series describe the DESTINATION side - // (one tuple per chunk, the destination's NodeID/role/tier resolved - // from getNodeInfo). The two families are independently useful - // (cycle health vs. per-message traffic) and share the same label - // form so dashboard matchers and regexes apply to both. - cyclesTotal meter.Counter - lastRunTimestamp meter.Gauge - lastRunSuccess meter.Gauge - // Last-seen (group, remote_node, remote_role, remote_tier) tuple - // from the most recent parseGroup call. Used by the deferred - // recordLastRun to stamp the cycle-level last_run_* gauges when - // the cycle ends. Reset to empty strings at the start of each - // action() so an empty cycle (no parseGroup succeeded) doesn't - // inherit the previous cycle's labels. - lastRunGroup string - lastRunNode string - lastRunRole string - lastRunTier string - // emittedLastRunGroup/Node/Role/Tier is the (group, remote_*) - // tuple of the last_run_* series that was last Set on - // Prometheus. recordLastRun uses this to Delete the previous - // series before stamping the new one, so each cycle's tuple - // fully replaces the previous cycle's tuple instead of - // accumulating as a stale series. Reset only when the previous - // Set succeeded; the empty-cycle path (no recordCycleGroup ran) - // leaves this set to whatever the previous cycle emitted, so - // the next non-empty cycle's Delete still fires correctly. - emittedLastRunGroup string - emittedLastRunNode string - emittedLastRunRole string - emittedLastRunTier string + pm protector.Memory + omr observability.MetricsRegistry metricsClient queue.Client - grpcServer *grpclib.Server - httpSrv *http.Server - tlsReloader *pkgtls.Reloader - currentNode *databasev1.Node - clientCloser context.CancelFunc - stopCh chan struct{} - sch *timestamp.Scheduler + cyclesTotal meter.Counter + lastRunTimestamp meter.Gauge + lastRunSuccess meter.Gauge + metadata metadata.Repo l *logger.Logger clusterStateMgr *clusterStateManager metricsKeeperStop chan struct{} + sch *timestamp.Scheduler + stopCh chan struct{} + clientCloser context.CancelFunc + currentNode *databasev1.Node + tlsReloader *pkgtls.Reloader + httpSrv *http.Server + grpcServer *grpclib.Server + gRPCAddr string + schedule string + emittedLastRunRole string + emittedLastRunNode string + emittedLastRunGroup string + lifecycleCertFile string + lastRunTier string + lastRunRole string + lastRunNode string lifecycleHost string lifecycleHTTPAddr string streamRoot string traceRoot string progressFilePath string reportDir string - schedule string + emittedLastRunTier string cert string - gRPCAddr string + lastRunGroup string lifecycleKeyFile string lifecycleGRPCAddr string measureRoot string - lifecycleCertFile string localNodeMD schema.Metadata maxExecutionTimes int chunkSize run.Bytes lifecycleGRPCPort uint32 lifecycleHTTPPort uint32 + emittedLastRunSet bool enableTLS bool insecure bool lifecycleTLS bool @@ -357,7 +329,7 @@ func (l *lifecycleService) GracefulStop() { } func (l *lifecycleService) Name() string { - return "lifecycle" + return lifecycleRoleName } // buildLocalNodeMD builds the schema metadata registered on the native metrics @@ -773,9 +745,7 @@ func (l *lifecycleService) recordLastRun(start time.Time, err error) { prevLabels := []string{ l.emittedLastRunNode, l.emittedLastRunRole, l.emittedLastRunTier, l.emittedLastRunGroup, } - hasPrev := l.emittedLastRunGroup != "" || l.emittedLastRunNode != "" || - l.emittedLastRunRole != "" || l.emittedLastRunTier != "" - if hasPrev { + if l.emittedLastRunSet { if l.lastRunTimestamp != nil { l.lastRunTimestamp.Delete(prevLabels...) } @@ -790,8 +760,12 @@ func (l *lifecycleService) recordLastRun(start time.Time, err error) { l.lastRunSuccess.Set(success, l.lastRunNode, l.lastRunRole, l.lastRunTier, l.lastRunGroup) } // Update the emitted-tuple tracking so the next cycle's recordLastRun - // deletes THIS cycle's tuple. This runs after the Set so a panic in - // Set doesn't leave the tracking inconsistent with Prometheus. + // deletes THIS cycle's tuple. emittedLastRunSet is unconditionally + // true after a successful Set, even for the all-empty-labels case + // (so the next non-empty cycle knows to Delete the all-empty + // series). This runs after the Set so a panic in Set doesn't + // leave the tracking inconsistent with Prometheus. + l.emittedLastRunSet = true l.emittedLastRunGroup = l.lastRunGroup l.emittedLastRunNode = l.lastRunNode l.emittedLastRunRole = l.lastRunRole diff --git a/banyand/backup/lifecycle/steps.go b/banyand/backup/lifecycle/steps.go index 02d4f7a95..888f21b5a 100644 --- a/banyand/backup/lifecycle/steps.go +++ b/banyand/backup/lifecycle/steps.go @@ -97,13 +97,13 @@ func (l *lifecycleService) getSnapshots(ctx context.Context, groups []*commonv1. // POD_NAME (K8s downward API) and falls back to os.Hostname() — the // same precedence as nativeNodeContext at service.go:160-165. The // function then looks the host up directly in the data-node registry, -// matching against the host portion of GrpcAddress and NodeID (the -// registry may carry either an IP, a headless-service FQDN, or a -// loopback alias, depending on which bind address the data pod -// registered with). The first registry entry whose host matches (with -// loopback-alias normalization) is the co-located data pod; its -// Metadata.Name is the SenderNode and its Labels["type"] is the -// SenderTier. +// matching against the host portion of GrpcAddress (the registry may +// carry an IP, a headless-service FQDN, or a loopback alias, depending +// on which bind address the data pod registered with). The first +// registry entry whose host matches (with loopback-alias and +// IP-literal normalization via hostMatches) is the co-located data +// pod; its Metadata.Name is the SenderNode and its Labels["type"] is +// the SenderTier. // // Re-runs on every parseGroup call (no caching) so a data-pod // restart, re-registration, or new host is picked up by the next
