This is an automated email from the ASF dual-hosted git repository.

hanahmily pushed a commit to branch fix/lifecycle-self-identity-resolution
in repository https://gitbox.apache.org/repos/asf/skywalking-banyandb.git

commit 0dc28294db15e7a740763a28bf9e1ee8034a0fb6
Author: Hongtao Gao <[email protected]>
AuthorDate: Fri Jun 12 13:50:34 2026 +0000

    refactor(lifecycle): add emittedLastRunSet bool to correctly Delete 
empty-tuple predecessor
    
    Follow-up to 65775875. Addresses codex round-4 issue 1: after an
    empty cycle stamps the gauges with all-empty labels, the next
    non-empty cycle's hasPrev check (any non-empty emittedLastRun*
    field) would return false for the all-empty predecessor and skip
    the Delete, leaving the empty series to shadow the new stamp.
    
    - banyand/backup/lifecycle/service.go: add emittedLastRunSet bool
      to lifecycleService. recordLastRun uses this flag instead of the
      any-non-empty-field check to decide whether to Delete the
      previous tuple. Set unconditionally to true after every Set,
      even when the new tuple is all-empty (so the next non-empty
      cycle knows to Delete the empty-cycle series).
    
    - banyand/backup/lifecycle/metrics_test.go: add
      TestRecordLastRunEmptyThenNonEmptyCycle. Cycle A: empty cycle
      stamps all-empty labels and sets emittedLastRunSet=true. Cycle B:
      non-empty cycle must Delete the all-empty tuple from cycle A
      before stamping the new tuple. Without the bool flag the test
      would fail (the hasPrev check would return false on the
      all-empty predecessor).
    
    - banyand/backup/lifecycle/steps.go: comment at line 100 no longer
      claims resolveSelfIdentity matches against NodeID; it only
      matches GrpcAddress. Wording tightened to match steps.go:112-124.
---
 banyand/backup/lifecycle/metrics_test.go | 48 +++++++++++++++++
 banyand/backup/lifecycle/service.go      | 92 ++++++++++++--------------------
 banyand/backup/lifecycle/steps.go        | 14 ++---
 3 files changed, 88 insertions(+), 66 deletions(-)

diff --git a/banyand/backup/lifecycle/metrics_test.go 
b/banyand/backup/lifecycle/metrics_test.go
index ec44fed11..05c29440d 100644
--- a/banyand/backup/lifecycle/metrics_test.go
+++ b/banyand/backup/lifecycle/metrics_test.go
@@ -315,6 +315,54 @@ func TestRecordLastRunTwoCycleReplaceStaleSeries(t 
*testing.T) {
        require.Equal(t, 1, okGauge.deleted)
 }
 
+// TestRecordLastRunEmptyThenNonEmptyCycle is the regression test for
+// the empty-tuple state collision: an empty cycle (no recordCycleGroup
+// ran; the action-start reset left lastRun* all empty) stamps the
+// gauges with all-empty labels, and a subsequent non-empty cycle must
+// still Delete the all-empty-labels series before stamping its own
+// tuple. Without the emittedLastRunSet bool flag, the hasPrev check
+// "any emittedLastRun* field is non-empty" would return false for an
+// all-empty predecessor and skip the Delete, leaving the empty series
+// to shadow the new stamp.
+func TestRecordLastRunEmptyThenNonEmptyCycle(t *testing.T) {
+       tsGauge, okGauge := &recordingGauge{}, &recordingGauge{}
+       l := &lifecycleService{
+               lastRunTimestamp: tsGauge,
+               lastRunSuccess:   okGauge,
+       }
+
+       // Cycle A: empty cycle (no recordCycleGroup; all lastRun* are
+       // empty strings from action()'s prelude reset). recordLastRun
+       // stamps the all-empty tuple and sets emittedLastRunSet = true.
+       l.recordLastRun(time.Unix(1717929600, 0), nil)
+       require.Equal(t, 1, tsGauge.called)
+       require.Equal(t, 0, tsGauge.deleted,
+               "first-ever recordLastRun must NOT Delete (no previous tuple)")
+       require.Equal(t, []string{"", "", "", ""}, tsGauge.lastLabels,
+               "empty cycle must stamp all-empty labels")
+       require.True(t, l.emittedLastRunSet,
+               "emittedLastRunSet must be true after a successful Set, even 
with all-empty labels")
+       require.Equal(t, "", l.emittedLastRunGroup)
+
+       // Cycle B: non-empty cycle. recordLastRun must Delete the
+       // all-empty tuple from cycle A before stamping the new tuple.
+       l.lastRunGroup = "metrics-day"
+       l.lastRunNode = "data-hot-0:17912"
+       l.lastRunRole = "lifecycle"
+       l.lastRunTier = "hot"
+       l.recordLastRun(time.Unix(1717929700, 0), nil)
+
+       require.Equal(t, 1, tsGauge.deleted,
+               "second recordLastRun must Delete the previous all-empty tuple 
(the empty-cycle predecessor)")
+       require.Equal(t, []string{"", "", "", ""}, tsGauge.deletedLabel[0],
+               "Delete must target the all-empty tuple from cycle A")
+       require.Equal(t, 2, tsGauge.called)
+       require.Equal(t,
+               []string{"data-hot-0:17912", "lifecycle", "hot", "metrics-day"},
+               tsGauge.lastLabels,
+               "cycle B must stamp the new tuple after deleting the empty one")
+}
+
 // TestRecordLastRunFailure stamps the gauges with success=0 when the action
 // returned an error. The timestamp is still set — operators want to know
 // "when did the last attempt happen, and did it succeed?".
diff --git a/banyand/backup/lifecycle/service.go 
b/banyand/backup/lifecycle/service.go
index a1da3e8b0..19338f1b1 100644
--- a/banyand/backup/lifecycle/service.go
+++ b/banyand/backup/lifecycle/service.go
@@ -89,78 +89,50 @@ const (
 type lifecycleService struct {
        databasev1.UnimplementedClusterStateServiceServer
        databasev1.UnimplementedNodeQueryServiceServer
-       metadata metadata.Repo
-       omr      observability.MetricsRegistry
-       pm       protector.Memory
-       // cycleLabels is the label set shared by the three cycle-level
-       // metrics (cyclesTotal, lastRunTimestamp, lastRunSuccess): the
-       // SENDER identity of the lifecycle pod (remote_node = the
-       // co-located data pod's BanyanDB NodeID, remote_role = "lifecycle",
-       // remote_tier = the data pod's tier label) plus the GROUP being
-       // processed. The label form mirrors the parallel
-       // banyandb_lifecycle_migration_* family emitted by the queue/pub
-       // lifecycle publisher, but the cycle-level series describe the
-       // SENDER side (one tuple per cycle, the cycle's last-seen group)
-       // while the per-message pub series describe the DESTINATION side
-       // (one tuple per chunk, the destination's NodeID/role/tier resolved
-       // from getNodeInfo). The two families are independently useful
-       // (cycle health vs. per-message traffic) and share the same label
-       // form so dashboard matchers and regexes apply to both.
-       cyclesTotal      meter.Counter
-       lastRunTimestamp meter.Gauge
-       lastRunSuccess   meter.Gauge
-       // Last-seen (group, remote_node, remote_role, remote_tier) tuple
-       // from the most recent parseGroup call. Used by the deferred
-       // recordLastRun to stamp the cycle-level last_run_* gauges when
-       // the cycle ends. Reset to empty strings at the start of each
-       // action() so an empty cycle (no parseGroup succeeded) doesn't
-       // inherit the previous cycle's labels.
-       lastRunGroup string
-       lastRunNode  string
-       lastRunRole  string
-       lastRunTier  string
-       // emittedLastRunGroup/Node/Role/Tier is the (group, remote_*)
-       // tuple of the last_run_* series that was last Set on
-       // Prometheus. recordLastRun uses this to Delete the previous
-       // series before stamping the new one, so each cycle's tuple
-       // fully replaces the previous cycle's tuple instead of
-       // accumulating as a stale series. Reset only when the previous
-       // Set succeeded; the empty-cycle path (no recordCycleGroup ran)
-       // leaves this set to whatever the previous cycle emitted, so
-       // the next non-empty cycle's Delete still fires correctly.
-       emittedLastRunGroup string
-       emittedLastRunNode  string
-       emittedLastRunRole  string
-       emittedLastRunTier  string
+       pm                  protector.Memory
+       omr                 observability.MetricsRegistry
        metricsClient       queue.Client
-       grpcServer          *grpclib.Server
-       httpSrv             *http.Server
-       tlsReloader         *pkgtls.Reloader
-       currentNode         *databasev1.Node
-       clientCloser        context.CancelFunc
-       stopCh              chan struct{}
-       sch                 *timestamp.Scheduler
+       cyclesTotal         meter.Counter
+       lastRunTimestamp    meter.Gauge
+       lastRunSuccess      meter.Gauge
+       metadata            metadata.Repo
        l                   *logger.Logger
        clusterStateMgr     *clusterStateManager
        metricsKeeperStop   chan struct{}
+       sch                 *timestamp.Scheduler
+       stopCh              chan struct{}
+       clientCloser        context.CancelFunc
+       currentNode         *databasev1.Node
+       tlsReloader         *pkgtls.Reloader
+       httpSrv             *http.Server
+       grpcServer          *grpclib.Server
+       gRPCAddr            string
+       schedule            string
+       emittedLastRunRole  string
+       emittedLastRunNode  string
+       emittedLastRunGroup string
+       lifecycleCertFile   string
+       lastRunTier         string
+       lastRunRole         string
+       lastRunNode         string
        lifecycleHost       string
        lifecycleHTTPAddr   string
        streamRoot          string
        traceRoot           string
        progressFilePath    string
        reportDir           string
-       schedule            string
+       emittedLastRunTier  string
        cert                string
-       gRPCAddr            string
+       lastRunGroup        string
        lifecycleKeyFile    string
        lifecycleGRPCAddr   string
        measureRoot         string
-       lifecycleCertFile   string
        localNodeMD         schema.Metadata
        maxExecutionTimes   int
        chunkSize           run.Bytes
        lifecycleGRPCPort   uint32
        lifecycleHTTPPort   uint32
+       emittedLastRunSet   bool
        enableTLS           bool
        insecure            bool
        lifecycleTLS        bool
@@ -357,7 +329,7 @@ func (l *lifecycleService) GracefulStop() {
 }
 
 func (l *lifecycleService) Name() string {
-       return "lifecycle"
+       return lifecycleRoleName
 }
 
 // buildLocalNodeMD builds the schema metadata registered on the native metrics
@@ -773,9 +745,7 @@ func (l *lifecycleService) recordLastRun(start time.Time, 
err error) {
        prevLabels := []string{
                l.emittedLastRunNode, l.emittedLastRunRole, 
l.emittedLastRunTier, l.emittedLastRunGroup,
        }
-       hasPrev := l.emittedLastRunGroup != "" || l.emittedLastRunNode != "" ||
-               l.emittedLastRunRole != "" || l.emittedLastRunTier != ""
-       if hasPrev {
+       if l.emittedLastRunSet {
                if l.lastRunTimestamp != nil {
                        l.lastRunTimestamp.Delete(prevLabels...)
                }
@@ -790,8 +760,12 @@ func (l *lifecycleService) recordLastRun(start time.Time, 
err error) {
                l.lastRunSuccess.Set(success, l.lastRunNode, l.lastRunRole, 
l.lastRunTier, l.lastRunGroup)
        }
        // Update the emitted-tuple tracking so the next cycle's recordLastRun
-       // deletes THIS cycle's tuple. This runs after the Set so a panic in
-       // Set doesn't leave the tracking inconsistent with Prometheus.
+       // deletes THIS cycle's tuple. emittedLastRunSet is unconditionally
+       // true after a successful Set, even for the all-empty-labels case
+       // (so the next non-empty cycle knows to Delete the all-empty
+       // series). This runs after the Set so a panic in Set doesn't
+       // leave the tracking inconsistent with Prometheus.
+       l.emittedLastRunSet = true
        l.emittedLastRunGroup = l.lastRunGroup
        l.emittedLastRunNode = l.lastRunNode
        l.emittedLastRunRole = l.lastRunRole
diff --git a/banyand/backup/lifecycle/steps.go 
b/banyand/backup/lifecycle/steps.go
index 02d4f7a95..888f21b5a 100644
--- a/banyand/backup/lifecycle/steps.go
+++ b/banyand/backup/lifecycle/steps.go
@@ -97,13 +97,13 @@ func (l *lifecycleService) getSnapshots(ctx 
context.Context, groups []*commonv1.
 // POD_NAME (K8s downward API) and falls back to os.Hostname() — the
 // same precedence as nativeNodeContext at service.go:160-165. The
 // function then looks the host up directly in the data-node registry,
-// matching against the host portion of GrpcAddress and NodeID (the
-// registry may carry either an IP, a headless-service FQDN, or a
-// loopback alias, depending on which bind address the data pod
-// registered with). The first registry entry whose host matches (with
-// loopback-alias normalization) is the co-located data pod; its
-// Metadata.Name is the SenderNode and its Labels["type"] is the
-// SenderTier.
+// matching against the host portion of GrpcAddress (the registry may
+// carry an IP, a headless-service FQDN, or a loopback alias, depending
+// on which bind address the data pod registered with). The first
+// registry entry whose host matches (with loopback-alias and
+// IP-literal normalization via hostMatches) is the co-located data
+// pod; its Metadata.Name is the SenderNode and its Labels["type"] is
+// the SenderTier.
 //
 // Re-runs on every parseGroup call (no caching) so a data-pod
 // restart, re-registration, or new host is picked up by the next

Reply via email to