Re: [PR] Fix FODC agent leaving node_role unspecified on startup race [skywalking-banyandb]

via GitHub Thu, 04 Jun 2026 17:39:56 -0700


Copilot commented on code in PR #1157:
URL: 
https://github.com/apache/skywalking-banyandb/pull/1157#discussion_r3359672815



##########
fodc/agent/internal/cluster/collector.go:
##########
@@ -259,6 +265,47 @@ func (c *Collector) updateCurrentNodes(nodes 
map[string]*databasev1.Node) {
        c.log.Info().Int("nodes_count", len(nodes)).Msg("Updated current nodes 
from all endpoints")
 }
 
+// pollCurrentNodeWithRetry repeatedly polls the current node until node info 
with a
+// resolved (non-unspecified) role is obtained, or the resolve budget elapses. 
The
+// sibling lifecycle/banyandb gRPC server may not be listening when the agent 
first
+// starts; the original single-shot poll left node_role permanently 
ROLE_UNSPECIFIED
+// whenever that brief startup race was lost.
+func (c *Collector) pollCurrentNodeWithRetry(ctx context.Context) {
+       deadline := time.Now().Add(c.resolveBudget)
+       backoff := initialBackoff
+       for {
+               c.pollCurrentNode(ctx)
+               if c.hasResolvedRole() {
+                       return
+               }
+               if ctx.Err() != nil || !time.Now().Before(deadline) {
+                       c.log.Warn().Msg("Node role unresolved after resolve 
budget; proceeding with current node info")
+                       return
+               }
+               select {
+               case <-ctx.Done():
+                       return
+               case <-c.closer.CloseNotify():
+                       return
+               case <-time.After(backoff):
+               }
+               backoff = min(backoff*2, maxBackoff)
+       }
+}

Review Comment:
   `pollCurrentNodeWithRetry` computes a `deadline` but calls 
`c.pollCurrentNode(ctx)` with the original context, so the overall runtime is 
not actually bounded by `resolveBudget`: a single `pollCurrentNode` can take up 
to `maxRetries * 5s` (per-attempt timeout) plus backoffs, and the loop can 
exceed both `resolveBudget` and the 30s `WaitForNodeFetched` timeout. This can 
reintroduce the startup issue if `StartCollector` times out and the caller 
snapshots `ROLE_UNSPECIFIED` before the retry loop eventually resolves.



##########
fodc/agent/internal/cluster/collector_test.go:
##########
@@ -766,6 +768,54 @@ func TestProcessClusterStates_EmptyPodName(t *testing.T) {
        }
 }
 
+type fakeNodeQueryClient struct {
+       node      *databasev1.Node
+       failCount int
+       calls     int
+}
+
+func (f *fakeNodeQueryClient) GetCurrentNode(_ context.Context, _ 
*databasev1.GetCurrentNodeRequest,
+       _ ...grpc.CallOption,
+) (*databasev1.GetCurrentNodeResponse, error) {
+       f.calls++
+       if f.calls <= f.failCount {
+               return nil, fmt.Errorf("simulated unavailable on call %d", 
f.calls)
+       }
+       return &databasev1.GetCurrentNodeResponse{Node: f.node}, nil
+}
+
+func TestCollector_PollCurrentNodeWithRetry_ResolvesAfterTransientFailure(t 
*testing.T) {
+       log := initTestLogger(t)
+       c := NewCollector(log, []string{"test"}, time.Second, "test-pod")
+       c.resolveBudget = 5 * time.Second
+       node := &databasev1.Node{
+               Metadata: &commonv1.Metadata{Name: "data-0"},
+               Roles:    []databasev1.Role{databasev1.Role_ROLE_DATA},
+       }
+       // fetchCurrentNodeFromEndpoint itself retries maxRetries(=3) times per 
poll, so
+       // failing the first 3 GetCurrentNode calls forces a second outer poll 
iteration.
+       fake := &fakeNodeQueryClient{node: node, failCount: maxRetries}
+       c.clients = []*endpointClient{{nodeQueryClient: fake, addr: "test"}}
+       c.pollCurrentNodeWithRetry(context.Background())
+       role, _ := c.GetNodeInfo()
+       assert.Equal(t, databasev1.Role_name[int32(databasev1.Role_ROLE_DATA)], 
role)
+       assert.Greater(t, fake.calls, maxRetries)
+}
+
+func TestCollector_PollCurrentNodeWithRetry_GivesUpAfterBudget(t *testing.T) {
+       log := initTestLogger(t)
+       c := NewCollector(log, []string{"test"}, time.Second, "test-pod")
+       c.resolveBudget = 200 * time.Millisecond
+       fake := &fakeNodeQueryClient{failCount: 1 << 30} // always fails
+       c.clients = []*endpointClient{{nodeQueryClient: fake, addr: "test"}}
+       start := time.Now()
+       c.pollCurrentNodeWithRetry(context.Background())
+       elapsed := time.Since(start)
+       role, _ := c.GetNodeInfo()
+       assert.Equal(t, 
databasev1.Role_name[int32(databasev1.Role_ROLE_UNSPECIFIED)], role)
+       assert.Less(t, elapsed, 10*time.Second) // returns promptly, does not 
hang
+}

Review Comment:
   The budget test currently asserts `elapsed < 10s`, which is too loose to 
catch regressions (e.g., if the retry loop accidentally waits far beyond the 
configured 200ms budget). Tightening the bound to be relative to 
`c.resolveBudget` makes the test actually validate the intended behavior and 
prevents future changes from silently reintroducing long startup stalls.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] Fix FODC agent leaving node_role unspecified on startup race [skywalking-banyandb]

Reply via email to