Copilot commented on code in PR #1157:
URL:
https://github.com/apache/skywalking-banyandb/pull/1157#discussion_r3359672815
##########
fodc/agent/internal/cluster/collector.go:
##########
@@ -259,6 +265,47 @@ func (c *Collector) updateCurrentNodes(nodes
map[string]*databasev1.Node) {
c.log.Info().Int("nodes_count", len(nodes)).Msg("Updated current nodes
from all endpoints")
}
+// pollCurrentNodeWithRetry repeatedly polls the current node until node info
with a
+// resolved (non-unspecified) role is obtained, or the resolve budget elapses.
The
+// sibling lifecycle/banyandb gRPC server may not be listening when the agent
first
+// starts; the original single-shot poll left node_role permanently
ROLE_UNSPECIFIED
+// whenever that brief startup race was lost.
+func (c *Collector) pollCurrentNodeWithRetry(ctx context.Context) {
+ deadline := time.Now().Add(c.resolveBudget)
+ backoff := initialBackoff
+ for {
+ c.pollCurrentNode(ctx)
+ if c.hasResolvedRole() {
+ return
+ }
+ if ctx.Err() != nil || !time.Now().Before(deadline) {
+ c.log.Warn().Msg("Node role unresolved after resolve
budget; proceeding with current node info")
+ return
+ }
+ select {
+ case <-ctx.Done():
+ return
+ case <-c.closer.CloseNotify():
+ return
+ case <-time.After(backoff):
+ }
+ backoff = min(backoff*2, maxBackoff)
+ }
+}
Review Comment:
`pollCurrentNodeWithRetry` computes a `deadline` but calls
`c.pollCurrentNode(ctx)` with the original context, so the overall runtime is
not actually bounded by `resolveBudget`: a single `pollCurrentNode` can take up
to `maxRetries * 5s` (per-attempt timeout) plus backoffs, and the loop can
exceed both `resolveBudget` and the 30s `WaitForNodeFetched` timeout. This can
reintroduce the startup issue if `StartCollector` times out and the caller
snapshots `ROLE_UNSPECIFIED` before the retry loop eventually resolves.
##########
fodc/agent/internal/cluster/collector_test.go:
##########
@@ -766,6 +768,54 @@ func TestProcessClusterStates_EmptyPodName(t *testing.T) {
}
}
+type fakeNodeQueryClient struct {
+ node *databasev1.Node
+ failCount int
+ calls int
+}
+
+func (f *fakeNodeQueryClient) GetCurrentNode(_ context.Context, _
*databasev1.GetCurrentNodeRequest,
+ _ ...grpc.CallOption,
+) (*databasev1.GetCurrentNodeResponse, error) {
+ f.calls++
+ if f.calls <= f.failCount {
+ return nil, fmt.Errorf("simulated unavailable on call %d",
f.calls)
+ }
+ return &databasev1.GetCurrentNodeResponse{Node: f.node}, nil
+}
+
+func TestCollector_PollCurrentNodeWithRetry_ResolvesAfterTransientFailure(t
*testing.T) {
+ log := initTestLogger(t)
+ c := NewCollector(log, []string{"test"}, time.Second, "test-pod")
+ c.resolveBudget = 5 * time.Second
+ node := &databasev1.Node{
+ Metadata: &commonv1.Metadata{Name: "data-0"},
+ Roles: []databasev1.Role{databasev1.Role_ROLE_DATA},
+ }
+ // fetchCurrentNodeFromEndpoint itself retries maxRetries(=3) times per
poll, so
+ // failing the first 3 GetCurrentNode calls forces a second outer poll
iteration.
+ fake := &fakeNodeQueryClient{node: node, failCount: maxRetries}
+ c.clients = []*endpointClient{{nodeQueryClient: fake, addr: "test"}}
+ c.pollCurrentNodeWithRetry(context.Background())
+ role, _ := c.GetNodeInfo()
+ assert.Equal(t, databasev1.Role_name[int32(databasev1.Role_ROLE_DATA)],
role)
+ assert.Greater(t, fake.calls, maxRetries)
+}
+
+func TestCollector_PollCurrentNodeWithRetry_GivesUpAfterBudget(t *testing.T) {
+ log := initTestLogger(t)
+ c := NewCollector(log, []string{"test"}, time.Second, "test-pod")
+ c.resolveBudget = 200 * time.Millisecond
+ fake := &fakeNodeQueryClient{failCount: 1 << 30} // always fails
+ c.clients = []*endpointClient{{nodeQueryClient: fake, addr: "test"}}
+ start := time.Now()
+ c.pollCurrentNodeWithRetry(context.Background())
+ elapsed := time.Since(start)
+ role, _ := c.GetNodeInfo()
+ assert.Equal(t,
databasev1.Role_name[int32(databasev1.Role_ROLE_UNSPECIFIED)], role)
+ assert.Less(t, elapsed, 10*time.Second) // returns promptly, does not
hang
+}
Review Comment:
The budget test currently asserts `elapsed < 10s`, which is too loose to
catch regressions (e.g., if the retry loop accidentally waits far beyond the
configured 200ms budget). Tightening the bound to be relative to
`c.resolveBudget` makes the test actually validate the intended behavior and
prevents future changes from silently reintroducing long startup stalls.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]