Re: [PR] Fix lifecycle migration segment shorter than the configured `SegmentInterval` [skywalking-banyandb]

via GitHub Thu, 07 May 2026 20:16:56 -0700


Copilot commented on code in PR #1120:
URL: 
https://github.com/apache/skywalking-banyandb/pull/1120#discussion_r3206054508



##########
banyand/internal/storage/segment_test.go:
##########
@@ -1178,3 +1180,455 @@ func TestSegment_ConcurrentReopenAndClose_NoPanic(t 
*testing.T) {
                require.Nil(t, s.index, "segment %s should be cleaned up", 
s.suffix)
        }
 }
+
+// newAlignmentTestController is a compact helper for tests covering the
+// epoch-aligned segment.create() behavior introduced by the lifecycle fix.
+// It spins up a real segmentController backed by a temp dir, with the given
+// SegmentInterval, and returns the controller plus a cleanup function.
+func newAlignmentTestController(
+       t *testing.T, interval IntervalRule,
+) (*segmentController[mockTSTable, mockTSTableOpener], string, func()) {
+       t.Helper()
+       tempDir, cleanup := setupTestEnvironment(t)
+
+       ctx := context.Background()
+       l := logger.GetLogger("test-alignment")
+       ctx = context.WithValue(ctx, logger.ContextKey, l)
+       ctx = common.SetPosition(ctx, func(_ common.Position) common.Position {
+               return common.Position{
+                       Database: "test-db",
+                       Stage:    "alignment",
+               }
+       })
+
+       opts := TSDBOpts[mockTSTable, mockTSTableOpener]{
+               TSTableCreator: func(_ fs.FileSystem, _ string, _ 
common.Position, _ *logger.Logger,
+                       _ timestamp.TimeRange, _ mockTSTableOpener, _ any,
+               ) (mockTSTable, error) {
+                       return mockTSTable{ID: common.ShardID(0)}, nil
+               },
+               ShardNum:                       1,
+               SegmentInterval:                interval,
+               TTL:                            IntervalRule{Unit: DAY, Num: 
60},
+               SeriesIndexFlushTimeoutSeconds: 10,
+               SeriesIndexCacheMaxBytes:       1024 * 1024,
+       }
+
+       serviceCache := NewServiceCache().(*serviceCache)
+       sc := newSegmentController[mockTSTable, mockTSTableOpener](
+               ctx,
+               tempDir,
+               l,
+               opts,
+               nil,
+               nil,
+               5*time.Minute,
+               
fs.NewLocalFileSystemWithLoggerAndLimit(logger.GetLogger("storage"), 
opts.MemoryLimit),
+               serviceCache,
+               group,
+       )
+       return sc, tempDir, cleanup
+}
+
+// TestIntervalRule_Standard_AlignsToEpochGrid pins the alignment math itself.
+// Two timestamps in the same N*Unit bucket measured from epoch must round
+// down to the exact same instant; two timestamps in adjacent buckets must
+// round to instants exactly N*Unit apart.
+func TestIntervalRule_Standard_AlignsToEpochGrid(t *testing.T) {
+       //nolint:govet // fieldalignment: test struct optimization not critical
+       cases := []struct {
+               name     string
+               ir       IntervalRule
+               probes   []time.Time
+               expected []time.Time
+       }{
+               {
+                       name: "DAY Num=15 matches calculateTargetSegments grid",
+                       ir:   IntervalRule{Unit: DAY, Num: 15},
+                       probes: []time.Time{
+                               time.Date(2026, 4, 7, 0, 0, 0, 0, time.UTC),
+                               time.Date(2026, 4, 16, 6, 0, 0, 0, time.UTC),
+                               time.Date(2026, 4, 19, 23, 59, 59, 0, time.UTC),
+                               time.Date(2026, 4, 22, 0, 0, 0, 0, time.UTC),
+                               time.Date(2026, 5, 7, 12, 30, 0, 0, time.UTC),
+                       },
+                       expected: []time.Time{
+                               time.Date(2026, 4, 7, 0, 0, 0, 0, time.UTC),
+                               time.Date(2026, 4, 7, 0, 0, 0, 0, time.UTC),
+                               time.Date(2026, 4, 7, 0, 0, 0, 0, time.UTC),
+                               time.Date(2026, 4, 22, 0, 0, 0, 0, time.UTC),
+                               time.Date(2026, 5, 7, 0, 0, 0, 0, time.UTC),
+                       },
+               },
+               {
+                       name: "DAY Num=5",
+                       ir:   IntervalRule{Unit: DAY, Num: 5},
+                       // 1970-01-01 + N*5 day boundaries near today: 
2026-04-02 (day 20545),
+                       // 2026-04-07 (day 20550), 2026-04-12 (day 20555).
+                       probes: []time.Time{
+                               time.Date(2026, 4, 2, 0, 0, 0, 0, time.UTC),
+                               time.Date(2026, 4, 3, 6, 0, 0, 0, time.UTC),
+                               time.Date(2026, 4, 6, 23, 0, 0, 0, time.UTC),
+                               time.Date(2026, 4, 7, 0, 0, 0, 0, time.UTC),
+                       },
+                       expected: []time.Time{
+                               time.Date(2026, 4, 2, 0, 0, 0, 0, time.UTC),
+                               time.Date(2026, 4, 2, 0, 0, 0, 0, time.UTC),
+                               time.Date(2026, 4, 2, 0, 0, 0, 0, time.UTC),
+                               time.Date(2026, 4, 7, 0, 0, 0, 0, time.UTC),
+                       },
+               },
+               {
+                       name: "HOUR Num=6",
+                       ir:   IntervalRule{Unit: HOUR, Num: 6},
+                       probes: []time.Time{
+                               time.Date(2026, 4, 19, 0, 0, 0, 0, time.UTC),
+                               time.Date(2026, 4, 19, 5, 59, 59, 0, time.UTC),
+                               time.Date(2026, 4, 19, 6, 0, 0, 0, time.UTC),
+                               time.Date(2026, 4, 19, 11, 30, 0, 0, time.UTC),
+                               time.Date(2026, 4, 19, 12, 0, 0, 0, time.UTC),
+                       },
+                       expected: []time.Time{
+                               time.Date(2026, 4, 19, 0, 0, 0, 0, time.UTC),
+                               time.Date(2026, 4, 19, 0, 0, 0, 0, time.UTC),
+                               time.Date(2026, 4, 19, 6, 0, 0, 0, time.UTC),
+                               time.Date(2026, 4, 19, 6, 0, 0, 0, time.UTC),
+                               time.Date(2026, 4, 19, 12, 0, 0, 0, time.UTC),
+                       },
+               },
+               {
+                       name:     "DAY Num=1 matches old Unit.Standard 
behavior",
+                       ir:       IntervalRule{Unit: DAY, Num: 1},
+                       probes:   []time.Time{time.Date(2026, 4, 19, 6, 30, 0, 
0, time.UTC)},
+                       expected: []time.Time{time.Date(2026, 4, 19, 0, 0, 0, 
0, time.UTC)},
+               },
+               {
+                       name:     "Num=0 falls back to Unit.Standard",
+                       ir:       IntervalRule{Unit: DAY, Num: 0},
+                       probes:   []time.Time{time.Date(2026, 4, 19, 6, 30, 0, 
0, time.UTC)},
+                       expected: []time.Time{time.Date(2026, 4, 19, 0, 0, 0, 
0, time.UTC)},
+               },
+               {
+                       // Pre-epoch inputs: floor division puts them in the 
bucket below
+                       // rather than collapsing to epoch as truncated 
division would.
+                       name: "DAY Num=15 pre-epoch uses floor division",
+                       ir:   IntervalRule{Unit: DAY, Num: 15},
+                       probes: []time.Time{
+                               time.Date(1969, 12, 25, 0, 0, 0, 0, time.UTC),
+                               time.Date(1969, 12, 31, 23, 59, 59, 0, 
time.UTC),
+                       },
+                       expected: []time.Time{
+                               time.Date(1969, 12, 17, 0, 0, 0, 0, time.UTC),
+                               time.Date(1969, 12, 17, 0, 0, 0, 0, time.UTC),
+                       },
+               },
+       }
+       for _, tc := range cases {
+               t.Run(tc.name, func(t *testing.T) {
+                       require.Equal(t, len(tc.probes), len(tc.expected))
+                       for i, p := range tc.probes {
+                               got := tc.ir.Standard(p)
+                               assert.Equal(t, tc.expected[i], got,
+                                       "Standard(%s) under %d %s expected %s 
got %s",
+                                       p.Format(time.RFC3339), tc.ir.Num, 
tc.ir.Unit.String(),
+                                       tc.expected[i].Format(time.RFC3339), 
got.Format(time.RFC3339))
+                       }
+               })
+       }
+}
+
+// TestIntervalRule_Standard_PreservesLocation verifies that the returned
+// time keeps the input's Location while the bucket instant stays invariant
+// across timezones - the grid is anchored in UTC for cross-node consistency
+// but presented in the caller's local time.
+func TestIntervalRule_Standard_PreservesLocation(t *testing.T) {
+       shanghai := time.FixedZone("CST", 8*3600)
+       losAngeles := time.FixedZone("PST", -8*3600)
+
+       cases := []struct {
+               probe time.Time
+               ir    IntervalRule
+       }{
+               {time.Date(2026, 4, 19, 6, 0, 0, 0, time.UTC), 
IntervalRule{Unit: DAY, Num: 15}},
+               {time.Date(2026, 4, 19, 11, 30, 0, 0, time.UTC), 
IntervalRule{Unit: HOUR, Num: 6}},
+       }
+       for _, c := range cases {
+               want := c.ir.Standard(c.probe)
+               for _, loc := range []*time.Location{time.UTC, shanghai, 
losAngeles} {
+                       got := c.ir.Standard(c.probe.In(loc))
+                       assert.Equal(t, loc, got.Location(),
+                               "Standard must return time in input.Location(); 
ir=%+v loc=%s", c.ir, loc)
+                       assert.True(t, got.Equal(want),
+                               "bucket instant must be timezone-invariant; 
ir=%+v loc=%s utc=%s got=%s",
+                               c.ir, loc, want.Format(time.RFC3339), 
got.Format(time.RFC3339))
+               }
+       }
+}
+
+// TestCreateSegment_OutOfOrderArrival_SameBucket reproduces the production
+// truncation observed on demo-banyandb-data-cold-0 on 2026-04-30 where a part
+// with MinTimestamp ~2026-04-19 created seg-20260419 first and a later-
+// processed part with MinTimestamp ~2026-04-16 produced a 3-day truncated
+// seg-20260416. With Standard() honoring Num, both timestamps resolve to the
+// same epoch-aligned bucket [04/07, 04/22), so the second create returns the
+// same segment instead of creating a truncated neighbor.
+func TestCreateSegment_OutOfOrderArrival_SameBucket(t *testing.T) {
+       sc, _, cleanup := newAlignmentTestController(t, IntervalRule{Unit: DAY, 
Num: 15})
+       defer cleanup()
+
+       laterFirst := time.Date(2026, 4, 19, 6, 0, 0, 0, time.UTC)
+       earlierAfter := time.Date(2026, 4, 16, 6, 0, 0, 0, time.UTC)
+       expectedBucketStart := time.Date(2026, 4, 7, 0, 0, 0, 0, time.UTC)
+       expectedBucketEnd := time.Date(2026, 4, 22, 0, 0, 0, 0, time.UTC)
+
+       seg1, err := sc.create(laterFirst)
+       require.NoError(t, err)
+       require.NotNil(t, seg1)
+       defer seg1.DecRef()
+
+       seg2, err := sc.create(earlierAfter)
+       require.NoError(t, err)
+       require.NotNil(t, seg2)
+
+       assert.Same(t, seg1, seg2,
+               "both timestamps fall in the same 15d bucket; create() must 
reuse the existing segment")
+       assert.Equal(t, expectedBucketStart, seg1.Start,
+               "segment must start at the epoch-aligned bucket boundary, not 
at the first arrival's day")
+       assert.Equal(t, expectedBucketEnd, seg1.End)
+       assert.Equal(t, 15*24*time.Hour, seg1.End.Sub(seg1.Start))
+}
+
+// TestCreateSegment_OutOfOrderArrival_AdjacentBuckets covers the case where
+// the late-arrival timestamp falls in a strictly different epoch bucket from
+// the first-arrival one. The new segment must occupy a clean N*Unit span
+// without being truncated by the existing later segment - the truncation
+// branch in create() must collapse to "stdEnd == next.Start" exactly, leaving
+// span = N*Unit.
+func TestCreateSegment_OutOfOrderArrival_AdjacentBuckets(t *testing.T) {
+       sc, _, cleanup := newAlignmentTestController(t, IntervalRule{Unit: DAY, 
Num: 15})
+       defer cleanup()
+
+       laterFirst := time.Date(2026, 5, 5, 6, 0, 0, 0, time.UTC)    // bucket 
[04/22, 05/07)
+       earlierAfter := time.Date(2026, 4, 15, 6, 0, 0, 0, time.UTC) // bucket 
[04/07, 04/22)
+
+       seg1, err := sc.create(laterFirst)
+       require.NoError(t, err)
+       require.NotNil(t, seg1)
+       defer seg1.DecRef()
+       assert.Equal(t, time.Date(2026, 4, 22, 0, 0, 0, 0, time.UTC), 
seg1.Start)
+       assert.Equal(t, time.Date(2026, 5, 7, 0, 0, 0, 0, time.UTC), seg1.End)
+       assert.Equal(t, 15*24*time.Hour, seg1.End.Sub(seg1.Start))
+
+       seg2, err := sc.create(earlierAfter)
+       require.NoError(t, err)
+       require.NotNil(t, seg2)
+       defer seg2.DecRef()
+       assert.NotSame(t, seg1, seg2,
+               "timestamps in different buckets must not collapse to one 
segment")
+       assert.Equal(t, time.Date(2026, 4, 7, 0, 0, 0, 0, time.UTC), seg2.Start)
+       assert.Equal(t, time.Date(2026, 4, 22, 0, 0, 0, 0, time.UTC), seg2.End,
+               "new earlier segment must abut next.Start with 
stdEnd==next.Start (no internal truncation)")
+       assert.Equal(t, 15*24*time.Hour, seg2.End.Sub(seg2.Start))
+}
+
+// TestCreateSegment_HourInterval_OutOfOrder verifies the same alignment
+// guarantee for HOUR-based segment intervals (Num=6).
+func TestCreateSegment_HourInterval_OutOfOrder(t *testing.T) {
+       sc, _, cleanup := newAlignmentTestController(t, IntervalRule{Unit: 
HOUR, Num: 6})
+       defer cleanup()
+
+       t1 := time.Date(2026, 4, 19, 11, 30, 0, 0, time.UTC) // bucket [06:00, 
12:00)
+       t2 := time.Date(2026, 4, 19, 8, 15, 0, 0, time.UTC)  // same bucket
+       t3 := time.Date(2026, 4, 19, 13, 0, 0, 0, time.UTC)  // bucket [12:00, 
18:00)
+
+       seg1, err := sc.create(t1)
+       require.NoError(t, err)
+       defer seg1.DecRef()
+       assert.Equal(t, time.Date(2026, 4, 19, 6, 0, 0, 0, time.UTC), 
seg1.Start)
+       assert.Equal(t, 6*time.Hour, seg1.End.Sub(seg1.Start))
+
+       seg2, err := sc.create(t2)
+       require.NoError(t, err)
+       assert.Same(t, seg1, seg2, "08:15 must reuse the [06:00, 12:00) 
segment")
+
+       seg3, err := sc.create(t3)
+       require.NoError(t, err)
+       defer seg3.DecRef()
+       assert.NotSame(t, seg1, seg3)
+       assert.Equal(t, time.Date(2026, 4, 19, 12, 0, 0, 0, time.UTC), 
seg3.Start)
+       assert.Equal(t, 6*time.Hour, seg3.End.Sub(seg3.Start))
+}
+
+// TestCreateSegment_ConcurrentCreates_DeterministicAlignment exercises the
+// receiver under contention: many goroutines call create() with random
+// timestamps spread across a window that covers multiple epoch-aligned
+// buckets. Whatever interleaving the scheduler picks, the resulting segment
+// set must (a) cover every probe timestamp, (b) all start on epoch-aligned
+// boundaries, (c) every span be exactly the configured N*Unit, and (d) be
+// non-overlapping. This pins down the property that motivated the fix:
+// alignment is determined by the global grid, not by arrival order.
+func TestCreateSegment_ConcurrentCreates_DeterministicAlignment(t *testing.T) {
+       sc, _, cleanup := newAlignmentTestController(t, IntervalRule{Unit: DAY, 
Num: 15})
+       defer cleanup()
+
+       // Probe range covers 4 epoch-aligned 15d buckets:
+       // [03/08, 03/23), [03/23, 04/07), [04/07, 04/22), [04/22, 05/07).
+       rangeStart := time.Date(2026, 3, 10, 0, 0, 0, 0, time.UTC)
+       rangeEndExclusive := time.Date(2026, 5, 5, 0, 0, 0, 0, time.UTC)
+       rangeNanos := rangeEndExclusive.UnixNano() - rangeStart.UnixNano()
+
+       const goroutines = 32
+       const probesPerGoroutine = 64
+       const knuthMul = uint64(0x9E3779B97F4A7C15)
+       probes := make([]time.Time, 0, goroutines*probesPerGoroutine)
+       for i := 0; i < goroutines*probesPerGoroutine; i++ {
+               offset := int64((uint64(i+1)*knuthMul)>>1) % rangeNanos
+               probes = append(probes, time.Unix(0, 
rangeStart.UnixNano()+offset).UTC())
+       }
+
+       var wg sync.WaitGroup
+       wg.Add(goroutines)
+       for g := 0; g < goroutines; g++ {
+               go func(idx int) {
+                       defer wg.Done()
+                       start := idx * probesPerGoroutine
+                       end := start + probesPerGoroutine
+                       for _, ts := range probes[start:end] {
+                               seg, err := sc.create(ts)
+                               if err != nil {
+                                       t.Errorf("create(%s) returned error: 
%v", ts.Format(time.RFC3339), err)
+                                       return
+                               }
+                               if seg == nil {
+                                       t.Errorf("create(%s) returned nil 
segment", ts.Format(time.RFC3339))
+                                       return
+                               }
+                               if !ts.Before(seg.Start.Add(15*24*time.Hour)) 
|| ts.Before(seg.Start) {
+                                       t.Errorf("probe %s landed in segment 
[%s, %s) which does not contain it",
+                                               ts.Format(time.RFC3339), 
seg.Start.Format(time.RFC3339), seg.End.Format(time.RFC3339))
+                               }
+                               seg.DecRef()

Review Comment:
   The probe containment assertion uses 
ts.Before(seg.Start.Add(15*24*time.Hour)) rather than 
seg.End/seg.Contains(...). If create() ever returns a segment whose span is 
shorter than 15d (e.g., the legacy-neighbor transition case this PR 
introduces), this check can produce false positives/negatives and miss real 
misrouting; assert against seg.End (or seg.Contains) instead of a hard-coded 
duration from seg.Start.



##########
banyand/backup/lifecycle/steps.go:
##########
@@ -130,14 +157,21 @@ func parseGroup(
                        return nil, nil
                }
                nst = ro.Stages[i+1]
-               segmentInterval = st.SegmentInterval
-               l.Info().Msgf("migrating group %s at stage %s to stage %s, 
segment interval: %d(%s), total ttl needs: %d(%s)",
-                       g.Metadata.Name, st.Name, nst.Name, 
segmentInterval.Num, segmentInterval.Unit.String(), ttlTime.Num, 
ttlTime.Unit.String())
+               // Clone before exposing through GroupConfig so callers cannot 
mutate
+               // the shared proto Stages[*] sub-objects.
+               segmentInterval = cloneIntervalRule(st.SegmentInterval)
+               targetSegmentInterval = cloneIntervalRule(nst.SegmentInterval)
+               l.Info().Msgf("migrating group %s at stage %s to stage %s, 
source segment interval: %d(%s), target segment interval: %d(%s), total ttl 
needs: %d(%s)",
+                       g.Metadata.Name, st.Name, nst.Name,
+                       segmentInterval.Num, segmentInterval.Unit.String(),

Review Comment:
   In the matched-stage path, nst := ro.Stages[i+1] is used to populate/log 
targetSegmentInterval, but nst.SegmentInterval is not validated before 
cloneIntervalRule() and targetSegmentInterval.Num/Unit.String() are 
dereferenced. If the next stage is missing segment_interval, this will panic 
instead of returning a validation error; validate nst.SegmentInterval (and 
avoid dereferencing when nil) before logging/returning GroupConfig.



##########
banyand/backup/lifecycle/steps.go:
##########
@@ -113,10 +130,20 @@ func parseGroup(
        if len(ro.Stages) == 0 {
                return nil, fmt.Errorf("no stages in group %s", g.Metadata.Name)
        }
+       if ro.Ttl == nil {
+               return nil, fmt.Errorf("group %s: missing ttl", g.Metadata.Name)
+       }

Review Comment:
   parseGroup clones ro.SegmentInterval into segmentInterval but doesn't 
validate that ro.SegmentInterval is non-nil. segmentInterval is later 
dereferenced in the defaulting log (and returned in GroupConfig), so an invalid 
Group config could still trigger a nil-pointer panic here; consider validating 
ro.SegmentInterval (similar to ro.Ttl) and returning a descriptive error.
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] Fix lifecycle migration segment shorter than the configured `SegmentInterval` [skywalking-banyandb]

Reply via email to