Copilot commented on code in PR #794:
URL: https://github.com/apache/skywalking-banyandb/pull/794#discussion_r2388601250


##########
banyand/trace/part.go:
##########
@@ -457,14 +457,14 @@ func (mp *memPart) mustInitFromTraces(ts *traces) {
                }
 
                if uncompressedSpansSizeBytes >= maxUncompressedSpanSize || tid != tidPrev {
-                       bsw.MustWriteTrace(tidPrev, ts.spans[indexPrev:i], ts.tags[indexPrev:i], ts.timestamps[indexPrev:i])
+                       bsw.MustWriteTrace(tidPrev, ts.spans[indexPrev:i], ts.tags[indexPrev:i], ts.timestamps[indexPrev:i], ts.spanIDs[indexPrev:i])
                        tidPrev = tid
                        indexPrev = i
                        uncompressedSpansSizeBytes = 0
                }
                uncompressedSpansSizeBytes += uint64(len(ts.spans[i]))
        }
-       bsw.MustWriteTrace(tidPrev, ts.spans[indexPrev:], ts.tags[indexPrev:], ts.timestamps[indexPrev:])
+       bsw.MustWriteTrace(tidPrev, ts.spans[indexPrev:], ts.tags[indexPrev:], ts.timestamps[indexPrev:], ts.spanIDs[indexPrev:])

Review Comment:
   The slice operations `ts.spanIDs[indexPrev:i]` and `ts.spanIDs[indexPrev:]` could panic if `ts.spanIDs` is shorter than expected. There should be a length check ensuring `ts.spanIDs` has at least as many elements as `ts.spans` before slicing.
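
   A minimal sketch of the kind of guard being asked for; the `traces` struct below is a trimmed stand-in for the real one in `banyand/trace`, and `validateSpanIDs` is a hypothetical helper, not code from the PR:

```go
package main

import "fmt"

// traces mirrors only the fields touched in the diff above; the real struct
// in banyand/trace carries more state, so this is purely illustrative.
type traces struct {
	spans   [][]byte
	spanIDs []string
}

// validateSpanIDs is a hypothetical check that could run before the slicing
// in mustInitFromTraces, so a length mismatch fails loudly instead of
// panicking on an out-of-range slice bound.
func validateSpanIDs(ts *traces) error {
	if len(ts.spanIDs) != len(ts.spans) {
		return fmt.Errorf("spanIDs length %d does not match spans length %d",
			len(ts.spanIDs), len(ts.spans))
	}
	return nil
}

func main() {
	ts := &traces{
		spans:   [][]byte{[]byte("a"), []byte("b")},
		spanIDs: []string{"span-1"}, // one ID short on purpose
	}
	if err := validateSpanIDs(ts); err != nil {
		fmt.Println("refusing to build part:", err)
	}
}
```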



##########
banyand/trace/block.go:
##########
@@ -275,7 +283,14 @@ func mustWriteSpansTo(sm *dataBlock, spans [][]byte, spanWriter *writer) {
        defer bigValuePool.Release(bb)
 
        sm.offset = spanWriter.bytesWritten
-       for _, span := range spans {
+       for i, span := range spans {
+               var spanID string
+               if i < len(spanIDs) {
+                       spanID = spanIDs[i]

Review Comment:
   The code allows `spanIDs` to be shorter than `spans`, resulting in empty 
span IDs being written for some spans. This inconsistency could cause issues 
during deserialization. Either ensure `spanIDs` has the same length as `spans` 
or handle the mismatch explicitly.
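
   One way to make the contract explicit is to fail fast on a mismatch instead of silently writing empty span IDs. The callback-based signature below is invented for illustration and is not the actual `mustWriteSpansTo` signature; the panic follows the "must" convention that the existing `MustWrite*` names suggest:

```go
package main

import "fmt"

// mustWriteSpansWithIDs sketches the stricter contract: spanIDs must pair
// one-to-one with spans, otherwise the writer refuses to proceed.
func mustWriteSpansWithIDs(spans [][]byte, spanIDs []string, write func(spanID string, span []byte)) {
	if len(spanIDs) != len(spans) {
		panic(fmt.Sprintf("spanIDs length %d must equal spans length %d", len(spanIDs), len(spans)))
	}
	for i, span := range spans {
		write(spanIDs[i], span)
	}
}

func main() {
	spans := [][]byte{[]byte("span-a"), []byte("span-b")}
	spanIDs := []string{"1", "2"}
	mustWriteSpansWithIDs(spans, spanIDs, func(id string, span []byte) {
		fmt.Printf("writing span %s (%d bytes)\n", id, len(span))
	})
}
```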



##########
pkg/query/logical/trace/trace_plan_distributed.go:
##########
@@ -170,12 +175,19 @@ func (p *distributedPlan) Execute(ctx context.Context) (iter.Iterator[model.Trac
        }
        sortIter := sort.NewItemIter(ct, p.desc)
        var result []*tracev1.InternalTrace
-       seen := make(map[string]bool)
+       seen := make(map[string]*tracev1.InternalTrace)
        for sortIter.Next() {
                trace := sortIter.Val().InternalTrace
-               if !seen[trace.TraceId] {
-                       seen[trace.TraceId] = true
+               if _, ok := seen[trace.TraceId]; !ok {
+                       seen[trace.TraceId] = trace
                        result = append(result, trace)
+               } else {
+                       for _, spanID := range trace.SpanIds {
+                               if !slices.Contains(seen[trace.TraceId].SpanIds, spanID) {
+                                       seen[trace.TraceId].SpanIds = append(seen[trace.TraceId].SpanIds, spanID)
+                                       seen[trace.TraceId].Spans = append(seen[trace.TraceId].Spans, trace.Spans...)
+                               }

Review Comment:
   Line 188 appends all of `trace.Spans` for every unique span ID found, so the same spans get added repeatedly. Only the span corresponding to the current span ID should be appended.
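
   A sketch of the per-span merge being suggested; `internalTrace` is a simplified stand-in for `tracev1.InternalTrace`, and the index pairing between `SpanIds` and `Spans` is an assumption about how the message is laid out:

```go
package main

import (
	"fmt"
	"slices"
)

// internalTrace is a trimmed stand-in for tracev1.InternalTrace; it assumes
// Spans[i] is the payload for SpanIds[i], which the merge below relies on.
type internalTrace struct {
	TraceId string
	SpanIds []string
	Spans   [][]byte
}

// mergeTrace folds the spans of a duplicate trace into the one already seen,
// appending only the span at the same index as each new span ID.
func mergeTrace(existing, incoming *internalTrace) {
	for i, spanID := range incoming.SpanIds {
		if slices.Contains(existing.SpanIds, spanID) {
			continue
		}
		existing.SpanIds = append(existing.SpanIds, spanID)
		existing.Spans = append(existing.Spans, incoming.Spans[i])
	}
}

func main() {
	a := &internalTrace{TraceId: "t1", SpanIds: []string{"s1"}, Spans: [][]byte{[]byte("span-1")}}
	b := &internalTrace{TraceId: "t1", SpanIds: []string{"s1", "s2"}, Spans: [][]byte{[]byte("span-1"), []byte("span-2")}}
	mergeTrace(a, b)
	fmt.Println(a.SpanIds, len(a.Spans)) // [s1 s2] 2
}
```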



##########
banyand/trace/write_standalone.go:
##########
@@ -199,12 +204,17 @@ func processTraces(schemaRepo *schemaRepo, tracesInTable *tracesInTable, writeEv
                        len(is.indexRuleLocators), len(stm.GetSchema().GetTags()))
        }
 
+       return nil
+}
+
+func buildTagsAndMap(stm *trace, tracesInTable *tracesInTable, req *tracev1.WriteRequest) ([]*tagValue, map[string]*tagValue) {
        tags := make([]*tagValue, 0, len(stm.schema.Tags))
        tagMap := make(map[string]*tagValue, len(stm.schema.Tags))
        tagSpecs := stm.GetSchema().GetTags()
+
        for i := range tagSpecs {
                tagSpec := tagSpecs[i]
-               if tagSpec.Name == stm.schema.TraceIdTagName {
+               if tagSpec.Name == stm.schema.TraceIdTagName || tagSpec.Name == stm.schema.SpanIdTagName {

Review Comment:
   The condition skips both trace ID and span ID tags when building the tag 
map, but span ID is still needed for processing. Only trace ID should be 
skipped here, or there should be separate handling for span ID extraction.
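
   A sketch of separate span ID handling with simplified stand-in types; `splitTags` and its signature are hypothetical and only illustrate extracting the span ID value instead of dropping it together with the trace ID:

```go
package main

import "fmt"

// tagValue and schema are simplified stand-ins for the types in banyand/trace.
type tagValue struct {
	tag   string
	value string
}

type schema struct {
	TraceIdTagName string
	SpanIdTagName  string
	TagNames       []string
}

// splitTags keeps ordinary tags, drops the trace ID tag, and captures the
// span ID value so the caller can still use it for further processing.
func splitTags(s schema, values map[string]string) (tags []*tagValue, spanID string) {
	for _, name := range s.TagNames {
		v := values[name]
		switch name {
		case s.TraceIdTagName:
			continue // trace ID is handled elsewhere, not stored as a tag
		case s.SpanIdTagName:
			spanID = v // keep span ID instead of silently skipping it
			continue
		}
		tags = append(tags, &tagValue{tag: name, value: v})
	}
	return tags, spanID
}

func main() {
	s := schema{TraceIdTagName: "trace_id", SpanIdTagName: "span_id", TagNames: []string{"trace_id", "span_id", "service"}}
	tags, spanID := splitTags(s, map[string]string{"trace_id": "t1", "span_id": "s1", "service": "gateway"})
	fmt.Println(len(tags), spanID) // 1 s1
}
```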



##########
pkg/query/logical/trace/trace_plan_tag_filter.go:
##########
@@ -309,3 +320,60 @@ func (t *traceTagFilterPlan) Children() []logical.Plan {
 func (t *traceTagFilterPlan) Schema() logical.Schema {
        return t.s
 }
+
+type spanIDFilter struct {
+       targetSpanIDs []string
+}
+
+func newSpanIDFilter(spanIDs []string) *spanIDFilter {
+       return &spanIDFilter{
+               targetSpanIDs: spanIDs,
+       }
+}
+
+func (sf *spanIDFilter) matchSpanID(spanID string) bool {
+       for _, targetSpanID := range sf.targetSpanIDs {
+               if spanID == targetSpanID {
+                       return true
+               }
+       }
+       return false

Review Comment:
   The linear search through `targetSpanIDs` could be inefficient for large numbers of span IDs. Consider using a `map[string]bool` for O(1) lookups instead of the O(n) linear search.
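
   A sketch of the map-backed variant, using `map[string]struct{}` instead of `map[string]bool` so no value needs to be stored; the surrounding package wiring from `trace_plan_tag_filter.go` is omitted:

```go
package main

import "fmt"

// spanIDFilter mirrors the filter added in the diff above, but backs it with
// a set so matchSpanID is O(1) per lookup.
type spanIDFilter struct {
	targetSpanIDs map[string]struct{}
}

func newSpanIDFilter(spanIDs []string) *spanIDFilter {
	set := make(map[string]struct{}, len(spanIDs))
	for _, id := range spanIDs {
		set[id] = struct{}{}
	}
	return &spanIDFilter{targetSpanIDs: set}
}

func (sf *spanIDFilter) matchSpanID(spanID string) bool {
	_, ok := sf.targetSpanIDs[spanID]
	return ok
}

func main() {
	f := newSpanIDFilter([]string{"s1", "s2"})
	fmt.Println(f.matchSpanID("s2"), f.matchSpanID("s3")) // true false
}
```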



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]
