This is an automated email from the ASF dual-hosted git repository.

wusheng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/skywalking.git


The following commit(s) were added to refs/heads/master by this push:
     new 6227db1  Save error trace segment even that segment will abandoned by 
sampling mechanism (#5407)
6227db1 is described below

commit 6227db19cd057512e3a97312cdfd0ebf688b74d5
Author: zifeihan <[email protected]>
AuthorDate: Fri Aug 28 20:37:19 2020 +0800

    Save error trace segment even that segment will abandoned by sampling 
mechanism (#5407)
    
    * Save error trace segments, even if the segment is abandoned by the 
server side trace sampling mechanism.
    * Support forceSampleErrorSegment config to control whether error segments 
are force saved.
    
    Co-authored-by: echo <[email protected]>
    Co-authored-by: 吴晟 Wu Sheng <[email protected]>
---
 docs/en/setup/backend/trace-sampling.md            |  7 +++++-
 .../analyzer/provider/AnalyzerModuleConfig.java    |  7 ++++++
 .../parser/listener/SegmentAnalysisListener.java   | 26 ++++++++++++----------
 .../src/main/resources/application.yml             |  1 +
 4 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/docs/en/setup/backend/trace-sampling.md 
b/docs/en/setup/backend/trace-sampling.md
index 6a8ace6..d54df83 100644
--- a/docs/en/setup/backend/trace-sampling.md
+++ b/docs/en/setup/backend/trace-sampling.md
@@ -30,4 +30,9 @@ When you set the rate different, let's say
 And we assume the agents reported all trace segments to backend,
 Then the 35% traces in the global will be collected and saved in storage 
consistent/complete, with all spans.
 20% trace segments, which reported to Backend-Instance**B**, will saved in 
storage, maybe miss some trace segments,
-because they are reported to Backend-Instance**A** and ignored.
\ No newline at end of file
+because they are reported to Backend-Instance**A** and ignored.
+
+# Note
+When sampling is enabled, the actual sample rate could exceed sampleRate, 
because by default (forceSampleErrorSegment is true) all error segments are 
saved, even though their upstream and downstream segments may not be sampled. 
This feature makes sure you have the error stacks and segments, but doesn't 
guarantee you have the whole trace.
+
+Also, as a side effect, if most of the accesses fail, the sampling rate 
would be close to 100%, which could crash the backend or storage 
clusters.
diff --git 
a/oap-server/analyzer/agent-analyzer/src/main/java/org/apache/skywalking/oap/server/analyzer/provider/AnalyzerModuleConfig.java
 
b/oap-server/analyzer/agent-analyzer/src/main/java/org/apache/skywalking/oap/server/analyzer/provider/AnalyzerModuleConfig.java
index 84d3b6d..4a658d0 100644
--- 
a/oap-server/analyzer/agent-analyzer/src/main/java/org/apache/skywalking/oap/server/analyzer/provider/AnalyzerModuleConfig.java
+++ 
b/oap-server/analyzer/agent-analyzer/src/main/java/org/apache/skywalking/oap/server/analyzer/provider/AnalyzerModuleConfig.java
@@ -78,4 +78,11 @@ public class AnalyzerModuleConfig extends ModuleConfig {
 
     @Getter
     private final String configPath = "meter-receive-config";
+
+    /**
+     * Sample the trace segment if the segment has span(s) tagged as error 
status, and ignore the sampleRate configuration.
+     */
+    @Setter
+    @Getter
+    private boolean forceSampleErrorSegment = true;
 }
diff --git 
a/oap-server/analyzer/agent-analyzer/src/main/java/org/apache/skywalking/oap/server/analyzer/provider/trace/parser/listener/SegmentAnalysisListener.java
 
b/oap-server/analyzer/agent-analyzer/src/main/java/org/apache/skywalking/oap/server/analyzer/provider/trace/parser/listener/SegmentAnalysisListener.java
index 9f4afdc..d46b427 100644
--- 
a/oap-server/analyzer/agent-analyzer/src/main/java/org/apache/skywalking/oap/server/analyzer/provider/trace/parser/listener/SegmentAnalysisListener.java
+++ 
b/oap-server/analyzer/agent-analyzer/src/main/java/org/apache/skywalking/oap/server/analyzer/provider/trace/parser/listener/SegmentAnalysisListener.java
@@ -47,6 +47,7 @@ import 
org.apache.skywalking.oap.server.library.util.BooleanUtils;
 public class SegmentAnalysisListener implements FirstAnalysisListener, 
EntryAnalysisListener, SegmentListener {
     private final SourceReceiver sourceReceiver;
     private final TraceSegmentSampler sampler;
+    private final boolean forceSampleErrorSegment;
     private final NamingControl namingControl;
     private final List<String> searchableTagKeys;
 
@@ -125,18 +126,6 @@ public class SegmentAnalysisListener implements 
FirstAnalysisListener, EntryAnal
 
     @Override
     public void parseSegment(SegmentObject segmentObject) {
-        if (sampleStatus.equals(SAMPLE_STATUS.UNKNOWN) || 
sampleStatus.equals(SAMPLE_STATUS.IGNORE)) {
-            if (sampler.shouldSample(segmentObject.getTraceId())) {
-                sampleStatus = SAMPLE_STATUS.SAMPLED;
-            } else {
-                sampleStatus = SAMPLE_STATUS.IGNORE;
-            }
-        }
-
-        if (sampleStatus.equals(SAMPLE_STATUS.IGNORE)) {
-            return;
-        }
-
         segment.setTraceId(segmentObject.getTraceId());
         segmentObject.getSpansList().forEach(span -> {
             if (startTimestamp == 0 || startTimestamp > span.getStartTime()) {
@@ -153,6 +142,16 @@ public class SegmentAnalysisListener implements 
FirstAnalysisListener, EntryAnal
         });
         final long accurateDuration = endTimestamp - startTimestamp;
         duration = accurateDuration > Integer.MAX_VALUE ? Integer.MAX_VALUE : 
(int) accurateDuration;
+
+        if (sampleStatus.equals(SAMPLE_STATUS.UNKNOWN) || 
sampleStatus.equals(SAMPLE_STATUS.IGNORE)) {
+            if (sampler.shouldSample(segmentObject.getTraceId())) {
+                sampleStatus = SAMPLE_STATUS.SAMPLED;
+            } else if (isError && forceSampleErrorSegment) {
+                sampleStatus = SAMPLE_STATUS.SAMPLED;
+            } else {
+                sampleStatus = SAMPLE_STATUS.IGNORE;
+            }
+        }
     }
 
     private void appendSearchableTags(SpanObject span) {
@@ -186,6 +185,7 @@ public class SegmentAnalysisListener implements 
FirstAnalysisListener, EntryAnal
     public static class Factory implements AnalysisListenerFactory {
         private final SourceReceiver sourceReceiver;
         private final TraceSegmentSampler sampler;
+        private final boolean forceSampleErrorSegment;
         private final NamingControl namingControl;
         private final List<String> searchTagKeys;
 
@@ -196,6 +196,7 @@ public class SegmentAnalysisListener implements 
FirstAnalysisListener, EntryAnal
                                                              
.getService(ConfigService.class);
             this.searchTagKeys = 
Arrays.asList(configService.getSearchableTracesTags().split(Const.COMMA));
             this.sampler = new 
TraceSegmentSampler(config.getTraceSampleRateWatcher());
+            this.forceSampleErrorSegment = config.isForceSampleErrorSegment();
             this.namingControl = moduleManager.find(CoreModule.NAME)
                                               .provider()
                                               .getService(NamingControl.class);
@@ -206,6 +207,7 @@ public class SegmentAnalysisListener implements 
FirstAnalysisListener, EntryAnal
             return new SegmentAnalysisListener(
                 sourceReceiver,
                 sampler,
+                forceSampleErrorSegment,
                 namingControl,
                 searchTagKeys
             );
diff --git a/oap-server/server-bootstrap/src/main/resources/application.yml 
b/oap-server/server-bootstrap/src/main/resources/application.yml
index d774945..a74ee81 100755
--- a/oap-server/server-bootstrap/src/main/resources/application.yml
+++ b/oap-server/server-bootstrap/src/main/resources/application.yml
@@ -175,6 +175,7 @@ agent-analyzer:
   default:
     sampleRate: ${SW_TRACE_SAMPLE_RATE:10000} # The sample rate precision is 
1/10000. 10000 means 100% sample in default.
     slowDBAccessThreshold: ${SW_SLOW_DB_THRESHOLD:default:200,mongodb:100} # 
The slow database access thresholds. Unit ms.
+    forceSampleErrorSegment: ${SW_FORCE_SAMPLE_ERROR_SEGMENT:true} # When the 
sampling mechanism is active, setting this to true forces error segments to be 
saved regardless of sampleRate. true is default.
 
 receiver-sharing-server:
   selector: ${SW_RECEIVER_SHARING_SERVER:default}

Reply via email to