This is an automated email from the ASF dual-hosted git repository. hanahmily pushed a commit to branch labeled-alarm in repository https://gitbox.apache.org/repos/asf/skywalking.git
commit 2f4554fa5bc313f00727c0f1b29e85f35b8fc5f7 Author: Gao Hongtao <hanahm...@gmail.com> AuthorDate: Sun Aug 9 09:15:55 2020 +0800 Add labeled metrics to alarm system Signed-off-by: Gao Hongtao <hanahm...@gmail.com> --- docs/en/setup/backend/backend-alarm.md | 16 +++ .../oap/server/core/alarm/provider/AlarmRule.java | 4 + .../core/alarm/provider/MetricsValueType.java | 2 +- .../server/core/alarm/provider/RulesReader.java | 4 + .../server/core/alarm/provider/RunningRule.java | 121 +++++++++++++++------ .../core/alarm/provider/RunningRuleTest.java | 111 +++++++++++++++++++ .../core/analysis/meter/function/AvgFunction.java | 3 +- .../meter/function/AvgLabeledFunction.java | 3 +- .../core/analysis/metrics/LabeledValueHolder.java} | 9 +- 9 files changed, 233 insertions(+), 40 deletions(-) diff --git a/docs/en/setup/backend/backend-alarm.md b/docs/en/setup/backend/backend-alarm.md index 387b193..9224022 100644 --- a/docs/en/setup/backend/backend-alarm.md +++ b/docs/en/setup/backend/backend-alarm.md @@ -24,6 +24,12 @@ Alarm rule is constituted by following keys - **Exclude names**. The following entity names are excluded in this rule. Please follow [Entity name define](#entity-name). - **Include names regex**. Provide a regex to include the entity names. If both setting the include name list and include name regex, both rules will take effect. - **Exclude names regex**. Provide a regex to exclude the exclude names. If both setting the exclude name list and exclude name regex, both rules will take effect. +- **Include labels**. The following labels of the metric are included in this rule. +- **Exclude labels**. The following labels of the metric are excluded in this rule. +- **Include labels regex**. Provide a regex to include labels. If both setting the include label list and include label regex, both rules will take effect. +- **Exclude labels regex**. Provide a regex to exclude labels. 
If both setting the exclude label list and exclude label regex, both rules will take effect. +*The label settings are required by the meter-system, which is intended to store metrics from label-based platforms such as Prometheus, Micrometer, etc. +A function that supports the above four settings should implement `LabeledValueHolder`.* - **Threshold**. The target value. For multiple values metrics, such as **percentile**, the threshold is an array. Described like `value1, value2, value3, value4, value5`. Each value could be the threshold for each value of the metrics. Set the value to `-` if you don't want to trigger alarm by this or some of the values. @@ -75,6 +81,16 @@ rules: count: 3 silence-period: 5 message: Percentile response time of service {name} alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 1000, p75 > 1000, p90 > 1000, p95 > 1000, p99 > 1000 + meter_service_status_code_rule: + metrics-name: meter_status_code + exclude-labels: + - "200" + op: ">" + threshold: 10 + period: 10 + count: 3 + silence-period: 5 + message: The request number of entity {name} non-200 status is more than expected.
``` ### Default alarm rules diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmRule.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmRule.java index f895972..9667ba7 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmRule.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmRule.java @@ -41,6 +41,10 @@ public class AlarmRule { private String includeNamesRegex; private ArrayList<String> excludeNames; private String excludeNamesRegex; + private ArrayList<String> includeLabels; + private String includeLabelsRegex; + private ArrayList<String> excludeLabels; + private String excludeLabelsRegex; private String threshold; private String op; private int period; diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/MetricsValueType.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/MetricsValueType.java index 040b693..23ad115 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/MetricsValueType.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/MetricsValueType.java @@ -19,5 +19,5 @@ package org.apache.skywalking.oap.server.core.alarm.provider; public enum MetricsValueType { - LONG, INT, DOUBLE, MULTI_INTS + LONG, INT, DOUBLE, LABELED_LONG, MULTI_INTS } diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RulesReader.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RulesReader.java index 371f6c3..e1c1adf 100644 --- 
a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RulesReader.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RulesReader.java @@ -65,6 +65,10 @@ public class RulesReader { alarmRule.setExcludeNames((ArrayList) settings.getOrDefault("exclude-names", new ArrayList(0))); alarmRule.setIncludeNamesRegex((String) settings.getOrDefault("include-names-regex", "")); alarmRule.setExcludeNamesRegex((String) settings.getOrDefault("exclude-names-regex", "")); + alarmRule.setIncludeLabels((ArrayList) settings.getOrDefault("include-labels", new ArrayList(0))); + alarmRule.setExcludeLabels((ArrayList) settings.getOrDefault("exclude-labels", new ArrayList(0))); + alarmRule.setIncludeLabelsRegex((String) settings.getOrDefault("include-labels-regex", "")); + alarmRule.setExcludeLabelsRegex((String) settings.getOrDefault("exclude-labels-regex", "")); alarmRule.setThreshold(settings.get("threshold").toString()); alarmRule.setOp((String) settings.get("op")); alarmRule.setPeriod((Integer) settings.getOrDefault("period", 1)); diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java index 00d6146..9b97598 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java @@ -20,6 +20,7 @@ package org.apache.skywalking.oap.server.core.alarm.provider; import java.util.ArrayList; import java.util.Arrays; +import java.util.Comparator; import java.util.LinkedList; import java.util.List; import java.util.Map; @@ -33,8 +34,10 @@ import lombok.extern.slf4j.Slf4j; import org.apache.skywalking.apm.util.StringUtil; import 
org.apache.skywalking.oap.server.core.alarm.AlarmMessage; import org.apache.skywalking.oap.server.core.alarm.MetaInAlarm; +import org.apache.skywalking.oap.server.core.analysis.metrics.DataTable; import org.apache.skywalking.oap.server.core.analysis.metrics.DoubleValueHolder; import org.apache.skywalking.oap.server.core.analysis.metrics.IntValueHolder; +import org.apache.skywalking.oap.server.core.analysis.metrics.LabeledValueHolder; import org.apache.skywalking.oap.server.core.analysis.metrics.LongValueHolder; import org.apache.skywalking.oap.server.core.analysis.metrics.Metrics; import org.apache.skywalking.oap.server.core.analysis.metrics.MultiIntValuesHolder; @@ -64,6 +67,10 @@ public class RunningRule { private final List<String> excludeNames; private final Pattern includeNamesRegex; private final Pattern excludeNamesRegex; + private final List<String> includeLabels; + private final List<String> excludeLabels; + private final Pattern includeLabelsRegex; + private final Pattern excludeLabelsRegex; private final AlarmMessageFormatter formatter; public RunningRule(AlarmRule alarmRule) { @@ -87,6 +94,12 @@ public class RunningRule { Pattern.compile(alarmRule.getIncludeNamesRegex()) : null; this.excludeNamesRegex = StringUtil.isNotEmpty(alarmRule.getExcludeNamesRegex()) ? Pattern.compile(alarmRule.getExcludeNamesRegex()) : null; + this.includeLabels = alarmRule.getIncludeLabels(); + this.excludeLabels = alarmRule.getExcludeLabels(); + this.includeLabelsRegex = StringUtil.isNotEmpty(alarmRule.getIncludeLabelsRegex()) ? + Pattern.compile(alarmRule.getIncludeLabelsRegex()) : null; + this.excludeLabelsRegex = StringUtil.isNotEmpty(alarmRule.getExcludeLabelsRegex()) ? 
+ Pattern.compile(alarmRule.getExcludeLabelsRegex()) : null; this.formatter = new AlarmMessageFormatter(alarmRule.getMessage()); } @@ -107,40 +120,8 @@ public class RunningRule { } final String metaName = meta.getName(); - if (CollectionUtils.isNotEmpty(includeNames)) { - if (!includeNames.contains(metaName)) { - if (log.isTraceEnabled()) { - log.trace("{} isn't in the including list {}", metaName, includeNames); - } - return; - } - } - - if (CollectionUtils.isNotEmpty(excludeNames)) { - if (excludeNames.contains(metaName)) { - if (log.isTraceEnabled()) { - log.trace("{} is in the excluding list {}", metaName, excludeNames); - } - return; - } - } - - if (includeNamesRegex != null) { - if (!includeNamesRegex.matcher(metaName).matches()) { - if (log.isTraceEnabled()) { - log.trace("{} doesn't match the include regex {}", metaName, includeNamesRegex); - } - return; - } - } - - if (excludeNamesRegex != null) { - if (excludeNamesRegex.matcher(metaName).matches()) { - if (log.isTraceEnabled()) { - log.trace("{} matches the exclude regex {}", metaName, excludeNamesRegex); - } - return; - } + if (!validate(metaName, includeNames, excludeNames, includeNamesRegex, excludeNamesRegex)) { + return; } if (valueType == null) { @@ -156,6 +137,18 @@ public class RunningRule { } else if (metrics instanceof MultiIntValuesHolder) { valueType = MetricsValueType.MULTI_INTS; threshold.setType(MetricsValueType.MULTI_INTS); + } else if (metrics instanceof LabeledValueHolder) { + if (((LabeledValueHolder) metrics).getValue().keys().stream() + .noneMatch(label -> validate( + label, + includeLabels, + excludeLabels, + includeLabelsRegex, + excludeLabelsRegex))) { + return; + } + valueType = MetricsValueType.LABELED_LONG; + threshold.setType(MetricsValueType.LONG); } else { log.warn("Unsupported value type {}", valueType); return; @@ -168,6 +161,46 @@ public class RunningRule { } } + private boolean validate(String target, List<String> includeList, List<String> excludeList, + Pattern 
includeRegex, Pattern excludeRegex) { + if (CollectionUtils.isNotEmpty(includeList)) { + if (!includeList.contains(target)) { + if (log.isTraceEnabled()) { + log.trace("{} isn't in the including list {}", target, includeList); + } + return false; + } + } + + if (CollectionUtils.isNotEmpty(excludeList)) { + if (excludeList.contains(target)) { + if (log.isTraceEnabled()) { + log.trace("{} is in the excluding list {}", target, excludeList); + } + return false; + } + } + + if (includeRegex != null) { + if (!includeRegex.matcher(target).matches()) { + if (log.isTraceEnabled()) { + log.trace("{} doesn't match the include regex {}", target, includeRegex); + } + return false; + } + } + + if (excludeRegex != null) { + if (excludeRegex.matcher(target).matches()) { + if (log.isTraceEnabled()) { + log.trace("{} matches the exclude regex {}", target, excludeRegex); + } + return false; + } + } + return true; + } + /** * Move the buffer window to give time. * @@ -365,6 +398,20 @@ public class RunningRule { } } break; + case LABELED_LONG: + DataTable values = ((LabeledValueHolder) metrics).getValue(); + lexpected = RunningRule.this.threshold.getLongThreshold(); + if (values.keys().stream().anyMatch(label -> + validate( + label, + RunningRule.this.includeLabels, + RunningRule.this.excludeLabels, + RunningRule.this.includeLabelsRegex, + RunningRule.this.excludeLabelsRegex) + && op.test(lexpected, values.get(label)))) { + matchCount++; + } + break; } } @@ -404,6 +451,11 @@ public class RunningRule { int[] iArr = ((MultiIntValuesHolder) m).getValues(); r.add(new TraceLogMetric(m.getTimeBucket(), Arrays.stream(iArr).boxed().toArray(Number[]::new))); break; + case LABELED_LONG: + DataTable dt = ((LabeledValueHolder) m).getValue(); + TraceLogMetric l = new TraceLogMetric(m.getTimeBucket(), dt.sortedValues(Comparator.naturalOrder()).toArray(new Number[0])); + l.labels = dt.sortedKeys(Comparator.naturalOrder()).toArray(new String[0]); + r.add(l); } }); return r; @@ -414,5 +466,6 @@ public 
class RunningRule { private static class TraceLogMetric { private final long timeBucket; private final Number[] value; + private String[] labels; } } diff --git a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java index 53e1883..42c5b00 100644 --- a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java +++ b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java @@ -23,11 +23,15 @@ import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Objects; +import lombok.Getter; +import lombok.Setter; import org.apache.skywalking.oap.server.core.Const; import org.apache.skywalking.oap.server.core.alarm.AlarmCallback; import org.apache.skywalking.oap.server.core.alarm.AlarmMessage; import org.apache.skywalking.oap.server.core.alarm.MetaInAlarm; +import org.apache.skywalking.oap.server.core.analysis.metrics.DataTable; import org.apache.skywalking.oap.server.core.analysis.metrics.IntValueHolder; +import org.apache.skywalking.oap.server.core.analysis.metrics.LabeledValueHolder; import org.apache.skywalking.oap.server.core.analysis.metrics.Metrics; import org.apache.skywalking.oap.server.core.analysis.metrics.MultiIntValuesHolder; import org.apache.skywalking.oap.server.core.remote.grpc.proto.RemoteData; @@ -150,6 +154,22 @@ public class RunningRuleTest { } @Test + public void testLabeledAlarm() { + AlarmRule alarmRule = new AlarmRule(); + alarmRule.setIncludeLabels(Lists.newArrayList("95", "99")); + assertLabeled(alarmRule); + alarmRule = new AlarmRule(); + alarmRule.setIncludeLabelsRegex("9\\d{1}"); + assertLabeled(alarmRule); + alarmRule = new AlarmRule(); + alarmRule.setExcludeLabels(Lists.newArrayList("50", "75")); + 
assertLabeled(alarmRule); + alarmRule = new AlarmRule(); + alarmRule.setExcludeLabelsRegex("^[5-7][0-9]$"); + assertLabeled(alarmRule); + } + + @Test public void testNoAlarm() { AlarmRule alarmRule = new AlarmRule(); alarmRule.setAlarmRuleName("endpoint_percent_rule"); @@ -386,6 +406,13 @@ public class RunningRuleTest { } + private Metrics getLabeledValueMetrics(long timeBucket, String values) { + MockLabeledValueMetrics mockLabeledValueMetrics = new MockLabeledValueMetrics(); + mockLabeledValueMetrics.setValue(new DataTable(values)); + mockLabeledValueMetrics.setTimeBucket(timeBucket); + return mockLabeledValueMetrics; + } + private class MockMetrics extends Metrics implements IntValueHolder { private int value; @@ -491,4 +518,88 @@ public class RunningRuleTest { return null; } } + + private class MockLabeledValueMetrics extends Metrics implements LabeledValueHolder { + + @Getter + @Setter + private DataTable value; + + @Override + public String id() { + return null; + } + + @Override + public void combine(Metrics metrics) { + + } + + @Override + public void calculate() { + + } + + @Override + public Metrics toHour() { + return null; + } + + @Override + public Metrics toDay() { + return null; + } + + @Override + public int remoteHashCode() { + return 0; + } + + @Override + public void deserialize(RemoteData remoteData) { + + } + + @Override + public RemoteData.Builder serialize() { + return null; + } + } + + private void assertLabeled(AlarmRule alarmRule) { + alarmRule.setAlarmRuleName("endpoint_percent_alarm_rule"); + alarmRule.setMetricsName("endpoint_percent"); + alarmRule.setOp(">"); + alarmRule.setThreshold("10"); + alarmRule.setCount(3); + alarmRule.setPeriod(15); + alarmRule.setMessage("response percentile of endpoint {name} is lower than expected value"); + + RunningRule runningRule = new RunningRule(alarmRule); + LocalDateTime startTime = TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301440"); + + long timeInPeriod1 = 201808301434L; + long timeInPeriod2 
= 201808301436L; + long timeInPeriod3 = 201808301438L; + + runningRule.in(getMetaInAlarm(123), getLabeledValueMetrics(timeInPeriod1, "50,17|99,11")); + runningRule.in(getMetaInAlarm(123), getLabeledValueMetrics(timeInPeriod2, "75,15|95,12")); + runningRule.in(getMetaInAlarm(123), getLabeledValueMetrics(timeInPeriod3, "90,1|99,20")); + + // check at 201808301440 + List<AlarmMessage> alarmMessages = runningRule.check(); + Assert.assertEquals(0, alarmMessages.size()); + runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441")); + // check at 201808301441 + alarmMessages = runningRule.check(); + Assert.assertEquals(0, alarmMessages.size()); + runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301442")); + // check at 201808301442 + alarmMessages = runningRule.check(); + Assert.assertEquals(1, alarmMessages.size()); + Assert.assertEquals( + "response percentile of endpoint Service_123 is lower than expected value", alarmMessages.get(0) + .getAlarmMessage()); + + } } diff --git a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/analysis/meter/function/AvgFunction.java b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/analysis/meter/function/AvgFunction.java index 4263221..9b151c7 100644 --- a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/analysis/meter/function/AvgFunction.java +++ b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/analysis/meter/function/AvgFunction.java @@ -28,6 +28,7 @@ import org.apache.skywalking.oap.server.core.Const; import org.apache.skywalking.oap.server.core.UnexpectedException; import org.apache.skywalking.oap.server.core.analysis.manual.instance.InstanceTraffic; import org.apache.skywalking.oap.server.core.analysis.meter.MeterEntity; +import org.apache.skywalking.oap.server.core.analysis.metrics.LongValueHolder; import org.apache.skywalking.oap.server.core.analysis.metrics.Metrics; import 
org.apache.skywalking.oap.server.core.analysis.metrics.annotation.ConstOne; import org.apache.skywalking.oap.server.core.analysis.metrics.annotation.Entrance; @@ -39,7 +40,7 @@ import org.apache.skywalking.oap.server.core.storage.annotation.Column; @MeterFunction(functionName = "avg") @ToString -public abstract class AvgFunction extends Metrics implements AcceptableValue<Long> { +public abstract class AvgFunction extends Metrics implements AcceptableValue<Long>, LongValueHolder { protected static final String SUMMATION = "summation"; protected static final String COUNT = "count"; protected static final String VALUE = "value"; diff --git a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/analysis/meter/function/AvgLabeledFunction.java b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/analysis/meter/function/AvgLabeledFunction.java index 84739e3..c617bf9 100644 --- a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/analysis/meter/function/AvgLabeledFunction.java +++ b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/analysis/meter/function/AvgLabeledFunction.java @@ -30,6 +30,7 @@ import org.apache.skywalking.oap.server.core.UnexpectedException; import org.apache.skywalking.oap.server.core.analysis.manual.instance.InstanceTraffic; import org.apache.skywalking.oap.server.core.analysis.meter.MeterEntity; import org.apache.skywalking.oap.server.core.analysis.metrics.DataTable; +import org.apache.skywalking.oap.server.core.analysis.metrics.LabeledValueHolder; import org.apache.skywalking.oap.server.core.analysis.metrics.Metrics; import org.apache.skywalking.oap.server.core.remote.grpc.proto.RemoteData; import org.apache.skywalking.oap.server.core.storage.StorageBuilder; @@ -37,7 +38,7 @@ import org.apache.skywalking.oap.server.core.storage.annotation.Column; @MeterFunction(functionName = "avgLabeled") @ToString -public abstract class AvgLabeledFunction extends Metrics 
implements AcceptableValue<DataTable> { +public abstract class AvgLabeledFunction extends Metrics implements AcceptableValue<DataTable>, LabeledValueHolder { protected static final String SUMMATION = "summation"; protected static final String COUNT = "count"; protected static final String VALUE = "value"; diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/MetricsValueType.java b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/analysis/metrics/LabeledValueHolder.java similarity index 80% copy from oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/MetricsValueType.java copy to oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/analysis/metrics/LabeledValueHolder.java index 040b693..208ae57 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/MetricsValueType.java +++ b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/analysis/metrics/LabeledValueHolder.java @@ -16,8 +16,11 @@ * */ -package org.apache.skywalking.oap.server.core.alarm.provider; +package org.apache.skywalking.oap.server.core.analysis.metrics; -public enum MetricsValueType { - LONG, INT, DOUBLE, MULTI_INTS +/** + * LabeledValueHolder holds a list of key-value pair. + */ +public interface LabeledValueHolder { + DataTable getValue(); }