This is an automated email from the ASF dual-hosted git repository.
wusheng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/skywalking.git
The following commit(s) were added to refs/heads/master by this push:
new 062b31f4ba Fix AlarmRule expression validation: add labeled metrics
mock data for check. (#11437)
062b31f4ba is described below
commit 062b31f4ba5586c33f301dd4052f602894b784c4
Author: Wan Kai <[email protected]>
AuthorDate: Fri Oct 20 10:45:15 2023 +0800
Fix AlarmRule expression validation: add labeled metrics mock data for
check. (#11437)
---
docs/en/changes/changes.md | 1 +
.../oap/server/core/alarm/provider/AlarmRule.java | 18 +-----
.../provider/expr/rt/AlarmMQEVerifyVisitor.java | 66 ++++++++++++++++++++--
.../server/core/alarm/provider/AlarmRuleTest.java | 18 ++++--
test/e2e-v2/cases/alarm/alarm-settings.yml | 11 ++--
.../alarm/expected/silence-after-graphql-warn.yml | 48 ++++++++++++++++
.../alarm/expected/silence-before-graphql-warn.yml | 24 ++++++++
7 files changed, 156 insertions(+), 30 deletions(-)
diff --git a/docs/en/changes/changes.md b/docs/en/changes/changes.md
index a027a04a80..cebc12bf8c 100644
--- a/docs/en/changes/changes.md
+++ b/docs/en/changes/changes.md
@@ -25,6 +25,7 @@
* Fix `AlarmCore` doAlarm: catch exception for each callback to avoid
interruption.
* Optimize queryBasicTraces in TraceQueryEsDAO.
* Fix `WebhookCallback` send incorrect messages, add catch exception for each
callback HTTP Post.
+* Fix AlarmRule expression validation: add labeled metrics mock data for check.
#### UI
diff --git
a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmRule.java
b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmRule.java
index 5094957265..76c6e6cc1a 100644
---
a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmRule.java
+++
b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmRule.java
@@ -21,7 +21,6 @@ package org.apache.skywalking.oap.server.core.alarm.provider;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Map;
-import java.util.Optional;
import java.util.Set;
import lombok.AccessLevel;
import lombok.Data;
@@ -39,7 +38,6 @@ import org.apache.skywalking.mqe.rt.grammar.MQEParser;
import org.apache.skywalking.mqe.rt.type.ExpressionResult;
import org.apache.skywalking.mqe.rt.type.ExpressionResultType;
import
org.apache.skywalking.oap.server.core.alarm.provider.expr.rt.AlarmMQEVerifyVisitor;
-import org.apache.skywalking.oap.server.core.storage.annotation.Column;
import
org.apache.skywalking.oap.server.core.storage.annotation.ValueColumnMetadata;
import org.apache.skywalking.oap.server.library.util.StringUtil;
@@ -95,25 +93,11 @@ public class AlarmRule {
private void verifyIncludeMetrics(Set<String> includeMetrics, String
expression) throws IllegalExpressionException {
Set<String> scopeSet = new HashSet<>();
for (String metricName : includeMetrics) {
- Optional<ValueColumnMetadata.ValueColumn> valueColumn =
ValueColumnMetadata.INSTANCE.readValueColumnDefinition(
- metricName);
- if (valueColumn.isEmpty()) {
- throw new IllegalExpressionException("Metric: [" + metricName
+ "] dose not exist.");
- }
scopeSet.add(ValueColumnMetadata.INSTANCE.getScope(metricName).name());
- Column.ValueDataType dataType = valueColumn.get().getDataType();
- switch (dataType) {
- case COMMON_VALUE:
- case LABELED_VALUE:
- break;
- default:
- throw new IllegalExpressionException(
- "Metric dose not supported in alarm, metric: [" +
metricName + "] is not a common or labeled metric.");
- }
}
if (scopeSet.size() != 1) {
throw new IllegalExpressionException(
- "The metrics in expression: " + expression + " must have the
same scope level, but got: " + scopeSet);
+ "The metrics in expression: " + expression + " must have the
same scope level, but got: " + scopeSet + ".");
}
}
}
diff --git
a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/expr/rt/AlarmMQEVerifyVisitor.java
b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/expr/rt/AlarmMQEVerifyVisitor.java
index 78c93afdd7..c6c556f9c8 100644
---
a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/expr/rt/AlarmMQEVerifyVisitor.java
+++
b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/expr/rt/AlarmMQEVerifyVisitor.java
@@ -18,7 +18,12 @@
package org.apache.skywalking.oap.server.core.alarm.provider.expr.rt;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
import java.util.HashSet;
+import java.util.List;
+import java.util.Optional;
import java.util.Set;
import lombok.Getter;
import lombok.extern.slf4j.Slf4j;
@@ -28,6 +33,13 @@ import org.apache.skywalking.mqe.rt.type.ExpressionResult;
import org.apache.skywalking.mqe.rt.type.ExpressionResultType;
import org.apache.skywalking.mqe.rt.type.MQEValue;
import org.apache.skywalking.mqe.rt.type.MQEValues;
+import org.apache.skywalking.mqe.rt.type.Metadata;
+import org.apache.skywalking.oap.server.core.Const;
+import org.apache.skywalking.oap.server.core.query.type.KeyValue;
+import org.apache.skywalking.oap.server.core.storage.annotation.Column;
+import
org.apache.skywalking.oap.server.core.storage.annotation.ValueColumnMetadata;
+import org.apache.skywalking.oap.server.library.util.CollectionUtils;
+import org.apache.skywalking.oap.server.library.util.StringUtil;
/**
* Used for verify the alarm expression and get the metrics name when read the
alarm rules.
@@ -41,6 +53,14 @@ public class AlarmMQEVerifyVisitor extends MQEVisitorBase {
public ExpressionResult visitMetric(MQEParser.MetricContext ctx) {
ExpressionResult result = new ExpressionResult();
String metricName = ctx.metricName().getText();
+ Optional<ValueColumnMetadata.ValueColumn> valueColumn =
ValueColumnMetadata.INSTANCE.readValueColumnDefinition(
+ metricName);
+ if (valueColumn.isEmpty()) {
+ result.setType(ExpressionResultType.UNKNOWN);
+ result.setError("Metric: [" + metricName + "] dose not exist.");
+ return result;
+ }
+
this.includeMetrics.add(metricName);
if (ctx.parent instanceof MQEParser.TopNOPContext) {
@@ -48,13 +68,49 @@ public class AlarmMQEVerifyVisitor extends MQEVisitorBase {
result.setError("Unsupported operation: [top_n] in alarm
expression.");
return result;
}
+ Column.ValueDataType dataType = valueColumn.get().getDataType();
- MQEValues mqeValues = new MQEValues();
+ MQEValues mockMqeValues = new MQEValues();
MQEValue mqeValue = new MQEValue();
mqeValue.setEmptyValue(true);
- mqeValues.getValues().add(mqeValue);
- result.getResults().add(mqeValues);
- result.setType(ExpressionResultType.TIME_SERIES_VALUES);
- return result;
+ mockMqeValues.getValues().add(mqeValue);
+ if (dataType == Column.ValueDataType.COMMON_VALUE) {
+ result.getResults().add(mockMqeValues);
+ result.setType(ExpressionResultType.TIME_SERIES_VALUES);
+ return result;
+ } else if (dataType == Column.ValueDataType.LABELED_VALUE) {
+ List<String> labelValues = Collections.emptyList();
+ if (ctx.label() != null) {
+ String labelValue = ctx.label().labelValue().getText();
+ String labelValueTrim = labelValue.substring(1,
labelValue.length() - 1);
+ if (StringUtil.isNotBlank(labelValueTrim)) {
+ labelValues =
Arrays.asList(labelValueTrim.split(Const.COMMA));
+ }
+ }
+ ArrayList<MQEValues> mqeValuesList = new ArrayList<>();
+ if (CollectionUtils.isEmpty(labelValues)) {
+ KeyValue label = new KeyValue(GENERAL_LABEL_NAME,
GENERAL_LABEL_NAME);
+ Metadata metadata = new Metadata();
+ metadata.getLabels().add(label);
+ mockMqeValues.setMetric(metadata);
+ mqeValuesList.add(mockMqeValues);
+ } else {
+ for (String value : labelValues) {
+ Metadata metadata = new Metadata();
+ KeyValue label = new KeyValue(GENERAL_LABEL_NAME, value);
+ metadata.getLabels().add(label);
+ mockMqeValues.setMetric(metadata);
+ mqeValuesList.add(mockMqeValues);
+ }
+ }
+ result.setType(ExpressionResultType.TIME_SERIES_VALUES);
+ result.setResults(mqeValuesList);
+ result.setLabeledResult(true);
+ return result;
+ } else {
+ result.setType(ExpressionResultType.UNKNOWN);
+ result.setError("Metric dose not supported in alarm, metric: [" +
metricName + "] is not a common or labeled metric.");
+ return result;
+ }
}
}
diff --git
a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmRuleTest.java
b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmRuleTest.java
index 098d1d3592..64e353587a 100644
---
a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmRuleTest.java
+++
b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmRuleTest.java
@@ -41,6 +41,10 @@ public class AlarmRuleTest {
"endpoint_percent", "testColumn",
Column.ValueDataType.COMMON_VALUE, Function.Avg, 0,
Scope.Endpoint.getScopeId()
);
+ ValueColumnMetadata.INSTANCE.putIfAbsent(
+ "meter_status_code", "testColumn",
Column.ValueDataType.LABELED_VALUE, Function.Avg, 0,
+ Scope.Service.getScopeId()
+ );
ValueColumnMetadata.INSTANCE.putIfAbsent(
"record", "testColumn", Column.ValueDataType.SAMPLED_RECORD,
Function.Avg, 0,
Scope.Endpoint.getScopeId()
@@ -58,9 +62,15 @@ public class AlarmRuleTest {
@Test
public void testExpressionVerify() throws IllegalExpressionException {
AlarmRule rule = new AlarmRule();
- //normal
+ //normal common metric
rule.setExpression("sum(service_percent < 85) >= 3");
-
+ //normal labeled metric
+ //4xx + 5xx > 10
+
rule.setExpression("sum(aggregate_labels(meter_status_code{_='4xx,5xx'},sum) >
10) > 3");
+ rule.setExpression("sum(aggregate_labels(meter_status_code,sum) > 10)
> 3");
+ //4xx or 5xx > 10
+ rule.setExpression("sum(meter_status_code{_='4xx,5xx'} > 10) >= 3");
+ rule.setExpression("sum(meter_status_code > 10) >= 3");
//illegal expression
Assertions.assertThrows(IllegalExpressionException.class, () -> {
rule.setExpression("what? sum(service_percent < 85) >= 3");
@@ -68,7 +78,7 @@ public class AlarmRuleTest {
//not exist metric
Assertions.assertEquals(
- "Metric: [service_percent111] dose not exist.",
+ "Expression: sum(service_percent111 < 85) >= 3 error: Metric:
[service_percent111] dose not exist.",
Assertions.assertThrows(IllegalExpressionException.class, () -> {
rule.setExpression("sum(service_percent111 < 85) >= 3");
}).getMessage()
@@ -92,7 +102,7 @@ public class AlarmRuleTest {
//not a common or labeled metric
Assertions.assertEquals(
- "Metric dose not supported in alarm, metric: [record] is not a
common or labeled metric.",
+ "Expression: sum(record < 85) > 1 error: Metric dose not supported
in alarm, metric: [record] is not a common or labeled metric.",
Assertions.assertThrows(IllegalExpressionException.class, () -> {
rule.setExpression("sum(record < 85) > 1");
}).getMessage()
diff --git a/test/e2e-v2/cases/alarm/alarm-settings.yml
b/test/e2e-v2/cases/alarm/alarm-settings.yml
index 4046d44740..40df6f0a75 100755
--- a/test/e2e-v2/cases/alarm/alarm-settings.yml
+++ b/test/e2e-v2/cases/alarm/alarm-settings.yml
@@ -25,12 +25,15 @@ rules:
receivers: lisi
hooks:
- webhook.custom
- # service sla > 1%
- service_sla_rule:
- expression: sum(service_sla > 100) >= 1
+ # service_percentile > 10ms
+ service_percentile_rule:
+ expression: sum(service_percentile{_='0,1,2,3,4'} > 10) >= 3
period: 10
silence-period: 1
- message: Successful rate of service {name} is more than 1% in 1 minutes of
last 10 minutes
+ message: Percentile response time of service {name} alarm in 3 minutes of
last 10 minutes, due to more than one condition of p50 > 10, p75 > 10, p90 >
10, p95 > 10, p99 > 10.
+ tags:
+ level: WARNING
+ receivers: lisi
hooks:
- webhook.none
comp_rule:
diff --git a/test/e2e-v2/cases/alarm/expected/silence-after-graphql-warn.yml
b/test/e2e-v2/cases/alarm/expected/silence-after-graphql-warn.yml
index 6fd21b956a..641150f9da 100644
--- a/test/e2e-v2/cases/alarm/expected/silence-after-graphql-warn.yml
+++ b/test/e2e-v2/cases/alarm/expected/silence-after-graphql-warn.yml
@@ -15,6 +15,30 @@
msgs:
{{- contains .msgs }}
+ - starttime: {{ gt .starttime 0 }}
+ scope: Service
+ id: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1
+ message: Percentile response time of service e2e-service-provider alarm in
3 minutes of last 10 minutes, due to more than one condition of p50 > 10, p75 >
10, p90 > 10, p95 > 10, p99 > 10.
+ tags:
+ - key: level
+ value: WARNING
+ - key: receivers
+ value: lisi
+ events:
+ {{- contains .events }}
+ - uuid: {{ notEmpty .uuid }}
+ source:
+ service: e2e-service-provider
+ serviceinstance: ""
+ endpoint: ""
+ name: Alarm
+ type: ""
+ message: {{ notEmpty .message }}
+ parameters: []
+ starttime: {{ gt .starttime 0 }}
+ endtime: {{ gt .endtime 0 }}
+ layer: GENERAL
+ {{- end }}
- starttime: {{ gt .starttime 0 }}
scope: Service
id: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1
@@ -39,6 +63,30 @@ msgs:
endtime: {{ gt .endtime 0 }}
layer: GENERAL
{{- end }}
+ - starttime: {{ gt .starttime 0 }}
+ scope: Service
+ id: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1
+ message: Percentile response time of service e2e-service-provider alarm in
3 minutes of last 10 minutes, due to more than one condition of p50 > 10, p75 >
10, p90 > 10, p95 > 10, p99 > 10.
+ tags:
+ - key: level
+ value: WARNING
+ - key: receivers
+ value: lisi
+ events:
+ {{- contains .events }}
+ - uuid: {{ notEmpty .uuid }}
+ source:
+ service: e2e-service-provider
+ serviceinstance: ""
+ endpoint: ""
+ name: Alarm
+ type: ""
+ message: {{ notEmpty .message }}
+ parameters: []
+ starttime: {{ gt .starttime 0 }}
+ endtime: {{ gt .endtime 0 }}
+ layer: GENERAL
+ {{- end }}
- starttime: {{ gt .starttime 0 }}
scope: Service
id: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1
diff --git a/test/e2e-v2/cases/alarm/expected/silence-before-graphql-warn.yml
b/test/e2e-v2/cases/alarm/expected/silence-before-graphql-warn.yml
index 4a6e55462e..e8d5c1ca88 100644
--- a/test/e2e-v2/cases/alarm/expected/silence-before-graphql-warn.yml
+++ b/test/e2e-v2/cases/alarm/expected/silence-before-graphql-warn.yml
@@ -15,6 +15,30 @@
msgs:
{{- contains .msgs }}
+ - starttime: {{ gt .starttime 0 }}
+ scope: Service
+ id: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1
+ message: Percentile response time of service e2e-service-provider alarm in
3 minutes of last 10 minutes, due to more than one condition of p50 > 10, p75 >
10, p90 > 10, p95 > 10, p99 > 10.
+ tags:
+ - key: level
+ value: WARNING
+ - key: receivers
+ value: lisi
+ events:
+ {{- contains .events }}
+ - uuid: {{ notEmpty .uuid }}
+ source:
+ service: e2e-service-provider
+ serviceinstance: ""
+ endpoint: ""
+ name: Alarm
+ type: ""
+ message: {{ notEmpty .message }}
+ parameters: []
+ starttime: {{ gt .starttime 0 }}
+ endtime: {{ gt .endtime 0 }}
+ layer: GENERAL
+ {{- end }}
- starttime: {{ gt .starttime 0 }}
scope: Service
id: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1