[skywalking] branch master updated: Fix the configuration of `Aggregation` and `GC Count` metrics for oap self observability(#8707) (#8719)

wusheng Mon, 21 Mar 2022 04:04:15 -0700

This is an automated email from the ASF dual-hosted git repository.

wusheng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/skywalking.git



The following commit(s) were added to refs/heads/master by this push:
     new b9c5283  Fix the configuration of `Aggregation`  and `GC Count` metrics for oap self observability(#8707) (#8719)
b9c5283 is described below

commit b9c5283468321f63a2ab364467120039e321b965
Author: Cool-Coding <[email protected]>
AuthorDate: Mon Mar 21 19:03:39 2022 +0800

    Fix the configuration of `Aggregation`  and `GC Count` metrics for oap self observability(#8707) (#8719)
---
 CHANGES.md                                         |  1 +
 .../main/resources/fetcher-prom-rules/self.yaml    | 16 +++++++--------
 .../src/main/resources/otel-oc-rules/oap.yaml      | 16 +++++++--------
 .../self-observability.yml                         | 12 +++++------
 .../so11y/expected/metrics-has-value-label.yml     | 23 ++++++++++++++++++++++
 test/e2e-v2/cases/so11y/so11y-cases.yaml           |  4 ++--
 6 files changed, 48 insertions(+), 24 deletions(-)

diff --git a/CHANGES.md b/CHANGES.md
index 6227015..cd0ab1b 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -123,6 +123,7 @@ Release Notes.
 * Remove hard requirement of BASE64 encoding for binary field.
 * Add complexity limitation for GraphQL query to avoid malicious query.
 * Add `Column.shardingKeyIdx` for column definition for BanyanDB.
+* Fix the configuration of `Aggregation` and `GC Count` metrics for oap self observability
 
 ```
 Sharding key is used to group time series data per metric of one entity in one 
place (same sharding and/or same 
diff --git 
a/oap-server/server-starter/src/main/resources/fetcher-prom-rules/self.yaml 
b/oap-server/server-starter/src/main/resources/fetcher-prom-rules/self.yaml
index 2a2105b..e2b2be7 100644
--- a/oap-server/server-starter/src/main/resources/fetcher-prom-rules/self.yaml
+++ b/oap-server/server-starter/src/main/resources/fetcher-prom-rules/self.yaml
@@ -45,12 +45,13 @@ metricsRules:
     exp: (process_cpu_seconds_total * 100).sum(['service', 
'instance']).rate('PT1M')
   - name: instance_jvm_memory_bytes_used
     exp: jvm_memory_bytes_used.sum(['service', 'instance'])
-  - name: instance_jvm_young_gc_count
-    exp: jvm_gc_collection_seconds_count.tagMatch('gc', 'PS 
Scavenge|Copy|ParNew|G1 Young Generation').sum(['service', 
'instance']).increase('PT1M')
+  - name: instance_jvm_gc_count
+    exp: "jvm_gc_collection_seconds_count.tagMatch('gc', 'PS 
Scavenge|Copy|ParNew|G1 Young Generation|PS 
MarkSweep|MarkSweepCompact|ConcurrentMarkSweep|G1 Old Generation')
+    .sum(['service', 'instance', 'gc']).increase('PT1M')
+    .tag({tags -> if (tags['gc'] == 'PS Scavenge' || tags['gc'] == 'Copy' || 
tags['gc'] == 'ParNew' || tags['gc'] == 'G1 Young Generation') {tags.gc = 
'young_gc_count'} })
+    .tag({tags -> if (tags['gc'] == 'PS MarkSweep' || tags['gc'] == 
'MarkSweepCompact' || tags['gc'] == 'ConcurrentMarkSweep' || tags['gc'] == 'G1 
Old Generation') {tags.gc = 'old_gc_count'} })"
   - name: instance_jvm_young_gc_time
     exp: jvm_gc_collection_seconds_sum.tagMatch('gc', 'PS 
Scavenge|Copy|ParNew|G1 Young Generation').sum(['service', 'instance']) * 1000
-  - name: instance_jvm_old_gc_count
-    exp: jvm_gc_collection_seconds_count.tagMatch('gc', 'PS 
MarkSweep|MarkSweepCompact|ConcurrentMarkSweep|G1 Old 
Generation').sum(['service', 'instance']).increase('PT1M')
   - name: instance_jvm_old_gc_time
     exp: jvm_gc_collection_seconds_sum.tagMatch('gc', 'PS 
MarkSweep|MarkSweepCompact|ConcurrentMarkSweep|G1 Old 
Generation').sum(['service', 'instance']) * 1000
   - name: instance_trace_count
@@ -65,10 +66,9 @@ metricsRules:
     exp: mesh_analysis_latency.sum(['le', 'service', 
'instance']).increase('PT1M').histogram().histogram_percentile([50,70,90,99])
   - name: instance_mesh_analysis_error_count
     exp: mesh_analysis_error_count.sum(['service', 
'instance']).increase('PT1M')
-  - name: instance_metrics_first_aggregation
-    exp: metrics_aggregation.tagEqual('dimensionality', 'minute', 'level', 
'1').sum(['service', 'instance']).increase('PT5M')
-  - name: instance_metrics_second_aggregation
-    exp: metrics_aggregation.tagEqual('dimensionality', 'minute', 'level', 
'2').sum(['service', 'instance']).increase('PT5M')
+  - name: instance_metrics_aggregation
+    exp: "metrics_aggregation.tagEqual('dimensionality', 
'minute').sum(['service', 'instance', 'level']).increase('PT1M')
+    .tag({tags -> if (tags['level'] == '1') {tags.level = 'L1 aggregation'} 
}).tag({tags -> if (tags['level'] == '2') {tags.level = 'L2 aggregation'} })"
   - name: instance_persistence_execute_percentile
     exp: persistence_timer_bulk_execute_latency.sum(['le', 'service', 
'instance']).increase('PT5M').histogram().histogram_percentile([50,70,90,99])
   - name: instance_persistence_prepare_percentile
diff --git 
a/oap-server/server-starter/src/main/resources/otel-oc-rules/oap.yaml 
b/oap-server/server-starter/src/main/resources/otel-oc-rules/oap.yaml
index ce714e3..42355a9 100644
--- a/oap-server/server-starter/src/main/resources/otel-oc-rules/oap.yaml
+++ b/oap-server/server-starter/src/main/resources/otel-oc-rules/oap.yaml
@@ -36,12 +36,13 @@ metricsRules:
     exp: (process_cpu_seconds_total * 100).sum(['service', 
'host_name']).rate('PT1M')
   - name: instance_jvm_memory_bytes_used
     exp: jvm_memory_bytes_used.sum(['service', 'host_name'])
-  - name: instance_jvm_young_gc_count
-    exp: jvm_gc_collection_seconds_count.tagMatch('gc', 'PS 
Scavenge|Copy|ParNew|G1 Young Generation').sum(['service', 
'host_name']).increase('PT1M')
+  - name: instance_jvm_gc_count
+    exp: "jvm_gc_collection_seconds_count.tagMatch('gc', 'PS 
Scavenge|Copy|ParNew|G1 Young Generation|PS 
MarkSweep|MarkSweepCompact|ConcurrentMarkSweep|G1 Old Generation')
+    .sum(['service', 'host_name', 'gc']).increase('PT1M')
+    .tag({tags -> if (tags['gc'] == 'PS Scavenge' || tags['gc'] == 'Copy' || 
tags['gc'] == 'ParNew' || tags['gc'] == 'G1 Young Generation') {tags.gc = 
'young_gc_count'} })
+    .tag({tags -> if (tags['gc'] == 'PS MarkSweep' || tags['gc'] == 
'MarkSweepCompact' || tags['gc'] == 'ConcurrentMarkSweep' || tags['gc'] == 'G1 
Old Generation') {tags.gc = 'old_gc_count'} })"
   - name: instance_jvm_young_gc_time
     exp: jvm_gc_collection_seconds_sum.tagMatch('gc', 'PS 
Scavenge|Copy|ParNew|G1 Young Generation').sum(['service', 'host_name']) * 1000
-  - name: instance_jvm_old_gc_count
-    exp: jvm_gc_collection_seconds_count.tagMatch('gc', 'PS 
MarkSweep|MarkSweepCompact|ConcurrentMarkSweep|G1 Old 
Generation').sum(['service', 'host_name']).increase('PT1M')
   - name: instance_jvm_old_gc_time
     exp: jvm_gc_collection_seconds_sum.tagMatch('gc', 'PS 
MarkSweep|MarkSweepCompact|ConcurrentMarkSweep|G1 Old 
Generation').sum(['service', 'host_name']) * 1000
   - name: instance_trace_count
@@ -56,10 +57,9 @@ metricsRules:
     exp: mesh_analysis_latency.sum(['le', 'service', 
'host_name']).increase('PT1M').histogram().histogram_percentile([50,70,90,99])
   - name: instance_mesh_analysis_error_count
     exp: mesh_analysis_error_count.sum(['service', 
'host_name']).increase('PT1M')
-  - name: instance_metrics_first_aggregation
-    exp: metrics_aggregation.tagEqual('dimensionality', 'minute', 'level', 
'1').sum(['service', 'host_name']).increase('PT1M')
-  - name: instance_metrics_second_aggregation
-    exp: metrics_aggregation.tagEqual('dimensionality', 'minute', 'level', 
'2').sum(['service', 'host_name']).increase('PT1M')
+  - name: instance_metrics_aggregation
+    exp: "metrics_aggregation.tagEqual('dimensionality', 
'minute').sum(['service', 'host_name', 'level']).increase('PT1M')
+    .tag({tags -> if (tags['level'] == '1') {tags.level = 'L1 aggregation'} 
}).tag({tags -> if (tags['level'] == '2') {tags.level = 'L2 aggregation'} })"
   - name: instance_persistence_execute_percentile
     exp: persistence_timer_bulk_execute_latency.sum(['le', 'service', 
'host_name']).increase('PT5M').histogram().histogram_percentile([50,70,90,99])
   - name: instance_persistence_prepare_percentile
diff --git 
a/oap-server/server-starter/src/main/resources/ui-initialized-templates/self-observability.yml
 
b/oap-server/server-starter/src/main/resources/ui-initialized-templates/self-observability.yml
index 452f166..7ba920d 100644
--- 
a/oap-server/server-starter/src/main/resources/ui-initialized-templates/self-observability.yml
+++ 
b/oap-server/server-starter/src/main/resources/ui-initialized-templates/self-observability.yml
@@ -63,9 +63,9 @@ templates:
                   "height": "200",
                   "entityType": "ServiceInstance",
                   "independentSelector": false,
-                  "metricType": "REGULAR_VALUE",
-                  "metricName": 
"meter_oap_instance_jvm_young_gc_count,meter_oap_instance_jvm_old_gc_count",
-                  "queryMetricType": "readMetricsValues",
+                  "metricType": "LABELED_VALUE",
+                  "metricName": "meter_oap_instance_jvm_gc_count",
+                  "queryMetricType": "readLabeledMetricsValues",
                   "chartType": "ChartBar",
                   "unit": "Per Minute"
                 },
@@ -139,9 +139,9 @@ templates:
                   "height": "200",
                   "entityType": "ServiceInstance",
                   "independentSelector": false,
-                  "metricType": "REGULAR_VALUE",
-                  "metricName": 
"meter_oap_instance_metrics_first_aggregation,meter_oap_instance_metrics_second_aggregation",
-                  "queryMetricType": "readMetricsValues",
+                  "metricType": "LABELED_VALUE",
+                  "metricName": "meter_oap_instance_metrics_aggregation",
+                  "queryMetricType": "readLabeledMetricsValues",
                   "chartType": "ChartBar",
                   "unit": "Per Minute"
                 },
diff --git a/test/e2e-v2/cases/so11y/expected/metrics-has-value-label.yml 
b/test/e2e-v2/cases/so11y/expected/metrics-has-value-label.yml
new file mode 100644
index 0000000..a4becdf
--- /dev/null
+++ b/test/e2e-v2/cases/so11y/expected/metrics-has-value-label.yml
@@ -0,0 +1,23 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+{{- contains . }}
+- key: {{ notEmpty .key }}
+  value:
+  {{- contains .value }}
+  - key: {{ notEmpty .key }}
+    value: {{ ge .value 1 }}
+  {{- end }}
+{{- end }}
diff --git a/test/e2e-v2/cases/so11y/so11y-cases.yaml 
b/test/e2e-v2/cases/so11y/so11y-cases.yaml
index 3899035..d43d6cb 100644
--- a/test/e2e-v2/cases/so11y/so11y-cases.yaml
+++ b/test/e2e-v2/cases/so11y/so11y-cases.yaml
@@ -30,8 +30,8 @@
       expected: expected/metrics-has-value.yml
     - query: swctl --display yaml 
--base-url=http://${oap_host}:${oap_12800}/graphql metrics linear 
--name=meter_oap_instance_trace_count --instance-name=http://localhost:1234 
--service-name=oap::oap-server |yq e 'to_entries' -
       expected: expected/metrics-has-value.yml
-    - query: swctl --display yaml 
--base-url=http://${oap_host}:${oap_12800}/graphql metrics linear 
--name=meter_oap_instance_metrics_first_aggregation 
--instance-name=http://localhost:1234 --service-name=oap::oap-server |yq e 
'to_entries' -
-      expected: expected/metrics-has-value.yml
+    - query: swctl --display yaml 
--base-url=http://${oap_host}:${oap_12800}/graphql metrics multiple-linear 
--name=meter_oap_instance_metrics_aggregation --labels="L1 aggregation" 
--instance-name=http://localhost:1234 --service-name=oap::oap-server |yq e 
'to_entries | with(.[] ; .value=(.value | to_entries))' -
+      expected: expected/metrics-has-value-label.yml
     - query: swctl --display yaml 
--base-url=http://${oap_host}:${oap_12800}/graphql metrics linear 
--name=meter_oap_instance_persistence_prepare_count 
--instance-name=http://localhost:1234 --service-name=oap::oap-server |yq e 
'to_entries' -
       expected: expected/metrics-has-value.yml
     - query: swctl --display yaml 
--base-url=http://${oap_host}:${oap_12800}/graphql metrics linear 
--name=meter_oap_instance_persistence_execute_count 
--instance-name=http://localhost:1234 --service-name=oap::oap-server |yq e 
'to_entries' -

[skywalking] branch master updated: Fix the configuration of `Aggregation` and `GC Count` metrics for oap self observability(#8707) (#8719)

Reply via email to