This is an automated email from the ASF dual-hosted git repository. wusheng pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/skywalking.git
The following commit(s) were added to refs/heads/master by this push: new bea7ed3f4a Fix inaccurate APISIX metrics (#12108) bea7ed3f4a is described below commit bea7ed3f4a3f5bca777bf69c7c8ea8d3d787c35a Author: pg.yang <pg.y...@hotmail.com> AuthorDate: Tue Apr 16 17:39:37 2024 +0800 Fix inaccurate APISIX metrics (#12108) --- docs/en/api/metrics-query-expression.md | 3 --- docs/en/changes/changes.md | 1 + .../src/main/resources/otel-rules/apisix.yaml | 24 ++++++++++---------- .../apisix/apisix-endpoint.json | 6 ++--- .../apisix/apisix-service.json | 26 +++++++++++----------- test/e2e-v2/cases/apisix/apisix-cases.yaml | 4 ++-- 6 files changed, 31 insertions(+), 33 deletions(-) diff --git a/docs/en/api/metrics-query-expression.md b/docs/en/api/metrics-query-expression.md index fa2446e297..dfba7c2e41 100644 --- a/docs/en/api/metrics-query-expression.md +++ b/docs/en/api/metrics-query-expression.md @@ -251,9 +251,6 @@ The order of the new label values should be the same as the order of the label v For example: If we want to query the `service_percentile` metric with the label values `50,75,90,95,99`, and rename the label name to `percentile` and the label values to `P50,P75,P90,P95,P99`, we can use the following expression: -```text -and rename the label values to `P50,P75,P90,P95,P99`, we can use the following expression: - ```text relabel(service_percentile{p='50,75,90,95,99'}, p='50,75,90,95,99', percentile='P50,P75,P90,P95,P99') ``` diff --git a/docs/en/changes/changes.md b/docs/en/changes/changes.md index c530768508..8789812c14 100644 --- a/docs/en/changes/changes.md +++ b/docs/en/changes/changes.md @@ -105,6 +105,7 @@ - `memory_swap_percentage` -> `memory_virtual_memory_percentage` * Fix/Change UI init setting for Windows Swap -> Virtual Memory * Fix `Memory Swap Usage`/`Virtual Memory Usage` display with UI init.(Linux/Windows) +* Fix inaccurate APISIX metrics #### UI diff --git a/oap-server/server-starter/src/main/resources/otel-rules/apisix.yaml b/oap-server/server-starter/src/main/resources/otel-rules/apisix.yaml index aa3e768b90..4334fb09f3 100644 --- a/oap-server/server-starter/src/main/resources/otel-rules/apisix.yaml +++ b/oap-server/server-starter/src/main/resources/otel-rules/apisix.yaml @@ -36,24 +36,24 @@ metricsRules: # Service # Ignore http_connections metrics with accepted and handled state as the actual type is counter - name: sv_http_connections - exp: apisix_nginx_http_current_connections.tagNotMatch('state','accepted|handled').sum(['state','service_name']).service(['service_name'] , Layer.APISIX) + exp: apisix_nginx_http_current_connections.tagNotMatch('state','accepted|handled').sum(['state','service_name','node']).service(['service_name'] , Layer.APISIX) - name: sv_http_requests - exp: apisix_http_requests_total.sum(['service_instance_id','service_name']).rate('PT1M').service(['service_name'] , Layer.APISIX) + exp: apisix_http_requests_total.sum(['service_instance_id','service_name','node']).rate('PT1M').service(['service_name'] , Layer.APISIX) # Not match any route # Refer to https://apisix.apache.org/docs/apisix/plugins/prometheus/ - name: sv_bandwidth_unmatched - exp: apisix_bandwidth.tagEqual('route' , '' , 'node' , '').sum(['type','service_name']).rate('PT1M').service(['service_name'] , Layer.APISIX) + exp: apisix_bandwidth.tagEqual('route' , '' , 'node' , '').sum(['type','service_name','node']).rate('PT1M').service(['service_name'] , Layer.APISIX) - name: sv_http_status_unmatched - exp: apisix_http_status.tagEqual('route' , '' , 'node' , '').sum(['code','service_name']).rate('PT1M').service(['service_name'] , Layer.APISIX) + exp: apisix_http_status.tagEqual('route' , '' , 'node' , '').sum(['code','service_name','node']).rate('PT1M').service(['service_name'] , Layer.APISIX) - name: sv_http_latency_unmatched - exp: apisix_http_latency.tagEqual('route' , '' , 'node' , '').sum(['type','le','service_name']).histogram().histogram_percentile([50,70,90,99]).service(['service_name'] , Layer.APISIX) + exp: apisix_http_latency.tagEqual('route' , '' , 'node' , '').sum(['type','le','service_name','node']).histogram().histogram_percentile([50,70,90,99]).service(['service_name'] , Layer.APISIX) # Match a route - name: sv_bandwidth_matched - exp: apisix_bandwidth.tagNotEqual('route' , '' , 'node' , '').sum(['type','service_name']).rate('PT1M').service(['service_name'] , Layer.APISIX) + exp: apisix_bandwidth.tagNotEqual('route' , '' , 'node' , '').sum(['type','service_name','node']).rate('PT1M').service(['service_name'] , Layer.APISIX) - name: sv_http_status_matched - exp: apisix_http_status.tagNotEqual('route' , '' , 'node' , '').sum(['code','service_name']).rate('PT1M').service(['service_name'] , Layer.APISIX) + exp: apisix_http_status.tagNotEqual('route' , '' , 'node' , '').sum(['code','service_name','node']).rate('PT1M').service(['service_name'] , Layer.APISIX) - name: sv_http_latency_matched - exp: apisix_http_latency.tagNotEqual('route' , '' , 'node' , '').sum(['type','le','service_name']).histogram().histogram_percentile([50,70,90,99]).service(['service_name'] , Layer.APISIX) + exp: apisix_http_latency.tagNotEqual('route' , '' , 'node' , '').sum(['type','le','service_name','node']).histogram().histogram_percentile([50,70,90,99]).service(['service_name'] , Layer.APISIX) # Instance - name: instance_shared_dict_capacity_bytes @@ -88,15 +88,15 @@ metricsRules: # Endpoint # Reorganization metrics which has `route` label as endpoint ,that is formatted to `router/{routerId}` - name: endpoint_http_status - exp: apisix_http_status.tagNotEqual('route','').tag({tags->tags.route = 'route/'+tags['route']}).sum(['code','service_name','route']).rate('PT1M').endpoint(['service_name'],['route'], Layer.APISIX) + exp: apisix_http_status.tagNotEqual('route','').tag({tags->tags.route = 'route/'+tags['route']}).sum(['code','service_name','route','node']).rate('PT1M').endpoint(['service_name'],['route'], Layer.APISIX) - name: endpoint_bandwidth - exp: apisix_bandwidth.tagNotEqual('route','').tag({tags->tags.route = 'route/'+tags['route']}).sum(['type','service_name','route']).rate('PT1M').endpoint(['service_name'],['route'], Layer.APISIX) + exp: apisix_bandwidth.tagNotEqual('route','').tag({tags->tags.route = 'route/'+tags['route']}).sum(['type','service_name','route','node']).rate('PT1M').endpoint(['service_name'],['route'], Layer.APISIX) - name: endpoint_http_latency - exp: apisix_http_latency.tagNotEqual('route','').tag({tags->tags.route = 'route/'+tags['route']}).sum(['type','le','service_name','route']).histogram().histogram_percentile([50,70,90,99]).endpoint(['service_name'],['route'], Layer.APISIX) + exp: apisix_http_latency.tagNotEqual('route','').tag({tags->tags.route = 'route/'+tags['route']}).sum(['type','le','service_name','route','node']).histogram().histogram_percentile([50,70,90,99]).endpoint(['service_name'],['route'], Layer.APISIX) # Reorganization metrics which has `node` label as endpoint , that is formatted to `node/{node}` - name: endpoint_http_status exp: apisix_http_status.tagNotEqual('node','').tag({tags->tags.node = 'upstream/'+tags['node']}).sum(['code','service_name','node']).rate('PT1M').endpoint(['service_name'],['node'], Layer.APISIX) - name: endpoint_bandwidth exp: apisix_bandwidth.tagNotEqual('node','').tag({tags->tags.node = 'upstream/'+tags['node']}).sum(['type','service_name','node']).rate('PT1M').endpoint(['service_name'],['node'], Layer.APISIX) - name: endpoint_http_latency - exp: apisix_http_latency.tagNotEqual('node','').tag({tags->tags.node = 'upstream/'+tags['node']})sum(['type','le','service_name','node']).histogram().histogram_percentile([50,70,90,99]).endpoint(['service_name'],['node'], Layer.APISIX) + exp: apisix_http_latency.tagNotEqual('node','').tag({tags->tags.node = 'upstream/'+tags['node']}).sum(['type','le','service_name','node']).histogram().histogram_percentile([50,70,90,99]).endpoint(['service_name'],['node'], Layer.APISIX) diff --git a/oap-server/server-starter/src/main/resources/ui-initialized-templates/apisix/apisix-endpoint.json b/oap-server/server-starter/src/main/resources/ui-initialized-templates/apisix/apisix-endpoint.json index 4a85bde867..d418fd0994 100644 --- a/oap-server/server-starter/src/main/resources/ui-initialized-templates/apisix/apisix-endpoint.json +++ b/oap-server/server-starter/src/main/resources/ui-initialized-templates/apisix/apisix-endpoint.json @@ -22,7 +22,7 @@ "showYAxis": true }, "expressions": [ - "meter_apisix_endpoint_http_status" + "aggregate_labels(meter_apisix_endpoint_http_status,sum(code))" ], "associate": [ { @@ -55,7 +55,7 @@ "showYAxis": true }, "expressions": [ - "meter_apisix_endpoint_http_latency" + "aggregate_labels(meter_apisix_endpoint_http_latency,avg(type,p))" ], "associate": [ { @@ -88,7 +88,7 @@ "showYAxis": true }, "expressions": [ - "meter_apisix_endpoint_bandwidth/1024" + "aggregate_labels(meter_apisix_endpoint_bandwidth/1024,sum(type))" ], "associate": [ { diff --git a/oap-server/server-starter/src/main/resources/ui-initialized-templates/apisix/apisix-service.json b/oap-server/server-starter/src/main/resources/ui-initialized-templates/apisix/apisix-service.json index 3a7f50420f..0637e819f8 100644 --- a/oap-server/server-starter/src/main/resources/ui-initialized-templates/apisix/apisix-service.json +++ b/oap-server/server-starter/src/main/resources/ui-initialized-templates/apisix/apisix-service.json @@ -22,7 +22,7 @@ "showYAxis": true }, "expressions": [ - "meter_apisix_sv_http_requests" + "aggregate_labels(meter_apisix_sv_http_requests,sum(service_instance_id))" ] }, { @@ -44,7 +44,7 @@ "showYAxis": true }, "expressions": [ - "meter_apisix_sv_http_status_matched" + "aggregate_labels(meter_apisix_sv_http_status_matched,sum(code))" ], "associate": [ { @@ -89,7 +89,7 @@ "showYAxis": true }, "expressions": [ - "meter_apisix_sv_http_latency_matched" + "aggregate_labels(meter_apisix_sv_http_latency_matched,avg(type,p))" ], "associate": [ { @@ -134,7 +134,7 @@ "showYAxis": true }, "expressions": [ - "meter_apisix_sv_bandwidth_matched/1024" + "aggregate_labels(meter_apisix_sv_bandwidth_matched/1024,sum(type))" ], "associate": [ { @@ -168,7 +168,7 @@ "i": "5", "type": "Widget", "expressions": [ - "meter_apisix_sv_http_connections" + "aggregate_labels(meter_apisix_sv_http_connections,sum(state))" ], "graph": { "type": "Line", @@ -224,7 +224,7 @@ "showYAxis": true }, "expressions": [ - "meter_apisix_sv_http_status_unmatched" + "aggregate_labels(meter_apisix_sv_http_status_unmatched,sum(code))" ], "associate": [ { @@ -269,7 +269,7 @@ "showYAxis": true }, "expressions": [ - "meter_apisix_sv_http_latency_unmatched" + "aggregate_labels(meter_apisix_sv_http_latency_unmatched,avg(type,p))" ], "associate": [ { @@ -314,7 +314,7 @@ "showYAxis": true }, "expressions": [ - "meter_apisix_sv_bandwidth_unmatched/1024" + "aggregate_labels(meter_apisix_sv_bandwidth_unmatched/1024,sum(type))" ], "associate": [ { @@ -379,11 +379,11 @@ "isRoot": false, "isDefault": true, "expressions": [ - "avg(meter_apisix_sv_http_status_matched{code='200'})", - "avg(meter_apisix_sv_http_status_matched{code='304'})", - "avg(meter_apisix_sv_http_status_matched{code='404'})", - "avg(meter_apisix_sv_http_status_matched{code='499'})", - "avg(meter_apisix_sv_http_status_matched{code='503'})" + "avg(aggregate_labels(meter_apisix_sv_http_status_matched{code='200'},sum(code)))", + "avg(aggregate_labels(meter_apisix_sv_http_status_matched{code='304'},sum(code)))", + "avg(aggregate_labels(meter_apisix_sv_http_status_matched{code='404'},sum(code)))", + "avg(aggregate_labels(meter_apisix_sv_http_status_matched{code='499'},sum(code)))", + "avg(aggregate_labels(meter_apisix_sv_http_status_matched{code='503'},sum(code)))" ], "expressionsConfig": [ { diff --git a/test/e2e-v2/cases/apisix/apisix-cases.yaml b/test/e2e-v2/cases/apisix/apisix-cases.yaml index 8f95770e13..df3b3d44c9 100644 --- a/test/e2e-v2/cases/apisix/apisix-cases.yaml +++ b/test/e2e-v2/cases/apisix/apisix-cases.yaml @@ -22,7 +22,7 @@ - query: swctl --display yaml --base-url=http://${oap_host}:${oap_12800}/graphql instance ls --service-name=APISIX::showcase-apisix-service expected: expected/instance.yml # service metrics - - query: swctl --display yaml --base-url=http://${oap_host}:${oap_12800}/graphql metrics exec --expression=meter_apisix_sv_http_connections --service-name=APISIX::showcase-apisix-service + - query: swctl --display yaml --base-url=http://${oap_host}:${oap_12800}/graphql metrics exec --expression='aggregate_labels(meter_apisix_sv_http_connections,sum(state))' --service-name=APISIX::showcase-apisix-service expected: expected/metrics-has-connection-value-label.yml # instance metrics - query: | @@ -31,5 +31,5 @@ ) expected: expected/metrics-has-status-value-label.yml # endpoint metrics - - query: swctl --display yaml --base-url=http://${oap_host}:${oap_12800}/graphql metrics exec --expression=meter_apisix_endpoint_http_latency --endpoint-name='route/routes#1' --service-name=APISIX::showcase-apisix-service + - query: swctl --display yaml --base-url=http://${oap_host}:${oap_12800}/graphql metrics exec --expression='aggregate_labels(meter_apisix_endpoint_http_latency,avg(type,p))' --endpoint-name='route/routes#1' --service-name=APISIX::showcase-apisix-service expected: expected/metrics-has-latency-value-label.yml