This is an automated email from the ASF dual-hosted git repository. zhaoqingran pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hertzbeat.git
The following commit(s) were added to refs/heads/master by this push: new b7398b1bf9 [feature] Support HertzBeat self monitor (#3641) b7398b1bf9 is described below commit b7398b1bf91bf2925dd45e18160c4ced4d74e751 Author: Logic <zqr10...@dromara.org> AuthorDate: Sat Aug 9 17:14:17 2025 +0800 [feature] Support HertzBeat self monitor (#3641) Co-authored-by: Calvin <zhengqi...@apache.org> Co-authored-by: tomsun28 <tomsu...@outlook.com> --- .../hertzbeat-collector-collector/pom.xml | 13 +- .../collector/dispatch/CommonDispatcher.java | 55 +- .../metrics/HertzBeatMetricsCollector.java | 89 ++ .../src/main/resources/application.yml | 18 +- hertzbeat-manager/pom.xml | 4 + .../src/main/resources/application.yml | 12 + .../resources/grafana/hertzbeat-monitor-zh-CN.json | 1083 ++++++++++++++++++++ hertzbeat-otel/pom.xml | 5 +- .../hertzbeat/otel/config/OpenTelemetryConfig.java | 35 +- script/application.yml | 12 + .../hertzbeat-mysql-iotdb/conf/application.yml | 12 + .../hertzbeat-mysql-tdengine/conf/application.yml | 12 + .../conf/application.yml | 12 + .../conf/application.yml | 12 + 14 files changed, 1331 insertions(+), 43 deletions(-) diff --git a/hertzbeat-collector/hertzbeat-collector-collector/pom.xml b/hertzbeat-collector/hertzbeat-collector-collector/pom.xml index 172d3af836..0e7cfd1cbe 100644 --- a/hertzbeat-collector/hertzbeat-collector-collector/pom.xml +++ b/hertzbeat-collector/hertzbeat-collector-collector/pom.xml @@ -70,14 +70,11 @@ <version>${hertzbeat.version}</version> </dependency> - - <!-- spring --> <dependency> <groupId>org.springframework.boot</groupId> <artifactId>spring-boot-starter-web</artifactId> </dependency> - <dependency> <groupId>org.springframework.boot</groupId> <artifactId>spring-boot-configuration-processor</artifactId> @@ -87,7 +84,15 @@ <groupId>org.springframework.boot</groupId> <artifactId>spring-boot-autoconfigure</artifactId> </dependency> - + <!-- metrics --> + <dependency> + <groupId>org.springframework.boot</groupId> + 
<artifactId>spring-boot-starter-actuator</artifactId> + </dependency> + <dependency> + <groupId>io.micrometer</groupId> + <artifactId>micrometer-registry-prometheus</artifactId> + </dependency> </dependencies> <build> diff --git a/hertzbeat-collector/hertzbeat-collector-collector/src/main/java/org/apache/hertzbeat/collector/dispatch/CommonDispatcher.java b/hertzbeat-collector/hertzbeat-collector-collector/src/main/java/org/apache/hertzbeat/collector/dispatch/CommonDispatcher.java index d58d53cdac..6e68756248 100644 --- a/hertzbeat-collector/hertzbeat-collector-collector/src/main/java/org/apache/hertzbeat/collector/dispatch/CommonDispatcher.java +++ b/hertzbeat-collector/hertzbeat-collector-collector/src/main/java/org/apache/hertzbeat/collector/dispatch/CommonDispatcher.java @@ -23,6 +23,7 @@ import lombok.AllArgsConstructor; import lombok.Data; import lombok.extern.slf4j.Slf4j; import org.apache.hertzbeat.collector.dispatch.entrance.internal.CollectJobService; +import org.apache.hertzbeat.collector.metrics.HertzBeatMetricsCollector; import org.apache.hertzbeat.common.timer.Timeout; import org.apache.hertzbeat.collector.timer.TimerDispatch; import org.apache.hertzbeat.collector.timer.WheelTimerTask; @@ -33,6 +34,7 @@ import org.apache.hertzbeat.common.entity.job.Job; import org.apache.hertzbeat.common.entity.job.Metrics; import org.apache.hertzbeat.common.entity.message.CollectRep; import org.apache.hertzbeat.common.queue.CommonDataQueue; +import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Component; import java.util.HashMap; @@ -90,6 +92,9 @@ public class CommonDispatcher implements MetricsTaskDispatch, CollectDataDispatc private final String collectorIdentity; + @Autowired + private HertzBeatMetricsCollector metricsCollector; + public CommonDispatcher(MetricsCollectorQueue jobRequestQueue, TimerDispatch timerDispatch, CommonDataQueue commonDataQueue, @@ -153,12 +158,23 @@ public class CommonDispatcher implements 
MetricsTaskDispatch, CollectDataDispatc for (Map.Entry<String, MetricsTime> entry : metricsTimeoutMonitorMap.entrySet()) { MetricsTime metricsTime = entry.getValue(); if (metricsTime.getStartTime() < deadline) { - // Metrics collection timeout + // Metrics collection timeout + MetricsTime removedMetricsTime = metricsTimeoutMonitorMap.remove(entry.getKey()); + if (removedMetricsTime == null) { + continue; + } WheelTimerTask timerJob = (WheelTimerTask) metricsTime.getTimeout().task(); + Job job = timerJob.getJob(); + // timeout metrics + if (metricsCollector != null) { + long duration = System.currentTimeMillis() - removedMetricsTime.getStartTime(); + metricsCollector.recordCollectMetrics(job, duration, "timeout"); + } + CollectRep.MetricsData metricsData = CollectRep.MetricsData.newBuilder() - .setId(timerJob.getJob().getMonitorId()) - .setTenantId(timerJob.getJob().getTenantId()) - .setApp(timerJob.getJob().getApp()) + .setId(job.getMonitorId()) + .setTenantId(job.getTenantId()) + .setApp(job.getApp()) .setMetrics(metricsTime.getMetrics().getName()) .setPriority(metricsTime.getMetrics().getPriority()) .setTime(System.currentTimeMillis()) @@ -167,7 +183,6 @@ public class CommonDispatcher implements MetricsTaskDispatch, CollectDataDispatc if (metricsData.getPriority() == 0) { dispatchCollectData(metricsTime.timeout, metricsTime.getMetrics(), metricsData); } - metricsTimeoutMonitorMap.remove(entry.getKey()); } } } catch (Exception e) { @@ -177,7 +192,7 @@ public class CommonDispatcher implements MetricsTaskDispatch, CollectDataDispatc @Override public void dispatchMetricsTask(Timeout timeout) { - // Divide the collection task of a single application into corresponding collection tasks of the metrics according to the metrics under it. + // Divide the collection task of a single application into corresponding collection tasks of the metrics under it. 
// Put each collect task into the thread pool for scheduling WheelTimerTask timerTask = (WheelTimerTask) timeout.task(); Job job = timerTask.getJob(); @@ -201,16 +216,27 @@ public class CommonDispatcher implements MetricsTaskDispatch, CollectDataDispatc public void dispatchCollectData(Timeout timeout, Metrics metrics, CollectRep.MetricsData metricsData) { WheelTimerTask timerJob = (WheelTimerTask) timeout.task(); Job job = timerJob.getJob(); + String monitorKey; + if (metrics.isHasSubTask()) { + monitorKey = job.getId() + "-" + metrics.getName() + "-sub-" + metrics.getSubTaskId(); + } else { + monitorKey = job.getId() + "-" + metrics.getName(); + } + MetricsTime metricsTime = metricsTimeoutMonitorMap.remove(monitorKey); + + // job completed metrics + if (metricsTime != null && metricsCollector != null) { + long duration = System.currentTimeMillis() - metricsTime.getStartTime(); + String status = metricsData.getCode() == CollectRep.Code.SUCCESS ? "success" : "fail"; + metricsCollector.recordCollectMetrics(job, duration, status); + } if (metrics.isHasSubTask()) { - metricsTimeoutMonitorMap.remove(job.getId() + "-" + metrics.getName() + "-sub-" + metrics.getSubTaskId()); boolean isLastTask = metrics.consumeSubTaskResponse(metricsData); if (isLastTask) { metricsData = metrics.getSubTaskDataRef().get().build(); } else { return; } - } else { - metricsTimeoutMonitorMap.remove(job.getId() + "-" + metrics.getName()); } Set<Metrics> metricsSet = job.getNextCollectMetrics(metrics, false); if (job.isCyclic()) { @@ -322,7 +348,13 @@ public class CommonDispatcher implements MetricsTaskDispatch, CollectDataDispatc public void dispatchCollectData(Timeout timeout, Metrics metrics, List<CollectRep.MetricsData> metricsDataList) { WheelTimerTask timerJob = (WheelTimerTask) timeout.task(); Job job = timerJob.getJob(); - metricsTimeoutMonitorMap.remove(String.valueOf(job.getId())); + MetricsTime metricsTime = metricsTimeoutMonitorMap.remove(String.valueOf(job.getId())); + if 
(metricsTime != null && metricsCollector != null) { + long duration = System.currentTimeMillis() - metricsTime.getStartTime(); + // For a list, we consider it a success if at least one item is successful. + boolean isSuccess = metricsDataList.stream().anyMatch(item -> item.getCode() == CollectRep.Code.SUCCESS); + metricsCollector.recordCollectMetrics(job, duration, isSuccess ? "success" : "fail"); + } if (job.isCyclic()) { // The collection and execution of all task of this job are completed. // The periodic task pushes the task to the time wheel again. @@ -340,7 +372,6 @@ public class CommonDispatcher implements MetricsTaskDispatch, CollectDataDispatc // and the result listener is notified of the combination of all metrics data timerDispatch.responseSyncJobData(job.getId(), metricsDataList); } - } @@ -354,4 +385,4 @@ public class CommonDispatcher implements MetricsTaskDispatch, CollectDataDispatc private Metrics metrics; private Timeout timeout; } -} +} \ No newline at end of file diff --git a/hertzbeat-collector/hertzbeat-collector-collector/src/main/java/org/apache/hertzbeat/collector/metrics/HertzBeatMetricsCollector.java b/hertzbeat-collector/hertzbeat-collector-collector/src/main/java/org/apache/hertzbeat/collector/metrics/HertzBeatMetricsCollector.java new file mode 100644 index 0000000000..66e4b99955 --- /dev/null +++ b/hertzbeat-collector/hertzbeat-collector-collector/src/main/java/org/apache/hertzbeat/collector/metrics/HertzBeatMetricsCollector.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.hertzbeat.collector.metrics; + +import io.micrometer.core.instrument.Counter; +import io.micrometer.core.instrument.MeterRegistry; +import io.micrometer.core.instrument.Timer; +import lombok.extern.slf4j.Slf4j; +import org.apache.hertzbeat.common.entity.job.Job; +import org.springframework.stereotype.Service; + +import java.time.Duration; +import java.util.Map; + +/** + * Service for managing and recording Micrometer metrics. + * This service uses Micrometer which integrates natively with Spring Boot Actuator. + */ +@Service +@Slf4j +public class HertzBeatMetricsCollector { + + private final MeterRegistry meterRegistry; + + public HertzBeatMetricsCollector(MeterRegistry meterRegistry) { + this.meterRegistry = meterRegistry; + log.info("MetricsService initialized with MeterRegistry: {}", meterRegistry.getClass().getSimpleName()); + } + + /** + * Records the metrics for a completed collection sub-task. + * + * @param job The parent job containing monitor info. + * @param durationMillis The duration of the collection task in milliseconds. + * @param status The final status of the collection ("success", "fail", "timeout"). + */ + public void recordCollectMetrics(Job job, long durationMillis, String status) { + if (job == null) { + return; + } + + Map<String, String> metadata = job.getMetadata(); + String monitorName = metadata != null ? metadata.get("instancename") : "unknown"; + String monitorTarget = metadata != null ? 
metadata.get("instancehost") : "unknown"; + + // Record collection count + Counter.builder("hertzbeat.collect.total") + .description("The total number of collection tasks executed") + .tag("status", status) + .tag("monitor_type", job.getApp()) + .tag("monitor_id", String.valueOf(job.getMonitorId())) + .tag("monitor_name", monitorName) + .tag("monitor_target", monitorTarget) + .register(meterRegistry) + .increment(); + + // Record collection duration + Timer.builder("hertzbeat.collect.duration") + .description("The duration of collection task executions") + .tag("status", status) + .tag("monitor_type", job.getApp()) + .tag("monitor_id", String.valueOf(job.getMonitorId())) + .tag("monitor_name", monitorName) + .tag("monitor_target", monitorTarget) + .register(meterRegistry) + .record(Duration.ofMillis(durationMillis)); + + if (log.isDebugEnabled()) { + log.debug("Recorded metrics for monitor [{}] ({}): status={}, duration={}ms", + monitorName, job.getMonitorId(), status, durationMillis); + } + } +} \ No newline at end of file diff --git a/hertzbeat-collector/hertzbeat-collector-collector/src/main/resources/application.yml b/hertzbeat-collector/hertzbeat-collector-collector/src/main/resources/application.yml index 94e69a1e44..c797edccf7 100644 --- a/hertzbeat-collector/hertzbeat-collector-collector/src/main/resources/application.yml +++ b/hertzbeat-collector/hertzbeat-collector-collector/src/main/resources/application.yml @@ -30,7 +30,23 @@ spring: # need to disable spring boot mongodb auto config, or default mongodb connection tried and failed... 
autoconfigure: exclude: org.springframework.boot.autoconfigure.mongo.MongoAutoConfiguration, org.springframework.boot.autoconfigure.data.mongo.MongoDataAutoConfiguration, org.springframework.boot.autoconfigure.jdbc.DataSourceAutoConfiguration, org.springframework.boot.autoconfigure.orm.jpa.HibernateJpaAutoConfiguration - +management: + endpoints: + web: + exposure: + include: + - 'prometheus' + endpoint: + prometheus: + access: read_only + metrics: + tags: + application: ${spring.application.name} + environment: ${spring.profiles.active} + prometheus: + metrics: + export: + enabled: true --- spring: config: diff --git a/hertzbeat-manager/pom.xml b/hertzbeat-manager/pom.xml index 7351b3fc15..b9065fdc56 100644 --- a/hertzbeat-manager/pom.xml +++ b/hertzbeat-manager/pom.xml @@ -210,6 +210,10 @@ <groupId>org.apache.arrow</groupId> <artifactId>arrow-memory-netty</artifactId> </dependency> + <dependency> + <groupId>io.micrometer</groupId> + <artifactId>micrometer-registry-prometheus</artifactId> + </dependency> </dependencies> <build> diff --git a/hertzbeat-manager/src/main/resources/application.yml b/hertzbeat-manager/src/main/resources/application.yml index 69027e88a6..9965183bd5 100644 --- a/hertzbeat-manager/src/main/resources/application.yml +++ b/hertzbeat-manager/src/main/resources/application.yml @@ -48,7 +48,19 @@ management: include: - 'metrics' - 'health' + - 'prometheus' enabled-by-default: on + endpoint: + prometheus: + access: read_only + metrics: + tags: + application: ${spring.application.name} + environment: ${spring.profiles.active} + prometheus: + metrics: + export: + enabled: true sureness: container: jakarta_servlet diff --git a/hertzbeat-manager/src/main/resources/grafana/hertzbeat-monitor-zh-CN.json b/hertzbeat-manager/src/main/resources/grafana/hertzbeat-monitor-zh-CN.json new file mode 100644 index 0000000000..27e73c233e --- /dev/null +++ b/hertzbeat-manager/src/main/resources/grafana/hertzbeat-monitor-zh-CN.json @@ -0,0 +1,1083 @@ +{ + 
"__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "panel", + "id": "bargauge", + "name": "Bar gauge", + "version": "" + }, + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "12.0.0" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + }, + { + "type": "panel", + "id": "table", + "name": "Table", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)" + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 1 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 5 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "失败任务数" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F2495C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "成功任务数" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#73BF69", + "mode": "fixed" 
+ } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "总监控任务数" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#5794F2", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "count(group by (monitor_id)(hertzbeat_collect_total))", + "interval": "", + "legendFormat": "总监控任务数", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "count(group by (monitor_id)(hertzbeat_collect_total{status=\"success\"}))", + "interval": "", + "legendFormat": "成功任务数", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "count(group by (monitor_id)(hertzbeat_collect_total{status=\"fail\"}))", + "interval": "", + "legendFormat": "失败任务数", + "refId": "C" + } + ], + "title": "监控任务状态概览", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#73BF69" + }, + { + "color": "#FADE2A", + "value": 10 + }, + { + "color": "#FF9830", + "value": 25 + }, + { + "color": "#F2495C", + "value": 50 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "displayMode": "lcd", + "legend": { + "calcs": [], + "displayMode": 
"list", + "placement": "bottom", + "showLegend": false + }, + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "text": {}, + "valueMode": "color" + }, + "pluginVersion": "12.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "(\n count(group by (monitor_id)(hertzbeat_collect_total{status=\"fail\"}))\n / \n count(group by (monitor_id)(hertzbeat_collect_total))\n) * 100", + "interval": "", + "legendFormat": "任务失败率", + "refId": "A" + } + ], + "title": "任务失败率", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#73BF69" + }, + { + "color": "#FADE2A", + "value": 5 + }, + { + "color": "#FF9830", + "value": 15 + }, + { + "color": "#F2495C", + "value": 30 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 12, + "x": 12, + "y": 4 + }, + "id": 10, + "options": { + "displayMode": "lcd", + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "text": {}, + "valueMode": "color" + }, + "pluginVersion": "12.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "clamp_max((\n sum(rate(hertzbeat_collect_total{status=\"fail\"}[5m]))\n / \n 
sum(rate(hertzbeat_collect_total[5m]))\n) * 100, 100)", + "interval": "", + "legendFormat": "采集失败率", + "refId": "A" + } + ], + "title": "实时采集失败率", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "监控名称" + }, + "properties": [ + { + "id": "custom.width", + "value": 200 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "监控类型" + }, + "properties": [ + { + "id": "custom.width", + "value": 150 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "失败次数" + }, + "properties": [ + { + "id": "custom.cellOptions", + "value": { + "mode": "gradient", + "type": "gauge" + } + }, + { + "id": "color", + "value": { + "mode": "fixed", + "seriesBy": "last" + } + }, + { + "id": "max", + "value": 100 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "平均耗时" + }, + "properties": [ + { + "id": "unit", + "value": "ms" + }, + { + "id": "decimals", + "value": 1 + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "gradient", + "type": "gauge" + } + }, + { + "id": "max", + "value": 100 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "最近失败时间" + }, + "properties": [ + { + "id": "unit", + "value": "time:YYYY-MM-DD HH:mm:ss" + } + ] + } + ] + }, + "gridPos": { + "h": 15, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 9, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "最近失败时间" + } + ] + }, + "pluginVersion": "12.0.0", + "targets": [ + { + 
"datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "sum by (monitor_name, monitor_type, monitor_target) (hertzbeat_collect_total{status=\"success\"})", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "sum by (monitor_name, monitor_type, monitor_target) (hertzbeat_collect_total{status=\"fail\"})", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "sum by (monitor_name, monitor_type, monitor_target) ((hertzbeat_collect_duration_seconds_sum / hertzbeat_collect_duration_seconds_count) * 1000)", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "max by (monitor_name) (timestamp(hertzbeat_collect_total{status=\"fail\"}))", + "format": "table", + "instant": true, + "refId": "D" + } + ], + "title": "监控任务详情 (按最近失败时间排序)", + "transformations": [ + { + "id": "merge", + "options": {} + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "__name__": true, + "application": true, + "environment": true, + "instance": true, + "job": true, + "monitor_id": true + }, + "indexByName": {}, + "renameByName": { + "Value #A": "成功次数", + "Value #B": "失败次数", + "Value #C": "平均耗时", + "Value #D": "最近失败时间", + "monitor_name": "监控名称", + "monitor_target": "目标地址", + "monitor_type": "监控类型" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + 
"barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": ".*fail.*" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F2495C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": ".*success.*" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#73BF69", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 23 + }, + "id": 4, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "sum(rate(hertzbeat_collect_total[1m]) * 60) by (monitor_name, status)", + "interval": "", + "legendFormat": "{{monitor_name}} - {{status}}", + "refId": "A" + } + ], + "title": "采集任务执行频率 (每分钟)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "成功采集次数", + 
"axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 32 + }, + "id": 11, + "options": { + "legend": { + "calcs": [ + "last", + "max", + "diff" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "hertzbeat_collect_total{status=\"success\"}", + "interval": "", + "legendFormat": "{{monitor_name}} - 成功次数", + "refId": "A" + } + ], + "title": "累积成功次数趋势", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "耗时 (毫秒)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + 
"spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "red", + "value": 100 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 40 + }, + "id": 6, + "options": { + "legend": { + "calcs": [ + "mean", + "last", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "(hertzbeat_collect_duration_seconds_sum / hertzbeat_collect_duration_seconds_count) * 1000", + "interval": "", + "legendFormat": "{{monitor_name}} ({{monitor_type}})", + "refId": "A" + } + ], + "title": "平均采集耗时趋势", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "累积失败次数", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": 
"short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 40 + }, + "id": 7, + "options": { + "legend": { + "calcs": [ + "last", + "max", + "diff" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "hertzbeat_collect_total{status=\"fail\"}", + "interval": "", + "legendFormat": "{{monitor_name}} - 失败次数", + "refId": "A" + } + ], + "title": "累积失败次数趋势", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 41, + "tags": [ + "hertzbeat", + "monitoring", + "prometheus" + ], + "templating": { + "list": [ + { + "current": {}, + "includeAll": false, + "label": "数据源", + "name": "DS_PROMETHEUS", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "HertzBeat Monitoring Dashboard", + "uid": "hertzbeat-monitoring-complete-revised", + "version": 1, + "weekStart": "" +} \ No newline at end of file diff --git a/hertzbeat-otel/pom.xml b/hertzbeat-otel/pom.xml index 43d1004bf2..0a03179e66 100644 --- a/hertzbeat-otel/pom.xml +++ b/hertzbeat-otel/pom.xml @@ -32,6 +32,10 @@ </properties> <dependencies> + <dependency> + <groupId>org.springframework.boot</groupId> + <artifactId>spring-boot-starter-web</artifactId> + </dependency> <!-- hertzbeat common --> <dependency> <groupId>org.apache.hertzbeat</groupId> @@ -51,6 +55,5 @@ <groupId>io.opentelemetry.instrumentation</groupId> <artifactId>opentelemetry-spring-boot-starter</artifactId> </dependency> - </dependencies> </project> diff --git a/hertzbeat-otel/src/main/java/org/apache/hertzbeat/otel/config/OpenTelemetryConfig.java 
b/hertzbeat-otel/src/main/java/org/apache/hertzbeat/otel/config/OpenTelemetryConfig.java index 44774dbd3a..134de32621 100644 --- a/hertzbeat-otel/src/main/java/org/apache/hertzbeat/otel/config/OpenTelemetryConfig.java +++ b/hertzbeat-otel/src/main/java/org/apache/hertzbeat/otel/config/OpenTelemetryConfig.java @@ -6,14 +6,13 @@ * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - * */ package org.apache.hertzbeat.otel.config; @@ -44,6 +43,7 @@ import org.springframework.context.annotation.Configuration; /** * OpenTelemetryConfig provides customizations for the auto-configured OpenTelemetry SDK, * specifically for integrating with GrepTimeDB for logs and traces. + * Metrics are handled by Micrometer and exposed via Spring Boot Actuator. */ @Configuration @Slf4j @@ -58,9 +58,6 @@ public class OpenTelemetryConfig { private static final String GREPTIME_TRACE_TABLE_NAME_HEADER = "X-Greptime-Trace-Table-Name"; private static final String GREPTIME_PIPELINE_NAME_HEADER = "X-Greptime-Pipeline-Name"; - /** - * Adds authentication headers if credentials are provided. - */ private void addAuthenticationHeaders(Map<String, String> headers, GreptimeProperties greptimeProps) { if (greptimeProps != null && StringUtils.isNotBlank(greptimeProps.username()) && StringUtils.isNotBlank(greptimeProps.password())) { @@ -73,9 +70,6 @@ public class OpenTelemetryConfig { } } - /** - * Builds HTTP Log headers for OTLP communication with GreptimeDB. 
- */ private Map<String, String> buildGreptimeOtlpLogHeaders(GreptimeProperties greptimeProps) { Map<String, String> headers = new HashMap<>(); headers.put(GREPTIME_DB_NAME_HEADER, DEFAULT_GREPTIME_DB_NAME); @@ -84,9 +78,6 @@ public class OpenTelemetryConfig { return Collections.unmodifiableMap(headers); } - /** - * Builds HTTP Trace headers for OTLP communication with GreptimeDB. - */ private Map<String, String> buildGreptimeOtlpTraceHeaders(GreptimeProperties greptimeProps) { Map<String, String> headers = new HashMap<>(); headers.put(GREPTIME_DB_NAME_HEADER, DEFAULT_GREPTIME_DB_NAME); @@ -97,33 +88,29 @@ public class OpenTelemetryConfig { return Collections.unmodifiableMap(headers); } - /** - * Provides default OpenTelemetry configuration that always executes. - */ @Bean public AutoConfigurationCustomizerProvider defaultOtelCustomizer() { - log.info("Applying default OpenTelemetry SDK customizations."); + log.info("Applying default OpenTelemetry SDK customizations (logs & traces only)."); return providerCustomizer -> providerCustomizer .addPropertiesCustomizer(sdkConfigProperties -> { Map<String, String> newProperties = new HashMap<>(); + // Disable all built-in exporters - we use Micrometer for metrics newProperties.put("otel.metrics.exporter", "none"); newProperties.put("otel.traces.exporter", "none"); newProperties.put("otel.logs.exporter", "none"); + log.info("OpenTelemetry exporters disabled. Metrics handled by Micrometer."); return newProperties; }) .addResourceCustomizer((resource, configProperties) -> { - log.info("Customizing auto-configured OpenTelemetry Resource with service name: {}.", HERTZBEAT_SERVICE_NAME); + log.info("Customizing OpenTelemetry Resource with service name: {}.", HERTZBEAT_SERVICE_NAME); return resource.merge(Resource.builder().put(SERVICE_NAME, HERTZBEAT_SERVICE_NAME).build()); }); } - /** - * Provides GrepTimeDB-specific OpenTelemetry configuration when enabled. 
- */ @Bean @ConditionalOnProperty(name = "warehouse.store.greptime.enabled", havingValue = "true") public AutoConfigurationCustomizerProvider greptimeOtelCustomizer(GreptimeProperties greptimeProperties) { - log.info("GreptimeDB is enabled. Applying additional OpenTelemetry SDK customizations for GrepTimeDB."); + log.info("GreptimeDB is enabled. Applying OpenTelemetry customizations for GrepTimeDB logs & traces."); return providerCustomizer -> providerCustomizer .addPropertiesCustomizer(sdkConfigProperties -> { Map<String, String> newProperties = new HashMap<>(); @@ -132,18 +119,16 @@ public class OpenTelemetryConfig { }) .addSpanExporterCustomizer((originalSpanExporter, configProperties) -> { String traceEndpoint = greptimeProperties.httpEndpoint() + "/v1/otlp/v1/traces"; - log.info("Programmatically configuring OtlpHttpSpanExporter for GreptimeDB traces. Endpoint: {}", traceEndpoint); - Map<String, String> traceHeaders = buildGreptimeOtlpTraceHeaders(greptimeProperties); - log.info("Trace Headers for GreptimeDB (programmatic HTTP config): {}", traceHeaders); + log.info("Configuring OtlpHttpSpanExporter for GreptimeDB. 
Endpoint: {}", traceEndpoint); OtlpHttpSpanExporterBuilder httpExporterBuilder = OtlpHttpSpanExporter.builder() .setEndpoint(traceEndpoint) - .setHeaders(() -> traceHeaders) + .setHeaders(() -> buildGreptimeOtlpTraceHeaders(greptimeProperties)) .setTimeout(10000, TimeUnit.MILLISECONDS); return httpExporterBuilder.build(); }) .addLoggerProviderCustomizer((sdkLoggerProviderBuilder, configProperties) -> { - log.info("Customizing auto-configured SdkLoggerProviderBuilder for GrepTimeDB logs."); + log.info("Customizing SdkLoggerProviderBuilder for GrepTimeDB logs."); OtlpHttpLogRecordExporter logExporter = OtlpHttpLogRecordExporter.builder() .setEndpoint(greptimeProperties.httpEndpoint() + "/v1/otlp/v1/logs") diff --git a/script/application.yml b/script/application.yml index b52a1b4d22..e776cb9830 100644 --- a/script/application.yml +++ b/script/application.yml @@ -48,7 +48,19 @@ management: include: - 'metrics' - 'health' + - 'prometheus' enabled-by-default: on + endpoint: + prometheus: + access: read_only + metrics: + tags: + application: ${spring.application.name} + environment: ${spring.profiles.active} + prometheus: + metrics: + export: + enabled: true sureness: container: jakarta_servlet diff --git a/script/docker-compose/hertzbeat-mysql-iotdb/conf/application.yml b/script/docker-compose/hertzbeat-mysql-iotdb/conf/application.yml index 9b96416453..b981069aa9 100644 --- a/script/docker-compose/hertzbeat-mysql-iotdb/conf/application.yml +++ b/script/docker-compose/hertzbeat-mysql-iotdb/conf/application.yml @@ -49,7 +49,19 @@ management: include: - 'metrics' - 'health' + - 'prometheus' enabled-by-default: on + endpoint: + prometheus: + access: read_only + metrics: + tags: + application: ${spring.application.name} + environment: ${spring.profiles.active} + prometheus: + metrics: + export: + enabled: true sureness: container: jakarta_servlet diff --git a/script/docker-compose/hertzbeat-mysql-tdengine/conf/application.yml 
b/script/docker-compose/hertzbeat-mysql-tdengine/conf/application.yml index e662444340..1da7920dc4 100644 --- a/script/docker-compose/hertzbeat-mysql-tdengine/conf/application.yml +++ b/script/docker-compose/hertzbeat-mysql-tdengine/conf/application.yml @@ -49,7 +49,19 @@ management: include: - 'metrics' - 'health' + - 'prometheus' enabled-by-default: on + endpoint: + prometheus: + access: read_only + metrics: + tags: + application: ${spring.application.name} + environment: ${spring.profiles.active} + prometheus: + metrics: + export: + enabled: true sureness: container: jakarta_servlet diff --git a/script/docker-compose/hertzbeat-mysql-victoria-metrics/conf/application.yml b/script/docker-compose/hertzbeat-mysql-victoria-metrics/conf/application.yml index d009cd1a70..9a7cba2d7a 100644 --- a/script/docker-compose/hertzbeat-mysql-victoria-metrics/conf/application.yml +++ b/script/docker-compose/hertzbeat-mysql-victoria-metrics/conf/application.yml @@ -49,7 +49,19 @@ management: include: - 'metrics' - 'health' + - 'prometheus' enabled-by-default: on + endpoint: + prometheus: + access: read_only + metrics: + tags: + application: ${spring.application.name} + environment: ${spring.profiles.active} + prometheus: + metrics: + export: + enabled: true sureness: container: jakarta_servlet diff --git a/script/docker-compose/hertzbeat-postgresql-victoria-metrics/conf/application.yml b/script/docker-compose/hertzbeat-postgresql-victoria-metrics/conf/application.yml index 14601a6ad5..54bcaff0eb 100644 --- a/script/docker-compose/hertzbeat-postgresql-victoria-metrics/conf/application.yml +++ b/script/docker-compose/hertzbeat-postgresql-victoria-metrics/conf/application.yml @@ -49,7 +49,19 @@ management: include: - 'metrics' - 'health' + - 'prometheus' enabled-by-default: on + endpoint: + prometheus: + access: read_only + metrics: + tags: + application: ${spring.application.name} + environment: ${spring.profiles.active} + prometheus: + metrics: + export: + enabled: true sureness: 
container: jakarta_servlet --------------------------------------------------------------------- To unsubscribe, e-mail: notifications-unsubscribe@hertzbeat.apache.org For additional commands, e-mail: notifications-help@hertzbeat.apache.org