This is an automated email from the ASF dual-hosted git repository.
tanxinyu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iotdb.git
The following commit(s) were added to refs/heads/master by this push:
new 344962356b [IOTDB-5480] IoTConsensus sync lag may be negative under
single copy (#9031)
344962356b is described below
commit 344962356ba7a72d4291ad7df80f363dd5700b00
Author: Xiangpeng Hu <[email protected]>
AuthorDate: Mon Feb 13 14:47:47 2023 +0800
[IOTDB-5480] IoTConsensus sync lag may be negative under single copy (#9031)
---
.../consensus/iot/IoTConsensusServerImpl.java | 5 +++++
.../consensus/iot/IoTConsensusServerMetrics.java | 23 ++++++++++++++++++++++
docs/UserGuide/Monitor-Alert/Metric-Tool.md | 1 +
docs/zh/UserGuide/Monitor-Alert/Metric-Tool.md | 1 +
4 files changed, 30 insertions(+)
diff --git
a/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensusServerImpl.java
b/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensusServerImpl.java
index e903289177..75a2fc2f98 100644
---
a/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensusServerImpl.java
+++
b/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensusServerImpl.java
@@ -722,6 +722,11 @@ public class IoTConsensusServerImpl {
return searchIndex.get();
}
+ public long getSyncLag() {
+ long safeIndex = getCurrentSafelyDeletedSearchIndex();
+ return getSearchIndex() - safeIndex;
+ }
+
public IoTConsensusConfig getConfig() {
return config;
}
diff --git
a/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensusServerMetrics.java
b/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensusServerMetrics.java
index 82814d071f..ff241bc7e1 100644
---
a/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensusServerMetrics.java
+++
b/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensusServerMetrics.java
@@ -60,6 +60,19 @@ public class IoTConsensusServerMetrics implements IMetricSet
{
impl.getThisNode().getGroupId().toString(),
Tag.TYPE.toString(),
"safeIndex");
+ // TODO: Consider adding topological order to the traversal of metricEntry.
+ MetricService.getInstance()
+ .createAutoGauge(
+ Metric.IOT_CONSENSUS.toString(),
+ MetricLevel.IMPORTANT,
+ impl,
+ IoTConsensusServerImpl::getSyncLag,
+ Tag.NAME.toString(),
+ "ioTConsensusServerImpl",
+ Tag.REGION.toString(),
+ impl.getThisNode().getGroupId().toString(),
+ Tag.TYPE.toString(),
+ "syncLag");
MetricService.getInstance()
.createAutoGauge(
Metric.IOT_CONSENSUS.toString(),
@@ -108,6 +121,16 @@ public class IoTConsensusServerMetrics implements
IMetricSet {
impl.getThisNode().getGroupId().toString(),
Tag.TYPE.toString(),
"safeIndex");
+ MetricService.getInstance()
+ .remove(
+ MetricType.AUTO_GAUGE,
+ Metric.IOT_CONSENSUS.toString(),
+ Tag.NAME.toString(),
+ "ioTConsensusServerImpl",
+ Tag.REGION.toString(),
+ impl.getThisNode().getGroupId().toString(),
+ Tag.TYPE.toString(),
+ "syncLag");
MetricService.getInstance()
.remove(
MetricType.AUTO_GAUGE,
diff --git a/docs/UserGuide/Monitor-Alert/Metric-Tool.md
b/docs/UserGuide/Monitor-Alert/Metric-Tool.md
index dfc68deb52..89e4cf9aaa 100644
--- a/docs/UserGuide/Monitor-Alert/Metric-Tool.md
+++ b/docs/UserGuide/Monitor-Alert/Metric-Tool.md
@@ -171,6 +171,7 @@ carefully evaluated. The current Core-level metrics are as
follows:
| mutli_leader | name="logDispatcher-{{IP}}:{{Port}}", region="{{region}}",
type="cachedRequestInMemoryQueue" | AutoGauge | The size of cache requests of
synchronization thread in replica group |
| mutli_leader | name="IoTConsensusServerImpl", region="{{region}}",
type="searchIndex" | AutoGauge | The write process of
main process in replica group |
| mutli_leader | name="IoTConsensusServerImpl", region="{{region}}",
type="safeIndex" | AutoGauge | The sync index of
replica group |
+| mutli_leader | name="IoTConsensusServerImpl", region="{{region}}",
type="syncLag" | AutoGauge | The sync lag of replica group: the gap between the
write progress (searchIndex) and the sync progress (safeIndex) |
| mutli_leader | name="IoTConsensusServerImpl", region="{{region}}",
type="LogEntriesFromWAL" | AutoGauge | The number of logEntries
from wal in Batch |
| mutli_leader | name="IoTConsensusServerImpl", region="{{region}}",
type="LogEntriesFromQueue" | AutoGauge | The number of logEntries
from queue in Batch |
| stage | name="iot_consensus", region="{{region}}",
type="getStateMachineLock" | Histogram | The time
consumed to get statemachine lock in main process |
diff --git a/docs/zh/UserGuide/Monitor-Alert/Metric-Tool.md
b/docs/zh/UserGuide/Monitor-Alert/Metric-Tool.md
index 0c131a9af7..f7147fe096 100644
--- a/docs/zh/UserGuide/Monitor-Alert/Metric-Tool.md
+++ b/docs/zh/UserGuide/Monitor-Alert/Metric-Tool.md
@@ -151,6 +151,7 @@ Core 级别的监控指标在系统运行中默认开启,每一个 Core 级别
| iot_consensus | name="logDispatcher-{{IP}}:{{Port}}", region="{{region}}",
type="cachedRequestInMemoryQueue" | AutoGauge | 副本组同步线程缓存队列请求总大小 |
| iot_consensus | name="IoTConsensusServerImpl", region="{{region}}",
type="searchIndex" | AutoGauge | 副本组主流程写入进度 |
| iot_consensus | name="IoTConsensusServerImpl", region="{{region}}",
type="safeIndex" | AutoGauge | 副本组同步进度 |
+| iot_consensus | name="IoTConsensusServerImpl", region="{{region}}",
type="syncLag" | AutoGauge | 副本组写入进度与同步进度差 |
| iot_consensus | name="IoTConsensusServerImpl", region="{{region}}",
type="LogEntriesFromWAL" | AutoGauge | 副本组Batch中来自WAL的日志项数量 |
| iot_consensus | name="IoTConsensusServerImpl", region="{{region}}",
type="LogEntriesFromQueue" | AutoGauge | 副本组Batch中来自队列的日志项数量 |
| stage | name="iot_consensus", region="{{region}}",
type="getStateMachineLock" | Histogram | 主流程获取状态机锁耗时
|