This is an automated email from the ASF dual-hosted git repository. szetszwo pushed a commit to branch branch-2_tmp in repository https://gitbox.apache.org/repos/asf/ratis.git
commit 272eeae2a253fc86d72fccc536aff0fcbbfc82b9 Author: William Song <[email protected]> AuthorDate: Fri Jan 13 19:43:35 2023 +0800 RATIS-1766. Add descriptions to metrics entries (#804) (cherry picked from commit c2181e5fab51254452db094693d5bddbda78d5ba) --- ratis-docs/src/site/markdown/metrics.md | 145 +++++++++++++++++++++ .../server/metrics/SegmentedRaftLogMetrics.java | 4 +- 2 files changed, 147 insertions(+), 2 deletions(-) diff --git a/ratis-docs/src/site/markdown/metrics.md b/ratis-docs/src/site/markdown/metrics.md new file mode 100644 index 000000000..10c78ccbb --- /dev/null +++ b/ratis-docs/src/site/markdown/metrics.md @@ -0,0 +1,145 @@ +<!--- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + + +# Metrics + +## Ratis Server + +### StateMachine Metrics + +| Application | Component | Name | Type | Description | +|-------------|---------------|---------------------|-------|--------------------------------------------------------------| +| ratis | state_machine | appliedIndex | Gauge | Applied index of state machine | +| ratis | state_machine | applyCompletedIndex | Gauge | Last log index which completely applied to the state machine | +| ratis | state_machine | takeSnapshot | Timer | Time taken for state machine to take a snapshot | + + +### Leader Election Metrics + +| Application | Component | Name | Type | Description | +|-------------|-----------------|-------------------------------|---------|-------------------------------------------------------| +| ratis | leader_election | electionCount | Counter | Number of leader elections of this group | +| ratis | leader_election | timeoutCount | Counter | Number of election timeouts of this peer | +| ratis | leader_election | electionTime | Timer | Time spent on leader election | +| ratis | leader_election | lastLeaderElapsedTime | Gauge | Time elapsed since last hearing from an active leader | +| ratis | leader_election | transferLeadershipCount | Counter | Number of transferLeader requests | +| ratis | leader_election | lastLeaderElectionElapsedTime | Gauge | Time elapsed since last leader election | + +### Log Appender Metrics + +| Application | Component | Name | Type | Description | +|-------------|--------------|-----------------------------------|-------|---------------------------------------------| +| ratis | log_appender | follower_{peer}_next_index | Gauge | Next index of peer | +| ratis | log_appender | follower_{peer}_match_index | Gauge | Match index of peer | +| ratis | log_appender | follower_{peer}_rpc_response_time | Gauge | Time elapsed since peer's last rpc response | + +### Raft Log Metrics + +| Application | Component | Name | Type | Description | +|-------------|------------|---------------------------------|---------|---------------------------------------------------------------------------------------------------------------| +| ratis | log_worker | metadataLogEntryCount | Counter | Number of metadata(term-index) log entries | +| ratis | log_worker | configLogEntryCount | Counter | Number of configuration log entries | +| ratis | log_worker | stateMachineLogEntryCount | Counter | Number of statemachine log entries | +| ratis | log_worker | flushTime | Timer | Time taken to flush log | +| ratis | log_worker | flushCount | Counter | Number of times of log-flush invoked | +| ratis | log_worker | syncTime | Timer | Time taken to log sync (fsync) | +| ratis | log_worker | dataQueueSize | Gauge | Raft log data queue size which at any time gives the number of log related operations in the queue | +| ratis | log_worker | workerQueueSize | Gauge | Raft log worker queue size which at any time gives number of committed entries that are to be synced | +| ratis | log_worker | syncBatchSize | Gauge | Number of raft log entries synced in each flush call | +| ratis | log_worker | cacheMissCount | Counter | Count of RaftLogCache Misses | +| ratis | log_worker | cacheHitCount | Counter | Count of RaftLogCache Hits | +| ratis | log_worker | closedSegmentsNum | Gauge | Number of closed raft log segments | +| ratis | log_worker | closedSegmentsSizeInBytes | Gauge | Size of closed raft log segments in bytes | +| ratis | log_worker | openSegmentSizeInBytes | Gauge | Size of open raft log segment in bytes | +| ratis | log_worker | appendEntryLatency | Timer | Total time taken to append a raft log entry | +| ratis | log_worker | enqueuedTime | Timer | Time spent by a Raft log operation in the queue | +| ratis | log_worker | queueingDelay | Timer | Time taken for a Raft log operation to get into the queue after being requested, waiting queue to be non-full | +| ratis | log_worker | {operation}ExecutionTime | Timer | Time taken for a Raft log operation(open/close/flush/write/purge) to complete execution | +| ratis | log_worker | appendEntryCount | Counter | Number of entries appended to the raft log | +| ratis | log_worker | purgeLog | Timer | Time taken for Raft log purge operation to complete execution | +| ratis | log_worker | numStateMachineDataWriteTimeout | Counter | Number of statemachine dataApi write timeouts | +| ratis | log_worker | numStateMachineDataReadTimeout | Counter | Number of statemachine dataApi read timeouts | +| ratis | log_worker | readEntryLatency | Timer | Time required to read a raft log entry from actual raft log file and create a raft log entry | +| ratis | log_worker | segmentLoadLatency | Timer | Time required to load and process raft log segments during restart | + + +### Raft Server Metrics + +| Application | Component | Name | Type | Description | +|-------------|-----------|----------------------------------|---------|---------------------------------------------------------------------| +| ratis | server | {peer}_lastHeartbeatElapsedTime | Gauge | Time elapsed since last heartbeat rpc response | +| ratis | server | follower_append_entry_latency | Timer | Time taken for followers to append log entries | +| ratis | server | {peer}_peerCommitIndex | Gauge | Commit index of peer | +| ratis | server | clientReadRequest | Timer | Time taken to process read requests from client | +| ratis | server | clientStaleReadRequest | Timer | Time taken to process stale-read requests from client | +| ratis | server | clientWriteRequest | Timer | Time taken to process write requests from client | +| ratis | server | clientWatch{level}Request | Timer | Time taken to process watch(replication_level) requests from client | +| ratis | server | numRequestQueueLimitHits | Counter | Number of (total client requests in queue) limit hits | +| ratis | server | numRequestsByteSizeLimitHits | Counter | Number of (total size of client requests in queue) limit hits | +| ratis | server | numResourceLimitHits | Counter | Sum of numRequestQueueLimitHits and numRequestsByteSizeLimitHits | +| ratis | server | numPendingRequestInQueue | Gauge | Number of pending client requests in queue | +| ratis | server | numPendingRequestMegaByteSize | Gauge | Total size of pending client requests in queue | +| ratis | server | retryCacheEntryCount | Gauge | Number of entries in retry cache | +| ratis | server | retryCacheHitCount | Gauge | Number of retry cache hits | +| ratis | server | retryCacheHitRate | Gauge | Retry cache hit rate | +| ratis | server | retryCacheMissCount | Gauge | Number of retry cache misses | +| ratis | server | retryCacheMissRate | Gauge | Retry cache miss rate | +| ratis | server | numFailedClientStaleReadOnServer | Counter | Number of failed stale-read requests | +| ratis | server | numFailedClientReadOnServer | Counter | Number of failed read requests | +| ratis | server | numFailedClientWriteOnServer | Counter | Number of failed write requests | +| ratis | server | numFailedClientWatchOnServer | Counter | Number of failed watch requests | +| ratis | server | numFailedClientStreamOnServer | Counter | Number of failed stream requests | +| ratis | server | numInstallSnapshot | Counter | Number of install-snapshot requests | + + +## Ratis Netty Metrics + +| Application | Component | Name | Type | Description | +|-------------|---------------|-------------------------------|---------|-------------------------------------------| +| ratis_netty | stream_server | {request}_latency | timer | Time taken to process data stream request | +| ratis_netty | stream_server | {request}_success_reply_count | Counter | Number of success replies of request | +| ratis_netty | stream_server | {request}_fail_reply_count | Counter | Number of fail replies of request | +| ratis_netty | stream_server | num_requests_{request} | Counter | Number of total data stream requests | + +## Ratis gRPC Metrics + +### Message Metrics + +| Application | Component | Name | Type | Description | +|-------------|------------------------|----------------------------|---------|--------------------------------------------------| +| ratis | client_message_metrics | {method}_started_total | Counter | total messages started of {method} | +| ratis | client_message_metrics | {method}_completed_total | Counter | total messages completed of {method} | +| ratis | client_message_metrics | {method}_received_executed | Counter | total messages received and executed of {method} | +| ratis | server_message_metrics | {method}_started_total | Counter | total messages started of {method} | +| ratis | server_message_metrics | {method}_completed_total | Counter | total messages completed of {method} | +| ratis | server_message_metrics | {method}_received_executed | Counter | total messages received and executed of {method} | + +### gRPC Log Appender Metrics + + +| Application | Component | Name | Type | Description | +|-------------|--------------|---------------------------------------|---------|---------------------------------------------| +| ratis_grpc | log_appender | {appendEntries}_latency | Timer | Latency of method (appendEntries/heartbeat) | +| ratis_grpc | log_appender | {follower}_success_reply_count | Counter | Number of success replies | +| ratis_grpc | log_appender | {follower}_not_leader_reply_count | Counter | Number of NotLeader replies | +| ratis_grpc | log_appender | {follower}_inconsistency_reply_count | Counter | Number of Inconsistency replies | +| ratis_grpc | log_appender | {follower}_append_entry_timeout_count | Counter | Number of appendEntries timeouts | +| ratis_grpc | log_appender | {follower}_pending_log_requests_count | Counter | Number of pending requests | +| ratis_grpc | log_appender | num_retries | Counter | Number of request retries | +| ratis_grpc | log_appender | num_requests | Counter | Number of requests in total | +| ratis_grpc | log_appender | num_install_snapshot | Counter | Number of install snapshot requests | diff --git a/ratis-server/src/main/java/org/apache/ratis/server/metrics/SegmentedRaftLogMetrics.java b/ratis-server/src/main/java/org/apache/ratis/server/metrics/SegmentedRaftLogMetrics.java index f359e1a0b..810fcb003 100644 --- a/ratis-server/src/main/java/org/apache/ratis/server/metrics/SegmentedRaftLogMetrics.java +++ b/ratis-server/src/main/java/org/apache/ratis/server/metrics/SegmentedRaftLogMetrics.java @@ -67,9 +67,9 @@ public class SegmentedRaftLogMetrics extends RaftLogMetricsBase { /** Number of entries appended to the raft log */ public static final String RAFT_LOG_APPEND_ENTRY_COUNT = "appendEntryCount"; public static final String RAFT_LOG_PURGE_METRIC = "purgeLog"; - /** Time taken for a Raft log operation to complete write state machine data. */ + /** Number of statemachine dataApi write timeouts */ public static final String RAFT_LOG_STATEMACHINE_DATA_WRITE_TIMEOUT_COUNT = "numStateMachineDataWriteTimeout"; - /** Time taken for a Raft log operation to complete read state machine data. */ + /** Number of statemachine dataApi read timeouts */ public static final String RAFT_LOG_STATEMACHINE_DATA_READ_TIMEOUT_COUNT = "numStateMachineDataReadTimeout"; //////////////////////////////
