This is an automated email from the ASF dual-hosted git repository.

rpuch pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/ignite-3.git


The following commit(s) were added to refs/heads/main by this push:
     new be5df8031e6 IGNITE-27356 Add checkpoint metrics (#7250)
be5df8031e6 is described below

commit be5df8031e6b60ea976e724d4d6062df6e763dae
Author: Viacheslav Blinov <[email protected]>
AuthorDate: Tue Dec 30 17:26:38 2025 +0300

    IGNITE-27356 Add checkpoint metrics (#7250)
    
    Co-authored-by: Roman Puchkovskiy <[email protected]>
---
 .../persistence/checkpoint/CheckpointManager.java  |   5 +-
 .../checkpoint/CheckpointReadWriteLock.java        |  63 ++++++++---
 .../checkpoint/CheckpointReadWriteLockMetrics.java | 120 +++++++++++++++++++++
 .../checkpoint/CheckpointReadWriteLockTest.java    |   6 +-
 .../checkpoint/CheckpointTimeoutLockTest.java      |  72 ++++++++++++-
 .../checkpoint/CheckpointTestUtils.java            |   8 +-
 6 files changed, 254 insertions(+), 20 deletions(-)

diff --git a/modules/page-memory/src/main/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointManager.java b/modules/page-memory/src/main/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointManager.java
index 526490a4cd9..b6e23dbb363 100644
--- a/modules/page-memory/src/main/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointManager.java
+++ b/modules/page-memory/src/main/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointManager.java
@@ -124,9 +124,12 @@ public class CheckpointManager {
                 ? new ReentrantReadWriteLockWithTracking(Loggers.forClass(CheckpointReadWriteLock.class), logReadLockThresholdTimeout)
                 : new ReentrantReadWriteLockWithTracking();
 
+        var readWriteLockMetrics = new CheckpointReadWriteLockMetrics(checkpointMetricSource);
+
         CheckpointReadWriteLock checkpointReadWriteLock = new CheckpointReadWriteLock(
                 reentrantReadWriteLockWithTracking,
-                commonExecutorService
+                commonExecutorService,
+                readWriteLockMetrics
         );
 
         checkpointWorkflow = new CheckpointWorkflow(
diff --git a/modules/page-memory/src/main/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointReadWriteLock.java b/modules/page-memory/src/main/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointReadWriteLock.java
index eab912f45f2..290374878a2 100644
--- a/modules/page-memory/src/main/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointReadWriteLock.java
+++ b/modules/page-memory/src/main/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointReadWriteLock.java
@@ -17,8 +17,6 @@
 
 package org.apache.ignite.internal.pagememory.persistence.checkpoint;
 
-import static org.apache.ignite.internal.util.FastTimestamps.coarseCurrentTimeMillis;
-
 import java.util.concurrent.Executor;
 import java.util.concurrent.TimeUnit;
 import org.apache.ignite.internal.lang.IgniteInternalException;
@@ -43,11 +41,15 @@ public class CheckpointReadWriteLock {
 
     private final IgniteThrottledLogger log;
 
+    private final ThreadLocal<Long> checkpointReadLockAcquiredTime = new ThreadLocal<>();
+
     private final ThreadLocal<Integer> checkpointReadLockHoldCount = ThreadLocal.withInitial(() -> 0);
 
     /** Checkpoint lock. */
     private final ReentrantReadWriteLockWithTracking checkpointLock;
 
+    private final CheckpointReadWriteLockMetrics metrics;
+
     /** Current write lock holder thread. */
     private volatile @Nullable Thread currentWriteLockHolder;
 
@@ -56,10 +58,16 @@ public class CheckpointReadWriteLock {
      *
      * @param checkpointLock Checkpoint lock.
      * @param throttledLogExecutor Executor for the throttled logger.
+     * @param metrics Read/write lock metrics.
      */
-    public CheckpointReadWriteLock(ReentrantReadWriteLockWithTracking checkpointLock, Executor throttledLogExecutor) {
+    public CheckpointReadWriteLock(
+            ReentrantReadWriteLockWithTracking checkpointLock,
+            Executor throttledLogExecutor,
+            CheckpointReadWriteLockMetrics metrics
+    ) {
         this.checkpointLock = checkpointLock;
         this.log = Loggers.toThrottledLogger(Loggers.forClass(CheckpointReadWriteLock.class), throttledLogExecutor);
+        this.metrics = metrics;
     }
 
     /**
@@ -72,11 +80,12 @@ public class CheckpointReadWriteLock {
             return;
         }
 
-        long start = coarseCurrentTimeMillis();
+        long startNanos = System.nanoTime();
 
+        metrics.incrementReadLockWaitingThreads();
         checkpointLock.readLock().lock();
 
-        onReadLock(start, true);
+        onReadLock(startNanos, true);
     }
 
     /**
@@ -91,11 +100,12 @@ public class CheckpointReadWriteLock {
             return true;
         }
 
-        long start = coarseCurrentTimeMillis();
+        long startNanos = System.nanoTime();
 
+        metrics.incrementReadLockWaitingThreads();
         boolean res = checkpointLock.readLock().tryLock(timeout, unit);
 
-        onReadLock(start, res);
+        onReadLock(startNanos, res);
 
         return res;
     }
@@ -110,11 +120,12 @@ public class CheckpointReadWriteLock {
             return true;
         }
 
-        long start = coarseCurrentTimeMillis();
+        long startNanos = System.nanoTime();
 
+        metrics.incrementReadLockWaitingThreads();
         boolean res = checkpointLock.readLock().tryLock();
 
-        onReadLock(start, res);
+        onReadLock(startNanos, res);
 
         return res;
     }
@@ -138,7 +149,7 @@ public class CheckpointReadWriteLock {
 
         checkpointLock.readLock().unlock();
 
-        checkpointReadLockHoldCount.set(checkpointReadLockHoldCount.get() - 1);
+        onReadUnlock();
     }
 
     /**
@@ -181,15 +192,37 @@ public class CheckpointReadWriteLock {
         return checkpointLock.hasQueuedWriters();
     }
 
-    private void onReadLock(long start, boolean taken) {
-        long elapsed = coarseCurrentTimeMillis() - start;
+    private void onReadLock(long startNanos, boolean taken) {
+        metrics.decrementReadLockWaitingThreads();
+
+        long currentNanos = System.nanoTime();
+        long elapsedNanos = currentNanos - startNanos;
 
         if (taken) {
-            checkpointReadLockHoldCount.set(checkpointReadLockHoldCount.get() + 1);
+            int newLockCount = checkpointReadLockHoldCount.get() + 1;
+            checkpointReadLockHoldCount.set(newLockCount);
+
+            // We only record acquisition time on first lock acquisition (not on reentry).
+            if (newLockCount == 1) {
+                checkpointReadLockAcquiredTime.set(currentNanos);
+            }
+            metrics.recordReadLockAcquisitionTime(elapsedNanos);
         }
 
-        if (elapsed > LONG_LOCK_THRESHOLD_MILLIS) {
-            log.warn(LONG_LOCK_THROTTLE_KEY, "Checkpoint read lock took {} ms to acquire.", elapsed);
+        long elapsedMillis = TimeUnit.NANOSECONDS.toMillis(elapsedNanos);
+        if (elapsedMillis > LONG_LOCK_THRESHOLD_MILLIS) {
+            log.warn(LONG_LOCK_THROTTLE_KEY, "Checkpoint read lock took {} ms to acquire.", elapsedMillis);
+        }
+    }
+
+    private void onReadUnlock() {
+        int newLockCount = checkpointReadLockHoldCount.get() - 1;
+        checkpointReadLockHoldCount.set(newLockCount);
+        if (newLockCount == 0) {
+            // Fully unlocked - record hold duration.
+            Long acquiredTimeNanos = checkpointReadLockAcquiredTime.get();
+            long holdDurationNanos = System.nanoTime() - acquiredTimeNanos;
+            metrics.recordReadLockHoldDuration(holdDurationNanos);
         }
     }
 }
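
For readers skimming the diff: the new onReadLock/onReadUnlock bookkeeping follows a small per-thread pattern - count reentrant holds in a ThreadLocal, record acquisition latency on every acquire, stamp the acquire time only on the outermost acquire, and report the hold duration only once the count drops back to zero. A minimal standalone sketch of that pattern follows; the LongConsumer callbacks stand in for the CheckpointReadWriteLockMetrics calls and are not part of the Ignite API.

import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.function.LongConsumer;

/** Minimal sketch of the per-thread timing bookkeeping (stand-in callbacks, not the real metrics API). */
class ReentrantReadLockTiming {
    private final ReentrantReadWriteLock lock = new ReentrantReadWriteLock();

    // Per-thread reentrancy depth and timestamp of the outermost acquisition.
    private final ThreadLocal<Integer> holdCount = ThreadLocal.withInitial(() -> 0);
    private final ThreadLocal<Long> acquiredAtNanos = new ThreadLocal<>();

    private final LongConsumer recordAcquisitionNanos;
    private final LongConsumer recordHoldNanos;

    ReentrantReadLockTiming(LongConsumer recordAcquisitionNanos, LongConsumer recordHoldNanos) {
        this.recordAcquisitionNanos = recordAcquisitionNanos;
        this.recordHoldNanos = recordHoldNanos;
    }

    void readLock() {
        long startNanos = System.nanoTime();
        lock.readLock().lock();

        long now = System.nanoTime();
        int newCount = holdCount.get() + 1;
        holdCount.set(newCount);

        // Acquisition latency is recorded on every acquire; the hold timer starts only on the outermost one.
        if (newCount == 1) {
            acquiredAtNanos.set(now);
        }
        recordAcquisitionNanos.accept(now - startNanos);
    }

    void readUnlock() {
        lock.readLock().unlock();

        int newCount = holdCount.get() - 1;
        holdCount.set(newCount);

        // Hold duration covers the whole outermost lock/unlock span, ignoring reentrant pairs inside it.
        if (newCount == 0) {
            recordHoldNanos.accept(System.nanoTime() - acquiredAtNanos.get());
        }
    }
}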
diff --git a/modules/page-memory/src/main/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointReadWriteLockMetrics.java b/modules/page-memory/src/main/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointReadWriteLockMetrics.java
new file mode 100644
index 00000000000..e1d63df4017
--- /dev/null
+++ b/modules/page-memory/src/main/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointReadWriteLockMetrics.java
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.ignite.internal.pagememory.persistence.checkpoint;
+
+import org.apache.ignite.internal.metrics.DistributionMetric;
+import org.apache.ignite.internal.metrics.LongAdderMetric;
+
+/**
+ * Metrics for checkpoint read/write lock operations.
+ *
+ * <p>This metric source tracks performance and contention characteristics of checkpoint read lock
+ * acquired by normal operations during database operation.
+ */
+public class CheckpointReadWriteLockMetrics {
+    private static final long[] LOCK_ACQUISITION_BOUNDS_NANOS = {
+            1_000,           // 1µs   - uncontended, fast path
+            10_000,          // 10µs  - minor contention
+            100_000,         // 100µs - moderate contention
+            1_000_000,       // 1ms   - high contention
+            10_000_000,      // 10ms  - checkpoint in progress?
+            100_000_000,     // 100ms - severe contention, reported as warning in logs
+            1_000_000_000    // 1s    - pathological case, shall be treated as an emergency error
+    };
+
+    private static final long[] LOCK_HOLD_BOUNDS_NANOS = {
+            1_000,           // 1µs    - very fast operation (single field update)
+            10_000,          // 10µs   - fast single-page operation
+            100_000,         // 100µs  - multi-page operation
+            1_000_000,       // 1ms    - complex operation
+            10_000_000,      // 10ms   - batch operation
+            100_000_000,     // 100ms  - large batch or slow I/O
+            1_000_000_000    // 1s     - pathologically long operation
+    };
+
+    private final DistributionMetric readLockAcquisitionTime = new DistributionMetric(
+            "ReadLockAcquisitionTime",
+            "Time from requesting checkpoint read lock until acquisition in 
nanoseconds.",
+            LOCK_ACQUISITION_BOUNDS_NANOS
+    );
+
+    private final DistributionMetric readLockHoldTime = new DistributionMetric(
+            "ReadLockHoldTime",
+            "Duration between checkpoint read lock acquisition and release in 
nanoseconds.",
+            LOCK_HOLD_BOUNDS_NANOS
+    );
+
+    private final LongAdderMetric readLockWaitingThreads = new LongAdderMetric(
+            "ReadLockWaitingThreads",
+            "Current number of threads waiting for checkpoint read lock."
+    );
+
+    /**
+     * Constructor.
+     *
+     * @param metricSource Metric source to register metrics with.
+     */
+    public CheckpointReadWriteLockMetrics(CheckpointMetricSource metricSource) {
+        metricSource.addMetric(readLockAcquisitionTime);
+        metricSource.addMetric(readLockHoldTime);
+        metricSource.addMetric(readLockWaitingThreads);
+    }
+
+    /**
+     * Records the duration of a lock acquisition in nanoseconds.
+     */
+    public void recordReadLockAcquisitionTime(long acquisitionDurationNanos) {
+        readLockAcquisitionTime.add(acquisitionDurationNanos);
+    }
+
+    /**
+     * Records the duration of a lock hold in nanoseconds.
+     */
+    public void recordReadLockHoldDuration(long lockHoldDurationNanos) {
+        readLockHoldTime.add(lockHoldDurationNanos);
+    }
+
+    /**
+     * Increments the count of threads waiting for the read lock.
+     */
+    public void incrementReadLockWaitingThreads() {
+        readLockWaitingThreads.increment();
+    }
+
+    /**
+     * Decrements the count of threads waiting for the read lock.
+     */
+    public void decrementReadLockWaitingThreads() {
+        readLockWaitingThreads.decrement();
+    }
+
+    /** Returns the read lock acquisition time metric. */
+    DistributionMetric readLockAcquisitionTime() {
+        return readLockAcquisitionTime;
+    }
+
+    /** Returns the read lock hold time metric. */
+    DistributionMetric readLockHoldTime() {
+        return readLockHoldTime;
+    }
+
+    /** Returns the read lock waiting threads metric. */
+    LongAdderMetric readLockWaitingThreads() {
+        return readLockWaitingThreads;
+    }
+}
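
The bound arrays above are upper limits in nanoseconds: each recorded value falls into the first bucket whose bound it does not exceed, with an implicit overflow bucket above the largest bound, and summing all bucket counters gives the total number of recorded measurements (the quantity the new test later in this diff checks via Arrays.stream(metric.value()).sum()). Below is a minimal standalone sketch of that bucketing idea, assuming bound-inclusive buckets; it is not the actual DistributionMetric implementation, whose edge handling may differ.

import java.util.Arrays;
import java.util.concurrent.atomic.AtomicLongArray;

/** Standalone illustration of bound-based histogram bucketing (not the real DistributionMetric code). */
class BoundedHistogram {
    private final long[] bounds;          // upper bounds, ascending, e.g. {1_000, 10_000, ..., 1_000_000_000}
    private final AtomicLongArray counts; // bounds.length + 1 buckets; the last one catches values above the top bound

    BoundedHistogram(long[] bounds) {
        this.bounds = bounds.clone();
        this.counts = new AtomicLongArray(bounds.length + 1);
    }

    void add(long valueNanos) {
        int idx = Arrays.binarySearch(bounds, valueNanos);
        // binarySearch returns -(insertionPoint) - 1 when the value is not an exact bound.
        int bucket = idx >= 0 ? idx : -idx - 1;
        counts.incrementAndGet(bucket);
    }

    long totalCount() {
        long total = 0;
        for (int i = 0; i < counts.length(); i++) {
            total += counts.get(i);
        }
        return total;
    }
}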
diff --git a/modules/page-memory/src/test/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointReadWriteLockTest.java b/modules/page-memory/src/test/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointReadWriteLockTest.java
index 43b264897e2..a7846208469 100644
--- a/modules/page-memory/src/test/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointReadWriteLockTest.java
+++ b/modules/page-memory/src/test/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointReadWriteLockTest.java
@@ -41,6 +41,10 @@ public class CheckpointReadWriteLockTest {
     @InjectExecutorService
     private ExecutorService executorService;
 
+    private final CheckpointReadWriteLockMetrics metrics = new CheckpointReadWriteLockMetrics(
+            new CheckpointMetricSource("test")
+    );
+
     @Test
     void testReadLock() throws Exception {
         CheckpointReadWriteLock lock0 = newReadWriteLock();
@@ -167,7 +171,7 @@ public class CheckpointReadWriteLockTest {
     }
 
     private CheckpointReadWriteLock newReadWriteLock() {
-        return new CheckpointReadWriteLock(new ReentrantReadWriteLockWithTracking(), executorService);
+        return new CheckpointReadWriteLock(new ReentrantReadWriteLockWithTracking(), executorService, metrics);
     }
 
     @Test
diff --git a/modules/page-memory/src/test/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointTimeoutLockTest.java b/modules/page-memory/src/test/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointTimeoutLockTest.java
index 7aa7021e44e..bca2ab77fa8 100644
--- a/modules/page-memory/src/test/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointTimeoutLockTest.java
+++ b/modules/page-memory/src/test/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointTimeoutLockTest.java
@@ -28,6 +28,7 @@ import static org.apache.ignite.internal.testframework.IgniteTestUtils.runAsync;
 import static org.apache.ignite.internal.testframework.matchers.CompletableFutureMatcher.willSucceedIn;
 import static org.apache.ignite.internal.util.IgniteUtils.closeAll;
 import static org.apache.ignite.lang.ErrorGroups.CriticalWorkers.SYSTEM_CRITICAL_OPERATION_TIMEOUT_ERR;
+import static org.awaitility.Awaitility.await;
 import static org.hamcrest.MatcherAssert.assertThat;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.instanceOf;
@@ -41,6 +42,7 @@ import static org.mockito.Mockito.doAnswer;
 import static org.mockito.Mockito.mock;
 import static org.mockito.Mockito.when;
 
+import java.util.Arrays;
 import java.util.concurrent.CancellationException;
 import java.util.concurrent.CompletableFuture;
 import java.util.concurrent.CountDownLatch;
@@ -51,6 +53,7 @@ import java.util.concurrent.atomic.AtomicReference;
 import org.apache.ignite.internal.failure.FailureManager;
 import org.apache.ignite.internal.lang.IgniteInternalException;
 import org.apache.ignite.internal.lang.NodeStoppingException;
+import org.apache.ignite.internal.metrics.DistributionMetric;
 import org.apache.ignite.internal.pagememory.persistence.CheckpointUrgency;
 import org.apache.ignite.internal.testframework.BaseIgniteAbstractTest;
 import org.apache.ignite.internal.testframework.ExecutorServiceExtension;
@@ -71,6 +74,10 @@ public class CheckpointTimeoutLockTest extends BaseIgniteAbstractTest {
     @InjectExecutorService
     private ExecutorService executorService;
 
+    private final CheckpointReadWriteLockMetrics dummyMetrics = new CheckpointReadWriteLockMetrics(
+            new CheckpointMetricSource("test")
+    );
+
     @AfterEach
     void tearDown() {
         if (timeoutLock != null) {
@@ -385,7 +392,11 @@ public class CheckpointTimeoutLockTest extends BaseIgniteAbstractTest {
     }
 
     private CheckpointReadWriteLock newReadWriteLock() {
-        return new CheckpointReadWriteLock(new ReentrantReadWriteLockWithTracking(log, 5_000), executorService);
+        return newReadWriteLock(dummyMetrics);
+    }
+
+    private CheckpointReadWriteLock newReadWriteLock(CheckpointReadWriteLockMetrics metrics) {
+        return new CheckpointReadWriteLock(new ReentrantReadWriteLockWithTracking(log, 5_000), executorService, metrics);
     }
 
     private CheckpointProgress newCheckpointProgress(CompletableFuture<?> future) {
@@ -407,4 +418,63 @@ public class CheckpointTimeoutLockTest extends BaseIgniteAbstractTest {
 
         return checkpointer;
     }
+
+    @Test
+    void testCheckpointReadLockMetrics() {
+        CheckpointMetricSource metricSource = new CheckpointMetricSource("test");
+        CheckpointReadWriteLockMetrics metrics = new CheckpointReadWriteLockMetrics(metricSource);
+        CheckpointReadWriteLock readWriteLock = newReadWriteLock(metrics);
+
+        timeoutLock = new CheckpointTimeoutLock(
+                readWriteLock,
+                10_000,
+                () -> NOT_REQUIRED,
+                mock(Checkpointer.class),
+                mock(FailureManager.class)
+        );
+
+        timeoutLock.start();
+
+        try {
+            // Verify metrics start at zero
+            assertDistributionMetricRecordsCount(metrics.readLockAcquisitionTime(), 0L);
+
+            // Acquire and immediately release the lock
+            timeoutLock.checkpointReadLock();
+            timeoutLock.checkpointReadUnlock();
+
+            // Verify acquisition was recorded
+            assertDistributionMetricRecordsCount(metrics.readLockAcquisitionTime(), 1L);
+
+            // Verify hold time distribution was recorded
+            assertDistributionMetricRecordsCount(metrics.readLockHoldTime(), 1L);
+
+            readWriteLock.writeLock();
+            runAsync(() -> {
+                timeoutLock.checkpointReadLock();
+                timeoutLock.checkpointReadUnlock();
+            });
+            await().untilAsserted(() -> assertThat(metrics.readLockWaitingThreads().value(), is(1L)));
+            readWriteLock.writeUnlock();
+            await().untilAsserted(() -> assertThat(metrics.readLockWaitingThreads().value(), is(0L)));
+        } finally {
+            timeoutLock.stop();
+        }
+    }
+
+    /**
+     * Verifies that the specified distribution metric has recorded the expected total number of measurements.
+     *
+     * <p>
+     * Rather than checking individual histogram buckets, this method aggregates all recorded measurements across every bucket
+     * and confirms that the expected interaction was captured in at least one of them.
+     */
+    private static void assertDistributionMetricRecordsCount(DistributionMetric metric, long expectedMeasuresCount) {
+        long totalMeasuresCount = Arrays.stream(metric.value()).sum();
+        assertThat(
+                "Unexpected total measures count in distribution metric " + 
metric.name(),
+                totalMeasuresCount,
+                is(expectedMeasuresCount)
+        );
+    }
 }
diff --git a/modules/page-memory/src/testFixtures/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointTestUtils.java b/modules/page-memory/src/testFixtures/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointTestUtils.java
index 8d2267ebd13..ee95c6a6b1c 100644
--- a/modules/page-memory/src/testFixtures/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointTestUtils.java
+++ b/modules/page-memory/src/testFixtures/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointTestUtils.java
@@ -42,14 +42,18 @@ import org.apache.ignite.internal.pagememory.persistence.store.FilePageStoreMana
  * Useful class for testing a checkpoint.
  */
 public class CheckpointTestUtils {
+    private static final CheckpointReadWriteLockMetrics metrics = new CheckpointReadWriteLockMetrics(
+            new CheckpointMetricSource("test")
+    );
+
     /**
      * Returns new instance of {@link CheckpointReadWriteLock}.
      *
-     * @param log             Logger.
+     * @param log Logger.
      * @param executorService Executor service.
      */
     static CheckpointReadWriteLock newReadWriteLock(IgniteLogger log, ExecutorService executorService) {
-        return new CheckpointReadWriteLock(new ReentrantReadWriteLockWithTracking(log, 5_000), executorService);
+        return new CheckpointReadWriteLock(new ReentrantReadWriteLockWithTracking(log, 5_000), executorService, metrics);
     }
 
     /**
