jojochuang commented on code in PR #9406:
URL: https://github.com/apache/ozone/pull/9406#discussion_r2600495538


##########
hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/OMSnapshotDirectoryMetrics.java:
##########
@@ -0,0 +1,360 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.om.snapshot;
+
+import static org.apache.hadoop.ozone.OzoneConsts.ROCKSDB_SST_SUFFIX;
+import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_SNAPSHOT_DIRECTORY_METRICS_UPDATE_INTERVAL;
+import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_SNAPSHOT_DIRECTORY_METRICS_UPDATE_INTERVAL_DEFAULT;
+
+import com.google.common.annotations.VisibleForTesting;
+import java.io.File;
+import java.io.IOException;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Timer;
+import java.util.TimerTask;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.concurrent.atomic.AtomicReference;
+import org.apache.commons.io.FileUtils;
+import org.apache.hadoop.hdds.annotation.InterfaceAudience;
+import org.apache.hadoop.hdds.conf.OzoneConfiguration;
+import org.apache.hadoop.hdds.utils.db.DBStore;
+import org.apache.hadoop.hdds.utils.db.RDBStore;
+import org.apache.hadoop.metrics2.MetricsCollector;
+import org.apache.hadoop.metrics2.MetricsInfo;
+import org.apache.hadoop.metrics2.MetricsSource;
+import org.apache.hadoop.metrics2.MetricsSystem;
+import org.apache.hadoop.metrics2.annotation.Metric;
+import org.apache.hadoop.metrics2.annotation.Metrics;
+import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
+import org.apache.hadoop.metrics2.lib.MetricsRegistry;
+import org.apache.hadoop.metrics2.lib.MutableGaugeLong;
+import org.apache.hadoop.ozone.OzoneConsts;
+import org.apache.hadoop.ozone.om.OMMetadataManager;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Metrics for tracking db.snapshots directory space usage and SST file counts.
+ * Provides both aggregate metrics and per-checkpoint-directory metrics.
+ * Metrics are updated asynchronously to avoid blocking operations.
+ */
+@InterfaceAudience.Private
+@Metrics(about = "OM Snapshot Directory Metrics", context = OzoneConsts.OZONE)
+public final class OMSnapshotDirectoryMetrics implements MetricsSource {
+  private static final Logger LOG =
+      LoggerFactory.getLogger(OMSnapshotDirectoryMetrics.class);
+  private static final String SOURCE_NAME =
+      OMSnapshotDirectoryMetrics.class.getSimpleName();
+
+  // Aggregate metrics
+  private @Metric MutableGaugeLong dbSnapshotsDirSize;
+  private @Metric MutableGaugeLong totalSstFilesCount;
+  private @Metric MutableGaugeLong numSnapshots;
+
+  private final AtomicLong lastUpdateTime = new AtomicLong(0);
+  private final AtomicReference<CompletableFuture<Void>> currentUpdateFutureRef =
+      new AtomicReference<>();
+  private final OMMetadataManager metadataManager;
+  private final MetricsRegistry registry = new MetricsRegistry(SOURCE_NAME);
+
+  // Per-checkpoint-directory metrics storage
+  private volatile Map<String, CheckpointMetrics> checkpointMetricsMap = new HashMap<>();
+
+  private Timer updateTimer;
+
+  /**
+   * Starts the periodic metrics update task.
+   *
+   * @param conf OzoneConfiguration for reading update interval
+   */
+  public void start(OzoneConfiguration conf) {
+    long updateInterval = conf.getTimeDuration(OZONE_OM_SNAPSHOT_DIRECTORY_METRICS_UPDATE_INTERVAL,
+        OZONE_OM_SNAPSHOT_DIRECTORY_METRICS_UPDATE_INTERVAL_DEFAULT,
+        TimeUnit.MILLISECONDS);
+
+    updateTimer = new Timer("OMSnapshotDirectoryMetricsUpdate", true);
+    updateTimer.schedule(new TimerTask() {
+      @Override
+      public void run() {
+        updateMetricsAsync();
+      }
+    }, 0, updateInterval);
+  }
+
+  /**
+   * Stops the periodic metrics update task.
+   */
+  public void stop() {
+    if (updateTimer != null) {
+      updateTimer.cancel();
+      updateTimer = null;
+    }
+  }
+
+  public void unRegister() {
+    stop();
+    MetricsSystem ms = DefaultMetricsSystem.instance();
+    ms.unregisterSource(SOURCE_NAME);
+  }
+
+  /**
+   * Internal class to store per-checkpoint metrics.
+   */
+  private static class CheckpointMetrics {
+    private final long size;
+    private final int sstFileCount;
+
+    CheckpointMetrics(long size, int sstFileCount) {
+      this.size = size;
+      this.sstFileCount = sstFileCount;
+    }
+
+    public long getSize() {
+      return size;
+    }
+
+    public int getSstFileCount() {
+      return sstFileCount;
+    }
+  }
+
+  private OMSnapshotDirectoryMetrics(OMMetadataManager metadataManager) {
+    this.metadataManager = metadataManager;
+  }
+
+  public static OMSnapshotDirectoryMetrics create(String parent,
+      OMMetadataManager metadataManager) {
+    MetricsSystem ms = DefaultMetricsSystem.instance();
+    return ms.register(SOURCE_NAME,
+        parent,
+        new OMSnapshotDirectoryMetrics(metadataManager));
+  }
+
+  /**
+   * Updates all metrics (aggregate and per-checkpoint) asynchronously
+   * in a background thread.
+   */
+  public void updateMetricsAsync() {
+    CompletableFuture<Void> currentUpdateFuture = currentUpdateFutureRef.get();
+    if (currentUpdateFuture != null && !currentUpdateFuture.isDone()) {
+      return;
+    }
+
+    CompletableFuture<Void> newFuture = CompletableFuture.runAsync(() -> {
+      try {
+        updateMetrics();
+        lastUpdateTime.set(System.currentTimeMillis());
+      } catch (Exception e) {
+        LOG.warn("Failed to update snapshot directory metrics", e);
+      } finally {
+        currentUpdateFutureRef.set(null);
+      }
+    });
+
+    // Atomically set the future only if the current value matches what we checked
+    // This prevents race conditions where multiple threads try to set a new future
+    CompletableFuture<Void> expected = currentUpdateFutureRef.get();
+    if (expected == null || expected.isDone()) {
+      // Only set if still null or done (double-check after creating future)
+      if (!currentUpdateFutureRef.compareAndSet(expected, newFuture)) {
+        // Another thread set a future, cancel this one
+        newFuture.cancel(false);
+      }
+    } else {
+      // Another thread started an update, cancel this one
+      newFuture.cancel(false);
+    }
+  }
+
+  /**
+   * Updates all metrics synchronously - both aggregate and per-checkpoint-directory.
+   */
+  @VisibleForTesting
+  void updateMetrics() throws IOException {
+    DBStore store = metadataManager.getStore();
+    if (!(store instanceof RDBStore)) {
+      LOG.debug("Store is not RDBStore, skipping snapshot directory metrics update");
+      resetMetrics();
+      return;
+    }
+
+    RDBStore rdbStore = (RDBStore) store;
+    String snapshotsParentDir = rdbStore.getSnapshotsParentDir();
+
+    if (snapshotsParentDir == null) {
+      resetMetrics();
+      return;
+    }
+
+    File snapshotsDir = new File(snapshotsParentDir);
+    if (!snapshotsDir.exists() || !snapshotsDir.isDirectory()) {
+      resetMetrics();
+      return;
+    }
+
+    try {
+      // Calculate aggregate metrics
+      long totalSize = FileUtils.sizeOfDirectory(snapshotsDir);

Review Comment:
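   Does it take hard links into account, by any chance? If checkpoint directories share SST files via hard links, the same file could be counted once per checkpoint.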



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to