sadanand48 commented on code in PR #9406:
URL: https://github.com/apache/ozone/pull/9406#discussion_r2628543291


##########
hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/OMSnapshotDirectoryMetrics.java:
##########
@@ -0,0 +1,399 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.om.snapshot;
+
+import static org.apache.hadoop.ozone.OzoneConsts.ROCKSDB_SST_SUFFIX;
+import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_SNAPSHOT_DIRECTORY_METRICS_UPDATE_INTERVAL;
+import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_SNAPSHOT_DIRECTORY_METRICS_UPDATE_INTERVAL_DEFAULT;
+
+import com.google.common.annotations.VisibleForTesting;
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.Executors;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.ScheduledFuture;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+import org.apache.hadoop.hdds.annotation.InterfaceAudience;
+import org.apache.hadoop.hdds.conf.OzoneConfiguration;
+import org.apache.hadoop.hdds.utils.IOUtils;
+import org.apache.hadoop.hdds.utils.db.DBStore;
+import org.apache.hadoop.hdds.utils.db.RDBStore;
+import org.apache.hadoop.metrics2.MetricsCollector;
+import org.apache.hadoop.metrics2.MetricsInfo;
+import org.apache.hadoop.metrics2.MetricsSource;
+import org.apache.hadoop.metrics2.MetricsSystem;
+import org.apache.hadoop.metrics2.annotation.Metric;
+import org.apache.hadoop.metrics2.annotation.Metrics;
+import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
+import org.apache.hadoop.metrics2.lib.MetricsRegistry;
+import org.apache.hadoop.metrics2.lib.MutableGaugeLong;
+import org.apache.hadoop.ozone.OzoneConsts;
+import org.apache.hadoop.ozone.om.OMMetadataManager;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Metrics for tracking db.snapshots directory space usage and SST file counts.
+ * Provides both aggregate metrics and per-checkpoint-directory metrics.
+ * Metrics are updated asynchronously to avoid blocking operations.
+ */
[email protected]
+@Metrics(about = "OM Snapshot Directory Metrics", context = OzoneConsts.OZONE)
+public final class OMSnapshotDirectoryMetrics implements MetricsSource {
+  private static final Logger LOG =
+      LoggerFactory.getLogger(OMSnapshotDirectoryMetrics.class);
+  private static final String SOURCE_NAME =
+      OMSnapshotDirectoryMetrics.class.getSimpleName();
+
+  // Aggregate metrics
+  private @Metric MutableGaugeLong dbSnapshotsDirSize;
+  private @Metric MutableGaugeLong totalSstFilesCount;
+  private @Metric MutableGaugeLong numSnapshots;
+
+  private final AtomicLong lastUpdateTime = new AtomicLong(0);
+  private final OMMetadataManager metadataManager;
+  private final MetricsRegistry registry = new MetricsRegistry(SOURCE_NAME);
+
+  // Per-checkpoint-directory metrics storage
+  private volatile Map<String, CheckpointMetrics> checkpointMetricsMap = new HashMap<>();
+  private ScheduledExecutorService updateExecutor;
+  private ScheduledFuture<?> updateTask;
+
+  /**
+   * Starts the periodic metrics update task.
+   *
+   * @param conf OzoneConfiguration for reading update interval
+   */
+  public void start(OzoneConfiguration conf) {
+    long updateInterval = conf.getTimeDuration(OZONE_OM_SNAPSHOT_DIRECTORY_METRICS_UPDATE_INTERVAL,
+        OZONE_OM_SNAPSHOT_DIRECTORY_METRICS_UPDATE_INTERVAL_DEFAULT,
+        TimeUnit.MILLISECONDS);
+
+    updateExecutor = Executors.newSingleThreadScheduledExecutor(r -> {
+      Thread t = new Thread(r, "OMSnapshotDirectoryMetricsUpdate");
+      t.setDaemon(true);
+      return t;
+    });
+
+    // Schedule periodic updates
+    updateTask = updateExecutor.scheduleWithFixedDelay(() -> {
+      try {
+        updateMetrics();
+        lastUpdateTime.set(System.currentTimeMillis());
+      } catch (Exception e) {
+        LOG.warn("Failed to update snapshot directory metrics", e);
+      }
+    }, 0, updateInterval, TimeUnit.MILLISECONDS);
+  }
+
+  /**
+   * Stops the periodic metrics update task.
+   */
+  public void stop() {
+    if (updateTask != null) {
+      updateTask.cancel(false); // Don't interrupt if running
+      updateTask = null;
+    }
+
+    if (updateExecutor != null) {
+      updateExecutor.shutdown();
+      try {
+        // Wait for any running updateMetrics() to complete (with timeout)
+        if (!updateExecutor.awaitTermination(30, TimeUnit.SECONDS)) {
+          LOG.warn("Metrics update executor did not terminate in time, forcing shutdown");
+          updateExecutor.shutdownNow();
+          // Wait a bit more for cancellation to take effect
+          if (!updateExecutor.awaitTermination(5, TimeUnit.SECONDS)) {
+            LOG.error("Metrics update executor did not terminate after force shutdown");
+          }
+        }
+      } catch (InterruptedException e) {
+        Thread.currentThread().interrupt();
+        updateExecutor.shutdownNow();
+      }
+      updateExecutor = null;
+    }
+  }
+
+  public void unRegister() {
+    stop();
+    MetricsSystem ms = DefaultMetricsSystem.instance();
+    ms.unregisterSource(SOURCE_NAME);
+  }
+
+  /**
+   * Internal class to store per-checkpoint metrics.
+   */
+  private static class CheckpointMetrics {
+    private final long size;
+    private final int sstFileCount;
+
+    CheckpointMetrics(long size, int sstFileCount) {
+      this.size = size;
+      this.sstFileCount = sstFileCount;
+    }
+
+    public long getSize() {
+      return size;
+    }
+
+    public int getSstFileCount() {
+      return sstFileCount;
+    }
+  }
+
+  private OMSnapshotDirectoryMetrics(OMMetadataManager metadataManager) {
+    this.metadataManager = metadataManager;
+  }
+
+  public static OMSnapshotDirectoryMetrics create(String parent,
+      OMMetadataManager metadataManager) {
+    MetricsSystem ms = DefaultMetricsSystem.instance();
+    return ms.register(SOURCE_NAME,
+        parent,
+        new OMSnapshotDirectoryMetrics(metadataManager));
+  }
+
+  /**
+   * Updates all metrics synchronously - both aggregate and per-checkpoint-directory.
+   */
+  @VisibleForTesting
+  void updateMetrics() throws IOException {
+    DBStore store = metadataManager.getStore();
+    if (!(store instanceof RDBStore)) {
+      LOG.debug("Store is not RDBStore, skipping snapshot directory metrics update");
+      resetMetrics();
+      return;
+    }
+
+    RDBStore rdbStore = (RDBStore) store;
+    String snapshotsParentDir = rdbStore.getSnapshotsParentDir();
+
+    if (snapshotsParentDir == null) {
+      resetMetrics();
+      return;
+    }
+
+    File snapshotsDir = new File(snapshotsParentDir);
+    if (!snapshotsDir.exists() || !snapshotsDir.isDirectory()) {
+      resetMetrics();
+      return;
+    }
+
+    try {
+      // Check for interruption before expensive operations
+      if (Thread.currentThread().isInterrupted()) {
+        LOG.info("Metrics update interrupted, skipping");
+        return;
+      }
+
+      // Calculate aggregate metrics
+      long totalSize = calculateDirSizeAccountingForHardlinks(snapshotsDir);
+      dbSnapshotsDirSize.set(totalSize);
+
+      // Calculate per-checkpoint-directory metrics and aggregate totals
+      File[] checkpointDirs = snapshotsDir.listFiles(File::isDirectory);
+      int totalSstCount = 0;
+      int snapshotCount = 0;
+      Map<String, CheckpointMetrics> newCheckpointMetricsMap = new HashMap<>();
+
+      if (checkpointDirs != null) {
+        snapshotCount = checkpointDirs.length;
+
+        for (File checkpointDir : checkpointDirs) {
+          String checkpointDirName = checkpointDir.getName();
+          long checkpointSize = 0;
+          int sstFileCount = 0;
+
+          try {
+            checkpointSize = calculateDirSizeAccountingForHardlinks(checkpointDir);
+            File[] sstFiles = checkpointDir.listFiles((dir, name) ->
+                name.toLowerCase().endsWith(ROCKSDB_SST_SUFFIX));
+            if (sstFiles != null) {
+              sstFileCount = sstFiles.length;
+            }
+          } catch (Exception e) {
+            LOG.debug("Error calculating metrics for checkpoint directory {}",
+                checkpointDirName, e);
+            // Continue with other directories even if one fails
+            continue;
+          }
+
+          totalSstCount += sstFileCount;
+          newCheckpointMetricsMap.put(checkpointDirName,
+              new CheckpointMetrics(checkpointSize, sstFileCount));
+        }
+      }
+
+      // Update aggregate metrics
+      totalSstFilesCount.set(totalSstCount);
+      numSnapshots.set(snapshotCount);
+
+      // Atomically update per-checkpoint metrics map
+      checkpointMetricsMap = Collections.unmodifiableMap(newCheckpointMetricsMap);
+
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("Updated snapshot directory metrics: size={}, sstFiles={}, snapshots={}",
+            totalSize, totalSstCount, snapshotCount);
+      }
+
+    } catch (Exception e) {
+      LOG.warn("Error calculating snapshot directory metrics", e);
+      resetMetrics();
+    }
+  }
+
+  /**
+   * Calculates directory size accounting for hardlinks
+   * (each inode is counted only once).
+   * Uses Files.getAttribute to read the inode number and tracks
+   * visited inodes.
+   *
+   * @param directory the directory to calculate size for
+   * @return total size in bytes, counting each inode only once
+   */
+  private long calculateDirSizeAccountingForHardlinks(File directory)
+      throws IOException {
+    Set<Object> visitedInodes = new HashSet<>();

Review Comment:
   The directory here is the db.snapshots/checkpointState dir that contains all the checkpoint dirs, so we should be good.
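
   For reference, a minimal sketch of how the remainder of the method might look. This is not the PR's actual implementation: it assumes an extra `java.nio.file.attribute.BasicFileAttributes` import (not present in the hunk above) and uses `fileKey()` as the inode identity on POSIX file systems, rather than the `unix:ino` attribute the javadoc mentions, for portability.

```java
// Hypothetical continuation of calculateDirSizeAccountingForHardlinks,
// reusing the visitedInodes set declared at the top of the method.
long totalSize = 0;
try (Stream<Path> paths = Files.walk(directory.toPath())) {
  for (Path path : (Iterable<Path>) paths::iterator) {
    if (!Files.isRegularFile(path)) {
      continue;
    }
    // fileKey() exposes the inode on POSIX file systems; add() returns
    // false for an inode already seen, so hardlinked SST files are
    // counted only once.
    Object inode =
        Files.readAttributes(path, BasicFileAttributes.class).fileKey();
    if (inode == null || visitedInodes.add(inode)) {
      totalSize += Files.size(path);
    }
  }
}
return totalSize;
```

   When the method is invoked on the parent checkpointState dir (the aggregate path above), a single visitedInodes set spans every checkpoint dir, so an SST file hardlinked into multiple checkpoints contributes to dbSnapshotsDirSize only once.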


