sadanand48 commented on code in PR #9406: URL: https://github.com/apache/ozone/pull/9406#discussion_r2600964759
########## hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/OMSnapshotDirectoryMetrics.java: ########## @@ -0,0 +1,360 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.ozone.om.snapshot; + +import static org.apache.hadoop.ozone.OzoneConsts.ROCKSDB_SST_SUFFIX; +import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_SNAPSHOT_DIRECTORY_METRICS_UPDATE_INTERVAL; +import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_SNAPSHOT_DIRECTORY_METRICS_UPDATE_INTERVAL_DEFAULT; + +import com.google.common.annotations.VisibleForTesting; +import java.io.File; +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.Timer; +import java.util.TimerTask; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.atomic.AtomicReference; +import org.apache.commons.io.FileUtils; +import org.apache.hadoop.hdds.annotation.InterfaceAudience; +import org.apache.hadoop.hdds.conf.OzoneConfiguration; +import org.apache.hadoop.hdds.utils.db.DBStore; +import org.apache.hadoop.hdds.utils.db.RDBStore; 
+import org.apache.hadoop.metrics2.MetricsCollector; +import org.apache.hadoop.metrics2.MetricsInfo; +import org.apache.hadoop.metrics2.MetricsSource; +import org.apache.hadoop.metrics2.MetricsSystem; +import org.apache.hadoop.metrics2.annotation.Metric; +import org.apache.hadoop.metrics2.annotation.Metrics; +import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; +import org.apache.hadoop.metrics2.lib.MetricsRegistry; +import org.apache.hadoop.metrics2.lib.MutableGaugeLong; +import org.apache.hadoop.ozone.OzoneConsts; +import org.apache.hadoop.ozone.om.OMMetadataManager; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Metrics for tracking db.snapshots directory space usage and SST file counts. + * Provides both aggregate metrics and per-checkpoint-directory metrics. + * Metrics are updated asynchronously to avoid blocking operations. + */ +@InterfaceAudience.Private +@Metrics(about = "OM Snapshot Directory Metrics", context = OzoneConsts.OZONE) +public final class OMSnapshotDirectoryMetrics implements MetricsSource { + private static final Logger LOG = + LoggerFactory.getLogger(OMSnapshotDirectoryMetrics.class); + private static final String SOURCE_NAME = + OMSnapshotDirectoryMetrics.class.getSimpleName(); + + // Aggregate metrics + private @Metric MutableGaugeLong dbSnapshotsDirSize; + private @Metric MutableGaugeLong totalSstFilesCount; + private @Metric MutableGaugeLong numSnapshots; + + private final AtomicLong lastUpdateTime = new AtomicLong(0); + private final AtomicReference<CompletableFuture<Void>> currentUpdateFutureRef = + new AtomicReference<>(); + private final OMMetadataManager metadataManager; + private final MetricsRegistry registry = new MetricsRegistry(SOURCE_NAME); + + // Per-checkpoint-directory metrics storage + private volatile Map<String, CheckpointMetrics> checkpointMetricsMap = new HashMap<>(); + + private Timer updateTimer; + + /** + * Starts the periodic metrics update task. 
+ * + * @param conf OzoneConfiguration for reading update interval + */ + public void start(OzoneConfiguration conf) { + long updateInterval = conf.getTimeDuration(OZONE_OM_SNAPSHOT_DIRECTORY_METRICS_UPDATE_INTERVAL, + OZONE_OM_SNAPSHOT_DIRECTORY_METRICS_UPDATE_INTERVAL_DEFAULT, + TimeUnit.MILLISECONDS); + + updateTimer = new Timer("OMSnapshotDirectoryMetricsUpdate", true); + updateTimer.schedule(new TimerTask() { + @Override + public void run() { + updateMetricsAsync(); + } + }, 0, updateInterval); + } + + /** + * Stops the periodic metrics update task. + */ + public void stop() { + if (updateTimer != null) { + updateTimer.cancel(); + updateTimer = null; + } + } + + public void unRegister() { + stop(); + MetricsSystem ms = DefaultMetricsSystem.instance(); + ms.unregisterSource(SOURCE_NAME); + } + + /** + * Internal class to store per-checkpoint metrics. + */ + private static class CheckpointMetrics { + private final long size; + private final int sstFileCount; + + CheckpointMetrics(long size, int sstFileCount) { + this.size = size; + this.sstFileCount = sstFileCount; + } + + public long getSize() { + return size; + } + + public int getSstFileCount() { + return sstFileCount; + } + } + + private OMSnapshotDirectoryMetrics(OMMetadataManager metadataManager) { + this.metadataManager = metadataManager; + } + + public static OMSnapshotDirectoryMetrics create(String parent, + OMMetadataManager metadataManager) { + MetricsSystem ms = DefaultMetricsSystem.instance(); + return ms.register(SOURCE_NAME, + parent, + new OMSnapshotDirectoryMetrics(metadataManager)); + } + + /** + * Updates all metrics (aggregate and per-checkpoint) asynchronously + * in a background thread. 
+ */ + public void updateMetricsAsync() { + CompletableFuture<Void> currentUpdateFuture = currentUpdateFutureRef.get(); + if (currentUpdateFuture != null && !currentUpdateFuture.isDone()) { + return; + } + + CompletableFuture<Void> newFuture = CompletableFuture.runAsync(() -> { + try { + updateMetrics(); + lastUpdateTime.set(System.currentTimeMillis()); + } catch (Exception e) { + LOG.warn("Failed to update snapshot directory metrics", e); + } finally { + currentUpdateFutureRef.set(null); + } + }); + + // Atomically set the future only if the current value matches what we checked + // This prevents race conditions where multiple threads try to set a new future + CompletableFuture<Void> expected = currentUpdateFutureRef.get(); + if (expected == null || expected.isDone()) { + // Only set if still null or done (double-check after creating future) + if (!currentUpdateFutureRef.compareAndSet(expected, newFuture)) { + // Another thread set a future, cancel this one + newFuture.cancel(false); + } + } else { + // Another thread started an update, cancel this one + newFuture.cancel(false); + } + } + + /** + * Updates all metrics synchronously - both aggregate and per-checkpoint-directory. + */ + @VisibleForTesting + void updateMetrics() throws IOException { + DBStore store = metadataManager.getStore(); + if (!(store instanceof RDBStore)) { + LOG.debug("Store is not RDBStore, skipping snapshot directory metrics update"); + resetMetrics(); + return; + } + + RDBStore rdbStore = (RDBStore) store; + String snapshotsParentDir = rdbStore.getSnapshotsParentDir(); + + if (snapshotsParentDir == null) { + resetMetrics(); + return; + } + + File snapshotsDir = new File(snapshotsParentDir); + if (!snapshotsDir.exists() || !snapshotsDir.isDirectory()) { + resetMetrics(); + return; + } + + try { + // Calculate aggregate metrics + long totalSize = FileUtils.sizeOfDirectory(snapshotsDir); Review Comment: Good point. It doesn't, and it calculates it twice; changed it to account for it only once. 
########## hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/OMSnapshotDirectoryMetrics.java: ########## @@ -0,0 +1,360 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.ozone.om.snapshot; + +import static org.apache.hadoop.ozone.OzoneConsts.ROCKSDB_SST_SUFFIX; +import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_SNAPSHOT_DIRECTORY_METRICS_UPDATE_INTERVAL; +import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_SNAPSHOT_DIRECTORY_METRICS_UPDATE_INTERVAL_DEFAULT; + +import com.google.common.annotations.VisibleForTesting; +import java.io.File; +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.Timer; +import java.util.TimerTask; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.atomic.AtomicReference; +import org.apache.commons.io.FileUtils; +import org.apache.hadoop.hdds.annotation.InterfaceAudience; +import org.apache.hadoop.hdds.conf.OzoneConfiguration; +import org.apache.hadoop.hdds.utils.db.DBStore; +import org.apache.hadoop.hdds.utils.db.RDBStore; 
+import org.apache.hadoop.metrics2.MetricsCollector; +import org.apache.hadoop.metrics2.MetricsInfo; +import org.apache.hadoop.metrics2.MetricsSource; +import org.apache.hadoop.metrics2.MetricsSystem; +import org.apache.hadoop.metrics2.annotation.Metric; +import org.apache.hadoop.metrics2.annotation.Metrics; +import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; +import org.apache.hadoop.metrics2.lib.MetricsRegistry; +import org.apache.hadoop.metrics2.lib.MutableGaugeLong; +import org.apache.hadoop.ozone.OzoneConsts; +import org.apache.hadoop.ozone.om.OMMetadataManager; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Metrics for tracking db.snapshots directory space usage and SST file counts. + * Provides both aggregate metrics and per-checkpoint-directory metrics. + * Metrics are updated asynchronously to avoid blocking operations. + */ +@InterfaceAudience.Private +@Metrics(about = "OM Snapshot Directory Metrics", context = OzoneConsts.OZONE) +public final class OMSnapshotDirectoryMetrics implements MetricsSource { + private static final Logger LOG = + LoggerFactory.getLogger(OMSnapshotDirectoryMetrics.class); + private static final String SOURCE_NAME = + OMSnapshotDirectoryMetrics.class.getSimpleName(); + + // Aggregate metrics + private @Metric MutableGaugeLong dbSnapshotsDirSize; + private @Metric MutableGaugeLong totalSstFilesCount; + private @Metric MutableGaugeLong numSnapshots; + + private final AtomicLong lastUpdateTime = new AtomicLong(0); + private final AtomicReference<CompletableFuture<Void>> currentUpdateFutureRef = + new AtomicReference<>(); + private final OMMetadataManager metadataManager; + private final MetricsRegistry registry = new MetricsRegistry(SOURCE_NAME); + + // Per-checkpoint-directory metrics storage + private volatile Map<String, CheckpointMetrics> checkpointMetricsMap = new HashMap<>(); + + private Timer updateTimer; + + /** + * Starts the periodic metrics update task. 
+ * + * @param conf OzoneConfiguration for reading update interval + */ + public void start(OzoneConfiguration conf) { + long updateInterval = conf.getTimeDuration(OZONE_OM_SNAPSHOT_DIRECTORY_METRICS_UPDATE_INTERVAL, + OZONE_OM_SNAPSHOT_DIRECTORY_METRICS_UPDATE_INTERVAL_DEFAULT, + TimeUnit.MILLISECONDS); + + updateTimer = new Timer("OMSnapshotDirectoryMetricsUpdate", true); + updateTimer.schedule(new TimerTask() { + @Override + public void run() { + updateMetricsAsync(); + } + }, 0, updateInterval); + } + + /** + * Stops the periodic metrics update task. + */ + public void stop() { + if (updateTimer != null) { + updateTimer.cancel(); + updateTimer = null; + } + } + + public void unRegister() { + stop(); + MetricsSystem ms = DefaultMetricsSystem.instance(); + ms.unregisterSource(SOURCE_NAME); + } + + /** + * Internal class to store per-checkpoint metrics. + */ + private static class CheckpointMetrics { + private final long size; + private final int sstFileCount; + + CheckpointMetrics(long size, int sstFileCount) { + this.size = size; + this.sstFileCount = sstFileCount; + } + + public long getSize() { + return size; + } + + public int getSstFileCount() { + return sstFileCount; + } + } + + private OMSnapshotDirectoryMetrics(OMMetadataManager metadataManager) { + this.metadataManager = metadataManager; + } + + public static OMSnapshotDirectoryMetrics create(String parent, + OMMetadataManager metadataManager) { + MetricsSystem ms = DefaultMetricsSystem.instance(); + return ms.register(SOURCE_NAME, + parent, + new OMSnapshotDirectoryMetrics(metadataManager)); + } + + /** + * Updates all metrics (aggregate and per-checkpoint) asynchronously + * in a background thread. + */ + public void updateMetricsAsync() { Review Comment: done. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
