Re: [PR] HDDS-13891. SCM-based health monitoring and batch processing in Recon [ozone]

via GitHub Thu, 19 Feb 2026 08:43:02 -0800


devmadhuu commented on code in PR #9258:
URL: https://github.com/apache/ozone/pull/9258#discussion_r2828917821



##########
hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/fsck/NullContainerReplicaPendingOps.java:
##########
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.recon.fsck;
+
+import java.time.Clock;
+import java.util.Collections;
+import java.util.List;
+import org.apache.hadoop.hdds.protocol.DatanodeDetails;
+import org.apache.hadoop.hdds.scm.container.ContainerID;
+import org.apache.hadoop.hdds.scm.container.replication.ContainerReplicaOp;
+import 
org.apache.hadoop.hdds.scm.container.replication.ContainerReplicaPendingOps;
+import org.apache.hadoop.hdds.scm.container.replication.ReplicationManager;
+import org.apache.hadoop.ozone.protocol.commands.SCMCommand;
+
+/**
+ * Null implementation of ContainerReplicaPendingOps for Recon's
+ * local ReplicationManager.
+ *
+ * <p>This stub always returns empty pending operations because Recon does not
+ * send replication commands to datanodes. It only uses ReplicationManager's
+ * health check logic to determine container health states.</p>
+ *
+ * <p> Since SCM's health determination logic (Phase 1) explicitly ignores 
pending operations by calling
+ * isSufficientlyReplicated(false). Pending operations only affect command
+ * deduplication (Phase 2), which Recon doesn't need since it doesn't enqueue
+ * commands.</p>
+ */
+public class NullContainerReplicaPendingOps extends ContainerReplicaPendingOps 
{
+
+  public NullContainerReplicaPendingOps(Clock clock,
+      ReplicationManager.ReplicationManagerConfiguration rmConf) {
+    super(clock, rmConf);
+  }
+
+  /**
+   * Always returns an empty list since Recon does not track pending 
operations.
+   * This is correct because health state determination does not depend on
+   * pending operations (see RatisReplicationCheckHandler.java:212).
+   *
+   * @param id The ContainerID to check for pending operations
+   * @return Empty list - Recon has no pending operations
+   */
+  @Override
+  public List<ContainerReplicaOp> getPendingOps(ContainerID id) {
+    return Collections.emptyList();
+  }
+
+  /**
+   * No-op since Recon doesn't add pending operations.
+   */
+  public void scheduleAddReplica(ContainerID containerID, DatanodeDetails 
target,
+      SCMCommand<?> command, int replicaIndex, long containerSize) {
+    // No-op - Recon doesn't send commands
+  }
+
+  /**
+   * No-op since Recon doesn't add pending operations.
+   */
+  public void scheduleDeleteReplica(ContainerID containerID, DatanodeDetails 
target,
+      SCMCommand<?> command, int replicaIndex) {
+    // No-op - Recon doesn't send commands
+  }
+
+  /**
+   * No-op since Recon doesn't complete operations.
+   * @return false - operation not tracked
+   */
+  @Override
+  public boolean completeAddReplica(ContainerID containerID, DatanodeDetails 
target,
+      int replicaIndex) {
+    // No-op - Recon doesn't track command completion
+    return false;
+  }
+
+  /**
+   * No-op since Recon doesn't complete operations.
+   * @return false - operation not tracked
+   */
+  @Override
+  public boolean completeDeleteReplica(ContainerID containerID, 
DatanodeDetails target,
+      int replicaIndex) {
+    // No-op - Recon doesn't track command completion
+    return false;
+  }
+
+  /**
+   * Always returns 0 since Recon has no pending operations.
+   */
+  @Override
+  public long getPendingOpCount(ContainerReplicaOp.PendingOpType opType) {
+    return 0L;
+  }
+
+  /**
+   * Always returns 0 since Recon has no pending operations.
+   */
+  public long getTotalScheduledBytes(DatanodeDetails datanode) {

Review Comment:
   removed.



##########
hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/fsck/ReconReplicationManagerReport.java:
##########
@@ -0,0 +1,149 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.recon.fsck;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import org.apache.hadoop.hdds.scm.container.ContainerID;
+import org.apache.hadoop.hdds.scm.container.ReplicationManagerReport;
+
+/**
+ * Extended ReplicationManagerReport that captures ALL container health states,
+ * not just the first 100 samples per state.
+ *
+ * <p>SCM's standard ReplicationManagerReport uses sampling (SAMPLE_LIMIT = 
100)
+ * to limit memory usage. This is appropriate for SCM which only needs samples
+ * for debugging/UI display.</p>
+ *
+ * <p>Recon, however, needs to track per-container health states for ALL 
containers
+ * to populate its UNHEALTHY_CONTAINERS_V2 table. This extended report removes
+ * the sampling limitation while maintaining backward compatibility by still
+ * calling the parent's incrementAndSample() method.</p>
+ *
+ * <p><b>REPLICA_MISMATCH Handling:</b> Since SCM's HealthState enum doesn't 
include
+ * REPLICA_MISMATCH (it's a Recon-specific check for data checksum mismatches),
+ * we track it separately in replicaMismatchContainers.</p>
+ *
+ * <p><b>Memory Impact:</b> For a cluster with 100K containers and 5% 
unhealthy rate,
+ * this adds approximately 620KB of memory during report generation (5K 
containers
+ * × 124 bytes per container). Even in worst case (100% unhealthy), memory 
usage
+ * is only ~14MB, which is negligible for Recon.</p>
+ */
+public class ReconReplicationManagerReport extends ReplicationManagerReport {
+
+  // Captures ALL containers per health state (no SAMPLE_LIMIT restriction)
+  private final Map<HealthState, List<ContainerID>> allContainersByState =
+      new HashMap<>();
+
+  // Captures containers with REPLICA_MISMATCH (Recon-specific, not in SCM's 
HealthState)
+  private final List<ContainerID> replicaMismatchContainers = new 
ArrayList<>();
+
+  /**
+   * Override to capture ALL containers, not just first 100 samples.
+   * Still calls parent method to maintain aggregate counts and samples
+   * for backward compatibility.
+   *
+   * @param stat The health state to increment
+   * @param container The container ID to record
+   */
+  @Override
+  public void incrementAndSample(HealthState stat, ContainerID container) {
+    // Call parent to maintain aggregate counts and samples (limited to 100)
+    super.incrementAndSample(stat, container);
+
+    // Capture ALL containers for Recon (no SAMPLE_LIMIT restriction)
+    allContainersByState
+        .computeIfAbsent(stat, k -> new ArrayList<>())
+        .add(container);
+  }
+
+  /**
+   * Get ALL containers with the specified health state.
+   * Unlike getSample() which returns max 100 containers, this returns
+   * all containers that were recorded for the given state.
+   *
+   * @param stat The health state to query
+   * @return List of all container IDs with the specified health state,
+   *         or empty list if none
+   */
+  public List<ContainerID> getAllContainers(HealthState stat) {
+    return allContainersByState.getOrDefault(stat, Collections.emptyList());
+  }
+
+  /**
+   * Get all captured containers grouped by health state.
+   * This provides a complete view of all unhealthy containers in the cluster.
+   *
+   * @return Immutable map of HealthState to list of container IDs
+   */
+  public Map<HealthState, List<ContainerID>> getAllContainersByState() {
+    return Collections.unmodifiableMap(allContainersByState);
+  }
+
+  /**
+   * Get count of ALL captured containers for a health state.
+   * This may differ from getStat() if containers were added after the
+   * initial increment.
+   *
+   * @param stat The health state to query
+   * @return Number of containers captured for this state
+   */
+  public int getAllContainersCount(HealthState stat) {
+    return allContainersByState.getOrDefault(stat, 
Collections.emptyList()).size();
+  }
+
+  /**
+   * Clear all captured containers. Useful for resetting the report
+   * for a new processing cycle.
+   */
+  public void clearAllContainers() {

Review Comment:
   removed.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] HDDS-13891. SCM-based health monitoring and batch processing in Recon [ozone]

Reply via email to