priyeshkaratha commented on code in PR #9258:
URL: https://github.com/apache/ozone/pull/9258#discussion_r2601074063


##########
hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/fsck/ContainerHealthTaskV2.java:
##########
@@ -0,0 +1,130 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.recon.fsck;
+
+import javax.inject.Inject;
+import org.apache.hadoop.hdds.conf.OzoneConfiguration;
+import org.apache.hadoop.hdds.scm.PlacementPolicy;
+import org.apache.hadoop.hdds.scm.container.ContainerManager;
+import 
org.apache.hadoop.ozone.recon.persistence.ContainerHealthSchemaManagerV2;
+import org.apache.hadoop.ozone.recon.scm.ReconScmTask;
+import org.apache.hadoop.ozone.recon.scm.ReconStorageContainerManagerFacade;
+import org.apache.hadoop.ozone.recon.spi.ReconContainerMetadataManager;
+import org.apache.hadoop.ozone.recon.tasks.ReconTaskConfig;
+import 
org.apache.hadoop.ozone.recon.tasks.updater.ReconTaskStatusUpdaterManager;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * V2 implementation of Container Health Task using Local ReplicationManager.
+ *
+ * <p><b>Solution:</b></p>
+ * <ul>
+ *   <li>Uses Recon's local ReplicationManager (not RPC to SCM)</li>
+ *   <li>Calls processAll() once to check all containers in batch</li>
+ *   <li>ReplicationManager uses stub PendingOps 
(NullContainerReplicaPendingOps)</li>
+ *   <li>No false positives despite stub - health determination ignores 
pending ops</li>
+ *   <li>All database operations handled inside ReconReplicationManager</li>
+ * </ul>
+ *
+ * <p><b>Benefits over RPC call to SCM 3:</b></p>
+ * <ul>
+ *   <li>Zero RPC overhead (no per-container calls to SCM)</li>
+ *   <li>Zero SCM load</li>
+ *   <li>Simpler code - single method call</li>
+ *   <li>Perfect accuracy (proven via code analysis)</li>
+ *   <li>Captures ALL container health states (no 100-sample limit)</li>
+ * </ul>
+ *
+ * @see ReconReplicationManager
+ * @see NullContainerReplicaPendingOps
+ */
+public class ContainerHealthTaskV2 extends ReconScmTask {
+
+  private static final Logger LOG =
+      LoggerFactory.getLogger(ContainerHealthTaskV2.class);
+
+  private final ReconStorageContainerManagerFacade reconScm;
+  private final long interval;
+
+  @Inject
+  @SuppressWarnings("checkstyle:ParameterNumber")

Review Comment:
   Can we use Builder class instead of suppressing checkstyle warnings.



##########
hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/fsck/NullContainerReplicaPendingOps.java:
##########
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.recon.fsck;
+
+import java.time.Clock;
+import java.util.Collections;
+import java.util.List;
+import org.apache.hadoop.hdds.protocol.DatanodeDetails;
+import org.apache.hadoop.hdds.scm.container.ContainerID;
+import org.apache.hadoop.hdds.scm.container.replication.ContainerReplicaOp;
+import 
org.apache.hadoop.hdds.scm.container.replication.ContainerReplicaPendingOps;
+import org.apache.hadoop.hdds.scm.container.replication.ReplicationManager;
+import org.apache.hadoop.ozone.protocol.commands.SCMCommand;
+
+/**
+ * Null implementation of ContainerReplicaPendingOps for Recon's
+ * local ReplicationManager.
+ *
+ * <p>This stub always returns empty pending operations because Recon does not
+ * send replication commands to datanodes. It only uses ReplicationManager's
+ * health check logic to determine container health states.</p>
+ *
+ * <p> Since SCM's health determination logic (Phase 1) explicitly ignores 
pending operations by calling
+ * isSufficientlyReplicated(false). Pending operations only affect command
+ * deduplication (Phase 2), which Recon doesn't need since it doesn't enqueue
+ * commands.</p>
+ */
+public class NullContainerReplicaPendingOps extends ContainerReplicaPendingOps 
{
+
+  public NullContainerReplicaPendingOps(Clock clock,
+      ReplicationManager.ReplicationManagerConfiguration rmConf) {
+    super(clock, rmConf);
+  }
+
+  /**
+   * Always returns an empty list since Recon does not track pending 
operations.
+   * This is correct because health state determination does not depend on
+   * pending operations (see RatisReplicationCheckHandler.java:212).
+   *
+   * @param id The ContainerID to check for pending operations
+   * @return Empty list - Recon has no pending operations
+   */
+  @Override
+  public List<ContainerReplicaOp> getPendingOps(ContainerID id) {
+    return Collections.emptyList();
+  }
+
+  /**
+   * No-op since Recon doesn't add pending operations.
+   */
+  public void scheduleAddReplica(ContainerID containerID, DatanodeDetails 
target,
+      SCMCommand<?> command, int replicaIndex, long containerSize) {
+    // No-op - Recon doesn't send commands
+  }
+
+  /**
+   * No-op since Recon doesn't add pending operations.
+   */
+  public void scheduleDeleteReplica(ContainerID containerID, DatanodeDetails 
target,
+      SCMCommand<?> command, int replicaIndex) {
+    // No-op - Recon doesn't send commands
+  }

Review Comment:
   These methods never used.



##########
hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/api/ContainerEndpoint.java:
##########
@@ -451,6 +482,81 @@ public Response getUnhealthyContainers(
     return Response.ok(response).build();
   }
 
+  /**
+   * V2 implementation - reads from UNHEALTHY_CONTAINERS_V2 table.
+   */
+  private Response getUnhealthyContainersV2(
+      String state,
+      int limit,
+      long maxContainerId,
+      long minContainerId) {
+    List<UnhealthyContainerMetadata> unhealthyMeta = new ArrayList<>();
+    List<UnhealthyContainersSummary> summary = new ArrayList<>();
+
+    try {
+      ContainerSchemaDefinitionV2.UnHealthyContainerStates v2State = null;
+
+      if (state != null) {
+        // Convert V1 state string to V2 enum
+        v2State = 
ContainerSchemaDefinitionV2.UnHealthyContainerStates.valueOf(state);
+      }
+
+      // Get summary from V2 table and convert to V1 format
+      List<ContainerHealthSchemaManagerV2.UnhealthyContainersSummaryV2> 
v2Summary =
+          containerHealthSchemaManagerV2.getUnhealthyContainersSummary();
+      for (ContainerHealthSchemaManagerV2.UnhealthyContainersSummaryV2 s : 
v2Summary) {
+        summary.add(new UnhealthyContainersSummary(s.getContainerState(), 
s.getCount()));
+      }
+
+      // Get containers from V2 table
+      List<ContainerHealthSchemaManagerV2.UnhealthyContainerRecordV2> 
v2Containers =
+          containerHealthSchemaManagerV2.getUnhealthyContainers(v2State, 
minContainerId, maxContainerId, limit);
+
+      // Convert V2 records to response format
+      for (ContainerHealthSchemaManagerV2.UnhealthyContainerRecordV2 c : 
v2Containers) {
+        long containerID = c.getContainerId();
+        ContainerInfo containerInfo =
+            containerManager.getContainer(ContainerID.valueOf(containerID));
+        long keyCount = containerInfo.getNumberOfKeys();
+        UUID pipelineID = containerInfo.getPipelineID().getId();
+        List<ContainerHistory> datanodes =
+            containerManager.getLatestContainerHistory(containerID,
+                containerInfo.getReplicationConfig().getRequiredNodes());
+
+        // Create UnhealthyContainers POJO from V2 record for response
+        UnhealthyContainers v1Container = new UnhealthyContainers();

Review Comment:
   Can we create a builder class for UnhealthyContainers?



##########
hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/persistence/ContainerHealthSchemaManager.java:
##########
@@ -172,6 +172,47 @@ public void 
insertUnhealthyContainerRecords(List<UnhealthyContainers> recs) {
     }
   }
 
+  /**
+   * Delete a specific unhealthy container state record.
+   *
+   * @param containerId Container ID
+   * @param state Container state to delete
+   */
+  public void deleteUnhealthyContainer(long containerId, Object state) {

Review Comment:
   I can see this is never called. Also can see methods 
deleteAllStatesForContainer, batchDeleteReplicaMismatchForContainers and 
clearAllUnhealthyContainerRecords also never used.



##########
hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/fsck/ReconReplicationManagerReport.java:
##########
@@ -0,0 +1,149 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.recon.fsck;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import org.apache.hadoop.hdds.scm.container.ContainerID;
+import org.apache.hadoop.hdds.scm.container.ReplicationManagerReport;
+
+/**
+ * Extended ReplicationManagerReport that captures ALL container health states,
+ * not just the first 100 samples per state.
+ *
+ * <p>SCM's standard ReplicationManagerReport uses sampling (SAMPLE_LIMIT = 
100)
+ * to limit memory usage. This is appropriate for SCM which only needs samples
+ * for debugging/UI display.</p>
+ *
+ * <p>Recon, however, needs to track per-container health states for ALL 
containers
+ * to populate its UNHEALTHY_CONTAINERS_V2 table. This extended report removes
+ * the sampling limitation while maintaining backward compatibility by still
+ * calling the parent's incrementAndSample() method.</p>
+ *
+ * <p><b>REPLICA_MISMATCH Handling:</b> Since SCM's HealthState enum doesn't 
include
+ * REPLICA_MISMATCH (it's a Recon-specific check for data checksum mismatches),
+ * we track it separately in replicaMismatchContainers.</p>
+ *
+ * <p><b>Memory Impact:</b> For a cluster with 100K containers and 5% 
unhealthy rate,
+ * this adds approximately 620KB of memory during report generation (5K 
containers
+ * × 124 bytes per container). Even in worst case (100% unhealthy), memory 
usage
+ * is only ~14MB, which is negligible for Recon.</p>
+ */
+public class ReconReplicationManagerReport extends ReplicationManagerReport {
+
+  // Captures ALL containers per health state (no SAMPLE_LIMIT restriction)
+  private final Map<HealthState, List<ContainerID>> allContainersByState =
+      new HashMap<>();
+
+  // Captures containers with REPLICA_MISMATCH (Recon-specific, not in SCM's 
HealthState)
+  private final List<ContainerID> replicaMismatchContainers = new 
ArrayList<>();
+
+  /**
+   * Override to capture ALL containers, not just first 100 samples.
+   * Still calls parent method to maintain aggregate counts and samples
+   * for backward compatibility.
+   *
+   * @param stat The health state to increment
+   * @param container The container ID to record
+   */
+  @Override
+  public void incrementAndSample(HealthState stat, ContainerID container) {
+    // Call parent to maintain aggregate counts and samples (limited to 100)
+    super.incrementAndSample(stat, container);
+
+    // Capture ALL containers for Recon (no SAMPLE_LIMIT restriction)
+    allContainersByState
+        .computeIfAbsent(stat, k -> new ArrayList<>())
+        .add(container);
+  }
+
+  /**
+   * Get ALL containers with the specified health state.
+   * Unlike getSample() which returns max 100 containers, this returns
+   * all containers that were recorded for the given state.
+   *
+   * @param stat The health state to query
+   * @return List of all container IDs with the specified health state,
+   *         or empty list if none
+   */
+  public List<ContainerID> getAllContainers(HealthState stat) {
+    return allContainersByState.getOrDefault(stat, Collections.emptyList());
+  }
+
+  /**
+   * Get all captured containers grouped by health state.
+   * This provides a complete view of all unhealthy containers in the cluster.
+   *
+   * @return Immutable map of HealthState to list of container IDs
+   */
+  public Map<HealthState, List<ContainerID>> getAllContainersByState() {
+    return Collections.unmodifiableMap(allContainersByState);
+  }
+
+  /**
+   * Get count of ALL captured containers for a health state.
+   * This may differ from getStat() if containers were added after the
+   * initial increment.
+   *
+   * @param stat The health state to query
+   * @return Number of containers captured for this state
+   */
+  public int getAllContainersCount(HealthState stat) {
+    return allContainersByState.getOrDefault(stat, 
Collections.emptyList()).size();
+  }
+
+  /**
+   * Clear all captured containers. Useful for resetting the report
+   * for a new processing cycle.
+   */
+  public void clearAllContainers() {

Review Comment:
   This method never called.



##########
hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/fsck/NullContainerReplicaPendingOps.java:
##########
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.recon.fsck;
+
+import java.time.Clock;
+import java.util.Collections;
+import java.util.List;
+import org.apache.hadoop.hdds.protocol.DatanodeDetails;
+import org.apache.hadoop.hdds.scm.container.ContainerID;
+import org.apache.hadoop.hdds.scm.container.replication.ContainerReplicaOp;
+import 
org.apache.hadoop.hdds.scm.container.replication.ContainerReplicaPendingOps;
+import org.apache.hadoop.hdds.scm.container.replication.ReplicationManager;
+import org.apache.hadoop.ozone.protocol.commands.SCMCommand;
+
+/**
+ * Null implementation of ContainerReplicaPendingOps for Recon's
+ * local ReplicationManager.
+ *
+ * <p>This stub always returns empty pending operations because Recon does not
+ * send replication commands to datanodes. It only uses ReplicationManager's
+ * health check logic to determine container health states.</p>
+ *
+ * <p> Since SCM's health determination logic (Phase 1) explicitly ignores 
pending operations by calling
+ * isSufficientlyReplicated(false). Pending operations only affect command
+ * deduplication (Phase 2), which Recon doesn't need since it doesn't enqueue
+ * commands.</p>
+ */
+public class NullContainerReplicaPendingOps extends ContainerReplicaPendingOps 
{
+
+  public NullContainerReplicaPendingOps(Clock clock,
+      ReplicationManager.ReplicationManagerConfiguration rmConf) {
+    super(clock, rmConf);
+  }
+
+  /**
+   * Always returns an empty list since Recon does not track pending 
operations.
+   * This is correct because health state determination does not depend on
+   * pending operations (see RatisReplicationCheckHandler.java:212).
+   *
+   * @param id The ContainerID to check for pending operations
+   * @return Empty list - Recon has no pending operations
+   */
+  @Override
+  public List<ContainerReplicaOp> getPendingOps(ContainerID id) {
+    return Collections.emptyList();
+  }
+
+  /**
+   * No-op since Recon doesn't add pending operations.
+   */
+  public void scheduleAddReplica(ContainerID containerID, DatanodeDetails 
target,
+      SCMCommand<?> command, int replicaIndex, long containerSize) {
+    // No-op - Recon doesn't send commands
+  }
+
+  /**
+   * No-op since Recon doesn't add pending operations.
+   */
+  public void scheduleDeleteReplica(ContainerID containerID, DatanodeDetails 
target,
+      SCMCommand<?> command, int replicaIndex) {
+    // No-op - Recon doesn't send commands
+  }
+
+  /**
+   * No-op since Recon doesn't complete operations.
+   * @return false - operation not tracked
+   */
+  @Override
+  public boolean completeAddReplica(ContainerID containerID, DatanodeDetails 
target,
+      int replicaIndex) {
+    // No-op - Recon doesn't track command completion
+    return false;
+  }
+
+  /**
+   * No-op since Recon doesn't complete operations.
+   * @return false - operation not tracked
+   */
+  @Override
+  public boolean completeDeleteReplica(ContainerID containerID, 
DatanodeDetails target,
+      int replicaIndex) {
+    // No-op - Recon doesn't track command completion
+    return false;
+  }
+
+  /**
+   * Always returns 0 since Recon has no pending operations.
+   */
+  @Override
+  public long getPendingOpCount(ContainerReplicaOp.PendingOpType opType) {
+    return 0L;
+  }
+
+  /**
+   * Always returns 0 since Recon has no pending operations.
+   */
+  public long getTotalScheduledBytes(DatanodeDetails datanode) {

Review Comment:
   This method is never used



##########
hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/fsck/ReconReplicationManager.java:
##########
@@ -0,0 +1,402 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.recon.fsck;
+
+import java.io.IOException;
+import java.time.Clock;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Set;
+import org.apache.hadoop.hdds.conf.ConfigurationSource;
+import org.apache.hadoop.hdds.scm.PlacementPolicy;
+import org.apache.hadoop.hdds.scm.container.ContainerID;
+import org.apache.hadoop.hdds.scm.container.ContainerInfo;
+import org.apache.hadoop.hdds.scm.container.ContainerManager;
+import org.apache.hadoop.hdds.scm.container.ContainerNotFoundException;
+import org.apache.hadoop.hdds.scm.container.ContainerReplica;
+import 
org.apache.hadoop.hdds.scm.container.ReplicationManagerReport.HealthState;
+import org.apache.hadoop.hdds.scm.container.replication.NullReplicationQueue;
+import org.apache.hadoop.hdds.scm.container.replication.ReplicationManager;
+import org.apache.hadoop.hdds.scm.container.replication.ReplicationQueue;
+import org.apache.hadoop.hdds.scm.ha.SCMContext;
+import org.apache.hadoop.hdds.scm.node.NodeManager;
+import org.apache.hadoop.hdds.server.events.EventPublisher;
+import 
org.apache.hadoop.ozone.recon.persistence.ContainerHealthSchemaManagerV2;
+import 
org.apache.hadoop.ozone.recon.persistence.ContainerHealthSchemaManagerV2.UnhealthyContainerRecordV2;
+import 
org.apache.ozone.recon.schema.ContainerSchemaDefinitionV2.UnHealthyContainerStates;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Recon-specific extension of SCM's ReplicationManager.
+ *
+ * <p><b>Key Differences from SCM:</b></p>
+ * <ol>
+ *   <li>Uses NullContainerReplicaPendingOps stub (no pending operations 
tracking)</li>
+ *   <li>Overrides processAll() to capture ALL container health states (no 
100-sample limit)</li>
+ *   <li>Stores results in Recon's UNHEALTHY_CONTAINERS_V2 table</li>
+ *   <li>Does not issue replication commands (read-only monitoring)</li>
+ * </ol>
+ *
+ * <p><b>Why This Works Without PendingOps:</b></p>
+ * <p>SCM's health check logic uses a two-phase approach:
+ * <ul>
+ *   <li><b>Phase 1 (Health Determination):</b> Calls 
isSufficientlyReplicated(false)
+ *       which ignores pending operations. This phase determines the health 
state.</li>
+ *   <li><b>Phase 2 (Command Deduplication):</b> Calls 
isSufficientlyReplicated(true)
+ *       which considers pending operations. This phase decides whether to 
enqueue
+ *       new commands.</li>
+ * </ul>
+ * Since Recon only needs Phase 1 (health determination) and doesn't issue 
commands,
+ * the stub PendingOps does not cause false positives.</p>
+ *
+ * @see NullContainerReplicaPendingOps
+ * @see ReconReplicationManagerReport
+ */
+public class ReconReplicationManager extends ReplicationManager {
+
+  private static final Logger LOG =
+      LoggerFactory.getLogger(ReconReplicationManager.class);
+
+  private final ContainerHealthSchemaManagerV2 healthSchemaManager;
+  private final ContainerManager containerManager;
+
+  @SuppressWarnings("checkstyle:ParameterNumber")

Review Comment:
   can we avoid parameter number suppressing using Builder?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to