This is an automated email from the ASF dual-hosted git repository.

adoroszlai pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new d92f0ed87e HDDS-12653. Add option in `ozone debug log container list` to filter by Health State (#8415)
d92f0ed87e is described below

commit d92f0ed87e7396665972628463cce3b55daa0626
Author: sreejasahithi <[email protected]>
AuthorDate: Mon May 12 19:09:13 2025 +0530

    HDDS-12653. Add option in `ozone debug log container list` to filter by Health State (#8415)
---
 .../debug/logs/container/ContainerInfoCommand.java |   3 -
 .../logs/container/ContainerLogController.java     |   8 +-
 .../container/DuplicateOpenContainersCommand.java  |   3 -
 .../ozone/debug/logs/container/ListContainers.java |  44 ++++--
 .../container/utils/ContainerDatanodeDatabase.java | 162 ++++++++++++++++++++-
 .../debug/logs/container/utils/SQLDBConstants.java |  64 ++++++++
 6 files changed, 263 insertions(+), 21 deletions(-)

diff --git a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/ContainerInfoCommand.java b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/ContainerInfoCommand.java
index 7c5f8b32dc..116cbe3f34 100644
--- a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/ContainerInfoCommand.java
+++ b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/ContainerInfoCommand.java
@@ -49,9 +49,6 @@ public Void call() throws Exception {
     }
     
     Path dbPath = parent.resolveDbPath();
-    if (dbPath == null) {
-      return null;
-    }
 
     ContainerDatanodeDatabase cdd = new 
ContainerDatanodeDatabase(dbPath.toString());
  
diff --git a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/ContainerLogController.java b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/ContainerLogController.java
index dd43119ea6..1a6cdafea6 100644
--- a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/ContainerLogController.java
+++ b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/ContainerLogController.java
@@ -63,17 +63,17 @@ public Path resolveDbPath() {
       if (Files.exists(resolvedPath) && Files.isRegularFile(resolvedPath)) {
         out().println("Using default database file found in current directory: 
" + resolvedPath);
       } else {
-        err().println("No database path provided and default file '" + 
SQLDBConstants.DEFAULT_DB_FILENAME + "' not " +
+        throw new IllegalArgumentException("No database path provided and 
default file '" + 
+            SQLDBConstants.DEFAULT_DB_FILENAME + "' not " +
             "found in current directory. Please provide a valid database 
path");
-        return null;
       }
     } else {
       resolvedPath = Paths.get(dbPath);
       Path parentDir = resolvedPath.getParent();
 
       if (parentDir != null && !Files.exists(parentDir)) {
-        err().println("The parent directory of the provided database path does 
not exist: " + parentDir);
-        return null;
+        throw new IllegalArgumentException("The parent directory of the 
provided database " +
+            "path does not exist: " + parentDir);
       }
     }
 
diff --git a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/DuplicateOpenContainersCommand.java b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/DuplicateOpenContainersCommand.java
index 820d5b5d2e..a0fb8371ec 100644
--- a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/DuplicateOpenContainersCommand.java
+++ b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/DuplicateOpenContainersCommand.java
@@ -39,9 +39,6 @@ public class DuplicateOpenContainersCommand implements Callable<Void> {
   @Override
   public Void call() throws Exception {
     Path dbPath = parent.resolveDbPath();
-    if (dbPath == null) {
-      return null;
-    }
 
     ContainerDatanodeDatabase cdd = new 
ContainerDatanodeDatabase(dbPath.toString());
     cdd.findDuplicateOpenContainer();
diff --git a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/ListContainers.java b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/ListContainers.java
index cd0c09f2ba..2c9de2d643 100644
--- a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/ListContainers.java
+++ b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/ListContainers.java
@@ -19,7 +19,9 @@
 
 import java.nio.file.Path;
 import java.util.concurrent.Callable;
+import org.apache.hadoop.hdds.cli.AbstractSubcommand;
 import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
+import org.apache.hadoop.hdds.scm.container.ReplicationManagerReport;
 import 
org.apache.hadoop.ozone.debug.logs.container.utils.ContainerDatanodeDatabase;
 import org.apache.hadoop.ozone.shell.ListLimitOptions;
 import picocli.CommandLine;
@@ -33,30 +35,52 @@
     name = "list",
     description = "Finds containers from the database based on the option 
provided."
 )
-public class ListContainers implements Callable<Void> {
+public class ListContainers extends AbstractSubcommand implements 
Callable<Void> {
   
-  @CommandLine.Option(names = {"--state"},
-      description = "Life cycle state of the container.",
-      required = true)
-  private HddsProtos.LifeCycleState state;
+  @CommandLine.ArgGroup(multiplicity = "1")
+  private ExclusiveOptions exclusiveOptions;
 
   @CommandLine.Mixin
   private ListLimitOptions listOptions;
-
+  
   @CommandLine.ParentCommand
   private ContainerLogController parent;
 
+  private static final class ExclusiveOptions {
+    @CommandLine.Option(names = {"--lifecycle"},
+        description = "Life cycle state of the container.")
+    private HddsProtos.LifeCycleState lifecycleState;
+
+    @CommandLine.Option(names = {"--health"},
+        description = "Health state of the container.")
+    private ReplicationManagerReport.HealthState healthState;
+  }
+
   @Override
   public Void call() throws Exception {
     
     Path dbPath = parent.resolveDbPath();
-    if (dbPath == null) {
-      return null;
-    }
 
     ContainerDatanodeDatabase cdd = new 
ContainerDatanodeDatabase(dbPath.toString());
 
-    cdd.listContainersByState(state.name(), listOptions.getLimit());
+    if (exclusiveOptions.lifecycleState != null) {
+      cdd.listContainersByState(exclusiveOptions.lifecycleState.name(), 
listOptions.getLimit());
+    } else if (exclusiveOptions.healthState != null) {
+      switch (exclusiveOptions.healthState) {
+      case UNDER_REPLICATED:
+      case OVER_REPLICATED:
+        cdd.listReplicatedContainers(exclusiveOptions.healthState.name(), 
listOptions.getLimit());
+        break;
+      case UNHEALTHY:
+        cdd.listUnhealthyContainers(listOptions.getLimit());
+        break;
+      case QUASI_CLOSED_STUCK:
+        cdd.listQuasiClosedStuckContainers(listOptions.getLimit());
+        break;
+      default:
+        err().println("Unsupported health state: " + 
exclusiveOptions.healthState);
+      }
+    }
     
     return null;
   }
diff --git a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/utils/ContainerDatanodeDatabase.java b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/utils/ContainerDatanodeDatabase.java
index 1d44605249..398fb98b28 100644
--- a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/utils/ContainerDatanodeDatabase.java
+++ b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/utils/ContainerDatanodeDatabase.java
@@ -49,15 +49,18 @@ public class ContainerDatanodeDatabase {
   private static final int DEFAULT_REPLICATION_FACTOR;
 
   private final PrintWriter out;
+  private final PrintWriter err;
 
   public ContainerDatanodeDatabase(String dbPath) {
     this.databasePath = dbPath;
     this.out = new PrintWriter(new OutputStreamWriter(System.out, 
StandardCharsets.UTF_8), true);
+    this.err = new PrintWriter(new OutputStreamWriter(System.err, 
StandardCharsets.UTF_8), true);
   }
 
-  public ContainerDatanodeDatabase(String dbPath, PrintWriter out) {
+  public ContainerDatanodeDatabase(String dbPath, PrintWriter out, PrintWriter 
err) {
     this.databasePath = dbPath;
     this.out = out;
+    this.err = err;
   }
   
   static {
@@ -122,6 +125,7 @@ public void createIndexes() throws SQLException {
       createIdxDclContainerStateTime(stmt);
       createContainerLogIndex(stmt);
       createIdxContainerlogContainerId(stmt);
+      createIndexForQuasiClosedQuery(stmt);
     } catch (SQLException e) {
       throw new SQLException("Error while creating index: " + e.getMessage());
     } catch (Exception e) {
@@ -144,6 +148,11 @@ private void createIdxContainerlogContainerId(Statement stmt) throws SQLExceptio
     stmt.execute(createIndexSQL);
   }
 
+  private void createIndexForQuasiClosedQuery(Statement stmt) throws 
SQLException {
+    String createIndexSQL = 
SQLDBConstants.CREATE_DCL_STATE_CONTAINER_DATANODE_TIME_INDEX;
+    stmt.execute(createIndexSQL);
+  }
+
   /**
    * Inserts a list of container log entries into the 
DatanodeContainerLogTable.
    *
@@ -609,5 +618,156 @@ private List<DatanodeContainerInfo> getContainerLogDataForOpenContainers(Long co
 
     return logEntries;
   }
+
+  /**
+   * Lists containers that are over- or under-replicated also provides count 
of replicas.
+   */
+  
+  public void listReplicatedContainers(String overOrUnder, Integer limit) 
throws SQLException {
+    String operator;
+    if ("OVER_REPLICATED".equalsIgnoreCase(overOrUnder)) {
+      operator = ">";
+    } else if ("UNDER_REPLICATED".equalsIgnoreCase(overOrUnder)) {
+      operator = "<";
+    } else {
+      err.println("Invalid type. Use OVER_REPLICATED or UNDER_REPLICATED.");
+      return;
+    }
+    
+    String rawQuery = SQLDBConstants.SELECT_REPLICATED_CONTAINERS;
+
+    if (!rawQuery.contains("{operator}")) {
+      err.println("Query not defined correctly.");
+      return;
+    }
+
+    String finalQuery = rawQuery.replace("{operator}", operator);
+
+    boolean limitProvided = limit != Integer.MAX_VALUE;
+    if (limitProvided) {
+      finalQuery += " LIMIT ?";
+    }
+
+    try (Connection connection = getConnection();
+         PreparedStatement pstmt = connection.prepareStatement(finalQuery)) {
+
+      pstmt.setInt(1, DEFAULT_REPLICATION_FACTOR);
+      
+      if (limitProvided) {
+        pstmt.setInt(2, limit + 1);
+      }
+
+      try (ResultSet rs = pstmt.executeQuery()) {
+        int count = 0;
+
+        while (rs.next()) {
+          if (limitProvided && count >= limit) {
+            err.println("Note: There might be more containers. Use --all 
option to list all entries.");
+            break;
+          }
+
+          out.printf("Container ID = %s - Count = %d%n", 
rs.getLong("container_id"), 
+                  rs.getInt("replica_count"));
+          count++;
+        }
+
+        out.println("Number of containers listed: " + count);
+
+      }
+
+    } catch (SQLException e) {
+      throw new SQLException("Error while retrieving containers." + 
e.getMessage(), e);
+    } catch (Exception e) {
+      throw new RuntimeException("Unexpected error: "  + e);
+    }
+  }
+
+  /**
+   * Lists containers that are UNHEALTHY also provides count of replicas which 
are in UNHEALTHY state.
+   */
+
+  public void listUnhealthyContainers(Integer limit) throws SQLException {
+    
+    String query = SQLDBConstants.SELECT_UNHEALTHY_CONTAINERS;
+
+    boolean limitProvided = limit != Integer.MAX_VALUE;
+    if (limitProvided) {
+      query += " LIMIT ?";
+    }
+
+    try (Connection connection = getConnection();
+         PreparedStatement stmt = connection.prepareStatement(query)) {
+
+      if (limitProvided) {
+        stmt.setInt(1, limit + 1);
+      }
+
+      try (ResultSet rs = stmt.executeQuery()) {
+        int count = 0;
+
+        while (rs.next()) {
+          if (limitProvided && count >= limit) {
+            err.println("Note: There might be more containers. Use --all 
option to list all entries.");
+            break;
+          }
+
+          out.printf("Container ID = %s - Count = %d%n", 
rs.getString("container_id"), 
+                  rs.getInt("unhealthy_replica_count"));
+          count++;
+        }
+
+        out.println("Number of containers listed: " + count);
+      }
+
+    } catch (SQLException e) {
+      throw new SQLException("Error while retrieving containers." + 
e.getMessage(), e);
+    } catch (Exception e) {
+      throw new RuntimeException("Unexpected error: "  + e);
+    }
+  }
+
+  /**
+   * Lists containers that are QUASI_CLOSED stuck also provides count of 
replicas which are in QUASI_CLOSED state.
+   */
+
+  public void listQuasiClosedStuckContainers(Integer limit) throws 
SQLException {
+  
+    String query =  SQLDBConstants.SELECT_QUASI_CLOSED_STUCK_CONTAINERS;
+
+    boolean limitProvided = limit != Integer.MAX_VALUE;
+    if (limitProvided) {
+      query += " LIMIT ?";
+    }
+
+    try (Connection connection = getConnection();
+         PreparedStatement statement = connection.prepareStatement(query)) {
+
+      if (limitProvided) {
+        statement.setInt(1, limit + 1);
+      }
+
+      try (ResultSet resultSet = statement.executeQuery()) {
+        int count = 0;
+
+        while (resultSet.next()) {
+          if (limitProvided && count >= limit) {
+            err.println("Note: There might be more containers. Use --all 
option to list all entries.");
+            break;
+          }
+
+          out.printf("Container ID = %s - Count = %d%n", 
resultSet.getString("container_id"),
+                  resultSet.getInt("quasi_closed_replica_count"));
+          count++;
+        }
+
+        out.println("Number of containers listed: " + count);
+      }
+
+    } catch (SQLException e) {
+      throw new SQLException("Error while retrieving containers." + 
e.getMessage(), e);
+    } catch (Exception e) {
+      throw new RuntimeException("Unexpected error: "  + e);
+    }
+  }
 }
 
diff --git a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/utils/SQLDBConstants.java b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/utils/SQLDBConstants.java
index 74822d35e3..773e821298 100644
--- a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/utils/SQLDBConstants.java
+++ b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/utils/SQLDBConstants.java
@@ -17,6 +17,9 @@
 
 package org.apache.hadoop.ozone.debug.logs.container.utils;
 
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
+import org.apache.hadoop.hdds.scm.container.ReplicationManagerReport;
+
 /**
  * Constants used for ContainerDatanodeDatabase.
  */
@@ -29,6 +32,11 @@ public final class SQLDBConstants {
   public static final int BATCH_SIZE = 2500;
   public static final String DATANODE_CONTAINER_LOG_TABLE_NAME = 
"DatanodeContainerLogTable";
   public static final String CONTAINER_LOG_TABLE_NAME = "ContainerLogTable";
+  public static final String CLOSED_STATE = 
HddsProtos.LifeCycleState.CLOSED.name();
+  public static final String DELETED_STATE = 
HddsProtos.LifeCycleState.DELETED.name();
+  public static final String UNHEALTHY_STATE = 
ReplicationManagerReport.HealthState.UNHEALTHY.name();
+  public static final String QUASI_CLOSED_STATE = 
HddsProtos.LifeCycleState.QUASI_CLOSED.name();
+
   public static final String CREATE_DATANODE_CONTAINER_LOG_TABLE = 
       "CREATE TABLE IF NOT EXISTS DatanodeContainerLogTable (datanode_id TEXT 
NOT NULL, " +
           "container_id INTEGER NOT NULL, timestamp TEXT NOT NULL, 
container_state TEXT, bcsid INTEGER, " +
@@ -75,6 +83,62 @@ public final class SQLDBConstants {
   public static final String SELECT_CONTAINER_DETAILS_OPEN_STATE = "SELECT 
d.timestamp, d.container_id, " +
       "d.datanode_id, d.container_state FROM DatanodeContainerLogTable d " +
       "WHERE d.container_id = ? AND d.container_state = 'OPEN' ORDER BY 
d.timestamp ASC;";
+  public static final String CREATE_DCL_STATE_CONTAINER_DATANODE_TIME_INDEX =
+      "CREATE INDEX IF NOT EXISTS idx_dcl_state_container_datanode_time " +
+          "ON DatanodeContainerLogTable(container_state, container_id, 
datanode_id, timestamp DESC);";
+  public static final String SELECT_REPLICATED_CONTAINERS =
+          "SELECT container_id, COUNT(DISTINCT datanode_id) AS 
replica_count\n" +
+                  "FROM ContainerLogTable\n" +
+                  "WHERE latest_state != '" + DELETED_STATE + "'\n" +
+                  " GROUP BY container_id\n" +
+                  "HAVING COUNT(DISTINCT datanode_id) {operator} ?";
+  public static final String SELECT_UNHEALTHY_CONTAINERS =
+      "SELECT u.container_id, COUNT(*) AS unhealthy_replica_count\n" +
+          "FROM (\n" +
+          "    SELECT container_id, datanode_id, MAX(timestamp) AS 
latest_unhealthy_timestamp\n" +
+          "    FROM DatanodeContainerLogTable\n" +
+          "    WHERE container_state = '" + UNHEALTHY_STATE + "'\n" +
+          "    GROUP BY container_id, datanode_id\n" +
+          ") AS u\n" +
+          "LEFT JOIN (\n" +
+          "    SELECT container_id, datanode_id, MAX(timestamp) AS 
latest_closed_timestamp\n" +
+          "    FROM DatanodeContainerLogTable\n" +
+          "    WHERE container_state IN ('" + CLOSED_STATE + "', '" + 
DELETED_STATE + "')\n" +
+          "    GROUP BY container_id, datanode_id\n" +
+          ") AS c\n" +
+          "ON u.container_id = c.container_id AND u.datanode_id = 
c.datanode_id\n" +
+          "WHERE c.latest_closed_timestamp IS NULL \n" +
+          "   OR u.latest_unhealthy_timestamp > c.latest_closed_timestamp\n" +
+              "GROUP BY u.container_id\n" +
+              "ORDER BY u.container_id";
+  public static final String SELECT_QUASI_CLOSED_STUCK_CONTAINERS =
+      "WITH quasi_closed_replicas AS ( " +
+          "    SELECT container_id, datanode_id, MAX(timestamp) AS 
latest_quasi_closed_timestamp\n" +
+          "    FROM DatanodeContainerLogTable " +
+          "    WHERE container_state = '" + QUASI_CLOSED_STATE + "'\n" +
+          "    GROUP BY container_id, datanode_id" +
+          "), " +
+          "container_with_enough_quasi_closed AS (\n" +
+          "    SELECT container_id\n" +
+          "    FROM quasi_closed_replicas\n" +
+          "    GROUP BY container_id\n" +
+          "    HAVING COUNT(DISTINCT datanode_id) >= 3\n" +
+          "),\n" +
+          "closed_or_deleted AS (\n" +
+          "    SELECT container_id, datanode_id, MAX(timestamp) AS 
latest_closed_timestamp\n" +
+          "    FROM DatanodeContainerLogTable\n" +
+          "    WHERE container_state IN ('" + CLOSED_STATE + "', '" + 
DELETED_STATE + "')\n" +
+          "    GROUP BY container_id, datanode_id\n" +
+          ")\n" +
+          "SELECT q.container_id, COUNT(*) AS quasi_closed_replica_count\n" +
+          "FROM quasi_closed_replicas q\n" +
+          "JOIN container_with_enough_quasi_closed qc ON q.container_id = 
qc.container_id\n" +
+          "LEFT JOIN closed_or_deleted c \n" +
+          "    ON q.container_id = c.container_id AND q.datanode_id = 
c.datanode_id\n" +
+          "WHERE c.latest_closed_timestamp IS NULL\n" +
+          "   OR q.latest_quasi_closed_timestamp > 
c.latest_closed_timestamp\n" +
+              "GROUP BY q.container_id\n" +
+              "ORDER BY q.container_id";
   
   private SQLDBConstants() {
     //Never constructed


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to