This is an automated email from the ASF dual-hosted git repository.
adoroszlai pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new d92f0ed87e HDDS-12653. Add option in `ozone debug log container list`
to filter by Health State (#8415)
d92f0ed87e is described below
commit d92f0ed87e7396665972628463cce3b55daa0626
Author: sreejasahithi <[email protected]>
AuthorDate: Mon May 12 19:09:13 2025 +0530
HDDS-12653. Add option in `ozone debug log container list` to filter by
Health State (#8415)
---
.../debug/logs/container/ContainerInfoCommand.java | 3 -
.../logs/container/ContainerLogController.java | 8 +-
.../container/DuplicateOpenContainersCommand.java | 3 -
.../ozone/debug/logs/container/ListContainers.java | 44 ++++--
.../container/utils/ContainerDatanodeDatabase.java | 162 ++++++++++++++++++++-
.../debug/logs/container/utils/SQLDBConstants.java | 64 ++++++++
6 files changed, 263 insertions(+), 21 deletions(-)
diff --git
a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/ContainerInfoCommand.java
b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/ContainerInfoCommand.java
index 7c5f8b32dc..116cbe3f34 100644
---
a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/ContainerInfoCommand.java
+++
b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/ContainerInfoCommand.java
@@ -49,9 +49,6 @@ public Void call() throws Exception {
}
Path dbPath = parent.resolveDbPath();
- if (dbPath == null) {
- return null;
- }
ContainerDatanodeDatabase cdd = new
ContainerDatanodeDatabase(dbPath.toString());
diff --git
a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/ContainerLogController.java
b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/ContainerLogController.java
index dd43119ea6..1a6cdafea6 100644
---
a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/ContainerLogController.java
+++
b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/ContainerLogController.java
@@ -63,17 +63,17 @@ public Path resolveDbPath() {
if (Files.exists(resolvedPath) && Files.isRegularFile(resolvedPath)) {
out().println("Using default database file found in current directory:
" + resolvedPath);
} else {
- err().println("No database path provided and default file '" +
SQLDBConstants.DEFAULT_DB_FILENAME + "' not " +
+ throw new IllegalArgumentException("No database path provided and
default file '" +
+ SQLDBConstants.DEFAULT_DB_FILENAME + "' not " +
"found in current directory. Please provide a valid database
path");
- return null;
}
} else {
resolvedPath = Paths.get(dbPath);
Path parentDir = resolvedPath.getParent();
if (parentDir != null && !Files.exists(parentDir)) {
- err().println("The parent directory of the provided database path does
not exist: " + parentDir);
- return null;
+ throw new IllegalArgumentException("The parent directory of the
provided database " +
+ "path does not exist: " + parentDir);
}
}
diff --git
a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/DuplicateOpenContainersCommand.java
b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/DuplicateOpenContainersCommand.java
index 820d5b5d2e..a0fb8371ec 100644
---
a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/DuplicateOpenContainersCommand.java
+++
b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/DuplicateOpenContainersCommand.java
@@ -39,9 +39,6 @@ public class DuplicateOpenContainersCommand implements
Callable<Void> {
@Override
public Void call() throws Exception {
Path dbPath = parent.resolveDbPath();
- if (dbPath == null) {
- return null;
- }
ContainerDatanodeDatabase cdd = new
ContainerDatanodeDatabase(dbPath.toString());
cdd.findDuplicateOpenContainer();
diff --git
a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/ListContainers.java
b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/ListContainers.java
index cd0c09f2ba..2c9de2d643 100644
---
a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/ListContainers.java
+++
b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/ListContainers.java
@@ -19,7 +19,9 @@
import java.nio.file.Path;
import java.util.concurrent.Callable;
+import org.apache.hadoop.hdds.cli.AbstractSubcommand;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
+import org.apache.hadoop.hdds.scm.container.ReplicationManagerReport;
import
org.apache.hadoop.ozone.debug.logs.container.utils.ContainerDatanodeDatabase;
import org.apache.hadoop.ozone.shell.ListLimitOptions;
import picocli.CommandLine;
@@ -33,30 +35,52 @@
name = "list",
description = "Finds containers from the database based on the option
provided."
)
-public class ListContainers implements Callable<Void> {
+public class ListContainers extends AbstractSubcommand implements
Callable<Void> {
- @CommandLine.Option(names = {"--state"},
- description = "Life cycle state of the container.",
- required = true)
- private HddsProtos.LifeCycleState state;
+ @CommandLine.ArgGroup(multiplicity = "1")
+ private ExclusiveOptions exclusiveOptions;
@CommandLine.Mixin
private ListLimitOptions listOptions;
-
+
@CommandLine.ParentCommand
private ContainerLogController parent;
+  /**
+   * Filter options accepted by this command: the container's life cycle state
+   * ({@code --lifecycle}) or its health state ({@code --health}).
+   */
+  private static final class ExclusiveOptions {
+    @CommandLine.Option(
+        names = {"--lifecycle"},
+        description = "Life cycle state of the container.")
+    private HddsProtos.LifeCycleState lifecycleState;
+
+    @CommandLine.Option(
+        names = {"--health"},
+        description = "Health state of the container.")
+    private ReplicationManagerReport.HealthState healthState;
+  }
+
@Override
public Void call() throws Exception {
Path dbPath = parent.resolveDbPath();
- if (dbPath == null) {
- return null;
- }
ContainerDatanodeDatabase cdd = new
ContainerDatanodeDatabase(dbPath.toString());
- cdd.listContainersByState(state.name(), listOptions.getLimit());
+ if (exclusiveOptions.lifecycleState != null) {
+ cdd.listContainersByState(exclusiveOptions.lifecycleState.name(),
listOptions.getLimit());
+ } else if (exclusiveOptions.healthState != null) {
+ switch (exclusiveOptions.healthState) {
+ case UNDER_REPLICATED:
+ case OVER_REPLICATED:
+ cdd.listReplicatedContainers(exclusiveOptions.healthState.name(),
listOptions.getLimit());
+ break;
+ case UNHEALTHY:
+ cdd.listUnhealthyContainers(listOptions.getLimit());
+ break;
+ case QUASI_CLOSED_STUCK:
+ cdd.listQuasiClosedStuckContainers(listOptions.getLimit());
+ break;
+ default:
+ err().println("Unsupported health state: " +
exclusiveOptions.healthState);
+ }
+ }
return null;
}
diff --git
a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/utils/ContainerDatanodeDatabase.java
b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/utils/ContainerDatanodeDatabase.java
index 1d44605249..398fb98b28 100644
---
a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/utils/ContainerDatanodeDatabase.java
+++
b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/utils/ContainerDatanodeDatabase.java
@@ -49,15 +49,18 @@ public class ContainerDatanodeDatabase {
private static final int DEFAULT_REPLICATION_FACTOR;
private final PrintWriter out;
+ private final PrintWriter err;
public ContainerDatanodeDatabase(String dbPath) {
this.databasePath = dbPath;
this.out = new PrintWriter(new OutputStreamWriter(System.out,
StandardCharsets.UTF_8), true);
+ this.err = new PrintWriter(new OutputStreamWriter(System.err,
StandardCharsets.UTF_8), true);
}
- public ContainerDatanodeDatabase(String dbPath, PrintWriter out) {
+ public ContainerDatanodeDatabase(String dbPath, PrintWriter out, PrintWriter
err) {
this.databasePath = dbPath;
this.out = out;
+ this.err = err;
}
static {
@@ -122,6 +125,7 @@ public void createIndexes() throws SQLException {
createIdxDclContainerStateTime(stmt);
createContainerLogIndex(stmt);
createIdxContainerlogContainerId(stmt);
+ createIndexForQuasiClosedQuery(stmt);
} catch (SQLException e) {
throw new SQLException("Error while creating index: " + e.getMessage());
} catch (Exception e) {
@@ -144,6 +148,11 @@ private void createIdxContainerlogContainerId(Statement
stmt) throws SQLExceptio
stmt.execute(createIndexSQL);
}
+  /**
+   * Executes the statement held in
+   * {@link SQLDBConstants#CREATE_DCL_STATE_CONTAINER_DATANODE_TIME_INDEX}.
+   *
+   * @param stmt statement used to run the index-creation SQL
+   * @throws SQLException if executing the statement fails
+   */
+  private void createIndexForQuasiClosedQuery(Statement stmt) throws SQLException {
+    stmt.execute(SQLDBConstants.CREATE_DCL_STATE_CONTAINER_DATANODE_TIME_INDEX);
+  }
+
/**
* Inserts a list of container log entries into the
DatanodeContainerLogTable.
*
@@ -609,5 +618,156 @@ private List<DatanodeContainerInfo>
getContainerLogDataForOpenContainers(Long co
return logEntries;
}
+
+  /**
+   * Lists containers whose number of distinct datanodes is above or below the
+   * default replication factor, printing each container ID with its replica count.
+   *
+   * @param overOrUnder {@code OVER_REPLICATED} or {@code UNDER_REPLICATED} (case-insensitive);
+   *                    any other value prints an error and returns without querying
+   * @param limit maximum number of containers to print; {@code Integer.MAX_VALUE}
+   *              (or {@code null}) lists every match
+   * @throws SQLException if a database error occurs while running the query
+   */
+  public void listReplicatedContainers(String overOrUnder, Integer limit) throws SQLException {
+    String operator;
+    if ("OVER_REPLICATED".equalsIgnoreCase(overOrUnder)) {
+      operator = ">";
+    } else if ("UNDER_REPLICATED".equalsIgnoreCase(overOrUnder)) {
+      operator = "<";
+    } else {
+      err.println("Invalid type. Use OVER_REPLICATED or UNDER_REPLICATED.");
+      return;
+    }
+
+    String rawQuery = SQLDBConstants.SELECT_REPLICATED_CONTAINERS;
+
+    if (!rawQuery.contains("{operator}")) {
+      err.println("Query not defined correctly.");
+      return;
+    }
+
+    // Only the fixed ">" / "<" chosen above is spliced into the SQL text.
+    String finalQuery = rawQuery.replace("{operator}", operator);
+
+    // Integer.MAX_VALUE means "no limit"; a null limit is treated the same way
+    // instead of failing on unboxing.
+    boolean limitProvided = limit != null && limit != Integer.MAX_VALUE;
+    if (limitProvided) {
+      finalQuery += " LIMIT ?";
+    }
+
+    try (Connection connection = getConnection();
+         PreparedStatement pstmt = connection.prepareStatement(finalQuery)) {
+
+      pstmt.setInt(1, DEFAULT_REPLICATION_FACTOR);
+
+      if (limitProvided) {
+        // Ask for one extra row so we can tell whether more entries exist beyond the limit.
+        pstmt.setInt(2, limit + 1);
+      }
+
+      try (ResultSet rs = pstmt.executeQuery()) {
+        int count = 0;
+
+        while (rs.next()) {
+          if (limitProvided && count >= limit) {
+            err.println("Note: There might be more containers. Use --all option to list all entries.");
+            break;
+          }
+
+          out.printf("Container ID = %s - Count = %d%n", rs.getLong("container_id"),
+              rs.getInt("replica_count"));
+          count++;
+        }
+
+        out.println("Number of containers listed: " + count);
+      }
+
+    } catch (SQLException e) {
+      throw new SQLException("Error while retrieving containers: " + e.getMessage(), e);
+    } catch (Exception e) {
+      // Keep the original exception as the cause so its stack trace is not lost.
+      throw new RuntimeException("Unexpected error: " + e, e);
+    }
+  }
+
+  /**
+   * Lists containers that have replicas in the UNHEALTHY state, together with the
+   * number of such replicas for each container.
+   *
+   * @param limit maximum number of containers to print; {@code Integer.MAX_VALUE}
+   *              (or {@code null}) lists every match
+   * @throws SQLException if a database error occurs while running the query
+   */
+  public void listUnhealthyContainers(Integer limit) throws SQLException {
+    String query = SQLDBConstants.SELECT_UNHEALTHY_CONTAINERS;
+
+    // Integer.MAX_VALUE means "no limit"; a null limit is treated the same way
+    // instead of failing on unboxing.
+    boolean limitProvided = limit != null && limit != Integer.MAX_VALUE;
+    if (limitProvided) {
+      query += " LIMIT ?";
+    }
+
+    try (Connection connection = getConnection();
+         PreparedStatement stmt = connection.prepareStatement(query)) {
+
+      if (limitProvided) {
+        // Ask for one extra row so we can tell whether more entries exist beyond the limit.
+        stmt.setInt(1, limit + 1);
+      }
+
+      try (ResultSet rs = stmt.executeQuery()) {
+        int count = 0;
+
+        while (rs.next()) {
+          if (limitProvided && count >= limit) {
+            err.println("Note: There might be more containers. Use --all option to list all entries.");
+            break;
+          }
+
+          out.printf("Container ID = %s - Count = %d%n", rs.getString("container_id"),
+              rs.getInt("unhealthy_replica_count"));
+          count++;
+        }
+
+        out.println("Number of containers listed: " + count);
+      }
+
+    } catch (SQLException e) {
+      throw new SQLException("Error while retrieving containers: " + e.getMessage(), e);
+    } catch (Exception e) {
+      // Keep the original exception as the cause so its stack trace is not lost.
+      throw new RuntimeException("Unexpected error: " + e, e);
+    }
+  }
+
+  /**
+   * Lists containers that are stuck in QUASI_CLOSED, together with the number of
+   * replicas of each container that are in the QUASI_CLOSED state.
+   *
+   * @param limit maximum number of containers to print; {@code Integer.MAX_VALUE}
+   *              (or {@code null}) lists every match
+   * @throws SQLException if a database error occurs while running the query
+   */
+  public void listQuasiClosedStuckContainers(Integer limit) throws SQLException {
+    String query = SQLDBConstants.SELECT_QUASI_CLOSED_STUCK_CONTAINERS;
+
+    // Integer.MAX_VALUE means "no limit"; a null limit is treated the same way
+    // instead of failing on unboxing.
+    boolean limitProvided = limit != null && limit != Integer.MAX_VALUE;
+    if (limitProvided) {
+      query += " LIMIT ?";
+    }
+
+    try (Connection connection = getConnection();
+         PreparedStatement statement = connection.prepareStatement(query)) {
+
+      if (limitProvided) {
+        // Ask for one extra row so we can tell whether more entries exist beyond the limit.
+        statement.setInt(1, limit + 1);
+      }
+
+      try (ResultSet resultSet = statement.executeQuery()) {
+        int count = 0;
+
+        while (resultSet.next()) {
+          if (limitProvided && count >= limit) {
+            err.println("Note: There might be more containers. Use --all option to list all entries.");
+            break;
+          }
+
+          out.printf("Container ID = %s - Count = %d%n", resultSet.getString("container_id"),
+              resultSet.getInt("quasi_closed_replica_count"));
+          count++;
+        }
+
+        out.println("Number of containers listed: " + count);
+      }
+
+    } catch (SQLException e) {
+      throw new SQLException("Error while retrieving containers: " + e.getMessage(), e);
+    } catch (Exception e) {
+      // Keep the original exception as the cause so its stack trace is not lost.
+      throw new RuntimeException("Unexpected error: " + e, e);
+    }
+  }
}
diff --git
a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/utils/SQLDBConstants.java
b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/utils/SQLDBConstants.java
index 74822d35e3..773e821298 100644
---
a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/utils/SQLDBConstants.java
+++
b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/logs/container/utils/SQLDBConstants.java
@@ -17,6 +17,9 @@
package org.apache.hadoop.ozone.debug.logs.container.utils;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
+import org.apache.hadoop.hdds.scm.container.ReplicationManagerReport;
+
/**
* Constants used for ContainerDatanodeDatabase.
*/
@@ -29,6 +32,11 @@ public final class SQLDBConstants {
public static final int BATCH_SIZE = 2500;
public static final String DATANODE_CONTAINER_LOG_TABLE_NAME =
"DatanodeContainerLogTable";
public static final String CONTAINER_LOG_TABLE_NAME = "ContainerLogTable";
+  // State names embedded in the SQL text of the SELECT_* query constants in this class.
+  public static final String CLOSED_STATE = HddsProtos.LifeCycleState.CLOSED.name();
+  public static final String DELETED_STATE = HddsProtos.LifeCycleState.DELETED.name();
+  // Taken from the replication manager's health states, unlike the life cycle states above and below.
+  public static final String UNHEALTHY_STATE = ReplicationManagerReport.HealthState.UNHEALTHY.name();
+  public static final String QUASI_CLOSED_STATE = HddsProtos.LifeCycleState.QUASI_CLOSED.name();
+
public static final String CREATE_DATANODE_CONTAINER_LOG_TABLE =
"CREATE TABLE IF NOT EXISTS DatanodeContainerLogTable (datanode_id TEXT
NOT NULL, " +
"container_id INTEGER NOT NULL, timestamp TEXT NOT NULL,
container_state TEXT, bcsid INTEGER, " +
@@ -75,6 +83,62 @@ public final class SQLDBConstants {
public static final String SELECT_CONTAINER_DETAILS_OPEN_STATE = "SELECT
d.timestamp, d.container_id, " +
"d.datanode_id, d.container_state FROM DatanodeContainerLogTable d " +
"WHERE d.container_id = ? AND d.container_state = 'OPEN' ORDER BY
d.timestamp ASC;";
+ public static final String CREATE_DCL_STATE_CONTAINER_DATANODE_TIME_INDEX =
+ "CREATE INDEX IF NOT EXISTS idx_dcl_state_container_datanode_time " +
+ "ON DatanodeContainerLogTable(container_state, container_id,
datanode_id, timestamp DESC);";
+  // Selects non-DELETED containers whose distinct-datanode count compares against a bound
+  // parameter. The "{operator}" placeholder must be replaced (with ">" or "<") before the
+  // statement is prepared. Rows are ordered by container_id so that a LIMIT appended by the
+  // caller returns a stable set, matching the other list queries.
+  public static final String SELECT_REPLICATED_CONTAINERS =
+      "SELECT container_id, COUNT(DISTINCT datanode_id) AS replica_count\n" +
+      "FROM ContainerLogTable\n" +
+      "WHERE latest_state != '" + DELETED_STATE + "'\n" +
+      "GROUP BY container_id\n" +
+      "HAVING COUNT(DISTINCT datanode_id) {operator} ?\n" +
+      "ORDER BY container_id";
+ public static final String SELECT_UNHEALTHY_CONTAINERS =
+ "SELECT u.container_id, COUNT(*) AS unhealthy_replica_count\n" +
+ "FROM (\n" +
+ " SELECT container_id, datanode_id, MAX(timestamp) AS
latest_unhealthy_timestamp\n" +
+ " FROM DatanodeContainerLogTable\n" +
+ " WHERE container_state = '" + UNHEALTHY_STATE + "'\n" +
+ " GROUP BY container_id, datanode_id\n" +
+ ") AS u\n" +
+ "LEFT JOIN (\n" +
+ " SELECT container_id, datanode_id, MAX(timestamp) AS
latest_closed_timestamp\n" +
+ " FROM DatanodeContainerLogTable\n" +
+ " WHERE container_state IN ('" + CLOSED_STATE + "', '" +
DELETED_STATE + "')\n" +
+ " GROUP BY container_id, datanode_id\n" +
+ ") AS c\n" +
+ "ON u.container_id = c.container_id AND u.datanode_id =
c.datanode_id\n" +
+ "WHERE c.latest_closed_timestamp IS NULL \n" +
+ " OR u.latest_unhealthy_timestamp > c.latest_closed_timestamp\n" +
+ "GROUP BY u.container_id\n" +
+ "ORDER BY u.container_id";
+ public static final String SELECT_QUASI_CLOSED_STUCK_CONTAINERS =
+ "WITH quasi_closed_replicas AS ( " +
+ " SELECT container_id, datanode_id, MAX(timestamp) AS
latest_quasi_closed_timestamp\n" +
+ " FROM DatanodeContainerLogTable " +
+ " WHERE container_state = '" + QUASI_CLOSED_STATE + "'\n" +
+ " GROUP BY container_id, datanode_id" +
+ "), " +
+ "container_with_enough_quasi_closed AS (\n" +
+ " SELECT container_id\n" +
+ " FROM quasi_closed_replicas\n" +
+ " GROUP BY container_id\n" +
+ " HAVING COUNT(DISTINCT datanode_id) >= 3\n" +
+ "),\n" +
+ "closed_or_deleted AS (\n" +
+ " SELECT container_id, datanode_id, MAX(timestamp) AS
latest_closed_timestamp\n" +
+ " FROM DatanodeContainerLogTable\n" +
+ " WHERE container_state IN ('" + CLOSED_STATE + "', '" +
DELETED_STATE + "')\n" +
+ " GROUP BY container_id, datanode_id\n" +
+ ")\n" +
+ "SELECT q.container_id, COUNT(*) AS quasi_closed_replica_count\n" +
+ "FROM quasi_closed_replicas q\n" +
+ "JOIN container_with_enough_quasi_closed qc ON q.container_id =
qc.container_id\n" +
+ "LEFT JOIN closed_or_deleted c \n" +
+ " ON q.container_id = c.container_id AND q.datanode_id =
c.datanode_id\n" +
+ "WHERE c.latest_closed_timestamp IS NULL\n" +
+ " OR q.latest_quasi_closed_timestamp >
c.latest_closed_timestamp\n" +
+ "GROUP BY q.container_id\n" +
+ "ORDER BY q.container_id";
private SQLDBConstants() {
//Never constructed
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]