This is an automated email from the ASF dual-hosted git repository.
krathbun pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/accumulo.git
The following commit(s) were added to refs/heads/main by this push:
new be9b1fde06 Completed functionality for `admin check` command (#5348)
be9b1fde06 is described below
commit be9b1fde0652c0523004a04d6aa032746b48bea6
Author: Kevin Rathbun <[email protected]>
AuthorDate: Wed Feb 4 14:12:39 2026 -0500
Completed functionality for `admin check` command (#5348)
* New checks for `admin check` command
- Implemented SYSTEM_CONFIG check:
- Checks ZooKeeper locks for Accumulo server processes
- Checks ZooKeeper table nodes
- Checks that the WAL metadata in ZooKeeper is valid
- Added new SERVER_CONFIG check:
- Checks that all configured properties are valid
- Checks that required properties are present in the config
- Added new tests in `AdminCheckIT` for SYSTEM_CONFIG and SERVER_CONFIG.
Added failing test cases for all checks.
- Deleted CheckServerConfig (previously run via `accumulo check-server-config`), since the
new `accumulo admin check run SERVER_CONFIG` performs the same checks.
---------
Co-authored-by: Keith Turner <[email protected]>
---
.../server/conf/CheckAccumuloProperties.java | 3 +-
.../accumulo/server/conf/CheckServerConfig.java | 51 ---
.../accumulo/server/log/WalStateManager.java | 2 +-
.../org/apache/accumulo/server/util/Admin.java | 3 +
.../util/checkCommand/MetadataCheckRunner.java | 15 +-
.../checkCommand/MetadataTableCheckRunner.java | 9 -
.../util/checkCommand/RootMetadataCheckRunner.java | 20 +-
.../util/checkCommand/RootTableCheckRunner.java | 17 -
.../util/checkCommand/ServerConfigCheckRunner.java | 86 ++++++
.../util/checkCommand/SystemConfigCheckRunner.java | 198 ++++++++++++
.../util/checkCommand/TableLocksCheckRunner.java | 6 +-
...nCheckIT_SimpleSuite.java => AdminCheckIT.java} | 343 +++++++++++++++++----
.../apache/accumulo/test/start/KeywordStartIT.java | 3 -
13 files changed, 588 insertions(+), 168 deletions(-)
diff --git
a/server/base/src/main/java/org/apache/accumulo/server/conf/CheckAccumuloProperties.java
b/server/base/src/main/java/org/apache/accumulo/server/conf/CheckAccumuloProperties.java
index fe257087c9..9f4769f594 100644
---
a/server/base/src/main/java/org/apache/accumulo/server/conf/CheckAccumuloProperties.java
+++
b/server/base/src/main/java/org/apache/accumulo/server/conf/CheckAccumuloProperties.java
@@ -24,6 +24,7 @@ import java.nio.file.Path;
import org.apache.accumulo.core.conf.SiteConfiguration;
import org.apache.accumulo.server.ServerDirs;
import org.apache.accumulo.server.fs.VolumeManagerImpl;
+import org.apache.accumulo.server.util.Admin;
import org.apache.accumulo.start.spi.KeywordExecutable;
import org.apache.hadoop.conf.Configuration;
@@ -45,7 +46,7 @@ public class CheckAccumuloProperties implements
KeywordExecutable {
return "Checks the provided Accumulo configuration file for errors. "
+ "This only checks the contents of the file and not any running
Accumulo system, "
+ "so it can be used prior to init, but only performs a subset of the
checks done by "
- + (new CheckServerConfig().keyword());
+ + "'admin check run " + Admin.CheckCommand.Check.SERVER_CONFIG + "'";
}
@SuppressFBWarnings(value = "PATH_TRAVERSAL_IN", justification =
"intentional user-provided path")
diff --git
a/server/base/src/main/java/org/apache/accumulo/server/conf/CheckServerConfig.java
b/server/base/src/main/java/org/apache/accumulo/server/conf/CheckServerConfig.java
deleted file mode 100644
index b62c7fc17a..0000000000
---
a/server/base/src/main/java/org/apache/accumulo/server/conf/CheckServerConfig.java
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.accumulo.server.conf;
-
-import org.apache.accumulo.core.conf.SiteConfiguration;
-import org.apache.accumulo.server.ServerContext;
-import org.apache.accumulo.start.spi.KeywordExecutable;
-
-import com.google.auto.service.AutoService;
-
-@AutoService(KeywordExecutable.class)
-public class CheckServerConfig implements KeywordExecutable {
-
- public static void main(String[] args) {
- try (var context = new ServerContext(SiteConfiguration.auto())) {
- context.getConfiguration();
- }
- }
-
- @Override
- public String keyword() {
- return "check-server-config";
- }
-
- @Override
- public String description() {
- return "Checks server config";
- }
-
- @Override
- public void execute(String[] args) {
- main(args);
- }
-
-}
diff --git
a/server/base/src/main/java/org/apache/accumulo/server/log/WalStateManager.java
b/server/base/src/main/java/org/apache/accumulo/server/log/WalStateManager.java
index 21cfef697a..519e08f4e5 100644
---
a/server/base/src/main/java/org/apache/accumulo/server/log/WalStateManager.java
+++
b/server/base/src/main/java/org/apache/accumulo/server/log/WalStateManager.java
@@ -144,7 +144,7 @@ public class WalStateManager {
updateState(tsi, path, WalState.UNREFERENCED);
}
- private static Pair<WalState,Path> parse(byte[] data) {
+ public static Pair<WalState,Path> parse(byte[] data) {
String[] parts = new String(data, UTF_8).split(",");
return new Pair<>(WalState.valueOf(parts[0]), new Path(parts[1]));
}
diff --git
a/server/base/src/main/java/org/apache/accumulo/server/util/Admin.java
b/server/base/src/main/java/org/apache/accumulo/server/util/Admin.java
index 640c60d617..8e1dfaab24 100644
--- a/server/base/src/main/java/org/apache/accumulo/server/util/Admin.java
+++ b/server/base/src/main/java/org/apache/accumulo/server/util/Admin.java
@@ -101,6 +101,7 @@ import
org.apache.accumulo.server.util.checkCommand.CheckRunner;
import org.apache.accumulo.server.util.checkCommand.MetadataTableCheckRunner;
import org.apache.accumulo.server.util.checkCommand.RootMetadataCheckRunner;
import org.apache.accumulo.server.util.checkCommand.RootTableCheckRunner;
+import org.apache.accumulo.server.util.checkCommand.ServerConfigCheckRunner;
import org.apache.accumulo.server.util.checkCommand.SystemConfigCheckRunner;
import org.apache.accumulo.server.util.checkCommand.SystemFilesCheckRunner;
import org.apache.accumulo.server.util.checkCommand.TableLocksCheckRunner;
@@ -191,6 +192,8 @@ public class Admin implements KeywordExecutable {
// Caution should be taken when changing or adding any new checks: order
is important
SYSTEM_CONFIG(SystemConfigCheckRunner::new, "Validate the system config
stored in ZooKeeper",
Collections.emptyList()),
+ SERVER_CONFIG(ServerConfigCheckRunner::new, "Validate the server
configuration",
+ Collections.singletonList(SYSTEM_CONFIG)),
TABLE_LOCKS(TableLocksCheckRunner::new,
"Ensures that table and namespace locks are valid and are associated
with a FATE op",
Collections.singletonList(SYSTEM_CONFIG)),
diff --git
a/server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/MetadataCheckRunner.java
b/server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/MetadataCheckRunner.java
index dd1b8d525d..6d0b29bf11 100644
---
a/server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/MetadataCheckRunner.java
+++
b/server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/MetadataCheckRunner.java
@@ -27,7 +27,6 @@ import java.util.SortedMap;
import org.apache.accumulo.core.client.IteratorSetting;
import org.apache.accumulo.core.client.Scanner;
-import org.apache.accumulo.core.client.TableNotFoundException;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Mutation;
import org.apache.accumulo.core.data.TableId;
@@ -43,7 +42,6 @@ import
org.apache.accumulo.server.constraints.MetadataConstraints;
import org.apache.accumulo.server.constraints.SystemEnvironment;
import org.apache.accumulo.server.util.Admin;
import org.apache.hadoop.io.Text;
-import org.apache.zookeeper.KeeperException;
public interface MetadataCheckRunner extends CheckRunner {
@@ -51,9 +49,15 @@ public interface MetadataCheckRunner extends CheckRunner {
TableId tableId();
- Set<ColumnFQ> requiredColFQs();
+ default Set<ColumnFQ> requiredColFQs() {
+ return
Set.of(MetadataSchema.TabletsSection.TabletColumnFamily.PREV_ROW_COLUMN,
+ MetadataSchema.TabletsSection.ServerColumnFamily.DIRECTORY_COLUMN,
+ MetadataSchema.TabletsSection.ServerColumnFamily.TIME_COLUMN);
+ }
- Set<Text> requiredColFams();
+ default Set<Text> requiredColFams() {
+ return
Set.of(MetadataSchema.TabletsSection.CurrentLocationColumnFamily.NAME);
+ }
default String scanning() {
return String.format("%s (%s) table", tableName(), tableId());
@@ -64,8 +68,7 @@ public interface MetadataCheckRunner extends CheckRunner {
* that are expected. For the root metadata, ensures that the expected
"columns" exist in ZK.
*/
default Admin.CheckCommand.CheckStatus checkRequiredColumns(ServerContext
context,
- Admin.CheckCommand.CheckStatus status)
- throws TableNotFoundException, InterruptedException, KeeperException {
+ Admin.CheckCommand.CheckStatus status) throws Exception {
Set<ColumnFQ> requiredColFQs;
Set<Text> requiredColFams;
boolean missingReqCol = false;
diff --git
a/server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/MetadataTableCheckRunner.java
b/server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/MetadataTableCheckRunner.java
index c33b9ae7df..7ca81bd254 100644
---
a/server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/MetadataTableCheckRunner.java
+++
b/server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/MetadataTableCheckRunner.java
@@ -24,9 +24,7 @@ import java.util.Set;
import org.apache.accumulo.core.client.Scanner;
import org.apache.accumulo.core.data.TableId;
import org.apache.accumulo.core.metadata.SystemTables;
-import org.apache.accumulo.core.metadata.schema.MetadataSchema;
import org.apache.accumulo.core.security.Authorizations;
-import org.apache.accumulo.core.util.ColumnFQ;
import org.apache.accumulo.server.ServerContext;
import org.apache.accumulo.server.cli.ServerUtilOpts;
import org.apache.accumulo.server.util.Admin;
@@ -47,13 +45,6 @@ public class MetadataTableCheckRunner implements
MetadataCheckRunner {
return SystemTables.METADATA.tableId();
}
- @Override
- public Set<ColumnFQ> requiredColFQs() {
- return
Set.of(MetadataSchema.TabletsSection.TabletColumnFamily.PREV_ROW_COLUMN,
- MetadataSchema.TabletsSection.ServerColumnFamily.DIRECTORY_COLUMN,
- MetadataSchema.TabletsSection.ServerColumnFamily.TIME_COLUMN);
- }
-
@Override
public Set<Text> requiredColFams() {
return Set.of();
diff --git
a/server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/RootMetadataCheckRunner.java
b/server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/RootMetadataCheckRunner.java
index ecd62f0e4c..d5d65d00e2 100644
---
a/server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/RootMetadataCheckRunner.java
+++
b/server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/RootMetadataCheckRunner.java
@@ -23,11 +23,9 @@ import static java.nio.charset.StandardCharsets.UTF_8;
import java.util.HashSet;
import java.util.Set;
-import org.apache.accumulo.core.client.TableNotFoundException;
import org.apache.accumulo.core.data.TableId;
import org.apache.accumulo.core.metadata.RootTable;
import org.apache.accumulo.core.metadata.SystemTables;
-import org.apache.accumulo.core.metadata.schema.MetadataSchema;
import org.apache.accumulo.core.metadata.schema.RootTabletMetadata;
import org.apache.accumulo.core.util.ColumnFQ;
import org.apache.accumulo.server.ServerContext;
@@ -35,7 +33,6 @@ import org.apache.accumulo.server.cli.ServerUtilOpts;
import org.apache.accumulo.server.util.Admin;
import org.apache.accumulo.server.util.FindOfflineTablets;
import org.apache.hadoop.io.Text;
-import org.apache.zookeeper.KeeperException;
public class RootMetadataCheckRunner implements MetadataCheckRunner {
private static final Admin.CheckCommand.Check check =
Admin.CheckCommand.Check.ROOT_METADATA;
@@ -50,19 +47,6 @@ public class RootMetadataCheckRunner implements
MetadataCheckRunner {
throw new UnsupportedOperationException();
}
- @Override
- public Set<ColumnFQ> requiredColFQs() {
- return
Set.of(MetadataSchema.TabletsSection.TabletColumnFamily.PREV_ROW_COLUMN,
- MetadataSchema.TabletsSection.ServerColumnFamily.DIRECTORY_COLUMN,
- MetadataSchema.TabletsSection.ServerColumnFamily.TIME_COLUMN,
- MetadataSchema.TabletsSection.ServerColumnFamily.LOCK_COLUMN);
- }
-
- @Override
- public Set<Text> requiredColFams() {
- return
Set.of(MetadataSchema.TabletsSection.CurrentLocationColumnFamily.NAME);
- }
-
@Override
public String scanning() {
return "root tablet metadata in ZooKeeper";
@@ -70,7 +54,7 @@ public class RootMetadataCheckRunner implements
MetadataCheckRunner {
@Override
public Admin.CheckCommand.CheckStatus runCheck(ServerContext context,
ServerUtilOpts opts,
- boolean fixFiles) throws TableNotFoundException, InterruptedException,
KeeperException {
+ boolean fixFiles) throws Exception {
Admin.CheckCommand.CheckStatus status = Admin.CheckCommand.CheckStatus.OK;
printRunning();
@@ -97,7 +81,7 @@ public class RootMetadataCheckRunner implements
MetadataCheckRunner {
@Override
public Admin.CheckCommand.CheckStatus checkRequiredColumns(ServerContext
context,
- Admin.CheckCommand.CheckStatus status) throws InterruptedException,
KeeperException {
+ Admin.CheckCommand.CheckStatus status) throws Exception {
final String json =
new
String(context.getZooSession().asReader().getData(RootTable.ZROOT_TABLET),
UTF_8);
final var rtm = new RootTabletMetadata(json);
diff --git
a/server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/RootTableCheckRunner.java
b/server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/RootTableCheckRunner.java
index fb7355e541..6a834016ba 100644
---
a/server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/RootTableCheckRunner.java
+++
b/server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/RootTableCheckRunner.java
@@ -19,20 +19,16 @@
package org.apache.accumulo.server.util.checkCommand;
import java.util.AbstractMap;
-import java.util.Set;
import org.apache.accumulo.core.client.Scanner;
import org.apache.accumulo.core.data.TableId;
import org.apache.accumulo.core.metadata.SystemTables;
-import org.apache.accumulo.core.metadata.schema.MetadataSchema;
import org.apache.accumulo.core.security.Authorizations;
-import org.apache.accumulo.core.util.ColumnFQ;
import org.apache.accumulo.server.ServerContext;
import org.apache.accumulo.server.cli.ServerUtilOpts;
import org.apache.accumulo.server.util.Admin;
import org.apache.accumulo.server.util.CheckForMetadataProblems;
import org.apache.accumulo.server.util.FindOfflineTablets;
-import org.apache.hadoop.io.Text;
public class RootTableCheckRunner implements MetadataCheckRunner {
private static final Admin.CheckCommand.Check check =
Admin.CheckCommand.Check.ROOT_TABLE;
@@ -47,19 +43,6 @@ public class RootTableCheckRunner implements
MetadataCheckRunner {
return SystemTables.ROOT.tableId();
}
- @Override
- public Set<ColumnFQ> requiredColFQs() {
- return
Set.of(MetadataSchema.TabletsSection.TabletColumnFamily.PREV_ROW_COLUMN,
- MetadataSchema.TabletsSection.ServerColumnFamily.DIRECTORY_COLUMN,
- MetadataSchema.TabletsSection.ServerColumnFamily.TIME_COLUMN,
- MetadataSchema.TabletsSection.ServerColumnFamily.LOCK_COLUMN);
- }
-
- @Override
- public Set<Text> requiredColFams() {
- return
Set.of(MetadataSchema.TabletsSection.CurrentLocationColumnFamily.NAME);
- }
-
@Override
public Admin.CheckCommand.CheckStatus runCheck(ServerContext context,
ServerUtilOpts opts,
boolean fixFiles) throws Exception {
diff --git
a/server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/ServerConfigCheckRunner.java
b/server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/ServerConfigCheckRunner.java
new file mode 100644
index 0000000000..02003c8eb4
--- /dev/null
+++
b/server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/ServerConfigCheckRunner.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.accumulo.server.util.checkCommand;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.accumulo.core.conf.Property;
+import org.apache.accumulo.server.ServerContext;
+import org.apache.accumulo.server.cli.ServerUtilOpts;
+import org.apache.accumulo.server.util.Admin;
+
+public class ServerConfigCheckRunner implements CheckRunner {
+ private static final Admin.CheckCommand.Check check =
Admin.CheckCommand.Check.SERVER_CONFIG;
+
+ @Override
+ public Admin.CheckCommand.CheckStatus runCheck(ServerContext context,
ServerUtilOpts opts,
+ boolean fixFiles) throws Exception {
+ Admin.CheckCommand.CheckStatus status = Admin.CheckCommand.CheckStatus.OK;
+ printRunning();
+
+ log.trace("********** Checking server configuration **********");
+
+ log.trace("Checking that all configured properties are valid (valid key
and value)");
+ final Map<String,String> definedProps = new HashMap<>();
+ final var config = context.getConfiguration();
+ config.getProperties(definedProps, s -> true);
+ for (var entry : definedProps.entrySet()) {
+ var key = entry.getKey();
+ var val = entry.getValue();
+ if (!Property.isValidProperty(key, val)) {
+ log.warn("Invalid property (key={} val={}) found in the config", key,
val);
+ status = Admin.CheckCommand.CheckStatus.FAILED;
+ }
+ }
+
+ log.trace("Checking that all required config properties are present");
+ // there are many properties that should be set (default value or user
set), identifying them
+ // all and checking them here is unrealistic. Some property that is not
set but is expected
+ // will likely result in some sort of failure eventually anyway. We will
just check a few
+ // obvious required properties here.
+ Set<Property> requiredProps = Set.of(Property.INSTANCE_ZK_HOST,
Property.INSTANCE_ZK_TIMEOUT,
+ Property.INSTANCE_SECRET, Property.INSTANCE_VOLUMES,
Property.GENERAL_THREADPOOL_SIZE,
+ Property.GENERAL_DELEGATION_TOKEN_LIFETIME,
+ Property.GENERAL_DELEGATION_TOKEN_UPDATE_INTERVAL,
Property.GENERAL_IDLE_PROCESS_INTERVAL,
+ Property.GENERAL_LOW_MEM_DETECTOR_INTERVAL,
Property.GENERAL_LOW_MEM_DETECTOR_THRESHOLD,
+ Property.GENERAL_SERVER_LOCK_VERIFICATION_INTERVAL,
Property.MANAGER_CLIENTPORT,
+ Property.TSERV_CLIENTPORT, Property.GC_CYCLE_START,
Property.GC_CYCLE_DELAY,
+ Property.GC_PORT, Property.MONITOR_PORT, Property.TABLE_MAJC_RATIO,
+ Property.TABLE_SPLIT_THRESHOLD);
+ for (var reqProp : requiredProps) {
+ var confPropVal = config.get(reqProp);
+ // already checked that all set properties are valid, just check that it
is set then we know
+ // it's valid
+ if (confPropVal == null || confPropVal.isEmpty()) {
+ log.warn("Required property {} is not set!", reqProp);
+ status = Admin.CheckCommand.CheckStatus.FAILED;
+ }
+ }
+
+ printCompleted(status);
+ return status;
+ }
+
+ @Override
+ public Admin.CheckCommand.Check getCheck() {
+ return check;
+ }
+}
diff --git
a/server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/SystemConfigCheckRunner.java
b/server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/SystemConfigCheckRunner.java
index 9e8467eb7b..798ecd9955 100644
---
a/server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/SystemConfigCheckRunner.java
+++
b/server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/SystemConfigCheckRunner.java
@@ -18,9 +18,25 @@
*/
package org.apache.accumulo.server.util.checkCommand;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.accumulo.core.Constants;
+import org.apache.accumulo.core.client.admin.servers.ServerId;
+import org.apache.accumulo.core.fate.zookeeper.ZooReaderWriter;
+import org.apache.accumulo.core.metadata.SystemTables;
+import org.apache.accumulo.core.metadata.TServerInstance;
+import org.apache.accumulo.core.metadata.schema.TabletMetadata;
+import org.apache.accumulo.core.util.Pair;
import org.apache.accumulo.server.ServerContext;
import org.apache.accumulo.server.cli.ServerUtilOpts;
+import org.apache.accumulo.server.log.WalStateManager;
import org.apache.accumulo.server.util.Admin;
+import org.apache.hadoop.fs.Path;
+
+import com.google.common.collect.Sets;
public class SystemConfigCheckRunner implements CheckRunner {
private static final Admin.CheckCommand.Check check =
Admin.CheckCommand.Check.SYSTEM_CONFIG;
@@ -30,10 +46,192 @@ public class SystemConfigCheckRunner implements
CheckRunner {
boolean fixFiles) throws Exception {
Admin.CheckCommand.CheckStatus status = Admin.CheckCommand.CheckStatus.OK;
printRunning();
+
+ log.trace("********** Checking validity of some ZooKeeper nodes
**********");
+ status = checkZkNodes(context, status);
+
printCompleted(status);
return status;
}
+ private static Admin.CheckCommand.CheckStatus checkZkNodes(ServerContext
context,
+ Admin.CheckCommand.CheckStatus status) throws Exception {
+ status = checkZKLocks(context, status);
+ status = checkZKTableNodes(context, status);
+ status = checkZKWALsMetadata(context, status);
+
+ return status;
+ }
+
+ private static Admin.CheckCommand.CheckStatus checkZKLocks(ServerContext
context,
+ Admin.CheckCommand.CheckStatus status) throws Exception {
+ final ServerId.Type[] serverTypes = ServerId.Type.values();
+
+ log.trace("Checking ZooKeeper locks for Accumulo server processes...");
+
+ // check that essential server processes have a ZK lock, failing otherwise
+ // check that nonessential server processes have a ZK lock only if they
are running. If they are
+ // not running, alerts the user that the process is not running which may
or may not be expected
+ for (ServerId.Type serverType : serverTypes) {
+ log.trace("Looking for {} lock(s)...", serverType);
+ var servers = context.instanceOperations().getServers(serverType);
+
+ switch (serverType) {
+ case MANAGER:
+ // essential process
+ case GARBAGE_COLLECTOR:
+ // essential process
+ if (servers.size() != 1) {
+ log.warn("Expected 1 server to be found for {} but found {}",
serverType,
+ servers.size());
+ status = Admin.CheckCommand.CheckStatus.FAILED;
+ } else {
+ // no exception and 1 server found
+ log.trace("Verified ZooKeeper lock for {}", servers);
+ }
+ break;
+ case MONITOR:
+ // nonessential process
+ if (servers.isEmpty()) {
+ log.debug("No {} appears to be running. This may or may not be
expected", serverType);
+ } else if (servers.size() > 1) {
+ log.warn("More than 1 {} was found running. This is not expected",
serverType);
+ status = Admin.CheckCommand.CheckStatus.FAILED;
+ } else {
+ // no exception and 1 server found
+ log.trace("Verified ZooKeeper lock for {}", servers);
+ }
+ break;
+ case TABLET_SERVER:
+ // essential process(es)
+ case COMPACTOR:
+ // essential process(es)
+ if (servers.isEmpty()) {
+ log.warn("No {} appear to be running. This is not expected.",
serverType);
+ status = Admin.CheckCommand.CheckStatus.FAILED;
+ } else {
+ // no exception and >= 1 server found
+ log.trace("Verified ZooKeeper lock(s) for {} servers",
servers.size());
+ }
+ break;
+ case SCAN_SERVER:
+ // nonessential process(es)
+ if (servers.isEmpty()) {
+ log.debug("No {} appear to be running. This may or may not be
expected.", serverType);
+ } else {
+ // no exception and >= 1 server found
+ log.trace("Verified ZooKeeper lock(s) for {} servers",
servers.size());
+ }
+ break;
+ default:
+ throw new IllegalStateException("Unhandled case: " + serverType);
+ }
+ }
+
+ return status;
+ }
+
+ private static Admin.CheckCommand.CheckStatus
checkZKTableNodes(ServerContext context,
+ Admin.CheckCommand.CheckStatus status) throws Exception {
+ log.trace("Checking ZooKeeper table nodes...");
+
+ final var zrw = context.getZooSession().asReaderWriter();
+ final var tableNameToId = context.tableOperations().tableIdMap();
+ final Map<String,String> systemTableNameToId = new HashMap<>();
+ for (var accumuloTable : SystemTables.values()) {
+ systemTableNameToId.put(accumuloTable.tableName(),
accumuloTable.tableId().canonical());
+ }
+
+ // ensure all system tables exist
+ if (!tableNameToId.values().containsAll(systemTableNameToId.values())) {
+ log.warn(
+ "Missing essential Accumulo table. One or more of {} are missing
from the tables found {}",
+ systemTableNameToId, tableNameToId);
+ status = Admin.CheckCommand.CheckStatus.FAILED;
+ }
+ for (var nameToId : tableNameToId.entrySet()) {
+ var tablePath = Constants.ZTABLES + "/" + nameToId.getValue();
+ // expect the table path to exist and some data to exist
+ if (!zrw.exists(tablePath) || zrw.getChildren(tablePath).isEmpty()) {
+ log.warn("Failed to find table ({}) info at expected path {}",
nameToId, tablePath);
+ status = Admin.CheckCommand.CheckStatus.FAILED;
+ }
+ }
+
+ return status;
+ }
+
+ private static Admin.CheckCommand.CheckStatus
checkZKWALsMetadata(ServerContext context,
+ Admin.CheckCommand.CheckStatus status) throws Exception {
+ final var zrw = context.getZooSession().asReaderWriter();
+
+ log.trace("Checking that WAL metadata in ZooKeeper is valid...");
+
+ var walsBefore = gatherWalsFromZK(context, zrw);
+
+ // gather any wals present in ZooKeeper but missing in DFS
+ Map<TServerInstance,Set<Pair<WalStateManager.WalState,Path>>> missingWals
= new HashMap<>();
+ for (var instanceAndWals : walsBefore.entrySet()) {
+ for (var wal : instanceAndWals.getValue()) {
+ if (!context.getVolumeManager().exists(wal.getSecond())) {
+ missingWals.computeIfAbsent(instanceAndWals.getKey(), k -> new
HashSet<>()).add(wal);
+ }
+ }
+ }
+
+ var walsAfter = gatherWalsFromZK(context, zrw);
+
+ for (var instanceAndMissingWals : missingWals.entrySet()) {
+ // if the TServer is alive before AND after the DFS check AND any
missing WAL is still in
+ // use after the DFS check
+ var actualMissing = Sets.intersection(instanceAndMissingWals.getValue(),
+ walsAfter.getOrDefault(instanceAndMissingWals.getKey(), Set.of()));
+ if (!actualMissing.isEmpty()) {
+ log.warn("WAL metadata for tserver {} references a WAL that does not
exist : {}",
+ instanceAndMissingWals.getKey(), actualMissing);
+ status = Admin.CheckCommand.CheckStatus.FAILED;
+ }
+ }
+
+ return status;
+ }
+
+ private static Map<TServerInstance,Set<Pair<WalStateManager.WalState,Path>>>
+ gatherWalsFromZK(ServerContext context, ZooReaderWriter zrw) throws
Exception {
+ final var rootWalsDir = WalStateManager.ZWALS;
+ Map<TServerInstance,Set<Pair<WalStateManager.WalState,Path>>> wals = new
HashMap<>();
+ var tserverInstances = TabletMetadata.getLiveTServers(context);
+ for (var tsi : tserverInstances) {
+ wals.put(tsi, new HashSet<>());
+ // each child node of the root wals dir is a TServerInstance
+ final var tserverPath = rootWalsDir + "/" + tsi.toString();
+
+ // each child node of the tserver should be WAL metadata
+ final var walsPaths = zrw.getChildren(tserverPath);
+ if (walsPaths.isEmpty()) {
+ log.warn("No WAL metadata found for tserver {}. If it is expected that
mutations have "
+ + "occurred on the tserver, this is a problem. Otherwise, this is
normal", tsi);
+ }
+
+ for (var walPath : walsPaths) {
+ // should be able to parse the WAL metadata
+ final var fullWalPath = tserverPath + "/" + walPath;
+ log.trace("Attempting to parse WAL metadata at {}", fullWalPath);
+ var data = zrw.getData(fullWalPath);
+ if (data == null) {
+ continue;
+ }
+ var parseRes = WalStateManager.parse(data);
+ log.trace("Successfully parsed WAL metadata at {} result {}",
fullWalPath, parseRes);
+ if (parseRes.getFirst() == WalStateManager.WalState.OPEN
+ || parseRes.getFirst() == WalStateManager.WalState.CLOSED) {
+ wals.get(tsi).add(parseRes);
+ }
+ }
+ }
+ return wals;
+ }
+
@Override
public Admin.CheckCommand.Check getCheck() {
return check;
diff --git
a/server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/TableLocksCheckRunner.java
b/server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/TableLocksCheckRunner.java
index a771f595a9..beb0915330 100644
---
a/server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/TableLocksCheckRunner.java
+++
b/server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/TableLocksCheckRunner.java
@@ -21,8 +21,6 @@ package org.apache.accumulo.server.util.checkCommand;
import java.util.List;
import java.util.Map;
-import org.apache.accumulo.core.client.AccumuloException;
-import org.apache.accumulo.core.client.AccumuloSecurityException;
import org.apache.accumulo.core.fate.AdminUtil;
import org.apache.accumulo.core.fate.FateId;
import org.apache.accumulo.core.fate.FateInstanceType;
@@ -32,7 +30,6 @@ import org.apache.accumulo.core.metadata.SystemTables;
import org.apache.accumulo.server.ServerContext;
import org.apache.accumulo.server.cli.ServerUtilOpts;
import org.apache.accumulo.server.util.Admin;
-import org.apache.zookeeper.KeeperException;
public class TableLocksCheckRunner implements CheckRunner {
private static final Admin.CheckCommand.Check check =
Admin.CheckCommand.Check.TABLE_LOCKS;
@@ -56,8 +53,7 @@ public class TableLocksCheckRunner implements CheckRunner {
}
private static Admin.CheckCommand.CheckStatus checkTableLocks(ServerContext
context,
- Admin.CheckCommand.CheckStatus status)
- throws InterruptedException, KeeperException, AccumuloException,
AccumuloSecurityException {
+ Admin.CheckCommand.CheckStatus status) throws Exception {
final AdminUtil<Admin> admin = new AdminUtil<>();
final var zTableLocksPath =
context.getServerPaths().createTableLocksPath();
final var zk = context.getZooSession();
diff --git
a/test/src/main/java/org/apache/accumulo/test/AdminCheckIT_SimpleSuite.java
b/test/src/main/java/org/apache/accumulo/test/AdminCheckIT.java
similarity index 61%
rename from
test/src/main/java/org/apache/accumulo/test/AdminCheckIT_SimpleSuite.java
rename to test/src/main/java/org/apache/accumulo/test/AdminCheckIT.java
index a47210e79d..a51c8a264e 100644
--- a/test/src/main/java/org/apache/accumulo/test/AdminCheckIT_SimpleSuite.java
+++ b/test/src/main/java/org/apache/accumulo/test/AdminCheckIT.java
@@ -18,9 +18,11 @@
*/
package org.apache.accumulo.test;
+import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.ByteArrayOutputStream;
@@ -36,39 +38,39 @@ import java.util.TreeMap;
import java.util.function.Supplier;
import java.util.regex.Pattern;
+import org.apache.accumulo.core.Constants;
import org.apache.accumulo.core.client.Accumulo;
import org.apache.accumulo.core.client.AccumuloClient;
import org.apache.accumulo.core.client.IteratorSetting;
import org.apache.accumulo.core.client.admin.CompactionConfig;
-import org.apache.accumulo.harness.SharedMiniClusterBase;
+import org.apache.accumulo.core.data.Mutation;
+import org.apache.accumulo.core.fate.zookeeper.ZooUtil;
+import org.apache.accumulo.core.lock.ServiceLockPaths;
+import org.apache.accumulo.core.metadata.RootTable;
+import org.apache.accumulo.core.metadata.StoredTabletFile;
+import org.apache.accumulo.core.metadata.SystemTables;
+import org.apache.accumulo.core.metadata.TServerInstance;
+import org.apache.accumulo.core.metadata.schema.MetadataSchema;
+import org.apache.accumulo.core.metadata.schema.RootTabletMetadata;
+import org.apache.accumulo.core.security.Authorizations;
import org.apache.accumulo.server.ServerContext;
import org.apache.accumulo.server.cli.ServerUtilOpts;
+import org.apache.accumulo.server.log.WalStateManager;
import org.apache.accumulo.server.util.Admin;
import org.apache.accumulo.server.util.checkCommand.CheckRunner;
+import org.apache.accumulo.test.functional.ConfigurableMacBase;
import org.apache.accumulo.test.functional.ReadWriteIT;
import org.apache.accumulo.test.functional.SlowIterator;
+import org.apache.hadoop.fs.Path;
import org.easymock.EasyMock;
import org.easymock.IAnswer;
-import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.AfterEach;
-import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import com.beust.jcommander.JCommander;
import com.google.common.collect.Sets;
-public class AdminCheckIT_SimpleSuite extends SharedMiniClusterBase {
-
- @BeforeAll
- public static void setup() throws Exception {
- SharedMiniClusterBase.startMiniCluster();
- }
-
- @AfterAll
- public static void teardown() {
- SharedMiniClusterBase.stopMiniCluster();
- }
-
+public class AdminCheckIT extends ConfigurableMacBase {
private static final PrintStream ORIGINAL_OUT = System.out;
@AfterEach
@@ -194,6 +196,7 @@ public class AdminCheckIT_SimpleSuite extends
SharedMiniClusterBase {
String expRunAllRunOrder =
"Running dummy check SYSTEM_CONFIG\nDummy check SYSTEM_CONFIG
completed with status OK\n"
+ + "Running dummy check SERVER_CONFIG\nDummy check SERVER_CONFIG
completed with status OK\n"
+ "Running dummy check TABLE_LOCKS\nDummy check TABLE_LOCKS
completed with status OK\n"
+ "Running dummy check ROOT_METADATA\nDummy check ROOT_METADATA
completed with status OK\n"
+ "Running dummy check ROOT_TABLE\nDummy check ROOT_TABLE
completed with status OK\n"
@@ -206,11 +209,13 @@ public class AdminCheckIT_SimpleSuite extends
SharedMiniClusterBase {
+ "Running dummy check USER_FILES\nDummy check USER_FILES
completed with status OK\n";
// The dashes at the beginning and end of the string mark the beginning and
end of the
// printed table allowing us to ensure the table only includes what is
expected
- String expRunAllStatusInfo =
"-SYSTEM_CONFIG|OKTABLE_LOCKS|OKROOT_METADATA|OKROOT_TABLE|OK"
- + "METADATA_TABLE|OKSYSTEM_FILES|OKUSER_FILES|OK-";
- String expRunSubStatusInfo =
"-SYSTEM_CONFIG|FILTERED_OUTTABLE_LOCKS|FILTERED_OUT"
- + "ROOT_METADATA|FILTERED_OUTROOT_TABLE|OKMETADATA_TABLE|FILTERED_OUT"
- + "SYSTEM_FILES|OKUSER_FILES|OK-";
+ String expRunAllStatusInfo =
+
"-SYSTEM_CONFIG|OKSERVER_CONFIG|OKTABLE_LOCKS|OKROOT_METADATA|OKROOT_TABLE|OK"
+ + "METADATA_TABLE|OKSYSTEM_FILES|OKUSER_FILES|OK-";
+ String expRunSubStatusInfo =
+
"-SYSTEM_CONFIG|FILTERED_OUTSERVER_CONFIG|FILTERED_OUTTABLE_LOCKS|FILTERED_OUT"
+ +
"ROOT_METADATA|FILTERED_OUTROOT_TABLE|OKMETADATA_TABLE|FILTERED_OUT"
+ + "SYSTEM_FILES|OKUSER_FILES|OK-";
assertTrue(out1.contains(expRunAllRunOrder));
assertTrue(out2.contains(expRunAllRunOrder));
@@ -239,10 +244,10 @@ public class AdminCheckIT_SimpleSuite extends
SharedMiniClusterBase {
public void testAdminCheckRunWithCheckFailures() {
// tests running checks with some failing
- boolean[] rootTableFails = new boolean[] {true, true, true, false, true,
true, true};
- boolean[] systemConfigFails = new boolean[] {false, true, true, true,
true, true, true};
+ boolean[] rootTableFails = new boolean[] {true, true, true, true, false,
true, true, true};
+ boolean[] systemConfigFails = new boolean[] {false, true, true, true,
true, true, true, true};
boolean[] userFilesAndMetadataTableFails =
- new boolean[] {true, true, true, true, false, true, false};
+ new boolean[] {true, true, true, true, true, false, true, false};
// run all checks with ROOT_TABLE failing: only SYSTEM_CONFIG, SERVER_CONFIG,
TABLE_LOCKS, and ROOT_METADATA should pass
// the rest should be filtered out as skipped due to dependency failure
@@ -264,6 +269,7 @@ public class AdminCheckIT_SimpleSuite extends
SharedMiniClusterBase {
String expRunOrder1 =
"Running dummy check SYSTEM_CONFIG\nDummy check SYSTEM_CONFIG
completed with status OK\n"
+ + "Running dummy check SERVER_CONFIG\nDummy check SERVER_CONFIG
completed with status OK\n"
+ "Running dummy check TABLE_LOCKS\nDummy check TABLE_LOCKS
completed with status OK\n"
+ "Running dummy check ROOT_METADATA\nDummy check ROOT_METADATA
completed with status OK\n"
+ "Running dummy check ROOT_TABLE\nDummy check ROOT_TABLE
completed with status FAILED";
@@ -280,8 +286,8 @@ public class AdminCheckIT_SimpleSuite extends
SharedMiniClusterBase {
assertTrue(out4.contains(expRunOrder3And4));
assertNoOtherChecksRan(out1, true, Admin.CheckCommand.Check.SYSTEM_CONFIG,
- Admin.CheckCommand.Check.TABLE_LOCKS,
Admin.CheckCommand.Check.ROOT_TABLE,
- Admin.CheckCommand.Check.ROOT_METADATA);
+ Admin.CheckCommand.Check.SERVER_CONFIG,
Admin.CheckCommand.Check.TABLE_LOCKS,
+ Admin.CheckCommand.Check.ROOT_TABLE,
Admin.CheckCommand.Check.ROOT_METADATA);
assertNoOtherChecksRan(out2, true, Admin.CheckCommand.Check.SYSTEM_CONFIG);
assertNoOtherChecksRan(out3, true, Admin.CheckCommand.Check.SYSTEM_CONFIG,
Admin.CheckCommand.Check.ROOT_TABLE,
Admin.CheckCommand.Check.USER_FILES);
@@ -293,16 +299,17 @@ public class AdminCheckIT_SimpleSuite extends
SharedMiniClusterBase {
out3 = out3.replaceAll("\\s+", "");
out4 = out4.replaceAll("\\s+", "");
- String expStatusInfo1 =
"-SYSTEM_CONFIG|OKTABLE_LOCKS|OKROOT_METADATA|OKROOT_TABLE|FAILED"
- +
"METADATA_TABLE|SKIPPED_DEPENDENCY_FAILEDSYSTEM_FILES|SKIPPED_DEPENDENCY_FAILED"
- + "USER_FILES|SKIPPED_DEPENDENCY_FAILED-";
- String expStatusInfo2 =
"-SYSTEM_CONFIG|FAILEDTABLE_LOCKS|SKIPPED_DEPENDENCY_FAILED"
- +
"ROOT_METADATA|SKIPPED_DEPENDENCY_FAILEDROOT_TABLE|SKIPPED_DEPENDENCY_FAILED"
- +
"METADATA_TABLE|SKIPPED_DEPENDENCY_FAILEDSYSTEM_FILES|SKIPPED_DEPENDENCY_FAILED"
- + "USER_FILES|SKIPPED_DEPENDENCY_FAILED-";
- String expStatusInfo3And4 = "-SYSTEM_CONFIG|OKTABLE_LOCKS|FILTERED_OUT"
- + "ROOT_METADATA|FILTERED_OUTROOT_TABLE|OKMETADATA_TABLE|FILTERED_OUT"
- + "SYSTEM_FILES|FILTERED_OUTUSER_FILES|FAILED";
+ String expStatusInfo1 =
"-SYSTEM_CONFIG|OKSERVER_CONFIG|OKTABLE_LOCKS|OKROOT_METADATA|OK"
+ + "ROOT_TABLE|FAILEDMETADATA_TABLE|SKIPPED_DEPENDENCY_FAILED"
+ +
"SYSTEM_FILES|SKIPPED_DEPENDENCY_FAILEDUSER_FILES|SKIPPED_DEPENDENCY_FAILED-";
+ String expStatusInfo2 =
"-SYSTEM_CONFIG|FAILEDSERVER_CONFIG|SKIPPED_DEPENDENCY_FAILED"
+ +
"TABLE_LOCKS|SKIPPED_DEPENDENCY_FAILEDROOT_METADATA|SKIPPED_DEPENDENCY_FAILED"
+ +
"ROOT_TABLE|SKIPPED_DEPENDENCY_FAILEDMETADATA_TABLE|SKIPPED_DEPENDENCY_FAILED"
+ +
"SYSTEM_FILES|SKIPPED_DEPENDENCY_FAILEDUSER_FILES|SKIPPED_DEPENDENCY_FAILED-";
+ String expStatusInfo3And4 =
+ "-SYSTEM_CONFIG|OKSERVER_CONFIG|FILTERED_OUTTABLE_LOCKS|FILTERED_OUT"
+ +
"ROOT_METADATA|FILTERED_OUTROOT_TABLE|OKMETADATA_TABLE|FILTERED_OUT"
+ + "SYSTEM_FILES|FILTERED_OUTUSER_FILES|FAILED";
assertTrue(out1.contains(expStatusInfo1));
assertTrue(out2.contains(expStatusInfo2));
@@ -316,12 +323,11 @@ public class AdminCheckIT_SimpleSuite extends
SharedMiniClusterBase {
*/
@Test
- public void testPassingTableLocksCheck() throws Exception {
- // Tests the TABLE_LOCKS check in the case where all checks pass
+ public void testTableLocksCheck() throws Exception {
String table = getUniqueNames(1)[0];
Admin.CheckCommand.Check tableLocksCheck =
Admin.CheckCommand.Check.TABLE_LOCKS;
- try (AccumuloClient client =
Accumulo.newClient().from(getClientProps()).build()) {
+ try (AccumuloClient client =
Accumulo.newClient().from(getClientProperties()).build()) {
client.tableOperations().create(table);
ReadWriteIT.ingest(client, 10, 10, 10, 0, table);
@@ -336,21 +342,43 @@ public class AdminCheckIT_SimpleSuite extends
SharedMiniClusterBase {
slowCompaction.setIterators(List.of(is));
client.tableOperations().compact(table, slowCompaction);
+ // test passing case
var p = getCluster().exec(Admin.class, "check", "run",
tableLocksCheck.name());
assertEquals(0, p.getProcess().waitFor());
String out = p.readStdOut();
assertTrue(out.contains("locks are valid"));
assertTrue(out.contains("Check TABLE_LOCKS completed with status OK"));
assertNoOtherChecksRan(out, false, tableLocksCheck);
+
+ // test a failing case
+ // write an invalid table lock
+ final var context = getCluster().getServerContext();
+ final var zrw = context.getZooSession().asReaderWriter();
+ final var path = new
ServiceLockPaths(context.getZooCache()).createTableLocksPath();
+ zrw.putPersistentData(path.toString() + "/foo", new byte[0],
ZooUtil.NodeExistsPolicy.FAIL);
+ p = getCluster().exec(Admin.class, "check", "run",
tableLocksCheck.name());
+ assertEquals(5, p.getProcess().waitFor());
+ out = p.readStdOut();
+ assertTrue(
+ out.contains("Some table and namespace locks are INVALID (the
table/namespace DNE)"));
+ assertTrue(out.contains("Check TABLE_LOCKS completed with status
FAILED"));
+ assertNoOtherChecksRan(out, false, tableLocksCheck);
}
}
@Test
- public void testPassingMetadataTableCheck() throws Exception {
- // Tests the METADATA_TABLE check in the case where all checks pass
+ public void testMetadataTableCheck() throws Exception {
Admin.CheckCommand.Check metaTableCheck =
Admin.CheckCommand.Check.METADATA_TABLE;
+ String table = getUniqueNames(1)[0];
+
+ try (AccumuloClient client =
Accumulo.newClient().from(getClientProperties()).build()) {
+ client.tableOperations().create(table);
+
+ ReadWriteIT.ingest(client, 10, 10, 10, 0, table);
+ client.tableOperations().flush(table, null, null, true);
+ }
- // no extra setup needed, just check the metadata table
+ // test passing case
var p = getCluster().exec(Admin.class, "check", "run",
metaTableCheck.name());
assertEquals(0, p.getProcess().waitFor());
String out = p.readStdOut();
@@ -360,13 +388,31 @@ public class AdminCheckIT_SimpleSuite extends
SharedMiniClusterBase {
assertTrue(out.contains("Looking for invalid columns"));
assertTrue(out.contains("Check METADATA_TABLE completed with status OK"));
assertNoOtherChecksRan(out, false, metaTableCheck);
+
+ // test a failing case
+ // delete a required column for the metadata of the table we created
+ final var context = getCluster().getServerContext();
+ final String tableId = context.tableOperations().tableIdMap().get(table);
+ final String tablet = tableId + "<";
+ try (var writer =
context.createBatchWriter(SystemTables.METADATA.tableName())) {
+ var mut = new Mutation(tablet);
+
mut.putDelete(MetadataSchema.TabletsSection.ServerColumnFamily.TIME_COLUMN.getColumnFamily(),
+
MetadataSchema.TabletsSection.ServerColumnFamily.TIME_COLUMN.getColumnQualifier());
+ writer.addMutation(mut);
+ }
+ p = getCluster().exec(Admin.class, "check", "run", metaTableCheck.name());
+ assertEquals(5, p.getProcess().waitFor());
+ out = p.readStdOut();
+ assertTrue(out.contains("Tablet " + tablet + " is missing required
columns"));
+ assertTrue(out.contains("Check METADATA_TABLE completed with status
FAILED"));
+ assertNoOtherChecksRan(out, false, metaTableCheck);
}
@Test
- public void testPassingRootTableCheck() throws Exception {
- // Tests the ROOT_TABLE check in the case where all checks pass
+ public void testRootTableCheck() throws Exception {
Admin.CheckCommand.Check rootTableCheck =
Admin.CheckCommand.Check.ROOT_TABLE;
+ // test passing case
// no extra setup needed, just check the root table
var p = getCluster().exec(Admin.class, "check", "run",
rootTableCheck.name());
assertEquals(0, p.getProcess().waitFor());
@@ -377,13 +423,31 @@ public class AdminCheckIT_SimpleSuite extends
SharedMiniClusterBase {
assertTrue(out.contains("Looking for invalid columns"));
assertTrue(out.contains("Check ROOT_TABLE completed with status OK"));
assertNoOtherChecksRan(out, false, rootTableCheck);
+
+ // test a failing case
+ // delete a required column for the metadata of the metadata table
+ final var context = getCluster().getServerContext();
+ final String tableId = SystemTables.METADATA.tableId().canonical();
+ final String tablet = tableId + "<";
+ try (var writer =
context.createBatchWriter(SystemTables.ROOT.tableName())) {
+ var mut = new Mutation(tablet);
+
mut.putDelete(MetadataSchema.TabletsSection.ServerColumnFamily.TIME_COLUMN.getColumnFamily(),
+
MetadataSchema.TabletsSection.ServerColumnFamily.TIME_COLUMN.getColumnQualifier());
+ writer.addMutation(mut);
+ }
+ p = getCluster().exec(Admin.class, "check", "run", rootTableCheck.name());
+ assertEquals(5, p.getProcess().waitFor());
+ out = p.readStdOut();
+ assertTrue(out.contains("Tablet " + tablet + " is missing required
columns"));
+ assertTrue(out.contains("Check ROOT_TABLE completed with status FAILED"));
+ assertNoOtherChecksRan(out, false, rootTableCheck);
}
@Test
- public void testPassingRootMetadataCheck() throws Exception {
- // Tests the ROOT_TABLE check in the case where all checks pass
+ public void testRootMetadataCheck() throws Exception {
Admin.CheckCommand.Check rootMetaCheck =
Admin.CheckCommand.Check.ROOT_METADATA;
+ // test passing case
// no extra setup needed, just check the root table metadata
var p = getCluster().exec(Admin.class, "check", "run",
rootMetaCheck.name());
assertEquals(0, p.getProcess().waitFor());
@@ -393,27 +457,67 @@ public class AdminCheckIT_SimpleSuite extends
SharedMiniClusterBase {
assertTrue(out.contains("Looking for invalid columns"));
assertTrue(out.contains("Check ROOT_METADATA completed with status OK"));
assertNoOtherChecksRan(out, false, rootMetaCheck);
+
+ // test a failing case
+ // delete a required column for the metadata of the root tablet
+ final var context = getCluster().getServerContext();
+ final var zrw = context.getZooSession().asReaderWriter();
+ var json = new String(zrw.getData(RootTable.ZROOT_TABLET), UTF_8);
+ var rtm = new RootTabletMetadata(json);
+ var tablet = rtm.toKeyValues().firstKey().getRow();
+ var mut = new Mutation(tablet);
+
mut.putDelete(MetadataSchema.TabletsSection.ServerColumnFamily.TIME_COLUMN.getColumnFamily(),
+
MetadataSchema.TabletsSection.ServerColumnFamily.TIME_COLUMN.getColumnQualifier());
+ rtm.update(mut);
+ zrw.putPersistentData(RootTable.ZROOT_TABLET, rtm.toJson().getBytes(UTF_8),
+ ZooUtil.NodeExistsPolicy.OVERWRITE);
+
+ p = getCluster().exec(Admin.class, "check", "run", rootMetaCheck.name());
+ assertEquals(5, p.getProcess().waitFor());
+ out = p.readStdOut();
+ assertTrue(out.contains("Tablet " + tablet + " is missing required
columns"));
+ assertTrue(out.contains("Check ROOT_METADATA completed with status
FAILED"));
+ assertNoOtherChecksRan(out, false, rootMetaCheck);
}
@Test
- public void testPassingSystemFilesCheck() throws Exception {
- // Tests the SYSTEM_FILES check in the case where it should pass
+ public void testSystemFilesCheck() throws Exception {
Admin.CheckCommand.Check sysFilesCheck =
Admin.CheckCommand.Check.SYSTEM_FILES;
+ // test passing case
// no extra setup needed, just run the check
var p = getCluster().exec(Admin.class, "check", "run",
sysFilesCheck.name());
assertEquals(0, p.getProcess().waitFor());
String out = p.readStdOut();
assertTrue(Pattern.compile("missing files: 0, total files:
[1-9]+").matcher(out).find());
+ assertTrue(out.contains("Check SYSTEM_FILES completed with status OK"));
+ assertNoOtherChecksRan(out, false, sysFilesCheck);
+
+ // test a failing case
+ // read the root table to find where the metadata table rfile is located
in HDFS then delete it
+ Path path;
+ ServerContext context = getCluster().getServerContext();
+ try (var scanner = context.createScanner(SystemTables.ROOT.tableName(),
Authorizations.EMPTY)) {
+
scanner.fetchColumnFamily(MetadataSchema.TabletsSection.DataFileColumnFamily.NAME);
+ var pathJsonData =
scanner.iterator().next().getKey().getColumnQualifier().toString();
+ path = new Path(StoredTabletFile.of(pathJsonData).getMetadataPath());
+ getCluster().getServerContext().getVolumeManager().delete(path);
+ }
+ p = getCluster().exec(Admin.class, "check", "run", sysFilesCheck.name());
+ assertEquals(5, p.getProcess().waitFor());
+ out = p.readStdOut();
+ assertTrue(out.contains("File " + path + " is missing"));
+ assertTrue(Pattern.compile("missing files: 1, total files:
[1-9]+").matcher(out).find());
+ assertTrue(out.contains("Check SYSTEM_FILES completed with status
FAILED"));
assertNoOtherChecksRan(out, false, sysFilesCheck);
}
@Test
- public void testPassingUserFilesCheck() throws Exception {
- // Tests the USER_FILES check in the case where it should pass
+ public void testUserFilesCheck() throws Exception {
Admin.CheckCommand.Check userFilesCheck =
Admin.CheckCommand.Check.USER_FILES;
- try (AccumuloClient client =
Accumulo.newClient().from(getClientProps()).build()) {
+ try (AccumuloClient client =
Accumulo.newClient().from(getClientProperties()).build()) {
+ // test passing case
// create a table, insert some data, and flush so there's a file to check
String table = getUniqueNames(1)[0];
client.tableOperations().create(table);
@@ -424,11 +528,123 @@ public class AdminCheckIT_SimpleSuite extends
SharedMiniClusterBase {
assertEquals(0, p.getProcess().waitFor());
String out = p.readStdOut();
assertTrue(Pattern.compile("missing files: 0, total files:
[1-9]+").matcher(out).find());
+ assertTrue(out.contains("Check USER_FILES completed with status OK"));
+ assertNoOtherChecksRan(out, false, userFilesCheck);
+
+ // test a failing case
+ // read the metadata for the table to find where the rfile is located in
HDFS then delete it
+ Path path;
+ try (var scanner =
+ client.createScanner(SystemTables.METADATA.tableName(),
Authorizations.EMPTY)) {
+
scanner.fetchColumnFamily(MetadataSchema.TabletsSection.DataFileColumnFamily.NAME);
+ var pathJsonData =
scanner.iterator().next().getKey().getColumnQualifier().toString();
+ path = new Path(StoredTabletFile.of(pathJsonData).getMetadataPath());
+ getCluster().getServerContext().getVolumeManager().delete(path);
+ }
+ p = getCluster().exec(Admin.class, "check", "run",
userFilesCheck.name());
+ assertEquals(5, p.getProcess().waitFor());
+ out = p.readStdOut();
+ assertTrue(out.contains("File " + path + " is missing"));
+ assertTrue(Pattern.compile("missing files: 1, total files:
[1-9]+").matcher(out).find());
+ assertTrue(out.contains("Check USER_FILES completed with status
FAILED"));
assertNoOtherChecksRan(out, false, userFilesCheck);
}
}
- // TODO 4892 need failing tests...
+  /**
+   * Exercises the SYSTEM_CONFIG check twice: first with nothing tampered with, where it is
+   * expected to exit with 0 and report OK, then after the ZooKeeper node of the metadata table
+   * has been deleted, where it is expected to exit with 5 and report FAILED.
+   */
+  @Test
+  public void testSystemConfigCheck() throws Exception {
+    final Admin.CheckCommand.Check check = Admin.CheckCommand.Check.SYSTEM_CONFIG;
+
+    // passing case
+    var passingRun = getCluster().exec(Admin.class, "check", "run", check.name());
+    assertEquals(0, passingRun.getProcess().waitFor());
+    final String passingOut = passingRun.readStdOut();
+    for (String expected : List.of("Checking ZooKeeper locks for Accumulo server processes",
+        "Checking ZooKeeper table nodes", "Checking that WAL metadata in ZooKeeper is valid",
+        "Check SYSTEM_CONFIG completed with status OK")) {
+      assertTrue(passingOut.contains(expected));
+    }
+    assertNoOtherChecksRan(passingOut, false, check);
+
+    // failing case: remove the ZK data for the metadata table
+    var zooReaderWriter = getCluster().getServerContext().getZooSession().asReaderWriter();
+    final String metadataTableZPath = Constants.ZTABLES + "/" + SystemTables.METADATA.tableId();
+    zooReaderWriter.recursiveDelete(metadataTableZPath, ZooUtil.NodeMissingPolicy.FAIL);
+
+    var failingRun = getCluster().exec(Admin.class, "check", "run", check.name());
+    assertEquals(5, failingRun.getProcess().waitFor());
+    final String failingOut = failingRun.readStdOut();
+    var missingTable =
+        Map.entry(SystemTables.METADATA.tableName(), SystemTables.METADATA.tableId());
+    assertTrue(failingOut.contains("Failed to find table (" + missingTable + ")"));
+    assertTrue(failingOut.contains("Check SYSTEM_CONFIG completed with status FAILED"));
+    assertNoOtherChecksRan(failingOut, false, check);
+  }
+
+ @Test
+ public void testSystemConfigCheck2() throws Exception {
+ // test a failing case
+ // delete a WAL in HDFS that is referenced in ZK
+
+ Admin.CheckCommand.Check sysConfCheck =
Admin.CheckCommand.Check.SYSTEM_CONFIG;
+ var context = getCluster().getServerContext();
+ var zrw = context.getZooSession().asReaderWriter();
+ var rootWalsDir = WalStateManager.ZWALS;
+
+ // Need to ensure some form of mutation happens to a table so at least one
WAL exists for us
+ // to delete. This call creates some data in the metadata table
+
getCluster().getServerContext().tableOperations().create(getUniqueNames(1)[0]);
+
+ // need to find a TServer with a WAL, so we can delete the WAL from DFS
+ String fullWalPathZk = null;
+ TServerInstance tServerInstance = null;
+ var tserversIter = zrw.getChildren(rootWalsDir).stream().iterator();
+ outer: while (tserversIter.hasNext()) {
+ var tserverInstanceStr = tserversIter.next();
+ var walPaths = zrw.getChildren(rootWalsDir + "/" + tserverInstanceStr);
+ if (!walPaths.isEmpty()) {
+ var walPathsIter = walPaths.iterator();
+ while (walPathsIter.hasNext()) {
+ var walPath = walPathsIter.next();
+ if (zrw.getData(rootWalsDir + "/" + tserverInstanceStr + "/" +
walPath) != null) {
+ fullWalPathZk = rootWalsDir + "/" + tserverInstanceStr + "/" +
walPath;
+ tServerInstance = new TServerInstance(tserverInstanceStr);
+ break outer;
+ }
+ }
+ }
+ }
+ assertNotNull(fullWalPathZk, "Could not find a WAL in ZK");
+ var wal = WalStateManager.parse(zrw.getData(fullWalPathZk));
+
+ // delete from HDFS
+ context.getVolumeManager().delete(wal.getSecond());
+
+ var p = getCluster().exec(Admin.class, "check", "run",
sysConfCheck.name());
+ assertEquals(5, p.getProcess().waitFor());
+ var out = p.readStdOut();
+ assertTrue(out.contains(
+ "WAL metadata for tserver " + tServerInstance + " references a WAL
that does not exist"));
+ assertTrue(out.contains("Check SYSTEM_CONFIG completed with status
FAILED"));
+ assertNoOtherChecksRan(out, false, sysConfCheck);
+ }
+
+  /**
+   * Runs the SERVER_CONFIG check on an unmodified cluster and expects exit code 0, the three
+   * progress messages and an OK status. Only the passing case is covered.
+   */
+  @Test
+  public void testServerConfigCheck() throws Exception {
+    final Admin.CheckCommand.Check check = Admin.CheckCommand.Check.SERVER_CONFIG;
+
+    // passing case
+    var run = getCluster().exec(Admin.class, "check", "run", check.name());
+    assertEquals(0, run.getProcess().waitFor());
+    final String stdout = run.readStdOut();
+    for (String expected : List.of("Checking server configuration",
+        "Checking that all configured properties are valid",
+        "Checking that all required config properties are present",
+        "Check SERVER_CONFIG completed with status OK")) {
+      assertTrue(stdout.contains(expected));
+    }
+    assertNoOtherChecksRan(stdout, false, check);
+
+    // no simple way to test for a failure case
+  }
private String executeCheckCommand(String[] checkCmdArgs, boolean[]
checksPass) {
String output;
@@ -511,6 +727,17 @@ public class AdminCheckIT_SimpleSuite extends
SharedMiniClusterBase {
}
}
+  /** Dummy check runner that reports itself as the SERVER_CONFIG check. */
+  static class DummyServerConfigCheckRunner extends DummyCheckRunner {
+
+    /**
+     * @param passes forwarded unchanged to the {@code DummyCheckRunner} constructor
+     */
+    public DummyServerConfigCheckRunner(final boolean passes) {
+      super(passes);
+    }
+
+    /** Identifies this runner as the one for {@code Check.SERVER_CONFIG}. */
+    @Override
+    public Admin.CheckCommand.Check getCheck() {
+      return Admin.CheckCommand.Check.SERVER_CONFIG;
+    }
+  }
+
+
static class DummyTableLocksCheckRunner extends DummyCheckRunner {
public DummyTableLocksCheckRunner(boolean passes) {
super(passes);
@@ -584,15 +811,17 @@ public class AdminCheckIT_SimpleSuite extends
SharedMiniClusterBase {
this.checkRunners = new TreeMap<>();
this.checkRunners.put(Check.SYSTEM_CONFIG,
() -> new DummySystemConfigCheckRunner(checksPass[0]));
- this.checkRunners.put(Check.TABLE_LOCKS, () -> new
DummyTableLocksCheckRunner(checksPass[1]));
+ this.checkRunners.put(Check.SERVER_CONFIG,
+ () -> new DummyServerConfigCheckRunner(checksPass[1]));
+ this.checkRunners.put(Check.TABLE_LOCKS, () -> new
DummyTableLocksCheckRunner(checksPass[2]));
this.checkRunners.put(Check.ROOT_METADATA,
- () -> new DummyRootMetadataCheckRunner(checksPass[2]));
- this.checkRunners.put(Check.ROOT_TABLE, () -> new
DummyRootTableCheckRunner(checksPass[3]));
+ () -> new DummyRootMetadataCheckRunner(checksPass[3]));
+ this.checkRunners.put(Check.ROOT_TABLE, () -> new
DummyRootTableCheckRunner(checksPass[4]));
this.checkRunners.put(Check.METADATA_TABLE,
- () -> new DummyMetadataTableCheckRunner(checksPass[4]));
+ () -> new DummyMetadataTableCheckRunner(checksPass[5]));
this.checkRunners.put(Check.SYSTEM_FILES,
- () -> new DummySystemFilesCheckRunner(checksPass[5]));
- this.checkRunners.put(Check.USER_FILES, () -> new
DummyUserFilesCheckRunner(checksPass[6]));
+ () -> new DummySystemFilesCheckRunner(checksPass[6]));
+ this.checkRunners.put(Check.USER_FILES, () -> new
DummyUserFilesCheckRunner(checksPass[7]));
}
@Override
diff --git
a/test/src/main/java/org/apache/accumulo/test/start/KeywordStartIT.java
b/test/src/main/java/org/apache/accumulo/test/start/KeywordStartIT.java
index a917ea9ee3..f3798a2a90 100644
--- a/test/src/main/java/org/apache/accumulo/test/start/KeywordStartIT.java
+++ b/test/src/main/java/org/apache/accumulo/test/start/KeywordStartIT.java
@@ -54,7 +54,6 @@ import org.apache.accumulo.monitor.Monitor;
import org.apache.accumulo.monitor.MonitorExecutable;
import org.apache.accumulo.server.conf.CheckAccumuloProperties;
import org.apache.accumulo.server.conf.CheckCompactionConfig;
-import org.apache.accumulo.server.conf.CheckServerConfig;
import org.apache.accumulo.server.conf.util.ZooInfoViewer;
import org.apache.accumulo.server.conf.util.ZooPropEditor;
import org.apache.accumulo.server.init.Initialize;
@@ -130,7 +129,6 @@ public class KeywordStartIT {
TreeMap<String,Class<? extends KeywordExecutable>> expectSet = new
TreeMap<>();
expectSet.put("admin", Admin.class);
expectSet.put("check-compaction-config", CheckCompactionConfig.class);
- expectSet.put("check-server-config", CheckServerConfig.class);
expectSet.put("check-accumulo-properties", CheckAccumuloProperties.class);
expectSet.put("compactor", CompactorExecutable.class);
expectSet.put("create-empty", CreateEmpty.class);
@@ -200,7 +198,6 @@ public class KeywordStartIT {
HashSet<Class<?>> expectSet = new HashSet<>();
expectSet.add(Admin.class);
expectSet.add(CheckCompactionConfig.class);
- expectSet.add(CheckServerConfig.class);
expectSet.add(CreateEmpty.class);
expectSet.add(CreateToken.class);
expectSet.add(DumpZookeeper.class);