kevinrr888 commented on code in PR #5348: URL: https://github.com/apache/accumulo/pull/5348#discussion_r1965734455
########## server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/ServerConfigCheckRunner.java: ########## @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.accumulo.server.util.checkCommand; + +import java.util.HashMap; +import java.util.Map; +import java.util.Set; + +import org.apache.accumulo.core.conf.Property; +import org.apache.accumulo.server.ServerContext; +import org.apache.accumulo.server.cli.ServerUtilOpts; +import org.apache.accumulo.server.util.Admin; + +public class ServerConfigCheckRunner implements CheckRunner { + private static final Admin.CheckCommand.Check check = Admin.CheckCommand.Check.SERVER_CONFIG; + + @Override + public Admin.CheckCommand.CheckStatus runCheck(ServerContext context, ServerUtilOpts opts, + boolean fixFiles) throws Exception { + Admin.CheckCommand.CheckStatus status = Admin.CheckCommand.CheckStatus.OK; + printRunning(); + + log.trace("********** Checking server configuration **********"); + + log.trace("Checking that all configured properties are valid (valid key and value)"); + final Map<String,String> definedProps = new HashMap<>(); + final var config = context.getConfiguration(); + config.getProperties(definedProps, s -> true); + for (var entry : definedProps.entrySet()) { + var key = entry.getKey(); + var val = entry.getValue(); + if (!Property.isValidProperty(key, val)) { + log.warn("Invalid property (key={} val={}) found in the config", key, val); + status = Admin.CheckCommand.CheckStatus.FAILED; + } + } + + log.trace("Checking that all required config properties are present"); + // there are many properties that should be set (default value or user set), identifying them + // all and checking them here is unrealistic. Some property that is not set but is expected + // will likely result in some sort of failure eventually anyway. We will just check a few + // obvious required properties here. + Set<Property> requiredProps = Set.of(Property.INSTANCE_ZK_HOST, Property.INSTANCE_ZK_TIMEOUT, + Property.INSTANCE_SECRET, Property.INSTANCE_VOLUMES, Property.GENERAL_THREADPOOL_SIZE, + Property.GENERAL_DELEGATION_TOKEN_LIFETIME, + Property.GENERAL_DELEGATION_TOKEN_UPDATE_INTERVAL, Property.GENERAL_IDLE_PROCESS_INTERVAL, + Property.GENERAL_LOW_MEM_DETECTOR_INTERVAL, Property.GENERAL_LOW_MEM_DETECTOR_THRESHOLD, + Property.GENERAL_PROCESS_BIND_ADDRESS, Property.GENERAL_SERVER_LOCK_VERIFICATION_INTERVAL, + Property.MANAGER_CLIENTPORT, Property.TSERV_CLIENTPORT, Property.GC_CYCLE_START, + Property.GC_CYCLE_DELAY, Property.GC_PORT, Property.MONITOR_PORT, Property.TABLE_MAJC_RATIO, + Property.TABLE_SPLIT_THRESHOLD); Review Comment: any other important ones I left out? Any I shouldn't have included? ########## server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/SystemConfigCheckRunner.java: ########## @@ -18,22 +18,230 @@ */ package org.apache.accumulo.server.util.checkCommand; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import org.apache.accumulo.core.Constants; +import org.apache.accumulo.core.lock.ServiceLock; +import org.apache.accumulo.core.metadata.AccumuloTable; +import org.apache.accumulo.core.metadata.TServerInstance; +import org.apache.accumulo.core.metadata.schema.TabletMetadata; +import org.apache.accumulo.core.zookeeper.ZooSession; import org.apache.accumulo.server.ServerContext; import org.apache.accumulo.server.cli.ServerUtilOpts; +import org.apache.accumulo.server.log.WalStateManager; import org.apache.accumulo.server.util.Admin; public class SystemConfigCheckRunner implements CheckRunner { private static final Admin.CheckCommand.Check check = Admin.CheckCommand.Check.SYSTEM_CONFIG; + public enum ServerProcess { + MANAGER, GC, TSERVER, COMPACTION_COORDINATOR, COMPACTOR, MONITOR, SCAN_SERVER + } + @Override public Admin.CheckCommand.CheckStatus runCheck(ServerContext context, ServerUtilOpts opts, boolean fixFiles) throws Exception { Admin.CheckCommand.CheckStatus status = Admin.CheckCommand.CheckStatus.OK; printRunning(); + + log.trace("********** Checking validity of some ZooKeeper nodes **********"); + status = checkZkNodes(context, status); + printCompleted(status); return status; } + private static Admin.CheckCommand.CheckStatus checkZkNodes(ServerContext context, + Admin.CheckCommand.CheckStatus status) throws Exception { + status = checkZKLocks(context, status); + status = checkZKTableNodes(context, status); + status = checkZKWALsMetadata(context, status); + + return status; + } + + private static Admin.CheckCommand.CheckStatus checkZKLocks(ServerContext context, + Admin.CheckCommand.CheckStatus status) throws Exception { + final ServerProcess[] serverProcesses = ServerProcess.values(); + final String zkRoot = context.getZooKeeperRoot(); + final var zs = context.getZooSession(); + final var zrw = zs.asReaderWriter(); + final var compactors = context.instanceOperations().getCompactors(); + final var sservers = context.instanceOperations().getScanServers(); + + log.trace("Checking ZooKeeper locks for Accumulo server processes..."); + + // check that essential server processes have a ZK lock failing otherwise + // check that nonessential server processes have a ZK lock only if they are running. If they are + // not running, alerts the user that the process is not running which may or may not be expected + for (ServerProcess proc : serverProcesses) { + log.trace("Looking for {} lock(s)...", proc); + switch (proc) { + case MANAGER: + // essential process + status = checkLock(zkRoot + Constants.ZMANAGER_LOCK, proc, true, zs, status); + break; + case GC: + // essential process + status = checkLock(zkRoot + Constants.ZGC_LOCK, proc, true, zs, status); + break; + case TSERVER: + // essential process(es) + final var tservers = TabletMetadata.getLiveTServers(context); + if (tservers.isEmpty()) { + log.warn("Did not find any running tablet servers!"); + status = Admin.CheckCommand.CheckStatus.FAILED; + } + break; + case COMPACTION_COORDINATOR: + // nonessential process + status = checkLock(zkRoot + Constants.ZCOORDINATOR_LOCK, proc, false, zs, status); + break; + case COMPACTOR: + // nonessential process(es) + if (compactors.isEmpty()) { + log.debug("No compactors appear to be running... This may or may not be expected"); + } + for (String compactor : compactors) { + // for each running compactor, ensure a zk lock exists for it + boolean checkedLock = false; + String compactorQueuesPath = zkRoot + Constants.ZCOMPACTORS; + var compactorQueues = zrw.getChildren(compactorQueuesPath); + // find the queue the compactor is in + for (var queue : compactorQueues) { + String compactorQueuePath = compactorQueuesPath + "/" + queue; + String lockPath = compactorQueuePath + "/" + compactor; + if (zrw.exists(lockPath)) { + status = checkLock(lockPath, proc, true, zs, status); + checkedLock = true; + break; + } + } + if (!checkedLock) { + log.warn("Did not find a ZooKeeper lock for the compactor {}!", compactor); + status = Admin.CheckCommand.CheckStatus.FAILED; + } + } + break; + case MONITOR: + // nonessential process + status = checkLock(zkRoot + Constants.ZMONITOR_LOCK, proc, false, zs, status); + break; + case SCAN_SERVER: + // nonessential process(es) + if (sservers.isEmpty()) { + log.debug("No scan servers appear to be running... This may or may not be expected"); + } + for (String sserver : sservers) { + status = + checkLock(zkRoot + Constants.ZSSERVERS + "/" + sserver, proc, true, zs, status); + } + break; + default: + throw new IllegalStateException("Unhandled case: " + proc); + } + } Review Comment: I imagine some of the non essential processes here in 3.x are essential in 4.x. I believe COMPACTION_COORDINATOR, COMPACTOR, SCAN_SERVER are all essential in 4.x? Is this correct? Will need to change this when merging into 4.x if so. ########## server/base/src/main/java/org/apache/accumulo/server/conf/CheckServerConfig.java: ########## @@ -1,51 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.accumulo.server.conf; - -import org.apache.accumulo.core.conf.SiteConfiguration; -import org.apache.accumulo.server.ServerContext; -import org.apache.accumulo.start.spi.KeywordExecutable; - -import com.google.auto.service.AutoService; - -@AutoService(KeywordExecutable.class) -public class CheckServerConfig implements KeywordExecutable { Review Comment: I deleted CheckServerConfig since now this is checked through this SERVER_CONFIG check: the ServerContext object is constructed the same (see ServerUtilOpts) and we end up calling context.getConfiguration() in the SERVER_CONFIG check, so the same checks are done with a few additions. Maybe we don't want to completely get rid of this check yet. Maybe it would be better to have the check just output that the check has moved under `accumulo admin check`? ########## server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/SystemConfigCheckRunner.java: ########## @@ -18,22 +18,230 @@ */ package org.apache.accumulo.server.util.checkCommand; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import org.apache.accumulo.core.Constants; +import org.apache.accumulo.core.lock.ServiceLock; +import org.apache.accumulo.core.metadata.AccumuloTable; +import org.apache.accumulo.core.metadata.TServerInstance; +import org.apache.accumulo.core.metadata.schema.TabletMetadata; +import org.apache.accumulo.core.zookeeper.ZooSession; import org.apache.accumulo.server.ServerContext; import org.apache.accumulo.server.cli.ServerUtilOpts; +import org.apache.accumulo.server.log.WalStateManager; import org.apache.accumulo.server.util.Admin; public class SystemConfigCheckRunner implements CheckRunner { private static final Admin.CheckCommand.Check check = Admin.CheckCommand.Check.SYSTEM_CONFIG; + public enum ServerProcess { + MANAGER, GC, TSERVER, COMPACTION_COORDINATOR, COMPACTOR, MONITOR, SCAN_SERVER + } Review Comment: Is there already an existing enum? Considered ServiceLockData.ThriftService but that isn't quite what I'm looking for (missing a type for ScanServer and Monitor for example). ServerType in accumulo-minicluster is what I'm looking for but wouldn't make sense to use a minicluster class here. Is there an enum containing all the types of servers an instance may be running? ########## test/src/main/java/org/apache/accumulo/test/AdminCheckIT.java: ########## @@ -34,19 +35,29 @@ import java.util.Set; import java.util.TreeMap; import java.util.function.Supplier; -import java.util.regex.Pattern; Review Comment: General comment about `AdminCheckIT`: considered changing from ConfigurableMacBase to SharedMiniCluserBase for this test from suggestion of @dlmarion. Prior to these new changes, SharedMiniClusterBase was sufficient and much faster, but some of these tests now do damage to the MAC to test for failure cases of the checks (e.g., deleting essential files, messing with metadata) ########## server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/SystemConfigCheckRunner.java: ########## @@ -18,22 +18,230 @@ */ package org.apache.accumulo.server.util.checkCommand; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import org.apache.accumulo.core.Constants; +import org.apache.accumulo.core.lock.ServiceLock; +import org.apache.accumulo.core.metadata.AccumuloTable; +import org.apache.accumulo.core.metadata.TServerInstance; +import org.apache.accumulo.core.metadata.schema.TabletMetadata; +import org.apache.accumulo.core.zookeeper.ZooSession; import org.apache.accumulo.server.ServerContext; import org.apache.accumulo.server.cli.ServerUtilOpts; +import org.apache.accumulo.server.log.WalStateManager; import org.apache.accumulo.server.util.Admin; public class SystemConfigCheckRunner implements CheckRunner { private static final Admin.CheckCommand.Check check = Admin.CheckCommand.Check.SYSTEM_CONFIG; + public enum ServerProcess { + MANAGER, GC, TSERVER, COMPACTION_COORDINATOR, COMPACTOR, MONITOR, SCAN_SERVER + } + @Override public Admin.CheckCommand.CheckStatus runCheck(ServerContext context, ServerUtilOpts opts, boolean fixFiles) throws Exception { Admin.CheckCommand.CheckStatus status = Admin.CheckCommand.CheckStatus.OK; printRunning(); + + log.trace("********** Checking validity of some ZooKeeper nodes **********"); + status = checkZkNodes(context, status); + printCompleted(status); return status; } + private static Admin.CheckCommand.CheckStatus checkZkNodes(ServerContext context, + Admin.CheckCommand.CheckStatus status) throws Exception { + status = checkZKLocks(context, status); + status = checkZKTableNodes(context, status); + status = checkZKWALsMetadata(context, status); + + return status; + } + + private static Admin.CheckCommand.CheckStatus checkZKLocks(ServerContext context, + Admin.CheckCommand.CheckStatus status) throws Exception { + final ServerProcess[] serverProcesses = ServerProcess.values(); + final String zkRoot = context.getZooKeeperRoot(); + final var zs = context.getZooSession(); + final var zrw = zs.asReaderWriter(); + final var compactors = context.instanceOperations().getCompactors(); + final var sservers = context.instanceOperations().getScanServers(); Review Comment: Looking back at this code again: this gets the compactors and sservers from scanning ZK then validating the lock node... I hadn't considered how this was gathering the data. Thinking most or all of the code for the cases for COMPACTOR and SCAN_SERVER below can be removed. Thinking the new cases would look something like: ``` case COMPACTOR: var compactors = context.instanceOperations().getCompactors(); if (compactors.isEmpty()) { // warn user (debug) that no compactors were found to be running which may or may not be expected. } break; ``` ########## server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/RootTableCheckRunner.java: ########## @@ -51,8 +51,7 @@ public TableId tableId() { public Set<ColumnFQ> requiredColFQs() { return Set.of(MetadataSchema.TabletsSection.TabletColumnFamily.PREV_ROW_COLUMN, MetadataSchema.TabletsSection.ServerColumnFamily.DIRECTORY_COLUMN, - MetadataSchema.TabletsSection.ServerColumnFamily.TIME_COLUMN, - MetadataSchema.TabletsSection.ServerColumnFamily.LOCK_COLUMN); + MetadataSchema.TabletsSection.ServerColumnFamily.TIME_COLUMN); } Review Comment: Do the required col FQs and col fams in RootTableCheckRunner, RootMetadataCheckRunner, and MetadataTableCheckRunner look correct now? If these look good, since these are all equivalent now, can push these up to no longer override and return the same thing... ########## server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/SystemConfigCheckRunner.java: ########## @@ -18,22 +18,230 @@ */ package org.apache.accumulo.server.util.checkCommand; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import org.apache.accumulo.core.Constants; +import org.apache.accumulo.core.lock.ServiceLock; +import org.apache.accumulo.core.metadata.AccumuloTable; +import org.apache.accumulo.core.metadata.TServerInstance; +import org.apache.accumulo.core.metadata.schema.TabletMetadata; +import org.apache.accumulo.core.zookeeper.ZooSession; import org.apache.accumulo.server.ServerContext; import org.apache.accumulo.server.cli.ServerUtilOpts; +import org.apache.accumulo.server.log.WalStateManager; import org.apache.accumulo.server.util.Admin; public class SystemConfigCheckRunner implements CheckRunner { private static final Admin.CheckCommand.Check check = Admin.CheckCommand.Check.SYSTEM_CONFIG; + public enum ServerProcess { + MANAGER, GC, TSERVER, COMPACTION_COORDINATOR, COMPACTOR, MONITOR, SCAN_SERVER + } + @Override public Admin.CheckCommand.CheckStatus runCheck(ServerContext context, ServerUtilOpts opts, boolean fixFiles) throws Exception { Admin.CheckCommand.CheckStatus status = Admin.CheckCommand.CheckStatus.OK; printRunning(); + + log.trace("********** Checking validity of some ZooKeeper nodes **********"); + status = checkZkNodes(context, status); + printCompleted(status); return status; } + private static Admin.CheckCommand.CheckStatus checkZkNodes(ServerContext context, + Admin.CheckCommand.CheckStatus status) throws Exception { + status = checkZKLocks(context, status); + status = checkZKTableNodes(context, status); + status = checkZKWALsMetadata(context, status); + + return status; + } + + private static Admin.CheckCommand.CheckStatus checkZKLocks(ServerContext context, + Admin.CheckCommand.CheckStatus status) throws Exception { + final ServerProcess[] serverProcesses = ServerProcess.values(); + final String zkRoot = context.getZooKeeperRoot(); + final var zs = context.getZooSession(); + final var zrw = zs.asReaderWriter(); + final var compactors = context.instanceOperations().getCompactors(); + final var sservers = context.instanceOperations().getScanServers(); + + log.trace("Checking ZooKeeper locks for Accumulo server processes..."); + + // check that essential server processes have a ZK lock failing otherwise + // check that nonessential server processes have a ZK lock only if they are running. If they are + // not running, alerts the user that the process is not running which may or may not be expected + for (ServerProcess proc : serverProcesses) { + log.trace("Looking for {} lock(s)...", proc); + switch (proc) { + case MANAGER: + // essential process + status = checkLock(zkRoot + Constants.ZMANAGER_LOCK, proc, true, zs, status); + break; + case GC: + // essential process + status = checkLock(zkRoot + Constants.ZGC_LOCK, proc, true, zs, status); + break; + case TSERVER: + // essential process(es) + final var tservers = TabletMetadata.getLiveTServers(context); + if (tservers.isEmpty()) { + log.warn("Did not find any running tablet servers!"); + status = Admin.CheckCommand.CheckStatus.FAILED; + } + break; + case COMPACTION_COORDINATOR: + // nonessential process + status = checkLock(zkRoot + Constants.ZCOORDINATOR_LOCK, proc, false, zs, status); + break; + case COMPACTOR: + // nonessential process(es) + if (compactors.isEmpty()) { + log.debug("No compactors appear to be running... This may or may not be expected"); + } + for (String compactor : compactors) { + // for each running compactor, ensure a zk lock exists for it + boolean checkedLock = false; + String compactorQueuesPath = zkRoot + Constants.ZCOMPACTORS; + var compactorQueues = zrw.getChildren(compactorQueuesPath); + // find the queue the compactor is in + for (var queue : compactorQueues) { + String compactorQueuePath = compactorQueuesPath + "/" + queue; + String lockPath = compactorQueuePath + "/" + compactor; + if (zrw.exists(lockPath)) { + status = checkLock(lockPath, proc, true, zs, status); + checkedLock = true; + break; + } + } + if (!checkedLock) { + log.warn("Did not find a ZooKeeper lock for the compactor {}!", compactor); + status = Admin.CheckCommand.CheckStatus.FAILED; + } + } + break; + case MONITOR: + // nonessential process + status = checkLock(zkRoot + Constants.ZMONITOR_LOCK, proc, false, zs, status); + break; + case SCAN_SERVER: + // nonessential process(es) + if (sservers.isEmpty()) { + log.debug("No scan servers appear to be running... This may or may not be expected"); + } + for (String sserver : sservers) { + status = + checkLock(zkRoot + Constants.ZSSERVERS + "/" + sserver, proc, true, zs, status); + } + break; + default: + throw new IllegalStateException("Unhandled case: " + proc); + } + } + + return status; + } + + private static Admin.CheckCommand.CheckStatus checkLock(String path, ServerProcess proc, + boolean requiredProc, ZooSession zs, Admin.CheckCommand.CheckStatus status) throws Exception { + log.trace("Checking ZooKeeper lock at path {}", path); + + ServiceLock.ServiceLockPath slp = ServiceLock.path(path); + var opData = ServiceLock.getLockData(zs, slp); + if (requiredProc && opData.isEmpty()) { + log.warn("No ZooKeeper lock found for {} at {}! The process may not be running.", proc, path); + status = Admin.CheckCommand.CheckStatus.FAILED; + } else if (!requiredProc && opData.isEmpty()) { + log.debug("No ZooKeeper lock found for {} at {}. The process may not be running. " + + "This may or may not be expected.", proc, path); + } + return status; + } + + private static Admin.CheckCommand.CheckStatus checkZKTableNodes(ServerContext context, + Admin.CheckCommand.CheckStatus status) throws Exception { + log.trace("Checking ZooKeeper table nodes..."); + + final var zrw = context.getZooSession().asReaderWriter(); + final var zkRoot = context.getZooKeeperRoot(); + final var tableNameToId = context.tableOperations().tableIdMap(); + final Map<String,String> systemTableNameToId = new HashMap<>(); + for (var accumuloTable : AccumuloTable.values()) { + systemTableNameToId.put(accumuloTable.tableName(), accumuloTable.tableId().canonical()); + } + + // ensure all system tables exist + if (!tableNameToId.values().containsAll(systemTableNameToId.values())) { + log.warn( + "Missing essential Accumulo table. One or more of {} are missing from the tables found {}", + systemTableNameToId, tableNameToId); + status = Admin.CheckCommand.CheckStatus.FAILED; + } + for (var nameToId : tableNameToId.entrySet()) { + var tablePath = zkRoot + Constants.ZTABLES + "/" + nameToId.getValue(); + // expect the table path to exist and some data to exist + if (!zrw.exists(tablePath) || zrw.getChildren(tablePath).isEmpty()) { + log.warn("Failed to find table ({}) info at expected path {}", nameToId, tablePath); + status = Admin.CheckCommand.CheckStatus.FAILED; + } + } + + return status; + } + + private static Admin.CheckCommand.CheckStatus checkZKWALsMetadata(ServerContext context, + Admin.CheckCommand.CheckStatus status) throws Exception { + final String zkRoot = context.getZooKeeperRoot(); + final var zs = context.getZooSession(); + final var zrw = zs.asReaderWriter(); + final var rootWalsDir = zkRoot + WalStateManager.ZWALS; + final Set<TServerInstance> tserverInstances = TabletMetadata.getLiveTServers(context); + final Set<TServerInstance> seenTServerInstancesAtWals = new HashSet<>(); + + log.trace("Checking that WAL metadata in ZooKeeper is valid..."); + + // each child node of the root wals dir should be a TServerInstance.toString() + var tserverInstancesAtWals = zrw.getChildren(rootWalsDir); + for (var tserverInstanceAtWals : tserverInstancesAtWals) { + final TServerInstance tsi = new TServerInstance(tserverInstanceAtWals); + seenTServerInstancesAtWals.add(tsi); + final var tserverPath = rootWalsDir + "/" + tserverInstanceAtWals; + // each child node of the tserver should be WAL metadata + final var wals = zrw.getChildren(tserverPath); + if (wals.isEmpty()) { + log.debug("No WAL metadata found for tserver {}", tsi); + } + for (var wal : wals) { + // should be able to parse the WAL metadata + final var fullWalPath = tserverPath + "/" + wal; + log.trace("Attempting to parse WAL metadata at {}", fullWalPath); + var parseRes = WalStateManager.parse(zrw.getData(fullWalPath)); + log.trace("Successfully parsed WAL metadata at {} result {}", fullWalPath, parseRes); + log.trace("Checking if the WAL path {} found in the metadata exists in HDFS...", + parseRes.getSecond()); + if (!context.getVolumeManager().exists(parseRes.getSecond())) { + log.warn("WAL metadata for tserver {} references a WAL that does not exist", + tserverInstanceAtWals); + status = Admin.CheckCommand.CheckStatus.FAILED; + } Review Comment: Let me know if this part might is a bit much to be checking here ########## server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/SystemConfigCheckRunner.java: ########## @@ -18,22 +18,230 @@ */ package org.apache.accumulo.server.util.checkCommand; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import org.apache.accumulo.core.Constants; +import org.apache.accumulo.core.lock.ServiceLock; +import org.apache.accumulo.core.metadata.AccumuloTable; +import org.apache.accumulo.core.metadata.TServerInstance; +import org.apache.accumulo.core.metadata.schema.TabletMetadata; +import org.apache.accumulo.core.zookeeper.ZooSession; import org.apache.accumulo.server.ServerContext; import org.apache.accumulo.server.cli.ServerUtilOpts; +import org.apache.accumulo.server.log.WalStateManager; import org.apache.accumulo.server.util.Admin; public class SystemConfigCheckRunner implements CheckRunner { private static final Admin.CheckCommand.Check check = Admin.CheckCommand.Check.SYSTEM_CONFIG; + public enum ServerProcess { + MANAGER, GC, TSERVER, COMPACTION_COORDINATOR, COMPACTOR, MONITOR, SCAN_SERVER + } + @Override public Admin.CheckCommand.CheckStatus runCheck(ServerContext context, ServerUtilOpts opts, boolean fixFiles) throws Exception { Admin.CheckCommand.CheckStatus status = Admin.CheckCommand.CheckStatus.OK; printRunning(); + + log.trace("********** Checking validity of some ZooKeeper nodes **********"); + status = checkZkNodes(context, status); + printCompleted(status); return status; } + private static Admin.CheckCommand.CheckStatus checkZkNodes(ServerContext context, + Admin.CheckCommand.CheckStatus status) throws Exception { + status = checkZKLocks(context, status); + status = checkZKTableNodes(context, status); + status = checkZKWALsMetadata(context, status); + + return status; + } + + private static Admin.CheckCommand.CheckStatus checkZKLocks(ServerContext context, + Admin.CheckCommand.CheckStatus status) throws Exception { + final ServerProcess[] serverProcesses = ServerProcess.values(); + final String zkRoot = context.getZooKeeperRoot(); + final var zs = context.getZooSession(); + final var zrw = zs.asReaderWriter(); + final var compactors = context.instanceOperations().getCompactors(); + final var sservers = context.instanceOperations().getScanServers(); + + log.trace("Checking ZooKeeper locks for Accumulo server processes..."); + + // check that essential server processes have a ZK lock failing otherwise + // check that nonessential server processes have a ZK lock only if they are running. If they are + // not running, alerts the user that the process is not running which may or may not be expected + for (ServerProcess proc : serverProcesses) { + log.trace("Looking for {} lock(s)...", proc); + switch (proc) { + case MANAGER: + // essential process + status = checkLock(zkRoot + Constants.ZMANAGER_LOCK, proc, true, zs, status); + break; + case GC: + // essential process + status = checkLock(zkRoot + Constants.ZGC_LOCK, proc, true, zs, status); + break; + case TSERVER: + // essential process(es) + final var tservers = TabletMetadata.getLiveTServers(context); + if (tservers.isEmpty()) { + log.warn("Did not find any running tablet servers!"); + status = Admin.CheckCommand.CheckStatus.FAILED; + } + break; + case COMPACTION_COORDINATOR: + // nonessential process + status = checkLock(zkRoot + Constants.ZCOORDINATOR_LOCK, proc, false, zs, status); + break; + case COMPACTOR: + // nonessential process(es) + if (compactors.isEmpty()) { + log.debug("No compactors appear to be running... This may or may not be expected"); + } + for (String compactor : compactors) { + // for each running compactor, ensure a zk lock exists for it + boolean checkedLock = false; + String compactorQueuesPath = zkRoot + Constants.ZCOMPACTORS; + var compactorQueues = zrw.getChildren(compactorQueuesPath); + // find the queue the compactor is in + for (var queue : compactorQueues) { + String compactorQueuePath = compactorQueuesPath + "/" + queue; + String lockPath = compactorQueuePath + "/" + compactor; + if (zrw.exists(lockPath)) { + status = checkLock(lockPath, proc, true, zs, status); + checkedLock = true; + break; + } + } + if (!checkedLock) { + log.warn("Did not find a ZooKeeper lock for the compactor {}!", compactor); + status = Admin.CheckCommand.CheckStatus.FAILED; + } + } + break; + case MONITOR: + // nonessential process + status = checkLock(zkRoot + Constants.ZMONITOR_LOCK, proc, false, zs, status); + break; + case SCAN_SERVER: + // nonessential process(es) + if (sservers.isEmpty()) { + log.debug("No scan servers appear to be running... This may or may not be expected"); + } + for (String sserver : sservers) { + status = + checkLock(zkRoot + Constants.ZSSERVERS + "/" + sserver, proc, true, zs, status); + } + break; + default: + throw new IllegalStateException("Unhandled case: " + proc); + } + } + + return status; + } + + private static Admin.CheckCommand.CheckStatus checkLock(String path, ServerProcess proc, + boolean requiredProc, ZooSession zs, Admin.CheckCommand.CheckStatus status) throws Exception { + log.trace("Checking ZooKeeper lock at path {}", path); + + ServiceLock.ServiceLockPath slp = ServiceLock.path(path); + var opData = ServiceLock.getLockData(zs, slp); + if (requiredProc && opData.isEmpty()) { + log.warn("No ZooKeeper lock found for {} at {}! The process may not be running.", proc, path); + status = Admin.CheckCommand.CheckStatus.FAILED; + } else if (!requiredProc && opData.isEmpty()) { + log.debug("No ZooKeeper lock found for {} at {}. The process may not be running. " + + "This may or may not be expected.", proc, path); + } + return status; + } + + private static Admin.CheckCommand.CheckStatus checkZKTableNodes(ServerContext context, + Admin.CheckCommand.CheckStatus status) throws Exception { + log.trace("Checking ZooKeeper table nodes..."); + + final var zrw = context.getZooSession().asReaderWriter(); + final var zkRoot = context.getZooKeeperRoot(); + final var tableNameToId = context.tableOperations().tableIdMap(); + final Map<String,String> systemTableNameToId = new HashMap<>(); + for (var accumuloTable : AccumuloTable.values()) { + systemTableNameToId.put(accumuloTable.tableName(), accumuloTable.tableId().canonical()); + } + + // ensure all system tables exist + if (!tableNameToId.values().containsAll(systemTableNameToId.values())) { + log.warn( + "Missing essential Accumulo table. One or more of {} are missing from the tables found {}", + systemTableNameToId, tableNameToId); + status = Admin.CheckCommand.CheckStatus.FAILED; + } + for (var nameToId : tableNameToId.entrySet()) { + var tablePath = zkRoot + Constants.ZTABLES + "/" + nameToId.getValue(); + // expect the table path to exist and some data to exist + if (!zrw.exists(tablePath) || zrw.getChildren(tablePath).isEmpty()) { + log.warn("Failed to find table ({}) info at expected path {}", nameToId, tablePath); + status = Admin.CheckCommand.CheckStatus.FAILED; + } + } + + return status; + } + + private static Admin.CheckCommand.CheckStatus checkZKWALsMetadata(ServerContext context, + Admin.CheckCommand.CheckStatus status) throws Exception { + final String zkRoot = context.getZooKeeperRoot(); + final var zs = context.getZooSession(); + final var zrw = zs.asReaderWriter(); + final var rootWalsDir = zkRoot + WalStateManager.ZWALS; + final Set<TServerInstance> tserverInstances = TabletMetadata.getLiveTServers(context); + final Set<TServerInstance> seenTServerInstancesAtWals = new HashSet<>(); + + log.trace("Checking that WAL metadata in ZooKeeper is valid..."); + + // each child node of the root wals dir should be a TServerInstance.toString() + var tserverInstancesAtWals = zrw.getChildren(rootWalsDir); + for (var tserverInstanceAtWals : tserverInstancesAtWals) { + final TServerInstance tsi = new TServerInstance(tserverInstanceAtWals); + seenTServerInstancesAtWals.add(tsi); + final var tserverPath = rootWalsDir + "/" + tserverInstanceAtWals; + // each child node of the tserver should be WAL metadata + final var wals = zrw.getChildren(tserverPath); + if (wals.isEmpty()) { + log.debug("No WAL metadata found for tserver {}", tsi); + } + for (var wal : wals) { + // should be able to parse the WAL metadata + final var fullWalPath = tserverPath + "/" + wal; + log.trace("Attempting to parse WAL metadata at {}", fullWalPath); + var parseRes = WalStateManager.parse(zrw.getData(fullWalPath)); + log.trace("Successfully parsed WAL metadata at {} result {}", fullWalPath, parseRes); + log.trace("Checking if the WAL path {} found in the metadata exists in HDFS...", + parseRes.getSecond()); + if (!context.getVolumeManager().exists(parseRes.getSecond())) { + log.warn("WAL metadata for tserver {} references a WAL that does not exist", + tserverInstanceAtWals); + status = Admin.CheckCommand.CheckStatus.FAILED; + } + } + } + if (!tserverInstances.equals(seenTServerInstancesAtWals)) { + log.warn( + "Expected WAL metadata in ZooKeeper for all tservers. tservers={} tservers seen storing WAL metadata={}", + tserverInstances, seenTServerInstancesAtWals); + status = Admin.CheckCommand.CheckStatus.FAILED; + } Review Comment: Should we expect this? If no writes have occurred yet for the TServer, this check would fail, but I imagine in any normal scenario this check is fine. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
