keith-turner commented on code in PR #4957: URL: https://github.com/apache/accumulo/pull/4957#discussion_r1815332185
########## server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/UserFilesCheckRunner.java: ########## @@ -20,18 +20,33 @@ import static org.apache.accumulo.server.util.Admin.CheckCommand.Check; +import org.apache.accumulo.core.metadata.AccumuloTable; +import org.apache.accumulo.server.ServerContext; +import org.apache.accumulo.server.cli.ServerUtilOpts; import org.apache.accumulo.server.util.Admin; +import org.apache.accumulo.server.util.RemoveEntriesForMissingFiles; public class UserFilesCheckRunner implements CheckRunner { private static final Check check = Check.USER_FILES; @Override - public Admin.CheckCommand.CheckStatus runCheck() { + public Admin.CheckCommand.CheckStatus runCheck(ServerContext context, ServerUtilOpts opts, + boolean fixFiles) throws Exception { Admin.CheckCommand.CheckStatus status = Admin.CheckCommand.CheckStatus.OK; + printRunning(); - System.out.println("Running check " + check); - // work - System.out.println("Check " + check + " completed with status " + status); + System.out.println("\n********** Looking for missing user files **********\n"); Review Comment: These prints produce a lot of noise when running the command. Could make these log at trace/debug level. This would make the output of the command very concise when there are no problems. For testing purpose could change the log4j2 testing config to set the check logger to trace/debug. When a problem is found could log the details of that at warn. I rant this locally and manually corrupted the metadata table and saw the following output. ``` ********** Looking for missing columns ********** Scanning the accumulo.metadata (!0) table for missing required columns... Tablet +scanref;m is missing required columns: col FQs: [srv:dir, ~tab:~pr], col fams: [] in the accumulo.metadata (!0) table Tablet +scanref< is missing required columns: col FQs: [srv:dir], col fams: [] in the accumulo.metadata (!0) table ********** Looking for invalid columns ********** ``` So maybe in that output everything could be logged at trace/debug and the two lines about problems fouind at WARN level. If this seems like a good change, it could be done in a follow on PR. ########## server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/MetadataCheckRunner.java: ########## @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.accumulo.server.util.checkCommand; + +import java.io.IOException; +import java.util.AbstractMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Set; +import java.util.SortedMap; + +import org.apache.accumulo.core.client.IteratorSetting; +import org.apache.accumulo.core.client.Scanner; +import org.apache.accumulo.core.client.TableNotFoundException; +import org.apache.accumulo.core.data.Key; +import org.apache.accumulo.core.data.Mutation; +import org.apache.accumulo.core.data.TableId; +import org.apache.accumulo.core.data.TabletId; +import org.apache.accumulo.core.data.Value; +import org.apache.accumulo.core.iterators.user.WholeRowIterator; +import org.apache.accumulo.core.metadata.schema.MetadataSchema; +import org.apache.accumulo.core.security.AuthorizationContainer; +import org.apache.accumulo.core.security.Authorizations; +import org.apache.accumulo.core.util.ColumnFQ; +import org.apache.accumulo.server.ServerContext; +import org.apache.accumulo.server.constraints.MetadataConstraints; +import org.apache.accumulo.server.constraints.SystemEnvironment; +import org.apache.accumulo.server.util.Admin; +import org.apache.hadoop.io.Text; +import org.apache.zookeeper.KeeperException; + +public interface MetadataCheckRunner extends CheckRunner { + + String tableName(); + + TableId tableId(); + + Set<ColumnFQ> requiredColFQs(); + + Set<Text> requiredColFams(); + + default String scanning() { + return String.format("%s (%s) table", tableName(), tableId()); + } + + /** + * Ensures that the {@link #tableName()} table (either metadata or root table) has all columns + * that are expected. For the root metadata, ensures that the expected "columns" exist in ZK. + */ + default Admin.CheckCommand.CheckStatus checkRequiredColumns(ServerContext context, + Admin.CheckCommand.CheckStatus status) + throws TableNotFoundException, InterruptedException, KeeperException { + Set<ColumnFQ> requiredColFQs; + Set<Text> requiredColFams; + boolean missingReqCol = false; + + System.out.printf("Scanning the %s for missing required columns...\n", scanning()); + try (Scanner scanner = context.createScanner(tableName(), Authorizations.EMPTY)) { + var is = new IteratorSetting(100, "tablets", WholeRowIterator.class); + scanner.addScanIterator(is); Review Comment: could also configure the scanner to only fetch the required columns. This will bring back less data, like it will not bring back tablets files. ########## server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/MetadataCheckRunner.java: ########## @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.accumulo.server.util.checkCommand; + +import java.io.IOException; +import java.util.AbstractMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Set; +import java.util.SortedMap; + +import org.apache.accumulo.core.client.IteratorSetting; +import org.apache.accumulo.core.client.Scanner; +import org.apache.accumulo.core.client.TableNotFoundException; +import org.apache.accumulo.core.data.Key; +import org.apache.accumulo.core.data.Mutation; +import org.apache.accumulo.core.data.TableId; +import org.apache.accumulo.core.data.TabletId; +import org.apache.accumulo.core.data.Value; +import org.apache.accumulo.core.iterators.user.WholeRowIterator; +import org.apache.accumulo.core.metadata.schema.MetadataSchema; +import org.apache.accumulo.core.security.AuthorizationContainer; +import org.apache.accumulo.core.security.Authorizations; +import org.apache.accumulo.core.util.ColumnFQ; +import org.apache.accumulo.server.ServerContext; +import org.apache.accumulo.server.constraints.MetadataConstraints; +import org.apache.accumulo.server.constraints.SystemEnvironment; +import org.apache.accumulo.server.util.Admin; +import org.apache.hadoop.io.Text; +import org.apache.zookeeper.KeeperException; + +public interface MetadataCheckRunner extends CheckRunner { + + String tableName(); + + TableId tableId(); + + Set<ColumnFQ> requiredColFQs(); + + Set<Text> requiredColFams(); + + default String scanning() { + return String.format("%s (%s) table", tableName(), tableId()); + } + + /** + * Ensures that the {@link #tableName()} table (either metadata or root table) has all columns + * that are expected. For the root metadata, ensures that the expected "columns" exist in ZK. + */ + default Admin.CheckCommand.CheckStatus checkRequiredColumns(ServerContext context, + Admin.CheckCommand.CheckStatus status) + throws TableNotFoundException, InterruptedException, KeeperException { + Set<ColumnFQ> requiredColFQs; + Set<Text> requiredColFams; + boolean missingReqCol = false; + + System.out.printf("Scanning the %s for missing required columns...\n", scanning()); + try (Scanner scanner = context.createScanner(tableName(), Authorizations.EMPTY)) { + var is = new IteratorSetting(100, "tablets", WholeRowIterator.class); + scanner.addScanIterator(is); + scanner.setRange(MetadataSchema.TabletsSection.getRange()); + for (var entry : scanner) { + requiredColFQs = new HashSet<>(requiredColFQs()); + requiredColFams = new HashSet<>(requiredColFams()); + SortedMap<Key,Value> rowMap; + try { + rowMap = WholeRowIterator.decodeRow(entry.getKey(), entry.getValue()); + } catch (IOException e) { + throw new RuntimeException(e); + } + for (var e : rowMap.entrySet()) { + var key = e.getKey(); + boolean removed = + requiredColFQs.remove(new ColumnFQ(key.getColumnFamily(), key.getColumnQualifier())); + if (!removed) { + requiredColFams.remove(key.getColumnFamily()); + } + } + if (!requiredColFQs.isEmpty() || !requiredColFams.isEmpty()) { + System.out.printf( + "Tablet %s is missing required columns: col FQs: %s, col fams: %s in the %s\n", + entry.getKey().getRow(), requiredColFQs, requiredColFams, scanning()); + status = Admin.CheckCommand.CheckStatus.FAILED; + missingReqCol = true; + } + } + } + + if (!missingReqCol) { + System.out.printf("...The %s contains all required columns for all tablets\n", scanning()); + } + return status; + } + + /** + * Ensures each column in the root or metadata table (or in ZK for the root metadata) is valid - + * no unexpected columns, and for the columns that are expected, ensures the values are valid + */ + default Admin.CheckCommand.CheckStatus checkColumns(ServerContext context, + Iterator<AbstractMap.SimpleImmutableEntry<Key,Value>> iter, + Admin.CheckCommand.CheckStatus status) + throws TableNotFoundException, InterruptedException, KeeperException { + boolean invalidCol = false; + + System.out.printf("Scanning the %s for invalid columns...\n", scanning()); + while (iter.hasNext()) { + var entry = iter.next(); + Key key = entry.getKey(); + // create a mutation that's equivalent to the existing data to check validity + Mutation m = new Mutation(key.getRow()); + m.at().family(key.getColumnFamily()).qualifier(key.getColumnQualifier()) + .visibility(key.getColumnVisibility()).timestamp(key.getTimestamp()) + .put(entry.getValue()); + MetadataConstraints mc = new MetadataConstraints(); Review Comment: Should be able to create this outside of the loop. ########## test/src/main/java/org/apache/accumulo/test/AdminCheckIT.java: ########## @@ -264,6 +272,113 @@ public void testAdminCheckRunWithCheckFailures() { assertTrue(out4.contains(expStatusInfo3And4)); } + /* + * The following tests test the expected functionality of the admin check command (e.g., are the + * checks correctly identifying problems) + */ + + @Test + public void testSystemConfigCheck() throws Exception { + String table = getUniqueNames(1)[0]; + + try (AccumuloClient client = Accumulo.newClient().from(getClientProperties()).build()) { + client.tableOperations().create(table); + + ReadWriteIT.ingest(client, 10, 10, 10, 0, table); + client.tableOperations().flush(table, null, null, true); + + // the slow compaction is to ensure we hold a table lock when the check runs, so we have + // locks to check + IteratorSetting is = new IteratorSetting(1, SlowIterator.class); + is.addOption("sleepTime", "10000"); + CompactionConfig slowCompaction = new CompactionConfig(); + slowCompaction.setWait(false); + slowCompaction.setIterators(List.of(is)); + client.tableOperations().compact(table, slowCompaction); + + var p = getCluster().exec(Admin.class, "check", "run", "system_config"); + assertEquals(0, p.getProcess().waitFor()); + String out = p.readStdOut(); + assertTrue(out.contains("locks are valid")); + assertTrue(out.contains("Check SYSTEM_CONFIG completed with status OK")); + } + } + + @Test + public void testPassingMetadataTableCheck() throws Exception { + // Tests the METADATA_TABLE check in the case where all checks pass + + // no extra setup needed, just check the metadata table + var p = getCluster().exec(Admin.class, "check", "run", "metadata_table"); + assertEquals(0, p.getProcess().waitFor()); + String out = p.readStdOut(); + assertTrue(out.contains("Looking for offline tablets")); + assertTrue(out.contains("Checking some references")); + assertTrue(out.contains("Looking for missing columns")); + assertTrue(out.contains("Looking for invalid columns")); + assertTrue(out.contains("Check METADATA_TABLE completed with status OK")); Review Comment: Could look for other checks running in this test and others that specify a check. ```suggestion assertTrue(out.contains("Check METADATA_TABLE completed with status OK")); // Should not see other checks running assertFalse(out.contains("SYSTEM_CONFIG")); ``` ########## server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/SystemConfigCheckRunner.java: ########## @@ -18,18 +18,91 @@ */ package org.apache.accumulo.server.util.checkCommand; +import java.util.List; +import java.util.Map; + +import org.apache.accumulo.core.Constants; +import org.apache.accumulo.core.client.AccumuloException; +import org.apache.accumulo.core.client.AccumuloSecurityException; +import org.apache.accumulo.core.fate.AdminUtil; +import org.apache.accumulo.core.fate.ZooStore; +import org.apache.accumulo.core.fate.zookeeper.ZooReaderWriter; +import org.apache.accumulo.core.lock.ServiceLock; +import org.apache.accumulo.server.ServerContext; +import org.apache.accumulo.server.cli.ServerUtilOpts; import org.apache.accumulo.server.util.Admin; +import org.apache.zookeeper.KeeperException; public class SystemConfigCheckRunner implements CheckRunner { private static final Admin.CheckCommand.Check check = Admin.CheckCommand.Check.SYSTEM_CONFIG; @Override - public Admin.CheckCommand.CheckStatus runCheck() { + public Admin.CheckCommand.CheckStatus runCheck(ServerContext context, ServerUtilOpts opts, + boolean fixFiles) throws Exception { Admin.CheckCommand.CheckStatus status = Admin.CheckCommand.CheckStatus.OK; + printRunning(); + + System.out.println("\n********** Checking some references **********\n"); + status = checkTableLocks(context, status); Review Comment: This check seems misplaced, maybe there should be a new Check.TABLE_LOCKS for this. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: notifications-unsubscr...@accumulo.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org