Github user keith-turner commented on a diff in the pull request: https://github.com/apache/accumulo/pull/107#discussion_r65650715 --- Diff: server/gc/src/main/java/org/apache/accumulo/gc/GarbageCollectWriteAheadLogs.java --- @@ -170,76 +184,203 @@ boolean holdsLock(HostAndPort addr) { } } - private int removeFiles(Map<String,Path> nameToFileMap, Map<String,ArrayList<Path>> serverToFileMap, Map<String,Path> sortedWALogs, final GCStatus status) { - AccumuloConfiguration conf = ServerConfiguration.getSystemConfiguration(instance); + private AccumuloConfiguration getConfig() { + return ServerConfiguration.getSystemConfiguration(instance); + } + + /** + * Top level method for removing WAL files. + * <p> + * Loops over all the gathered WAL and sortedWAL entries and calls the appropriate methods for removal + * + * @param nameToFileMap + * Map of filename to Path + * @param serverToFileMap + * Map of HostAndPort string to a list of Paths + * @param sortedWALogs + * Map of sorted WAL names to Path + * @param status + * GCStatus object for tracking what is done + * @return 0 always + */ + @VisibleForTesting + int removeFiles(Map<String,Path> nameToFileMap, Map<String,ArrayList<Path>> serverToFileMap, Map<String,Path> sortedWALogs, final GCStatus status) { + // TODO: remove nameToFileMap from method signature, not used here I don't think + AccumuloConfiguration conf = getConfig(); for (Entry<String,ArrayList<Path>> entry : serverToFileMap.entrySet()) { if (entry.getKey().isEmpty()) { - // old-style log entry, just remove it - for (Path path : entry.getValue()) { - log.debug("Removing old-style WAL " + path); - try { - if (!useTrash || !fs.moveToTrash(path)) - fs.deleteRecursively(path); - status.currentLog.deleted++; - } catch (FileNotFoundException ex) { - // ignored - } catch (IOException ex) { - log.error("Unable to delete wal " + path + ": " + ex); - } - } + removeOldStyleWAL(entry, status); } else { - HostAndPort address = AddressUtil.parseAddress(entry.getKey(), 
false); - if (!holdsLock(address)) { - for (Path path : entry.getValue()) { - log.debug("Removing WAL for offline server " + path); - try { - if (!useTrash || !fs.moveToTrash(path)) - fs.deleteRecursively(path); - status.currentLog.deleted++; - } catch (FileNotFoundException ex) { - // ignored - } catch (IOException ex) { - log.error("Unable to delete wal " + path + ": " + ex); - } - } - continue; - } else { - Client tserver = null; - try { - tserver = ThriftUtil.getClient(new TabletClientService.Client.Factory(), address, conf); - tserver.removeLogs(Tracer.traceInfo(), SystemCredentials.get().toThrift(instance), paths2strings(entry.getValue())); - log.debug("deleted " + entry.getValue() + " from " + entry.getKey()); - status.currentLog.deleted += entry.getValue().size(); - } catch (TException e) { - log.warn("Error talking to " + address + ": " + e); - } finally { - if (tserver != null) - ThriftUtil.returnClient(tserver); - } - } + removeWALFile(entry, conf, status); } } - for (Path swalog : sortedWALogs.values()) { - log.debug("Removing sorted WAL " + swalog); + removeSortedWAL(swalog); + } + return 0; + } + + /** + * Removes sortedWALs. + * <p> + * Sorted WALs are WALs that are in the recovery directory and have already been used. 
+ * + * @param swalog + * Path to the WAL + */ + @VisibleForTesting + void removeSortedWAL(Path swalog) { + log.debug("Removing sorted WAL " + swalog); + try { + if (!useTrash || !fs.moveToTrash(swalog)) { + fs.deleteRecursively(swalog); + } + } catch (FileNotFoundException ex) { + // ignored + } catch (IOException ioe) { try { - if (!useTrash || !fs.moveToTrash(swalog)) { - fs.deleteRecursively(swalog); + if (fs.exists(swalog)) { + log.error("Unable to delete sorted walog " + swalog + ": " + ioe); } - } catch (FileNotFoundException ex) { - // ignored - } catch (IOException ioe) { + } catch (IOException ex) { + log.error("Unable to check for the existence of " + swalog, ex); + } + } + } + + /** + * A wrapper method to check if the tserver using the WAL is still alive + * <p> + * Delegates to the deletion to #removeWALfromDownTserver if the ZK lock is gone or #askTserverToRemoveWAL if the server is known to still be alive + * + * @param entry + * WAL information gathered + * @param conf + * AccumuloConfiguration object + * @param status + * GCStatus object + */ + @VisibleForTesting + void removeWALFile(Entry<String,ArrayList<Path>> entry, AccumuloConfiguration conf, final GCStatus status) { + HostAndPort address = AddressUtil.parseAddress(entry.getKey(), false); + if (!holdsLock(address)) { + removeWALfromDownTserver(address, conf, entry, status); + } else { + askTserverToRemoveWAL(address, conf, entry, status); + } + } + + /** + * Asks a currently running tserver to remove it's WALs. + * <p> + * A tserver has more information about whether a WAL is still being used for current mutations. It is safer to ask the tserver to remove the file instead of + * just relying on information in the metadata table. 
+ * + * @param address + * HostAndPort of the tserver + * @param conf + * AccumuloConfiguration entry + * @param entry + * WAL information gathered + * @param status + * GCStatus object + */ + @VisibleForTesting + void askTserverToRemoveWAL(HostAndPort address, AccumuloConfiguration conf, Entry<String,ArrayList<Path>> entry, final GCStatus status) { + firstSeenDead.remove(address); + Client tserver = null; + try { + tserver = ThriftUtil.getClient(new TabletClientService.Client.Factory(), address, conf); + tserver.removeLogs(Tracer.traceInfo(), SystemCredentials.get().toThrift(instance), paths2strings(entry.getValue())); + log.debug("asking tserver to delete " + entry.getValue() + " from " + entry.getKey()); --- End diff -- I think it should say `asked` instead of `asking` because the log message is after the thrift call.
--- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes to enable it, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. ---