Repository: hbase Updated Branches: refs/heads/master f8b12805b -> 3ac324942
HBASE-21155 Save on a few log strings and some churn in wal splitter by skipping out early if no logs in dir Project: http://git-wip-us.apache.org/repos/asf/hbase/repo Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/3ac32494 Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/3ac32494 Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/3ac32494 Branch: refs/heads/master Commit: 3ac32494231ca70aa5e4997aede4497f224ce5ec Parents: f8b1280 Author: Michael Stack <[email protected]> Authored: Wed Sep 5 15:26:16 2018 -0700 Committer: Michael Stack <[email protected]> Committed: Thu Sep 6 16:52:47 2018 -0700 ---------------------------------------------------------------------- .../store/wal/ProcedureWALFormat.java | 1 + .../procedure2/store/wal/WALProcedureStore.java | 7 +++ .../org/apache/hadoop/hbase/master/HMaster.java | 7 ++- .../hadoop/hbase/master/MasterWalManager.java | 5 ++ .../hbase/master/RegionServerTracker.java | 6 ++- .../hadoop/hbase/master/ServerManager.java | 6 +-- .../hadoop/hbase/master/SplitLogManager.java | 54 +++++++++++--------- 7 files changed, 54 insertions(+), 32 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hbase/blob/3ac32494/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/store/wal/ProcedureWALFormat.java ---------------------------------------------------------------------- diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/store/wal/ProcedureWALFormat.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/store/wal/ProcedureWALFormat.java index ac3a529..da8af84 100644 --- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/store/wal/ProcedureWALFormat.java +++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/store/wal/ProcedureWALFormat.java @@ -83,6 +83,7 @@ public final class ProcedureWALFormat { // Ignore the last log which is current active log. 
while (logs.hasNext()) { ProcedureWALFile log = logs.next(); + LOG.debug("Loading WAL id={}", log.getLogId()); log.open(); try { reader.read(log); http://git-wip-us.apache.org/repos/asf/hbase/blob/3ac32494/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/store/wal/WALProcedureStore.java ---------------------------------------------------------------------- diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/store/wal/WALProcedureStore.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/store/wal/WALProcedureStore.java index 24004f6..b99b1c1 100644 --- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/store/wal/WALProcedureStore.java +++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/store/wal/WALProcedureStore.java @@ -424,6 +424,8 @@ public class WALProcedureStore extends ProcedureStoreBase { it.next(); // Skip the current log ProcedureWALFormat.load(it, storeTracker, new ProcedureWALFormat.Loader() { + long count = 0; + @Override public void setMaxProcId(long maxProcId) { loader.setMaxProcId(maxProcId); @@ -431,6 +433,11 @@ public class WALProcedureStore extends ProcedureStoreBase { @Override public void load(ProcedureIterator procIter) throws IOException { + if ((++count % 1000) == 0) { + // Log every 1000 procedures otherwise it looks like Master is dead if loads of WALs + // and procedures to load. 
+ LOG.debug("Loaded {} procedures", this.count); + } loader.load(procIter); } http://git-wip-us.apache.org/repos/asf/hbase/blob/3ac32494/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index 92afa9c..f51496a 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -585,7 +585,7 @@ public class HMaster extends HRegionServer implements MasterServices { abort(error, t); } } - })); + }), getName() + ":becomeActiveMaster"); } // Fall in here even if we have been aborted. Need to run the shutdown services and // the super run call will do this for us. @@ -924,7 +924,7 @@ public class HMaster extends HRegionServer implements MasterServices { HBaseFsck.createLockRetryCounterFactory(this.conf).create()); } - status.setStatus("Initialze ServerManager and schedule SCP for crash servers"); + status.setStatus("Initialize ServerManager and schedule SCP for crash servers"); this.serverManager = createServerManager(this); createProcedureExecutor(); @SuppressWarnings("rawtypes") @@ -946,6 +946,9 @@ public class HMaster extends HRegionServer implements MasterServices { .collect(Collectors.toList()); this.assignmentManager.setupRIT(ritList); + // Start RegionServerTracker with listing of servers found with existing SCPs -- these should + // be registered in the deadServers set -- and with the list of servernames out on the + // filesystem that COULD BE 'alive' (we'll schedule SCPs for each and let SCP figure it out). 
this.regionServerTracker = new RegionServerTracker(zooKeeper, this, this.serverManager); this.regionServerTracker.start( procsByType.getOrDefault(ServerCrashProcedure.class, Collections.emptyList()).stream() http://git-wip-us.apache.org/repos/asf/hbase/blob/3ac32494/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterWalManager.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterWalManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterWalManager.java index 2dc8918..f24805f 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterWalManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterWalManager.java @@ -143,6 +143,11 @@ public class MasterWalManager { return this.fsOk; } + /** + * @return Return listing of ServerNames found in the filesystem under the WAL directory + * that COULD BE 'alive'; excludes those that have a '-splitting' suffix as these are already + * being split -- they cannot be 'alive'. 
+ */ public Set<ServerName> getLiveServersFromWALDir() throws IOException { Path walDirPath = new Path(rootDir, HConstants.HREGION_LOGDIR_NAME); FileStatus[] walDirForLiveServers = FSUtils.listStatus(fs, walDirPath, http://git-wip-us.apache.org/repos/asf/hbase/blob/3ac32494/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionServerTracker.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionServerTracker.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionServerTracker.java index db46ef0..f472e72 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionServerTracker.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionServerTracker.java @@ -111,13 +111,15 @@ public class RegionServerTracker extends ZKListener { * In this method, we will also construct the region server sets in {@link ServerManager}. If a * region server is dead between the crash of the previous master instance and the start of the * current master instance, we will schedule a SCP for it. This is done in - * {@link ServerManager#findOutDeadServersAndProcess(Set, Set)}, we call it here under the lock + * {@link ServerManager#findDeadServersAndProcess(Set, Set)}, we call it here under the lock * protection to prevent concurrency issues with server expiration operation. * @param deadServersFromPE the region servers which already have SCP associated. * @param liveServersFromWALDir the live region servers from wal directory. 
*/ public void start(Set<ServerName> deadServersFromPE, Set<ServerName> liveServersFromWALDir) throws KeeperException, IOException { + LOG.info("Starting RegionServerTracker; {} have existing ServerCrashProcedures, {} " + + "possibly 'live' servers.", deadServersFromPE.size(), liveServersFromWALDir.size()); watcher.registerListener(this); synchronized (this) { List<String> servers = @@ -132,7 +134,7 @@ public class RegionServerTracker extends ZKListener { info.getVersionInfo().getVersion()) : ServerMetricsBuilder.of(serverName); serverManager.checkAndRecordNewServer(serverName, serverMetrics); } - serverManager.findOutDeadServersAndProcess(deadServersFromPE, liveServersFromWALDir); + serverManager.findDeadServersAndProcess(deadServersFromPE, liveServersFromWALDir); } } http://git-wip-us.apache.org/repos/asf/hbase/blob/3ac32494/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java index 2cb4d5e..68b8e79 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java @@ -350,10 +350,10 @@ public class ServerManager { * <p/> * Must be called inside the initialization method of {@code RegionServerTracker} to avoid * concurrency issue. - * @param deadServersFromPE the region servers which already have SCP associated. + * @param deadServersFromPE the region servers which already have a SCP associated. * @param liveServersFromWALDir the live region servers from wal directory. 
*/ - void findOutDeadServersAndProcess(Set<ServerName> deadServersFromPE, + void findDeadServersAndProcess(Set<ServerName> deadServersFromPE, Set<ServerName> liveServersFromWALDir) { deadServersFromPE.forEach(deadservers::add); liveServersFromWALDir.stream().filter(sn -> !onlineServers.containsKey(sn)) @@ -621,7 +621,7 @@ public class ServerManager { public void moveFromOnlineToDeadServers(final ServerName sn) { synchronized (onlineServers) { if (!this.onlineServers.containsKey(sn)) { - LOG.warn("Expiration of " + sn + " but server not online"); + LOG.trace("Expiration of {} but server not online", sn); } // Remove the server from the known servers lists and update load info BUT // add to deadservers first; do this so it'll show in dead servers list if http://git-wip-us.apache.org/repos/asf/hbase/blob/3ac32494/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java index 2e2f8bf..1e4e2ce 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java @@ -174,7 +174,7 @@ public class SplitLogManager { } FileStatus[] logfiles = FSUtils.listStatus(fs, logDir, filter); if (logfiles == null || logfiles.length == 0) { - LOG.info(logDir + " is empty dir, no logs to split"); + LOG.info(logDir + " dir is empty, no logs to split."); } else { Collections.addAll(fileStatus, logfiles); } @@ -235,29 +235,33 @@ public class SplitLogManager { PathFilter filter) throws IOException { MonitoredTask status = TaskMonitor.get().createStatus("Doing distributed log split in " + logDirs + " for serverName=" + serverNames); - FileStatus[] logfiles = getFileList(logDirs, filter); - status.setStatus("Checking directory 
contents..."); - SplitLogCounters.tot_mgr_log_split_batch_start.increment(); - LOG.info("Started splitting " + logfiles.length + " logs in " + logDirs + - " for " + serverNames); - long t = EnvironmentEdgeManager.currentTime(); long totalSize = 0; - TaskBatch batch = new TaskBatch(); - for (FileStatus lf : logfiles) { - // TODO If the log file is still being written to - which is most likely - // the case for the last log file - then its length will show up here - // as zero. The size of such a file can only be retrieved after - // recover-lease is done. totalSize will be under in most cases and the - // metrics that it drives will also be under-reported. - totalSize += lf.getLen(); - String pathToLog = FSUtils.removeWALRootPath(lf.getPath(), conf); - if (!enqueueSplitTask(pathToLog, batch)) { - throw new IOException("duplicate log split scheduled for " + lf.getPath()); + TaskBatch batch = null; + long startTime = 0; + FileStatus[] logfiles = getFileList(logDirs, filter); + if (logfiles.length != 0) { + status.setStatus("Checking directory contents..."); + SplitLogCounters.tot_mgr_log_split_batch_start.increment(); + LOG.info("Started splitting " + logfiles.length + " logs in " + logDirs + + " for " + serverNames); + startTime = EnvironmentEdgeManager.currentTime(); + batch = new TaskBatch(); + for (FileStatus lf : logfiles) { + // TODO If the log file is still being written to - which is most likely + // the case for the last log file - then its length will show up here + // as zero. The size of such a file can only be retrieved after + // recover-lease is done. totalSize will be under in most cases and the + // metrics that it drives will also be under-reported. 
+ totalSize += lf.getLen(); + String pathToLog = FSUtils.removeWALRootPath(lf.getPath(), conf); + if (!enqueueSplitTask(pathToLog, batch)) { + throw new IOException("duplicate log split scheduled for " + lf.getPath()); + } } + waitForSplittingCompletion(batch, status); } - waitForSplittingCompletion(batch, status); - if (batch.done != batch.installed) { + if (batch != null && batch.done != batch.installed) { batch.isDead = true; SplitLogCounters.tot_mgr_log_split_batch_err.increment(); LOG.warn("error while splitting logs in " + logDirs + " installed = " + batch.installed @@ -285,10 +289,10 @@ public class SplitLogManager { } SplitLogCounters.tot_mgr_log_split_batch_success.increment(); } - String msg = - "finished splitting (more than or equal to) " + totalSize + " bytes in " + batch.installed - + " log files in " + logDirs + " in " - + (EnvironmentEdgeManager.currentTime() - t) + "ms"; + String msg = "Finished splitting (more than or equal to) " + totalSize + + " bytes in " + ((batch == null)? 0: batch.installed) + + " log files in " + logDirs + " in " + + ((startTime == -1)? startTime: (EnvironmentEdgeManager.currentTime() - startTime)) + "ms"; status.markComplete(msg); LOG.info(msg); return totalSize; @@ -448,7 +452,7 @@ public class SplitLogManager { } deadWorkers.add(workerName); } - LOG.info("dead splitlog worker " + workerName); + LOG.info("Dead splitlog worker {}", workerName); } void handleDeadWorkers(Set<ServerName> serverNames) {
