This is an automated email from the ASF dual-hosted git repository. kturner pushed a commit to branch 1.9 in repository https://gitbox.apache.org/repos/asf/accumulo.git
The following commit(s) were added to refs/heads/1.9 by this push: new 2c2840b Fix race condition when getting WALs for dead tserver (#866) 2c2840b is described below commit 2c2840b1873282354fffad2d76774fd31cc2fd41 Author: Keith Turner <ktur...@apache.org> AuthorDate: Wed Jan 2 14:16:33 2019 -0500 Fix race condition when getting WALs for dead tserver (#866) When a tablet server dies the master gets its WALs from ZK. In ZK there is a list of WALs per tserver. Each WAL in ZK has state that is either OPEN, CLOSED, or UNREFERENCED. The master needs a list of OPEN and CLOSED logs for dead tservers. While the master is trying to obtain this list its possible that the Accumulo GC may delete an UNREFERENCED WAL. If this happened then the code before this commit would return an empty list of WALs. This could result in OPEN and CLOSED logs being ignored for recovery which could result in data loss. This patch fixes the race condition. This bug was observed while looking into an exception I noticed while writing test for #860. In the IT I was frequently calling another WalStateManager function and noticed NoNodeExceptions. As a result I examined how the entire class handled NoNode race conditions and found this bug. I noticed some other possible NoNode race condition, but do not think this these would occur with the current way the code is called. --- .../accumulo/server/log/WalStateManager.java | 24 +++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/server/base/src/main/java/org/apache/accumulo/server/log/WalStateManager.java b/server/base/src/main/java/org/apache/accumulo/server/log/WalStateManager.java index 22e9ee1..12f3133 100644 --- a/server/base/src/main/java/org/apache/accumulo/server/log/WalStateManager.java +++ b/server/base/src/main/java/org/apache/accumulo/server/log/WalStateManager.java @@ -163,9 +163,22 @@ public class WalStateManager { String zpath = root() + "/" + tsi.toString(); zoo.sync(zpath); for (String child : zoo.getChildren(zpath)) { - Pair<WalState,Path> parts = parse(zoo.getData(zpath + "/" + child, null)); - if (parts.getFirst() != WalState.UNREFERENCED) { - result.add(parts.getSecond()); + byte[] zdata = null; + try { + // This function is called by the Master. Its possible that Accumulo GC deletes an + // unreferenced WAL in ZK after the call to getChildren above. Catch this exception inside + // the loop so that not all children are ignored. + zdata = zoo.getData(zpath + "/" + child, null); + } catch (KeeperException.NoNodeException e) { + log.debug("WAL state removed {} {} during getWalsInUse. Likely a race condition between " + + "master and GC.", tsi, child); + } + + if (zdata != null) { + Pair<WalState,Path> parts = parse(zdata); + if (parts.getFirst() != WalState.UNREFERENCED) { + result.add(parts.getSecond()); + } } } } catch (KeeperException.NoNodeException e) { @@ -187,6 +200,9 @@ public class WalStateManager { if (logs == null) { result.put(inst, logs = new ArrayList<>()); } + + // This function is called by the Accumulo GC which deletes WAL markers. Therefore we do not + // expect the following call to fail because the WAL info in ZK was deleted. for (String idString : zoo.getChildren(path + "/" + child)) { logs.add(UUID.fromString(idString)); } @@ -212,6 +228,8 @@ public class WalStateManager { Map<Path,WalState> result = new HashMap<>(); for (Entry<TServerInstance,List<UUID>> entry : getAllMarkers().entrySet()) { for (UUID id : entry.getValue()) { + // This function is called by the Accumulo GC which deletes WAL markers. Therefore we do not + // expect the following call to fail because the WAL info in ZK was deleted. Pair<WalState,Path> state = state(entry.getKey(), id); result.put(state.getSecond(), state.getFirst()); }