Repository: hbase Updated Branches: refs/heads/branch-2.0 d49fca134 -> 5e06b80e7
HBASE-20978 [amv2] Worker terminating UNNATURALLY during MoveRegionProcedure Signed-off-by: Michael Stack <[email protected]> Signed-off-by: Duo Zhang <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/hbase/repo Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/5e06b80e Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/5e06b80e Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/5e06b80e Branch: refs/heads/branch-2.0 Commit: 5e06b80e75fb38d1d05a9c30af1fc44deec542d1 Parents: d49fca1 Author: Allan Yang <[email protected]> Authored: Tue Aug 14 12:09:42 2018 -0700 Committer: Michael Stack <[email protected]> Committed: Tue Aug 14 16:29:24 2018 -0700 ---------------------------------------------------------------------- .../hbase/procedure2/ProcedureExecutor.java | 65 ++++++++++++++++++-- .../procedure2/ProcedureTestingUtility.java | 7 +++ .../hbase/procedure2/TestChildProcedures.java | 27 ++++++++ .../src/test/resources/hbase-site.xml | 4 ++ 4 files changed, 99 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hbase/blob/5e06b80e/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java ---------------------------------------------------------------------- diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java index 3c2998c..1e0ee79 100644 --- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java +++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java @@ -83,12 +83,34 @@ public class ProcedureExecutor<TEnvironment> { "hbase.procedure.worker.keep.alive.time.msec"; private static final long DEFAULT_WORKER_KEEP_ALIVE_TIME = TimeUnit.MINUTES.toMillis(1); + /** + * {@link #testing} is non-null when ProcedureExecutor is being tested. Tests will try to + * break PE having it fail at various junctures. When non-null, testing is set to an instance of + * the below internal {@link Testing} class with flags set for the particular test. + */ Testing testing = null; + + /** + * Class with parameters describing how to fail/die when in testing-context. + */ public static class Testing { protected boolean killIfSuspended = false; + + /** + * Kill the PE BEFORE we store state to the WAL. Good for figuring out if a Procedure is + * persisting all the state it needs to recover after a crash. + */ protected boolean killBeforeStoreUpdate = false; protected boolean toggleKillBeforeStoreUpdate = false; + /** + * Set when we want to fail AFTER state has been stored into the WAL. Rarely used. HBASE-20978 + * is about a case where memory-state was being set after store to WAL where a crash could + * cause us to get stuck. This flag allows killing at what was a vulnerable time. + */ + protected boolean killAfterStoreUpdate = false; + protected boolean toggleKillAfterStoreUpdate = false; + protected boolean shouldKillBeforeStoreUpdate() { final boolean kill = this.killBeforeStoreUpdate; if (this.toggleKillBeforeStoreUpdate) { @@ -101,6 +123,19 @@ public class ProcedureExecutor<TEnvironment> { protected boolean shouldKillBeforeStoreUpdate(final boolean isSuspended) { return (isSuspended && !killIfSuspended) ? false : shouldKillBeforeStoreUpdate(); } + + protected boolean shouldKillAfterStoreUpdate() { + final boolean kill = this.killAfterStoreUpdate; + if (this.toggleKillAfterStoreUpdate) { + this.killAfterStoreUpdate = !kill; + LOG.warn("Toggle KILL after store update to: " + this.killAfterStoreUpdate); + } + return kill; + } + + protected boolean shouldKillAfterStoreUpdate(final boolean isSuspended) { + return (isSuspended && !killIfSuspended) ? false : shouldKillAfterStoreUpdate(); + } } public interface ProcedureExecutorListener { @@ -503,6 +538,17 @@ public class ProcedureExecutor<TEnvironment> { break; case WAITING: if (!proc.hasChildren()) { + // Normally, WAITING procedures should be waken by its children. + // But, there is a case that, all the children are successful and before + // they can wake up their parent procedure, the master was killed. + // So, during recovering the procedures from ProcedureWal, its children + // are not loaded because of their SUCCESS state. + // So we need to continue to run this WAITING procedure. But before + // executing, we need to set its state to RUNNABLE, otherwise, a exception + // will throw: + // Preconditions.checkArgument(procedure.getState() == ProcedureState.RUNNABLE, + // "NOT RUNNABLE! " + procedure.toString()); + proc.setState(ProcedureState.RUNNABLE); runnableList.add(proc); } break; @@ -1562,10 +1608,7 @@ public class ProcedureExecutor<TEnvironment> { // allows to kill the executor before something is stored to the wal. // useful to test the procedure recovery. if (testing != null && testing.shouldKillBeforeStoreUpdate(suspended)) { - String msg = "TESTING: Kill before store update: " + procedure; - LOG.debug(msg); - stop(); - throw new RuntimeException(msg); + kill("TESTING: Kill BEFORE store update: " + procedure); } // TODO: The code here doesn't check if store is running before persisting to the store as @@ -1591,6 +1634,14 @@ public class ProcedureExecutor<TEnvironment> { assert (reExecute && subprocs == null) || !reExecute; } while (reExecute); + + // Allows to kill the executor after something is stored to the WAL but before the below + // state settings are done -- in particular the one on the end where we make parent + // RUNNABLE again when its children are done; see countDownChildren. + if (testing != null && testing.shouldKillAfterStoreUpdate(suspended)) { + kill("TESTING: Kill AFTER store update: " + procedure); + } + // Submit the new subprocedures if (subprocs != null && !procedure.isFailed()) { submitChildrenProcedures(subprocs); @@ -1608,6 +1659,12 @@ public class ProcedureExecutor<TEnvironment> { } } + private void kill(String msg) { + LOG.debug(msg); + stop(); + throw new RuntimeException(msg); + } + private Procedure<TEnvironment>[] initializeChildren(RootProcedureState<TEnvironment> procStack, Procedure<TEnvironment> procedure, Procedure<TEnvironment>[] subprocs) { assert subprocs != null : "expected subprocedures"; http://git-wip-us.apache.org/repos/asf/hbase/blob/5e06b80e/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/ProcedureTestingUtility.java ---------------------------------------------------------------------- diff --git a/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/ProcedureTestingUtility.java b/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/ProcedureTestingUtility.java index e8d72f9..138215b 100644 --- a/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/ProcedureTestingUtility.java +++ b/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/ProcedureTestingUtility.java @@ -167,6 +167,13 @@ public class ProcedureTestingUtility { assertSingleExecutorForKillTests(procExecutor); } + public static <TEnv> void toggleKillAfterStoreUpdate(ProcedureExecutor<TEnv> procExecutor) { + createExecutorTesting(procExecutor); + procExecutor.testing.killAfterStoreUpdate = !procExecutor.testing.killAfterStoreUpdate; + LOG.warn("Set Kill after store update to: " + procExecutor.testing.killAfterStoreUpdate); + assertSingleExecutorForKillTests(procExecutor); + } + public static <TEnv> void setKillAndToggleBeforeStoreUpdate(ProcedureExecutor<TEnv> procExecutor, boolean value) { ProcedureTestingUtility.setKillBeforeStoreUpdate(procExecutor, value); http://git-wip-us.apache.org/repos/asf/hbase/blob/5e06b80e/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/TestChildProcedures.java ---------------------------------------------------------------------- diff --git a/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/TestChildProcedures.java b/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/TestChildProcedures.java index cce4caf..4f3c443 100644 --- a/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/TestChildProcedures.java +++ b/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/TestChildProcedures.java @@ -109,6 +109,29 @@ public class TestChildProcedures { ProcedureTestingUtility.assertProcNotFailed(procExecutor, procId); } + + /** + * Test the state setting that happens after store to WAL; in particular the bit where we + * set the parent runnable again after its children have all completed successfully. + * See HBASE-20978. + */ + @Test + public void testChildLoadWithRestartAfterChildSuccess() throws Exception { + procEnv.toggleKillAfterStoreUpdate = true; + + TestRootProcedure proc = new TestRootProcedure(); + long procId = ProcedureTestingUtility.submitAndWait(procExecutor, proc); + int restartCount = 0; + while (!procExecutor.isFinished(procId)) { + ProcedureTestingUtility.restart(procExecutor); + ProcedureTestingUtility.waitProcedure(procExecutor, proc); + restartCount++; + } + assertEquals(4, restartCount); + assertTrue("expected completed proc", procExecutor.isFinished(procId)); + ProcedureTestingUtility.assertProcNotFailed(procExecutor, procId); + } + @Test public void testChildRollbackLoad() throws Exception { procEnv.toggleKillBeforeStoreUpdate = false; @@ -154,6 +177,9 @@ public class TestChildProcedures { if (env.toggleKillBeforeStoreUpdate) { ProcedureTestingUtility.toggleKillBeforeStoreUpdate(procExecutor); } + if (env.toggleKillAfterStoreUpdate) { + ProcedureTestingUtility.toggleKillAfterStoreUpdate(procExecutor); + } return new Procedure[] { new TestChildProcedure(), new TestChildProcedure() }; } @@ -193,6 +219,7 @@ public class TestChildProcedures { private static class TestProcEnv { public boolean toggleKillBeforeStoreUpdate = false; + public boolean toggleKillAfterStoreUpdate = false; public boolean triggerRollbackOnChild = false; } } http://git-wip-us.apache.org/repos/asf/hbase/blob/5e06b80e/hbase-procedure/src/test/resources/hbase-site.xml ---------------------------------------------------------------------- diff --git a/hbase-procedure/src/test/resources/hbase-site.xml b/hbase-procedure/src/test/resources/hbase-site.xml index 114ee8a..3709a71 100644 --- a/hbase-procedure/src/test/resources/hbase-site.xml +++ b/hbase-procedure/src/test/resources/hbase-site.xml @@ -22,6 +22,10 @@ --> <configuration> <property> + <name>hbase.defaults.for.version.skip</name> + <value>true</value> + </property> + <property> <name>hbase.procedure.store.wal.use.hsync</name> <value>false</value> </property>
