SLIDER-467 cleanly shut down AMs should say "succeeded" in app reports. Easier said than done
Project: http://git-wip-us.apache.org/repos/asf/incubator-slider/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-slider/commit/155262bf Tree: http://git-wip-us.apache.org/repos/asf/incubator-slider/tree/155262bf Diff: http://git-wip-us.apache.org/repos/asf/incubator-slider/diff/155262bf Branch: refs/heads/develop Commit: 155262bf7555139cc54d553e6ef34e612d075a6f Parents: 4a53f55 Author: Steve Loughran <[email protected]> Authored: Sat Oct 4 14:02:48 2014 -0700 Committer: Steve Loughran <[email protected]> Committed: Sat Oct 4 14:02:48 2014 -0700 ---------------------------------------------------------------------- .../providers/AbstractProviderService.java | 21 ++++++++++- .../providers/agent/AgentProviderService.java | 22 ------------ .../slideram/SliderAMProviderService.java | 8 ----- .../apache/slider/server/appmaster/AMUtils.java | 2 +- .../server/appmaster/SliderAppMaster.java | 38 +++++++++----------- .../appmaster/actions/ActionStopSlider.java | 20 +++++++++++ .../workflow/WorkflowSequenceService.java | 8 ++++- .../standalone/TestStandaloneAMDestroy.groovy | 3 +- .../standalone/TestStandaloneAMRestart.groovy | 36 +++++++++++++------ .../slider/test/YarnMiniClusterTestBase.groovy | 13 ++++--- .../providers/hbase/HBaseProviderService.java | 23 ------------ 11 files changed, 99 insertions(+), 95 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/155262bf/slider-core/src/main/java/org/apache/slider/providers/AbstractProviderService.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/providers/AbstractProviderService.java b/slider-core/src/main/java/org/apache/slider/providers/AbstractProviderService.java index baddb56..b3cf527 100644 --- a/slider-core/src/main/java/org/apache/slider/providers/AbstractProviderService.java +++ b/slider-core/src/main/java/org/apache/slider/providers/AbstractProviderService.java @@ -77,6 +77,7 @@ public abstract class AbstractProviderService public AbstractProviderService(String name) { super(name); + setStopIfNoChildServicesAtStartup(false); } @Override @@ -184,7 +185,25 @@ public abstract class AbstractProviderService } return false; } - + + /** + * override point to allow a process to start executing in this container + * @param instanceDefinition cluster description + * @param confDir configuration directory + * @param env environment + * @param execInProgress the callback for the exec events + * @return false + * @throws IOException + * @throws SliderException + */ + @Override + public boolean exec(AggregateConf instanceDefinition, + File confDir, + Map<String, String> env, + ProviderCompleted execInProgress) throws IOException, SliderException { + return false; + } + @SuppressWarnings("ThrowableResultOfMethodCallIgnored") @Override // ExitCodeProvider public int getExitCode() { http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/155262bf/slider-core/src/main/java/org/apache/slider/providers/agent/AgentProviderService.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/providers/agent/AgentProviderService.java b/slider-core/src/main/java/org/apache/slider/providers/agent/AgentProviderService.java index d7943b2..67a268e 100644 --- a/slider-core/src/main/java/org/apache/slider/providers/agent/AgentProviderService.java +++ b/slider-core/src/main/java/org/apache/slider/providers/agent/AgentProviderService.java @@ -395,28 +395,6 @@ public class AgentProviderService extends AbstractProviderService implements } } - /** - * Run this service - * - * @param instanceDefinition component description - * @param confDir local dir with the config - * @param env environment variables above those generated by - * @param execInProgress callback for the event notification - * - * @throws IOException IO problems - * @throws SliderException anything internal - */ - @Override - public boolean exec(AggregateConf instanceDefinition, - File confDir, - Map<String, String> env, - ProviderCompleted execInProgress) throws - IOException, - SliderException { - - return false; - } - @Override public boolean isSupportedRole(String role) { return true; http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/155262bf/slider-core/src/main/java/org/apache/slider/providers/slideram/SliderAMProviderService.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/providers/slideram/SliderAMProviderService.java b/slider-core/src/main/java/org/apache/slider/providers/slideram/SliderAMProviderService.java index 863ea7e..2b2d1c7 100644 --- a/slider-core/src/main/java/org/apache/slider/providers/slideram/SliderAMProviderService.java +++ b/slider-core/src/main/java/org/apache/slider/providers/slideram/SliderAMProviderService.java @@ -91,14 +91,6 @@ public class SliderAMProviderService extends AbstractProviderService implements } @Override - public boolean exec(AggregateConf instanceDefinition, - File confDir, - Map<String, String> env, - ProviderCompleted execInProgress) throws IOException, SliderException { - return false; - } - - @Override public List<ProviderRole> getRoles() { return new ArrayList<ProviderRole>(0); } http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/155262bf/slider-core/src/main/java/org/apache/slider/server/appmaster/AMUtils.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/AMUtils.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/AMUtils.java index 39f511a..32684c6 100644 --- a/slider-core/src/main/java/org/apache/slider/server/appmaster/AMUtils.java +++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/AMUtils.java @@ -33,7 +33,7 @@ public class AMUtils { return LauncherExitCodes.EXIT_SUCCESS; //remap from a planned shutdown to a failure case LauncherExitCodes.EXIT_CLIENT_INITIATED_SHUTDOWN: - return SliderExitCodes.EXIT_PROCESS_FAILED; + return SliderExitCodes.EXIT_SUCCESS; default: return exitCode; } http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/155262bf/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java index aac8106..7fbea86 100644 --- a/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java +++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java @@ -20,6 +20,7 @@ package org.apache.slider.server.appmaster; import com.codahale.metrics.MetricRegistry; import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; import com.google.protobuf.BlockingService; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.CommonConfigurationKeysPublic; @@ -846,15 +847,14 @@ public class SliderAppMaster extends AbstractSliderLaunchedService // launch the real provider; this is expected to trigger a callback that // starts the node review process launchProviderService(instanceDefinition, confDir); - + //now block waiting to be told to exit the process waitForAMCompletionSignal(); - //shutdown time - } finally { - finish(); + } catch(Exception e) { + stopAction = new ActionStopSlider(e); } - - return amExitCode; + //shutdown time + return finish(); } private void startAgentWebApp(MapOperations appInformation, @@ -1064,14 +1064,6 @@ public class SliderAppMaster extends AbstractSliderLaunchedService } finally { AMExecutionStateLock.unlock(); } - //add a sleep here for about a second. Why? it - //stops RPC calls breaking so dramatically when the cluster - //is torn down mid-RPC - try { - Thread.sleep(TERMINATION_SIGNAL_PROPAGATION_DELAY); - } catch (InterruptedException ignored) { - //ignored - } } /** @@ -1083,6 +1075,7 @@ public class SliderAppMaster extends AbstractSliderLaunchedService // this is a queued action: schedule it through the queues schedule(stopActionRequest); } + /** * Signal that the AM is complete * @@ -1105,8 +1098,10 @@ public class SliderAppMaster extends AbstractSliderLaunchedService /** * trigger the YARN cluster termination process + * @return the exit code */ - private synchronized void finish() { + private synchronized int finish() { + Preconditions.checkNotNull(stopAction, "null stop action"); FinalApplicationStatus appStatus; log.info("Triggering shutdown of the AM: {}", stopAction); @@ -1145,6 +1140,7 @@ public class SliderAppMaster extends AbstractSliderLaunchedService } catch (YarnException e) { log.info("Failed to unregister application: " + e, e); } + return exitCode; } /** @@ -1377,7 +1373,7 @@ public class SliderAppMaster extends AbstractSliderLaunchedService public void onShutdownRequest() { LOG_YARN.info("Shutdown Request received"); signalAMComplete(new ActionStopSlider("stop", - EXIT_CLIENT_INITIATED_SHUTDOWN, + EXIT_SUCCESS, FinalApplicationStatus.SUCCEEDED, "Shutdown requested from RM")); } @@ -1627,7 +1623,7 @@ public class SliderAppMaster extends AbstractSliderLaunchedService return Messages.AMSuicideResponseProto.getDefaultInstance(); } - /* =================================================================== */ +/* =================================================================== */ /* END */ /* =================================================================== */ @@ -1664,7 +1660,6 @@ public class SliderAppMaster extends AbstractSliderLaunchedService } } - /* =================================================================== */ /* EventCallback from the child or ourselves directly */ /* =================================================================== */ @@ -1680,9 +1675,7 @@ public class SliderAppMaster extends AbstractSliderLaunchedService //this may happen in a separate thread, so the ability to act is limited log.error("Failed to flex cluster nodes: {}", e, e); //declare a failure - queue(new ActionStopSlider("stop", - EXIT_DEPLOYMENT_FAILED, FinalApplicationStatus.FAILED, - "Failed to create application:" + e.toString())); + queue(new ActionStopSlider(e)); } } @@ -1728,7 +1721,7 @@ public class SliderAppMaster extends AbstractSliderLaunchedService if (shouldTriggerFailure) { String reason = - "Spawned master exited with raw " + exitCode + " mapped to " + + "Spawned process failed with raw " + exitCode + " mapped to " + mappedProcessExitCode; ActionStopSlider stop = new ActionStopSlider("stop", mappedProcessExitCode, @@ -1916,6 +1909,7 @@ public class SliderAppMaster extends AbstractSliderLaunchedService InternalKeys.DEFAULT_CHAOS_MONKEY_ENABLED); if (!enabled) { log.info("Chaos monkey disabled"); + return false; } long monkeyInterval = internals.getTimeRange( http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/155262bf/slider-core/src/main/java/org/apache/slider/server/appmaster/actions/ActionStopSlider.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/actions/ActionStopSlider.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/actions/ActionStopSlider.java index 39ff761..d2f23a2 100644 --- a/slider-core/src/main/java/org/apache/slider/server/appmaster/actions/ActionStopSlider.java +++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/actions/ActionStopSlider.java @@ -20,6 +20,8 @@ package org.apache.slider.server.appmaster.actions; import org.apache.hadoop.yarn.api.records.FinalApplicationStatus; import org.apache.slider.core.exceptions.TriggerClusterTeardownException; +import org.apache.slider.core.main.ExitCodeProvider; +import org.apache.slider.core.main.LauncherExitCodes; import org.apache.slider.server.appmaster.SliderAppMaster; import org.apache.slider.server.appmaster.state.AppState; @@ -91,6 +93,24 @@ public class ActionStopSlider extends AsyncAction { ex.getMessage()); } + /** + * Build from an exception. + * <p> + * If the exception implements + * {@link ExitCodeProvider} then the exit code is extracted from that + * @param ex exception. + */ + public ActionStopSlider(Exception ex) { + super("stop"); + if (ex instanceof ExitCodeProvider) { + setExitCode(((ExitCodeProvider)ex).getExitCode()); + } else { + setExitCode(LauncherExitCodes.EXIT_EXCEPTION_THROWN); + } + setFinalApplicationStatus(FinalApplicationStatus.FAILED); + setMessage(ex.getMessage()); + } + @Override public void execute(SliderAppMaster appMaster, QueueAccess queueService, http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/155262bf/slider-core/src/main/java/org/apache/slider/server/services/workflow/WorkflowSequenceService.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/server/services/workflow/WorkflowSequenceService.java b/slider-core/src/main/java/org/apache/slider/server/services/workflow/WorkflowSequenceService.java index ca07f99..e584e63 100644 --- a/slider-core/src/main/java/org/apache/slider/server/services/workflow/WorkflowSequenceService.java +++ b/slider-core/src/main/java/org/apache/slider/server/services/workflow/WorkflowSequenceService.java @@ -79,6 +79,8 @@ public class WorkflowSequenceService extends AbstractService implements null if one did not finish yet */ private volatile Service previousService; + + private boolean stopIfNoChildServicesAtStartup = true; /** * Construct an instance @@ -133,13 +135,17 @@ public class WorkflowSequenceService extends AbstractService implements return previousService; } + protected void setStopIfNoChildServicesAtStartup(boolean stopIfNoChildServicesAtStartup) { + this.stopIfNoChildServicesAtStartup = stopIfNoChildServicesAtStartup; + } + /** * When started * @throws Exception */ @Override protected void serviceStart() throws Exception { - if (!startNextService()) { + if (!startNextService() && stopIfNoChildServicesAtStartup) { //nothing to start -so stop stop(); } http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/155262bf/slider-core/src/test/groovy/org/apache/slider/agent/standalone/TestStandaloneAMDestroy.groovy ---------------------------------------------------------------------- diff --git a/slider-core/src/test/groovy/org/apache/slider/agent/standalone/TestStandaloneAMDestroy.groovy b/slider-core/src/test/groovy/org/apache/slider/agent/standalone/TestStandaloneAMDestroy.groovy index fa48b70..463c4c0 100644 --- a/slider-core/src/test/groovy/org/apache/slider/agent/standalone/TestStandaloneAMDestroy.groovy +++ b/slider-core/src/test/groovy/org/apache/slider/agent/standalone/TestStandaloneAMDestroy.groovy @@ -94,7 +94,6 @@ class TestStandaloneAMDestroy extends AgentMiniClusterTestBase { describe "END EXPECTED WARNINGS" - describe "destroying $clustername" //now: destroy it @@ -153,6 +152,8 @@ class TestStandaloneAMDestroy extends AgentMiniClusterTestBase { //and try to destroy a completely different cluster just for the fun of it assert 0 == sliderClient.actionDestroy("no-cluster-of-this-name") + + maybeStopCluster(cluster2, "", "Teardown at end of test case", false); } http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/155262bf/slider-core/src/test/groovy/org/apache/slider/agent/standalone/TestStandaloneAMRestart.groovy ---------------------------------------------------------------------- diff --git a/slider-core/src/test/groovy/org/apache/slider/agent/standalone/TestStandaloneAMRestart.groovy b/slider-core/src/test/groovy/org/apache/slider/agent/standalone/TestStandaloneAMRestart.groovy index 1073309..947529c 100644 --- a/slider-core/src/test/groovy/org/apache/slider/agent/standalone/TestStandaloneAMRestart.groovy +++ b/slider-core/src/test/groovy/org/apache/slider/agent/standalone/TestStandaloneAMRestart.groovy @@ -47,11 +47,13 @@ class TestStandaloneAMRestart extends AgentMiniClusterTestBase { // patch the configuration for AM restart YarnConfiguration conf = getRestartableConfiguration(5) + int restartLimit = 3; String clustername = createMiniCluster("", conf, 1, true) ServiceLauncher<SliderClient> launcher = createStandaloneAMWithArgs(clustername, [ - Arguments.ARG_OPTION, SliderXmlConfKeys.KEY_AM_RESTART_LIMIT, "4" + Arguments.ARG_OPTION, SliderXmlConfKeys.KEY_AM_RESTART_LIMIT, + "$restartLimit".toString() ], true, false) @@ -68,21 +70,17 @@ class TestStandaloneAMRestart extends AgentMiniClusterTestBase { diagnosticArgs.yarn = true sliderClient.actionDiagnostic(diagnosticArgs) - ActionAMSuicideArgs args = new ActionAMSuicideArgs() - args.message = "test AM iteration" - args.waittime = 100 - args.exitcode = 1 - sliderClient.actionAmSuicide(clustername, args) - waitWhileClusterLive(sliderClient); - //give yarn some time to notice - sleep(20000) - waitUntilClusterLive(sliderClient, 20000) + int iteration = 1; + killAM(iteration, sliderClient, clustername) + killAM(iteration++, sliderClient, clustername) // app should be running here assert 0 == sliderClient.actionExists(clustername, true) + // kill again & expect it to be considered a failure - sliderClient.actionAmSuicide(clustername, args) + killAM(iteration++, sliderClient, clustername) + report = sliderClient.applicationReport assert report.finalApplicationStatus == FinalApplicationStatus.FAILED @@ -95,6 +93,22 @@ class TestStandaloneAMRestart extends AgentMiniClusterTestBase { assert 0 == clusterActionFreeze(sliderClient, clustername, "force", true) } + public ActionAMSuicideArgs killAM( + int iteration, + SliderClient sliderClient, + String clustername) { + ActionAMSuicideArgs args = new ActionAMSuicideArgs() + args.waittime = 100 + args.exitcode = 1 + args.message = "kill AM iteration #$iteration" + sliderClient.actionAmSuicide(clustername, args) + waitWhileClusterLive(sliderClient); + //give yarn some time to notice + sleep(20000) + waitUntilClusterLive(sliderClient, 20000) + return args + } + /** * Get a restartable configuration * @param restarts http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/155262bf/slider-core/src/test/groovy/org/apache/slider/test/YarnMiniClusterTestBase.groovy ---------------------------------------------------------------------- diff --git a/slider-core/src/test/groovy/org/apache/slider/test/YarnMiniClusterTestBase.groovy b/slider-core/src/test/groovy/org/apache/slider/test/YarnMiniClusterTestBase.groovy index aa82bdb..9595a32 100644 --- a/slider-core/src/test/groovy/org/apache/slider/test/YarnMiniClusterTestBase.groovy +++ b/slider-core/src/test/groovy/org/apache/slider/test/YarnMiniClusterTestBase.groovy @@ -193,8 +193,10 @@ public abstract class YarnMiniClusterTestBase extends ServiceLauncherBaseTest { clustersToTeardown << client; } protected void addToTeardown(ServiceLauncher<SliderClient> launcher) { - SliderClient sliderClient = launcher.service - if (sliderClient) addToTeardown(sliderClient) + SliderClient sliderClient = launcher?.service + if (sliderClient) { + addToTeardown(sliderClient) + } } @@ -208,7 +210,7 @@ public abstract class YarnMiniClusterTestBase extends ServiceLauncherBaseTest { public void stopRunningClusters() { clustersToTeardown.each { SliderClient client -> try { - maybeStopCluster(client, "", "Teardown at end of test case"); + maybeStopCluster(client, "", "Teardown at end of test case", true); } catch (Exception e) { log.warn("While stopping cluster " + e, e); } @@ -748,14 +750,15 @@ public abstract class YarnMiniClusterTestBase extends ServiceLauncherBaseTest { public int maybeStopCluster( SliderClient sliderClient, String clustername, - String message) { + String message, + boolean force = false) { if (sliderClient != null) { if (!clustername) { clustername = sliderClient.deployedClusterName; } //only stop a cluster that exists if (clustername) { - return clusterActionFreeze(sliderClient, clustername, message); + return clusterActionFreeze(sliderClient, clustername, message, force); } } return 0; http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/155262bf/slider-providers/hbase/slider-hbase-provider/src/main/java/org/apache/slider/providers/hbase/HBaseProviderService.java ---------------------------------------------------------------------- diff --git a/slider-providers/hbase/slider-hbase-provider/src/main/java/org/apache/slider/providers/hbase/HBaseProviderService.java b/slider-providers/hbase/slider-hbase-provider/src/main/java/org/apache/slider/providers/hbase/HBaseProviderService.java index dc11050..f75a6c7 100644 --- a/slider-providers/hbase/slider-hbase-provider/src/main/java/org/apache/slider/providers/hbase/HBaseProviderService.java +++ b/slider-providers/hbase/slider-hbase-provider/src/main/java/org/apache/slider/providers/hbase/HBaseProviderService.java @@ -281,29 +281,6 @@ public class HBaseProviderService extends AbstractProviderService } /** - * Run this service - * - * - * @param instanceDefinition component description - * @param confDir local dir with the config - * @param env environment variables above those generated by - * @param execInProgress callback for the event notification - * @throws IOException IO problems - * @throws SliderException anything internal - */ - @Override - public boolean exec(AggregateConf instanceDefinition, - File confDir, - Map<String, String> env, - ProviderCompleted execInProgress) throws - IOException, - SliderException { - - return false; - } - - - /** * This is a validation of the application configuration on the AM. * Here is where things like the existence of keytabs and other * not-seen-client-side properties can be tested, before
