SLIDER-467 final status to be "succeeded": rework how exit codes are propagated, and fix the tests that these changes showed to be incomplete/broken
Project: http://git-wip-us.apache.org/repos/asf/incubator-slider/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-slider/commit/a3944b1c Tree: http://git-wip-us.apache.org/repos/asf/incubator-slider/tree/a3944b1c Diff: http://git-wip-us.apache.org/repos/asf/incubator-slider/diff/a3944b1c Branch: refs/heads/feature/SLIDER-149_Support_a_YARN_service_registry Commit: a3944b1cd44024acd4578a7e49f8abf0e8497f92 Parents: 155262b Author: Steve Loughran <[email protected]> Authored: Sat Oct 4 19:48:56 2014 -0700 Committer: Steve Loughran <[email protected]> Committed: Sat Oct 4 19:48:56 2014 -0700 ---------------------------------------------------------------------- .../slider/core/exceptions/SliderException.java | 6 +-- .../apache/slider/providers/ProviderUtils.java | 6 ++- .../server/appmaster/SliderAppMaster.java | 40 ++++++++++++++------ .../agent/actions/TestActionStatus.groovy | 9 ++--- .../standalone/TestStandaloneAMRestart.groovy | 38 ++++++++++++++----- 5 files changed, 67 insertions(+), 32 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/a3944b1c/slider-core/src/main/java/org/apache/slider/core/exceptions/SliderException.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/core/exceptions/SliderException.java b/slider-core/src/main/java/org/apache/slider/core/exceptions/SliderException.java index 18e3157..7f3134a 100644 --- a/slider-core/src/main/java/org/apache/slider/core/exceptions/SliderException.java +++ b/slider-core/src/main/java/org/apache/slider/core/exceptions/SliderException.java @@ -41,9 +41,9 @@ public class SliderException extends ServiceLaunchException implements /** * Format the exception as you create it - * @param code - * @param message - * @param args + * @param code exit code + * @param message exception message -sprintf formatted + * @param args arguments 
for the formatting */ public SliderException(int code, String message, Object... args) { super(code, String.format(message, args)); http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/a3944b1c/slider-core/src/main/java/org/apache/slider/providers/ProviderUtils.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/providers/ProviderUtils.java b/slider-core/src/main/java/org/apache/slider/providers/ProviderUtils.java index 464aba5..8e77a9c 100644 --- a/slider-core/src/main/java/org/apache/slider/providers/ProviderUtils.java +++ b/slider-core/src/main/java/org/apache/slider/providers/ProviderUtils.java @@ -204,11 +204,13 @@ public class ProviderUtils implements RoleKeys { /** * Validate the node count and heap size values of a node class - * + * <p> + * If max <= 0: min <= count + * If max > 0: min <= count <= max * @param name node class name * @param count requested node count * @param min requested heap size - * @param max + * @param max maximum value. 
* @throws BadCommandArgumentsException if the values are out of range */ public void validateNodeCount(String name, http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/a3944b1c/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java index 7fbea86..5676f3f 100644 --- a/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java +++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java @@ -51,6 +51,7 @@ import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync; import org.apache.hadoop.yarn.client.api.async.NMClientAsync; import org.apache.hadoop.yarn.client.api.async.impl.NMClientAsyncImpl; import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.exceptions.InvalidApplicationMasterRequestException; import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.ipc.YarnRPC; import org.apache.hadoop.yarn.security.AMRMTokenIdentifier; @@ -86,6 +87,7 @@ import org.apache.slider.core.exceptions.SliderException; import org.apache.slider.core.exceptions.SliderInternalStateException; import org.apache.slider.core.exceptions.TriggerClusterTeardownException; import org.apache.slider.core.main.ExitCodeProvider; +import org.apache.slider.core.main.LauncherExitCodes; import org.apache.slider.core.main.RunService; import org.apache.slider.core.main.ServiceLauncher; import org.apache.slider.core.persist.ConfTreeSerDeser; @@ -335,7 +337,7 @@ public class SliderAppMaster extends AbstractSliderLaunchedService * The stop request received...the exit details are extracted * from this */ - private ActionStopSlider stopAction; + private volatile ActionStopSlider stopAction; 
@SuppressWarnings("FieldAccessedSynchronizedAndUnsynchronized") private RoleLaunchService launchService; @@ -357,6 +359,7 @@ public class SliderAppMaster extends AbstractSliderLaunchedService private String agentOpsUrl; private String agentStatusUrl; private FsDelegationTokenManager fsDelegationTokenManager; + private RegisterApplicationMasterResponse amRegistrationData; /** * Service Constructor @@ -690,12 +693,12 @@ public class SliderAppMaster extends AbstractSliderLaunchedService // address = SliderUtils.getRmSchedulerAddress(asyncRMClient.getConfig()); log.info("Connecting to RM at {},address tracking URL={}", appMasterRpcPort, appMasterTrackingUrl); - RegisterApplicationMasterResponse response = asyncRMClient + amRegistrationData = asyncRMClient .registerApplicationMaster(appMasterHostname, appMasterRpcPort, appMasterTrackingUrl); Resource maxResources = - response.getMaximumResourceCapability(); + amRegistrationData.getMaximumResourceCapability(); containerMaxMemory = maxResources.getMemory(); containerMaxCores = maxResources.getVirtualCores(); appState.setContainerLimits(maxResources.getMemory(), @@ -707,8 +710,8 @@ public class SliderAppMaster extends AbstractSliderLaunchedService boolean securityEnabled = UserGroupInformation.isSecurityEnabled(); if (securityEnabled) { secretManager.setMasterKey( - response.getClientToAMTokenMasterKey().array()); - applicationACLs = response.getApplicationACLs(); + amRegistrationData.getClientToAMTokenMasterKey().array()); + applicationACLs = amRegistrationData.getApplicationACLs(); //tell the server what the ACLs are rpcService.getServer().refreshServiceAcl(serviceConf, @@ -717,7 +720,7 @@ public class SliderAppMaster extends AbstractSliderLaunchedService // extract container list - liveContainers = response.getContainersFromPreviousAttempts(); + liveContainers = amRegistrationData.getContainersFromPreviousAttempts(); //now validate the installation Configuration providerConf = @@ -851,7 +854,8 @@ public class 
SliderAppMaster extends AbstractSliderLaunchedService //now block waiting to be told to exit the process waitForAMCompletionSignal(); } catch(Exception e) { - stopAction = new ActionStopSlider(e); + log.error("Exception : {}", e, e); + onAMStop(new ActionStopSlider(e)); } //shutdown time return finish(); @@ -1117,6 +1121,13 @@ public class SliderAppMaster extends AbstractSliderLaunchedService log.debug("Stopped forked process: exit code={}", exitCode); } + // make sure the AM is actually registered. If not, there's no point + // trying to unregister it + if (amRegistrationData == null) { + log.info("Application attempt not yet registered; skipping unregistration"); + return exitCode; + } + //stop any launches in progress launchService.stop(); @@ -1137,6 +1148,9 @@ public class SliderAppMaster extends AbstractSliderLaunchedService */ } catch (IOException e) { log.info("Failed to unregister application: " + e, e); + } catch (InvalidApplicationMasterRequestException e) { + log.info("Application not found in YARN application list;" + + " it may have been terminated/YARN shutdown in progress: " + e, e); } catch (YarnException e) { log.info("Failed to unregister application: " + e, e); } @@ -1451,10 +1465,13 @@ public class SliderAppMaster extends AbstractSliderLaunchedService YarnException { onRpcCall("stopCluster()"); String message = request.getMessage(); + if (message == null) { + message = "application frozen by client"; + } ActionStopSlider stopSlider = new ActionStopSlider(message, 1000, TimeUnit.MILLISECONDS, - 0, + LauncherExitCodes.EXIT_SUCCESS, FinalApplicationStatus.SUCCEEDED, message); log.info("SliderAppMasterApi.stopCluster: {}", stopSlider); @@ -1471,7 +1488,8 @@ public class SliderAppMaster extends AbstractSliderLaunchedService ConfTreeSerDeser confTreeSerDeser = new ConfTreeSerDeser(); ConfTree updatedResources = confTreeSerDeser.fromJson(payload); flexCluster(updatedResources); - return 
Messages.FlexClusterResponseProto.newBuilder().setResponse(true).build(); + return Messages.FlexClusterResponseProto.newBuilder().setResponse( + true).build(); } @Override //SliderClusterProtocol @@ -1672,9 +1690,9 @@ public class SliderAppMaster extends AbstractSliderLaunchedService try { flexCluster(getInstanceDefinition().getResources()); } catch (Exception e) { - //this may happen in a separate thread, so the ability to act is limited + // cluster flex failure: log log.error("Failed to flex cluster nodes: {}", e, e); - //declare a failure + // then what? exit queue(new ActionStopSlider(e)); } } http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/a3944b1c/slider-core/src/test/groovy/org/apache/slider/agent/actions/TestActionStatus.groovy ---------------------------------------------------------------------- diff --git a/slider-core/src/test/groovy/org/apache/slider/agent/actions/TestActionStatus.groovy b/slider-core/src/test/groovy/org/apache/slider/agent/actions/TestActionStatus.groovy index 9fcdb17..fea07af 100644 --- a/slider-core/src/test/groovy/org/apache/slider/agent/actions/TestActionStatus.groovy +++ b/slider-core/src/test/groovy/org/apache/slider/agent/actions/TestActionStatus.groovy @@ -44,11 +44,6 @@ import org.junit.Test @Slf4j class TestActionStatus extends AgentMiniClusterTestBase { - @Before - public void setup() { - super.setup() - createMiniCluster("", configuration, 1, false) - } /** * This is a test suite to run the tests against a single cluster instance @@ -58,6 +53,8 @@ class TestActionStatus extends AgentMiniClusterTestBase { @Test public void testSuite() throws Throwable { + super.setup() + createMiniCluster("testactionstatus", configuration, 1, true) testStatusLiveCluster() testStatusMissingCluster() } @@ -116,7 +113,7 @@ class TestActionStatus extends AgentMiniClusterTestBase { assert statusLauncher.serviceExitCode == 0 //status to a file - File tfile = new File("target/" + clustername + "/status.json") + File tfile = new 
File("target/$clustername-status.json") statusArgs.output = tfile.absolutePath sliderClient.actionStatus(clustername, statusArgs) def text = tfile.text http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/a3944b1c/slider-core/src/test/groovy/org/apache/slider/agent/standalone/TestStandaloneAMRestart.groovy ---------------------------------------------------------------------- diff --git a/slider-core/src/test/groovy/org/apache/slider/agent/standalone/TestStandaloneAMRestart.groovy b/slider-core/src/test/groovy/org/apache/slider/agent/standalone/TestStandaloneAMRestart.groovy index 947529c..d8f7141 100644 --- a/slider-core/src/test/groovy/org/apache/slider/agent/standalone/TestStandaloneAMRestart.groovy +++ b/slider-core/src/test/groovy/org/apache/slider/agent/standalone/TestStandaloneAMRestart.groovy @@ -52,8 +52,8 @@ class TestStandaloneAMRestart extends AgentMiniClusterTestBase { ServiceLauncher<SliderClient> launcher = createStandaloneAMWithArgs(clustername, [ - Arguments.ARG_OPTION, SliderXmlConfKeys.KEY_AM_RESTART_LIMIT, - "$restartLimit".toString() + Arguments.ARG_DEFINE, + SliderXmlConfKeys.KEY_AM_RESTART_LIMIT + "=" + restartLimit ], true, false) @@ -71,15 +71,16 @@ class TestStandaloneAMRestart extends AgentMiniClusterTestBase { sliderClient.actionDiagnostic(diagnosticArgs) int iteration = 1; - killAM(iteration, sliderClient, clustername) + killAMAndWaitForRestart(sliderClient, iteration, clustername) - killAM(iteration++, sliderClient, clustername) + killAMAndWaitForRestart(sliderClient, iteration++, clustername) // app should be running here assert 0 == sliderClient.actionExists(clustername, true) // kill again & expect it to be considered a failure - killAM(iteration++, sliderClient, clustername) + killAmAndWaitForDeath(sliderClient, iteration++, clustername) + sleep(20000) report = sliderClient.applicationReport assert report.finalApplicationStatus == FinalApplicationStatus.FAILED @@ -93,9 +94,29 @@ class TestStandaloneAMRestart extends 
AgentMiniClusterTestBase { assert 0 == clusterActionFreeze(sliderClient, clustername, "force", true) } - public ActionAMSuicideArgs killAM( - int iteration, + /** + * Kill an AM. take an iteration count for the message sent to the + * AM (hence its logs) + * @param iteration + * @param sliderClient + * @param clustername + * @return + */ + public ActionAMSuicideArgs killAMAndWaitForRestart( + SliderClient sliderClient, int iteration, String clustername) { + ActionAMSuicideArgs args = killAmAndWaitForDeath( + sliderClient, + iteration, + clustername) + //give yarn some time to notice + sleep(20000) + waitUntilClusterLive(sliderClient, 20000) + return args + } + + public ActionAMSuicideArgs killAmAndWaitForDeath( SliderClient sliderClient, + int iteration, String clustername) { ActionAMSuicideArgs args = new ActionAMSuicideArgs() args.waittime = 100 @@ -103,9 +124,6 @@ class TestStandaloneAMRestart extends AgentMiniClusterTestBase { args.message = "kill AM iteration #$iteration" sliderClient.actionAmSuicide(clustername, args) waitWhileClusterLive(sliderClient); - //give yarn some time to notice - sleep(20000) - waitUntilClusterLive(sliderClient, 20000) return args }
