Repository: incubator-slider Updated Branches: refs/heads/develop bf1b41d70 -> 134ef53f9
SLIDER-1209 Provide information on whether a slider app was killed / stopped via a request (exitReason) Project: http://git-wip-us.apache.org/repos/asf/incubator-slider/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-slider/commit/134ef53f Tree: http://git-wip-us.apache.org/repos/asf/incubator-slider/tree/134ef53f Diff: http://git-wip-us.apache.org/repos/asf/incubator-slider/diff/134ef53f Branch: refs/heads/develop Commit: 134ef53f9a0330e0c007ebfc620f1ea40017e54e Parents: bf1b41d Author: Gour Saha <gourks...@apache.org> Authored: Sat Mar 4 21:10:06 2017 -0800 Committer: Gour Saha <gourks...@apache.org> Committed: Sat Mar 4 21:10:06 2017 -0800 ---------------------------------------------------------------------- .../org/apache/slider/api/SliderExitReason.java | 27 +++++++++++++ .../api/types/ApplicationDiagnostics.java | 10 +++++ .../server/appmaster/SliderAppMaster.java | 40 +++++++++++++------- .../appmaster/actions/ActionStopSlider.java | 10 +++++ .../server/appmaster/rpc/SliderIPCService.java | 2 + .../application/actions/RestActionStop.java | 2 + .../lifecycle/AgentClusterLifecycleIT.groovy | 9 +++++ 7 files changed, 87 insertions(+), 13 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/134ef53f/slider-core/src/main/java/org/apache/slider/api/SliderExitReason.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/api/SliderExitReason.java b/slider-core/src/main/java/org/apache/slider/api/SliderExitReason.java new file mode 100644 index 0000000..de698c9 --- /dev/null +++ b/slider-core/src/main/java/org/apache/slider/api/SliderExitReason.java @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.slider.api; + +/** + * A high level reason for an application failure. For most of the cases it is + * difficult to decipher if the Slider app failed due to an application error. + * This gap can be bridged a little better when we get to SLIDER-1208. + * + */ +public enum SliderExitReason { + STOP_COMMAND_ISSUED, SLIDER_AM_ERROR, SLIDER_AGENT_ERROR, CHAOS_MONKEY, YARN_ERROR, APP_ERROR; +} http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/134ef53f/slider-core/src/main/java/org/apache/slider/api/types/ApplicationDiagnostics.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/api/types/ApplicationDiagnostics.java b/slider-core/src/main/java/org/apache/slider/api/types/ApplicationDiagnostics.java index c28c11b..f609017 100644 --- a/slider-core/src/main/java/org/apache/slider/api/types/ApplicationDiagnostics.java +++ b/slider-core/src/main/java/org/apache/slider/api/types/ApplicationDiagnostics.java @@ -25,6 +25,7 @@ import java.util.Map; import java.util.Set; import org.apache.hadoop.yarn.api.records.FinalApplicationStatus; +import org.apache.slider.api.SliderExitReason; import org.codehaus.jackson.JsonGenerationException; import org.codehaus.jackson.JsonParseException; import org.codehaus.jackson.annotate.JsonIgnore; @@ -46,6 +47,7 @@ public class ApplicationDiagnostics { private Map<String, ContainerInformation> containersMap = new HashMap<>(); private FinalApplicationStatus finalStatus; private String finalMessage; + private SliderExitReason exitReason; private Set<ContainerInformation> containers = new HashSet<>(); private Set<String> recentFailedContainers = new HashSet<>(); @@ -98,6 +100,14 @@ public class ApplicationDiagnostics { this.finalMessage = finalMessage; } + public SliderExitReason getExitReason() { + return exitReason; + } + + public void setExitReason(SliderExitReason exitReason) { + this.exitReason = exitReason; + } + @Override public String toString() { try { http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/134ef53f/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java index f076e87..c33f7ac 100644 --- a/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java +++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java @@ -90,6 +90,7 @@ import org.apache.slider.api.ClusterDescription; import org.apache.slider.api.InternalKeys; import org.apache.slider.api.ResourceKeys; import org.apache.slider.api.RoleKeys; +import org.apache.slider.api.SliderExitReason; import org.apache.slider.api.StatusKeys; import org.apache.slider.api.proto.SliderClusterAPI; import org.apache.slider.api.types.ApplicationDiagnostics; @@ -1017,7 +1018,9 @@ public class SliderAppMaster extends AbstractSliderLaunchedService log.error("Exception : {}", e, e); // call the AM stop command as if it had been queued (but without // going via the queue, which may not have started - onAMStop(new ActionStopSlider(e)); + ActionStopSlider stopSlider = new ActionStopSlider(e); + stopSlider.setExitReason(SliderExitReason.SLIDER_AM_ERROR); + onAMStop(stopSlider); } //shutdown time return finish(); @@ -1617,6 +1620,7 @@ public class SliderAppMaster extends AbstractSliderLaunchedService ApplicationDiagnostics appDiagnostics = getApplicationDiagnostics(); appDiagnostics.setFinalStatus(appStatus); appDiagnostics.setFinalMessage(finalMessage); + appDiagnostics.setExitReason(stopAction.getExitReason()); String appMessage = appDiagnostics.toString(); try { log.info("Unregistering AM status={} message={}", appStatus, appMessage); @@ -1961,7 +1965,9 @@ public class SliderAppMaster extends AbstractSliderLaunchedService } catch (TriggerClusterTeardownException e) { //App state has decided that it is time to exit log.error("Cluster teardown triggered {}", e, e); - queue(new ActionStopSlider(e)); + ActionStopSlider stopSlider = new ActionStopSlider(e); + stopSlider.setExitReason(SliderExitReason.SLIDER_AM_ERROR); + queue(stopSlider); } } @@ -2028,10 +2034,10 @@ public class SliderAppMaster extends AbstractSliderLaunchedService @Override //AMRMClientAsync public void onShutdownRequest() { LOG_YARN.info("Shutdown Request received"); - signalAMComplete(new ActionStopSlider("stop", - EXIT_SUCCESS, - FinalApplicationStatus.SUCCEEDED, - "Shutdown requested from RM")); + ActionStopSlider stopSlider = new ActionStopSlider("stop", EXIT_SUCCESS, + FinalApplicationStatus.SUCCEEDED, "Shutdown requested from RM"); + stopSlider.setExitReason(SliderExitReason.YARN_ERROR); + signalAMComplete(stopSlider); } /** @@ -2069,9 +2075,11 @@ public class SliderAppMaster extends AbstractSliderLaunchedService if (e instanceof InvalidResourceRequestException) { // stop the cluster LOG_YARN.error("AMRMClientAsync.onError() received {}", e, e); - signalAMComplete(new ActionStopSlider("stop", EXIT_EXCEPTION_THROWN, - FinalApplicationStatus.FAILED, - SliderUtils.extractFirstLine(e.getLocalizedMessage()))); + ActionStopSlider stopSlider = new ActionStopSlider("stop", + EXIT_EXCEPTION_THROWN, FinalApplicationStatus.FAILED, + SliderUtils.extractFirstLine(e.getLocalizedMessage())); + stopSlider.setExitReason(SliderExitReason.APP_ERROR); + signalAMComplete(stopSlider); } else if (e instanceof InvalidApplicationMasterRequestException) { // halt the AM LOG_YARN.error("AMRMClientAsync.onError() received {}", e, e); @@ -2165,7 +2173,9 @@ public class SliderAppMaster extends AbstractSliderLaunchedService // cluster flex failure: log log.error("Failed to flex cluster nodes: {}", e, e); // then what? exit - queue(new ActionStopSlider(e)); + ActionStopSlider stopSlider = new ActionStopSlider(e); + stopSlider.setExitReason(SliderExitReason.SLIDER_AM_ERROR); + queue(stopSlider); } } @@ -2217,6 +2227,7 @@ public class SliderAppMaster extends AbstractSliderLaunchedService mappedProcessExitCode, FinalApplicationStatus.FAILED, reason); + stop.setExitReason(SliderExitReason.YARN_ERROR); //this wasn't expected: the process finished early spawnedProcessExitedBeforeShutdownTriggered = true; log.info( @@ -2392,9 +2403,11 @@ public class SliderAppMaster extends AbstractSliderLaunchedService if (exception instanceof ExitCodeProvider) { exitCode = ((ExitCodeProvider) exception).getExitCode(); } - signalAMComplete( - new ActionStopSlider("stop", exitCode, FinalApplicationStatus.FAILED, - SliderUtils.extractFirstLine(exception.getLocalizedMessage()))); + ActionStopSlider stopSlider = new ActionStopSlider("stop", exitCode, + FinalApplicationStatus.FAILED, + SliderUtils.extractFirstLine(exception.getLocalizedMessage())); + stopSlider.setExitReason(SliderExitReason.SLIDER_AM_ERROR); + signalAMComplete(stopSlider); } } @@ -2451,6 +2464,7 @@ public class SliderAppMaster extends AbstractSliderLaunchedService LauncherExitCodes.EXIT_FALSE, FinalApplicationStatus.FAILED, E_TRIGGERED_LAUNCH_FAILURE); + stop.setExitReason(SliderExitReason.CHAOS_MONKEY); queue(stop); } http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/134ef53f/slider-core/src/main/java/org/apache/slider/server/appmaster/actions/ActionStopSlider.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/actions/ActionStopSlider.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/actions/ActionStopSlider.java index 055cea5..6d9e466 100644 --- a/slider-core/src/main/java/org/apache/slider/server/appmaster/actions/ActionStopSlider.java +++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/actions/ActionStopSlider.java @@ -19,6 +19,7 @@ package org.apache.slider.server.appmaster.actions; import org.apache.hadoop.yarn.api.records.FinalApplicationStatus; +import org.apache.slider.api.SliderExitReason; import org.apache.slider.core.exceptions.ExceptionConverter; import org.apache.slider.core.exceptions.TriggerClusterTeardownException; import org.apache.slider.core.main.ExitCodeProvider; @@ -37,6 +38,7 @@ public class ActionStopSlider extends AsyncAction { private FinalApplicationStatus finalApplicationStatus; private String message; private final Exception ex; + private SliderExitReason exitReason; /** * Simple constructor @@ -159,4 +161,12 @@ public class ActionStopSlider extends AsyncAction { public Exception getEx() { return ex; } + + public SliderExitReason getExitReason() { + return exitReason; + } + + public void setExitReason(SliderExitReason exitReason) { + this.exitReason = exitReason; + } } http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/134ef53f/slider-core/src/main/java/org/apache/slider/server/appmaster/rpc/SliderIPCService.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/rpc/SliderIPCService.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/rpc/SliderIPCService.java index fda23aa..1496ed0 100644 --- a/slider-core/src/main/java/org/apache/slider/server/appmaster/rpc/SliderIPCService.java +++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/rpc/SliderIPCService.java @@ -26,6 +26,7 @@ import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.exceptions.YarnRuntimeException; import org.apache.slider.api.ClusterDescription; import org.apache.slider.api.SliderClusterProtocol; +import org.apache.slider.api.SliderExitReason; import org.apache.slider.api.proto.Messages; import org.apache.slider.api.types.ApplicationLivenessInformation; import org.apache.slider.api.types.ComponentInformation; @@ -184,6 +185,7 @@ public class SliderIPCService extends AbstractService LauncherExitCodes.EXIT_SUCCESS, FinalApplicationStatus.SUCCEEDED, message); + stopSlider.setExitReason(SliderExitReason.STOP_COMMAND_ISSUED); log.info("SliderAppMasterApi.stopCluster: {}", stopSlider); schedule(stopSlider); return Messages.StopClusterResponseProto.getDefaultInstance(); http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/134ef53f/slider-core/src/main/java/org/apache/slider/server/appmaster/web/rest/application/actions/RestActionStop.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/web/rest/application/actions/RestActionStop.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/web/rest/application/actions/RestActionStop.java index 544f589..02e2295 100644 --- a/slider-core/src/main/java/org/apache/slider/server/appmaster/web/rest/application/actions/RestActionStop.java +++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/web/rest/application/actions/RestActionStop.java @@ -19,6 +19,7 @@ package org.apache.slider.server.appmaster.web.rest.application.actions; import org.apache.hadoop.yarn.api.records.FinalApplicationStatus; +import org.apache.slider.api.SliderExitReason; import org.apache.slider.core.main.LauncherExitCodes; import org.apache.slider.server.appmaster.actions.ActionStopSlider; import org.apache.slider.server.appmaster.web.WebAppApi; @@ -59,6 +60,7 @@ public class RestActionStop { LauncherExitCodes.EXIT_SUCCESS, FinalApplicationStatus.SUCCEEDED, text); + stopSlider.setExitReason(SliderExitReason.STOP_COMMAND_ISSUED); log.info("SliderAppMasterApi.stopCluster: {}", stopSlider); slider.getQueues().schedule(stopSlider); http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/134ef53f/slider-funtest/src/test/groovy/org/apache/slider/funtest/lifecycle/AgentClusterLifecycleIT.groovy ---------------------------------------------------------------------- diff --git a/slider-funtest/src/test/groovy/org/apache/slider/funtest/lifecycle/AgentClusterLifecycleIT.groovy b/slider-funtest/src/test/groovy/org/apache/slider/funtest/lifecycle/AgentClusterLifecycleIT.groovy index faeb0a1..e8e2791 100644 --- a/slider-funtest/src/test/groovy/org/apache/slider/funtest/lifecycle/AgentClusterLifecycleIT.groovy +++ b/slider-funtest/src/test/groovy/org/apache/slider/funtest/lifecycle/AgentClusterLifecycleIT.groovy @@ -22,12 +22,15 @@ import groovy.transform.CompileStatic import groovy.util.logging.Slf4j import org.apache.hadoop.yarn.api.records.YarnApplicationState import org.apache.slider.api.ClusterDescription +import org.apache.slider.api.SliderExitReason; import org.apache.slider.api.StatusKeys +import org.apache.slider.api.types.ApplicationDiagnostics; import org.apache.slider.client.SliderClient import org.apache.slider.common.SliderExitCodes import org.apache.slider.common.SliderXmlConfKeys import org.apache.slider.common.params.Arguments import org.apache.slider.common.params.SliderActions +import org.apache.slider.core.launch.SerializedApplicationReport import org.apache.slider.funtest.ResourcePaths import org.apache.slider.funtest.framework.AgentCommandTestBase import org.apache.slider.funtest.framework.FuntestProperties @@ -145,6 +148,12 @@ public class AgentClusterLifecycleIT extends AgentCommandTestBase // should be in finished state, as this was a clean shutdown assertInYarnState(appId, YarnApplicationState.FINISHED) + // Get diagnostics and validate that exitReason is STOP_COMMAND_ISSUED + SerializedApplicationReport appReport = lookupApplication(appId) + log.info("Application Report {}", appReport); + ApplicationDiagnostics appDiagnostics = ApplicationDiagnostics.fromJson(appReport.diagnostics) + assert appDiagnostics.exitReason == SliderExitReason.STOP_COMMAND_ISSUED + //cluster exists if you don't want it to be live exists(0, CLUSTER, false) //condition returns false if it is required to be live