SLIDER-856 pre-emption is not treated as failure; special handling for node failures (not treated as role failures), and container limits exceeded (not treated as node failures). Counters in RoleStatus and NodeEntry for this.
Project: http://git-wip-us.apache.org/repos/asf/incubator-slider/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-slider/commit/0a50c48e Tree: http://git-wip-us.apache.org/repos/asf/incubator-slider/tree/0a50c48e Diff: http://git-wip-us.apache.org/repos/asf/incubator-slider/diff/0a50c48e Branch: refs/heads/develop Commit: 0a50c48e713a2df27efd226215c9f748e4335708 Parents: 3d61bed Author: Steve Loughran <[email protected]> Authored: Mon May 4 23:08:32 2015 +0100 Committer: Steve Loughran <[email protected]> Committed: Mon May 4 23:08:32 2015 +0100 ---------------------------------------------------------------------- .../apache/slider/api/ClusterDescription.java | 28 +- .../java/org/apache/slider/api/RoleKeys.java | 17 +- .../java/org/apache/slider/api/StatusKeys.java | 3 + .../org/apache/slider/api/proto/Messages.java | 342 +++++++++++++++++-- .../slider/api/proto/RestTypeMarshalling.java | 9 +- .../slider/api/types/ComponentInformation.java | 14 +- .../server/appmaster/SliderAppMaster.java | 2 - .../slider/server/appmaster/state/AppState.java | 55 ++- .../appmaster/state/ContainerOutcome.java | 61 ++++ .../server/appmaster/state/NodeEntry.java | 36 +- .../server/appmaster/state/RoleHistory.java | 19 +- .../server/appmaster/state/RoleStatus.java | 74 +++- .../src/main/proto/SliderClusterMessages.proto | 5 +- .../TestMockAppStateContainerFailure.groovy | 132 ++++++- .../TestRoleHistoryContainerEvents.groovy | 10 +- ...stRoleHistoryFindNodesForNewInstances.groovy | 10 +- .../model/mock/BaseMockAppStateTest.groovy | 5 +- .../appmaster/web/view/TestIndexBlock.groovy | 6 +- 18 files changed, 725 insertions(+), 103 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/main/java/org/apache/slider/api/ClusterDescription.java ---------------------------------------------------------------------- diff --git 
a/slider-core/src/main/java/org/apache/slider/api/ClusterDescription.java b/slider-core/src/main/java/org/apache/slider/api/ClusterDescription.java index 7db7e7f..025bd32 100644 --- a/slider-core/src/main/java/org/apache/slider/api/ClusterDescription.java +++ b/slider-core/src/main/java/org/apache/slider/api/ClusterDescription.java @@ -594,7 +594,7 @@ public class ClusterDescription implements Cloneable { } /** - * Get a role opt; use {@link Integer#decode(String)} so as to take hex + * Get an integer role option; use {@link Integer#decode(String)} so as to take hex * oct and bin values too. * * @param role role to get from @@ -609,6 +609,21 @@ public class ClusterDescription implements Cloneable { } /** + * Get an integer role option; use {@link Integer#decode(String)} so as to take hex + * oct and bin values too. + * + * @param role role to get from + * @param option option name + * @param defVal default value + * @return parsed value + * @throws NumberFormatException if the role could not be parsed. + */ + public long getRoleOptLong(String role, String option, long defVal) { + String val = getRoleOpt(role, option, Long.toString(defVal)); + return Long.decode(val); + } + + /** * Set a role option, creating the role if necessary * @param role role name * @param option option name @@ -630,6 +645,17 @@ public class ClusterDescription implements Cloneable { } /** + * Set a role option of any object, using its string value. + * This works for (Boxed) numeric values as well as other objects + * @param role role name + * @param option option name + * @param val non-null value + */ + public void setRoleOpt(String role, String option, Object val) { + setRoleOpt(role, option, val.toString()); + } + + /** * Get the value of a role requirement (cores, RAM, etc). 
* These are returned as integers, but there is special handling of the * string {@link ResourceKeys#YARN_RESOURCE_MAX}, which triggers http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/main/java/org/apache/slider/api/RoleKeys.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/api/RoleKeys.java b/slider-core/src/main/java/org/apache/slider/api/RoleKeys.java index 0f0fb8c..4512354 100644 --- a/slider-core/src/main/java/org/apache/slider/api/RoleKeys.java +++ b/slider-core/src/main/java/org/apache/slider/api/RoleKeys.java @@ -45,11 +45,26 @@ public interface RoleKeys { String ROLE_RELEASING_INSTANCES = "role.releasing.instances"; /** - * Status report: number currently being released: {@value} + * Status report: total number that have failed: {@value} */ String ROLE_FAILED_INSTANCES = "role.failed.instances"; /** + * Status report: number that have failed recently: {@value} + */ + String ROLE_FAILED_RECENTLY_INSTANCES = "role.failed.recently.instances"; + + /** + * Status report: number that have failed for node-related issues: {@value} + */ + String ROLE_NODE_FAILED_INSTANCES = "role.failed.node.instances"; + + /** + * Status report: number that been pre-empted: {@value} + */ + String ROLE_PREEMPTED_INSTANCES = "role.failed.preempted.instances"; + + /** * Status report: number currently being released: {@value} */ String ROLE_FAILED_STARTING_INSTANCES = "role.failed.starting.instances"; http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/main/java/org/apache/slider/api/StatusKeys.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/api/StatusKeys.java b/slider-core/src/main/java/org/apache/slider/api/StatusKeys.java index ef68aad..6a0a2fa 100644 --- a/slider-core/src/main/java/org/apache/slider/api/StatusKeys.java +++ 
b/slider-core/src/main/java/org/apache/slider/api/StatusKeys.java @@ -27,6 +27,9 @@ public interface StatusKeys { String STATISTICS_CONTAINERS_COMPLETED = "containers.completed"; String STATISTICS_CONTAINERS_DESIRED = "containers.desired"; String STATISTICS_CONTAINERS_FAILED = "containers.failed"; + String STATISTICS_CONTAINERS_FAILED_RECENTLY = "containers.failed.recently"; + String STATISTICS_CONTAINERS_FAILED_NODE = "containers.failed.node"; + String STATISTICS_CONTAINERS_PREEMPTED = "containers.failed.preempted"; String STATISTICS_CONTAINERS_LIVE = "containers.live"; String STATISTICS_CONTAINERS_REQUESTED = "containers.requested"; String STATISTICS_CONTAINERS_STARTED = "containers.start.started"; http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/main/java/org/apache/slider/api/proto/Messages.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/api/proto/Messages.java b/slider-core/src/main/java/org/apache/slider/api/proto/Messages.java index a6c8da8..6c92f46 100644 --- a/slider-core/src/main/java/org/apache/slider/api/proto/Messages.java +++ b/slider-core/src/main/java/org/apache/slider/api/proto/Messages.java @@ -15070,6 +15070,36 @@ public final class Messages { */ com.google.protobuf.ByteString getContainersBytes(int index); + + // optional int32 failedRecently = 15; + /** + * <code>optional int32 failedRecently = 15;</code> + */ + boolean hasFailedRecently(); + /** + * <code>optional int32 failedRecently = 15;</code> + */ + int getFailedRecently(); + + // optional int32 nodeFailed = 16; + /** + * <code>optional int32 nodeFailed = 16;</code> + */ + boolean hasNodeFailed(); + /** + * <code>optional int32 nodeFailed = 16;</code> + */ + int getNodeFailed(); + + // optional int32 preempted = 17; + /** + * <code>optional int32 preempted = 17;</code> + */ + boolean hasPreempted(); + /** + * <code>optional int32 preempted = 17;</code> + */ + int 
getPreempted(); } /** * Protobuf type {@code org.apache.slider.api.ComponentInformationProto} @@ -15200,6 +15230,21 @@ public final class Messages { containers_.add(input.readBytes()); break; } + case 120: { + bitField0_ |= 0x00002000; + failedRecently_ = input.readInt32(); + break; + } + case 128: { + bitField0_ |= 0x00004000; + nodeFailed_ = input.readInt32(); + break; + } + case 136: { + bitField0_ |= 0x00008000; + preempted_ = input.readInt32(); + break; + } } } } catch (com.google.protobuf.InvalidProtocolBufferException e) { @@ -15535,6 +15580,54 @@ public final class Messages { return containers_.getByteString(index); } + // optional int32 failedRecently = 15; + public static final int FAILEDRECENTLY_FIELD_NUMBER = 15; + private int failedRecently_; + /** + * <code>optional int32 failedRecently = 15;</code> + */ + public boolean hasFailedRecently() { + return ((bitField0_ & 0x00002000) == 0x00002000); + } + /** + * <code>optional int32 failedRecently = 15;</code> + */ + public int getFailedRecently() { + return failedRecently_; + } + + // optional int32 nodeFailed = 16; + public static final int NODEFAILED_FIELD_NUMBER = 16; + private int nodeFailed_; + /** + * <code>optional int32 nodeFailed = 16;</code> + */ + public boolean hasNodeFailed() { + return ((bitField0_ & 0x00004000) == 0x00004000); + } + /** + * <code>optional int32 nodeFailed = 16;</code> + */ + public int getNodeFailed() { + return nodeFailed_; + } + + // optional int32 preempted = 17; + public static final int PREEMPTED_FIELD_NUMBER = 17; + private int preempted_; + /** + * <code>optional int32 preempted = 17;</code> + */ + public boolean hasPreempted() { + return ((bitField0_ & 0x00008000) == 0x00008000); + } + /** + * <code>optional int32 preempted = 17;</code> + */ + public int getPreempted() { + return preempted_; + } + private void initFields() { name_ = ""; priority_ = 0; @@ -15550,6 +15643,9 @@ public final class Messages { failureMessage_ = ""; placementPolicy_ = 0; containers_ = 
com.google.protobuf.LazyStringArrayList.EMPTY; + failedRecently_ = 0; + nodeFailed_ = 0; + preempted_ = 0; } private byte memoizedIsInitialized = -1; public final boolean isInitialized() { @@ -15605,6 +15701,15 @@ public final class Messages { for (int i = 0; i < containers_.size(); i++) { output.writeBytes(14, containers_.getByteString(i)); } + if (((bitField0_ & 0x00002000) == 0x00002000)) { + output.writeInt32(15, failedRecently_); + } + if (((bitField0_ & 0x00004000) == 0x00004000)) { + output.writeInt32(16, nodeFailed_); + } + if (((bitField0_ & 0x00008000) == 0x00008000)) { + output.writeInt32(17, preempted_); + } getUnknownFields().writeTo(output); } @@ -15675,6 +15780,18 @@ public final class Messages { size += dataSize; size += 1 * getContainersList().size(); } + if (((bitField0_ & 0x00002000) == 0x00002000)) { + size += com.google.protobuf.CodedOutputStream + .computeInt32Size(15, failedRecently_); + } + if (((bitField0_ & 0x00004000) == 0x00004000)) { + size += com.google.protobuf.CodedOutputStream + .computeInt32Size(16, nodeFailed_); + } + if (((bitField0_ & 0x00008000) == 0x00008000)) { + size += com.google.protobuf.CodedOutputStream + .computeInt32Size(17, preempted_); + } size += getUnknownFields().getSerializedSize(); memoizedSerializedSize = size; return size; @@ -15765,6 +15882,21 @@ public final class Messages { } result = result && getContainersList() .equals(other.getContainersList()); + result = result && (hasFailedRecently() == other.hasFailedRecently()); + if (hasFailedRecently()) { + result = result && (getFailedRecently() + == other.getFailedRecently()); + } + result = result && (hasNodeFailed() == other.hasNodeFailed()); + if (hasNodeFailed()) { + result = result && (getNodeFailed() + == other.getNodeFailed()); + } + result = result && (hasPreempted() == other.hasPreempted()); + if (hasPreempted()) { + result = result && (getPreempted() + == other.getPreempted()); + } result = result && 
getUnknownFields().equals(other.getUnknownFields()); return result; @@ -15834,6 +15966,18 @@ public final class Messages { hash = (37 * hash) + CONTAINERS_FIELD_NUMBER; hash = (53 * hash) + getContainersList().hashCode(); } + if (hasFailedRecently()) { + hash = (37 * hash) + FAILEDRECENTLY_FIELD_NUMBER; + hash = (53 * hash) + getFailedRecently(); + } + if (hasNodeFailed()) { + hash = (37 * hash) + NODEFAILED_FIELD_NUMBER; + hash = (53 * hash) + getNodeFailed(); + } + if (hasPreempted()) { + hash = (37 * hash) + PREEMPTED_FIELD_NUMBER; + hash = (53 * hash) + getPreempted(); + } hash = (29 * hash) + getUnknownFields().hashCode(); memoizedHashCode = hash; return hash; @@ -15976,6 +16120,12 @@ public final class Messages { bitField0_ = (bitField0_ & ~0x00001000); containers_ = com.google.protobuf.LazyStringArrayList.EMPTY; bitField0_ = (bitField0_ & ~0x00002000); + failedRecently_ = 0; + bitField0_ = (bitField0_ & ~0x00004000); + nodeFailed_ = 0; + bitField0_ = (bitField0_ & ~0x00008000); + preempted_ = 0; + bitField0_ = (bitField0_ & ~0x00010000); return this; } @@ -16062,6 +16212,18 @@ public final class Messages { bitField0_ = (bitField0_ & ~0x00002000); } result.containers_ = containers_; + if (((from_bitField0_ & 0x00004000) == 0x00004000)) { + to_bitField0_ |= 0x00002000; + } + result.failedRecently_ = failedRecently_; + if (((from_bitField0_ & 0x00008000) == 0x00008000)) { + to_bitField0_ |= 0x00004000; + } + result.nodeFailed_ = nodeFailed_; + if (((from_bitField0_ & 0x00010000) == 0x00010000)) { + to_bitField0_ |= 0x00008000; + } + result.preempted_ = preempted_; result.bitField0_ = to_bitField0_; onBuilt(); return result; @@ -16131,6 +16293,15 @@ public final class Messages { } onChanged(); } + if (other.hasFailedRecently()) { + setFailedRecently(other.getFailedRecently()); + } + if (other.hasNodeFailed()) { + setNodeFailed(other.getNodeFailed()); + } + if (other.hasPreempted()) { + setPreempted(other.getPreempted()); + } 
this.mergeUnknownFields(other.getUnknownFields()); return this; } @@ -16762,6 +16933,105 @@ public final class Messages { return this; } + // optional int32 failedRecently = 15; + private int failedRecently_ ; + /** + * <code>optional int32 failedRecently = 15;</code> + */ + public boolean hasFailedRecently() { + return ((bitField0_ & 0x00004000) == 0x00004000); + } + /** + * <code>optional int32 failedRecently = 15;</code> + */ + public int getFailedRecently() { + return failedRecently_; + } + /** + * <code>optional int32 failedRecently = 15;</code> + */ + public Builder setFailedRecently(int value) { + bitField0_ |= 0x00004000; + failedRecently_ = value; + onChanged(); + return this; + } + /** + * <code>optional int32 failedRecently = 15;</code> + */ + public Builder clearFailedRecently() { + bitField0_ = (bitField0_ & ~0x00004000); + failedRecently_ = 0; + onChanged(); + return this; + } + + // optional int32 nodeFailed = 16; + private int nodeFailed_ ; + /** + * <code>optional int32 nodeFailed = 16;</code> + */ + public boolean hasNodeFailed() { + return ((bitField0_ & 0x00008000) == 0x00008000); + } + /** + * <code>optional int32 nodeFailed = 16;</code> + */ + public int getNodeFailed() { + return nodeFailed_; + } + /** + * <code>optional int32 nodeFailed = 16;</code> + */ + public Builder setNodeFailed(int value) { + bitField0_ |= 0x00008000; + nodeFailed_ = value; + onChanged(); + return this; + } + /** + * <code>optional int32 nodeFailed = 16;</code> + */ + public Builder clearNodeFailed() { + bitField0_ = (bitField0_ & ~0x00008000); + nodeFailed_ = 0; + onChanged(); + return this; + } + + // optional int32 preempted = 17; + private int preempted_ ; + /** + * <code>optional int32 preempted = 17;</code> + */ + public boolean hasPreempted() { + return ((bitField0_ & 0x00010000) == 0x00010000); + } + /** + * <code>optional int32 preempted = 17;</code> + */ + public int getPreempted() { + return preempted_; + } + /** + * <code>optional int32 preempted = 
17;</code> + */ + public Builder setPreempted(int value) { + bitField0_ |= 0x00010000; + preempted_ = value; + onChanged(); + return this; + } + /** + * <code>optional int32 preempted = 17;</code> + */ + public Builder clearPreempted() { + bitField0_ = (bitField0_ & ~0x00010000); + preempted_ = 0; + onChanged(); + return this; + } + // @@protoc_insertion_point(builder_scope:org.apache.slider.api.ComponentInformationProto) } @@ -28687,46 +28957,48 @@ public final class Messages { " \002(\t\022\023\n\013application\030\003 \002(\t\"`\n#Application" + "LivenessInformationProto\022\034\n\024allRequestsS" + "atisfied\030\001 \001(\010\022\033\n\023requestsOutstanding\030\002 " + - "\001(\005\"\250\002\n\031ComponentInformationProto\022\014\n\004nam", + "\001(\005\"\347\002\n\031ComponentInformationProto\022\014\n\004nam", "e\030\001 \001(\t\022\020\n\010priority\030\002 \001(\005\022\017\n\007desired\030\003 \001" + "(\005\022\016\n\006actual\030\004 \001(\005\022\021\n\treleasing\030\005 \001(\005\022\021\n" + "\trequested\030\006 \001(\005\022\016\n\006failed\030\007 \001(\005\022\017\n\007star" + "ted\030\010 \001(\005\022\023\n\013startFailed\030\t \001(\005\022\021\n\tcomple" + "ted\030\n \001(\005\022\026\n\016totalRequested\030\013 \001(\005\022\026\n\016fai" + "lureMessage\030\014 \001(\t\022\027\n\017placementPolicy\030\r \001" + - "(\005\022\022\n\ncontainers\030\016 \003(\t\"\210\002\n\031ContainerInfo" + - "rmationProto\022\023\n\013containerId\030\001 \001(\t\022\021\n\tcom" + - "ponent\030\002 \001(\t\022\020\n\010released\030\003 \001(\010\022\r\n\005state\030" + - "\004 \001(\005\022\020\n\010exitCode\030\005 \001(\005\022\023\n\013diagnostics\030\006", - " \001(\t\022\022\n\ncreateTime\030\007 \001(\003\022\021\n\tstartTime\030\010 " + - "\001(\003\022\016\n\006output\030\t \003(\t\022\014\n\004host\030\n \001(\t\022\017\n\007hos" + - "tURL\030\013 \001(\t\022\021\n\tplacement\030\014 \001(\t\022\022\n\nappVers" + - "ion\030\r 
\001(\t\"N\n\024PingInformationProto\022\014\n\004tex" + - "t\030\001 \001(\t\022\014\n\004verb\030\002 \001(\t\022\014\n\004body\030\003 \001(\t\022\014\n\004t" + - "ime\030\004 \001(\003\"\026\n\024GetModelRequestProto\"\035\n\033Get" + - "ModelDesiredRequestProto\"$\n\"GetModelDesi" + - "redAppconfRequestProto\"&\n$GetModelDesire" + - "dResourcesRequestProto\"%\n#GetModelResolv" + - "edAppconfRequestProto\"\'\n%GetModelResolve", - "dResourcesRequestProto\"#\n!GetModelLiveRe" + - "sourcesRequestProto\"\037\n\035GetLiveContainers" + - "RequestProto\"u\n\036GetLiveContainersRespons" + - "eProto\022\r\n\005names\030\001 \003(\t\022D\n\ncontainers\030\002 \003(" + - "\01320.org.apache.slider.api.ContainerInfor" + - "mationProto\"3\n\034GetLiveContainerRequestPr" + - "oto\022\023\n\013containerId\030\001 \002(\t\"\037\n\035GetLiveCompo" + - "nentsRequestProto\"u\n\036GetLiveComponentsRe" + - "sponseProto\022\r\n\005names\030\001 \003(\t\022D\n\ncomponents" + - "\030\002 \003(\01320.org.apache.slider.api.Component", - "InformationProto\",\n\034GetLiveComponentRequ" + - "estProto\022\014\n\004name\030\001 \002(\t\"$\n\"GetApplication" + - "LivenessRequestProto\"\023\n\021EmptyPayloadProt" + - "o\" \n\020WrappedJsonProto\022\014\n\004json\030\001 \002(\t\"h\n\037G" + - "etCertificateStoreRequestProto\022\020\n\010hostna" + - "me\030\001 \001(\t\022\023\n\013requesterId\030\002 \002(\t\022\020\n\010passwor" + - "d\030\003 \002(\t\022\014\n\004type\030\004 \002(\t\"1\n GetCertificateS" + - "toreResponseProto\022\r\n\005store\030\001 \002(\014B-\n\033org." 
+ - "apache.slider.api.protoB\010Messages\210\001\001\240\001\001" + "(\005\022\022\n\ncontainers\030\016 \003(\t\022\026\n\016failedRecently" + + "\030\017 \001(\005\022\022\n\nnodeFailed\030\020 \001(\005\022\021\n\tpreempted\030" + + "\021 \001(\005\"\210\002\n\031ContainerInformationProto\022\023\n\013c" + + "ontainerId\030\001 \001(\t\022\021\n\tcomponent\030\002 \001(\t\022\020\n\010r", + "eleased\030\003 \001(\010\022\r\n\005state\030\004 \001(\005\022\020\n\010exitCode" + + "\030\005 \001(\005\022\023\n\013diagnostics\030\006 \001(\t\022\022\n\ncreateTim" + + "e\030\007 \001(\003\022\021\n\tstartTime\030\010 \001(\003\022\016\n\006output\030\t \003" + + "(\t\022\014\n\004host\030\n \001(\t\022\017\n\007hostURL\030\013 \001(\t\022\021\n\tpla" + + "cement\030\014 \001(\t\022\022\n\nappVersion\030\r \001(\t\"N\n\024Ping" + + "InformationProto\022\014\n\004text\030\001 \001(\t\022\014\n\004verb\030\002" + + " \001(\t\022\014\n\004body\030\003 \001(\t\022\014\n\004time\030\004 \001(\003\"\026\n\024GetM" + + "odelRequestProto\"\035\n\033GetModelDesiredReque" + + "stProto\"$\n\"GetModelDesiredAppconfRequest" + + "Proto\"&\n$GetModelDesiredResourcesRequest", + "Proto\"%\n#GetModelResolvedAppconfRequestP" + + "roto\"\'\n%GetModelResolvedResourcesRequest" + + "Proto\"#\n!GetModelLiveResourcesRequestPro" + + "to\"\037\n\035GetLiveContainersRequestProto\"u\n\036G" + + "etLiveContainersResponseProto\022\r\n\005names\030\001" + + " \003(\t\022D\n\ncontainers\030\002 \003(\01320.org.apache.sl" + + "ider.api.ContainerInformationProto\"3\n\034Ge" + + "tLiveContainerRequestProto\022\023\n\013containerI" + + "d\030\001 \002(\t\"\037\n\035GetLiveComponentsRequestProto" + + "\"u\n\036GetLiveComponentsResponseProto\022\r\n\005na", + "mes\030\001 \003(\t\022D\n\ncomponents\030\002 \003(\01320.org.apac" + + "he.slider.api.ComponentInformationProto\"" + + ",\n\034GetLiveComponentRequestProto\022\014\n\004name\030" + + "\001 \002(\t\"$\n\"GetApplicationLivenessRequestPr" + + 
"oto\"\023\n\021EmptyPayloadProto\" \n\020WrappedJsonP" + + "roto\022\014\n\004json\030\001 \002(\t\"h\n\037GetCertificateStor" + + "eRequestProto\022\020\n\010hostname\030\001 \001(\t\022\023\n\013reque" + + "sterId\030\002 \002(\t\022\020\n\010password\030\003 \002(\t\022\014\n\004type\030\004" + + " \002(\t\"1\n GetCertificateStoreResponseProto" + + "\022\r\n\005store\030\001 \002(\014B-\n\033org.apache.slider.api", + ".protoB\010Messages\210\001\001\240\001\001" }; com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner assigner = new com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner() { @@ -28882,7 +29154,7 @@ public final class Messages { internal_static_org_apache_slider_api_ComponentInformationProto_fieldAccessorTable = new com.google.protobuf.GeneratedMessage.FieldAccessorTable( internal_static_org_apache_slider_api_ComponentInformationProto_descriptor, - new java.lang.String[] { "Name", "Priority", "Desired", "Actual", "Releasing", "Requested", "Failed", "Started", "StartFailed", "Completed", "TotalRequested", "FailureMessage", "PlacementPolicy", "Containers", }); + new java.lang.String[] { "Name", "Priority", "Desired", "Actual", "Releasing", "Requested", "Failed", "Started", "StartFailed", "Completed", "TotalRequested", "FailureMessage", "PlacementPolicy", "Containers", "FailedRecently", "NodeFailed", "Preempted", }); internal_static_org_apache_slider_api_ContainerInformationProto_descriptor = getDescriptor().getMessageTypes().get(25); internal_static_org_apache_slider_api_ContainerInformationProto_fieldAccessorTable = new http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/main/java/org/apache/slider/api/proto/RestTypeMarshalling.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/api/proto/RestTypeMarshalling.java b/slider-core/src/main/java/org/apache/slider/api/proto/RestTypeMarshalling.java 
index f005bc3..c408ed2 100644 --- a/slider-core/src/main/java/org/apache/slider/api/proto/RestTypeMarshalling.java +++ b/slider-core/src/main/java/org/apache/slider/api/proto/RestTypeMarshalling.java @@ -78,7 +78,7 @@ public class RestTypeMarshalling { info.started = wire.getStarted(); info.startFailed = wire.getStartFailed(); info.totalRequested = wire.getTotalRequested(); - info.containers = new ArrayList<String>(wire.getContainersList()); + info.containers = new ArrayList<>(wire.getContainersList()); if (wire.hasFailureMessage()) { info.failureMessage = wire.getFailureMessage(); } @@ -95,6 +95,7 @@ public class RestTypeMarshalling { return builder.build(); } + @SuppressWarnings("IOResourceOpenedButNotSafelyClosed") private static byte[] getStoreBytes(SecurityStore securityStore) throws IOException { InputStream is = new FileInputStream(securityStore.getFile()); @@ -105,8 +106,7 @@ public class RestTypeMarshalling { return response.getStore().toByteArray(); } - public static Messages.ComponentInformationProto - marshall(ComponentInformation info) { + public static Messages.ComponentInformationProto marshall(ComponentInformation info) { Messages.ComponentInformationProto.Builder builder = Messages.ComponentInformationProto.newBuilder(); @@ -123,6 +123,9 @@ public class RestTypeMarshalling { builder.setStarted(info.started); builder.setStartFailed(info.startFailed); builder.setTotalRequested(info.totalRequested); + builder.setNodeFailed(info.nodeFailed); + builder.setPreempted(info.preempted); + builder.setFailedRecently(info.failedRecently); if (info.failureMessage != null) { builder.setFailureMessage(info.failureMessage); } http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/main/java/org/apache/slider/api/types/ComponentInformation.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/api/types/ComponentInformation.java 
b/slider-core/src/main/java/org/apache/slider/api/types/ComponentInformation.java index 286c3a1..52f6c08 100644 --- a/slider-core/src/main/java/org/apache/slider/api/types/ComponentInformation.java +++ b/slider-core/src/main/java/org/apache/slider/api/types/ComponentInformation.java @@ -28,6 +28,14 @@ import java.util.Map; /** * Serializable version of component data. + * <p> + * This is sent in REST calls as a JSON object —but is also marshalled into + * a protobuf structure. Look at {@link org.apache.slider.api.proto.RestTypeMarshalling} + * for the specifics there. + * <p> + * This means that if any fields are added here. they must be added to + * <code>src/main/proto/SliderClusterMessages.proto</code> and + * the probuf structures rebuilt. */ @JsonIgnoreProperties(ignoreUnknown = true) @JsonSerialize(include = JsonSerialize.Inclusion.NON_NULL) @@ -40,6 +48,7 @@ public class ComponentInformation { public int placementPolicy; public int requested; public int failed, started, startFailed, completed, totalRequested; + public int nodeFailed, failedRecently, preempted; public String failureMessage; public List<String> containers; @@ -48,7 +57,7 @@ public class ComponentInformation { * @return a map for use in statistics reports */ public Map<String, Integer> buildStatistics() { - Map<String, Integer> stats = new HashMap<String, Integer>(); + Map<String, Integer> stats = new HashMap<>(); stats.put(StatusKeys.STATISTICS_CONTAINERS_ACTIVE_REQUESTS, requested); stats.put(StatusKeys.STATISTICS_CONTAINERS_COMPLETED, completed); stats.put(StatusKeys.STATISTICS_CONTAINERS_DESIRED, desired); @@ -57,6 +66,9 @@ public class ComponentInformation { stats.put(StatusKeys.STATISTICS_CONTAINERS_REQUESTED, totalRequested); stats.put(StatusKeys.STATISTICS_CONTAINERS_STARTED, started); stats.put(StatusKeys.STATISTICS_CONTAINERS_START_FAILED, startFailed); + stats.put(StatusKeys.STATISTICS_CONTAINERS_FAILED_RECENTLY, failedRecently); +
stats.put(StatusKeys.STATISTICS_CONTAINERS_FAILED_NODE, nodeFailed); + stats.put(StatusKeys.STATISTICS_CONTAINERS_PREEMPTED, preempted); return stats; } http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java index 3ab4501..7da627e 100644 --- a/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java +++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java @@ -164,9 +164,7 @@ import org.apache.slider.server.services.yarnregistry.YarnRegistryViewForProvide import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.BufferedReader; import java.io.File; -import java.io.FileReader; import java.io.IOException; import java.net.InetSocketAddress; import java.net.URI; http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java index 5428a34..18eb578 100644 --- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java +++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java @@ -93,6 +93,9 @@ import static org.apache.slider.api.ResourceKeys.YARN_LABEL_EXPRESSION; import static org.apache.slider.api.ResourceKeys.YARN_MEMORY; import static org.apache.slider.api.RoleKeys.ROLE_FAILED_INSTANCES; import static org.apache.slider.api.RoleKeys.ROLE_FAILED_STARTING_INSTANCES; +import static 
org.apache.slider.api.RoleKeys.ROLE_FAILED_RECENTLY_INSTANCES; +import static org.apache.slider.api.RoleKeys.ROLE_NODE_FAILED_INSTANCES; +import static org.apache.slider.api.RoleKeys.ROLE_PREEMPTED_INSTANCES; import static org.apache.slider.api.RoleKeys.ROLE_RELEASING_INSTANCES; import static org.apache.slider.api.RoleKeys.ROLE_REQUESTED_INSTANCES; import static org.apache.slider.api.StateValues.STATE_CREATED; @@ -1430,7 +1433,7 @@ public class AppState { text = "container start failure"; } instance.diagnostics = text; - roleStatus.noteFailed(true, null); + roleStatus.noteFailed(true, text, ContainerOutcome.Failed); getFailedNodes().put(containerId, instance); roleHistory.onNodeManagerContainerStartFailed(instance.container); } @@ -1477,7 +1480,11 @@ public class AppState { public static class NodeCompletionResult { public boolean surplusNode = false; public RoleInstance roleInstance; - public boolean containerFailed; + // did the container fail for *any* reason? + public boolean containerFailed = false; + // detailed outcome on the container failure + public ContainerOutcome outcome = ContainerOutcome.Completed; + public int exitStatus = 0; public boolean unknownNode = false; @@ -1486,7 +1493,9 @@ public class AppState { new StringBuilder("NodeCompletionResult{"); sb.append("surplusNode=").append(surplusNode); sb.append(", roleInstance=").append(roleInstance); + sb.append(", exitStatus=").append(exitStatus); sb.append(", containerFailed=").append(containerFailed); + sb.append(", outcome=").append(outcome); sb.append(", unknownNode=").append(unknownNode); sb.append('}'); return sb.toString(); @@ -1504,6 +1513,8 @@ public class AppState { NodeCompletionResult result = new NodeCompletionResult(); RoleInstance roleInstance; + int exitStatus = status.getExitStatus(); + result.exitStatus = exitStatus; if (containersBeingReleased.containsKey(containerId)) { log.info("Container was queued for release : {}", containerId); Container container = 
containersBeingReleased.remove(containerId); @@ -1516,14 +1527,18 @@ public class AppState { actual, releasing, completedCount); + result.outcome = ContainerOutcome.Completed; roleHistory.onReleaseCompleted(container); } else if (surplusNodes.remove(containerId)) { //its a surplus one being purged result.surplusNode = true; } else { - //a container has failed + // a container has failed or been killed + // use the exit code to determine the outcome result.containerFailed = true; + result.outcome = ContainerOutcome.fromExitStatus(exitStatus); + roleInstance = removeOwnedContainer(containerId); if (roleInstance != null) { //it was active, move it to failed @@ -1544,32 +1559,34 @@ public class AppState { boolean shortLived = isShortLived(roleInstance); String message; Container failedContainer = roleInstance.container; - + //build the failure message if (failedContainer != null) { String completedLogsUrl = getLogsURLForContainer(failedContainer); - message = String.format("Failure %s on host %s: %s", + message = String.format("Failure %s on host %s (%d): %s", roleInstance.getContainerId().toString(), failedContainer.getNodeId().getHost(), + exitStatus, completedLogsUrl); } else { - message = String.format("Failure %s", containerId); + message = String.format("Failure %s (%d)", containerId, exitStatus); } - int failed = roleStatus.noteFailed(shortLived, message); + roleStatus.noteFailed(shortLived, message, result.outcome); + long failed = roleStatus.getFailed(); log.info("Current count of failed role[{}] {} = {}", roleId, rolename, failed); if (failedContainer != null) { - roleHistory.onFailedContainer(failedContainer, shortLived); + roleHistory.onFailedContainer(failedContainer, shortLived, result.outcome); } - + } catch (YarnRuntimeException e1) { log.error("Failed container of unknown role {}", roleId); } } else { //this isn't a known container. 
- + log.error("Notified of completed container {} that is not in the list" + - " of active or failed containers", containerId); + " of active or failed containers", containerId); completionOfUnknownContainerEvent.incrementAndGet(); result.unknownNode = true; } @@ -1587,7 +1604,7 @@ public class AppState { RoleInstance node = getLiveNodes().remove(id); if (node != null) { node.state = STATE_DESTROYED; - node.exitCode = status.getExitStatus(); + node.exitCode = exitStatus; node.diagnostics = status.getDiagnostics(); getCompletedNodes().put(id, node); result.roleInstance = node; @@ -1595,7 +1612,6 @@ public class AppState { // not in the list log.warn("Received notification of completion of unknown node {}", id); completionOfNodeNotInLiveListEvent.incrementAndGet(); - } // and the active node list if present @@ -1709,6 +1725,9 @@ public class AppState { cd.setRoleOpt(rolename, ROLE_RELEASING_INSTANCES, role.getReleasing()); cd.setRoleOpt(rolename, ROLE_FAILED_INSTANCES, role.getFailed()); cd.setRoleOpt(rolename, ROLE_FAILED_STARTING_INSTANCES, role.getStartFailed()); + cd.setRoleOpt(rolename, ROLE_FAILED_RECENTLY_INSTANCES, role.getFailedRecently()); + cd.setRoleOpt(rolename, ROLE_NODE_FAILED_INSTANCES, role.getNodeFailed()); + cd.setRoleOpt(rolename, ROLE_PREEMPTED_INSTANCES, role.getPreempted()); Map<String, Integer> stats = role.buildStatistics(); cd.statistics.put(rolename, stats); } @@ -1798,14 +1817,14 @@ public class AppState { } /** - * Check the failure threshold for a role + * Check the "recent" failure threshold for a role * @param role role to examine * @throws TriggerClusterTeardownException if the role * has failed too many times */ private void checkFailureThreshold(RoleStatus role) throws TriggerClusterTeardownException { - int failures = role.getFailed(); + long failures = role.getFailedRecently(); int threshold = getFailureThresholdForRole(role); log.debug("Failure count of component: {}: {}, threshold={}", role.getName(), failures, threshold); @@ 
-1814,7 +1833,7 @@ public class AppState { throw new TriggerClusterTeardownException( SliderExitCodes.EXIT_DEPLOYMENT_FAILED, FinalApplicationStatus.FAILED, ErrorStrings.E_UNSTABLE_CLUSTER + - " - failed with component %s failing %d times (%d in startup);" + + " - failed with component %s failed 'recently' %d times (%d in startup);" + " threshold is %d - last failure: %s", role.getName(), role.getFailed(), @@ -1853,11 +1872,11 @@ public class AppState { } /** - * Reset the failure counts of all roles + * Reset the "recent" failure counts of all roles */ public void resetFailureCounts() { for (RoleStatus roleStatus : getRoleStatusMap().values()) { - int failed = roleStatus.resetFailed(); + long failed = roleStatus.resetFailedRecently(); log.info("Resetting failure count of {}; was {}", roleStatus.getName(), failed); http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/main/java/org/apache/slider/server/appmaster/state/ContainerOutcome.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/ContainerOutcome.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/ContainerOutcome.java new file mode 100644 index 0000000..59ab30b --- /dev/null +++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/ContainerOutcome.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.slider.server.appmaster.state; + +import org.apache.hadoop.yarn.api.records.ContainerExitStatus; + +/** + * Container outcomes we care about; slightly simplified from + * {@link ContainerExitStatus} -and hopefully able to handle + * any new exit codes. + */ +public enum ContainerOutcome { + Completed, + Failed, + Failed_limits_exceeded, + Node_failure, + Preempted; + + /** + * Build a container outcome from an exit status. + * The values in {@link ContainerExitStatus} are used + * here. + * @param exitStatus exit status + * @return an enumeration of the outcome. + */ + public static ContainerOutcome fromExitStatus(int exitStatus) { + switch (exitStatus) { + case ContainerExitStatus.ABORTED: + case ContainerExitStatus.KILLED_BY_APPMASTER: + case ContainerExitStatus.KILLED_BY_RESOURCEMANAGER: + case ContainerExitStatus.KILLED_AFTER_APP_COMPLETION: + // could either be a release or node failure. Treat as completion + return Completed; + case ContainerExitStatus.DISKS_FAILED: + return Node_failure; + case ContainerExitStatus.PREEMPTED: + return Preempted; + case ContainerExitStatus.KILLED_EXCEEDED_PMEM: + case ContainerExitStatus.KILLED_EXCEEDED_VMEM: + return Failed_limits_exceeded; + default: + return exitStatus == 0 ? 
Completed : Failed; + } + } +} http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/main/java/org/apache/slider/server/appmaster/state/NodeEntry.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/NodeEntry.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/NodeEntry.java index 0aa2d42..0f46054 100644 --- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/NodeEntry.java +++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/NodeEntry.java @@ -53,6 +53,7 @@ public class NodeEntry { private int starting; private int startFailed; private int failed; + private int preempted; /** * Counter of "failed recently" events. These are all failures * which have happened since it was last reset. @@ -138,8 +139,7 @@ public class NodeEntry { public synchronized boolean onStartFailed() { decStarting(); ++startFailed; - ++failedRecently; - return containerCompleted(false); + return containerCompleted(false, ContainerOutcome.Failed); } /** @@ -183,14 +183,35 @@ public class NodeEntry { * planned: dec our release count * unplanned: dec our live count * @param wasReleased true if this was planned + * @param outcome * @return true if this node is now available */ - public synchronized boolean containerCompleted(boolean wasReleased) { + public synchronized boolean containerCompleted(boolean wasReleased, ContainerOutcome outcome) { if (wasReleased) { releasing = RoleHistoryUtils.decToFloor(releasing); } else { - ++failed; - ++failedRecently; + // for the node, we use the outcome of the faiure to decide + // whether this is potentially "node-related" + switch(outcome) { + // general "any reason" app failure + case Failed: + // specific node failure + case Node_failure: + + ++failed; + ++failedRecently; + break; + + case Preempted: + preempted++; + break; + + // failures which are node-independent 
+ case Failed_limits_exceeded: + case Completed: + default: + break; + } } decLive(); return isAvailable(); @@ -219,6 +240,10 @@ public class NodeEntry { return failedRecently; } + public synchronized int getPreempted() { + return preempted; + } + /** * Reset the failed recently count. */ @@ -236,6 +261,7 @@ public class NodeEntry { sb.append(", releasing=").append(releasing); sb.append(", lastUsed=").append(lastUsed); sb.append(", failedRecently=").append(failedRecently); + sb.append(", preempted=").append(preempted); sb.append(", startFailed=").append(startFailed); sb.append('}'); return sb.toString(); http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java index bb482af..926d440 100644 --- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java +++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java @@ -767,7 +767,7 @@ public class RoleHistory { * @return true if the node was queued */ public boolean onNodeManagerContainerStartFailed(Container container) { - return markContainerFinished(container, false, true); + return markContainerFinished(container, false, true, ContainerOutcome.Failed); } /** @@ -808,7 +808,7 @@ public class RoleHistory { * @return true if the node was queued */ public boolean onReleaseCompleted(Container container) { - return markContainerFinished(container, true, false); + return markContainerFinished(container, true, false, ContainerOutcome.Failed); } /** @@ -817,10 +817,13 @@ public class RoleHistory { * * @param container completed container * @param shortLived was the container short lived? 
+ * @param outcome * @return true if the node is considered available for work */ - public boolean onFailedContainer(Container container, boolean shortLived) { - return markContainerFinished(container, false, shortLived); + public boolean onFailedContainer(Container container, + boolean shortLived, + ContainerOutcome outcome) { + return markContainerFinished(container, false, shortLived, outcome); } /** @@ -831,11 +834,13 @@ public class RoleHistory { * @param container completed container * @param wasReleased was the container released? * @param shortLived was the container short lived? + * @param outcome * @return true if the node was queued */ protected synchronized boolean markContainerFinished(Container container, - boolean wasReleased, - boolean shortLived) { + boolean wasReleased, + boolean shortLived, + ContainerOutcome outcome) { NodeEntry nodeEntry = getOrCreateNodeEntry(container); log.info("Finished container for node {}, released={}, shortlived={}", nodeEntry.rolePriority, wasReleased, shortLived); @@ -844,7 +849,7 @@ public class RoleHistory { nodeEntry.onStartFailed(); available = false; } else { - available = nodeEntry.containerCompleted(wasReleased); + available = nodeEntry.containerCompleted(wasReleased, outcome); maybeQueueNodeForWork(container, nodeEntry, available); } touch(); http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java ---------------------------------------------------------------------- diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java index 899948f..e2974fc 100644 --- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java +++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java @@ -25,6 +25,7 @@ import org.apache.slider.providers.ProviderRole; 
import java.io.Serializable; import java.util.Comparator; import java.util.Map; +import java.util.concurrent.atomic.AtomicLong; /** @@ -43,7 +44,12 @@ public final class RoleStatus implements Cloneable { private final ProviderRole providerRole; private int desired, actual, requested, releasing; - private int failed, started, startFailed, completed, totalRequested; + private int failed, startFailed; + private int started, completed, totalRequested; + private final AtomicLong preempted = new AtomicLong(0); + private final AtomicLong nodeFailed = new AtomicLong(0); + private final AtomicLong failedRecently = new AtomicLong(0); + private final AtomicLong limitsExceeded = new AtomicLong(0); private String failureMessage = ""; @@ -163,15 +169,20 @@ public final class RoleStatus implements Cloneable { return failed; } + public synchronized long getFailedRecently() { + return failedRecently.get(); + } + /** - * Reset the failure counts - * @return the total number of failures up to this point + * Reset the recent failure + * @return the number of failures in the "recent" window */ - public synchronized int resetFailed() { - int total = failed + startFailed; - failed = 0; - startFailed = 0; - return total; + public long resetFailedRecently() { + return failedRecently.getAndSet(0); + } + + public long getLimitsExceeded() { + return limitsExceeded.get(); } /** @@ -179,19 +190,37 @@ public final class RoleStatus implements Cloneable { * be used in any diagnostics if an exception * is later raised. 
* @param startupFailure flag to indicate this was a startup event - * @return the number of failures * @param text text about the failure + * @param outcome outcome of the container */ - public synchronized int noteFailed(boolean startupFailure, String text) { - int current = ++failed; + public synchronized void noteFailed(boolean startupFailure, String text, + ContainerOutcome outcome) { if (text != null) { failureMessage = text; } - //have a look to see if it short lived - if (startupFailure) { - incStartFailed(); + switch (outcome) { + case Preempted: + preempted.incrementAndGet(); + break; + + case Node_failure: + nodeFailed.incrementAndGet(); + failed++; + break; + + case Failed_limits_exceeded: // exceeded memory or CPU; app/configuration related + limitsExceeded.incrementAndGet(); + // fall through + case Failed: // application failure, possibly node related, possibly not + default: // anything else (future-proofing) + failed++; + failedRecently.incrementAndGet(); + //have a look to see if it short lived + if (startupFailure) { + incStartFailed(); + } + break; } - return current; } public synchronized int getStartFailed() { @@ -229,7 +258,14 @@ public final class RoleStatus implements Cloneable { return totalRequested; } - + public long getPreempted() { + return preempted.get(); + } + + public long getNodeFailed() { + return nodeFailed.get(); + } + /** * Get the number of roles we are short of. * nodes released are ignored. 
@@ -267,6 +303,9 @@ public final class RoleStatus implements Cloneable { ", requested=" + requested + ", releasing=" + releasing + ", failed=" + failed + + ", failed recently=" + failedRecently.get() + + ", node failed=" + nodeFailed.get() + + ", pre-empted=" + preempted.get() + ", started=" + started + ", startFailed=" + startFailed + ", completed=" + completed + @@ -313,6 +352,9 @@ public final class RoleStatus implements Cloneable { info.placementPolicy = getPlacementPolicy(); info.failureMessage = failureMessage; info.totalRequested = totalRequested; + info.failedRecently = failedRecently.intValue(); + info.nodeFailed = nodeFailed.intValue(); + info.preempted = preempted.intValue(); return info; } http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/main/proto/SliderClusterMessages.proto ---------------------------------------------------------------------- diff --git a/slider-core/src/main/proto/SliderClusterMessages.proto b/slider-core/src/main/proto/SliderClusterMessages.proto index 5e770d5..caca87b 100644 --- a/slider-core/src/main/proto/SliderClusterMessages.proto +++ b/slider-core/src/main/proto/SliderClusterMessages.proto @@ -246,7 +246,10 @@ message ComponentInformationProto { optional int32 totalRequested = 11; optional string failureMessage = 12 ; optional int32 placementPolicy = 13; - repeated string containers = 14; + repeated string containers = 14; + optional int32 failedRecently = 15; + optional int32 nodeFailed = 16; + optional int32 preempted = 17; } /* http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateContainerFailure.groovy ---------------------------------------------------------------------- diff --git a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateContainerFailure.groovy 
b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateContainerFailure.groovy index 6368a3d..1b79115 100644 --- a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateContainerFailure.groovy +++ b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateContainerFailure.groovy @@ -30,6 +30,7 @@ import org.apache.slider.server.appmaster.model.mock.BaseMockAppStateTest import org.apache.slider.server.appmaster.model.mock.MockRoles import org.apache.slider.server.appmaster.model.mock.MockYarnEngine import org.apache.slider.server.appmaster.state.AppState +import org.apache.slider.server.appmaster.state.ContainerOutcome import org.apache.slider.server.appmaster.state.NodeEntry import org.apache.slider.server.appmaster.state.NodeInstance import org.apache.slider.server.appmaster.state.RoleHistory @@ -87,7 +88,7 @@ class TestMockAppStateContainerFailure extends BaseMockAppStateTest assert appState.isShortLived(instance) AppState.NodeCompletionResult result = appState.onCompletedNode(containerStatus(cid, 1)) assert result.roleInstance != null - assert result.containerFailed + assert result.containerFailed RoleStatus status = role0Status assert status.failed == 1 assert status.startFailed == 1 @@ -141,8 +142,8 @@ class TestMockAppStateContainerFailure extends BaseMockAppStateTest ContainerId cid = ids[0] appState.onNodeManagerContainerStartFailed(cid, new SliderException("oops")) RoleStatus status = role0Status - assert status.failed == 1 - assert status.startFailed == 1 + assert 1 == status.failed + assert 1 == status.startFailed RoleHistory history = appState.roleHistory @@ -182,7 +183,7 @@ class TestMockAppStateContainerFailure extends BaseMockAppStateTest @Test - public void testFailureWindow() throws Throwable { + public void testRoleStatusFailureWindow() throws Throwable { ResetFailureWindow resetter = new ResetFailureWindow(); @@ -209,4 +210,127 @@ 
class TestMockAppStateContainerFailure extends BaseMockAppStateTest } } + @Test + public void testRoleStatusFailed() throws Throwable { + def status = role0Status + // limits exceeded + status.noteFailed(false, "text",ContainerOutcome.Failed) + assert 1 == status.failed + assert 1L == status.failedRecently + assert 0L == status.limitsExceeded + assert 0L == status.preempted + assert 0L == status.nodeFailed + + ResetFailureWindow resetter = new ResetFailureWindow(); + resetter.execute(null, null, appState) + assert 1 == status.failed + assert 0L == status.failedRecently + } + + @Test + public void testRoleStatusFailedLimitsExceeded() throws Throwable { + def status = role0Status + // limits exceeded + status.noteFailed(false, "text",ContainerOutcome.Failed_limits_exceeded) + assert 1 == status.failed + assert 1L == status.failedRecently + assert 1L == status.limitsExceeded + assert 0L == status.preempted + assert 0L == status.nodeFailed + + ResetFailureWindow resetter = new ResetFailureWindow(); + resetter.execute(null, null, appState) + assert 1 == status.failed + assert 0L == status.failedRecently + assert 1L == status.limitsExceeded + } + + + @Test + public void testRoleStatusFailedPrempted() throws Throwable { + def status = role0Status + // limits exceeded + status.noteFailed(false, "text", ContainerOutcome.Preempted) + assert 0 == status.failed + assert 1L == status.preempted + assert 0L == status.failedRecently + assert 0L == status.nodeFailed + + ResetFailureWindow resetter = new ResetFailureWindow(); + resetter.execute(null, null, appState) + assert 1L == status.preempted + } + + + @Test + public void testRoleStatusFailedNode() throws Throwable { + def status = role0Status + // limits exceeded + status.noteFailed(false, "text", ContainerOutcome.Node_failure) + assert 1 == status.failed + assert 0L == status.failedRecently + assert 0L == status.limitsExceeded + assert 0L == status.preempted + assert 1L == status.nodeFailed + } + + @Test + public void 
testNodeEntryCompleted() throws Throwable { + NodeEntry nodeEntry = new NodeEntry(1) + nodeEntry.containerCompleted(true, ContainerOutcome.Completed); + assert 0 == nodeEntry.failed + assert 0 == nodeEntry.failedRecently + assert 0 == nodeEntry.startFailed + assert 0 == nodeEntry.preempted + assert 0 == nodeEntry.active + assert nodeEntry.available + } + + @Test + public void testNodeEntryFailed() throws Throwable { + NodeEntry nodeEntry = new NodeEntry(1) + nodeEntry.containerCompleted(false, ContainerOutcome.Failed); + assert 1 == nodeEntry.failed + assert 1 == nodeEntry.failedRecently + assert 0 == nodeEntry.startFailed + assert 0 == nodeEntry.preempted + assert 0 == nodeEntry.active + assert nodeEntry.available + nodeEntry.resetFailedRecently() + assert 1 == nodeEntry.failed + assert 0 == nodeEntry.failedRecently + } + + @Test + public void testNodeEntryLimitsExceeded() throws Throwable { + NodeEntry nodeEntry = new NodeEntry(1) + nodeEntry.containerCompleted(false, ContainerOutcome.Failed_limits_exceeded); + assert 0 == nodeEntry.failed + assert 0 == nodeEntry.failedRecently + assert 0 == nodeEntry.startFailed + assert 0 == nodeEntry.preempted + } + + @Test + public void testNodeEntryPreempted() throws Throwable { + NodeEntry nodeEntry = new NodeEntry(1) + nodeEntry.containerCompleted(false, ContainerOutcome.Preempted); + assert 0 == nodeEntry.failed + assert 0 == nodeEntry.failedRecently + assert 0 == nodeEntry.startFailed + assert 1 == nodeEntry.preempted + } + + @Test + public void testNodeEntryNodeFailure() throws Throwable { + NodeEntry nodeEntry = new NodeEntry(1) + nodeEntry.containerCompleted(false, ContainerOutcome.Node_failure); + assert 1 == nodeEntry.failed + assert 1 == nodeEntry.failedRecently + assert 0 == nodeEntry.startFailed + assert 0 == nodeEntry.preempted + } + + + } 
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryContainerEvents.groovy ---------------------------------------------------------------------- diff --git a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryContainerEvents.groovy b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryContainerEvents.groovy index c7a38f5..4fdf39c 100644 --- a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryContainerEvents.groovy +++ b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryContainerEvents.groovy @@ -301,7 +301,10 @@ class TestRoleHistoryContainerEvents extends BaseMockAppStateTest { roleHistory.onContainerStarted(container) //later, declare that it failed - roleHistory.onFailedContainer(container, false) + roleHistory.onFailedContainer( + container, + false, + ContainerOutcome.Failed) assert roleEntry.starting == 0 assert roleEntry.available assert roleEntry.active == 0 @@ -330,7 +333,10 @@ class TestRoleHistoryContainerEvents extends BaseMockAppStateTest { NodeInstance allocated = nodemap.get(hostname) NodeEntry roleEntry = allocated.get(role) assert roleEntry.available - roleHistory.onFailedContainer(container, false) + roleHistory.onFailedContainer( + container, + false, + ContainerOutcome.Failed) assert roleEntry.starting == 0 assert roleEntry.failed == 1 assert roleEntry.available http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryFindNodesForNewInstances.groovy ---------------------------------------------------------------------- diff --git a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryFindNodesForNewInstances.groovy 
b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryFindNodesForNewInstances.groovy index b29a0b5..79d23e5 100644 --- a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryFindNodesForNewInstances.groovy +++ b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryFindNodesForNewInstances.groovy @@ -23,7 +23,7 @@ import groovy.util.logging.Slf4j import org.apache.slider.providers.ProviderRole import org.apache.slider.server.appmaster.model.mock.BaseMockAppStateTest import org.apache.slider.server.appmaster.model.mock.MockFactory -import org.apache.slider.server.appmaster.state.NodeEntry +import org.apache.slider.server.appmaster.state.ContainerOutcome import org.apache.slider.server.appmaster.state.NodeInstance import org.apache.slider.server.appmaster.state.RoleHistory import org.apache.slider.server.appmaster.state.RoleStatus @@ -132,10 +132,14 @@ class TestRoleHistoryFindNodesForNewInstances extends BaseMockAppStateTest { // mark age2 and active 0 as busy, expect a null back def entry0 = age2Active0.get(0) - entry0.containerCompleted(false) + entry0.containerCompleted( + false, + ContainerOutcome.Failed) assert entry0.failed assert entry0.failedRecently - entry0.containerCompleted(false) + entry0.containerCompleted( + false, + ContainerOutcome.Failed) assert !age2Active0.exceedsFailureThreshold(roleStat) // set failure to 1 roleStat.providerRole.nodeFailureThreshold = 1 http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/mock/BaseMockAppStateTest.groovy ---------------------------------------------------------------------- diff --git a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/mock/BaseMockAppStateTest.groovy b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/mock/BaseMockAppStateTest.groovy index 3e5494f..29eefa5 
100644 --- a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/mock/BaseMockAppStateTest.groovy +++ b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/mock/BaseMockAppStateTest.groovy @@ -35,6 +35,7 @@ import org.apache.slider.core.main.LauncherExitCodes import org.apache.slider.server.appmaster.operations.AbstractRMOperation import org.apache.slider.server.appmaster.state.AppState import org.apache.slider.server.appmaster.state.ContainerAssignment +import org.apache.slider.server.appmaster.state.ContainerOutcome import org.apache.slider.server.appmaster.state.NodeEntry import org.apache.slider.server.appmaster.state.NodeInstance import org.apache.slider.server.appmaster.state.RoleInstance @@ -344,7 +345,9 @@ abstract class BaseMockAppStateTest extends SliderTestBase implements MockRoles public NodeEntry recordAsFailed(NodeInstance node, int id, int count) { def entry = node.getOrCreate(id) 1.upto(count) { - entry.containerCompleted(false) + entry.containerCompleted( + false, + ContainerOutcome.Failed) } entry } http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/test/groovy/org/apache/slider/server/appmaster/web/view/TestIndexBlock.groovy ---------------------------------------------------------------------- diff --git a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/web/view/TestIndexBlock.groovy b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/web/view/TestIndexBlock.groovy index 9ec230c..c2ea837 100644 --- a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/web/view/TestIndexBlock.groovy +++ b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/web/view/TestIndexBlock.groovy @@ -19,13 +19,13 @@ package org.apache.slider.server.appmaster.web.view import com.google.inject.AbstractModule import com.google.inject.Guice import com.google.inject.Injector -import groovy.transform.CompileStatic import groovy.util.logging.Slf4j import 
org.apache.hadoop.yarn.api.records.Container import org.apache.hadoop.yarn.api.records.Priority import org.apache.hadoop.yarn.webapp.hamlet.Hamlet import org.apache.slider.providers.ProviderService import org.apache.slider.server.appmaster.model.mock.* +import org.apache.slider.server.appmaster.state.ContainerOutcome import org.apache.slider.server.appmaster.state.ProviderAppState import org.apache.slider.server.appmaster.web.WebAppApi import org.apache.slider.server.appmaster.web.WebAppApiImpl @@ -90,8 +90,8 @@ public class TestIndexBlock extends BaseMockAppStateTest { role1.incRequested() role1.incRequested() role1.incRequested() - role0.noteFailed(false, "") - role0.noteFailed(true, "") + role0.noteFailed(false, "", ContainerOutcome.Failed) + role0.noteFailed(true, "", ContainerOutcome.Failed) StringWriter sw = new StringWriter(64); PrintWriter pw = new PrintWriter(sw);
