SAMZA-368 - AM UI should show failed container information
Project: http://git-wip-us.apache.org/repos/asf/samza/repo
Commit: http://git-wip-us.apache.org/repos/asf/samza/commit/f7f237e9
Tree: http://git-wip-us.apache.org/repos/asf/samza/tree/f7f237e9
Diff: http://git-wip-us.apache.org/repos/asf/samza/diff/f7f237e9
Branch: refs/heads/samza-sql
Commit: f7f237e939f0dabe0cd5130768096083a2b4a543
Parents: 91de984
Author: Aleksandar Bircakovic <[email protected]>
Authored: Mon Oct 19 13:51:57 2015 -0700
Committer: Navina <[email protected]>
Committed: Mon Oct 19 13:51:57 2015 -0700
----------------------------------------------------------------------
 .../org/apache/samza/job/yarn/SamzaAppState.java    |  6 ++++++
 .../org/apache/samza/job/yarn/SamzaTaskManager.java |  1 +
 .../resources/scalate/WEB-INF/views/index.scaml     | 16 ++++++++++++++++
 3 files changed, 23 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/samza/blob/f7f237e9/samza-yarn/src/main/java/org/apache/samza/job/yarn/SamzaAppState.java
----------------------------------------------------------------------
diff --git a/samza-yarn/src/main/java/org/apache/samza/job/yarn/SamzaAppState.java b/samza-yarn/src/main/java/org/apache/samza/job/yarn/SamzaAppState.java
index d5be36e..3df927e 100644
--- a/samza-yarn/src/main/java/org/apache/samza/job/yarn/SamzaAppState.java
+++ b/samza-yarn/src/main/java/org/apache/samza/job/yarn/SamzaAppState.java
@@ -23,6 +23,7 @@ import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
 import org.apache.hadoop.yarn.api.records.ContainerId;
 import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
 import org.apache.samza.coordinator.JobCoordinator;
+import org.apache.hadoop.yarn.api.records.ContainerStatus;
 import java.net.URL;
 import java.util.HashSet;
@@ -119,6 +120,11 @@ public class SamzaAppState {
   public AtomicInteger releasedContainers = new AtomicInteger(0);
   /**
+   * ContainerStatus of failed containers.
+   */
+  public ConcurrentMap<String, ContainerStatus> failedContainersStatus = new ConcurrentHashMap<String, ContainerStatus>();
+
+  /**
    * Number of containers configured for the job
    */
   public int containerCount = 0;
http://git-wip-us.apache.org/repos/asf/samza/blob/f7f237e9/samza-yarn/src/main/java/org/apache/samza/job/yarn/SamzaTaskManager.java
----------------------------------------------------------------------
diff --git a/samza-yarn/src/main/java/org/apache/samza/job/yarn/SamzaTaskManager.java b/samza-yarn/src/main/java/org/apache/samza/job/yarn/SamzaTaskManager.java
index d17ffe0..a3562a1 100644
--- a/samza-yarn/src/main/java/org/apache/samza/job/yarn/SamzaTaskManager.java
+++ b/samza-yarn/src/main/java/org/apache/samza/job/yarn/SamzaTaskManager.java
@@ -204,6 +204,7 @@ class SamzaTaskManager implements YarnAppMasterListener {
         log.info("Container " + containerIdStr + " failed with exit code " + exitStatus + " - " + containerStatus.getDiagnostics());
         state.failedContainers.incrementAndGet();
+        state.failedContainersStatus.put(containerIdStr, containerStatus);
         state.jobHealthy.set(false);
         if(containerId != -1) {
http://git-wip-us.apache.org/repos/asf/samza/blob/f7f237e9/samza-yarn/src/main/resources/scalate/WEB-INF/views/index.scaml
----------------------------------------------------------------------
diff --git a/samza-yarn/src/main/resources/scalate/WEB-INF/views/index.scaml b/samza-yarn/src/main/resources/scalate/WEB-INF/views/index.scaml
index 2d16fe0..93660c7 100644
--- a/samza-yarn/src/main/resources/scalate/WEB-INF/views/index.scaml
+++ b/samza-yarn/src/main/resources/scalate/WEB-INF/views/index.scaml
@@ -139,6 +139,22 @@
                   Ordinary: #{state.jobCoordinator.jobModel.getContainerToHostValue(containerId, org.apache.samza.coordinator.stream.messages.SetContainerHostMapping.JMX_URL_KEY)}
                   Tunneling: #{state.jobCoordinator.jobModel.getContainerToHostValue(containerId, org.apache.samza.coordinator.stream.messages.SetContainerHostMapping.JMX_TUNNELING_URL_KEY)}
+        %h2 Failed Containers
+        %table.table.table-striped.table-bordered.tablesorter#containers-table
+          %thead
+            %tr
+              %th Container
+              %th Exit code
+              %th Message
+          %tbody
+            - for((containerId, containerStatus) <- state.failedContainersStatus)
+              %tr
+                %td
+                  #{containerId}
+                %td
+                  Exit code: #{containerStatus.getExitStatus}
+                %td
+                  %div.value= containerStatus.getDiagnostics
       %div.tab-pane#task-groups
         %h2 Task Groups
