fixing monitoring
Project: http://git-wip-us.apache.org/repos/asf/airavata/repo Commit: http://git-wip-us.apache.org/repos/asf/airavata/commit/b0fde67c Tree: http://git-wip-us.apache.org/repos/asf/airavata/tree/b0fde67c Diff: http://git-wip-us.apache.org/repos/asf/airavata/diff/b0fde67c Branch: refs/heads/workflow-support Commit: b0fde67c03f2787c5f6d7ec576dc92ab41322c53 Parents: 779b618 Author: lahiru <[email protected]> Authored: Sat Jul 12 12:58:38 2014 -0400 Committer: lahiru <[email protected]> Committed: Sat Jul 12 12:58:38 2014 -0400 ---------------------------------------------------------------------- .../airavata/gfac/core/monitor/MonitorID.java | 10 ++++++++-- .../monitor/impl/pull/qstat/HPCPullMonitor.java | 16 ++++++++-------- .../airavata/gfac/monitor/util/CommonUtils.java | 1 + 3 files changed, 17 insertions(+), 10 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/airavata/blob/b0fde67c/modules/gfac/gfac-core/src/main/java/org/apache/airavata/gfac/core/monitor/MonitorID.java ---------------------------------------------------------------------- diff --git a/modules/gfac/gfac-core/src/main/java/org/apache/airavata/gfac/core/monitor/MonitorID.java b/modules/gfac/gfac-core/src/main/java/org/apache/airavata/gfac/core/monitor/MonitorID.java index 8456e35..8599a02 100644 --- a/modules/gfac/gfac-core/src/main/java/org/apache/airavata/gfac/core/monitor/MonitorID.java +++ b/modules/gfac/gfac-core/src/main/java/org/apache/airavata/gfac/core/monitor/MonitorID.java @@ -177,12 +177,14 @@ public class MonitorID { // because in some machines job state vanishes quicckly when the job is done // during that case job state comes as unknown.so we handle it here. 
if (this.state != null && status.equals(JobState.UNKNOWN)) { - if (getFailedCount() > 2) { + if (getFailedCount() >= 2) { switch (this.state) { case ACTIVE: this.state = JobState.COMPLETE; + logger.info("Failed count is high and old status is ACTIVE so we mark this as COMPLETE"); break; case QUEUED: + logger.info("Failed count is high and old status is QUEUED so we mark this as COMPLETE"); this.state = JobState.COMPLETE; break; } @@ -193,10 +195,14 @@ public class MonitorID { } catch (InterruptedException e) { e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. } + int loginfo = getFailedCount()+1; + logger.info("Increasing the failed count to:"+loginfo); setFailedCount(getFailedCount() + 1); } - } else { + } else { // normal scenario + logger.info("Resetting failed count to 0 because correct state came in"); + setFailedCount(0); this.state = status; } } http://git-wip-us.apache.org/repos/asf/airavata/blob/b0fde67c/modules/gfac/gfac-monitor/src/main/java/org/apache/airavata/gfac/monitor/impl/pull/qstat/HPCPullMonitor.java ---------------------------------------------------------------------- diff --git a/modules/gfac/gfac-monitor/src/main/java/org/apache/airavata/gfac/monitor/impl/pull/qstat/HPCPullMonitor.java b/modules/gfac/gfac-monitor/src/main/java/org/apache/airavata/gfac/monitor/impl/pull/qstat/HPCPullMonitor.java index 193f23f..1238bf6 100644 --- a/modules/gfac/gfac-monitor/src/main/java/org/apache/airavata/gfac/monitor/impl/pull/qstat/HPCPullMonitor.java +++ b/modules/gfac/gfac-monitor/src/main/java/org/apache/airavata/gfac/monitor/impl/pull/qstat/HPCPullMonitor.java @@ -149,20 +149,20 @@ public class HPCPullMonitor extends PullMonitor { if (iHostMonitorData.getHost().getType() instanceof GsisshHostType || iHostMonitorData.getHost().getType() instanceof SSHHostType) { currentHostDescription = iHostMonitorData.getHost(); - String hostName = iHostMonitorData.getHost().getType().getHostAddress(); + String hostName = 
iHostMonitorData.getHost().getType().getHostAddress(); ResourceConnection connection = null; if (connections.containsKey(hostName)) { logger.debug("We already have this connection so not going to create one"); connection = connections.get(hostName); } else { - connection = new ResourceConnection(iHostMonitorData, getAuthenticationInfo()); + connection = new ResourceConnection(iHostMonitorData,getAuthenticationInfo()); connections.put(hostName, connection); } List<MonitorID> monitorID = iHostMonitorData.getMonitorIDs(); Map<String, JobState> jobStatuses = connection.getJobStatuses(monitorID); for (MonitorID iMonitorID : monitorID) { currentMonitorID = iMonitorID; - iMonitorID.setStatus(jobStatuses.get(iMonitorID.getJobID())); + iMonitorID.setStatus(jobStatuses.get(iMonitorID.getJobID())); //IMPORTANT this is not a simple setter we have a logic jobStatus = new JobStatusChangeRequest(iMonitorID); // we have this JobStatus class to handle amqp monitoring @@ -176,13 +176,13 @@ public class HPCPullMonitor extends PullMonitor { try { gfac.invokeOutFlowHandlers(iMonitorID.getJobExecutionContext()); } catch (GFacException e) { - publisher.publish(new TaskStatusChangeRequest(new TaskIdentity(iMonitorID.getExperimentID(), iMonitorID.getWorkflowNodeID(), - iMonitorID.getTaskID()), TaskState.FAILED)); - publisher.publish(new ExperimentStatusChangeRequest(new ExperimentIdentity(iMonitorID.getExperimentID()), - ExperimentState.FAILED)); + publisher.publish(new TaskStatusChangeRequest(new TaskIdentity(iMonitorID.getExperimentID(), iMonitorID.getWorkflowNodeID(), + iMonitorID.getTaskID()), TaskState.FAILED)); + publisher.publish(new ExperimentStatusChangeRequest(new ExperimentIdentity(iMonitorID.getExperimentID()), + ExperimentState.FAILED)); logger.info(e.getLocalizedMessage(), e); } - } else if (iMonitorID.getFailedCount() > 2 && iMonitorID.getStatus().equals(JobState.UNKNOWN)) { + } else if (iMonitorID.getFailedCount() > 2) { logger.error("Tried to monitor the job with ID " + 
iMonitorID.getJobID() + " But failed 3 times, so skip this Job from Monitor"); iMonitorID.setLastMonitored(new Timestamp((new Date()).getTime())); completedJobs.add(iMonitorID); http://git-wip-us.apache.org/repos/asf/airavata/blob/b0fde67c/modules/gfac/gfac-monitor/src/main/java/org/apache/airavata/gfac/monitor/util/CommonUtils.java ---------------------------------------------------------------------- diff --git a/modules/gfac/gfac-monitor/src/main/java/org/apache/airavata/gfac/monitor/util/CommonUtils.java b/modules/gfac/gfac-monitor/src/main/java/org/apache/airavata/gfac/monitor/util/CommonUtils.java index a9f1520..27b213f 100644 --- a/modules/gfac/gfac-monitor/src/main/java/org/apache/airavata/gfac/monitor/util/CommonUtils.java +++ b/modules/gfac/gfac-monitor/src/main/java/org/apache/airavata/gfac/monitor/util/CommonUtils.java @@ -138,6 +138,7 @@ public class CommonUtils { if(iMonitorID.getJobID().equals(monitorID.getJobID())) { // OK we found the object, we cannot do list.remove(object) states of two objects // could be different, thats why we check the jobID + logger.info("Removing the job:"+ monitorID.getJobID()+" from monitoring last status:" + monitorID.getStatus().toString()); monitorIDs.remove(iMonitorID); if(monitorIDs.size()==0) { hostMonitorData.remove(iHostMonitorID);
