Author: acmurthy
Date: Thu Oct 16 17:26:28 2008
New Revision: 705426
URL: http://svn.apache.org/viewvc?rev=705426&view=rev
Log:
Merge -r 705419:705420 from trunk to branch-0.17 to fix HADOOP-3217.
Modified:
hadoop/core/branches/branch-0.17/src/contrib/hod/CHANGES.txt
hadoop/core/branches/branch-0.17/src/contrib/hod/bin/hod
hadoop/core/branches/branch-0.17/src/contrib/hod/hodlib/Hod/hadoop.py
hadoop/core/branches/branch-0.17/src/contrib/hod/hodlib/NodePools/torque.py
Modified: hadoop/core/branches/branch-0.17/src/contrib/hod/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.17/src/contrib/hod/CHANGES.txt?rev=705426&r1=705425&r2=705426&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.17/src/contrib/hod/CHANGES.txt (original)
+++ hadoop/core/branches/branch-0.17/src/contrib/hod/CHANGES.txt Thu Oct 16 17:26:28 2008
@@ -1,6 +1,13 @@
HOD Change Log
+Release 0.17.3 - Unreleased
+
+ BUG FIXES
+
+ HADOOP-3217. Decrease the rate at which the hod queries the resource
+ manager for job status. (Hemanth Yamijala via acmurthy)
+
Release 0.17.0 - 2008-05-18
INCOMPATIBLE CHANGES
Modified: hadoop/core/branches/branch-0.17/src/contrib/hod/bin/hod
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.17/src/contrib/hod/bin/hod?rev=705426&r1=705425&r2=705426&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.17/src/contrib/hod/bin/hod (original)
+++ hadoop/core/branches/branch-0.17/src/contrib/hod/bin/hod Thu Oct 16 17:26:28 2008
@@ -152,7 +152,20 @@
True, None, False, True, 'l'),
('script-wait-time', 'pos_int', 'Specifies the time to wait before running the script. Used with the hod.script option.',
- True, 10, False, True, 'W')),
+ True, 10, False, True, 'W'),
+
+ ('job-status-query-interval', 'pos_int', 'Specifies the time between checking for job status',
+ False, 30, False, True),
+
+ ('job-command-failure-interval', 'pos_int', 'Specifies the time between checking for failed job status or submission commands',
+ False, 10, False, True),
+
+ ('job-status-query-failure-retries', 'pos_int', 'Specifies the number of times job status failure queries are retried',
+ False, 3, False, True),
+
+ ('job-submission-failure-retries', 'pos_int', 'Specifies the number of times job submission failure queries are retried',
+ False, 3, False, True)),
+
'resource_manager' : (
('id', 'string', 'Batch scheduler ID: torque|condor.',
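
The four new options above are read from the 'hod' section of the client configuration (hadoop.py accesses them as self.__cfg['hod'][...]). As a rough, non-authoritative illustration only, assuming the usual ini-style hodrc layout, they could be tuned like this; the values shown are simply the defaults from the hunk above (intervals in seconds, retries as counts):

    [hod]
    job-status-query-interval        = 30
    job-command-failure-interval     = 10
    job-status-query-failure-retries = 3
    job-submission-failure-retries   = 3
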
Modified: hadoop/core/branches/branch-0.17/src/contrib/hod/hodlib/Hod/hadoop.py
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.17/src/contrib/hod/hodlib/Hod/hadoop.py?rev=705426&r1=705425&r2=705426&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.17/src/contrib/hod/hodlib/Hod/hadoop.py (original)
+++ hadoop/core/branches/branch-0.17/src/contrib/hod/hodlib/Hod/hadoop.py Thu Oct 16 17:26:28 2008
@@ -182,22 +182,26 @@
return serviceData
def __check_job_status(self):
- initWaitCount = 20
- count = 0
+ failureCount = 0
status = False
state = 'Q'
- while state == 'Q':
+ while (state=='Q') or (state==False):
if hodInterrupt.isSet():
raise HodInterruptException()
state = self.__nodePool.getJobState()
- if (state==False) or (state!='Q'):
+ self.__log.debug('job state %s' % state)
+ if state == False:
+ failureCount += 1
+ if (failureCount >= self.__cfg['hod']['job-status-query-failure-retries']):
+ self.__log.debug('Number of retries reached max limit while querying job status')
+ break
+ time.sleep(self.__cfg['hod']['job-command-failure-interval'])
+ elif state!='Q':
break
- count = count + 1
- if count < initWaitCount:
- time.sleep(0.5)
else:
- time.sleep(10)
+ self.__log.debug('querying for job status after job-status-query-interval')
+ time.sleep(self.__cfg['hod']['job-status-query-interval'])
if state and state != 'C':
status = True
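
In the __check_job_status hunk above, the loop now polls getJobState() every hod.job-status-query-interval seconds while the job is still queued ('Q'), and treats a False return as a failed query, sleeping hod.job-command-failure-interval and giving up after hod.job-status-query-failure-retries failed queries. A hypothetical standalone sketch of that logic (poll_job_state and get_state are illustrative names, not part of the patch):

    import time

    def poll_job_state(get_state, query_interval=30, failure_interval=10, max_failures=3):
        # get_state() is assumed to return a Torque state character
        # ('Q', 'R', 'C', ...) or False when the qstat query itself failed.
        failures = 0
        state = 'Q'
        while state == 'Q' or state is False:
            state = get_state()
            if state is False:
                failures += 1                 # query failed; retry a few times
                if failures >= max_failures:  # hod.job-status-query-failure-retries
                    break
                time.sleep(failure_interval)  # hod.job-command-failure-interval
            elif state != 'Q':
                break                         # job left the queue; stop polling
            else:
                time.sleep(query_interval)    # hod.job-status-query-interval
        # as in __check_job_status: healthy if we got a state and it is not 'C'
        return bool(state) and state != 'C'
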
@@ -237,7 +241,7 @@
time.sleep(1)
count = count + 1
# check to see if the job exited by any chance in that time:
- if (count % 10 == 0):
+ if (count % self.__cfg['hod']['job-status-query-interval'] == 0):
if not self.__check_job_status():
break
@@ -256,9 +260,9 @@
serviceAddress = xmlrpcClient.getServiceAddr(serviceName)
if serviceAddress:
if serviceAddress == 'not found':
- time.sleep(.5)
+ time.sleep(1)
# check to see if the job exited by any chance in that time:
- if (i % 10 == 0):
+ if ((i+1) % self.__cfg['hod']['job-status-query-interval'] == 0):
if not self.__check_job_status():
break
else:
@@ -420,6 +424,7 @@
def allocate(self, clusterDir, min, max=None):
status = 0
+ failureCount = 0
self.__svcrgyClient = self.__get_svcrgy_client()
self.__log.debug("allocate %s %s %s" % (clusterDir, min, max))
@@ -432,7 +437,25 @@
walltime = None
if self.__cfg['hod'].has_key('walltime'):
walltime = self.__cfg['hod']['walltime']
+
self.jobId, exitCode = self.__nodePool.submitNodeSet(nodeSet, walltime)
+ # if the job submission returned an error other than no resources
+ # retry a couple of times
+ while (self.jobId is False) and (exitCode != 188):
+ if hodInterrupt.isSet():
+ raise HodInterruptException()
+
+ failureCount += 1
+ if (failureCount >= self.__cfg['hod']['job-status-query-failure-retries']):
+ self.__log.debug("failed submitting job more than the retries. exiting")
+ break
+ else:
+ # wait a bit before retrying
+ time.sleep(self.__cfg['hod']['job-command-failure-interval'])
+ if hodInterrupt.isSet():
+ raise HodInterruptException()
+ self.jobId, exitCode = self.__nodePool.submitNodeSet(nodeSet, walltime)
+
if self.jobId:
try:
jobStatus = self.__check_job_status()
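
The allocate() hunk above likewise retries a failed submitNodeSet() call, except when Torque returns exit code 188, which the surrounding code treats as a resource-allocation failure not worth retrying; it sleeps hod.job-command-failure-interval between attempts and stops after the configured retry count (the hunk reads hod.job-status-query-failure-retries for this limit). A hypothetical standalone sketch, with submit_with_retries and submit as illustrative names:

    import time

    def submit_with_retries(submit, max_retries=3, retry_interval=10):
        # submit() is assumed to return (job_id, exit_code), with job_id False
        # on failure; exit code 188 (resource allocation) is not retried.
        job_id, exit_code = submit()
        failures = 0
        while job_id is False and exit_code != 188:
            failures += 1
            if failures >= max_retries:       # configured retry limit
                break
            time.sleep(retry_interval)        # hod.job-command-failure-interval
            job_id, exit_code = submit()
        return job_id, exit_code
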
@@ -558,12 +581,12 @@
if exitCode == 188:
self.__log.critical("Request execeeded maximum resource allocation.")
else:
- self.__log.critical("Insufficient resources available.")
+ self.__log.critical("Job submission failed with exit code %s" %
exitCode)
status = 4
- else:
+ else:
self.__log.critical("Scheduler failure, allocation failed.\n\n")
status = 4
-
+
return status
def __isRingMasterAlive(self, rmAddr):
Modified: hadoop/core/branches/branch-0.17/src/contrib/hod/hodlib/NodePools/torque.py
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.17/src/contrib/hod/hodlib/NodePools/torque.py?rev=705426&r1=705425&r2=705426&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.17/src/contrib/hod/hodlib/NodePools/torque.py (original)
+++ hadoop/core/branches/branch-0.17/src/contrib/hod/hodlib/NodePools/torque.py Thu Oct 16 17:26:28 2008
@@ -265,23 +265,12 @@
return id
def getJobState(self):
- #torque error code when credentials fail, a temporary condition sometimes.
- credFailureErrorCode = 171
- credFailureRetries = 10
- i = 0
jobState = False
- while i < credFailureRetries:
- qstatInfo, exitCode = self.__torque.qstat(self.getServiceId())
- if exitCode == 0:
- jobState = qstatInfo['job_state']
- break
- else:
- if exitCode == credFailureErrorCode:
- time.sleep(1)
- i = i+1
- else:
- break
+ qstatInfo, exitCode = self.__torque.qstat(self.getServiceId())
+ if exitCode == 0:
+ jobState = qstatInfo['job_state']
+
return jobState
def deleteJob(self, jobId):