Author: ddas
Date: Tue Jun 17 01:13:01 2008
New Revision: 668558
URL: http://svn.apache.org/viewvc?rev=668558&view=rev
Log:
Merge -r 668553:668554 from trunk onto 0.18 branch. Fixes HADOOP-3531.
Modified:
hadoop/core/branches/branch-0.18/src/contrib/hod/CHANGES.txt
hadoop/core/branches/branch-0.18/src/contrib/hod/hodlib/HodRing/hodRing.py
Modified: hadoop/core/branches/branch-0.18/src/contrib/hod/CHANGES.txt
URL:
http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.18/src/contrib/hod/CHANGES.txt?rev=668558&r1=668557&r2=668558&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.18/src/contrib/hod/CHANGES.txt (original)
+++ hadoop/core/branches/branch-0.18/src/contrib/hod/CHANGES.txt Tue Jun 17
01:13:01 2008
@@ -34,6 +34,9 @@
HADOOP-3523. Fixes auto-deallocation of cluster if job id is not found in
Torque's job list (Hemanth Yamijala via ddas)
+ HADOOP-3531. Fixes a bug related to handling JobTracker failures because of
+ timing issues on slow nodes. (Hemanth Yamijala via ddas)
+
Release 0.17.0 - 2008-05-18
INCOMPATIBLE CHANGES
Modified:
hadoop/core/branches/branch-0.18/src/contrib/hod/hodlib/HodRing/hodRing.py
URL:
http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.18/src/contrib/hod/hodlib/HodRing/hodRing.py?rev=668558&r1=668557&r2=668558&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.18/src/contrib/hod/hodlib/HodRing/hodRing.py
(original)
+++ hadoop/core/branches/branch-0.18/src/contrib/hod/hodlib/HodRing/hodRing.py
Tue Jun 17 01:13:01 2008
@@ -435,31 +435,12 @@
if self.__hadoopThread.exit_code() != 0:
status = False
else:
- code = self.__hadoopThread.exit_code()
- if code != 0 and code != None:
- status = False
+ status = self.getCommandStatus()
self.log.debug("hadoop run status: %s" % status)
if status == False:
- self.log.error('hadoop error: %s' % (
- self.__hadoopThread.exit_status_string()))
- # read the contents of redirected stderr to print information back to
user
- if os.path.exists(self.err):
- f = None
- try:
- f = open(self.err)
- lines = f.readlines()
- # format
- for line in lines:
- self.stdErrContents = "%s%s" % (self.stdErrContents, line)
- finally:
- if f is not None:
- f.close()
- self.log.error('See %s.out and/or %s.err for details. They are ' % \
- (self.name, self.name) + \
- 'located at subdirectories under either ' + \
- 'hodring.work-dirs or hodring.log-destination-uri.')
+ self.handleFailedCommand()
if (status == True) or (not desc.isIgnoreFailures()):
return status
@@ -476,6 +457,33 @@
list.extend(self.workdirs)
list.append(self.confdir)
+ def getCommandStatus(self):
+ status = True
+ ec = self.__hadoopThread.exit_code()
+ if (ec != 0) and (ec != None):
+ status = False
+ return status
+
+ def handleFailedCommand(self):
+ self.log.error('hadoop error: %s' % (
+ self.__hadoopThread.exit_status_string()))
+ # read the contents of redirected stderr to print information back to user
+ if os.path.exists(self.err):
+ f = None
+ try:
+ f = open(self.err)
+ lines = f.readlines()
+ # format
+ for line in lines:
+ self.stdErrContents = "%s%s" % (self.stdErrContents, line)
+ finally:
+ if f is not None:
+ f.close()
+ self.log.error('See %s.out and/or %s.err for details. They are ' % \
+ (self.name, self.name) + \
+ 'located at subdirectories under either ' + \
+ 'hodring.work-dirs or hodring.log-destination-uri.')
+
class HodRing(hodBaseService):
"""The main class for hodring that
polls the commands it runs"""
@@ -636,9 +644,9 @@
# ok.. now command is running. If this HodRing got jobtracker,
# Check if it is ready for accepting jobs, and then only return
- self.__check_jobtracker(desc, id-1)
+ self.__check_jobtracker(desc, id-1, pkgdir)
- def __check_jobtracker(self, desc, id):
+ def __check_jobtracker(self, desc, id, pkgdir):
# Check jobtracker status. Return properly if it is ready to accept jobs.
# Currently Checks for Jetty to come up, the last thing that can be checked
# before JT completes initialisation. To be perfectly reliable, we need
@@ -649,7 +657,8 @@
self.log.debug("Waiting for jobtracker to initialise")
version = desc.getVersion()
self.log.debug("jobtracker version : %s" % version)
- attrs = self.getRunningValues()[id].getFilledInKeyValues()
+ hadoopCmd = self.getRunningValues()[id]
+ attrs = hadoopCmd.getFilledInKeyValues()
attrs = parseEquals(attrs)
jobTrackerAddr = attrs['mapred.job.tracker']
self.log.debug("jobtracker rpc server : %s" % jobTrackerAddr)
@@ -669,6 +678,20 @@
jettyStatus = False
jettyStatusmsg = ""
while sleepTime <= 32:
+ # There is a possibility that the command might fail after a while.
+ # This code will check if the command failed so that a better
+ # error message can be returned to the user.
+ if not hadoopCmd.getCommandStatus():
+ self.log.critical('Hadoop command found to have failed when ' \
+ 'checking for jobtracker status')
+ hadoopCmd.handleFailedCommand()
+ addnInfo = ""
+ if hadoopCmd.stdErrContents is not "":
+ addnInfo = " Information from stderr of the command:\n%s" \
+ % (hadoopCmd.stdErrContents)
+ raise Exception("Could not launch the %s using %s/bin/hadoop.%s" \
+ % (desc.getName(), pkgdir, addnInfo))
+
try:
jettyConn = httplib.HTTPConnection(jettyAddr)
jettyConn.request("HEAD", "/jobtracker.jsp")