Modified: hadoop/core/trunk/src/contrib/hod/CHANGES.txt URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hod/CHANGES.txt?rev=694702&r1=694701&r2=694702&view=diff ============================================================================== --- hadoop/core/trunk/src/contrib/hod/CHANGES.txt (original) +++ hadoop/core/trunk/src/contrib/hod/CHANGES.txt Fri Sep 12 07:57:52 2008 @@ -22,6 +22,11 @@ HADOOP-4060. Modified HOD to rotate log files on the client side. (Vinod Kumar Vavilapalli via yhemanth) + IMPROVEMENTS + + HADOOP-4145. Add an accounting plugin (script) for HOD. + (Hemanth Yamijala via nigel) + BUG FIXES HADOOP-4161. Fixed bug in HOD cleanup that had the potential to
Modified: hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hadoop.py URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hadoop.py?rev=694702&r1=694701&r2=694702&view=diff ============================================================================== --- hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hadoop.py (original) +++ hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hadoop.py Fri Sep 12 07:57:52 2008 @@ -451,8 +451,43 @@ raise Exception("Invalid state: Node pool is not initialized to delete the given job.") return ret + def is_valid_account(self): + """Verify if the account being used to submit the job is a valid account. + This code looks for a file <install-dir>/bin/verify-account. + If the file is present, it executes the file, passing as argument + the account name. It returns the exit code and output from the + script on non-zero exit code.""" + + accountValidationScript = os.path.abspath('./verify-account') + if not os.path.exists(accountValidationScript): + return (0, None) + + account = self.__nodePool.getAccountString() + exitCode = 0 + errMsg = None + try: + accountValidationCmd = simpleCommand('Account Validation Command',\ + '%s %s' % (accountValidationScript, + account)) + accountValidationCmd.start() + accountValidationCmd.wait() + accountValidationCmd.join() + exitCode = accountValidationCmd.exit_code() + self.__log.debug('account validation script is run %d' \ + % exitCode) + errMsg = None + if exitCode is not 0: + errMsg = accountValidationCmd.output() + except Exception, e: + exitCode = 0 + self.__log.warn('Error executing account script: %s ' \ + 'Accounting is disabled.' 
\ + % get_exception_error_string()) + self.__log.debug(get_exception_string()) + return (exitCode, errMsg) + def allocate(self, clusterDir, min, max=None): - status = 0 + status = 0 self.__svcrgyClient = self.__get_svcrgy_client() self.__log.debug("allocate %s %s %s" % (clusterDir, min, max)) Modified: hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hod.py URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hod.py?rev=694702&r1=694701&r2=694702&view=diff ============================================================================== --- hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hod.py (original) +++ hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hod.py Fri Sep 12 07:57:52 2008 @@ -252,7 +252,6 @@ self.__cfg['ringmaster']['max-master-failures'] = \ min(maxFailures, maxFailedNodes) - def _op_allocate(self, args): operation = "allocate" argLength = len(args) @@ -313,6 +312,21 @@ return self.__setup_cluster_logger(clusterDir) + + (status, message) = self.__cluster.is_valid_account() + if status is not 0: + if message: + for line in message: + self.__log.critical("verify-account output: %s" % line) + self.__log.critical("Cluster cannot be allocated because account verification failed. " \ + + "verify-account returned exit code: %s." 
% status) + self.__opCode = 4 + return + else: + self.__log.debug("verify-account returned zero exit code.") + if message: + self.__log.debug("verify-account output: %s" % message) + if re.match('\d+-\d+', nodes): (min, max) = nodes.split("-") min = int(min) Modified: hadoop/core/trunk/src/contrib/hod/hodlib/Hod/nodePool.py URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hod/hodlib/Hod/nodePool.py?rev=694702&r1=694701&r2=694702&view=diff ============================================================================== --- hadoop/core/trunk/src/contrib/hod/hodlib/Hod/nodePool.py (original) +++ hadoop/core/trunk/src/contrib/hod/hodlib/Hod/nodePool.py Fri Sep 12 07:57:52 2008 @@ -116,6 +116,10 @@ """Update information about the workers started by this NodePool.""" raise NotImplementedError + def getAccountString(self): + """Return the account string for this job""" + raise NotImplementedError + def getNextNodeSetId(self): id = self.nextNodeSetId self.nextNodeSetId += 1 Modified: hadoop/core/trunk/src/contrib/hod/hodlib/NodePools/torque.py URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hod/hodlib/NodePools/torque.py?rev=694702&r1=694701&r2=694702&view=diff ============================================================================== --- hadoop/core/trunk/src/contrib/hod/hodlib/NodePools/torque.py (original) +++ hadoop/core/trunk/src/contrib/hod/hodlib/NodePools/torque.py Fri Sep 12 07:57:52 2008 @@ -51,6 +51,12 @@ self.__torque = torqueInterface( self._cfg['resource_manager']['batch-home'], environ, self._log) + def getAccountString(self): + account = '' + if self._cfg['resource_manager'].has_key('pbs-account'): + account = self._cfg['resource_manager']['pbs-account'] + return account + def __gen_submit_params(self, nodeSet, walltime = None, qosLevel = None, account = None): argList = [] Modified: hadoop/core/trunk/src/docs/src/documentation/content/xdocs/hod_admin_guide.xml URL: 
http://svn.apache.org/viewvc/hadoop/core/trunk/src/docs/src/documentation/content/xdocs/hod_admin_guide.xml?rev=694702&r1=694701&r2=694702&view=diff ============================================================================== --- hadoop/core/trunk/src/docs/src/documentation/content/xdocs/hod_admin_guide.xml (original) +++ hadoop/core/trunk/src/docs/src/documentation/content/xdocs/hod_admin_guide.xml Fri Sep 12 07:57:52 2008 @@ -351,6 +351,37 @@ it is better not to run this inside a tight loop without sleeping.</p> </section> </section> + + <section> + <title>verify-account - Script to verify an account under which + jobs are submitted</title> + <p>Production systems use accounting packages to charge users for using + shared compute resources. HOD supports a parameter + <em>resource_manager.pbs-account</em> to allow users to identify the + account under which they would like to submit jobs. It may be necessary + to verify that this account is a valid one configured in an accounting + system. The <em>hod-install-dir/bin/verify-account</em> script + provides a mechanism to plug in a custom script that can do this + verification.</p> + + <section> + <title>Integrating the verify-account script with HOD</title> + <p>HOD runs the <em>verify-account</em> script passing in the + <em>resource_manager.pbs-account</em> value as argument to the script, + before allocating a cluster. Sites can write a script that verifies this + account against their accounting systems. Returning a non-zero exit + code from this script will cause HOD to fail allocation. Also, in + case of an error, HOD will print the output of the script to the user. 
+ Any descriptive error message can be passed to the user from the + script in this manner.</p> + <p>The default script that comes with the HOD installation does not + do any validation, and returns a zero exit code.</p> + <p>If the verify-account script is not found, then HOD will treat + verification as disabled, and continue allocation as is.</p> + </section> + </section> + </section> + </body> </document> Modified: hadoop/core/trunk/src/docs/src/documentation/content/xdocs/hod_user_guide.xml URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/docs/src/documentation/content/xdocs/hod_user_guide.xml?rev=694702&r1=694701&r2=694702&view=diff ============================================================================== --- hadoop/core/trunk/src/docs/src/documentation/content/xdocs/hod_user_guide.xml (original) +++ hadoop/core/trunk/src/docs/src/documentation/content/xdocs/hod_user_guide.xml Fri Sep 12 07:57:52 2008 @@ -412,7 +412,8 @@ <td> 5 </td> <td> Job execution failure </td> <td> 1. Torque Job was deleted from outside. Execute the Torque <code>qstat</code> command to see if you have any jobs in the <code>R</code> (Running) state. If none exist, try re-executing HOD. <br /> - 2. Torque problems such as the server momentarily going down, or becoming unresponsive. Contact system administrator. </td> + 2. Torque problems such as the server momentarily going down, or becoming unresponsive. Contact system administrator. <br/> + 3. The system administrator might have configured account verification, and an invalid account is specified. Contact system administrator.</td> </tr> <tr> <td> 6 </td>
