Repository: mesos Updated Branches: refs/heads/master 8d4b1d504 -> 8ca2934ef
Removed unused frameworks code. Review: https://reviews.apache.org/r/33090 Project: http://git-wip-us.apache.org/repos/asf/mesos/repo Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/923d735c Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/923d735c Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/923d735c Branch: refs/heads/master Commit: 923d735cb892f24a05d42b89bbe278aaa9fcb7b9 Parents: 8d4b1d5 Author: Jiang Yan Xu <[email protected]> Authored: Fri Apr 10 12:13:48 2015 -0700 Committer: Jiang Yan Xu <[email protected]> Committed: Tue Jul 7 10:49:52 2015 -0700 ---------------------------------------------------------------------- docs/using-the-mesos-submit-tool.md | 9 - frameworks/deploy_jar/README.txt | 14 - frameworks/deploy_jar/daemon_executor.py | 33 --- frameworks/deploy_jar/daemon_executor.sh | 11 - frameworks/deploy_jar/daemon_framework | 11 - frameworks/deploy_jar/daemon_scheduler.py | 72 ----- frameworks/deploy_jar/haproxy.config.template | 8 - frameworks/deploy_jar/hw.jar | Bin 763 -> 0 bytes frameworks/haproxy+apache/README.txt | 14 - frameworks/haproxy+apache/haproxy+apache | 11 - frameworks/haproxy+apache/haproxy+apache.py | 217 --------------- .../haproxy+apache/haproxy.config.template | 8 - frameworks/haproxy+apache/startapache.py | 47 ---- frameworks/haproxy+apache/startapache.sh | 11 - frameworks/mesos-submit/executor | 12 - frameworks/mesos-submit/executor.py | 122 --------- frameworks/mesos-submit/mesos-submit | 12 - frameworks/mesos-submit/mesos_submit.py | 102 ------- frameworks/torque/README.txt | 57 ---- frameworks/torque/hpl-24node.qsub | 6 - frameworks/torque/hpl-48node.qsub | 6 - frameworks/torque/hpl-8node-small.qsub | 6 - frameworks/torque/hpl-8node.qsub | 6 - frameworks/torque/mesos-hpl/24node/HPL.dat | 31 --- frameworks/torque/mesos-hpl/48node/HPL.dat | 31 --- frameworks/torque/mesos-hpl/8node-small/HPL.dat | 31 --- frameworks/torque/mesos-hpl/8node/HPL.dat | 31 --- .../torque/mesos-hpl/Make.Linux_PII_CBLAS | 183 ------------- frameworks/torque/mesos-hpl/README | 24 -- frameworks/torque/mpi_example.c | 22 -- frameworks/torque/start_pbs_mom.py | 89 ------- frameworks/torque/start_pbs_mom.sh | 15 -- .../torque/test_date_sleep_date_1node.qsub | 17 -- .../torque/test_date_sleep_date_2node.qsub | 16 -- .../torque/test_date_sleep_date_3node.qsub | 17 -- .../test_date_sleep_date_5node_10sec.qsub | 17 -- .../test_date_sleep_date_5node_60sec.qsub | 17 -- frameworks/torque/torquelib.py | 105 -------- frameworks/torque/torquesched.py | 266 ------------------- frameworks/torque/torquesched.sh | 15 -- 40 files changed, 1722 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mesos/blob/923d735c/docs/using-the-mesos-submit-tool.md ---------------------------------------------------------------------- diff --git a/docs/using-the-mesos-submit-tool.md b/docs/using-the-mesos-submit-tool.md deleted file mode 100644 index 2ba4acc..0000000 --- a/docs/using-the-mesos-submit-tool.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -layout: documentation ---- - -# Using the Mesos Submit Tool - -Sometimes you just want to run a command or launch a binary on all (or a subset) of the nodes in your cluster. `mesos-submit` lets you do just that! - -Mesos-submit is a little framework that lets you run a binary in the Mesos cluster without having to keep a scheduler running on the machine you submitted it from. You call mesos-submit <binary> and the script will launch a framework with a single task. This task then takes over as the scheduler for this framework (using the scheduler failover feature), and the mesos-submit process (which was the initial scheduler) can safely exit. The task then goes on to run the command. This is useful for people who want to submit their schedulers to the cluster for example. \ No newline at end of file http://git-wip-us.apache.org/repos/asf/mesos/blob/923d735c/frameworks/deploy_jar/README.txt ---------------------------------------------------------------------- diff --git a/frameworks/deploy_jar/README.txt b/frameworks/deploy_jar/README.txt deleted file mode 100644 index 37ccc17..0000000 --- a/frameworks/deploy_jar/README.txt +++ /dev/null @@ -1,14 +0,0 @@ -haproxy load balancer + Apache web server Mesos framework Readme ----------------------------------------------------------------- - -First you will need to be able to run apache on the slave nodes in your cluster. - -In ubuntu, you can run 'sudo apt-get install apache2' - -Then 'sudo /etc/init.d/apache2 restart' - - -You need to have haproxy installed, currently it is assumed (in haproxy+apache.py) to be in /root/haproxy-1.3.20/haproxy. - -installation instructions are here: http://www.lastengine.com/99/installing-haproxy-load-balancing-for-http-and-https/ - http://git-wip-us.apache.org/repos/asf/mesos/blob/923d735c/frameworks/deploy_jar/daemon_executor.py ---------------------------------------------------------------------- diff --git a/frameworks/deploy_jar/daemon_executor.py b/frameworks/deploy_jar/daemon_executor.py deleted file mode 100755 index 4720a6e..0000000 --- a/frameworks/deploy_jar/daemon_executor.py +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env python -import mesos -import sys -import time -import os - -from subprocess import * - -class MyExecutor(mesos.Executor): - def __init__(self): - mesos.Executor.__init__(self) - - def init(self, driver, arg): - print "in daemon executor" - - def launchTask(self, driver, task): - print "in launchTask" - self.tid = task.taskId - print "task id is " + str(task.taskId) + ", task.args is " + task.arg - self.args = task.arg.split("\t") - print "running: " + "java -cp " + self.args[0] + " " + self.args[1] + " " + self.args[2] - print Popen("/usr/lib/jvm/java-6-sun/bin/java -cp " + self.args[0] + " " + self.args[1] + " " + self.args[2], shell=True, stdout=PIPE).stdout.readline() - update = mesos.TaskStatus(task.taskId, mesos.TASK_FINISHED, "") - driver.sendStatusUpdate(update) - - - def error(self, code, message): - print "Error: %s" % message - -if __name__ == "__main__": - print "starting daemon framework executor" - executor = MyExecutor() - mesos.MesosExecutorDriver(executor).run() http://git-wip-us.apache.org/repos/asf/mesos/blob/923d735c/frameworks/deploy_jar/daemon_executor.sh ---------------------------------------------------------------------- diff --git a/frameworks/deploy_jar/daemon_executor.sh b/frameworks/deploy_jar/daemon_executor.sh deleted file mode 100755 index ac54d73..0000000 --- a/frameworks/deploy_jar/daemon_executor.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -PYTHON=python - -if [ "`uname`" == "SunOS" ]; then - PYTHON=python2.6 -fi - -export PYTHONPATH=`dirname $0`/../../src/swig/python:$PYTHONPATH - -$PYTHON `dirname $0`/daemon_executor.py $@ http://git-wip-us.apache.org/repos/asf/mesos/blob/923d735c/frameworks/deploy_jar/daemon_framework ---------------------------------------------------------------------- diff --git a/frameworks/deploy_jar/daemon_framework b/frameworks/deploy_jar/daemon_framework deleted file mode 100755 index 2bb9107..0000000 --- a/frameworks/deploy_jar/daemon_framework +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -PYTHON=python - -if [ "`uname`" == "SunOS" ]; then - PYTHON=python2.6 -fi - -export PYTHONPATH=`dirname $0`/../../lib/python:$PYTHONPATH - -$PYTHON ./daemon_scheduler.py $@ http://git-wip-us.apache.org/repos/asf/mesos/blob/923d735c/frameworks/deploy_jar/daemon_scheduler.py ---------------------------------------------------------------------- diff --git a/frameworks/deploy_jar/daemon_scheduler.py b/frameworks/deploy_jar/daemon_scheduler.py deleted file mode 100755 index a919717..0000000 --- a/frameworks/deploy_jar/daemon_scheduler.py +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/env python - -import mesos -import os -import sys -import time -import httplib -import Queue -import threading - -from optparse import OptionParser -from subprocess import * -from socket import gethostname - - -class MyScheduler(mesos.Scheduler): - def __init__(self, num_tasks, jar_url, jar_class, jar_args): - mesos.Scheduler.__init__(self) - self.lock = threading.RLock() - self.task_count = 0 - self.num_tasks = num_tasks - self.jar_url = jar_url - self.jar_class = jar_class - self.jar_args = jar_args - - def getExecutorInfo(self, driver): - execPath = os.path.join(os.getcwd(), "daemon_executor.sh") - return mesos.ExecutorInfo(execPath, "") - - def registered(self, driver, fid): - print "Mesos daemon scheduler registered as framework #%s" % fid - - def getFrameworkName(self, driver): - return "Python Deploy Jar" - - def resourceOffer(self, driver, oid, slave_offers): - print "Got resource offer %s" % oid - self.lock.acquire() - tasks = [] - for offer in slave_offers: - if int(self.task_count) > int(self.num_tasks): - print "Rejecting slot because we've launched enough tasks" - elif int(offer.params['mem']) < 1024 or int(offer.params['cpus']) < 1: - print "Rejecting slot because it is too small" - print "It had mem=" + offer.params['mem']\ - + " and cpus=" + offer.params['cpus'] - else: - print "accepting slot of size 1 cpu and 1024MB to launch a jar" - params = {"cpus": "1", "mem": "1024"} - task_args = self.jar_url + "\t" + self.jar_class + "\t" + self.jar_args - print "task args are: " + task_args - td = mesos.TaskDescription( - self.task_count, offer.slaveId, "task %d" % self.task_count, params, task_args) - tasks.append(td) - print "incrementing self.task_count from " + str(self.task_count) - print "self.num_tasks is " + str(self.num_tasks) - self.task_count += 1 - driver.replyToOffer(oid, tasks, {"timeout": "1"}) - self.lock.release() - -if __name__ == "__main__": - args = sys.argv[1:len(sys.argv)] - - print "sched = MyScheduler(" + args[1] + ", "+ args[2]+ ", "+ args[3]+ ", "+ " ".join(args[4:len(args)])+")" - - sched = MyScheduler(args[1], args[2], args[3], " ".join(args[4:len(args)])) - - print "Connecting to mesos master %s" % args[0] - - mesos.MesosSchedulerDriver(sched, sys.argv[1]).run() - - print "Finished!" http://git-wip-us.apache.org/repos/asf/mesos/blob/923d735c/frameworks/deploy_jar/haproxy.config.template ---------------------------------------------------------------------- diff --git a/frameworks/deploy_jar/haproxy.config.template b/frameworks/deploy_jar/haproxy.config.template deleted file mode 100644 index 4d537e9..0000000 --- a/frameworks/deploy_jar/haproxy.config.template +++ /dev/null @@ -1,8 +0,0 @@ -listen webfarm ec2-174-129-94-218.compute-1.amazonaws.com:80 - timeout server 7500 - timeout client 7500 - timeout connect 7500 - mode http - balance roundrobin - option httpchk HEAD /index.html HTTP/1.0 - stats uri /stats http://git-wip-us.apache.org/repos/asf/mesos/blob/923d735c/frameworks/deploy_jar/hw.jar ---------------------------------------------------------------------- diff --git a/frameworks/deploy_jar/hw.jar b/frameworks/deploy_jar/hw.jar deleted file mode 100644 index aa13459..0000000 Binary files a/frameworks/deploy_jar/hw.jar and /dev/null differ http://git-wip-us.apache.org/repos/asf/mesos/blob/923d735c/frameworks/haproxy+apache/README.txt ---------------------------------------------------------------------- diff --git a/frameworks/haproxy+apache/README.txt b/frameworks/haproxy+apache/README.txt deleted file mode 100644 index 37ccc17..0000000 --- a/frameworks/haproxy+apache/README.txt +++ /dev/null @@ -1,14 +0,0 @@ -haproxy load balancer + Apache web server Mesos framework Readme ----------------------------------------------------------------- - -First you will need to be able to run apache on the slave nodes in your cluster. - -In ubuntu, you can run 'sudo apt-get install apache2' - -Then 'sudo /etc/init.d/apache2 restart' - - -You need to have haproxy installed, currently it is assumed (in haproxy+apache.py) to be in /root/haproxy-1.3.20/haproxy. - -installation instructions are here: http://www.lastengine.com/99/installing-haproxy-load-balancing-for-http-and-https/ - http://git-wip-us.apache.org/repos/asf/mesos/blob/923d735c/frameworks/haproxy+apache/haproxy+apache ---------------------------------------------------------------------- diff --git a/frameworks/haproxy+apache/haproxy+apache b/frameworks/haproxy+apache/haproxy+apache deleted file mode 100755 index 05b9759..0000000 --- a/frameworks/haproxy+apache/haproxy+apache +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -PYTHON=python - -if [ "`uname`" == "SunOS" ]; then - PYTHON=python2.6 -fi - -export PYTHONPATH=`dirname $0`/../../lib/python:$PYTHONPATH - -$PYTHON ./haproxy+apache.py $@ http://git-wip-us.apache.org/repos/asf/mesos/blob/923d735c/frameworks/haproxy+apache/haproxy+apache.py ---------------------------------------------------------------------- diff --git a/frameworks/haproxy+apache/haproxy+apache.py b/frameworks/haproxy+apache/haproxy+apache.py deleted file mode 100755 index 39d4e7b..0000000 --- a/frameworks/haproxy+apache/haproxy+apache.py +++ /dev/null @@ -1,217 +0,0 @@ -#!/usr/bin/env python - -import mesos -import os -import sys -import time -import httplib -import Queue -import threading - -from optparse import OptionParser -from subprocess import * -from socket import gethostname - -MIN_SERVERS = 1 -START_THRESHOLD = 25 -KILL_THRESHOLD = 5 -HAPROXY_EXE = "/root/haproxy-1.3.20/haproxy" #EC2 -#HAPROXY_EXE = "/scratch/haproxy-1.3.25/haproxy" #Rcluster - -class ApacheWebFWScheduler(mesos.Scheduler): - def __init__(self): - mesos.Scheduler.__init__(self) - self.lock = threading.RLock() - self.id = 0 - self.haproxy = -1 - self.reconfigs = 0 - self.servers = {} - self.overloaded = False - - def registered(self, driver, fid): - print "Mesos haproxy+apache scheduler registered as framework #%s" % fid - self.driver = driver - - def getFrameworkName(self, driver): - return "haproxy+apache" - - def getExecutorInfo(self, driver): - execPath = os.path.join(os.getcwd(), "startapache.sh") - return mesos.ExecutorInfo(execPath, "") - - def reconfigure(self): - print "reconfiguring haproxy" - name = "/tmp/haproxy.conf.%d" % self.reconfigs - with open(name, 'w') as config: - with open('haproxy.config.template', 'r') as template: - for line in template: - config.write(line) - for id, host in self.servers.iteritems(): - config.write(" ") - config.write("server %d %s:80 check\n" % (id, host)) - - cmd = [] - if self.haproxy != -1: - cmd = [HAPROXY_EXE, - "-f", - name, - "-sf", - str(self.haproxy.pid)] - else: - cmd = [HAPROXY_EXE, - "-f", - name] - - self.haproxy = Popen(cmd, shell = False) - self.reconfigs += 1 - - def resourceOffer(self, driver, oid, slave_offers): - print "Got resource offer %s with %s slots." % (oid, len(slave_offers)) - self.lock.acquire() - tasks = [] - for offer in slave_offers: - if offer.host in self.servers.values(): - print "Rejecting slot on host " + offer.host + " because we've launched a server on that machine already." - #print "self.servers currently looks like: " + str(self.servers) - elif not self.overloaded and len(self.servers) > 0: - print "Rejecting slot because we've launched enough tasks." - elif int(offer.params['mem']) < 1024: - print "Rejecting offer because it doesn't contain enough memory (it has " + offer.params['mem'] + " and we need 1024mb." - elif int(offer.params['cpus']) < 1: - print "Rejecting offer because it doesn't contain enough CPUs." - else: - print "Offer is for " + offer.params['cpus'] + " CPUS and " + offer.params["mem"] + " MB on host " + offer.host - params = {"cpus": "1", "mem": "1024"} - td = mesos.TaskDescription(self.id, offer.slaveId, "server %s" % self.id, params, "") - print "Accepting task, id=" + str(self.id) + ", params: " + params['cpus'] + " CPUS, and " + params['mem'] + " MB, on node " + offer.host - tasks.append(td) - self.servers[self.id] = offer.host - self.id += 1 - self.overloaded = False - driver.replyToOffer(oid, tasks, {"timeout":"1"}) - #driver.replyToOffer(oid, tasks, {}) - print "done with resourceOffer()" - self.lock.release() - - def statusUpdate(self, driver, status): - print "received status update from taskID " + str(status.taskId) + ", with state: " + str(status.state) - reconfigured = False - self.lock.acquire() - if status.taskId in self.servers.keys(): - if status.state == mesos.TASK_STARTING: - print "Task " + str(status.taskId) + " reported that it is STARTING." - del self.servers[status.taskId] - self.reconfigure() - reconfigured = True - if status.state == mesos.TASK_RUNNING: - print "Task " + str(status.taskId) + " reported that it is RUNNING, reconfiguring haproxy to include it in webfarm now." - self.reconfigure() - reconfigured = True - if status.state == mesos.TASK_FINISHED: - del self.servers[status.taskId] - print "Task " + str(status.taskId) + " reported FINISHED (state " + status.state + ")." - self.reconfigure() - reconfigured = True - if status.state == mesos.TASK_FAILED: - print "Task " + str(status.taskId) + " reported that it FAILED!" - del self.servers[status.taskId] - self.reconfigure() - reconfigured = True - if status.state == mesos.TASK_KILLED: - print "Task " + str(status.taskId) + " reported that it was KILLED!" - del self.servers[status.taskId] - self.reconfigure() - reconfigured = True - if status.state == mesos.TASK_LOST: - print "Task " + str(status.taskId) + " reported was LOST!" - del self.servers[status.taskId] - self.reconfigure() - reconfigured = True - self.lock.release() - if reconfigured: - None#driver.reviveOffers() - print "done in statusupdate" - - def scaleUp(self): - print "SCALING UP" - self.lock.acquire() - self.overloaded = True - self.lock.release() - - def scaleDown(self, id): - print "SCALING DOWN (removing server %d)" % id - kill = False - self.lock.acquire() - if self.overloaded: - self.overloaded = False - else: - kill = True - self.lock.release() - if kill: - self.driver.killTask(id) - - -def monitor(sched): - print "in MONITOR()" - while True: - time.sleep(1) - print "done sleeping" - try: - conn = httplib.HTTPConnection("r3.millennium.berkeley.edu:9001") - print "done creating connection" - conn.request("GET", "/stats;csv") - print "done with request()" - res = conn.getresponse() - print "testing response status" - if (res.status != 200): - print "response != 200" - continue - else: - print "got some stats" - data = res.read() - lines = data.split('\n')[2:-2] - - data = data.split('\n') - data = data[1].split(',') - - if int(data[33]) >= START_THRESHOLD: - sched.scaleUp() - elif int(data[4]) <= KILL_THRESHOLD: - minload, minid = (sys.maxint, 0) - for l in lines: - cols = l.split(',') - id = int(cols[1]) - load = int(cols[4]) - if load < minload: - minload = load - minid = id - - if len(lines) > MIN_SERVERS and minload == 0: - sched.scaleDown(minid) - - conn.close() - except Exception, e: - print "exception in monitor()" - continue - print "done in MONITOR()" - -if __name__ == "__main__": - parser = OptionParser(usage = "Usage: %prog mesos_master") - - (options,args) = parser.parse_args() - if len(args) < 1: - print >> sys.stderr, "At least one parameter required." - print >> sys.stderr, "Use --help to show usage." - exit(2) - - print "sched = ApacheWebFWScheduler()" - sched = ApacheWebFWScheduler() - - print "Connecting to mesos master %s" % args[0] - driver = mesos.MesosSchedulerDriver(sched, sys.argv[1]) - - threading.Thread(target = monitor, args=[sched]).start() - - driver.run() - - print "Scheduler finished!" http://git-wip-us.apache.org/repos/asf/mesos/blob/923d735c/frameworks/haproxy+apache/haproxy.config.template ---------------------------------------------------------------------- diff --git a/frameworks/haproxy+apache/haproxy.config.template b/frameworks/haproxy+apache/haproxy.config.template deleted file mode 100644 index ca82ba9..0000000 --- a/frameworks/haproxy+apache/haproxy.config.template +++ /dev/null @@ -1,8 +0,0 @@ -listen webfarm localhost:80 - timeout server 7500 - timeout client 7500 - timeout connect 7500 - mode http - balance roundrobin - option httpchk HEAD /index.html HTTP/1.0 - stats uri /stats http://git-wip-us.apache.org/repos/asf/mesos/blob/923d735c/frameworks/haproxy+apache/startapache.py ---------------------------------------------------------------------- diff --git a/frameworks/haproxy+apache/startapache.py b/frameworks/haproxy+apache/startapache.py deleted file mode 100755 index dce0ddd..0000000 --- a/frameworks/haproxy+apache/startapache.py +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env python -import mesos -import sys -import time -import os -import atexit - -from subprocess import * - -APACHECTL = "/usr/apache2/2.2/bin/apachectl" #EC2 -#APACHECTL = "sudo /etc/init.d/apache2" #R Cluster - -def cleanup(): - try: - # TODO(*): This will kill ALL apaches...oops. - os.waitpid(Popen(APACHECTL + " stop", shell=True).pid, 0) - except Exception, e: - print e - None - -class MyExecutor(mesos.Executor): - def __init__(self): - mesos.Executor.__init__(self) - self.tid = -1 - - def launchTask(self, driver, task): - self.tid = task.taskId - Popen(APACHECTL + " start", shell=True) - - def killTask(self, driver, tid): - if (tid != self.tid): - print "Expecting different task id ... killing anyway!" - cleanup() - update = mesos.TaskStatus(tid, mesos.TASK_FINISHED, "") - driver.sendStatusUpdate(update) - - def shutdown(driver, self): - cleanup() - - def error(self, code, message): - print "Error: %s" % message - -if __name__ == "__main__": - print "Starting haproxy+apache executor" - atexit.register(cleanup) - executor = MyExecutor() - mesos.MesosExecutorDriver(executor).run() http://git-wip-us.apache.org/repos/asf/mesos/blob/923d735c/frameworks/haproxy+apache/startapache.sh ---------------------------------------------------------------------- diff --git a/frameworks/haproxy+apache/startapache.sh b/frameworks/haproxy+apache/startapache.sh deleted file mode 100755 index 6eb2456..0000000 --- a/frameworks/haproxy+apache/startapache.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -PYTHON=python - -if [ "`uname`" == "SunOS" ]; then - PYTHON=python2.6 -fi - -export PYTHONPATH=`dirname $0`/../../lib/python:$PYTHONPATH - -$PYTHON `dirname $0`/startapache.py $@ http://git-wip-us.apache.org/repos/asf/mesos/blob/923d735c/frameworks/mesos-submit/executor ---------------------------------------------------------------------- diff --git a/frameworks/mesos-submit/executor b/frameworks/mesos-submit/executor deleted file mode 100755 index dd8b98c..0000000 --- a/frameworks/mesos-submit/executor +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -if [ "x$PYTHON" == "x" ]; then - PYTHON=python -fi - -if [ "x$MESOS_HOME" == "x" ]; then - MESOS_HOME="$(dirname $0)/../.." -fi - -export PYTHONPATH=$MESOS_HOME/lib/python -exec $PYTHON "$(dirname $0)/executor.py" $@ http://git-wip-us.apache.org/repos/asf/mesos/blob/923d735c/frameworks/mesos-submit/executor.py ---------------------------------------------------------------------- diff --git a/frameworks/mesos-submit/executor.py b/frameworks/mesos-submit/executor.py deleted file mode 100755 index 38813b9..0000000 --- a/frameworks/mesos-submit/executor.py +++ /dev/null @@ -1,122 +0,0 @@ -#!/usr/bin/env python -import os -import pickle -import subprocess -import sys -import threading -import time -from threading import Thread - -import mesos - - -# This function is called in its own thread to actually run the user's command. -# When it finishes, it shuts down the scheduler driver (disconnecting the -# framework) and exits the program. -def run_command(command, driver): - print "Running " + command - equal_signs = "=" * 40 - print equal_signs - try: - code = os.system(command) - print equal_signs - print "Command completed with code %d" % code - except OSError,e: - print equal_signs - print "os.system call failed, see stderr for details" - print >>sys.stderr, "Error executing command" - print >>sys.stderr, e - driver.stop() - sys.exit(2) - driver.stop() - sys.exit(0) - - -# A secondary scheduler registered for our framework with Mesos so that -# our first scheduler (on the machine that ran mesos-submit) can disconnect. -# This scheduler launches no further tasks but allows our one task to continue -# running in the cluster -- the task essentially becomes its own scheduler. -class SecondaryScheduler(mesos.Scheduler): - def __init__(self, framework_name, command): - mesos.Scheduler.__init__(self) - self.framework_name = framework_name - self.command = command - self.command_started = False - - def getFrameworkName(self, driver): - return self.framework_name - - def getExecutorInfo(self, driver): - executorPath = os.path.join(os.getcwd(), "executor") - return mesos.ExecutorInfo(executorPath, "") - - def resourceOffer(self, driver, oid, offers): - # Reject the offer with an infinite timeout, since we are here - # only to serve as a second scheduler to keep the framework running - driver.replyToOffer(oid, [], {"timeout": "-1"}) - - def registered(self, driver, fid): - # Registered is called both the first time whe connect to a master - # and subsequent times if we reconnect or the master fails over. - # We only want to launch the command the first time. - if not self.command_started: - print "Registered with Mesos; starting command" - Thread(target=run_command, args=[self.command, driver]).start() - self.command_started = True - else: - print "Re-registered with Mesos (likely due to master failover)" - - def error(self, driver, code, message): - print "Error from Mesos: %s (code %s)" % (message, code) - - -# This function is called in a separate thread to run our secondary scheduler; -# for some reason, things fail if we launch it from the executor's launchTask -# callback (this is likely to be SWIG/Python related). -def run_scheduler(fid, framework_name, master, command): - print "Starting secondary scheduler" - sched = SecondaryScheduler(framework_name, command) - sched_driver = mesos.MesosSchedulerDriver(sched, master, fid) - sched_driver.run() - - -# Executor class for mesos-submit. Expects to be given a single task -# to launch with a framework ID, master URL and command as parameters. -# Once this task is received, the executor registers as a scheduler for the -# framework by creating a SecondaryScheduler object, allowing the mesos-submit -# command on the user's machine to exit, and it starts the user's command -# on this cluster node as a subprocess. -class MyExecutor(mesos.Executor): - def __init__(self): - mesos.Executor.__init__(self) - self.sched = None - - def launchTask(self, driver, task): - if self.sched == None: - print "Received task; going to register as scheduler" - # Recover framework ID, master and command from task arg - fid, framework_name, master, command = pickle.loads(task.arg) - print "Mesos-submit parameters:" - print " framework ID = %s" % fid - print " framework name = %s" % framework_name - print " master = %s" % master - print " command = %s" % command - # Start our secondary scheduler in a different thread (for some reason, - # this fails if we do it from the same thread.. probably due to some - # SWIG Python interaction). - Thread(target=run_scheduler, - args=[fid, framework_name, master, command]).start() - else: - print "Error: received a second task -- this should never happen!" - - def killTask(self, driver, tid): - sys.exit(1) - - def error(self, driver, code, message): - print "Error from Mesos: %s (code %s)" % (message, code) - - -if __name__ == "__main__": - print "Starting mesos-submit executor" - executor = MyExecutor() - mesos.MesosExecutorDriver(executor).run() http://git-wip-us.apache.org/repos/asf/mesos/blob/923d735c/frameworks/mesos-submit/mesos-submit ---------------------------------------------------------------------- diff --git a/frameworks/mesos-submit/mesos-submit b/frameworks/mesos-submit/mesos-submit deleted file mode 100755 index 8cd5a67..0000000 --- a/frameworks/mesos-submit/mesos-submit +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -if [ "x$PYTHON" == "x" ]; then - PYTHON=python -fi - -if [ "x$MESOS_HOME" == "x" ]; then - MESOS_HOME="$(dirname $0)/../.." -fi - -export PYTHONPATH=$MESOS_HOME/lib/python -exec $PYTHON "$(dirname $0)/mesos_submit.py" "$@" http://git-wip-us.apache.org/repos/asf/mesos/blob/923d735c/frameworks/mesos-submit/mesos_submit.py ---------------------------------------------------------------------- diff --git a/frameworks/mesos-submit/mesos_submit.py b/frameworks/mesos-submit/mesos_submit.py deleted file mode 100755 index 94d6232..0000000 --- a/frameworks/mesos-submit/mesos_submit.py +++ /dev/null @@ -1,102 +0,0 @@ -#!/usr/bin/env python -import os -import pickle -import re -import sys -import time -from optparse import OptionParser - -import mesos - -# Default resources to use for the command we execute. -DEFAULT_CPUS = 1 -DEFAULT_MEM = 512 - - -# The scheduler for mesos-submit, running on the machine that the user -# executed mesos-submit on, launches a single task in the cluster with -# the required amounts of CPUs and memory, and then waits for it to become -# the framework's scheduler using the scheduler failover mechanism. -# It then exits mesos-submit successfully, while the task goes on to run -# the user's command. -# -# Note that we pass our framework ID, master URL and command to the executor -# using the task's argument field. -# -# We currently don't recover if our task fails for some reason, but we -# do print its state transitions so the user can notice this. -class SubmitScheduler(mesos.Scheduler): - def __init__(self, options, master, command): - mesos.Scheduler.__init__(self) - if options.name != None: - self.framework_name = options.name - else: - self.framework_name = "mesos-submit " + command - self.cpus = options.cpus - self.mem = options.mem - self.master = master - self.command = command - self.task_launched = False - - def getFrameworkName(self, driver): - return self.framework_name - - def getExecutorInfo(self, driver): - frameworkDir = os.path.abspath(os.path.dirname(sys.argv[0])) - executorPath = os.path.join(frameworkDir, "executor") - return mesos.ExecutorInfo(executorPath, "") - - def registered(self, driver, fid): - print "Registered with Mesos, FID = %s" % fid - self.fid = "" + fid - - def resourceOffer(self, driver, oid, offers): - if self.task_launched: - # Since we already launched our task, we reject the offer - driver.replyToOffer(oid, [], {"timeout": "-1"}) - else: - for offer in offers: - cpus = int(offer.params["cpus"]) - mem = int(offer.params["mem"]) - if cpus >= self.cpus and mem >= self.mem: - print "Accepting slot on slave %s (%s)" % (offer.slaveId, offer.host) - params = {"cpus": "%d" % self.cpus, "mem": "%d" % self.mem} - arg = [self.fid, self.framework_name, self.master, self.command] - task = mesos.TaskDescription(0, offer.slaveId, "task", params, - pickle.dumps(arg)) - driver.replyToOffer(oid, [task], {"timeout": "1"}) - self.task_launched = True - return - - def statusUpdate(self, driver, update): - print "Task %d in state %d" % (update.taskId, update.state) - - def error(self, driver, code, message): - if message == "Framework failover": - # Scheduler failover is currently reported by this error message; - # this is kind of a brittle way to detect it, but it's all we can do now. - print "In-cluster scheduler started; exiting mesos-submit" - else: - print "Error from Mesos: %s (error code: %d)" % (message, code) - driver.stop() - - -if __name__ == "__main__": - parser = OptionParser(usage="Usage: %prog [options] <master_url> <command>") - parser.add_option("-c","--cpus", - help="number of CPUs to request (default: 1)", - dest="cpus", type="int", default=DEFAULT_CPUS) - parser.add_option("-m","--mem", - help="MB of memory to request (default: 512)", - dest="mem", type="int", default=DEFAULT_MEM) - parser.add_option("-n","--name", - help="Framework name", dest="name", type="string") - (options,args) = parser.parse_args() - if len(args) < 2: - parser.error("At least two parameters are required.") - exit(2) - master = args[0] - command = " ".join(args[1:]) - print "Connecting to mesos master %s" % master - sched = SubmitScheduler(options, master, command) - mesos.MesosSchedulerDriver(sched, master).run() http://git-wip-us.apache.org/repos/asf/mesos/blob/923d735c/frameworks/torque/README.txt ---------------------------------------------------------------------- diff --git a/frameworks/torque/README.txt b/frameworks/torque/README.txt deleted file mode 100644 index dd671e2..0000000 --- a/frameworks/torque/README.txt +++ /dev/null @@ -1,57 +0,0 @@ -NOTE: this document is grossly out of date and probably not useful at all! - -Debugging notes: ------------------ -- Be *very* careful about naming in config files, I *think* that FQDNs will work if you use them for both pbs_mom and pbs_server but a node keeps showing up with status "down" when I add it without using the .eecs.berkeley.edu (this is on my own laptop as both slave and server) - - -Mesos TORQUE framework readme --------------------------------------------- -This framework is a wrapper around the Torque cluster resource manager for the cluster which integrates with a cluster scheduler such as pbs_sched, maui, or moab. - -Installing TORQUE: ------------------- - -option #1) install from source - -option #2) sudo apt-get install torque-server, torque-mom (also potentially relevant: torque-dev, torque-client) - - -==Structure Overview of the Framework== ---------------------------------------- - -==FRAMEWORK EXECUTOR== -The mesos executor for this framework will run pbs_mom and tell it to look at the framework scheduler as its host - -can interact with pbs_mom using `momctl` - -==FRAMEWORK SCHEDULER== -The torque FW scheduler is responsible for managing the pbs_server daemon. This includees starting it if necessary, adding and removing nodes using the `pbsmgr` command as it accepts resource offers to dynamically grow (but never beyond its 'safe allocation') or shrink or releases resources. - -For example, the FW scheduler can shrink mesos tasks (i.e. torque compute nodes) or killing them (i.e. remove compute notes) in response to a the scheduler queue becoming more (or entirely) empty in order to free resources for another active framework in the cluster (while maintining some minimum number of resources). It can also relaunch and grow tasks to account for increased queue lengths - -The minimum number of resources torque should hang on to is configurable: -MIN_TORQUE_COMPUTE_NODE = resource vector of resources per compute node min -MIN_TORQUE_CLUSTER_SIZE = # compute nodes to keep around min - -===Torque Scheduler=== -The framework can use whichever torque compatible scheduler that is desired. By default it will use the default torque fifo scheduler (pbs_sched) - -Permissions: ------------- - -****Currently**** -As of right now, to run this framework, the node that the framework scheduler is run on will need to have pbs_server installed. - -The framework scheduler will launch pbs_server for you, I *think* you need to be root to run the pbs_server daemon (I haven't figured out how to do it otherwise). - -****Future Alternative**** -An alternate way that we could structure this framework is to require that pbs_server is running on some server already but that it is running with the intention to be fully managed by the torque mesos framework. I think that the management commands can be set up to work for non root users, which the framework could then run as. Then the mesos torque framework scheduler would take the address of the pbs_server as a parameter and would assume it has permissions to add and remove nodes from the server - - -TODO: ------ -- explore an install a torque UI - -- maybe PBSWeb (http://www.clusterresources.com/pipermail/torqueusers/2004-March/000411.html) or apt-get install torque-gui (which has gui clients) -- figure out permissions better (this page http://www.clusterresources.com/torquedocs21/commands/qrun.shtml mentions "PBS Operation or Manager privilege.") -- might want to add mpi to this framework too (so that people can submit mpi jobs to torque framework) http://git-wip-us.apache.org/repos/asf/mesos/blob/923d735c/frameworks/torque/hpl-24node.qsub ---------------------------------------------------------------------- diff --git a/frameworks/torque/hpl-24node.qsub b/frameworks/torque/hpl-24node.qsub deleted file mode 100644 index 3df1a45..0000000 --- a/frameworks/torque/hpl-24node.qsub +++ /dev/null @@ -1,6 +0,0 @@ -#! /usr/bin/env sh -#PBS -l nodes=24 -#PBS -N 24_node_hpl_job - -cd /nfs/hpl/24node/ -mpiexec -n 24 /nfs/hpl/24node/xhpl http://git-wip-us.apache.org/repos/asf/mesos/blob/923d735c/frameworks/torque/hpl-48node.qsub ---------------------------------------------------------------------- diff --git a/frameworks/torque/hpl-48node.qsub b/frameworks/torque/hpl-48node.qsub deleted file mode 100644 index 105a7bb..0000000 --- a/frameworks/torque/hpl-48node.qsub +++ /dev/null @@ -1,6 +0,0 @@ -#! /usr/bin/env sh -#PBS -l nodes=48 -#PBS -N 48_node_hpl_job - -cd /nfs/hpl/48node/ -mpiexec -n 48 /nfs/hpl/48node/xhpl http://git-wip-us.apache.org/repos/asf/mesos/blob/923d735c/frameworks/torque/hpl-8node-small.qsub ---------------------------------------------------------------------- diff --git a/frameworks/torque/hpl-8node-small.qsub b/frameworks/torque/hpl-8node-small.qsub deleted file mode 100644 index b14dc95..0000000 --- a/frameworks/torque/hpl-8node-small.qsub +++ /dev/null @@ -1,6 +0,0 @@ -#! /usr/bin/env sh -#PBS -l nodes=8 -#PBS -N 8_node_hpl_job - -cd /nfs/hpl/8node/ -mpiexec -n 8 /nfs/hpl/8node/xhpl http://git-wip-us.apache.org/repos/asf/mesos/blob/923d735c/frameworks/torque/hpl-8node.qsub ---------------------------------------------------------------------- diff --git a/frameworks/torque/hpl-8node.qsub b/frameworks/torque/hpl-8node.qsub deleted file mode 100644 index b14dc95..0000000 --- a/frameworks/torque/hpl-8node.qsub +++ /dev/null @@ -1,6 +0,0 @@ -#! /usr/bin/env sh -#PBS -l nodes=8 -#PBS -N 8_node_hpl_job - -cd /nfs/hpl/8node/ -mpiexec -n 8 /nfs/hpl/8node/xhpl http://git-wip-us.apache.org/repos/asf/mesos/blob/923d735c/frameworks/torque/mesos-hpl/24node/HPL.dat ---------------------------------------------------------------------- diff --git a/frameworks/torque/mesos-hpl/24node/HPL.dat b/frameworks/torque/mesos-hpl/24node/HPL.dat deleted file mode 100644 index e9ab3fd..0000000 --- a/frameworks/torque/mesos-hpl/24node/HPL.dat +++ /dev/null @@ -1,31 +0,0 @@ -HPLinpack benchmark input file -Innovative Computing Laboratory, University of Tennessee -HPL.out output file name (if any) -6 device out (6=stdout,7=stderr,file) -2 # of problems sizes (N) -500 500 Ns -2 # of NBs -50 100 NBs -0 PMAP process mapping (0=Row-,1=Column-major) -3 # of process grids (P x Q) -24 12 6 Ps -1 2 4 Qs -16.0 threshold -1 # of panel fact -0 PFACTs (0=left, 1=Crout, 2=Right) -1 # of recursive stopping criterium -4 NBMINs (>= 1) -1 # of panels in recursion -2 NDIVs -1 # of recursive panel fact. -0 RFACTs (0=left, 1=Crout, 2=Right) -1 # of broadcast -0 BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM) -1 # of lookahead depth -0 DEPTHs (>=0) -2 SWAP (0=bin-exch,1=long,2=mix) -64 swapping threshold -0 L1 in (0=transposed,1=no-transposed) form -0 U in (0=transposed,1=no-transposed) form -1 Equilibration (0=no,1=yes) -8 memory alignment in double (> 0) http://git-wip-us.apache.org/repos/asf/mesos/blob/923d735c/frameworks/torque/mesos-hpl/48node/HPL.dat ---------------------------------------------------------------------- diff --git a/frameworks/torque/mesos-hpl/48node/HPL.dat b/frameworks/torque/mesos-hpl/48node/HPL.dat deleted file mode 100644 index d5c3e85..0000000 --- a/frameworks/torque/mesos-hpl/48node/HPL.dat +++ /dev/null @@ -1,31 +0,0 @@ -HPLinpack benchmark input file -Innovative Computing Laboratory, University of Tennessee -HPL.out output file name (if any) -6 device out (6=stdout,7=stderr,file) -1 # of problems sizes (N) -10000 Ns -2 # of NBs -50 100 NBs -0 PMAP process mapping (0=Row-,1=Column-major) -1 # of process grids (P x Q) -6 Ps -8 Qs -16.0 threshold -1 # of panel fact -0 PFACTs (0=left, 1=Crout, 2=Right) -1 # of recursive stopping criterium -4 NBMINs (>= 1) -1 # of panels in recursion -2 NDIVs -1 # of recursive panel fact. -0 RFACTs (0=left, 1=Crout, 2=Right) -1 # of broadcast -0 BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM) -1 # of lookahead depth -0 DEPTHs (>=0) -2 SWAP (0=bin-exch,1=long,2=mix) -64 swapping threshold -0 L1 in (0=transposed,1=no-transposed) form -0 U in (0=transposed,1=no-transposed) form -1 Equilibration (0=no,1=yes) -8 memory alignment in double (> 0) http://git-wip-us.apache.org/repos/asf/mesos/blob/923d735c/frameworks/torque/mesos-hpl/8node-small/HPL.dat ---------------------------------------------------------------------- diff --git a/frameworks/torque/mesos-hpl/8node-small/HPL.dat b/frameworks/torque/mesos-hpl/8node-small/HPL.dat deleted file mode 100644 index 941726c..0000000 --- a/frameworks/torque/mesos-hpl/8node-small/HPL.dat +++ /dev/null @@ -1,31 +0,0 @@ -HPLinpack benchmark input file -Innovative Computing Laboratory, University of Tennessee -HPL.out output file name (if any) -6 device out (6=stdout,7=stderr,file) -1 # of problems sizes (N) -200 Ns -2 # of NBs -10 20 NBs -0 PMAP process mapping (0=Row-,1=Column-major) -2 # of process grids (P x Q) -1 2 Ps -8 4 Qs -16.0 threshold -1 # of panel fact -0 PFACTs (0=left, 1=Crout, 2=Right) -1 # of recursive stopping criterium -4 NBMINs (>= 1) -1 # of panels in recursion -2 NDIVs -1 # of recursive panel fact. -0 RFACTs (0=left, 1=Crout, 2=Right) -1 # of broadcast -0 BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM) -1 # of lookahead depth -0 DEPTHs (>=0) -2 SWAP (0=bin-exch,1=long,2=mix) -64 swapping threshold -0 L1 in (0=transposed,1=no-transposed) form -0 U in (0=transposed,1=no-transposed) form -1 Equilibration (0=no,1=yes) -8 memory alignment in double (> 0) http://git-wip-us.apache.org/repos/asf/mesos/blob/923d735c/frameworks/torque/mesos-hpl/8node/HPL.dat ---------------------------------------------------------------------- diff --git a/frameworks/torque/mesos-hpl/8node/HPL.dat b/frameworks/torque/mesos-hpl/8node/HPL.dat deleted file mode 100644 index 250563c..0000000 --- a/frameworks/torque/mesos-hpl/8node/HPL.dat +++ /dev/null @@ -1,31 +0,0 @@ -HPLinpack benchmark input file -Innovative Computing Laboratory, University of Tennessee -HPL.out output file name (if any) -6 device out (6=stdout,7=stderr,file) -2 # of problems sizes (N) -4000 4000 Ns -2 # of NBs -10 20 NBs -0 PMAP process mapping (0=Row-,1=Column-major) -2 # of process grids (P x Q) -1 2 Ps -8 4 Qs -16.0 threshold -1 # of panel fact -0 PFACTs (0=left, 1=Crout, 2=Right) -1 # of recursive stopping criterium -4 NBMINs (>= 1) -1 # of panels in recursion -2 NDIVs -1 # of recursive panel fact. -0 RFACTs (0=left, 1=Crout, 2=Right) -1 # of broadcast -0 BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM) -1 # of lookahead depth -0 DEPTHs (>=0) -2 SWAP (0=bin-exch,1=long,2=mix) -64 swapping threshold -0 L1 in (0=transposed,1=no-transposed) form -0 U in (0=transposed,1=no-transposed) form -1 Equilibration (0=no,1=yes) -8 memory alignment in double (> 0) http://git-wip-us.apache.org/repos/asf/mesos/blob/923d735c/frameworks/torque/mesos-hpl/Make.Linux_PII_CBLAS ---------------------------------------------------------------------- diff --git a/frameworks/torque/mesos-hpl/Make.Linux_PII_CBLAS b/frameworks/torque/mesos-hpl/Make.Linux_PII_CBLAS deleted file mode 100644 index de646ba..0000000 --- a/frameworks/torque/mesos-hpl/Make.Linux_PII_CBLAS +++ /dev/null @@ -1,183 +0,0 @@ -# -# -- High Performance Computing Linpack Benchmark (HPL) -# HPL - 1.0a - January 20, 2004 -# Antoine P. Petitet -# University of Tennessee, Knoxville -# Innovative Computing Laboratories -# (C) Copyright 2000-2004 All Rights Reserved -# -# -- Copyright notice and Licensing terms: -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# 1. Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions, and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# -# 3. All advertising materials mentioning features or use of this -# software must display the following acknowledgement: -# This product includes software developed at the University of -# Tennessee, Knoxville, Innovative Computing Laboratories. -# -# 4. The name of the University, the name of the Laboratory, or the -# names of its contributors may not be used to endorse or promote -# products derived from this software without specific written -# permission. -# -# -- Disclaimer: -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY -# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# ###################################################################### -# -# ---------------------------------------------------------------------- -# - shell -------------------------------------------------------------- -# ---------------------------------------------------------------------- -# -SHELL = /bin/sh -# -CD = cd -CP = cp -LN_S = ln -s -MKDIR = mkdir -RM = /bin/rm -f -TOUCH = touch -# -# ---------------------------------------------------------------------- -# - Platform identifier ------------------------------------------------ -# ---------------------------------------------------------------------- -# -ARCH = Linux_PII_CBLAS -# -# ---------------------------------------------------------------------- -# - HPL Directory Structure / HPL library ------------------------------ -# ---------------------------------------------------------------------- -# -TOPdir = $(HOME)/hpl -INCdir = $(TOPdir)/include -BINdir = $(TOPdir)/bin/$(ARCH) -LIBdir = $(TOPdir)/lib/$(ARCH) -# -HPLlib = $(LIBdir)/libhpl.a -# -# ---------------------------------------------------------------------- -# - Message Passing library (MPI) -------------------------------------- -# ---------------------------------------------------------------------- -# MPinc tells the C compiler where to find the Message Passing library -# header files, MPlib is defined to be the name of the library to be -# used. The variable MPdir is only used for defining MPinc and MPlib. -# -MPdir = /usr -MPinc = -I$(MPdir)/include/mpich2 -MPlib = $(MPdir)/lib/libmpich.a -# -# ---------------------------------------------------------------------- -# - Linear Algebra library (BLAS or VSIPL) ----------------------------- -# ---------------------------------------------------------------------- -# LAinc tells the C compiler where to find the Linear Algebra library -# header files, LAlib is defined to be the name of the library to be -# used. The variable LAdir is only used for defining LAinc and LAlib. -# -LAdir = /root/CBLAS/lib/LINUX -LAinc = -LAlib = /usr/lib/libcblas.a /usr/lib/libatlas.a -#/CBLAS/lib/LINUX/cblas_LINUX.a -# ---------------------------------------------------------------------- -# - F77 / C interface -------------------------------------------------- -# ---------------------------------------------------------------------- -# You can skip this section if and only if you are not planning to use -# a BLAS library featuring a Fortran 77 interface. Otherwise, it is -# necessary to fill out the F2CDEFS variable with the appropriate -# options. **One and only one** option should be chosen in **each** of -# the 3 following categories: -# -# 1) name space (How C calls a Fortran 77 routine) -# -# -DAdd_ : all lower case and a suffixed underscore (Suns, -# Intel, ...), [default] -# -DNoChange : all lower case (IBM RS6000), -# -DUpCase : all upper case (Cray), -# -DAdd__ : the FORTRAN compiler in use is f2c. -# -# 2) C and Fortran 77 integer mapping -# -# -DF77_INTEGER=int : Fortran 77 INTEGER is a C int, [default] -# -DF77_INTEGER=long : Fortran 77 INTEGER is a C long, -# -DF77_INTEGER=short : Fortran 77 INTEGER is a C short. -# -# 3) Fortran 77 string handling -# -# -DStringSunStyle : The string address is passed at the string loca- -# tion on the stack, and the string length is then -# passed as an F77_INTEGER after all explicit -# stack arguments, [default] -# -DStringStructPtr : The address of a structure is passed by a -# Fortran 77 string, and the structure is of the -# form: struct {char *cp; F77_INTEGER len;}, -# -DStringStructVal : A structure is passed by value for each Fortran -# 77 string, and the structure is of the form: -# struct {char *cp; F77_INTEGER len;}, -# -DStringCrayStyle : Special option for Cray machines, which uses -# Cray fcd (fortran character descriptor) for -# interoperation. -# -F2CDEFS = -# -# ---------------------------------------------------------------------- -# - HPL includes / libraries / specifics ------------------------------- -# ---------------------------------------------------------------------- -# -HPL_INCLUDES = -I$(INCdir) -I$(INCdir)/$(ARCH) $(LAinc) $(MPinc) -HPL_LIBS = $(HPLlib) $(LAlib) $(MPlib) -# -# - Compile time options ----------------------------------------------- -# -# -DHPL_COPY_L force the copy of the panel L before bcast; -# -DHPL_CALL_CBLAS call the cblas interface; -# -DHPL_CALL_VSIPL call the vsip library; -# -DHPL_DETAILED_TIMING enable detailed timers; -# -# By default HPL will: -# *) not copy L before broadcast, -# *) call the BLAS Fortran 77 interface, -# *) not display detailed timing information. -# -HPL_OPTS = -DHPL_CALL_CBLAS -# -# ---------------------------------------------------------------------- -# -HPL_DEFS = $(F2CDEFS) $(HPL_OPTS) $(HPL_INCLUDES) -# -# ---------------------------------------------------------------------- -# - Compilers / linkers - Optimization flags --------------------------- -# ---------------------------------------------------------------------- -# -CC = /usr/bin/gcc -CCNOOPT = $(HPL_DEFS) -CCFLAGS = $(HPL_DEFS) -fomit-frame-pointer -O3 -funroll-loops -lpthread -# -# On some platforms, it is necessary to use the Fortran linker to find -# the Fortran internals used in the BLAS library. -# -LINKER = /usr/bin/gfortran -LINKFLAGS = $(CCFLAGS) -lpthread -# -ARCHIVER = ar -ARFLAGS = r -RANLIB = echo -# -# ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mesos/blob/923d735c/frameworks/torque/mesos-hpl/README ---------------------------------------------------------------------- diff --git a/frameworks/torque/mesos-hpl/README b/frameworks/torque/mesos-hpl/README deleted file mode 100644 index 17b7b3b..0000000 --- a/frameworks/torque/mesos-hpl/README +++ /dev/null @@ -1,24 +0,0 @@ -This installation of HPL was used for the OSDI submission of Mesos (named Mesos at the time). It was compiled for large EC2 instances running Ubuntu (Lucid). - -The various "X-node" directories contain just the configuration file HPL.dat which adjusts the size of the problem (e.g. the matrix) in order to provide a mixed workload for experiments. Jobs using these conf files can be launched with the corresponding qsub scripts in the /root/mesos/frameworks/torque/ directory. - -Running setup-hpl in the frameworks/torque/mesos-hpl directory should take care of the following steps, but we list them now in case you want to do it manually: - -However, first these should be moved into /nfs/hpl so that mesos slave nodes can access them. Then the xhpl executable must be built and then copied into each of these directories. The xhpl executable can be built using the Make.Linux_PII_CBLAS, which was constructed with to work on this version (lucid) of ubuntu. More detailed instructions for building the xhpl executable are below: - -Building for ubuntu (lucid, EC2 ami-id ami-19a14870): -0) Make sure necessary prerequired sofware (libatlas-base-dev, afortran, mpi.h) - is installed by running ~/root/mesos-ec2/setup-hpl -1) Download and untar http://mesos.berkeley.edu/hpl-mesos.tar.gz into /root/hpl, - which is a modified copy of hpl v1.0a - NOTE: this package should contain the Makefile we created to build on - lucid, which also exists in the same dir as this README - Else, you can download the unmodified hpl source from - http://www.netlib.org/benchmark/hpl/hpl.tgz and copy over - Make.Linux_PII_CBLAS -2) in hpl directory, to build hpl binary, run: - make arch=Linux_PII_CBLAS - -Then copy bin/xhpl to the "X-node" directories, and you should be able to launch xhpl jobs of various sizes using qsub commands in root/mesos/frameworks/torque/ - - http://git-wip-us.apache.org/repos/asf/mesos/blob/923d735c/frameworks/torque/mpi_example.c ---------------------------------------------------------------------- diff --git a/frameworks/torque/mpi_example.c b/frameworks/torque/mpi_example.c deleted file mode 100644 index 3f3bcd0..0000000 --- a/frameworks/torque/mpi_example.c +++ /dev/null @@ -1,22 +0,0 @@ -#include "mpi.h" -#include "unistd.h" -#include "stdio.h" - -int main(int argc, char **argv) { - - size_t len=256; - char *hostname = new char[len]; - int size,rank; - - MPI_Init(&argc, &argv); - - MPI_Comm_size(MPI_COMM_WORLD, &size); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - - gethostname(hostname, len); - - printf("Hi, I am %d of %d and my hostname is %s\n", rank, size, hostname); - - MPI_Finalize(); - -} http://git-wip-us.apache.org/repos/asf/mesos/blob/923d735c/frameworks/torque/start_pbs_mom.py ---------------------------------------------------------------------- diff --git a/frameworks/torque/start_pbs_mom.py b/frameworks/torque/start_pbs_mom.py deleted file mode 100755 index ef8ed7c..0000000 --- a/frameworks/torque/start_pbs_mom.py +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env python -import mesos -import sys -import time -import os -import atexit - -from subprocess import * - -PBS_MOM_CONF_FILE = "/var/spool/torque/mom_priv/config" -PBS_SERVER_NAME_FILE = "/var/spool/torque/server_name" - -def cleanup(): - try: - # TODO(*): This will kill ALL mpds...oops. - print "cleanup" - os.waitpid(Popen("momctl -s", shell=True).pid, 0) - except Exception, e: - print e - None - -class MyExecutor(mesos.Executor): - def __init__(self): - mesos.Executor.__init__(self) - - def init(self, driver, arg): - print "in torque executor init" - print "initializing self.pbs_server_fqdn to " + str(arg.data) - self.pbs_server_fqdn = arg.data - - def launchTask(self, driver, task): - print "Running task %d" % task.taskId - - #TODO: if config file exists, check to see that it is correct - # (right now we overwrite it no matter what) - - #print "checking pbs_mom conf file " + PBS_MOM_CONF_FILE + " is it a file? "\ - # + str(os.path.isfile(PBS_MOM_CONF_FILE)) - #if not os.path.isfile(PBS_MOM_CONF_FILE): - # print PBS_MOM_CONF_FILE + " file not found, about to create it" - #else: - # print "about to overwrite file " + PBS_MOM_CONF_FILE + " to update "#\ - # + "pbs_server on this node" - - #print "adding line to conf file: $pbsserver " + self.pbs_server_ip + "\n" - #FILE = open(PBS_MOM_CONF_FILE,'w') - #FILE.write("$pbsserver " + self.pbs_server_ip + "\n") - #FILE.write("$logevent 255 #bitmap of which events to log\n") - #FILE.close() - - #print "overwrote pbs_mom config file, its contents now are:" - #FILE = open(PBS_MOM_CONF_FILE,'r') - #for line in FILE: print line + "\n" - #FILE.close() - - #overwrite $(TORQUECFG)/server_name file with fqdn of pbs_server - #FILE = open(PBS_SERVER_NAME_FILE,'w') - #FILE.write(self.pbs_server_fqdn) - #FILE.close() - - ##try killing pbs_mom in case we changed the config - #rval = Popen("momctl -s",shell=True).wait() - #print "rval of momctl -s command was " + str(rval) - #if rval != 0: - # print "tried to kill pbs_mom, but momctl -s command failed, prob because no mom was running" - #else: - # time.sleep(1) #not sure if necessary, but wait a sec to be sure the mom lock file is deleted - - ##run pbs_mom - print "running pbs_mom on compute node" - Popen("pbs_mom", shell=True) - - #def killTask(self, driver, tid): - #send a message back to the scheduler to tell it this task is dead - #msg = mesos.TaskStatus(tid, mesos.TASK_KILLED, "") - #driver.sendStatusUpdate(msg); - - def shutdown(self, driver): - print "shutdown" - #cleanup() - - def error(self, driver, code, message): - print "Error: %s" % message - -if __name__ == "__main__": - print "Starting pbs_mom executor" - atexit.register(cleanup) - executor = MyExecutor() - mesos.MesosExecutorDriver(executor).run() http://git-wip-us.apache.org/repos/asf/mesos/blob/923d735c/frameworks/torque/start_pbs_mom.sh ---------------------------------------------------------------------- diff --git a/frameworks/torque/start_pbs_mom.sh b/frameworks/torque/start_pbs_mom.sh deleted file mode 100755 index e0c81d0..0000000 --- a/frameworks/torque/start_pbs_mom.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -if [ "x$PYTHON" == "x" ]; then - PYTHON=python - if [ "`uname`" == "SunOS" ]; then - PYTHON=python2.6 - fi -fi - -if [ "x$MESOS_HOME" == "x" ]; then - MESOS_HOME="$(dirname $0)/../.." -fi - -export PYTHONPATH=$MESOS_HOME/lib/python -exec $PYTHON "$(dirname $0)/start_pbs_mob.py" $@ http://git-wip-us.apache.org/repos/asf/mesos/blob/923d735c/frameworks/torque/test_date_sleep_date_1node.qsub ---------------------------------------------------------------------- diff --git a/frameworks/torque/test_date_sleep_date_1node.qsub b/frameworks/torque/test_date_sleep_date_1node.qsub deleted file mode 100644 index 99d91da..0000000 --- a/frameworks/torque/test_date_sleep_date_1node.qsub +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/sh -# -#This is an example script example.sh -# -#These commands set up the Grid Environment for your job: -#PBS -N date_sleep_date_test_job -#PBS -l nodes=1 -#PBS -q batch - -#print the time and date -date - -#wait 120 seconds -sleep 10 - -#print the time and date again -date http://git-wip-us.apache.org/repos/asf/mesos/blob/923d735c/frameworks/torque/test_date_sleep_date_2node.qsub ---------------------------------------------------------------------- diff --git a/frameworks/torque/test_date_sleep_date_2node.qsub b/frameworks/torque/test_date_sleep_date_2node.qsub deleted file mode 100644 index bd7ea0f..0000000 --- a/frameworks/torque/test_date_sleep_date_2node.qsub +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash -# -#PBS -l nodes=2 - -/bin/cat $PBS_NODEFILE - -echo "Print out the hostname and date" -/bin/hostname -/bin/date -#PBS -q batch - -#wait 120 seconds -sleep 240 - -#print the time and date again -date http://git-wip-us.apache.org/repos/asf/mesos/blob/923d735c/frameworks/torque/test_date_sleep_date_3node.qsub ---------------------------------------------------------------------- diff --git a/frameworks/torque/test_date_sleep_date_3node.qsub b/frameworks/torque/test_date_sleep_date_3node.qsub deleted file mode 100644 index b8391da..0000000 --- a/frameworks/torque/test_date_sleep_date_3node.qsub +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/sh -# -#This is an example script example.sh -# -#These commands set up the Grid Environment for your job: -#PBS -N date_sleep_date_test_job -#PBS -l nodes=3 -#PBS -q batch - -#print the time and date -date - -#wait 120 seconds -sleep 120 - -#print the time and date again -date http://git-wip-us.apache.org/repos/asf/mesos/blob/923d735c/frameworks/torque/test_date_sleep_date_5node_10sec.qsub ---------------------------------------------------------------------- diff --git a/frameworks/torque/test_date_sleep_date_5node_10sec.qsub b/frameworks/torque/test_date_sleep_date_5node_10sec.qsub deleted file mode 100644 index c0e6d28..0000000 --- a/frameworks/torque/test_date_sleep_date_5node_10sec.qsub +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/sh -# -#This is an example script example.sh -# -#These commands set up the Grid Environment for your job: -#PBS -N date_sleep_date_test_job -#PBS -l nodes=5 -#PBS -q batch - -#print the time and date -date - -#wait 10 seconds -sleep 10 - -#print the time and date again -date http://git-wip-us.apache.org/repos/asf/mesos/blob/923d735c/frameworks/torque/test_date_sleep_date_5node_60sec.qsub ---------------------------------------------------------------------- diff --git a/frameworks/torque/test_date_sleep_date_5node_60sec.qsub b/frameworks/torque/test_date_sleep_date_5node_60sec.qsub deleted file mode 100644 index 17cd352..0000000 --- a/frameworks/torque/test_date_sleep_date_5node_60sec.qsub +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/sh -# -#This is an example script example.sh -# -#These commands set up the Grid Environment for your job: -#PBS -N date_sleep_date_test_job -#PBS -l nodes=5 -#PBS -q batch - -#print the time and date -date - -#wait 60 seconds -sleep 60 - -#print the time and date again -date http://git-wip-us.apache.org/repos/asf/mesos/blob/923d735c/frameworks/torque/torquelib.py ---------------------------------------------------------------------- diff --git a/frameworks/torque/torquelib.py b/frameworks/torque/torquelib.py deleted file mode 100644 index b69df1c..0000000 --- a/frameworks/torque/torquelib.py +++ /dev/null @@ -1,105 +0,0 @@ -import re -import tempfile -import logging -import xml.dom.minidom - -from subprocess import * - -logging.basicConfig() - -class Job: - def __init__(self, xmlJobElt): - #logging.debug("creating a new Job obj out of xml") - assert xmlJobElt.nodeName == "Job", "xml element passed to Job constructor was not named 'Job'" - self.resourceList = {} - for res in xmlJobElt.getElementsByTagName("Resource_List")[0].childNodes: - self.resourceList[res.nodeName] = res.childNodes[0].data - self.jobState = xmlJobElt.getElementsByTagName("job_state")[0].childNodes[0].data - -def getActiveJobs(): - logging.debug("in getJobs, grabbing xml output from qstat") - qstat_out = tempfile.TemporaryFile() - qstat_obj = Popen("qstat -x", shell=True, stdout=qstat_out) - qstat_obj.wait() - logging.debug("output of qstat: ") - jobs = [] - jobsExist = False - qstat_out.seek(0) - for line in qstat_out: - if re.match(".*Job.*", line): - #logging.debug("#############" + line + "#############") - jobsExist = True - break - if jobsExist: - qstat_out.seek(0) - dom_doc = xml.dom.minidom.parse(qstat_out) - logging.debug("grabbing the Job elements from the xml dom doc") - xmljobs = dom_doc.getElementsByTagName("Job") - logging.debug("creating a new job object for each job dom elt") - for j in xmljobs: - #make sure job's state is not complete - newJob = Job(j) - if newJob.jobState != "C": - jobs.append(newJob) - if len(jobs) == 0: - logging.debug("the string \"Job\" was not found in the qstat -x command output") - return jobs - -class Node: - def __init__(self, xmlHostElt): - #logging.debug("creating a new Host obj out of xml") - assert xmlHostElt.nodeName == "Node", "xml element passed to Node constructor was not named 'Node'" - self.name = xmlHostElt.getElementsByTagName("name")[0].childNodes[0].data - - self.state = [] - for stateElt in xmlHostElt.getElementsByTagName("state"): - assert stateElt.nodeName == "state" - for st in stateElt.childNodes[0].data.split(","): - self.state.append(st) - - self.np = xmlHostElt.getElementsByTagName("np")[0].childNodes[0].data - self.ntype = xmlHostElt.getElementsByTagName("ntype")[0].childNodes[0].data - - self.status = {} - for statusElt in xmlHostElt.getElementsByTagName("status"): - assert statusElt.nodeName == "status" - for pairs in statusElt.childNodes[0].data.split(","): - key, val = pairs.split("=") - self.status[key] = val - -#returns nodes whose state is not marked as "down" -def getNodes(): - logging.debug("in getNodes, grabbing xml output from pbsnodes") - pbsnodes_out = tempfile.TemporaryFile() - pbsnodes_obj = Popen("pbsnodes -x", shell=True, stdout=pbsnodes_out) - pbsnodes_obj.wait() - nodes = [] - nodesExist = False - pbsnodes_out.seek(0) - for line in pbsnodes_out: - #logging.debug("node line is %s" % line) - if re.match(".*Node.*", line): - nodesExist = True - break - if nodesExist: - pbsnodes_out.seek(0) - dom_doc = xml.dom.minidom.parse(pbsnodes_out) - logging.debug("grabbing the Job elements from the xml dom doc") - xmlnodes = dom_doc.getElementsByTagName("Node") - logging.debug("creating a new node object for each node dom elt") - for n in xmlnodes: - #make sure node's state is online - nodes.append(Node(n)) - if len(nodes) == 0: - logging.debug("the string \"Node\" was not found in the pbsnodes -x command output") - return nodes - -#TODO: DELETE THIS? Might note be used eventually -def getQueueLength(): - #logging.debug("computing the number of active jobs in the queue") - qstat = Popen("qstat -Q",shell=True,stdout=PIPE).stdout - jobcount = 0 - for line in qstat: - if re.match('^batch.*', line): - jobcount = int(line.split()[5]) + int(line.split()[6]) + int(line.split()[7]) + int(line.split()[8]) - return jobcount http://git-wip-us.apache.org/repos/asf/mesos/blob/923d735c/frameworks/torque/torquesched.py ---------------------------------------------------------------------- diff --git a/frameworks/torque/torquesched.py b/frameworks/torque/torquesched.py deleted file mode 100644 index da12f05..0000000 --- a/frameworks/torque/torquesched.py +++ /dev/null @@ -1,266 +0,0 @@ -#!/usr/bin/env python - -import mesos -import os -import sys -import time -import httplib -import Queue -import threading -import re -import socket -import torquelib -import time -import logging -import logging.handlers - -from optparse import OptionParser -from subprocess import * -from socket import gethostname - -PBS_SERVER_FILE = "/var/spool/torque/server_name" -EVENT_LOG_FILE = "log_fw_utilization.txt" -LOG_FILE = "log.txt" - -SCHEDULER_ITERATION = 2 #number of seconds torque waits before looping through - #the queue to try to match resources to jobs. default - #is 10min (ie 600) but we want it to be low so jobs - #will run as soon as the framework has acquired enough - #resources -SAFE_ALLOCATION = {"cpus":48,"mem":128} #just set statically for now, 128MB -MIN_SLOT_SIZE = {"cpus":"1","mem":1024} #1GB -MIN_SLOTS_HELD = 0 #keep at least this many slots even if none are needed - -eventlog = logging.getLogger("event_logger") -eventlog.setLevel(logging.DEBUG) -fh = logging.FileHandler(EVENT_LOG_FILE,'w') #create handler -fh.setFormatter(logging.Formatter("%(asctime)s %(message)s")) -eventlog.addHandler(fh) - -#Something special about this file makes logging not work normally -#I think it might be swig? the StreamHandler prints at DEBUG level -#even though I setLevel to INFO -ch = logging.StreamHandler() -ch.setLevel(logging.INFO) -fh = logging.FileHandler(LOG_FILE,"w") -fh.setLevel(logging.DEBUG) - -driverlog = logging.getLogger("driver_logger") -driverlog.setLevel(logging.DEBUG) -driverlog.addHandler(fh) -#driverlog.addHandler(ch) - -monitorlog = logging.getLogger("monitor_logger") -monitorlog.setLevel(logging.DEBUG) -monitorlog.addHandler(fh) -#monitorlog.addHandler(ch) - -class MyScheduler(mesos.Scheduler): - def __init__(self, ip): - mesos.Scheduler.__init__(self) - self.lock = threading.RLock() - self.id = 0 - self.ip = ip - self.servers = {} - self.overloaded = False - self.numToRegister = MIN_SLOTS_HELD - - def getExecutorInfo(self, driver): - execPath = os.path.join(os.getcwd(), "start_pbs_mom.sh") - initArg = self.ip # tell executor which node the pbs_server is running on - driverlog.info("in getExecutorInfo, setting execPath = " + execPath + " and initArg = " + initArg) - return mesos.ExecutorInfo(execPath, initArg) - - def registered(self, driver, fid): - driverlog.info("Mesos torque framework registered with frameworkID %s" % fid) - - def resourceOffer(self, driver, oid, slave_offers): - self.driver = driver - driverlog.debug("Got slot offer %d" % oid) - self.lock.acquire() - driverlog.debug("resourceOffer() acquired lock") - tasks = [] - for offer in slave_offers: - # if we haven't registered this node, accept slot & register w pbs_server - #TODO: check to see if slot is big enough - if self.numToRegister <= 0: - driverlog.debug("Rejecting slot, no need for more slaves") - continue - if offer.host in self.servers.values(): - driverlog.debug("Rejecting slot, already registered node " + offer.host) - continue - if len(self.servers) >= SAFE_ALLOCATION["cpus"]: - driverlog.debug("Rejecting slot, already at safe allocation (i.e. %d CPUS)" % SAFE_ALLOCATION["cpus"]) - continue - driverlog.info("Need %d more nodes, so accepting slot, setting up params for it..." % self.numToRegister) - params = {"cpus": "1", "mem": "1024"} - td = mesos.TaskDescription( - self.id, offer.slaveId, "task %d" % self.id, params, "") - tasks.append(td) - self.servers[self.id] = offer.host - self.regComputeNode(offer.host) - self.numToRegister -= 1 - self.id += 1 - driverlog.info("writing logfile") - eventlog.info("%d %d" % (time.time(),len(self.servers))) - driverlog.info("done writing logfile") - driverlog.info("self.id now set to " + str(self.id)) - #print "---" - driver.replyToOffer(oid, tasks, {"timeout": "1"}) - self.lock.release() - driverlog.debug("resourceOffer() finished, released lock\n\n") - - def statusUpdate(self, driver, status): - driverlog.info("got status update from TID %s, state is: %s, data is: %s" %(status.taskId,status.state,status.data)) - - def regComputeNode(self, new_node): - driverlog.info("registering new compute node, "+new_node+", with pbs_server") - driverlog.info("checking to see if node is registered with server already") - #nodes = Popen("qmgr -c 'list node'", shell=True, stdout=PIPE).stdout - nodes = Popen("pbsnodes", shell=True, stdout=PIPE).stdout - driverlog.info("output of pbsnodes command is: ") - for line in nodes: - driverlog.info(line) - if line.find(new_node) != -1: - driverlog.info("Warn: tried to register node that's already registered, skipping") - return - #add node to server - driverlog.info("registering node with command: qmgr -c create node " + new_node) - qmgr_add = Popen("qmgr -c \"create node " + new_node + "\"", shell=True, stdout=PIPE).stdout - driverlog.info("output of qmgr:") - for line in qmgr_add: driverlog.info(line) - - def unregComputeNode(self, node_name): - #remove node from server - monitorlog.info("removing node from pbs_server: qmgr -c delete node " + node_name) - monitorlog.info(Popen('qmgr -c "delete node ' + node_name + '"', shell=True, stdout=PIPE).stdout) - - #unreg up to N random compute nodes, leave at least one - def unregNNodes(self, numNodes): - monitorlog.debug("unregNNodes called with arg %d" % numNodes) - if numNodes > len(self.servers)-MIN_SLOTS_HELD: - monitorlog.debug("... however, only unregistering %d nodes, leaving one alive" % (len(self.servers)-MIN_SLOTS_HELD)) - toKill = min(numNodes,len(self.servers)-MIN_SLOTS_HELD) - - monitorlog.debug("getting and filtering list of nodes using torquelib") - noJobs = lambda x: x.state != "job-exclusive" - inactiveNodes = map(lambda x: x.name,filter(noJobs, torquelib.getNodes())) - monitorlog.debug("victim pool of inactive nodes:") - for inode in inactiveNodes: - monitorlog.debug(inode) - for tid, hostname in self.servers.items(): - if len(self.servers) > MIN_SLOTS_HELD and toKill > 0 and hostname in inactiveNodes: - monitorlog.info("We still have to kill %d of the %d compute nodes which master is tracking" % (toKill, len(self.servers))) - monitorlog.info("unregistering node " + str(hostname)) - self.unregComputeNode(hostname) - self.servers.pop(tid) - eventlog.info("%d %d" % (time.time(),len(self.servers))) - toKill = toKill - 1 - monitorlog.info("killing corresponding task with tid %d" % tid) - self.driver.killTask(tid) - if toKill > 0: - monitorlog.warn("Done killing. We were supposed to kill %d nodes, but only found and killed %d free nodes" % (numNodes, numNodes-toKill)) - - def getFrameworkName(self, driver): - return "Mesos Torque Framework" - - -def monitor(sched): - while True: - time.sleep(1) - monitorlog.debug("monitor thread acquiring lock") - sched.lock.acquire() - monitorlog.debug("computing num nodes needed to satisfy eligable jobs in queue") - needed = 0 - jobs = torquelib.getActiveJobs() - monitorlog.debug("retreived jobs in queue, count: %d" % len(jobs)) - for j in jobs: - #WARNING: this check should only be used if torque is using fifo queue - #if needed + j.needsnodes <= SAFE_ALLOCATION: - monitorlog.debug("job resource list is: " + str(j.resourceList)) - needed += int(j.resourceList["nodect"]) - monitorlog.debug("number of nodes needed by jobs in queue: %d" % needed) - numToRelease = len(sched.servers) - needed - monitorlog.debug("number of nodes to release is %d - %d" % (len(sched.servers),needed)) - if numToRelease > 0: - sched.unregNNodes(numToRelease) - sched.numToRegister = 0 - else: - monitorlog.debug("monitor updating sched.numToRegister from %d to %d" % (sched.numToRegister, numToRelease * -1)) - sched.numToRegister = numToRelease * -1 - sched.lock.release() - monitorlog.debug("monitor thread releasing lock") - monitorlog.debug("\n") - -if __name__ == "__main__": - parser = OptionParser(usage = "Usage: %prog mesos_master") - - (options,args) = parser.parse_args() - if len(args) < 1: - print >> sys.stderr, "At least one parameter required." - print >> sys.stderr, "Use --help to show usage." - exit(2) - - fqdn = socket.getfqdn() - ip = socket.gethostbyname(gethostname()) - - #monitorlog.info("running killall pbs_server") - #Popen("killall pbs_server", shell=True) - #time.sleep(1) - - #monitorlog.info("writing $(TORQUECFG)/server_name file with fqdn of pbs_server: " + fqdn) - #Popen("touch %s" % PBS_SERVER_FILE, shell=True) - #FILE = open(PBS_SERVER_FILE,'w') - #FILE.write(fqdn) - #FILE.close() - - monitorlog.info("starting pbs_server") - #Popen("/etc/init.d/pbs_server start", shell=True) - Popen("pbs_server", shell=True) - #time.sleep(2) - - # monitorlog.info("running command: qmgr -c \"set queue batch resources_available.nodes=%s\"" % SAFE_ALLOCATION["cpus"]) - # Popen("qmgr -c \"set queue batch resources_available.nodect=%s\"" % SAFE_ALLOCATION["cpus"], shell=True) - # Popen("qmgr -c \"set server resources_available.nodect=%s\"" % SAFE_ALLOCATION["cpus"], shell=True) - - # #these lines might not be necessary since we hacked the torque fifo scheduler - # Popen("qmgr -c \"set queue batch resources_max.nodect=%s\"" % SAFE_ALLOCATION["cpus"], shell=True) - # Popen("qmgr -c \"set server resources_max.nodect=%s\"" % SAFE_ALLOCATION["cpus"], shell=True) - # Popen("qmgr -c \"set server scheduler_iteration=%s\"" % SCHEDULER_ITERATION, shell=True) - - # outp = Popen("qmgr -c \"l queue batch\"", shell=True, stdout=PIPE).stdout - # for l in outp: - # monitorlog.info(l) - - # monitorlog.info("RE-killing pbs_server for resources_available setting to take effect") - # #Popen("/etc/init.d/pbs_server start", shell=True) - # Popen("qterm", shell=True) - # time.sleep(1) - - # monitorlog.info("RE-starting pbs_server for resources_available setting to take effect") - #Popen("pbs_server", shell=True) - # monitorlog.debug("qmgr list queue settings: ") - # output = Popen("qmgr -c 'l q batch'", shell=True, stdout=PIPE).stdout - # for line in output: - # monitorlog.debug(line) - - # monitorlog.info("running killall pbs_sched") - # Popen("killall pbs_sched", shell=True) - # #time.sleep(2) - - monitorlog.info("starting pbs_scheduler") - #Popen("/etc/init.d/pbs_sched start", shell=True) - Popen("pbs_sched", shell=True) - - #ip = Popen("hostname -i", shell=True, stdout=PIPE).stdout.readline().rstrip() #linux - #ip = Popen("ifconfig en1 | awk '/inet / { print $2 }'", shell=True, stdout=PIPE).stdout.readline().rstrip() # os x - monitorlog.info("Remembering IP address of scheduler (" + ip + "), and fqdn: " + fqdn) - - monitorlog.info("Connecting to mesos master %s" % args[0]) - - sched = MyScheduler(fqdn) - threading.Thread(target = monitor, args=[sched]).start() - - mesos.MesosSchedulerDriver(sched, args[0]).run() - - monitorlog.info("Finished!") http://git-wip-us.apache.org/repos/asf/mesos/blob/923d735c/frameworks/torque/torquesched.sh ---------------------------------------------------------------------- diff --git a/frameworks/torque/torquesched.sh b/frameworks/torque/torquesched.sh deleted file mode 100755 index a40c7ea..0000000 --- a/frameworks/torque/torquesched.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -if [ "x$PYTHON" == "x" ]; then - PYTHON=python - if [ "`uname`" == "SunOS" ]; then - PYTHON=python2.6 - fi -fi - -if [ "x$MESOS_HOME" == "x" ]; then - MESOS_HOME="$(dirname $0)/../.." -fi - -export PYTHONPATH=$MESOS_HOME/lib/python -exec $PYTHON "$(dirname $0)/torquesched.py" $@
