Author: degenaro Date: Wed Aug 15 12:05:15 2018 New Revision: 1838082 URL: http://svn.apache.org/viewvc?rev=1838082&view=rev Log: UIMA-5742 Reliable DUCC - New versions of start_ducc, stop_ducc, and check_ducc that use DB instead of PID files - New python script db_autostart_delete to remove an autostart table entry - New python script db_autostart_query to query autostart table entries - New autostart python script to be run as crontab on each DUCC host to keep daemons alive - New crontab example for launching autostart script - New DB package to support autostart CRUD operations - Revised DuccBook documenting above
Added: uima/uima-ducc/trunk/src/main/admin/2.2_check_ducc uima/uima-ducc/trunk/src/main/admin/2.2_start_ducc uima/uima-ducc/trunk/src/main/admin/2.2_stop_ducc uima/uima-ducc/trunk/src/main/admin/autostart.py (with props) uima/uima-ducc/trunk/src/main/admin/cron/db_autostart.crontab.example uima/uima-ducc/trunk/src/main/admin/db_autostart_delete.py (with props) uima/uima-ducc/trunk/src/main/admin/db_autostart_query.py (with props) uima/uima-ducc/trunk/uima-ducc-database/src/main/java/org/apache/uima/ducc/database/lifetime/ uima/uima-ducc/trunk/uima-ducc-database/src/main/java/org/apache/uima/ducc/database/lifetime/DbDaemonLifetime.java (with props) uima/uima-ducc/trunk/uima-ducc-database/src/main/java/org/apache/uima/ducc/database/lifetime/DbDaemonLifetimeCommon.java (with props) uima/uima-ducc/trunk/uima-ducc-database/src/main/java/org/apache/uima/ducc/database/lifetime/DbDaemonLifetimeUI.java (with props) uima/uima-ducc/trunk/uima-ducc-database/src/main/java/org/apache/uima/ducc/database/lifetime/IDbDaemonLifetime.java (with props) Modified: uima/uima-ducc/trunk/src/main/admin/check_ducc uima/uima-ducc/trunk/src/main/admin/ducc_util.py uima/uima-ducc/trunk/src/main/admin/start_ducc uima/uima-ducc/trunk/src/main/admin/stop_ducc uima/uima-ducc/trunk/uima-ducc-duccdocs/src/site/tex/duccbook/part4/admin/admin-commands.tex Added: uima/uima-ducc/trunk/src/main/admin/2.2_check_ducc URL: http://svn.apache.org/viewvc/uima/uima-ducc/trunk/src/main/admin/2.2_check_ducc?rev=1838082&view=auto ============================================================================== --- uima/uima-ducc/trunk/src/main/admin/2.2_check_ducc (added) +++ uima/uima-ducc/trunk/src/main/admin/2.2_check_ducc Wed Aug 15 12:05:15 2018 @@ -0,0 +1,428 @@ +#!/usr/bin/env python +# ----------------------------------------------------------------------- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# ----------------------------------------------------------------------- + + +import os +import sys +from time import time +import getopt +import signal + +from ducc_util import DuccUtil +from properties import Properties +from local_hooks import verify_slave_node +from local_hooks import verify_master_node + +#from ducc_util import ThreadWorker +from ducc_util import ThreadPool + +class CheckDucc(DuccUtil): + + def __init__(self): + DuccUtil.__init__(self) + self.badnodes = [] + + def validate(self, checkdate): + verify_slave_node(checkdate, self.ducc_properties) + self.check_clock_skew(checkdate) + self.verify_jvm() + self.verify_limits() + (viable, elevated, safe) = self.verify_duccling() + self.duccling_ok(viable, elevated, safe) + if ( not safe or not viable ): + print 'NOTOK ducc_ling is not installed correctly.' 
+ + return + + def verify_database(self): + if ( self.db_bypass == True ): + return True + + ret = self.db_alive(1) + if ( ret ): + print 'The database is running' + else: + print 'The database is not running' + + def verify_activemq(self): + if ( self.is_amq_active() ): + print 'ActiveMQ is found listening at', self.broker_protocol + "://" + self.broker_host + ':' + self.broker_port + return True + return False + + def check_node(self, args): + + messages = [] + spacer = ' ' + node = args[0] + + messages.append((' ')) + messages.append(('Checking', node, '...')) + + if(self.ssh_operational(node)): + text = "ssh is operational to "+node + #print text + else: + text = "ssh is NOT operational to "+node + print text + messages.append((spacer, text)) + return messages + + response = self.find_ducc_process(node) # a tuple, (True|False, proclist) + if ( not response[0] ): + messages.append((spacer, "No response.")) + return messages + + proclist = response[1] # a list of tuples, tuple is (component, pid, user) + if ( len(proclist) > 0 ): + for proc in proclist: + component = proc[0] + pid = proc[1] + found_user = proc[2] + + signal = self.kill_signal + + if(self.is_reliable_backup()): + if ( component == 'agent' ): + continue + + if ( component == 'orchestrator' ): + component = 'or' + + if ( component == 'database' ): + if ( signal != None ): + if ( self.kill_db9 == False ): + signal = '-QUIT' + + process_id = found_user + ' ' + component + '@' + node + ' PID ' + pid + if ( signal != None ) : + if ( self.user != found_user ): + messages.append((spacer, "Not killing someone else's process.", process_id)) + elif ( component == 'unknown-java' ): + messages.append((spacer, 'Not killing non-ducc process', process_id)) + else: + messages.append((spacer, 'Killing (' + signal + ')', process_id)) + self.kill_process(node, proc, signal) + if ( component == 'agent' ): + self.pids_agents.delete(pid) + else: + self.pids_daemons.delete(pid) + process_changes = True + + else: + 
messages.append((spacer, 'Found', process_id)) + full_name = component + '@' + node + if ( component == 'agent' ): + self.pids_agents.put(full_name, pid) + else: + if ( component in self.default_components ): + self.pids_daemons.put(full_name, pid) + self.pids_daemons.put(component, full_name) + else: + messages.append((spacer, 'no processes found.')) + + if ( self.kill_signal == None ): + response = "Node health checks return." + lines = self.ssh(node, True, self.DUCC_HOME + "/admin/check_ducc", "-x", str(int(time()))) + while 1: + line = lines.readline() + if ( 'signal' in line ): + response = "Node health did not complete: " + line + self.badnodes.append(node) + # these next two filter junk if 'mesg' is running in a shell rc + if ( 'stdin: is not a tty' in line ): + continue + if ( 'mesg' in line ): + continue + + if ( not line ): + break + line = line.strip() + messages.append((spacer, line)) + #messages.append((spacer, '[]', line)) + messages.append((spacer, response)) + + return messages + + def signalHandler(self, signum, frame): + print "-------- Caught signal", signum, "--------" + if ( len(self.badnodes) != 0 ): + print "Health checks on these nodes did not return:" + for n in self.badnodes: + print n, + print '' + sys.exit(1) + + def usage(self, msg): + if ( msg != None ): + print msg + print "Usage:" + print " check_ducc [options]" + print " If no options are given this is the equivalent of:" + print "" + print " check_ducc -n ../resources/ducc.nodes" + print "" + print " For reliable DUCC agents will not be killed from backup head node. " + print "" + print " Broker will not be killed when ducc.broker.automanage = false. " + print " Database will not be killed when ducc.database.automanage = false. " + print "" + print "Options:" + print " -n --nodelist nodefile" + print " Check for agents on the nodes in nodefile. This option may be specified multiple time" + print " for multiple nodefiles. 
The 'local' node is always checked" + print "" + print " -c --configuration" + print " Do basic sanity checking on the configuration only. Note that configuration checking is always" + print " performed with most options. The [-c, --configuration] option does ONLY configuration checking." + print "" + print " -k --kill" + print " Force-kill any DUCC process you find on a node (if normal stop_ducc isn't working. This" + print " uses kill -KILL (-9) for all daemons, except database which uses -QUIT (3)," + print " and only kills processes owned by the invoking user." + print "" + print " --db-9" + print " Use signal -KILL (-9) to kill database, rather than the default -QUIT (-3)" + print "" + print " -i --int" + print " Force-kill any DUCC process you find on a node (if normal stop_ducc isn't working. This" + print " uses kill -INT (-2) and only kills processes owned by the invoking user." + print "" + print " -q --quit" + print " Force-kill any DUCC process you find on a node (if normal stop_ducc isn't working. This" + print " uses kill -QUIT (-3) and only kills processes owned by the invoking user." + print "" + print " -p --pids" + print " Rewrite the PID file. The PID file is always rewritten if any changes to processes are made. Sometimes" + print " the PID file needs rebuilding. This option causes the file to be rebuilt regardless of" + print " changes." + print "" + print " -x localdate" + print " Validate the local installation, called via ssh usually. The date is the date on the calling machine." + print "" + print " --nothreading" + print " Disable multithreaded operation if it would otherwise be used" + print "" + print " -v --verbose" + print " If specified, print the validated configuration to the console." + print "" + print " -? prints this message." 
+ sys.exit(1) + + def main(self, argv): + + try: + opts, args = getopt.getopt(argv, 'cikn:opqx:h?v', ['configuration', 'nodelist=', 'int', 'quit', 'kill', 'db-9', 'pids', 'verbose', 'nothreading', ]) + except: + self.usage("Invalid arguments " + ' '.join(argv)) + + nodefiles = [] + self.user = os.environ['LOGNAME'] + self.kill_signal = None + self.kill_db9 = False + redo_pids = False + process_changes = False + do_validate = False + checkdate = 0 + config_only = False + verbose = False + + for ( o, a ) in opts: + if o in ('-c', '--configuration'): + config_only = True + elif o in ('-n', '--nodelist'): + nodefiles.append(a) + elif o in ('-i', '--int'): + if ( self.kill_signal != None ): + print 'Conflicting kill signals: -INT and', self.kill_signal + return + self.kill_signal = '-INT' + elif o in ('-q', '--quit'): + if ( self.kill_signal != None ): + print 'Conflicting kill signals: -QUIT and', self.kill_signal + return + self.kill_signal = '-QUIT' + elif o in ('-k', '--kill'): + if ( self.kill_signal != None ): + print 'Conflicting kill signals: -KILL and', self.kill_signal + return + self.kill_signal = '-KILL' + elif o in ('--db-9'): + self.kill_db9 = True + elif o in ( '--nothreading' ): + self.disable_threading() + elif o in ('-p', '--pids'): + redo_pids = True + elif o in ('-x'): + # intended to be called recursively from check_ducc, NOT from the command line + do_validate = True + checkdate = float(a) + elif o in ('-v', '--verbose'): + verbose = True + elif o in ('-h', '-?', '--help'): + self.usage(None) + else: + print 'badarg', a + usage('bad arg: ' + a) + + + if not self.installed(): + print "Head node is not initialized. Have you run ducc_post_install?" 
+ return + + self.check_properties() + + if ( do_validate ): + # if validating, ONLY validate, called via ssh usually + self.validate(checkdate) + return + + # When called directly must be from the head node + self.verify_head() + + self.set_duccling_version() + + os.system('cat ' + self.DUCC_HOME + '/state/duccling.version') + # not -x option, do this only on local node + env = self.show_ducc_environment() + for e in env: + print e + + + jvm = self.ducc_properties.get('ducc.jvm') + if ( jvm == None ): + print 'WARN: ducc.jvm is not specified in ducc.properties. Default is simply "java" which may not work on all nodes.' + + if ( not verify_master_node(self.ducc_properties) ): + print 'FAIL: Cannot verify master mode' + return + + if ( not self.verify_activemq() ): + print 'ActiveMQ broker is not running on', self.broker_protocol + "://" + self.broker_host + ':' + self.broker_port + + self.verify_database() + + # init the PID file + if(not self.is_reliable_backup()): + self.pids_agents = Properties() + self.pids_agents.load_if_exists(self.pid_file_agents) + self.pids_daemons = Properties() + self.pids_daemons.load_if_exists(self.pid_file_daemons) + + # read the nodelists + if ( len(nodefiles) == 0 ): + nodefiles = self.default_nodefiles + check_nodepools = True + else: + # if using other than the fully configured set of nodes we can't reliably check nodepools + # because anything other than the full set of nodes may be missing something + check_nodepools = False + + nodes = {} + n_nodes = 0 + for nf in nodefiles: + n_nodes, nodes = self.read_nodefile(nf, nodes) + + # + # add in the local host if needed, and the webserver node + # + localnodes = [] + if ( not self.localhost in nodes ): + localnodes.append(self.localhost) + + if ( not (self.webserver_node in ['localhost', self.localhost, None]) ): + localnodes.append(self.webserver_node) + + if ( len(localnodes) > 0 ): + nodes['local'] = localnodes + + self.verify_jvm() + + + if ( config_only ): + if ( nodefiles != 
self.default_nodefiles): + print "NOTOK: Config check only works with full, default nodefile:", self.default_nodefiles + return + if self.verify_class_configuration(nodefiles[0], verbose): + print "OK: Class configuration checked" + else: + print "NOTOK: Errors in class or node configuration." + + return + + # checking starts here + print "Checking", n_nodes, "nodes" + self.threadpool = ThreadPool(n_nodes + 5) # more for the head processes + checked = {} + + signal.signal(signal.SIGINT, self.signalHandler) + + try: + for (nodefile, nodelist) in nodes.items(): + if ( nodelist == None ): + # loading the nodes prints the necessary message + continue + for node in nodelist: + if ( checked.has_key(node) ): + continue + + checked[node] = node + self.threadpool.invoke(self.check_node, node) + # check backup head node(s) + for node in self.get_head_node_list(): + if(not node in checked): + checked[node] = node + self.threadpool.invoke(self.check_node, node) + except: + self.threadpool.quit() + print sys.exc_info()[0], "Exiting." 
+ sys.exit(1) + + self.threadpool.quit() + + if ( self.kill_signal != None ): + if(self.automanage_broker): + print 'Stopping broker' + self.stop_broker() + else: + print 'Not stopping broker - not automanaged' + if(self.automanage_database): + print 'Stopping database' + self.db_stop() + else: + print 'Not stopping database - not automanaged' + + if(not self.is_reliable_backup()): + if ( len(self.pids_agents) == 0): + if ( os.path.exists(self.pid_file_agents) ): + os.remove(self.pid_file_agents) + elif (process_changes or redo_pids): + self.pids_agents.write(self.pid_file_agents) + + if ( len(self.pids_daemons) == 0): + if ( os.path.exists(self.pid_file_daemons) ): + os.remove(self.pid_file_daemons) + elif (process_changes or redo_pids): + self.pids_daemons.write(self.pid_file_daemons) + +if __name__ == "__main__": + checker = CheckDucc() + checker.main(sys.argv[1:]) + Added: uima/uima-ducc/trunk/src/main/admin/2.2_start_ducc URL: http://svn.apache.org/viewvc/uima/uima-ducc/trunk/src/main/admin/2.2_start_ducc?rev=1838082&view=auto ============================================================================== --- uima/uima-ducc/trunk/src/main/admin/2.2_start_ducc (added) +++ uima/uima-ducc/trunk/src/main/admin/2.2_start_ducc Wed Aug 15 12:05:15 2018 @@ -0,0 +1,431 @@ +#!/usr/bin/env python +# ----------------------------------------------------------------------- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# ----------------------------------------------------------------------- + + +import os +import sys +import time +import getopt +import threading +import traceback + +from ducc_util import DuccUtil +from properties import Properties +from local_hooks import verify_slave_node +from local_hooks import verify_master_node +from ducc import Ducc +from ducc_util import ThreadPool +from ducc_base import find_ducc_home + +class StartDucc(DuccUtil): + + def __init__(self): + DuccUtil.__init__(self, True) + + def start_broker(self): + + broker_host = self.localhost + print 'Starting broker on', broker_host + lines = self.ssh(broker_host, True, "'", self.DUCC_HOME + '/admin/ducc.py', '-c', 'broker', "'") + while 1: + line = lines.readline().strip() + if ( not line ): + break + #print '[] ' + line + if ( line.startswith('PID') ): + toks = line.split(' ') # get the PID + print "Broker on", broker_host, 'PID', toks[1] + self.pids_daemons.put('broker@' + broker_host, toks[1]) + lines.close() + break + + for i in range(0, 9): + if ( self.is_amq_active() ): + return + print 'Waiting for broker .....', str(i) + time.sleep(1) + + def start_component(self, args): + + ducc, component, or_parms = args + msgs = [] + + node = self.ducc_properties.get('ducc.head') + + com = component + if ( com.find('@') >= 0 ): + com, node = com.split('@') + + if (com in self.local_components): + node = self.localhost + + if ((com in self.default_components) or ( com == 'agent')) : + msgs.append((node, 'Starting', com)) + else: + msgs.append(('Unrecognized component', component)) + return msgs + 
+ + if ( or_parms == None ): + or_parms = '--or_parms=' + else: + or_parms = '--or_parms=' + or_parms + + if ( node == 'local' ): + node = self.localhost + + lines = self.ssh(node, True, "'", self.DUCC_HOME + '/admin/ducc.py', '-c', com, '-b', or_parms, '-d', str(time.time()), '--nodup', "'") + + # we'll capture anything that the python shell spews because it may be useful, and then drop the + # pipe when we see a PID message + while 1: + line = lines.readline().strip() + if ( not line ): + break + #msgs.append(('[]', line)) + if ( line.startswith('PID') ): + toks = line.split(' ') # get the PID + msgs.append((' PID', toks[1])) + self.pids_daemons.put(com + '@' + node, toks[1]) + lines.close() + break + if ( line.startswith('WARN') ): + msgs.append((' ', line)) + + if ( com in self.default_components ): # tracks where the management processes are + self.pidlock.acquire() + self.pids_daemons.put(com, com + '@' + node) + self.pidlock.release() + + return msgs + + def start_one_agent(self, args): + + host = args[0] + msgs = [] + spacer = ' ' + msgs.append((host, "")) + lines = self.ssh(host, True, "'", self.DUCC_HOME + '/admin/ducc.py', '-c' 'agent', '-b', '-d', str(time.time()), '--nodup', "'") + for line in lines: + line = line.strip() + # print '[]', host, line + # msgs.append(('[l]', line)) + if ( line.startswith('PID') ): + toks = line.split(' ') + pid = toks[1] + self.pidlock.acquire() + self.pids_agents.put('agent@' + host, pid) + self.pidlock.release() + + lines.close() + msgs.append((spacer, 'DUCC Agent started PID', pid)) + break + + if ( 'tty' in line ): + # ssh junk if mesg is set + continue + + toks = line.split() + + sshmsgs = self.ssh_ok(host, line ) + if ( sshmsgs != None ): + for m in sshmsgs: + print '[S]', m + + if ( toks[0] == 'NOTOK' ): + msgs.append((spacer, 'NOTOK Not started:', ' '.join(toks[1:]))) + else: + msgs.append((spacer, line)) + + return msgs + + def verify_required_directories(self): + for dir in ('history', 'state', 'logs'): + d = 
self.DUCC_HOME + '/' + dir + if ( not os.path.exists(d) ): + print "Initializing", d + os.mkdir(d) + + def usage(self, *msg): + if ( msg[0] != None ): + print ' '.join(msg) + + print "Usage:" + print " start_ducc [options]" + print " If no options are given, all DUCC processes are started, using the default" + print " nodelist, DUCC_HOME/resources/ducc.nodes. " + print "" + print " For reliable DUCC agents will not be started from backup head node. " + print "" + print " Broker will not be started when ducc.broker.automanage = false. " + print " Database will not be started when ducc.database.automanage = false. " + print "" + print "Options:" + print " -n --nodelist nodefile" + print " Start agents on the nodes in the nodefile. Multiple nodefiles may be specified:" + print "" + print " start_ducc -n foo.nodes -n bar.nodes -n baz.nodes" + print "" + print " -c, --component component" + print " Start a specific DUCC component, optionally on a specific node. If the component name" + print " is qualified with a nodename, the component is started on that node. To qualify a" + print " component name with a destination node, use the notation component@nodename." + print " Multiple components may be specified:" + print "" + print " start_ducc -c sm -c pm -c rm@node1 -c or@node2 -c agent@remote1 -c agent@remote2" + print "" + print " Components include:" + print " rm - resource manager" + print " or - orchestrator" + print " pm - process manager" + print " sm - services manager" + print " ws - web server" + print " agent - node agent" + print ' head = { or, pm, rm, sm, ws, db, broker }' + print "" + print " --nothreading" + print " Disable multithreaded operation if it would otherwise be used" + print "" + print " Choose none or one of the following two options, which is only effective when the orchestrator (or) component is started." + print " When specified here it supersedes that specified for ducc.orchestrator.start.type in ducc.properties." 
+ print " When not specified here or in ducc.properties, the default is --warm." + print "" + print " --warm" + print " Do NOT force active Jobs, Services, and Reservations to Completed state." + print "" + print " --cold" + print " Force active Jobs, Services, and Reservations to Completed state." + print "" + print "Examples:" + print " Start all DUCC processes, using custom nodelists:" + print " start_ducc -n foo.nodes -n bar.nodes" + print "" + print " Start just agents on a specific set of nodes:" + print " start_ducc -n foo.nodes -n bar.nodes" + print "" + print " Start the webserver on node 'bingle':" + print " start_ducc -c ws@bingle" + sys.exit(1) + + def invalid(self, *msg): + if ( msg[0] != None ): + print + print ' '.join(msg) + print + print "For usage run" + print " start_ducc -h" + print 'or' + print ' start_ducc --help' + sys.exit(1) + + def main(self, argv): + + self.verify_head() + + self.check_properties() + + if ( not self.verify_jvm() ): + sys.exit(1); + + self.set_duccling_version() + + nodefiles = [] + components = [] + or_parms = self.ducc_properties.get('ducc.orchestrator.start.type') + if(not self.is_reliable_backup()): + self.pids_agents = Properties() + self.pids_agents.load_if_exists(self.pid_file_agents) + self.pids_daemons = Properties() + self.pids_daemons.load_if_exists(self.pid_file_daemons) + + try: + opts, args = getopt.getopt(argv, 'c:mn:sh?v', ['component=', 'help', 'nodelist=', 'cold', 'warm', 'nothreading']) + except: + self.invalid('Invalid arguments', ' '.join(argv)) + + if (len(args) > 0): + self.invalid('Invalid extra args: ', ' '.join(args)) + + for ( o, a ) in opts: + if o in ( '-c', '--component' ): + if (a.strip() == 'head'): + components.append('or') + components.append('pm') + components.append('rm') + components.append('sm') + components.append('ws') + components.append('db') + components.append('broker') + else: + components.append(a) + elif o in ( '-n', '--nodelist' ): + nodefiles.append(a) + elif o in ( 
'--nothreading' ): + self.disable_threading() + elif o in ( '--cold', '--warm' ): + or_parms = o[2:] # (strip the leading --) + elif ( o == '-v'): + print self.version() + sys.exit(0) + elif o in ( '-h', '--help' ): + self.usage(None) + elif ( o == '-?'): + self.usage(None) + else: + self.invalid('bad arg: ', o, 'in:', ' '.join(argv)) + + if not self.installed(): + print "Head node is not initialized. Have you run ducc_post_install?" + return + + environ = self.show_ducc_environment() + for e in environ: + print e + + # no args, or just -s - make equivalent of -management and -nodefile=DUCC.HOME/resources/ducc.nodes + if ( (len(components) == 0) and (len(nodefiles) == 0 ) ) : + nodefiles = self.default_nodefiles + components = self.default_components + + self.verify_required_directories() + + if ( not verify_master_node(self.ducc_properties) ): + print 'FAIL: Cannot run javac to run java verification' + return + + # make sure all the nodefiles exist and are readable + ok = True + nodes = {} + n_nodes = 0 + for n in nodefiles: + n_nodes, nodes = self.read_nodefile(n, nodes) + + for ( nf, nl ) in nodes.items(): + if ( nl == None ): + print "Can't read nodefile", nf + ok = False + + if ok and (nodefiles == self.default_nodefiles): + if self.verify_class_configuration(nodefiles[0], False): + print "OK: Class configuration checked" + else: + print "NOTOK: Bad configuration, cannot start." + ok = False + + if ( not ok ): + sys.exit(1) + + if ( not self.verify_limits() ): + print "Limits too low to run DUCC" + sys.exit(1) + + # activeMQ needs to be started externally before starting any DUCC processes + if ( self.automanage_broker and ('broker' in components) ): + if ( self.is_amq_active() ): + print 'ActiveMQ broker is already running on host and port:', self.broker_host + ':' + self.broker_port, 'NOT restarting' + else: + try: + self.start_broker() + except: + print sys.exc_info()[0], "DUCC may not be started correctly." 
+ sys.exit(1) + + if ( self.automanage_database and ('db' in components) ): + try: + if ( not self.db_start() ): + print "Failed to start or connect to the database." + sys.exit(1) + except Exception (e): + # print e + print sys.exc_info()[0], "Can't start the database." + sys.exit(1) + + if ( self.is_amq_active() ): + print 'ActiveMQ broker is found on configured host and port:', self.broker_host + ':' + self.broker_port + else: + print 'ActiveMQ broker is required but cannot be found on', self.broker_host + ':' + self.broker_port + sys.exit(1) + + ducc = Ducc() + + self.threadpool = ThreadPool(n_nodes + 5) # a few more for the head processes + self.pidlock = threading.Lock() + + #start 'or' first to field system log requests + if ( len(components) != 0 ): + for com in components: + if ( com in ('or') ): + try: + self.threadpool.invoke(self.start_component, ducc, com, or_parms) + #self.start_component(ducc, com, or_parms) + except: + self.threadpool.quit() + print sys.exc_info()[0], "DUCC may not be started correctly." + sys.exit(1) + # give 'or' a small head start + time.sleep(2) + + if(self.is_reliable_backup()): + print '********** "backup" head node -> not starting agents' + else: + print "Starting", n_nodes, "agents" + for (nodefile, nodelist) in nodes.items(): + print '********** Starting agents from file', nodefile + try: + for node in nodelist: + self.threadpool.invoke(self.start_one_agent, node) + except: + self.threadpool.quit() + print sys.exc_info()[0], "DUCC may not be started correctly." + sys.exit(1) + + if ( len(components) != 0 ): + print 'Starting', or_parms + + for com in components: + if ( com in ('broker', 'db', 'or') ): + pass # already started + else: + try: + self.threadpool.invoke(self.start_component, ducc, com, or_parms) + #self.start_component(ducc, com, or_parms) + except: + self.threadpool.quit() + print sys.exc_info()[0], "DUCC may not be started correctly." 
+ sys.exit(1) + + self.threadpool.quit() + + if(not self.is_reliable_backup()): + if ( len(self.pids_agents) > 0 ): + self.pids_agents.write(self.pid_file_agents) + if ( len(self.pids_daemons) > 0 ): + self.pids_daemons.write(self.pid_file_daemons) + return + +if __name__ == "__main__": + # First check if ducc_post_install has been run + DUCC_HOME = find_ducc_home() + propsfile = DUCC_HOME + '/resources/site.ducc.properties' + if ( not os.path.exists(propsfile) ): + print "\n>> ERROR >> Missing site.ducc.properties -- please run ducc_post_install\n" + sys.exit(99) + starter = StartDucc() + starter.main(sys.argv[1:]) Added: uima/uima-ducc/trunk/src/main/admin/2.2_stop_ducc URL: http://svn.apache.org/viewvc/uima/uima-ducc/trunk/src/main/admin/2.2_stop_ducc?rev=1838082&view=auto ============================================================================== --- uima/uima-ducc/trunk/src/main/admin/2.2_stop_ducc (added) +++ uima/uima-ducc/trunk/src/main/admin/2.2_stop_ducc Wed Aug 15 12:05:15 2018 @@ -0,0 +1,386 @@ +#!/usr/bin/env python +# ----------------------------------------------------------------------- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# ----------------------------------------------------------------------- + + +import os +import sys +import time +import getopt +import glob + +from ducc_util import DuccUtil +from properties import * +from ducc import Ducc + +class StopDucc(DuccUtil): + + def stop_component(self, component, force): + + if ( (component == 'broker') and self.automanage_broker ): + print 'Stopping broker' + self.stop_broker() + return + if ( (component == 'db') and self.automanage_database ): + print 'Stopping database' + self.db_stop() + return + + # + # If it's an unqualified management component, we need to get it's qualified name + # + if ( component in self.default_components ): + if( component == 'agent' ): + if ( self.pids_agents.has_key(component) ): + component = self.pids_agents.get(component) + else: + print 'Skipping', component, 'not in pids file.' + return + else: + if ( self.pids_daemons.has_key(component) ): + component = self.pids_daemons.get(component) + else: + print 'Skipping', component, 'not in pids file.' + return + + # + # If the name is not qualified we've got a problem, everything in the pids file is qualified + # + if ( component.find('@') >= 0 ): + com, target_node = component.split('@') + else: + self.invalid("Must specify hostname when stopping", component) + + # + # If despite all that we can't find the pid, we need to run check_ducc + # + if( com == 'agent' ): + if ( not self.pids_agents.has_key(component) ): + print "Cannot find PID for component", component, ". Run check_ducc -p to refresh PIDS and then rerun stop_ducc." + return + else: + pid = self.pids_agents.get(component) + else: + if ( not self.pids_daemons.has_key(component) ): + print "Cannot find PID for component", component, ". Run check_ducc -p to refresh PIDS and then rerun stop_ducc." 
+ return + else: + pid = self.pids_daemons.get(component) + + + if ( force ): + print 'Stopping component', com, 'on node', target_node, 'with PID', pid, 'forcibly (kill -9)' + self.nohup(['ssh', target_node, 'kill', '-KILL', pid], False) + + pass + else: + print 'Stopping component', com, 'on node', target_node, 'with PID', pid + self.nohup(['ssh', target_node, 'kill', '-INT', pid], False) + + # clear the short name if it exists, and the long name + if( com == 'agent' ): + self.pids_agents.delete(com) + self.pids_agents.delete(component) + else: + self.pids_daemons.delete(com) + self.pids_daemons.delete(component) + + def quiesce_agents(self, components, nodes): + allnodes = [] + for ( nf, nl ) in nodes.items(): + allnodes = allnodes + nl + + for c in components: + if ( c.find('@') >= 0 ): + com, target_node = c.split('@') + allnodes.append(target_node) + else: + self.invalid("Must specify hostname when stopping", component) + + qparm = ','.join(allnodes) + print 'Quiescing', qparm + DUCC_JVM_OPTS = ' -Dducc.deploy.configuration=' + self.DUCC_HOME + "/resources/ducc.properties " + DUCC_JVM_OPTS = DUCC_JVM_OPTS + ' -DDUCC_HOME=' + self.DUCC_HOME + DUCC_JVM_OPTS = DUCC_JVM_OPTS + ' -Dducc.head=' + self.ducc_properties.get('ducc.head') + self.spawn(self.java(), DUCC_JVM_OPTS, 'org.apache.uima.ducc.common.main.DuccAdmin', '--quiesceAgents', qparm) + + # NOTE: quiesce does not actually cause agents to terminate so we don't update the PIDs file + return + + def stop_agent(self, node, force): + self.stop_component('agent@' + node.strip(), force) + + def usage(self, msg): + if ( msg != None ): + print msg + + print 'stop_ducc [options]' + print ' If no options are given, this help screen is shown.' + print '' + print ' For reliable DUCC agents will not be stopped from backup head node. ' + print '' + print ' Broker will not be stopped when ducc.broker.automanage = false. ' + print ' Database will not be stopped when ducc.database.automanage = false. 
' + print '' + print 'Options:' + print ' -a --all' + print ' Stop all the DUCC processes, including agents and management processes.' + print '' + print ' -n --nodelist nodefile' + print ' Stop agents on the nodes in the nodefile. Multiple nodefiles may be specified:' + print '' + print ' stop_ducc -n foo.nodes -n bar.nodes -n baz.nodes' + print '' + print ' -c --component component' + print ' Stop a specific component. The component may be qualified with the node name' + print ' using the @ symbol: component@node.' + print '' + print ' stop_ducc -c rm@foonode' + print ' stop_ducc -c agent@barnode -c or' + print '' + print ' Components include:' + print ' agent - node agent' + print ' broker - AMQ broker' + print ' db - database' + print ' or - orchestrator' + print ' pm - process manager' + print ' rm - resource manager' + print ' sm - services manager' + print ' ws - web server' + print ' head = { or, pm, rm, sm, ws, db, broker }' + print '' + print ' -w --wait' + print ' Time to wait for everything to come down, in seconds. Default is 60.' + print '' + print ' -k --kill' + print ' Stop the component forcibly and immediately using kill -9. Use this only if a' + print ' normal stop does not work (e.g. the process may be hung).' 
+ print '' + print ' --nothreading' + print ' Disable multithreaded operation if it would otherwise be used' + print '' + + sys.exit(1) + + def invalid(self, *msg): + if ( msg[0] != None ): + print ' '.join(msg) + + print "For usage run" + print " stop_ducc -h" + print 'or' + print ' stop_ducc --help' + sys.exit(1) + + + def main(self, argv): + + self.verify_head() + + self.check_properties() + + if ( len(argv) == 0 ): + self.usage(None) + + components = [] + nodefiles = [] + do_agents = False + do_components = False + force = False + quiesce = False + all = False + wait_time = 60 + + try: + opts, args = getopt.getopt(argv, 'ac:n:kn:w:qh?v', ['all', 'component=', 'help', 'nodelist=', 'kill', 'quiesce', 'nothreading', 'wait']) + except: + self.invalid('Invalid arguments ' + ' '.join(argv)) + + if (len(args) > 0): + self.invalid('Invalid extra args: ', ' '.join(args)) + + for ( o, a ) in opts: + if o in ('-c', '--component' ): + if (a.strip() == 'head'): + components.append('or') + components.append('pm') + components.append('rm') + components.append('sm') + components.append('ws') + components.append('db') + components.append('broker') + else: + components.append(a) + do_components = True + elif o in ( '-a', '--all' ): + all = True + components = self.default_components + elif o in ( '-n', '--nodelist' ): + nodefiles.append(a) + do_agents = True + elif o in ( '-k', '--kill' ): + force = True + elif o in ( '-q', '--quiesce' ): + quiesce = True + elif o in ( '-w', '--wait' ): + wait_time = int(a) + elif o in ( '--nothreading' ): + self.disable_threading() + elif ( o == '-v' ) : + print self.version() + sys.exit(0) + elif o in ( '-h', '--help' ): + self.usage(None) + elif ( o == '-?'): + self.usage(None) + else: + self.invalid('bad arg: ' + o) + + if ( quiesce ): + if ( all ): + self.invalid("May not quiesce 'all'."); + if ( force ): + self.invalid("May not both quiesce and force."); + for c in components: + if ( not c.startswith('agent') ): + self.invalid("Only agents 
may be quiesced.") + + + + # avoid confusion by insuring that if 'all', then nothing else is specified + if ( all and ( do_components ) ): + self.invalid("The --all option is mutually exclusive with --component") + + # 'all' means everything. we use broadcast. should use check_ducc to make sure + # it actually worked, and find the stragglers. + if ( all ): + if ( not force ) : + self.clean_shutdown() + + # Agents may wait up to 60 secs for processes to quiesce + print "Waiting " + str(wait_time) + " seconds to broadcast agent shutdown." + time.sleep(wait_time) + + if ( self.automanage_broker ): + print "Stopping broker" + self.stop_broker() + + if ( self.automanage_database ): + print "Stopping database" + self.db_stop() + + if ( os.path.exists(self.pid_file_agents) ): + os.remove(self.pid_file_agents) + if ( os.path.exists(self.pid_file_daemons) ): + os.remove(self.pid_file_daemons) + return + else: + if ( len(nodefiles) == 0 ): + nodefiles = self.default_nodefiles + + + self.pids_agents = Properties() + self.pids_daemons = Properties() + sc = set(components) + sb = set(['broker', 'db']) + read_pids = True + if ( sc.issubset(sb) ): + read_pids = False + + # The broker and db do not set the pid file + if ( read_pids ): + try: + if(not self.is_reliable_backup()): + self.pids_agents.load(self.pid_file_agents) + self.pids_daemons.load(self.pid_file_daemons) + except PropertiesException, (inst): + print inst.msg + print '' + print 'Run check_ducc -p to refresh the PIDs file, or check_ducc -k to search for and', + print 'kill all DUCC processes.' 
+ print '' + sys.exit(1) + + # + # if not 'all', we use nodefiles and component names + # + + # make sure all the nodefiles exist and are readable + ok = True + nodes = {} + n_nodes = 0 + for n in nodefiles: + n_nodes, nodes = self.read_nodefile(n, nodes) + + for ( nf, nl ) in nodes.items(): + if ( nl == None ): # die early if the parameters are wrong + print "Can't read nodefile", nf + ok = False + + if ( not ok ): + sys.exit(1) + + if ( quiesce ): + if(self.is_reliable_backup()): + print '********** "backup" head node -> not quiescing agents' + else: + self.quiesce_agents(components, nodes) + else: + if(self.is_reliable_backup()): + print '********** "backup" head node -> not stopping agents' + else: + for (nf, nl) in nodes.items(): + for n in nl: + self.stop_agent(n, force) + host = self.localhost.split('.')[0] + for c in components: + c = c.strip() + if(c in ('pm','rm','sm','ws')): + c = c+'@'+host + self.stop_component(c, force) + time.sleep(2) + for c in components: + c = c.strip() + if(c in ('or')): + c = c+'@'+host + self.stop_component(c, force) + time.sleep(2) + for c in components: + c = c.strip() + if(c in ('db','broker')): + self.stop_component(c, force) + + if ( read_pids ): + if(not self.is_reliable_backup()): + if ( len(self.pids_agents) > 0 ): + self.pids_agents.write(self.pid_file_agents) + else: + os.remove(self.pid_file_agents) + if ( len(self.pids_daemons) > 0 ): + self.pids_daemons.write(self.pid_file_daemons) + else: + os.remove(self.pid_file_daemons) + + return + +if __name__ == "__main__": + stopper = StopDucc() + stopper.main(sys.argv[1:]) + + Added: uima/uima-ducc/trunk/src/main/admin/autostart.py URL: http://svn.apache.org/viewvc/uima/uima-ducc/trunk/src/main/admin/autostart.py?rev=1838082&view=auto ============================================================================== --- uima/uima-ducc/trunk/src/main/admin/autostart.py (added) +++ uima/uima-ducc/trunk/src/main/admin/autostart.py Wed Aug 15 12:05:15 2018 @@ -0,0 +1,296 @@ 
+#!/usr/bin/env python +# ----------------------------------------------------------------------- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# ----------------------------------------------------------------------- + +import sys + +version_min = [2, 7] +version_info = sys.version_info +version_error = False +if(version_info[0] < version_min[0]): + version_error = True +elif(version_info[0] == version_min[0]): + if(version_info[1] < version_min[1]): + version_error = True +if(version_error): + print('Python minimum requirement is version '+str(version_min[0])+'.'+str(version_min[1])) + sys.exit(1) + +import argparse +import datetime +import logging +import os +import shlex +import socket +import subprocess +import time +import traceback + +from ducc_util import DuccUtil + +# produce a time stamp +def get_timestamp(): + tod = time.time() + timestamp = datetime.datetime.fromtimestamp(tod).strftime('%Y-%m-%d %H:%M:%S') + return timestamp + +# embedded class to log to file +# default level is info level, +# controlled by environment variable LOGLEVEL = { info, debug, trace } +class Logger(): + + flag_info = True + flag_debug = False + flag_trace = False + + flag_error = True + flag_warn = True + + def 
__init__(self,filename,level='info'): + self.filename = filename + if(level == 'debug'): + self.flag_debug = True + elif(level == 'trace'): + self.flag_debug = True + self.flag_trace = True + + # write to file + def output(self,type,mn,text): + message = get_timestamp()+' '+type+' '+mn+' '+text + with open(self.filename, 'a') as logfile: + logfile.write(message+'\n') + + # record info message + def info(self,mn,text): + if(self.flag_info): + self.output('I',mn,text) + + # record debug message + def debug(self,mn,text): + if(self.flag_debug): + self.output('D',mn,text) + + # record trace message + def trace(self,mn,text): + if(self.flag_trace): + self.output('T',mn,text) + + # record error message + def error(self,mn,text): + if(self.flag_error): + self.output('E',mn,text) + + # record warn message + def warn(self,mn,text): + if(self.flag_warn): + self.output('W',mn,text) + +# class to automagically start DUCC daemons +class AutoStart(DuccUtil): + + components = [ 'agent', 'broker', 'orchestrator', 'pm', 'rm', 'sm', 'ws' ] + map = { 'ag':'agent', + 'br':'broker', + 'or':'or', + 'pm':'pm', + 'rm':'rm', + 'sm':'sm', + 'ws':'ws', + } + + # return file name + def _fn(self): + fpath = __file__.split('/') + flen = len(fpath) + return fpath[flen-1] + + # return class name + def _cn(self): + return self.__class__.__name__ + + # return method name + def _mn(self): + return traceback.extract_stack(None,2)[0][2] + + description = 'Start daemon(s) on the present node when listed in the autostart database table but not already running.' 
+ + def get_args(self): + parser = argparse.ArgumentParser(description=self.description) + self.args = parser.parse_args() + + # setup logging to file for autostart script + def setup_logging(self,NAME): + LOGDIR = self.DUCC_HOME+'/logs/'+NAME + self.makedirs(LOGDIR) + self.LOCAL_HOST = socket.getfqdn() + FN = self.LOCAL_HOST.split('.')[0]+'.'+NAME+'.log' + LOGFILE = LOGDIR+'/'+FN + LOGLEVEL = os.environ.get('LOGLEVEL','info') + self.logger = Logger(LOGFILE,LOGLEVEL) + + + # check if host names with domain match + def is_host_match_with_domain(self,h1,h2): + retVal = False + if(h1 == h2): + retVal = True + text = str(h1)+' '+str(h2)+' '+str(retVal) + self.logger.debug(self._mn(),text) + return retVal + + # check if host names without domain match + def is_host_match_without_domain(self,h1,h2): + retVal = False + h10 = h1.split('.')[0] + h20 = h2.split('.')[0] + if(h10 == h20): + retVal = True + text = str(h10)+' '+str(h20)+' '+str(retVal) + self.logger.debug(self._mn(),text) + return retVal + + #check if host names match with/without domain + def is_host_match(self,h1,h2): + retVal = False + if(h1 != None): + if(h2 != None): + if(self.is_host_match_with_domain(h1,h2)): + retVal = True + elif(self.is_host_match_without_domain(h1,h2)): + retVal = True + text = str(h1)+' '+str(h2)+' '+str(retVal) + self.logger.debug(self._mn(),text) + return retVal + + def parse_line(self,line): + retVal = '', '', '' + try: + if(line != None): + text = line + self.logger.debug(self._mn(),text) + tokens = line.split() + if(len(tokens) == 1): + host = line.split('.')[0] + remainder = line.split('.')[1] + name = remainder.split('=')[0] + state = remainder.split('=')[1] + retVal = host, name, state + except: + pass + return retVal + + # get daemons started (from DB) + def get_daemons_started(self): + daemons = [] + jclass = 'org.apache.uima.ducc.database.lifetime.DbDaemonLifetimeUI' + option = '--query' + cmd = [self.jvm, '-DDUCC_HOME='+self.DUCC_HOME, jclass, option] + p = 
subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + out, err = p.communicate() + lines = out.split('\n') + for line in lines: + host, daemon, state = self.parse_line(line) + if(self.is_host_match(self.LOCAL_HOST, host)): + if(state == 'Start'): + daemons.append(daemon) + text = 'add'+' '+host+' '+daemon + self.logger.debug(self._mn(),text) + else: + text = 'skip'+' '+host+' '+daemon + self.logger.debug(self._mn(),text) + else: + text = 'skip'+' '+host+' '+daemon + self.logger.debug(self._mn(),text) + text = 'daemons'+' '+str(daemons) + self.logger.debug(self._mn(),text) + return daemons + + def normalize_component(self,component): + daemon = component[:2] + return daemon + + # get daemons running (from system) + def get_daemons_running(self): + daemons = [] + result = self.find_ducc_process(self.LOCAL_HOST) + find_status = result[0] + text = 'find_status:'+str(find_status) + self.logger.debug(self._mn(),text) + tuples = result[1] + text = 'tuples:'+str(tuples) + self.logger.debug(self._mn(),text) + for tuple in tuples: + component = tuple[0] + pid = tuple[1] + user = tuple[2] + if(user == self.ducc_uid): + if(component in self.components): + text = 'keep:'+str(tuple) + self.logger.debug(self._mn(),text) + daemon = self.normalize_component(component) + daemons.append(daemon) + else: + text = 'skip:'+str(tuple) + self.logger.debug(self._mn(),text) + else: + text = 'skip:'+str(tuple)+' '+str(self.ducc_uid) + self.logger.debug(self._mn(),text) + text = 'daemons:'+str(daemons) + self.logger.debug(self._mn(),text) + return daemons + + def start(self,daemon): + component = self.map[daemon] + text = str(component) + self.logger.warn(self._mn(),text) + if(daemon == 'ag'): + python_script = os.path.join(self.DUCC_HOME,'admin','ducc.py') + cmd = [ python_script, '-c', component, '-b', '-d', str(time.time()), '--nodup' ] + else: + python_script = os.path.join(self.DUCC_HOME,'admin','start_ducc',) + cmd = [ python_script, '-c', component ] + text = str(cmd) 
+ self.logger.info(self._mn(),text) + p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + out, err = p.communicate() + text = str(out) + self.logger.info(self._mn(),text) + + # autostart: start head or agent daemons, as required + def main(self, argv): + NAME = 'autostart' + self.setup_logging(NAME) + self.get_args() + try: + daemons_started = self.get_daemons_started() + daemons_running = self.get_daemons_running() + for daemon in daemons_started: + if(not daemon in daemons_running): + self.start(daemon) + + except Exception,e: + lines = traceback.format_exc().splitlines() + for line in lines: + text = line + self.logger.debug(self._mn(),text) + +if __name__ == '__main__': + instance = AutoStart() + instance.main(sys.argv[1:]) + \ No newline at end of file Propchange: uima/uima-ducc/trunk/src/main/admin/autostart.py ------------------------------------------------------------------------------ svn:eol-style = native Propchange: uima/uima-ducc/trunk/src/main/admin/autostart.py ------------------------------------------------------------------------------ svn:executable = * Modified: uima/uima-ducc/trunk/src/main/admin/check_ducc URL: http://svn.apache.org/viewvc/uima/uima-ducc/trunk/src/main/admin/check_ducc?rev=1838082&r1=1838081&r2=1838082&view=diff ============================================================================== --- uima/uima-ducc/trunk/src/main/admin/check_ducc (original) +++ uima/uima-ducc/trunk/src/main/admin/check_ducc Wed Aug 15 12:05:15 2018 @@ -128,13 +128,13 @@ class CheckDucc(DuccUtil): else: messages.append((spacer, 'Found', process_id)) - full_name = component + '@' + node - if ( component == 'agent' ): - self.pids_agents.put(full_name, pid) - else: - if ( component in self.default_components ): - self.pids_daemons.put(full_name, pid) - self.pids_daemons.put(component, full_name) + #full_name = component + '@' + node + #if ( component == 'agent' ): + # self.pids_agents.put(full_name, pid) + #else: + # if ( 
component in self.default_components ): + # self.pids_daemons.put(full_name, pid) + # self.pids_daemons.put(component, full_name) else: messages.append((spacer, 'no processes found.')) @@ -179,11 +179,11 @@ class CheckDucc(DuccUtil): print "" print " check_ducc -n ../resources/ducc.nodes" print "" - print " For reliable DUCC agents will not be killed from backup head node. " - print "" - print " Broker will not be killed when ducc.broker.automanage = false. " - print " Database will not be killed when ducc.database.automanage = false. " - print "" + #print " For reliable DUCC agents will not be killed from backup head node. " + #print "" + #print " Broker will not be killed when ducc.broker.automanage = false. " + #print " Database will not be killed when ducc.database.automanage = false. " + #print "" print "Options:" print " -n --nodelist nodefile" print " Check for agents on the nodes in nodefile. This option may be specified multiple time" @@ -193,27 +193,27 @@ class CheckDucc(DuccUtil): print " Do basic sanity checking on the configuration only. Note that configuration checking is always" print " performed with most options. The [-c, --configuration] option does ONLY configuration checking." print "" - print " -k --kill" - print " Force-kill any DUCC process you find on a node (if normal stop_ducc isn't working. This" - print " uses kill -KILL (-9) for all daemons, except database which uses -QUIT (3)," - print " and only kills processes owned by the invoking user." - print "" - print " --db-9" - print " Use signal -KILL (-9) to kill database, rather than the default -QUIT (-3)" - print "" - print " -i --int" - print " Force-kill any DUCC process you find on a node (if normal stop_ducc isn't working. This" - print " uses kill -INT (-2) and only kills processes owned by the invoking user." - print "" - print " -q --quit" - print " Force-kill any DUCC process you find on a node (if normal stop_ducc isn't working. 
This" - print " uses kill -QUIT (-3) and only kills processes owned by the invoking user." - print "" - print " -p --pids" - print " Rewrite the PID file. The PID file is always rewritten if any changes to processes are made. Sometimes" - print " the PID file needs rebuilding. This option causes the file to be rebuilt regardless of" - print " changes." - print "" + #print " -k --kill" + #print " Force-kill any DUCC process you find on a node (if normal stop_ducc isn't working. This" + #print " uses kill -KILL (-9) for all daemons, except database which uses -QUIT (3)," + #print " and only kills processes owned by the invoking user." + #print "" + #print " --db-9" + #print " Use signal -KILL (-9) to kill database, rather than the default -QUIT (-3)" + #print "" + #print " -i --int" + #print " Force-kill any DUCC process you find on a node (if normal stop_ducc isn't working. This" + #print " uses kill -INT (-2) and only kills processes owned by the invoking user." + #print "" + #print " -q --quit" + #print " Force-kill any DUCC process you find on a node (if normal stop_ducc isn't working. This" + #print " uses kill -QUIT (-3) and only kills processes owned by the invoking user." + #print "" + #print " -p --pids" + #print " Rewrite the PID file. The PID file is always rewritten if any changes to processes are made. Sometimes" + #print " the PID file needs rebuilding. This option causes the file to be rebuilt regardless of" + #print " changes." + #print "" print " -x localdate" print " Validate the local installation, called via ssh usually. The date is the date on the calling machine." 
print "" @@ -229,7 +229,8 @@ class CheckDucc(DuccUtil): def main(self, argv): try: - opts, args = getopt.getopt(argv, 'cikn:opqx:h?v', ['configuration', 'nodelist=', 'int', 'quit', 'kill', 'db-9', 'pids', 'verbose', 'nothreading', ]) + opts, args = getopt.getopt(argv, 'cn:x:h?v', ['configuration', 'nodelist=', 'verbose', 'nothreading', ]) + #opts, args = getopt.getopt(argv, 'cikn:opqx:h?v', ['configuration', 'nodelist=', 'int', 'quit', 'kill', 'db-9', 'pids', 'verbose', 'nothreading', ]) except: self.usage("Invalid arguments " + ' '.join(argv)) @@ -249,27 +250,27 @@ class CheckDucc(DuccUtil): config_only = True elif o in ('-n', '--nodelist'): nodefiles.append(a) - elif o in ('-i', '--int'): - if ( self.kill_signal != None ): - print 'Conflicting kill signals: -INT and', self.kill_signal - return - self.kill_signal = '-INT' - elif o in ('-q', '--quit'): - if ( self.kill_signal != None ): - print 'Conflicting kill signals: -QUIT and', self.kill_signal - return - self.kill_signal = '-QUIT' - elif o in ('-k', '--kill'): - if ( self.kill_signal != None ): - print 'Conflicting kill signals: -KILL and', self.kill_signal - return - self.kill_signal = '-KILL' - elif o in ('--db-9'): - self.kill_db9 = True + #elif o in ('-i', '--int'): + # if ( self.kill_signal != None ): + # print 'Conflicting kill signals: -INT and', self.kill_signal + # return + # self.kill_signal = '-INT' + #elif o in ('-q', '--quit'): + # if ( self.kill_signal != None ): + # print 'Conflicting kill signals: -QUIT and', self.kill_signal + # return + # self.kill_signal = '-QUIT' + #elif o in ('-k', '--kill'): + # if ( self.kill_signal != None ): + # print 'Conflicting kill signals: -KILL and', self.kill_signal + # return + # self.kill_signal = '-KILL' + #elif o in ('--db-9'): + # self.kill_db9 = True elif o in ( '--nothreading' ): self.disable_threading() - elif o in ('-p', '--pids'): - redo_pids = True + #elif o in ('-p', '--pids'): + # redo_pids = True elif o in ('-x'): # intended to be called 
recursively from check_ducc, NOT from the command line do_validate = True @@ -320,11 +321,11 @@ class CheckDucc(DuccUtil): self.verify_database() # init the PID file - if(not self.is_reliable_backup()): - self.pids_agents = Properties() - self.pids_agents.load_if_exists(self.pid_file_agents) - self.pids_daemons = Properties() - self.pids_daemons.load_if_exists(self.pid_file_daemons) + #if(not self.is_reliable_backup()): + # self.pids_agents = Properties() + # self.pids_agents.load_if_exists(self.pid_file_agents) + #self.pids_daemons = Properties() + #self.pids_daemons.load_if_exists(self.pid_file_daemons) # read the nodelists if ( len(nodefiles) == 0 ): @@ -397,30 +398,30 @@ class CheckDucc(DuccUtil): self.threadpool.quit() - if ( self.kill_signal != None ): - if(self.automanage_broker): - print 'Stopping broker' - self.stop_broker() - else: - print 'Not stopping broker - not automanaged' - if(self.automanage_database): - print 'Stopping database' - self.db_stop() - else: - print 'Not stopping database - not automanaged' + #if ( self.kill_signal != None ): + # if(self.automanage_broker): + # print 'Stopping broker' + # self.stop_broker() + # else: + # print 'Not stopping broker - not automanaged' + # if(self.automanage_database): + # print 'Stopping database' + # self.db_stop() + # else: + # print 'Not stopping database - not automanaged' - if(not self.is_reliable_backup()): - if ( len(self.pids_agents) == 0): - if ( os.path.exists(self.pid_file_agents) ): - os.remove(self.pid_file_agents) - elif (process_changes or redo_pids): - self.pids_agents.write(self.pid_file_agents) + #if(not self.is_reliable_backup()): + # if ( len(self.pids_agents) == 0): + # if ( os.path.exists(self.pid_file_agents) ): + # os.remove(self.pid_file_agents) + # elif (process_changes or redo_pids): + # self.pids_agents.write(self.pid_file_agents) - if ( len(self.pids_daemons) == 0): - if ( os.path.exists(self.pid_file_daemons) ): - os.remove(self.pid_file_daemons) - elif (process_changes or 
redo_pids): - self.pids_daemons.write(self.pid_file_daemons) + #if ( len(self.pids_daemons) == 0): + # if ( os.path.exists(self.pid_file_daemons) ): + # os.remove(self.pid_file_daemons) + #elif (process_changes or redo_pids): + # self.pids_daemons.write(self.pid_file_daemons) if __name__ == "__main__": checker = CheckDucc() Added: uima/uima-ducc/trunk/src/main/admin/cron/db_autostart.crontab.example URL: http://svn.apache.org/viewvc/uima/uima-ducc/trunk/src/main/admin/cron/db_autostart.crontab.example?rev=1838082&view=auto ============================================================================== --- uima/uima-ducc/trunk/src/main/admin/cron/db_autostart.crontab.example (added) +++ uima/uima-ducc/trunk/src/main/admin/cron/db_autostart.crontab.example Wed Aug 15 12:05:15 2018 @@ -0,0 +1,25 @@ + +# Example crontab to autostart DUCC daemons + +# ----------------------------------------------------------------------- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# ----------------------------------------------------------------------- + +# hourly, at 10 minutes past the hour (use */10 in the minute field to run every 10 minutes): re-start any DUCC daemons that are down on this node + +10 * * * * /share/Python-2.7.8/bin/python2.7 /home/ducc/ducc_runtime/admin/autostart.py 2>&1 Added: uima/uima-ducc/trunk/src/main/admin/db_autostart_delete.py URL: http://svn.apache.org/viewvc/uima/uima-ducc/trunk/src/main/admin/db_autostart_delete.py?rev=1838082&view=auto ============================================================================== --- uima/uima-ducc/trunk/src/main/admin/db_autostart_delete.py (added) +++ uima/uima-ducc/trunk/src/main/admin/db_autostart_delete.py Wed Aug 15 12:05:15 2018 @@ -0,0 +1,93 @@ +#!/usr/bin/env python +# ----------------------------------------------------------------------- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# ----------------------------------------------------------------------- + +import sys + +version_min = [2, 7] +version_info = sys.version_info +version_error = False +if(version_info[0] < version_min[0]): + version_error = True +elif(version_info[0] == version_min[0]): + if(version_info[1] < version_min[1]): + version_error = True +if(version_error): + print('Python minimum requirement is version '+str(version_min[0])+'.'+str(version_min[1])) + sys.exit(1) + +import argparse +import os +import subprocess + +from ducc_util import DuccUtil + +# command to delete from the autostart database table the specified host & daemon + +class AutostartDelete(DuccUtil): + + valid_names = [ 'ag', 'br', 'or', 'pm', 'rm', 'sm', 'ws' ] + jclass = 'org.apache.uima.ducc.database.lifetime.DbDaemonLifetimeUI' + + description = 'Delete an entry from the autostart database table.' + + def get_args(self): + parser = argparse.ArgumentParser(description=self.description) + parser.add_argument('--host', action='store', required=True, help='the DUCC daemon host') + parser.add_argument('--name', action='store', required=True, choices=self.valid_names, help='the DUCC daemon name') + self.args = parser.parse_args() + + def find(self): + retVal = False + option = '--query' + cmd = [self.jvm, '-DDUCC_HOME='+self.DUCC_HOME, self.jclass, option] + p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + out, err = p.communicate() + lines = out.split('\n') + for line in lines: + tokens = line.split('.') + if(len(tokens) == 2): + host = tokens[0] + name = tokens[1].split('=')[0] + if(host == self.args.host): + if(name == self.args.name): + retVal = True + return retVal + + def delete(self): + option = '--delete' + cmd = [self.jvm, '-DDUCC_HOME='+self.DUCC_HOME, self.jclass, option, self.args.host, self.args.name] + p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + out, err = p.communicate() + + def main(self, argv): + self.get_args() + if(self.find()): + 
self.delete() + if(self.find()): + print 'not deleted' + else: + print 'deleted' + else: + print 'not found' + +if __name__ == "__main__": + + instance = AutostartDelete() + instance.main(sys.argv[1:]) Propchange: uima/uima-ducc/trunk/src/main/admin/db_autostart_delete.py ------------------------------------------------------------------------------ svn:eol-style = native Propchange: uima/uima-ducc/trunk/src/main/admin/db_autostart_delete.py ------------------------------------------------------------------------------ svn:executable = * Added: uima/uima-ducc/trunk/src/main/admin/db_autostart_query.py URL: http://svn.apache.org/viewvc/uima/uima-ducc/trunk/src/main/admin/db_autostart_query.py?rev=1838082&view=auto ============================================================================== --- uima/uima-ducc/trunk/src/main/admin/db_autostart_query.py (added) +++ uima/uima-ducc/trunk/src/main/admin/db_autostart_query.py Wed Aug 15 12:05:15 2018 @@ -0,0 +1,92 @@ +#!/usr/bin/env python +# ----------------------------------------------------------------------- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# ----------------------------------------------------------------------- + +import sys + +version_min = [2, 7] +version_info = sys.version_info +version_error = False +if(version_info[0] < version_min[0]): + version_error = True +elif(version_info[0] == version_min[0]): + if(version_info[1] < version_min[1]): + version_error = True +if(version_error): + print('Python minimum requirement is version '+str(version_min[0])+'.'+str(version_min[1])) + sys.exit(1) + +import argparse +import os +import subprocess + +from ducc_util import DuccUtil + +# command to query the database for all started daemons (i.e. what the autostart.py command sees) + +class AutostartQuery(DuccUtil): + + description = 'List the entries from the autostart database table.' + + def get_args(self): + parser = argparse.ArgumentParser(description=self.description) + self.args = parser.parse_args() + + def main(self, argv): + self.get_args() + jclass = 'org.apache.uima.ducc.database.lifetime.DbDaemonLifetimeUI' + option = '--query' + cmd = [self.jvm, '-DDUCC_HOME='+self.DUCC_HOME, jclass, option] + p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + out, err = p.communicate() + #print out + lines = out.split('\n') + counter = 0 + for line in lines: + if('.db' in line): + print line + counter = counter + 1 + elif('.br' in line): + print line + counter = counter + 1 + elif('.or' in line): + print line + counter = counter + 1 + elif('.pm' in line): + print line + counter = counter + 1 + elif('.rm' in line): + print line + counter = counter + 1 + elif('.sm' in line): + print line + counter = counter + 1 + elif('.ws' in line): + print line + counter = counter + 1 + elif('.ag' in line): + print line + counter = counter + 1 + if(counter == 0): + print 'no daemon(s) registered as started in database' + +if __name__ == "__main__": + + instance = AutostartQuery() + instance.main(sys.argv[1:]) Propchange: uima/uima-ducc/trunk/src/main/admin/db_autostart_query.py 
------------------------------------------------------------------------------ svn:eol-style = native Propchange: uima/uima-ducc/trunk/src/main/admin/db_autostart_query.py ------------------------------------------------------------------------------ svn:executable = *