Author: burn
Date: Thu Nov 14 13:41:11 2019
New Revision: 1869795
URL: http://svn.apache.org/viewvc?rev=1869795&view=rev
Log:
UIMA-6144 For a multi-head installation, make ducc_update check only that the
local daemons are down
Modified:
uima/uima-ducc/trunk/src/main/admin/check_ducc
uima/uima-ducc/trunk/src/main/admin/ducc_update
Modified: uima/uima-ducc/trunk/src/main/admin/check_ducc
URL:
http://svn.apache.org/viewvc/uima/uima-ducc/trunk/src/main/admin/check_ducc?rev=1869795&r1=1869794&r2=1869795&view=diff
==============================================================================
--- uima/uima-ducc/trunk/src/main/admin/check_ducc (original)
+++ uima/uima-ducc/trunk/src/main/admin/check_ducc Thu Nov 14 13:41:11 2019
@@ -24,6 +24,7 @@ import sys
from time import time
import getopt
import signal
+import socket
from ducc_util import DuccUtil
from properties import Properties
@@ -100,43 +101,15 @@ class CheckDucc(DuccUtil):
signal = self.kill_signal
- # Don't kill agents if a backup node (kill has been disabled
as stop_ducc should be used)
- if (signal != None and not self.is_active() and component ==
'agent' ):
- print 'Ignoring agent as not the master head node'
- continue
-
if ( component == 'orchestrator' ):
component = 'or'
- if ( component == 'database' ):
- if ( signal != None ):
- if ( self.kill_db9 == False ):
- signal = '-QUIT'
-
- process_id = found_user + ' ' + component + '@' + node + ' PID
' + pid
- if ( signal != None ) :
- if ( self.user != found_user ):
- messages.append((spacer, "Not killing someone else's
process.", process_id))
- elif ( component == 'unknown-java' ):
- messages.append((spacer, 'Not killing non-ducc
process', process_id))
- else:
- messages.append((spacer, 'Killing (' + signal + ')',
process_id))
- self.kill_process(node, proc, signal)
- if ( component == 'agent' ):
- self.pids_agents.delete(pid)
- else:
- self.pids_daemons.delete(pid)
- process_changes = True
-
+ if ( component == 'database' and not self.automanage_database
):
+ automan = " (NOT auto-managed)"
else:
- messages.append((spacer, 'Found', process_id))
- #full_name = component + '@' + node
- #if ( component == 'agent' ):
- # self.pids_agents.put(full_name, pid)
- #else:
- # if ( component in self.default_components ):
- # self.pids_daemons.put(full_name, pid)
- # self.pids_daemons.put(component, full_name)
+ automan = ""
+ process_id = found_user + ' ' + component + '@' + node + ' PID
' + pid + automan
+ messages.append((spacer, 'Found', process_id))
else:
messages.append((spacer, 'no processes found.'))
@@ -181,41 +154,18 @@ class CheckDucc(DuccUtil):
print ""
print " check_ducc -n ../resources/ducc.nodes"
print ""
- #print " For reliable DUCC agents will not be killed from backup
head node. "
- #print ""
- #print " Broker will not be killed when ducc.broker.automanage =
false. "
- #print " Database will not be killed when ducc.database.automanage =
false. "
- #print ""
print "Options:"
print " -n --nodelist nodefile"
print " Check for agents on the nodes in nodefile. This option
may be specified multiple time"
- print " for multiple nodefiles. The 'local' node is always
checked"
+ print " for multiple nodefiles. The head node(s) are always
checked"
+ print ""
+ print " --localonly"
+ print " Check only this head node (used when updating a single
head node on a local filesystem)"
print ""
print " -c --configuration"
print " Do basic sanity checking on the configuration only.
Note that configuration checking is always"
print " performed with most options. The [-c, --configuration]
option does ONLY configuration checking."
print ""
- #print " -k --kill"
- #print " Force-kill any DUCC process you find on a node (if
normal stop_ducc isn't working. This"
- #print " uses kill -KILL (-9) for all daemons, except database
which uses -QUIT (3),"
- #print " and only kills processes owned by the invoking user."
- #print ""
- #print " --db-9"
- #print " Use signal -KILL (-9) to kill database, rather than the
default -QUIT (-3)"
- #print ""
- #print " -i --int"
- #print " Force-kill any DUCC process you find on a node (if
normal stop_ducc isn't working. This"
- #print " uses kill -INT (-2) and only kills processes owned by
the invoking user."
- #print ""
- #print " -q --quit"
- #print " Force-kill any DUCC process you find on a node (if
normal stop_ducc isn't working. This"
- #print " uses kill -QUIT (-3) and only kills processes owned by
the invoking user."
- #print ""
- #print " -p --pids"
- #print " Rewrite the PID file. The PID file is always rewritten
if any changes to processes are made. Sometimes"
- #print " the PID file needs rebuilding. This option causes the
file to be rebuilt regardless of"
- #print " changes."
- #print ""
print " -x localdate"
print " Validate the local installation, called via ssh usually.
The date is the date on the calling machine."
print ""
@@ -231,48 +181,30 @@ class CheckDucc(DuccUtil):
def main(self, argv):
try:
- opts, args = getopt.getopt(argv, 'cn:x:h?v', ['configuration',
'nodelist=', 'verbose', 'nothreading', ])
- #opts, args = getopt.getopt(argv, 'cikn:opqx:h?v',
['configuration', 'nodelist=', 'int', 'quit', 'kill', 'db-9', 'pids',
'verbose', 'nothreading', ])
+ opts, args = getopt.getopt(argv, 'cn:x:h?v', ['configuration',
'nodelist=', 'verbose', 'nothreading', 'localonly' ])
+
except:
self.usage("Invalid arguments " + ' '.join(argv))
nodefiles = []
self.user = os.environ['LOGNAME']
- self.kill_signal = None
- self.kill_db9 = False
- redo_pids = False
+ self.kill_signal = None # Kill disabled ... now handled by
stop_ducc
process_changes = False
do_validate = False
checkdate = 0
config_only = False
verbose = False
+ local_only = False
for ( o, a ) in opts:
if o in ('-c', '--configuration'):
config_only = True
elif o in ('-n', '--nodelist'):
nodefiles.append(a)
- #elif o in ('-i', '--int'):
- # if ( self.kill_signal != None ):
- # print 'Conflicting kill signals: -INT and',
self.kill_signal
- # return
- # self.kill_signal = '-INT'
- #elif o in ('-q', '--quit'):
- # if ( self.kill_signal != None ):
- # print 'Conflicting kill signals: -QUIT and',
self.kill_signal
- # return
- # self.kill_signal = '-QUIT'
- #elif o in ('-k', '--kill'):
- # if ( self.kill_signal != None ):
- # print 'Conflicting kill signals: -KILL and',
self.kill_signal
- # return
- # self.kill_signal = '-KILL'
- #elif o in ('--db-9'):
- # self.kill_db9 = True
+ elif o in ( '--localonly' ):
+ local_only = True
elif o in ( '--nothreading' ):
self.disable_threading()
- #elif o in ('-p', '--pids'):
- # redo_pids = True
elif o in ('-x'):
# intended to be called recursively from check_ducc, NOT from
the command line
do_validate = True
@@ -286,6 +218,11 @@ class CheckDucc(DuccUtil):
usage('bad arg: ' + a)
+ if ( local_only ):
+ if ( len(nodefiles) > 0 ):
+ print "NOTOK: Cannot specify nodefiles with --localonly"
+ return
+
if not self.installed():
print "Head node is not initialized. Have you run
ducc_post_install?"
return
@@ -323,39 +260,23 @@ class CheckDucc(DuccUtil):
self.verify_database()
- # init the PID file
- #if(not self.is_reliable_backup()):
- # self.pids_agents = Properties()
- # self.pids_agents.load_if_exists(self.pid_file_agents)
- #self.pids_daemons = Properties()
- #self.pids_daemons.load_if_exists(self.pid_file_daemons)
-
- # read the nodelists
- if ( len(nodefiles) == 0 ):
- nodefiles = self.default_nodefiles
- check_nodepools = True
- else:
- # if using other than the fully configured set of nodes we can't
reliably check nodepools
- # because anything other than the full set of nodes may be missing
something
- check_nodepools = False
-
- nodes = {}
+ # Create the nodelists
+ nodesmap = {}
n_nodes = 0
- for nf in nodefiles:
- n_nodes, nodes = self.read_nodefile(nf, nodes)
-
- #
- # add in the local host if needed, and the webserver node
- #
- localnodes = []
- if ( not self.localhost in nodes ):
- localnodes.append(self.localhost)
-
- if ( not (self.webserver_node in ['localhost', self.localhost, None])
):
- localnodes.append(self.webserver_node)
-
- if ( len(localnodes) > 0 ):
- nodes['local'] = localnodes
+ if ( local_only ):
+ # Include just this head node
+ nodesmap['head'] = [ self.localhost ]
+ n_nodes = 1
+ else:
+ # Load the specified or default nodefiles
+ if ( len(nodefiles) == 0 ):
+ nodefiles = self.default_nodefiles
+ for nf in nodefiles:
+ n, nodesmap = self.read_nodefile(nf, nodesmap)
+ n_nodes += n
+ # Include all the head node(s)
+ nodesmap['head'] = self.head_nodes
+ n_nodes += len(self.head_nodes)
self.verify_jvm()
@@ -368,30 +289,24 @@ class CheckDucc(DuccUtil):
print "OK: Class configuration checked"
else:
print "NOTOK: Errors in class or node configuration."
-
return
- # checking starts here
+ # checking starts here - reduce any full names to the short names
without the domain
print "Checking", n_nodes, "nodes"
- self.threadpool = ThreadPool(n_nodes + 5) # more for the head
processes
+ self.threadpool = ThreadPool(n_nodes) # n_nodes is >= number of
unique nodes
checked = {}
signal.signal(signal.SIGINT, self.signalHandler)
try:
- for (nodefile, nodelist) in nodes.items():
+ for (nodefile, nodelist) in nodesmap.items():
if ( nodelist == None ):
# loading the nodes prints the necessary message
continue
for node in nodelist:
+ node = node.split('.')[0]
if ( checked.has_key(node) ):
continue
-
- checked[node] = node
- self.threadpool.invoke(self.check_node, node)
- # check head node(s)
- for node in self.head_nodes:
- if(not node in checked):
checked[node] = node
self.threadpool.invoke(self.check_node, node)
except:
@@ -400,31 +315,6 @@ class CheckDucc(DuccUtil):
sys.exit(1)
self.threadpool.quit()
-
- #if ( self.kill_signal != None ):
- # if(self.automanage_broker):
- # print 'Stopping broker'
- # self.stop_broker()
- # else:
- # print 'Not stopping broker - not automanaged'
- # if(self.automanage_database):
- # print 'Stopping database'
- # self.db_stop()
- # else:
- # print 'Not stopping database - not automanaged'
-
- #if(not self.is_reliable_backup()):
- # if ( len(self.pids_agents) == 0):
- # if ( os.path.exists(self.pid_file_agents) ):
- # os.remove(self.pid_file_agents)
- # elif (process_changes or redo_pids):
- # self.pids_agents.write(self.pid_file_agents)
-
- #if ( len(self.pids_daemons) == 0):
- # if ( os.path.exists(self.pid_file_daemons) ):
- # os.remove(self.pid_file_daemons)
- #elif (process_changes or redo_pids):
- # self.pids_daemons.write(self.pid_file_daemons)
if __name__ == "__main__":
checker = CheckDucc()
Modified: uima/uima-ducc/trunk/src/main/admin/ducc_update
URL:
http://svn.apache.org/viewvc/uima/uima-ducc/trunk/src/main/admin/ducc_update?rev=1869795&r1=1869794&r2=1869795&view=diff
==============================================================================
--- uima/uima-ducc/trunk/src/main/admin/ducc_update (original)
+++ uima/uima-ducc/trunk/src/main/admin/ducc_update Thu Nov 14 13:41:11 2019
@@ -166,6 +166,12 @@ def update_directory(olddir, newdir, arc
else:
print "\n", " --- Processing folder:", olddir[lenRuntime:]
+ # Play safe by discarding all pyc files (note that the whole of the bin
directory has been replaced)
+ # Some of these will be recreated later when ducc_props_manager is run
+ if olddir.endswith('admin'):
+ cmd = 'rm ' + olddir + '/*.pyc'
+ rc = os.system(cmd);
+
subdirs = []
preserveAll = os.path.basename(newdir) in preserveDirectories
if not os.path.exists(archdir):
@@ -213,8 +219,13 @@ def update_directory(olddir, newdir, arc
# Insure ducc not running
#-----------------------------------------------------------------------------------------
def check_ducc(runtime):
- print 'checking for ducc running, may take a few minutes...'
- cmd = [ os.path.join(runtime,'admin/check_ducc') ]
+ # Check just the local node if multi-headed AND installed on a local
filesystem
+ if islocal(runtime):
+ cmd = [ os.path.join(runtime,'admin/check_ducc'), "--localonly" ]
+ print 'checking for ducc running on this node, may take a few
minutes...'
+ else:
+ cmd = [ os.path.join(runtime,'admin/check_ducc') ]
+ print 'checking for ducc running, may take a few minutes...'
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
bufsize=1, universal_newlines=True)
foundDucc = 'Found ' + find_ducc_uid(os.path.join(runtime,'admin'))
while True:
@@ -225,6 +236,8 @@ def check_ducc(runtime):
line = output.strip()
#print line
if foundDucc in line:
+ if "NOT auto-managed" in line:
+ continue
print "ERROR - DUCC appears to be running: " + line
print "Please run '" + os.path.join(runtime,'admin/stop_ducc')
+ " --all'"
exit(1)
@@ -232,6 +245,32 @@ def check_ducc(runtime):
return rc
#-----------------------------------------------------------------------------------------
+# Test if a reliable installation on a local filesystem
+# Should ideally check if multiple head-nodes are actually specified
+# but instead check just for the required key (or a prefix)
+#-----------------------------------------------------------------------------------------
+def islocal(runtime):
+ siteprops = runtime + '/resources/site.ducc.properties'
+ reliable = False
+ with open(siteprops) as f:
+ for line in f:
+ if line.startswith("ducc.head.reliable.list"):
+ reliable = True
+ break
+ if not reliable:
+ return False
+ ismounted = False;
+ # Check if any part of the runtime path (except '/') is a mount point ...
probably a shared filesystem
+ path = runtime
+ while len(path) > 1:
+ if os.path.ismount(path):
+ print "This multi-head installation appears to be using a shared
filesystem mounted at", path
+ return False
+ path,tail = os.path.split(path)
+ return True
+
+
+#-----------------------------------------------------------------------------------------
# The "ducc" userid is the user that installed DUCC and created this directory
#-----------------------------------------------------------------------------------------
def find_ducc_uid(dir):
@@ -385,12 +424,13 @@ print "\n", " --- Files not replaced are
#-----------------------------------------------------------------------------------------
# Re-build ducc_ling
# Since it needs ducc.properties run the merge from the admin directory
+# NOTE - this will be using the just-updated scripts in the admin directory
#-----------------------------------------------------------------------------------------
print "\n", " --- Rebuilding ducc_ling"
os.chdir(runtime + '/admin')
rc = os.system('./ducc_props_manager --merge
../resources/default.ducc.properties --with ../resources/site.ducc.properties
--to ../resources/ducc.properties')
if (rc != 0):
- print "ERROR - failed to create ducc.properties and to rebuild ducc_ling"
+ print "ERROR - failed to create ducc.properties so ducc_ling was not
rebuilt"
exit(9)
rc = os.system('./build_duccling')
if (rc != 0):