Author: challngr
Date: Tue Jul 16 20:29:54 2013
New Revision: 1503870
URL: http://svn.apache.org/r1503870
Log:
UIMA-3081 Verify configuration before allowing startup.
Modified:
uima/sandbox/uima-ducc/trunk/src/main/admin/check_ducc
uima/sandbox/uima-ducc/trunk/src/main/admin/ducc_util.py
uima/sandbox/uima-ducc/trunk/src/main/admin/start_ducc
uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/Scheduler.java
Modified: uima/sandbox/uima-ducc/trunk/src/main/admin/check_ducc
URL:
http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/src/main/admin/check_ducc?rev=1503870&r1=1503869&r2=1503870&view=diff
==============================================================================
--- uima/sandbox/uima-ducc/trunk/src/main/admin/check_ducc (original)
+++ uima/sandbox/uima-ducc/trunk/src/main/admin/check_ducc Tue Jul 16 20:29:54 2013
@@ -59,6 +59,10 @@ class CheckDucc(DuccUtil):
print " Check for agents on the nodes in nodefile. This option may be specified multiple time"
print " for multiple nodefiles. The 'local' node is always checked"
print ""
+ print " -c --configuration"
+ print " Do basic sanity checking on the configuration only. Note that configuration checking is always"
+ print " performed with most options. The [-c, --configuration] option does ONLY configuration checking."
+ print ""
print " -u --user userid"
print " Userid is the user whose processes check_ducc searches for. If not specified,"
print " the user executing check_ducc is used. If specified as 'all' then all ducc processes"
@@ -66,7 +70,15 @@ class CheckDucc(DuccUtil):
print ""
print " -k --kill"
print " Force-kill any DUCC process you find on a node (if normal stop_ducc isn't working. This"
- print " uses kill -9 and only kills processes owned by the invoking user."
+ print " uses kill -KILL (-9) and only kills processes owned by the invoking user."
+ print ""
+ print " -i --int"
+ print " Force-kill any DUCC process you find on a node (if normal stop_ducc isn't working. This"
+ print " uses kill -INT (-2) and only kills processes owned by the invoking user."
+ print ""
+ print " -q --quit"
+ print " Force-kill any DUCC process you find on a node (if normal stop_ducc isn't working. This"
+ print " uses kill -QUIT (-3) and only kills processes owned by the invoking user."
print ""
print " -p --pids"
print " Rewrite the PID file. The PID file is always rewritten if any changes to processes are made. Sometimes"
@@ -85,43 +97,55 @@ class CheckDucc(DuccUtil):
def main(self, argv):
- self.show_ducc_environment()
-
try:
- opts, args = getopt.getopt(argv, 'kn:prs:u:h?v')
+ opts, args = getopt.getopt(argv, 'cikn:pqrs:u:h?v', ['--configuration', '--nodelist=', '--user=', '--int', '--quit', '--kill', '--pids', '--reap', '--version'])
except:
self.usage("Invalid arguments " + ' '.join(argv))
nodefiles = []
user = os.environ['LOGNAME']
victim_user = None
- kill = False
+ kill_signal = None
reap = False
redo_pids = False
process_changes = False
do_validate = False
checkdate = 0
-
+ config_only = False
+
for ( o, a ) in opts:
- if ( o == '-n' ) :
+ if o in ('-c', '--configuration'):
+ config_only = True
+ elif o in ('-n', '--nodelist'):
nodefiles.append(a)
- elif ( o == '-k' ) :
- kill = True
- elif ( o == '-u' ) :
+ elif o in ('-i', '--int'):
+ if ( kill_signal != None ):
+ print 'Conflicting kill signals: -INT and', kill_signal
+ return
+ kill_signal = '-INT'
+ elif o in ('-q', '--quit'):
+ if ( kill_signal != None ):
+ print 'Conflicting kill signals: -QUIT and', kill_signal
+ return
+ kill_signal = '-QUIT'
+ elif o in ('-k', '--kill'):
+ if ( kill_signal != None ):
+ print 'Conflicting kill signals: -KILL and', kill_signal
+ return
+ kill_signal = '-KILL'
+ elif o in ('-u', '--user'):
victim_user = a
- elif ( o == '-v'):
- ducc_util.version()
- elif ( o == '-r'):
+ elif o in ('-r', '--reap'):
reap = True
- elif ( o == '-p'):
+ elif o in ('-p', '--pids'):
redo_pids = True
- elif ( o == '-s'):
+ elif o in ('-s'):
# intended to be called recursively from check_ducc, NOT from the command line
do_validate = True
checkdate = float(a)
- elif ( o in ('-h', '-?') ):
+ elif o in ('-h', '-?', '--help'):
self.usage(None)
- elif ( o == '-v'):
+ elif o in ('-v', '--version'):
self.version(None)
else:
print 'badarg', a
@@ -134,6 +158,13 @@ class CheckDucc(DuccUtil):
self.validate(checkdate)
return
+ os.system('cat ' + self.DUCC_HOME + '/state/duccling.version')
+ # not -s option, do this only on local node
+ env = self.show_ducc_environment()
+ for e in env:
+ print e
+
+
if ( reap and (user == 'ducc') ):
usage('Can only reap non-udcc users')
@@ -172,6 +203,13 @@ class CheckDucc(DuccUtil):
if ( len(localnodes) > 0 ):
nodes['local'] = localnodes
+ self.verify_jvm()
+ if self.verify_class_configuration(nodes):
+ print "OK: Class configuration checked"
+
+ if ( config_only ):
+ return
+
# checking starts here
checked = {}
for (nodefile, nodelist) in nodes.items():
@@ -203,14 +241,14 @@ class CheckDucc(DuccUtil):
continue
process_id = found_user + ' ' + component + '@' + node + ' PID ' + pid
- if ( kill ) :
+ if ( kill_signal != None ) :
if ( user != found_user ):
print spacer, "Not killing someone else's process.", process_id
elif ( component == 'unknown-java' ):
print spacer, 'Not killing non-ducc process', process_id
else:
- print spacer, 'Killing (kill -9)', process_id
- self.kill_process(node, proc)
+ print spacer, 'Killing (' + kill_signal + ')', process_id
+ self.kill_process(node, proc, kill_signal)
pids.delete(pid)
process_changes = True
elif ( reap ):
@@ -236,7 +274,7 @@ class CheckDucc(DuccUtil):
else:
print 'no processes found.'
- if ( not (kill or reap) ):
+ if ( not ((kill_signal != None) or reap) ):
lines = self.ssh(node, True, self.DUCC_HOME + "/admin/check_ducc", "-s", str(int(time())))
while 1:
line = lines.readline()
@@ -249,7 +287,7 @@ class CheckDucc(DuccUtil):
if ( reap ):
return
- if ( kill ):
+ if ( kill_signal != None ):
self.stop_broker()
self.remove_orchestrator_lock()
Modified: uima/sandbox/uima-ducc/trunk/src/main/admin/ducc_util.py
URL:
http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/src/main/admin/ducc_util.py?rev=1503870&r1=1503869&r2=1503870&view=diff
==============================================================================
--- uima/sandbox/uima-ducc/trunk/src/main/admin/ducc_util.py (original)
+++ uima/sandbox/uima-ducc/trunk/src/main/admin/ducc_util.py Tue Jul 16 20:29:54 2013
@@ -235,10 +235,10 @@ class DuccUtil(DuccBase):
def verify_jvm(self):
jvm = self.java()
- CMD = jvm + ' -fullversion > /dev/null 2>&1'
+ CMD = jvm + ' -version > /dev/null 2>&1'
rc = os.system(CMD)
if ( rc != 0 ):
- print 'NOTOK', CMD, 'returns', rc, '. Must return rc 0. Startup cannot continue.'
+ print 'NOTOK', CMD, 'returns', int(rc), '. Must return rc 0. Startup cannot continue.'
return False
return True
@@ -477,8 +477,8 @@ class DuccUtil(DuccBase):
except:
print 'Unable to remove orchestrator lock'
- def kill_process(self, node, proc):
- self.ssh(node, False, 'kill', '-KILL', proc[1])
+ def kill_process(self, node, proc, signal):
+ self.ssh(node, False, 'kill', signal, proc[1])
def clean_shutdown(self):
DUCC_JVM_OPTS = ' -Dducc.deploy.configuration=' + self.DUCC_HOME + "/resources/ducc.properties "
@@ -547,7 +547,7 @@ class DuccUtil(DuccBase):
manifest = DuccProperties()
manifest.load_from_manifest(self.DUCC_HOME + '/lib/' + j)
- response.append('ENV: %25s %18s %12s %s' % (j + ':', manifest.get('Ducc-Version'), 'compiled at', manifest.get('Ducc-Build-Date')))
+ response.append('ENV: %25s %18s %12s %s' % (j + ':', manifest.get('Ducc-Version'), 'compiled at', manifest.get('Build-Date')))
return response
@@ -590,6 +590,147 @@ class DuccUtil(DuccBase):
#print 'RETURN', nodefile, ret
return ret
+ def compare_nodes(self, n1, n2):
+
+ if ( n1 == n2 ): # exact match - covers both short and both long
+ return True
+
+ if ( n1.find('.') >= 0 ): # shortened n1 == n2?
+ t1 = n1.split('.')
+ n1A = t1[0]
+ if ( n1A == n2 ):
+ return True
+
+ if ( n2.find('.') >= 0 ): # n1 == shortened n2?
+ t2 = n2.split('.')
+ n2A = t2[0]
+ if ( n1 == n2A ):
+ return True
+ return False
+
+ #
+ # Make sure all the nodes in the configured nodepools are also in the startup list
+ #
+ def check_nodepools(self, classprops, allnodes):
+ #
+ # First make sure that all the nodepools that are declared have definition files
+ # and that the defined nodes are in some nodelist.
+ #
+ nodepools = classprops.get('scheduling.nodepool').split()
+ nodepools_ok = True
+ for np in nodepools:
+ npkey = 'scheduling.nodepool.' + np
+ npfilename = classprops.get(npkey)
+ if ( npfilename == None ):
+ print 'NOTOK: Missing nodepool definition file for Nodepool', np
+ nodepools_ok = False
+ continue
+
+ npfile = self.DUCC_HOME + '/resources/' + npfilename
+ if ( not os.path.exists(npfile) ):
+ print 'NOTOK: Cannot find nodepool file', npfile
+ errors = errors + 1
+ continue
+
+ npnodes = {}
+ npnodes = self.read_nodefile(npfile, npnodes)
+ found = False
+ for ( impfile, nodes ) in npnodes.items():
+ for node in nodes:
+ for (nodefile, nodelist) in allnodes.items():
+ for n in nodelist:
+ if ( self.compare_nodes(n, node)):
+ found = True
+ break
+ if ( not found ):
+ print 'NOTOK: Cannot find node defined in pool "' +np+'" in any nodefile:', node
+ nodepools_ok = False
+
+ #
+ # Now make sure that all classes that reference nodepools have corresponding
+ # nodepool definitions
+ #
+
+ for ( k, v ) in classprops.items():
+ if ( k.startswith('scheduling.class.') and k.endswith('.nodepool') ):
+ if ( not ( v in nodepools ) ):
+ toks = k.split('.')
+ classname = toks[2]
+ print 'NOTOK: Class', classname, 'references non-existent nodepool', v
+ nodepools_ok = False
+
+ if ( nodepools_ok ):
+ print 'OK: All nodepools are verified'
+ else:
+ print 'NOTOK: some nodepools are not correctly defined.'
+
+ return nodepools_ok
+
+ def verify_class_configuration(self, allnodes):
+ answer = True
+ # first, find the class definition
+ classfile = self.ducc_properties.get('ducc.rm.class.definitions')
+ classfile = self.resolve(classfile, self.propsfile) # resolve the classfile relative to ducc.properties
+
+ print 'Class definition file is', classfile
+ classprops = DuccProperties()
+ try:
+ classprops.load(classfile)
+ except:
+ print 'NOTOK: Cannot read properties file', classfile
+ return False
+
+ # Verify nodepool definitions.
+ if ( not self.check_nodepools(classprops, allnodes) ):
+ # this check will emit necessary messages
+ answer = False
+
+ nodepools = classprops.get('scheduling.nodepool').split()
+ class_set = classprops.get('scheduling.class_set').split()
+ # first, make sure every class that is defined exists, has a policy, and a priority
+ # FAIR_SHARE classes, they must also have a weight
+ # if a nodeppol is assigned, it must also be one of the defined, and now verified, nodepools
+ for cl in class_set:
+ po = classprops.get('scheduling.class.' + cl +'.policy')
+ if ( po == None ):
+ print 'NOTOK: Missing policy definition for class "' + cl + '"'
+ answer = False
+ else:
+ we = classprops.get('scheduling.class.' + cl +'.share_weight')
+ if ( po == 'FAIR_SHARE' and we == None ):
+ print 'NOTOK: Missing "weight" definition for class: "' + cl + '"'
+ answer = False
+
+ pr = classprops.get('scheduling.class.' + cl +'.priority')
+ if ( pr == None ):
+ print 'NOTOK: Missing priority definition for class: "' + cl + '"'
+ answer = False
+
+ clnp = classprops.get('scheduling.class.' + cl +'.nodepool')
+ if ( clnp != None ):
+ if ( not clnp in nodepools ):
+ print 'NOTOK: Nodepool "' + clnp + '" is configured for class "' + cl + '" but has no definition.'
+ answer = False
+
+ # Dig out the jobdriver class and insure it exists.
+ jdclass = self.ducc_properties.get('ducc.jd.host.class')
+ if ( not jdclass in class_set ):
+ print 'NOTOK: Job Driver class "' + jdclass + '" is not defined (see ducc.properties: ducc.jd.host.class).'
+ answer = False
+
+ # if a default.name and/or default.name.reserve class is defined, make sure they exist
+ default_class = classprops.get('scheduling.default.name')
+ if ( (default_class != None) and (not default_class in class_set) ):
+ print 'NOTOK: Default class "' + default_class + '" is not defined.'
+ answer = False
+
+ default_reserve_class = classprops.get('scheduling.default.name.reserve')
+ if ( (default_reserve_class != None) and (not default_reserve_class in class_set) ):
+ print 'NOTOK: Default reserve class "' + default_reserve_class + '" is not defined.'
+ answer = False
+
+ return answer
+
def __init__(self):
DuccBase.__init__(self)
self.duccling = None
Modified: uima/sandbox/uima-ducc/trunk/src/main/admin/start_ducc
URL:
http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/src/main/admin/start_ducc?rev=1503870&r1=1503869&r2=1503870&view=diff
==============================================================================
--- uima/sandbox/uima-ducc/trunk/src/main/admin/start_ducc (original)
+++ uima/sandbox/uima-ducc/trunk/src/main/admin/start_ducc Tue Jul 16 20:29:54 2013
@@ -289,6 +289,12 @@ class StartDucc(DuccUtil):
print "Can't read nodefile", nf
ok = False
+ if ok and self.verify_class_configuration(nodes):
+ print "OK: Class configuration checked"
+ else:
+ print "NOTOK: Bad configuration, cannot start."
+ ok = False
+
if ( not ok ):
sys.exit(1)
Modified: uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/Scheduler.java
URL:
http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/Scheduler.java?rev=1503870&r1=1503869&r2=1503870&view=diff
==============================================================================
--- uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/Scheduler.java (original)
+++ uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/Scheduler.java Tue Jul 16 20:29:54 2013
@@ -341,10 +341,12 @@ public class Scheduler
Map<String, String> readNodepoolFile(String npfile)
{
+ String methodName = "readNodepoolFile";
String my_domain = getDomainName();
String ducc_home = System.getProperty("DUCC_HOME");
npfile = ducc_home + "/resources/" + npfile;
+ logger.info(methodName, null, "Domain name:", my_domain);
Map<String, String> response = new HashMap<String, String>();
try {
@@ -401,7 +403,7 @@ public class Scheduler
// read in nodepools
String npn = props.getProperty("scheduling.nodepool");
if ( npn != null ) {
- String[] npnames = npn.split(" ");
+ String[] npnames = npn.split("\\s+");
for ( String nodepoolName : npnames ) {
int nporder = props.getIntProperty("scheduling.nodepool." + nodepoolName + ".order", 100);
String npfile = props.getProperty("scheduling.nodepool." + nodepoolName).trim();