Hi,
These 3 patches add support for GPU-enabled nodes in the OSCAR wizard.
Since version 2.5.4, TORQUE has been able to handle GPUs as a node resource ("gpus =").
This patch set adds support for:
1/ detecting NVIDIA GPUs on nodes (in /usr/bin/post_install)
2/ storing the info in the OSCAR database (through a new column in the Nodes table)
3/ TORQUE configuration in api-post-deploy
The api-post-deploy script of TORQUE has been fully debugged and extensively tested
against: nodes down during config, GPU-capable or not, server started or not, work
queue already present or not. Most, if not all, failures have been checked and
handled.
The oscar_postinstall has been updated with many more checks and more error
handling.
IMHO, with these patches, torque shouldn't be marked experimental anymore.
The only requirement is:
opkg-torque needs torque >= 2.5.4
--
Olivier LAHAYE
CEA DRT/LIST/DCSI/DIR
Index: packages/torque/scripts/api-post-deploy
===================================================================
--- packages/torque/scripts/api-post-deploy (révision 9483)
+++ packages/torque/scripts/api-post-deploy (copie de travail)
@@ -33,6 +33,18 @@
my $default; # Did we type --default on the command line?
#########################################################################
+# Guess TORQUE home is it /usr or /opt/pbs ?
+#########################################################################
+
+my $torque_home;
+if (-f '/usr/bin/pbsnodes') {
+ $torque_home='/usr';
+} else {
+ $torque_home='/opt/pbs';
+}
+
+
+#########################################################################
# compactSpaces strips off the leading and trailing spaces from a #
# string. If you also pass in $compact=1, then it compresses multiple #
# spaces within the string down to 1 space. Also, you can pass in #
@@ -72,7 +84,7 @@
open(CMD,"/etc/init.d/pbs_server status |");
my $result = <CMD>;
close(CMD);
- system('/etc/init.d/pbs_server restart')
+ system('/etc/init.d/pbs_server start')
unless ($result =~ /is running/);
}
@@ -82,6 +94,21 @@
}
######################################################################
+# Check to see if workq is defined #
+######################################################################
+sub isWorkqDefined
+{
+ open(CMD,$torque_home."/bin/qmgr -c 'l q workq' |");
+ my $result = <CMD>;
+ close(CMD);
+ if($result =~ /^Queue workq/) {
+ return 1;
+ } else {
+ return 0;
+ }
+}
+
+######################################################################
# Check to see if 'Maui' is running. If not, then start it. #
######################################################################
sub restartMaui
@@ -130,7 +157,7 @@
opkg_print("Updating pbs_server nodes\n");
# Read in the current nodes as shown by 'pbsnodes -a'
-open (CMD,'/opt/pbs/bin/pbsnodes -a |');
+open (CMD,$torque_home.'/bin/pbsnodes -a |');
my $inp = "";
my $server = "";
my $opt = "";
@@ -156,6 +183,7 @@
# Get listing of nodes from the SIS database
my %nodes = get_machine_listing($image);
+# OL: TODO: Code duplication. Need to use nodes code
if ($compute_on_head eq "YES") {
# Get server proc count
my $server_procs = 0;
@@ -174,6 +202,7 @@
$added_node{HOST}=$ENV{HOSTNAME};
$added_node{IPADDR}="";
$added_node{NUM_PROCS}=$server_procs;
+ $added_node{NUM_GPUS}=0; # Ignore GPUS on head (too much dangerous)
$added_node{DOMAIN}="";
$nodes{$ENV{HOSTNAME}}=\%added_node;
}
@@ -189,10 +218,17 @@
$hostname = $nodes{$node}{HOST};
if ($pbsnodes{$hostname})
{
- system("/opt/pbs/bin/pbsnodes -o $hostname");
+ system($torque_home."/bin/pbsnodes -o $hostname");
# Reset the number of processors for this node
- system('/opt/pbs/bin/qmgr -a -e -c "set node ' . $hostname .
- ' np = ' . $nodes{$node}{NUM_PROCS} . '"');
+ my $qmgr_cmd=$torque_home.'/bin/qmgr -a -e -c "set node ' . $hostname;
+ $qmgr_cmd.=' np = ' . $nodes{$node}{NUM_PROCS};
+ # and number of GPUs if any
+ if ($nodes{$node}{NUM_GPUS}>0) {
+ $qmgr_cmd.=', gpus = ' . $nodes{$node}{NUM_GPUS} . '"';
+ } else {
+ $qmgr_cmd.='"';
+ }
+ system($qmgr_cmd);
# Search the properties for 'all' and add it if not found
my(@props) = split(/,/,$pbsnodes{$hostname}{'properties '});
my($alldefined) = "";
@@ -200,16 +236,18 @@
{
$alldefined = 1 if $prop eq 'all';
}
- system('/opt/pbs/bin/qmgr -a -e -c "set node ' . $hostname .
+ system($torque_home.'/bin/qmgr -a -e -c "set node ' . $hostname .
' properties += all"') unless $alldefined;
- system("/opt/pbs/bin/pbsnodes -c $hostname");
+ system($torque_home."/bin/pbsnodes -c $hostname");
# Remove from pbsnodes hash
delete $pbsnodes{$nodes{$node}{HOST}};
}
else
{ # Didn't find a match -> ADD the node
- system('/opt/pbs/bin/qmgr -a -e -c "create node ' . $hostname .
- ' np = ' . $nodes{$node}{NUM_PROCS} . ' , properties = all"');
+ system($torque_home.'/bin/qmgr -a -e -c "create node ' . $hostname .
+ ' np = ' . $nodes{$node}{NUM_PROCS} .
+ ' , gpus = ' . $nodes{$node}{NUM_GPUS} .
+ ' , properties = all"');
}
# Count up the number of nodes and processors
@@ -221,102 +259,107 @@
# Now go through the remaining pbsnodes hash and delete these nodes
foreach my $node (sortnodes( keys %pbsnodes ))
{
- system('/opt/pbs/bin/qmgr -a -e -c "delete node ' . $node . '"');
+ system($torque_home.'/bin/qmgr -a -e -c "delete node ' . $node . '"');
}
restartPBSserver();
-# Next, use qmgr command to set up the values for workq
-opkg_print("Creating TORQUE workq queue...\n");
+if(isWorkqDefined()) {
+ opkg_print("workq already defined...Skipping queue creation...\n");
+} else {
+ # Next, use qmgr command to set up the values for workq
+ opkg_print("Creating TORQUE workq queue...\n");
-# These are default values set only when not present or when --default
-my @default_params = (
- 'create queue workq',
- 'set queue workq queue_type = Execution',
- 'set queue workq resources_max.cput = 10000:00:00',
- 'set queue workq resources_max.walltime = 10000:00:00',
- 'set queue workq resources_min.cput = 00:00:01',
- 'set queue workq resources_min.ncpus = 1',
- 'set queue workq resources_min.nodect = 1',
- 'set queue workq resources_min.walltime = 00:00:01',
- 'set queue workq resources_default.cput = 10000:00:00',
- 'set queue workq resources_default.ncpus = 1',
- 'set queue workq resources_default.nodect = 1',
- 'set queue workq resources_default.walltime = 10000:00:00',
- 'set queue workq enabled = True',
- 'set queue workq started = True',
- 'set server scheduling = True',
- 'set server default_queue = workq',
- 'set server mail_from = adm',
- 'set server query_other_jobs = True',
-);
+ # These are default values set only when not present or when --default
+ my @default_params = (
+ 'create queue workq',
+ 'set queue workq queue_type = Execution',
+ 'set queue workq resources_max.cput = 10000:00:00',
+ 'set queue workq resources_max.walltime = 10000:00:00',
+ 'set queue workq resources_min.cput = 00:00:01',
+ 'set queue workq resources_min.ncpus = 1',
+ 'set queue workq resources_min.nodect = 1',
+ 'set queue workq resources_min.walltime = 00:00:01',
+ 'set queue workq resources_default.cput = 10000:00:00',
+ 'set queue workq resources_default.ncpus = 1',
+ 'set queue workq resources_default.nodect = 1',
+ 'set queue workq resources_default.walltime = 10000:00:00',
+ 'set queue workq enabled = True',
+ 'set queue workq started = True',
+ 'set server scheduling = True',
+ 'set server default_queue = workq',
+ 'set server mail_from = adm',
+ 'set server query_other_jobs = True',
+ );
-# These are variable parameters that are set everytime
-my @variable_params = (
- "set queue workq resources_max.ncpus = $TOT_NP",
- "set queue workq resources_max.nodect = $TOT_NODES",
- "set queue workq resources_available.nodect = $TOT_NODES",
- "set server resources_available.ncpus = $TOT_NP",
- "set server resources_available.nodect = $TOT_NODES",
- "set server resources_available.nodes = $TOT_NODES",
- "set server resources_max.ncpus = $TOT_NP",
- "set server resources_max.nodes = $TOT_NODES",
- "set server scheduler_iteration = 60",
- "set server log_events = $loglevel",
-);
+ # These are variable parameters that are set everytime
+ my @variable_params = (
+ "set queue workq resources_max.ncpus = $TOT_NP",
+ "set queue workq resources_max.nodect = $TOT_NODES",
+ "set queue workq resources_available.nodect = $TOT_NODES",
+ "set server resources_available.ncpus = $TOT_NP",
+ "set server resources_available.nodect = $TOT_NODES",
+ "set server resources_available.nodes = $TOT_NODES",
+ "set server resources_max.ncpus = $TOT_NP",
+ "set server resources_max.nodes = $TOT_NODES",
+ "set server scheduler_iteration = 60",
+ "set server log_events = $loglevel",
+ );
-# First, read in the current values for the localhost
-undef $/; # Suck in the entire files into one big string
-open (CMD,'/opt/pbs/bin/qmgr -a -e -c "print server @localhost" |');
-my $cmd_output = <CMD>;
-close (CMD);
+ # First, read in the current values for the localhost
+ undef $/; # Suck in the entire files into one big string
+ open (CMD,$torque_home.'/bin/qmgr -a -e -c "print server @localhost" |');
+ my $cmd_output = <CMD>;
+ close (CMD);
-# Next, build up a command string to pass back to the qmgr command.
-my $line = "";
-my $key = "";
-my $value = "";
-my $cmd_input = "";
+ # Next, build up a command string to pass back to the qmgr command.
+ my $line = "";
+ my $key = "";
+ my $value = "";
+ my $cmd_input = "";
-# Search through the default_params for anything missing.
-foreach $line (@default_params)
- {
- ($key,$value) = split /=/, $line;
- $key = compactSpaces($key);
- $value = compactSpaces($value);
- if ($value eq "")
- { # Search for just the $key since there's no value for it.
- $cmd_input .= "$key\n" if
- ($cmd_output !~ /$key/m) ||
- (defined($default) && $key !~ /create/);
- }
- else
- { # Search for "$key =". Output key and value if needed.
- $cmd_input .= "$key = $value\n" if
- ($cmd_output !~ /$key\s*=/m) || defined($default);
- }
- }
+ # Search through the default_params for anything missing.
+ foreach $line (@default_params)
+ {
+ ($key,$value) = split /=/, $line;
+ $key = compactSpaces($key);
+ $value = compactSpaces($value);
+ if ($value eq "")
+ { # Search for just the $key since there's no value for it.
+ $cmd_input .= "$key\n" if
+ ($cmd_output !~ /$key/m) ||
+ (defined($default) && $key !~ /create/);
+ }
+ else
+ { # Search for "$key =". Output key and value if needed.
+ $cmd_input .= "$key = $value\n" if
+ ($cmd_output !~ /$key\s*=/m) || defined($default);
+ }
+ }
-# Then, force the setting of the of the variable_params.
-foreach $line (@variable_params)
- {
- ($key,$value) = split /=/, $line;
- $key = compactSpaces($key);
- $value = compactSpaces($value);
- $cmd_input .= "$key";
- $cmd_input .= " = $value"
- if ($value ne "");
- $cmd_input .= "\n";
- }
+ # Then, force the setting of the of the variable_params.
+ foreach $line (@variable_params)
+ {
+ ($key,$value) = split /=/, $line;
+ $key = compactSpaces($key);
+ $value = compactSpaces($value);
+ $cmd_input .= "$key";
+ $cmd_input .= " = $value"
+ if ($value ne "");
+ $cmd_input .= "\n";
+ }
-# Run the qmgr command if necessary
-if ($cmd_input ne "")
- {
- my $pbs_spool = "/var/spool/pbs";
- open (CMD,"|/opt/pbs/bin/qmgr -a -e ");
- print CMD $cmd_input;
- close (CMD)
- or croak("ERROR: Impossible to Configure TORQUE queues, ".
- "check the logs at $pbs_spool");
- }
+ # Run the qmgr command if necessary
+ if ($cmd_input ne "")
+ {
+ my $pbs_server_logs = "/var/log/torque/server_logs";
+ open (CMD,"|".$torque_home."/bin/qmgr -a -e ");
+ print CMD $cmd_input;
+ close (CMD)
+ or croak("ERROR: Impossible to Configure TORQUE queues, ".
+ "check the logs at $pbs_server_logs");
+ }
+}
+
restartMaui();
# Get current state of server's mom
Index: packages/torque/scripts/server-post-install
===================================================================
--- packages/torque/scripts/server-post-install (révision 9483)
+++ packages/torque/scripts/server-post-install (copie de travail)
@@ -37,6 +37,9 @@
#######################################################
sub startPBSserver
{
+ if(!-f '/var/lib/torque/server_priv/serverdb' && !-f '/var/spool/pbs/server_priv/serverdb') {
+ system('/etc/init.d/pbs_server create'); # no config file: need to initialise config.
+ }
system('/etc/init.d/pbs_server restart');
}
@@ -68,10 +71,20 @@
my $xpbsmonrc;
# Attention: More sophistication needed here.
+# Check for 32/64 bit architecture AND
+# Check if using /opt located oscar torque package or distro torque package
if ($arch eq "x86_64") {
- $xpbsmonrc = "/opt/pbs/lib64/xpbsmon/xpbsmonrc";
+ if (-f "/usr/lib64/xpbsmon/xpbsmonrc") {
+ $xpbsmonrc = "/usr/lib64/xpbsmon/xpbsmonrc";
+ } else {
+ $xpbsmonrc = "/opt/pbs/lib64/xpbsmon/xpbsmonrc";
+ }
} else {
- $xpbsmonrc = "/opt/pbs/lib/xpbsmon/xpbsmonrc";
+ if (-f "/usr/lib/xpbsmon/xpbsmonrc") {
+ $xpbsmonrc = "/usr/lib/xpbsmon/xpbsmonrc";
+ } else {
+ $xpbsmonrc = "/opt/pbs/lib/xpbsmon/xpbsmonrc";
+ }
}
# Eventually 'torque_gui' is not installed, so we cannot configure xpbsmonrc
Index: packages/torque/testing/test_root
===================================================================
--- packages/torque/testing/test_root (révision 9483)
+++ packages/torque/testing/test_root (copie de travail)
@@ -14,14 +14,26 @@
# Jeremy Enos
# Bernard Li
+#########################################################################
+# Guess TORQUE home is it /usr or /opt/pbs ?
+#########################################################################
+
+if test -f '/usr/bin/pbsnodes'
+then
+ pbsnodes='/usr/bin/pbsnodes';
+else
+ pbsnodes='/opt/pbs/bin/pbsnodes';
+fi
+
+
# Stop TORQUE Server if any nodes exhibit "state-unknown" status.
# This condition could be due to the 5 minute polling interval to the moms
# by the server. Restarting the server will induce a fresh poll.
-unknown_nodes=`/opt/pbs/bin/pbsnodes -a |grep -c state-unknown`
+unknown_nodes=`$pbsnodes -a |grep -c state-unknown`
if [ $unknown_nodes != 0 ] ; then
/etc/init.d/pbs_server stop
fi
-unknown_nodes=`/opt/pbs/bin/pbsnodes -a |grep -c state-unknown`
+unknown_nodes=`$pbsnodes -a |grep -c state-unknown`
if [ $unknown_nodes != 0 ] ; then
$OSCAR_TESTPRINT --label "TORQUE node check" -f
# exit 1;
Index: packages/torque/testing/pbs_test
===================================================================
--- packages/torque/testing/pbs_test (révision 9483)
+++ packages/torque/testing/pbs_test (copie de travail)
@@ -30,8 +30,23 @@
# are failing due to timeouts (not because of failing package tests)
default_timeout=60
-qsub='/opt/pbs/bin/qsub'
-qstat='/opt/pbs/bin/qstat'
+#########################################################################
+# Guess TORQUE home is it /usr or /opt/pbs ?
+#########################################################################
+
+if test -f '/usr/bin/pbsnodes'
+then
+ qsub='/usr/bin/qsub'
+ qdel='/usr/bin/qdel'
+ qstat='/usr/bin/qstat'
+ pbsnodes='/usr/bin/pbsnodes'
+else
+ qsub='/opt/pbs/bin/qsub'
+ qdel='/opt/pbs/bin/qdel'
+ qstat='/opt/pbs/bin/qstat'
+ pbsnodes='/opt/pbs/bin/pbsnodes'
+fi
+
exit_status=0
# Check usage (since this can be run seperately)
@@ -108,7 +123,7 @@
}
function check_free_nodes () {
- free=`/opt/pbs/bin/pbsnodes -a | grep -c 'state = free'`
+ free=`$pbsnodes -a | grep -c 'state = free'`
if [ $free -eq $clients ] || [ $free -gt $clients ]; then
# at least $clients free nodes
echo -n ""
@@ -158,8 +173,8 @@
$OSCAR_TESTPRINT --label "$pbs_test_type" --fail
job_state=fail
exit_status=1
- if [ `/opt/pbs/bin/qstat | grep -c $job` -eq 1 ]; then
- /opt/pbs/bin/qdel $job
+ if [ `$qstat | grep -c $job` -eq 1 ]; then
+ $qdel $job
sleep 1 # Allows nodes to free up again
fi
if [ "$exit_on_fail" = yes ]; then waitexit ; fi
Index: packages/torque/testing/test_user
===================================================================
--- packages/torque/testing/test_user (révision 9483)
+++ packages/torque/testing/test_user (copie de travail)
@@ -14,6 +14,18 @@
# Jeremy Enos
# Bernard Li
+#########################################################################
+# Guess TORQUE home is it /usr or /opt/pbs ?
+#########################################################################
+
+if test -f '/usr/bin/qmgr'
+then
+ qmgr='/usr/bin/qmgr'
+else
+ qmgr='/opt/pbs/bin/qmgr'
+fi
+
+
cd ~
clients=0
for i in $@
@@ -24,7 +36,7 @@
# DEFAULT TORQUE QUEUE
# --------------------
-if [ `/opt/pbs/bin/qmgr -c "l s" | grep -c 'default_queue'` -eq 1 ]; then
+if [ `$qmgr -c "l s" | grep -c 'default_queue'` -eq 1 ]; then
$OSCAR_TESTPRINT --label "TORQUE default queue definition" -p
else
$OSCAR_TESTPRINT --label "TORQUE default queue definition" -f
Index: packages/torque/testing/pbs_script.shell
===================================================================
--- packages/torque/testing/pbs_script.shell (révision 9483)
+++ packages/torque/testing/pbs_script.shell (copie de travail)
@@ -11,7 +11,18 @@
### Queue name
#PBS -q workq
+#########################################################################
+# Guess TORQUE home is it /usr or /opt/pbs ?
+#########################################################################
+
+if test -f '/usr/bin/pbsdsh'
+then
+ pbsdsh='/usr/bin/pbsdsh'
+else
+ pbsdsh='/opt/pbs/bin/pbsdsh'
+fi
+
### Script commands
-/opt/pbs/bin/pbsdsh -s hostname
-/opt/pbs/bin/pbsdsh -s date +"Hello, date is %D, time is %T"
+$pbsdsh -s hostname
+$pbsdsh -s date +"Hello, date is %D, time is %T"
exit
Index: oda/trunk/lib/Database.pm
===================================================================
--- oda/trunk/lib/Database.pm (révision 9474)
+++ oda/trunk/lib/Database.pm (copie de travail)
@@ -2294,6 +2294,7 @@
return 0;
}
} else {
+ # This function is inappropriate for updating node info (e.g. the cpu count)
print "The node $node is already in the database\n";
}
return 1;
Index: systeminstaller-oscar/trunk/lib/SystemInstaller/Machine.pm
===================================================================
--- systeminstaller-oscar/trunk/lib/SystemInstaller/Machine.pm (révision 9474)
+++ systeminstaller-oscar/trunk/lib/SystemInstaller/Machine.pm (copie de travail)
@@ -58,6 +58,7 @@
HOST => $machine->{hostname},
DOMAIN => $machine->{domainname},
NUM_PROCS => $machine->{proccount},
+ NUM_GPUS => $machine->{gpucount},
IPADDR => @$adapter[0]->{ip}
};
}
Index: systeminstaller-oscar/trunk/lib/SystemInstaller/Env.pm
===================================================================
--- systeminstaller-oscar/trunk/lib/SystemInstaller/Env.pm (révision 9474)
+++ systeminstaller-oscar/trunk/lib/SystemInstaller/Env.pm (copie de travail)
@@ -107,7 +107,7 @@
# This just returns the version number, looks silly,
# but the string below is replaced during the build
# process with the proper version.
- my $SIVERSION="2.4.2svn20090612";
+ my $SIVERSION="2.4.11";
return $SIVERSION;
}
Index: systeminstaller-oscar/trunk/lib/SIS/NewDB.pm
===================================================================
--- systeminstaller-oscar/trunk/lib/SIS/NewDB.pm (révision 9483)
+++ systeminstaller-oscar/trunk/lib/SIS/NewDB.pm (copie de travail)
@@ -112,11 +112,14 @@
name => "Nodes.name",
# route => "Networks.gateway:Nics.node_id=Nodes.id AND Networks.n_id=Nics.network_id",
hostname => "Nodes.hostname",
-# domainname => "Nodes.dns_domain",
+ # domainname required by SystemInstaller::Image::get_machine_listing()
+ domainname => "Nodes.dns_domain",
# arch => "Images.architecture:Nodes.image_id=Images.id",
imagename => "Images.name:Images.id=Nodes.image_id",
# name => "Nodes.name",
-# proccount => "Nodes.cpu_num",
+ # proccount required by SystemInstaller::Image::get_machine_listing() and torque api-post-deploy
+ proccount => "Nodes.cpu_num",
+ gpucount => "Nodes.gpu_num",
},
);
Index: packages/oda/scripts/oscar_table.sql
===================================================================
--- packages/oda/scripts/oscar_table.sql (révision 9483)
+++ packages/oda/scripts/oscar_table.sql (copie de travail)
@@ -96,6 +96,7 @@
CREATE TABLE IF NOT EXISTS Nodes(
cluster_id integer not null default 0,
cpu_num integer,
+ gpu_num integer,
cpu_speed VARCHAR(100),
dns_domain VARCHAR(100),
fqdn VARCHAR(100),
Index: scripts/post_install
===================================================================
--- scripts/post_install (révision 9483)
+++ scripts/post_install (copie de travail)
@@ -44,6 +44,7 @@
use OSCAR::Package;
use Getopt::Long;
use Carp;
+use Switch;
my ($verbose);
GetOptions(
@@ -62,9 +63,15 @@
# We currently do not assume that getting the number of procs is a fatal
# problem
-if (get_numproc()) {
- warn "WARNING: Impossible to get proc number";
+if(update_node_ressource("cat /proc/cpuinfo",qr/^processor/,"CPU","cpu_num",1)) {
+ warn "WARNING: Impossible to get CPU count";
}
+if(update_node_ressource("nvidia-smi -L",qr/^GPU/,"GPU","gpu_num",0)) {
+ warn "WARNING: Impossible to get GPU count";
+}
+#if(update_node_ressource("/bin/ls -1 /dev",qr/^nvidia[0-9]+/,"GPU","gpu_num",0)) {
+# warn "WARNING: Impossible to get GPU count";
+#}
# This makes sure existing nodes have the latest /etc/hosts.
# The profile script is execd so we know the path (from opium's post_install)
@@ -112,29 +119,53 @@
@_;
}
+
+# update_node_ressource: Scan nodes for a resource count (CPU or GPU) and update the SIS database.
+# - $query_cmd: command run on each node to list the resources
+# - $query_filter: regex that matches one output line per resource
+# - $ressource_name: text identifying the resource (for messages and logs)
+# - $db_field: database field to update (cpu_num or gpu_num)
+# - $min_count: minimum valid count (a node can't have 0 CPUs, but 0 GPUs is allowed).
# Return: 0 if success, -1 else.
-sub get_numproc {
+
+sub update_node_ressource($$$$$) {
+ my ($query_cmd, $query_filter, $ressource_name, $db_field, $min_count) = @_;
my @machines= sortclients SIS::NewDB::list_client();
foreach my $mach (@machines) {
- my $CMD="/usr/bin/ssh -n ".$mach->{name}." cat /proc/cpuinfo";
- oscar_log_subsection ("Gathering processor count from " .
- $mach->{name}."($CMD)");
- open (CPIPE,"$CMD |")
- or (carp("ERROR: Unable to query machine ".$mach->{name}),next);
+ my $return_code=0;
+ my $CMD="/usr/bin/ssh -n ".$mach->{name}." $query_cmd 2> /dev/null";
+ oscar_log_subsection ("Gathering $ressource_name count from " . $mach->{name});
+ open (CPIPE,"$CMD |") or (carp("ERROR: Can't run $CMD"),next);
my $count=0;
while (<CPIPE>) {
- ++$count if (/^processor/);
+ ++$count if (/$query_filter/);
}
- close(CPIPE);
- if (($count !~ /^[0-9]+$/) || ($count == 0)) {
- carp("ERROR: Improper count ($count) returned from machine " .
- $mach->{name});
- return -1;
+ close(CPIPE) or $return_code=$?;
+ if ($return_code == 32512) { # Command not found (e.g. no nvidia-smi)
+ carp("WARNING: '$query_cmd' not found on Node ".$mach->{name}),next;
+ } elsif ($return_code == 65280) { # Node not responding to ssh command
+ carp("ERROR: Node ".$mach->{name}." DOWN! (not responding)"),next;
+ } elsif ($return_code > 0) { # ????
+ carp("ERROR: ($return_code) while trying to run: $CMD"),next;
+ }
+
+ if (($count !~ /^[0-9]+$/) || ($count < $min_count)) {
+ carp("ERROR: Improper count ($count) returned from machine " . $mach->{name});
+ return -1;
} else {
oscar_log_subsection ("Updating database for machine " .
- $mach->{name} . " cpus=$count.");
- $mach->{proccount} = $count;
- SIS::NewDB::set_client($mach);
+ $mach->{name} . " $db_field=$count.");
+ my %data = (
+ $db_field => $count,
+ );
+ if (OSCAR::Database::update_node (
+ $mach->{'name'},
+ \%data,
+ undef,
+ undef) != 1) {
+ carp "ERROR: Impossible to update node $ressource_name information";
+ return -1;
+ }
}
}
return 0;
------------------------------------------------------------------------------
Everyone hates slow websites. So do we.
Make your web apps faster with AppDynamics
Download AppDynamics Lite for free today:
http://p.sf.net/sfu/appdyn_sfd2d_oct
_______________________________________________
Oscar-devel mailing list
Oscar-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/oscar-devel