Hi,

These 3 patches add support for GPU-enabled nodes in the OSCAR wizard.
Since torque 2.5.4, torque has been able to handle the GPU resource (gpus =).
These patches add support for:
1/ detecting nvidia GPUs on nodes (in /usr/bin/post_install)
2/ storing the info in the oscar database (through a new column in the Nodes table)
3/ torque configuration in api-post-deploy

The api-post-deploy of torque has been fully debugged and extensively tested 
against: nodes down during config, GPU-capable or not, server started or not, work 
queue already present or not. Most if not all failures have been checked and 
handled.

The oscar_postinstall has been updated with many more checks and error 
handling.

IMHO, with those patches, torque shouldn't be marked experimental anymore.

The only requirement is:
opkg-torque needs torque >= 2.5.4

--
   Olivier LAHAYE
   CEA DRT/LIST/DCSI/DIR
Index: packages/torque/scripts/api-post-deploy
===================================================================
--- packages/torque/scripts/api-post-deploy	(révision 9483)
+++ packages/torque/scripts/api-post-deploy	(copie de travail)
@@ -33,6 +33,18 @@
 my $default;       # Did we type --default on the command line?
 
 #########################################################################
+# Guess TORQUE home is it /usr or /opt/pbs ?
+#########################################################################
+
+my $torque_home;
+if (-f '/usr/bin/pbsnodes') {
+   $torque_home='/usr';
+} else {
+   $torque_home='/opt/pbs';
+}
+
+
+#########################################################################
 #  compactSpaces strips off the leading and trailing spaces from a      #
 #  string.  If you also pass in $compact=1, then it compresses multiple #
 #  spaces within the string down to 1 space.  Also, you can pass in     #
@@ -72,7 +84,7 @@
   open(CMD,"/etc/init.d/pbs_server status |");
   my $result = <CMD>;
   close(CMD);
-  system('/etc/init.d/pbs_server restart')
+  system('/etc/init.d/pbs_server start')
       unless ($result =~ /is running/);
 }
 
@@ -82,6 +94,21 @@
 }
 
 ######################################################################
+#  Check to see if workq is defined                                  #
+######################################################################
+sub isWorkqDefined
+{
+  open(CMD,$torque_home."/bin/qmgr -c 'l q workq' |");
+  my $result = <CMD>;
+  close(CMD);
+  if($result =~ /^Queue workq/) {
+    return 1;
+  } else {
+    return 0;
+  }
+}
+
+######################################################################
 #  Check to see if 'Maui' is running.  If not, then start it.        #
 ######################################################################
 sub restartMaui
@@ -130,7 +157,7 @@
 opkg_print("Updating pbs_server nodes\n");
 
 # Read in the current nodes as shown by 'pbsnodes -a'
-open (CMD,'/opt/pbs/bin/pbsnodes -a |');
+open (CMD,$torque_home.'/bin/pbsnodes -a |');
 my $inp = "";
 my $server = "";
 my $opt = "";
@@ -156,6 +183,7 @@
 # Get listing of nodes from the SIS database
 my %nodes = get_machine_listing($image);
 
+# OL: TODO: Code duplication. Need to use nodes code
 if ($compute_on_head eq "YES") {
   # Get server proc count
   my $server_procs = 0;
@@ -174,6 +202,7 @@
   $added_node{HOST}=$ENV{HOSTNAME};
   $added_node{IPADDR}="";
   $added_node{NUM_PROCS}=$server_procs;
+  $added_node{NUM_GPUS}=0; # Ignore GPUS on head (too much dangerous)
   $added_node{DOMAIN}="";
   $nodes{$ENV{HOSTNAME}}=\%added_node;
 }
@@ -189,10 +218,17 @@
     $hostname = $nodes{$node}{HOST};
     if ($pbsnodes{$hostname})
       {
-        system("/opt/pbs/bin/pbsnodes -o $hostname");
+        system($torque_home."/bin/pbsnodes -o $hostname");
         # Reset the number of processors for this node
-        system('/opt/pbs/bin/qmgr -a -e -c "set node ' . $hostname .
-               ' np = ' . $nodes{$node}{NUM_PROCS} . '"');
+        my $qmgr_cmd=$torque_home.'/bin/qmgr -a -e -c "set node ' . $hostname;
+        $qmgr_cmd.=' np = ' . $nodes{$node}{NUM_PROCS};
+        # and number of GPUs if any
+        if ($nodes{$node}{NUM_GPUS}>0) {
+	    $qmgr_cmd.=', gpus = ' . $nodes{$node}{NUM_GPUS} . '"';
+        } else {
+            $qmgr_cmd.='"';
+        }
+	system($qmgr_cmd);
         # Search the properties for 'all' and add it if not found
         my(@props) = split(/,/,$pbsnodes{$hostname}{'properties '});
         my($alldefined) = "";
@@ -200,16 +236,18 @@
           {
             $alldefined = 1 if $prop eq 'all';
           }
-        system('/opt/pbs/bin/qmgr -a -e -c "set node ' . $hostname . 
+        system($torque_home.'/bin/qmgr -a -e -c "set node ' . $hostname . 
                ' properties += all"') unless $alldefined;
-        system("/opt/pbs/bin/pbsnodes -c $hostname");
+        system($torque_home."/bin/pbsnodes -c $hostname");
         # Remove from pbsnodes hash
         delete $pbsnodes{$nodes{$node}{HOST}};
       }
     else
       { # Didn't find a match -> ADD the node
-        system('/opt/pbs/bin/qmgr -a -e -c "create node ' . $hostname .
-               ' np = ' . $nodes{$node}{NUM_PROCS} . ' , properties = all"');
+        system($torque_home.'/bin/qmgr -a -e -c "create node ' . $hostname .
+               ' np = ' . $nodes{$node}{NUM_PROCS} . 
+               ' , gpus = ' . $nodes{$node}{NUM_GPUS} .
+               ' , properties = all"');
       }
     
     # Count up the number of nodes and processors
@@ -221,102 +259,107 @@
 # Now go through the remaining pbsnodes hash and delete these nodes
 foreach my $node (sortnodes( keys %pbsnodes )) 
   {
-    system('/opt/pbs/bin/qmgr -a -e -c "delete node ' . $node . '"');
+    system($torque_home.'/bin/qmgr -a -e -c "delete node ' . $node . '"');
   }
 restartPBSserver();
 
-# Next, use qmgr command to set up the values for workq
-opkg_print("Creating TORQUE workq queue...\n");
+if(isWorkqDefined()) {
+  opkg_print("workq already defined...Skipping queue creation...\n");
+} else {
+  # Next, use qmgr command to set up the values for workq
+  opkg_print("Creating TORQUE workq queue...\n");
  
-# These are default values set only when not present or when --default
-my @default_params = (
-  'create queue workq',
-  'set queue workq queue_type                 = Execution',
-  'set queue workq resources_max.cput         = 10000:00:00',
-  'set queue workq resources_max.walltime     = 10000:00:00',
-  'set queue workq resources_min.cput         = 00:00:01',
-  'set queue workq resources_min.ncpus        = 1',
-  'set queue workq resources_min.nodect       = 1',
-  'set queue workq resources_min.walltime     = 00:00:01',
-  'set queue workq resources_default.cput     = 10000:00:00',
-  'set queue workq resources_default.ncpus    = 1',
-  'set queue workq resources_default.nodect   = 1',
-  'set queue workq resources_default.walltime = 10000:00:00',
-  'set queue workq enabled                    = True',
-  'set queue workq started                    = True',
-  'set server scheduling                      = True',
-  'set server default_queue                   = workq',
-  'set server mail_from                       = adm',
-  'set server query_other_jobs                = True',
-);
+  # These are default values set only when not present or when --default
+  my @default_params = (
+    'create queue workq',
+    'set queue workq queue_type                 = Execution',
+    'set queue workq resources_max.cput         = 10000:00:00',
+    'set queue workq resources_max.walltime     = 10000:00:00',
+    'set queue workq resources_min.cput         = 00:00:01',
+    'set queue workq resources_min.ncpus        = 1',
+    'set queue workq resources_min.nodect       = 1',
+    'set queue workq resources_min.walltime     = 00:00:01',
+    'set queue workq resources_default.cput     = 10000:00:00',
+    'set queue workq resources_default.ncpus    = 1',
+    'set queue workq resources_default.nodect   = 1',
+    'set queue workq resources_default.walltime = 10000:00:00',
+    'set queue workq enabled                    = True',
+    'set queue workq started                    = True',
+    'set server scheduling                      = True',
+    'set server default_queue                   = workq',
+    'set server mail_from                       = adm',
+    'set server query_other_jobs                = True',
+  );
 
-# These are variable parameters that are set everytime
-my @variable_params = (
-  "set queue workq resources_max.ncpus        = $TOT_NP",
-  "set queue workq resources_max.nodect       = $TOT_NODES",
-  "set queue workq resources_available.nodect = $TOT_NODES",
-  "set server resources_available.ncpus       = $TOT_NP",
-  "set server resources_available.nodect      = $TOT_NODES",
-  "set server resources_available.nodes       = $TOT_NODES",
-  "set server resources_max.ncpus             = $TOT_NP",
-  "set server resources_max.nodes             = $TOT_NODES",
-  "set server scheduler_iteration             = 60",
-  "set server log_events                      = $loglevel",
-);
+  # These are variable parameters that are set everytime
+  my @variable_params = (
+    "set queue workq resources_max.ncpus        = $TOT_NP",
+    "set queue workq resources_max.nodect       = $TOT_NODES",
+    "set queue workq resources_available.nodect = $TOT_NODES",
+    "set server resources_available.ncpus       = $TOT_NP",
+    "set server resources_available.nodect      = $TOT_NODES",
+    "set server resources_available.nodes       = $TOT_NODES",
+    "set server resources_max.ncpus             = $TOT_NP",
+    "set server resources_max.nodes             = $TOT_NODES",
+    "set server scheduler_iteration             = 60",
+    "set server log_events                      = $loglevel",
+  );
 
-# First, read in the current values for the localhost
-undef $/;   # Suck in the entire files into one big string
-open (CMD,'/opt/pbs/bin/qmgr -a -e -c "print server @localhost" |');
-my $cmd_output = <CMD>;
-close (CMD);
+  # First, read in the current values for the localhost
+  undef $/;   # Suck in the entire files into one big string
+  open (CMD,$torque_home.'/bin/qmgr -a -e -c "print server @localhost" |');
+  my $cmd_output = <CMD>;
+  close (CMD);
 
-# Next, build up a command string to pass back to the qmgr command.
-my $line = "";
-my $key = "";
-my $value = "";
-my $cmd_input = "";
+  # Next, build up a command string to pass back to the qmgr command.
+  my $line = "";
+  my $key = "";
+  my $value = "";
+  my $cmd_input = "";
 
-# Search through the default_params for anything missing.
-foreach $line (@default_params)
-  {
-    ($key,$value) = split /=/, $line;
-    $key = compactSpaces($key);
-    $value = compactSpaces($value);
-    if ($value eq "")
-      { # Search for just the $key since there's no value for it.
-        $cmd_input .= "$key\n" if
-          ($cmd_output !~ /$key/m) || 
-          (defined($default) && $key !~ /create/);
-      }
-    else
-      { # Search for "$key =".  Output key and value if needed.
-        $cmd_input .= "$key = $value\n" if
-          ($cmd_output !~ /$key\s*=/m) || defined($default);
-      }
-  }
+  # Search through the default_params for anything missing.
+  foreach $line (@default_params)
+    {
+      ($key,$value) = split /=/, $line;
+      $key = compactSpaces($key);
+      $value = compactSpaces($value);
+      if ($value eq "")
+        { # Search for just the $key since there's no value for it.
+          $cmd_input .= "$key\n" if
+            ($cmd_output !~ /$key/m) || 
+            (defined($default) && $key !~ /create/);
+        }
+      else
+        { # Search for "$key =".  Output key and value if needed.
+          $cmd_input .= "$key = $value\n" if
+            ($cmd_output !~ /$key\s*=/m) || defined($default);
+        }
+    }
 
-# Then, force the setting of the of the variable_params.
-foreach $line (@variable_params)
-  {
-    ($key,$value) = split /=/, $line;
-    $key = compactSpaces($key);
-    $value = compactSpaces($value);
-    $cmd_input .= "$key";
-    $cmd_input .= " = $value"
-      if ($value ne "");
-    $cmd_input .= "\n";
-  }
+  # Then, force the setting of the of the variable_params.
+  foreach $line (@variable_params)
+    {
+      ($key,$value) = split /=/, $line;
+      $key = compactSpaces($key);
+      $value = compactSpaces($value);
+      $cmd_input .= "$key";
+      $cmd_input .= " = $value"
+        if ($value ne "");
+      $cmd_input .= "\n";
+    }
 
-# Run the qmgr command if necessary
-if ($cmd_input ne "")
-  {
-    my $pbs_spool = "/var/spool/pbs";
-    open (CMD,"|/opt/pbs/bin/qmgr -a -e ");
-    print CMD $cmd_input;
-    close (CMD) 
-        or croak("ERROR: Impossible to Configure TORQUE queues, ".
-                 "check the logs at $pbs_spool");
-  }
+  # Run the qmgr command if necessary
+  if ($cmd_input ne "")
+    {
+      my $pbs_server_logs = "/var/log/torque/server_logs";
+      open (CMD,"|".$torque_home."/bin/qmgr -a -e ");
+      print CMD $cmd_input;
+      close (CMD) 
+          or croak("ERROR: Impossible to Configure TORQUE queues, ".
+                   "check the logs at $pbs_server_logs");
+    }
+}
+
 restartMaui();
 
 # Get current state of server's mom
Index: packages/torque/scripts/server-post-install
===================================================================
--- packages/torque/scripts/server-post-install	(révision 9483)
+++ packages/torque/scripts/server-post-install	(copie de travail)
@@ -37,6 +37,9 @@
 #######################################################
 sub startPBSserver
 {
+  if(!-f '/var/lib/torque/server_priv/serverdb' && !-f '/var/spool/pbs/server_priv/serverdb') {
+    system('/etc/init.d/pbs_server create'); # no config file: need to initialise config.
+  }
   system('/etc/init.d/pbs_server restart');
 }
 
@@ -68,10 +71,20 @@
 my $xpbsmonrc;
 
 # Attention: More sophistication needed here.
+# Check for 32/64 bit architecture AND
+# Check if using /opt located oscar torque package or distro torque package
 if ($arch eq "x86_64") {
-    $xpbsmonrc = "/opt/pbs/lib64/xpbsmon/xpbsmonrc";
+    if (-f "/usr/lib64/xpbsmon/xpbsmonrc") {
+        $xpbsmonrc = "/usr/lib64/xpbsmon/xpbsmonrc";
+    } else {
+        $xpbsmonrc = "/opt/pbs/lib64/xpbsmon/xpbsmonrc";
+    } 
 } else {
-    $xpbsmonrc = "/opt/pbs/lib/xpbsmon/xpbsmonrc";
+    if (-f "/usr/lib/xpbsmon/xpbsmonrc") {
+        $xpbsmonrc = "/usr/lib/xpbsmon/xpbsmonrc";
+    } else {
+        $xpbsmonrc = "/opt/pbs/lib/xpbsmon/xpbsmonrc";
+    }
 }
 
 # Eventually 'torque_gui' is not installed, so we cannot configure xpbsmonrc
Index: packages/torque/testing/test_root
===================================================================
--- packages/torque/testing/test_root	(révision 9483)
+++ packages/torque/testing/test_root	(copie de travail)
@@ -14,14 +14,26 @@
 #          Jeremy Enos
 #          Bernard Li
 
+#########################################################################
+# Guess TORQUE home is it /usr or /opt/pbs ?
+#########################################################################
+
+if test -f '/usr/bin/pbsnodes'
+then
+   pbsnodes='/usr/bin/pbsnodes';
+else
+   pbsnodes='/opt/pbs/bin/pbsnodes';
+fi
+
+
 # Stop TORQUE Server if any nodes exhibit "state-unknown" status.
 # This condition could be due to the 5 minute polling interval to the moms
 # by the server.  Restarting the server will induce a fresh poll.
-unknown_nodes=`/opt/pbs/bin/pbsnodes -a |grep -c state-unknown`
+unknown_nodes=`$pbsnodes -a |grep -c state-unknown`
 if [ $unknown_nodes != 0 ] ; then
   /etc/init.d/pbs_server stop
 fi
-unknown_nodes=`/opt/pbs/bin/pbsnodes -a |grep -c state-unknown`
+unknown_nodes=`$pbsnodes -a |grep -c state-unknown`
 if [ $unknown_nodes != 0 ] ; then
   $OSCAR_TESTPRINT --label "TORQUE node check" -f
 #  exit 1;
Index: packages/torque/testing/pbs_test
===================================================================
--- packages/torque/testing/pbs_test	(révision 9483)
+++ packages/torque/testing/pbs_test	(copie de travail)
@@ -30,8 +30,23 @@
 # are failing due to timeouts (not because of failing package tests)
 default_timeout=60
 
-qsub='/opt/pbs/bin/qsub'
-qstat='/opt/pbs/bin/qstat'
+#########################################################################
+# Guess TORQUE home is it /usr or /opt/pbs ?
+#########################################################################
+
+if test -f '/usr/bin/pbsnodes'
+then
+    qsub='/usr/bin/qsub'
+    qdel='/usr/bin/qdel'
+    qstat='/usr/bin/qstat'
+    pbsnodes='/usr/bin/pbsnodes'
+else
+    qsub='/opt/pbs/bin/qsub'
+    qdel='/opt/pbs/bin/qdel'
+    qstat='/opt/pbs/bin/qstat'
+    pbsnodes='/opt/pbs/bin/pbsnodes'
+fi
+
 exit_status=0
 
 # Check usage (since this can be run seperately)
@@ -108,7 +123,7 @@
 }              
 
 function check_free_nodes () {
-  free=`/opt/pbs/bin/pbsnodes -a | grep -c 'state = free'`
+  free=`$pbsnodes -a | grep -c 'state = free'`
   if [ $free -eq $clients ] || [ $free -gt $clients ]; then  
     # at least $clients free nodes
     echo -n ""
@@ -158,8 +173,8 @@
         $OSCAR_TESTPRINT --label "$pbs_test_type" --fail
         job_state=fail
 	exit_status=1
-        if [ `/opt/pbs/bin/qstat | grep -c $job` -eq 1 ]; then
-          /opt/pbs/bin/qdel $job
+        if [ `$qstat | grep -c $job` -eq 1 ]; then
+          $qdel $job
           sleep 1 # Allows nodes to free up again
         fi
         if [ "$exit_on_fail" = yes ]; then waitexit ; fi
Index: packages/torque/testing/test_user
===================================================================
--- packages/torque/testing/test_user	(révision 9483)
+++ packages/torque/testing/test_user	(copie de travail)
@@ -14,6 +14,18 @@
 #          Jeremy Enos
 #          Bernard Li
 
+#########################################################################
+# Guess TORQUE home is it /usr or /opt/pbs ?
+#########################################################################
+
+if test -f '/usr/bin/qmgr'
+then
+   qmgr='/usr/bin/qmgr'
+else
+   qmgr='/opt/pbs/bin/qmgr'
+fi
+
+
 cd ~
 clients=0
 for i in $@ 
@@ -24,7 +36,7 @@
 #                       DEFAULT TORQUE QUEUE
 #                       --------------------
 
-if [ `/opt/pbs/bin/qmgr -c "l s" | grep -c 'default_queue'` -eq 1 ]; then
+if [ `$qmgr -c "l s" | grep -c 'default_queue'` -eq 1 ]; then
         $OSCAR_TESTPRINT --label "TORQUE default queue definition" -p
 else
         $OSCAR_TESTPRINT --label "TORQUE default queue definition" -f
Index: packages/torque/testing/pbs_script.shell
===================================================================
--- packages/torque/testing/pbs_script.shell	(révision 9483)
+++ packages/torque/testing/pbs_script.shell	(copie de travail)
@@ -11,7 +11,18 @@
 ### Queue name
 #PBS -q workq
 
+#########################################################################
+# Guess TORQUE home is it /usr or /opt/pbs ?
+#########################################################################
+
+if test -f '/usr/bin/pbsdsh'
+then
+    pbsdsh='/usr/bin/pbsdsh'
+else
+    pbsdsh='/opt/pbs/bin/pbsdsh'
+fi
+
 ### Script commands
-/opt/pbs/bin/pbsdsh -s hostname
-/opt/pbs/bin/pbsdsh -s date +"Hello, date is %D, time is %T"
+$pbsdsh -s hostname
+$pbsdsh -s date +"Hello, date is %D, time is %T"
 exit
Index: oda/trunk/lib/Database.pm
===================================================================
--- oda/trunk/lib/Database.pm	(révision 9474)
+++ oda/trunk/lib/Database.pm	(copie de travail)
@@ -2294,6 +2294,7 @@
             return 0;
         }
     } else {
+	# This function is inappropriate for node info update (cpu count info)
         print "The node $node is already in the database\n";
     }
     return 1;
Index: systeminstaller-oscar/trunk/lib/SystemInstaller/Machine.pm
===================================================================
--- systeminstaller-oscar/trunk/lib/SystemInstaller/Machine.pm	(révision 9474)
+++ systeminstaller-oscar/trunk/lib/SystemInstaller/Machine.pm	(copie de travail)
@@ -58,6 +58,7 @@
                                       HOST => $machine->{hostname},
                                       DOMAIN => $machine->{domainname},
                                       NUM_PROCS => $machine->{proccount},
+                                      NUM_GPUS => $machine->{gpucount},
                                       IPADDR => @$adapter[0]->{ip}
                                      };
     }
Index: systeminstaller-oscar/trunk/lib/SystemInstaller/Env.pm
===================================================================
--- systeminstaller-oscar/trunk/lib/SystemInstaller/Env.pm	(révision 9474)
+++ systeminstaller-oscar/trunk/lib/SystemInstaller/Env.pm	(copie de travail)
@@ -107,7 +107,7 @@
         # This just returns the version number, looks silly,
         # but the string below is replaced during the build
         # process with the proper version.
-        my $SIVERSION="2.4.2svn20090612";
+        my $SIVERSION="2.4.11";
         return $SIVERSION;
 }
 
Index: systeminstaller-oscar/trunk/lib/SIS/NewDB.pm
===================================================================
--- systeminstaller-oscar/trunk/lib/SIS/NewDB.pm	(révision 9483)
+++ systeminstaller-oscar/trunk/lib/SIS/NewDB.pm	(copie de travail)
@@ -112,11 +112,14 @@
                     name => "Nodes.name",
 #                     route => "Networks.gateway:Nics.node_id=Nodes.id AND Networks.n_id=Nics.network_id",
                     hostname => "Nodes.hostname",
-#                     domainname => "Nodes.dns_domain",
+                    # domainname required by SystemInstaller::Image::get_machine_listing()
+                    domainname => "Nodes.dns_domain",
 #                     arch => "Images.architecture:Nodes.image_id=Images.id",
                     imagename => "Images.name:Images.id=Nodes.image_id",
 #                     name => "Nodes.name",
-#                     proccount => "Nodes.cpu_num",
+                    # proccount required by SystemInstaller::Image::get_machine_listing() and torque api-post-deploy
+                    proccount => "Nodes.cpu_num",
+                    gpucount => "Nodes.gpu_num",
                 },
            );
 
Index: packages/oda/scripts/oscar_table.sql
===================================================================
--- packages/oda/scripts/oscar_table.sql	(révision 9483)
+++ packages/oda/scripts/oscar_table.sql	(copie de travail)
@@ -96,6 +96,7 @@
 CREATE TABLE IF NOT EXISTS Nodes(
     cluster_id  integer not null default 0,
     cpu_num  integer,
+    gpu_num  integer,
     cpu_speed VARCHAR(100),
     dns_domain VARCHAR(100),
     fqdn VARCHAR(100),
Index: scripts/post_install
===================================================================
--- scripts/post_install	(révision 9483)
+++ scripts/post_install	(copie de travail)
@@ -44,6 +44,7 @@
 use OSCAR::Package;
 use Getopt::Long;
 use Carp;
+use Switch;
 
 my ($verbose);
 GetOptions(
@@ -62,9 +63,15 @@
 
 # We currently do not assume that getting the number of procs is a fatal
 # problem
-if (get_numproc()) {
-    warn "WARNING: Impossible to get proc number";
+if(update_node_ressource("cat /proc/cpuinfo",qr/^processor/,"CPU","cpu_num",1)) {
+    warn "WARNING: Impossible to get CPU count";
 }
+if(update_node_ressource("nvidia-smi -L",qr/^GPU/,"GPU","gpu_num",0)) {
+    warn "WARNING: Impossible to get GPU count";
+}
+#if(update_node_ressource("/bin/ls -1 /dev",qr/^nvidia[0-9]+/,"GPU","gpu_num",0)) {
+#    warn "WARNING: Impossible to get GPU count";
+#}
 
 # This makes sure existing nodes have the latest /etc/hosts. 
 # The profile script is execd so we know the path (from opium's post_install)
@@ -112,29 +119,53 @@
 	       @_;
 }
 
+
+# update_node_ressource: Scan nodes for cpu count and GPU count and update SIS base.
+#     - $query_cmd: command to list ressources
+#     - $query filter: regex that isolate ressource
+#     - $ressource name: text to identify ressource (for messages and logs)
+#     - $db_field: database field affected (cpu_num or gpu_num)
+#     - $min_count: (can't have 0 cpu, but it's allowed to have 0 gpu).
 # Return: 0 if success, -1 else.
-sub get_numproc {
+
+sub update_node_ressource($$$$$) {
+	my ($query_cmd, $query_filter, $ressource_name, $db_field, $min_count) = @_;
 	my @machines= sortclients SIS::NewDB::list_client();
 	foreach my $mach (@machines) {
-		my $CMD="/usr/bin/ssh -n ".$mach->{name}." cat /proc/cpuinfo";
-		oscar_log_subsection ("Gathering processor count from " .
-                              $mach->{name}."($CMD)");
-		open (CPIPE,"$CMD |") 
-            or (carp("ERROR: Unable to query machine ".$mach->{name}),next);
+		my $return_code=0;
+		my $CMD="/usr/bin/ssh -n ".$mach->{name}." $query_cmd 2> /dev/null";
+		oscar_log_subsection ("Gathering $ressource_name count from " . $mach->{name});
+		open (CPIPE,"$CMD |") or (carp("ERROR: Can't run $CMD"),next);
 		my $count=0;
 		while (<CPIPE>) {
-			++$count if (/^processor/);
+			++$count if (/$query_filter/);
 		}
-		close(CPIPE);
-		if (($count !~ /^[0-9]+$/) || ($count == 0)) {
-			carp("ERROR: Improper count ($count) returned from machine " .
-                 $mach->{name});
-            return -1;
+		close(CPIPE) or $return_code=$?;
+		if ($return_code == 32512) { # Command not found (e.g. no nvidia-smi)
+			carp("WARNING: '$query_cmd' not found on Node ".$mach->{name}),next;
+		} elsif ($return_code == 65280) { # Node not responding to ssh command
+			carp("ERROR: Node ".$mach->{name}." DOWN! (not responding)"),next;
+		} elsif ($return_code > 0) { # ????
+			carp("ERROR: ($return_code) while trying to run: $CMD"),next;
+		}
+
+		if (($count !~ /^[0-9]+$/) || ($count < $min_count)) {
+			carp("ERROR: Improper count ($count) returned from machine " . $mach->{name});
+			return -1;
 		} else {
 			oscar_log_subsection ("Updating database for machine " .
-                                 $mach->{name} . " cpus=$count.");
-			$mach->{proccount} = $count;
-			SIS::NewDB::set_client($mach);
+                                 $mach->{name} . " $db_field=$count.");
+			my %data =  (
+                		$db_field    => $count,
+                	);
+			if (OSCAR::Database::update_node (
+			    $mach->{'name'},
+			    \%data,
+			    undef,
+			    undef) != 1) {
+				carp "ERROR: Impossible to update node $ressource_name information";
+				return -1;
+    			}
 		}
 	}
     return 0;
------------------------------------------------------------------------------
Everyone hates slow websites. So do we.
Make your web apps faster with AppDynamics
Download AppDynamics Lite for free today:
http://p.sf.net/sfu/appdyn_sfd2d_oct
_______________________________________________
Oscar-devel mailing list
Oscar-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/oscar-devel

Reply via email to