Author: arkurth
Date: Thu Sep 19 15:59:15 2013
New Revision: 1524774

URL: http://svn.apache.org/r1524774
Log:
VCL-16
Added code to xCAT.pm::load which updates reservation.lastcheck when progress 
is detected while a node is being loaded. The value is updated at most once 
every 60 seconds. The parent process of a cluster request uses it in 
new.pm::wait_for_child_reservations to detect that child reservations are still 
loading. Updating reservation.lastcheck prevents the parent from timing out 
the request if a child takes significantly longer to load its node than the 
parent process took.

VCL-727
Added a retry loop to xCAT.pm::_rpower which makes additional attempts if a 
timeout error is encountered. Timeout errors are counted separately from other 
rpower errors. Added the ability to control the number of rpower attempts via 
entries in the variable table: xcat|rpower_error_limit, 
xcat|timeout_error_limit

Modified:
    vcl/trunk/managementnode/lib/VCL/Module/Provisioning/xCAT.pm
    vcl/trunk/mysql/vcl.sql

Modified: vcl/trunk/managementnode/lib/VCL/Module/Provisioning/xCAT.pm
URL: 
http://svn.apache.org/viewvc/vcl/trunk/managementnode/lib/VCL/Module/Provisioning/xCAT.pm?rev=1524774&r1=1524773&r2=1524774&view=diff
==============================================================================
--- vcl/trunk/managementnode/lib/VCL/Module/Provisioning/xCAT.pm (original)
+++ vcl/trunk/managementnode/lib/VCL/Module/Provisioning/xCAT.pm Thu Sep 19 
15:59:15 2013
@@ -283,6 +283,11 @@ sub load {
        # Set to a short delay at the beginning of monitoring, this will be 
increased once installation start is detected
        my $monitor_delay_seconds = 5;
        
+       # Keep track of when reservation.lastcheck was last updated
+       my $update_lastcheck_interval_seconds = 60;
+       my $update_lastcheck_time = time;
+       update_reservation_lastcheck($reservation_id);
+       
        my $previous_nodestat_status;
        my $previous_nodeset_status;
        my $current_time;
@@ -367,6 +372,15 @@ sub load {
                if ($reset_timeout) {
                        $last_change_time = $current_time;
                        $nochange_timeout_time = ($last_change_time + 
$nochange_timeout_seconds);
+                       
+                       # Check how long ago reservation.lastcheck was updated
+                       # Update it occasionally - used by parent reservation 
in cluster requests to detect that child reservations are still loading
+                       # Updating reservation.lastcheck prevents the parent 
from timing out while waiting for children to finish loading
+                       my $update_lastcheck_elapsed = ($current_time - 
$update_lastcheck_time);
+                       if ($update_lastcheck_elapsed >= 
$update_lastcheck_interval_seconds) {
+                               update_reservation_lastcheck($reservation_id);
+                               $update_lastcheck_time = time;
+                       }
                }
                
                #notify($ERRORS{'DEBUG'}, 0, "sleeping for 
$monitor_delay_seconds seconds");
@@ -1802,6 +1816,21 @@ sub _wait_for_off {
                   reset        - Send a hardware reset
                   boot         - If off, then power on. If on, then hard reset.
                   cycle        - Power off, then on
+               
+               Multiple rpower attempts will be made if an error is detected.
+               For non-timeout errors, the default number of attempts is 3.
+               This can be overridden if either of the following variables
+               exist in the variable table in the database:
+                  xcat|rpower_error_limit|<management node hostname>
+                  xcat|rpower_error_limit
+               
+               Timeout errors are counted separately and do not count towards
+               the general error limit. The default number of timeout errors
+               which may be encountered is 5. This can be overridden if either
+               of the following variables exist in the variable table in the
+               database:
+                  xcat|timeout_error_limit|<management node hostname>
+                  xcat|timeout_error_limit
 
 =cut
 
@@ -1824,19 +1853,37 @@ sub _rpower {
                return;
        }
        
+       my $management_node_hostname = 
$self->data->get_management_node_hostname();
+       
        my $command = "$XCAT_ROOT/bin/rpower $computer_node_name 
$rpower_option";
        
-       my $rpower_attempt_limit = 5;
-       my $rpower_attempt_delay = 3;
        my $rpower_attempt = 0;
+       my $rpower_attempt_limit = 
$self->data->get_variable("xcat|rpower_error_limit|$management_node_hostname", 
0) || $self->data->get_variable("xcat|rpower_error_limit", 0);
+       if (!$rpower_error_limit || $rpower_error_limit !~ /^\d+$/) {
+               $rpower_error_limit = 3;
+       }
        
-       RPOWER_ATTEMPT: while ($rpower_attempt++ < $rpower_attempt_limit) {
+       my $timeout_error_count = 0;
+       my $timeout_error_limit = 
$self->data->get_variable("xcat|timeout_error_limit|$management_node_hostname", 
0) || $self->data->get_variable("xcat|timeout_error_limit", 0);
+       if (!$timeout_error_limit || $timeout_error_limit !~ /^\d+$/) {
+               $timeout_error_limit = 5;
+       }
+       
+       my $rinv_attempted = 0;
+       RPOWER_ATTEMPT: while ($rpower_attempt <= 
($rpower_attempt_limit+$timeout_error_count)) {
+               $rpower_attempt++;
+               
                if ($rpower_attempt > 1) {
-                       # Attempt to run rinv to fix any inventory problems 
with the blade
-                       notify($ERRORS{'DEBUG'}, 0, "attempt 
$rpower_attempt/$rpower_attempt_limit: failed to initiate rpower for 
$computer_node_name, running rinv then sleeping for $rpower_attempt_delay 
seconds");
-                       $self->_rinv($computer_node_name);
+                       # Wait a random amount of time to prevent several 
cluster reservations from reattempting at the same time
+                       my $rpower_attempt_delay = 
int(rand($rpower_attempt*2))+1;
+                       
+                       my $notify_string = "attempt 
$rpower_attempt/$rpower_attempt_limit";
+                       if ($timeout_error_count) {
+                               $notify_string .= "+$timeout_error_count 
(timeout errors: $timeout_error_count/$timeout_error_limit)";
+                       }
+                       $notify_string .= ": waiting $rpower_attempt_delay 
before issuing rpower $rpower_option command for $computer_node_name";
+                       notify($ERRORS{'DEBUG'}, 0, $notify_string);
                        sleep $rpower_attempt_delay;
-                       notify($ERRORS{'DEBUG'}, 0, "attempt 
$rpower_attempt/$rpower_attempt_limit: issuing rpower command for 
$computer_node_name, option: $rpower_option");
                }
                
                my ($exit_status, $output) = $self->mn_os->execute($command);
@@ -1844,8 +1891,32 @@ sub _rpower {
                        notify($ERRORS{'WARNING'}, 0, "failed to execute rpower 
command for $computer_node_name");
                        return;
                }
+               elsif (grep(/Error: Timeout/, @$output)) {
+                       # blade2f3-14: Error: Timeout
+                       $timeout_error_count++;
+                       if ($timeout_error_count >= $timeout_error_limit) {
+                               notify($ERRORS{'WARNING'}, 0, "attempt 
$rpower_attempt: failed to issue rpower $rpower_option command for 
$computer_node_name, timeout error limit reached: $timeout_error_count");
+                               return;
+                       }
+                       else {
+                               # Wait a random amount of time to prevent 
several cluster reservations from reattempting at the same time
+                               my $timeout_error_delay = 
int(rand($timeout_error_count*3))+1;
+                               notify($ERRORS{'DEBUG'}, 0, "attempt 
$rpower_attempt: encountered timeout error 
$timeout_error_count/$timeout_error_limit");
+                               next RPOWER_ATTEMPT;
+                       }
+               }
                elsif (grep(/Error:/, @$output)) {
-                       notify($ERRORS{'WARNING'}, 0, "failed to issue rpower 
command for $computer_node_name\ncommand: $command\noutput:\n" . join("\n", 
@$output));
+                       notify($ERRORS{'WARNING'}, 0, "attempt $rpower_attempt: 
failed to issue rpower command for $computer_node_name\ncommand: 
$command\noutput:\n" . join("\n", @$output));
+                       
+                       # Attempt to run rinv once if an error was detected, it 
may fix the following error:
+                       #    Error: Invalid nodes and/or groups in noderange: 
bladex
+                       if (!$rinv_attempted) {
+                               # Attempt to run rinv to fix any inventory 
problems with the blade
+                               notify($ERRORS{'DEBUG'}, 0, "attempt 
$rpower_attempt: failed to initiate rpower for $computer_node_name, attempting 
to run rinv");
+                               $self->_rinv($computer_node_name);
+                               $rinv_attempted = 1;
+                       }
+                       
                        next RPOWER_ATTEMPT;
                }
                
@@ -1870,7 +1941,7 @@ sub _rpower {
                for my $line (@$output) {
                        my ($status) = $line =~ 
/^$computer_node_name:.*\s([^\s]+)$/;
                        if ($status) {
-                               notify($ERRORS{'DEBUG'}, 0, "issued rpower 
command for $computer_node_name, option: $rpower_option, status line: '$line'");
+                               notify($ERRORS{'DEBUG'}, 0, "issued rpower 
$rpower_option command for $computer_node_name, status line: '$line', returning 
'$status'");
                                return $status;
                        }
                }

Modified: vcl/trunk/mysql/vcl.sql
URL: 
http://svn.apache.org/viewvc/vcl/trunk/mysql/vcl.sql?rev=1524774&r1=1524773&r2=1524774&view=diff
==============================================================================
--- vcl/trunk/mysql/vcl.sql (original)
+++ vcl/trunk/mysql/vcl.sql Thu Sep 19 15:59:15 2013
@@ -1888,7 +1888,10 @@ INSERT INTO `variable` (`name`, `seriali
 ('schema-version', 'none', '1'),
 ('timesource|global', 
'none','time.nist.gov,time-a.nist.gov,time-b.nist.gov,time.windows.com'),
 ('acknowledgetimeout', 'none', '900'),
-('connecttimeout', 'none', '900');
+('connecttimeout', 'none', '900'),
+('xcat|timeout_error_limit', 'none', '5'),
+('xcat|rpower_error_limit', 'none', '3');
+
 
 -- 
 -- Dumping data for table `vmprofile`


Reply via email to