Author: arkurth
Date: Wed Sep 18 15:13:37 2013
New Revision: 1524445

URL: http://svn.apache.org/r1524445
Log:
VCL-16
Updated new.pm::wait_for_child_reservations. The parent reservation was not 
waiting long enough for child reservations to load under some conditions. Added 
checks in waiting loop to detect any change in the child reservation.lastcheck 
values or child reservation computerloadlog entires.

Updated utils.pm::get_current_reservation_lastcheck to accept either a single 
or multiple reservation ID arguments. This is called from 
new.pm::wait_for_child_reservations in order to detect changes from child 
reservations.

VCL-681
Updated utils.pm::run_ssh_command to detect 'Connection timed out during banner 
exchange' errors. These occasionally occur.

Other
Added undefined variable check to Upstart.pm::get_service_names.

Modified:
    vcl/trunk/managementnode/lib/VCL/Module/OS/Linux/init/Upstart.pm
    vcl/trunk/managementnode/lib/VCL/new.pm
    vcl/trunk/managementnode/lib/VCL/utils.pm

Modified: vcl/trunk/managementnode/lib/VCL/Module/OS/Linux/init/Upstart.pm
URL: 
http://svn.apache.org/viewvc/vcl/trunk/managementnode/lib/VCL/Module/OS/Linux/init/Upstart.pm?rev=1524445&r1=1524444&r2=1524445&view=diff
==============================================================================
--- vcl/trunk/managementnode/lib/VCL/Module/OS/Linux/init/Upstart.pm (original)
+++ vcl/trunk/managementnode/lib/VCL/Module/OS/Linux/init/Upstart.pm Wed Sep 18 
15:13:37 2013
@@ -136,6 +136,7 @@ sub get_service_names {
        my %service_name_mappings_reversed = reverse %$SERVICE_NAME_MAPPINGS;
        for my $line (@$output) {
                my ($service_name) = $line =~ /^([^\s\t]+)/;
+               next unless $service_name;
                $service_name_hash{$service_name} = 1 if $service_name;
                if (my $service_name_mapping = 
$service_name_mappings_reversed{$service_name}) {
                        $service_name_hash{$service_name_mapping} = 1;

Modified: vcl/trunk/managementnode/lib/VCL/new.pm
URL: 
http://svn.apache.org/viewvc/vcl/trunk/managementnode/lib/VCL/new.pm?rev=1524445&r1=1524444&r2=1524445&view=diff
==============================================================================
--- vcl/trunk/managementnode/lib/VCL/new.pm (original)
+++ vcl/trunk/managementnode/lib/VCL/new.pm Wed Sep 18 15:13:37 2013
@@ -1170,78 +1170,145 @@ EOF
 
 =head2 wait_for_child_reservations
 
- Parameters  :
- Returns     :
- Description :
+ Parameters  : none
+ Returns     : boolean
+ Description : Called by the parent reservation if a cluster request is being
+               processed. The parent waits for the child reservations to finish
+               loading before proceeding to change the request state to
+               reserved. Child reservations indicate they are finished loading
+               by inserting a 'nodeready' computerloadlog entry.
+               
+               The parent attempts to detect any changes in child reservations
+               by retrieving the reservation.lastupdate values for all children
+               as well was all computerloadlog entries. The parent will wait up
+               to 20 minutes if no change is detected for any children.
 
 =cut
 
 sub wait_for_child_reservations {
        my $self              = shift;
-       
        my $request_data      = $self->data->get_request_data();
        my $request_id        = $self->data->get_request_id();
-       my $reservation_id    = $self->data->get_reservation_id();
        my @reservation_ids   = $self->data->get_reservation_ids();
+       my $reservation_count = $self->data->get_reservation_count();
 
-       # Set limits on how many attempts to make and how long to wait between 
attempts
-       # Wait a long time - 20 minutes
-       my $loop_iteration_limit = 40;
-       my $loop_iteration_delay = 30;
-
-       WAITING_LOOP: for (my $loop_iteration = 1; $loop_iteration <= 
$loop_iteration_limit; $loop_iteration++) {
-               if ($loop_iteration > 1) {
-                       notify($ERRORS{'OK'}, 0, "waiting for 
$loop_iteration_delay seconds");
-                       sleep $loop_iteration_delay;
-               }
+       # Set limits on how long to wait
+       my $overall_timeout_minutes = 60;
+       my $nochange_timeout_minutes = 20;
+       my $monitor_delay_seconds = 3;
+       
+       my $overall_timeout_seconds = ($overall_timeout_minutes * 60);
+       my $nochange_timeout_seconds = ($nochange_timeout_minutes * 60);
+       
+       my $monitor_start_time = time;
+       my $last_change_time = $monitor_start_time;
+       my $current_time;
+       my $nochange_timeout_time = ($monitor_start_time + 
$nochange_timeout_seconds);
+       my $overall_timeout_time = ($monitor_start_time + 
$overall_timeout_seconds);
+       
+       my $previous_lastcheck_info;
+       my $previous_request_loadstate_names;
+       
+       MONITOR_LOADING: while (($current_time = time) < $nochange_timeout_time 
&& $current_time < $overall_timeout_time) {
+               my $total_elapsed_seconds      = ($current_time - 
$monitor_start_time);
+               my $nochange_elapsed_seconds   = ($current_time - 
$last_change_time);
+               my $nochange_remaining_seconds = ($nochange_timeout_time - 
$current_time);
+               my $overall_remaining_seconds  = ($overall_timeout_time - 
$current_time);
+               
+               notify($ERRORS{'DEBUG'}, 0, "waiting for child reservations 
seconds elapsed/until no change timeout: 
$nochange_elapsed_seconds/$nochange_remaining_seconds, unconditional timeout: 
$total_elapsed_seconds/$overall_remaining_seconds");
                
                # Check if request has been deleted
                if (is_request_deleted($request_id)) {
                        notify($ERRORS{'OK'}, 0, "request has been deleted, 
setting computer state to 'available' and exiting");
-                       
-                       # Update state of computer and exit
-                       switch_state($request_data, '', 'available', '', '1');
+                       $self->state_exit('', 'available');
                }
                
-               # Check if all of the reservations are ready according to the 
computerloadlog table
-               my $computerloadlog_reservations_ready = 
reservations_ready($request_id);
-               if ($computerloadlog_reservations_ready) {
-                       notify($ERRORS{'OK'}, 0, "ready: all child reservations 
are ready according to computerloadlog, returning 1");
-                       return 1;
-               }
-               elsif (defined $computerloadlog_reservations_ready) {
-                       notify($ERRORS{'OK'}, 0, "not ready: all child 
reservations are NOT ready according to computerloadlog");
-               }
-               else {
-                       notify($ERRORS{'WARNING'}, 0, "error occurred checking 
if child reservations are ready according to computerloadlog");
-               }
-
-               notify($ERRORS{'OK'}, 0, "attempt 
$loop_iteration/$loop_iteration_limit: waiting for child reservations to become 
ready");
-
-               RESERVATION_LOOP: foreach my $child_reservation_id 
(@reservation_ids) {
-                       # Don't bother checking this reservation
-                       if ($child_reservation_id == $reservation_id) {
-                               next RESERVATION_LOOP;
+               my @reservations_ready;
+               my @reservations_not_ready;
+               my @reservations_failed;
+               my @reservations_lastcheck_changed;
+               my @reservations_loadstate_changed;
+               my @reservations_unknown;
+               
+               # Get reservation.lastcheck value for all reservations
+               my $current_lastcheck_info = 
get_current_reservation_lastcheck(@reservation_ids);
+               $previous_lastcheck_info = $current_lastcheck_info if 
!$previous_lastcheck_info;
+               
+               # Get computerloadlog info for all reservations
+               my $current_request_loadstate_names = 
get_request_loadstate_names($request_id);
+               $previous_request_loadstate_names = 
$current_request_loadstate_names if !$previous_request_loadstate_names;
+               
+               RESERVATION_ID: for my $reservation_id (@reservation_ids) {
+                       if 
(!defined($current_request_loadstate_names->{$reservation_id})) {
+                               notify($ERRORS{'WARNING'}, 0, "request 
loadstate info does not contain a key for reservation $reservation_id:\n" . 
format_data($current_request_loadstate_names));
+                               next RESERVATION_ID;
                        }
                        
-                       # Get the computer ID of the child reservation
-                       my $child_computer_id = 
$request_data->{reservation}{$child_reservation_id}{computer}{id};
-                       notify($ERRORS{'DEBUG'}, 0, "checking reservation 
$child_reservation_id: computer ID=$child_computer_id");
+                       my @previous_reservation_loadstate_names = 
@{$previous_request_loadstate_names->{$reservation_id}};
+                       my @current_reservation_loadstate_names = 
@{$current_request_loadstate_names->{$reservation_id}};
                        
-                       # Get the child reservation's current computer state
-                       my $child_computer_state = 
get_computer_current_state_name($child_computer_id);
-                       notify($ERRORS{'DEBUG'}, 0, "reservation 
$child_reservation_id: computer state=$child_computer_state");
+                       if (grep {$_ eq 'failed'} 
@current_reservation_loadstate_names) {
+                               push @reservations_failed, $reservation_id;
+                               next RESERVATION_ID;
+                       }
+                       elsif (grep {$_ eq 'nodeready'} 
@current_reservation_loadstate_names) {
+                               push @reservations_ready, $reservation_id;
+                               next RESERVATION_ID;
+                       }
+                       elsif (grep {$_ eq 'begin'} 
@current_reservation_loadstate_names) {
+                               push @reservations_not_ready, $reservation_id;
+                       }
+                       else {
+                               push @reservations_unknown, $reservation_id;
+                       }
                        
-                       if ($child_computer_state =~ /^(failed|maintenance)$/) {
-                               notify($ERRORS{'WARNING'}, 0, "aborting, child 
reservation $child_reservation_id computer state: $child_computer_state");
-                               return;
+                       if ($previous_lastcheck_info->{$reservation_id} ne 
$current_lastcheck_info->{$reservation_id}) {
+                               push @reservations_lastcheck_changed, 
$reservation_id;
                        }
-               } ## end foreach my $child_reservation_id (@reservation_ids)
-       } ## end for (my $loop_iteration = 1; $loop_iteration...
+                       if (scalar(@previous_reservation_loadstate_names) != 
scalar(@current_reservation_loadstate_names)) {
+                               push @reservations_loadstate_changed, 
$reservation_id;
+                       }
+               }
+               
+               my $ready_count     = scalar @reservations_ready;
+               my $not_ready_count = scalar @reservations_not_ready;
+               my $unknown_count   = scalar @reservations_unknown;
+               my $failed_count    = scalar @reservations_failed;
+               
+               notify($ERRORS{'DEBUG'}, 0, "current status of reservations:\n" 
.
+                       "ready     : $ready_count (" . join(', ', 
@reservations_ready) . ")\n" .
+                       "not ready : $not_ready_count (" . join(', ', 
@reservations_not_ready) . ")\n" .
+                       "unknown   : $unknown_count (" . join(', ', 
@reservations_unknown) . ")\n" .
+                       "failed    : $failed_count (" . join(', ', 
@reservations_failed) . ")"
+               );
+               
+               if ($failed_count) {
+                       $self->state_exit('failed', 'available');
+               }
+               elsif ($ready_count == $reservation_count) {
+                       notify($ERRORS{'OK'}, 0, "all reservations are ready");
+                       return 1;
+               }
+               
+               
+               # If any changes were detected, reset the nochange timeout
+               if (@reservations_lastcheck_changed || 
@reservations_loadstate_changed) {
+                       notify($ERRORS{'DEBUG'}, 0, "resetting no change 
timeout, detected reservation change:\n" .
+                               "reservation lastcheck changed: (" . join(', ', 
@reservations_lastcheck_changed) . ")\n" .
+                               "reservation loadstate changed: (" . join(', ', 
@reservations_loadstate_changed) . ")"
+                       );
+                       $last_change_time = $current_time;
+                       $nochange_timeout_time = ($last_change_time + 
$nochange_timeout_seconds);
+               }
+               
+               $previous_request_loadstate_names = 
$current_request_loadstate_names;
+               $previous_lastcheck_info = $current_lastcheck_info;
+               sleep $monitor_delay_seconds;
+       }
 
        # If out of main loop, waited maximum amount of time
-       notify($ERRORS{'WARNING'}, 0, "waited maximum amount of time for child 
reservations to become ready, returning 0");
-       return 0;
+       notify($ERRORS{'WARNING'}, 0, "waited maximum amount of time for all 
reservations to become ready");
+       return;
 
 } ## end sub wait_for_child_reservations
 

Modified: vcl/trunk/managementnode/lib/VCL/utils.pm
URL: 
http://svn.apache.org/viewvc/vcl/trunk/managementnode/lib/VCL/utils.pm?rev=1524445&r1=1524444&r2=1524445&view=diff
==============================================================================
--- vcl/trunk/managementnode/lib/VCL/utils.pm (original)
+++ vcl/trunk/managementnode/lib/VCL/utils.pm Wed Sep 18 15:13:37 2013
@@ -1577,7 +1577,7 @@ EOF
                                return 1;
                        }
                        else {
-                               notify($ERRORS{'WARNING'}, $LOGFILE, "failed to 
update request $request_id state to: $state_name/$laststate_name, current 
state: $current_state_name/$current_laststate_name");
+                               notify($ERRORS{'WARNING'}, $LOGFILE, "unable to 
update request $request_id state to: $state_name/$laststate_name, current 
state: $current_state_name/$current_laststate_name");
                                return;
                        }
                }
@@ -2091,42 +2091,57 @@ sub get_reservation_accounts {
 
 =head2 get_current_reservation_lastcheck
 
- Parameters  : $reservation_id
+ Parameters  : @reservation_ids
  Returns     : string
  Description : Retrieves the current value of reservation.lastcheck from the
-               database.
+               database. Either a single reservation ID or multiple reservation
+               IDs may be passed as the argument. If a single reservation ID is
+               passed, a string is returned containing the 
reservation.lastcheck
+               value. If multiple reservation IDs are passed, a hash reference
+               is returned with the keys set to the reservation IDs.
 
 =cut
 
 sub get_current_reservation_lastcheck {
-       my ($reservation_id) = @_;
+       my @reservation_ids = @_;
 
        # Check the passed parameter
-       if (!defined($reservation_id)) {
-               notify($ERRORS{'WARNING'}, 0, "reservation ID was not 
specified");
+       if (!@reservation_ids) {
+               notify($ERRORS{'WARNING'}, 0, "reservation ID argument was not 
specified");
                return;
        }
+       
+       my $reservation_id_string = join(', ', @reservation_ids);
 
        # Create the select statement
        my $select_statement = <<EOF;
 SELECT
+reservation.id,
 reservation.lastcheck
 FROM
 reservation
 WHERE
-reservation.id = $reservation_id
+reservation.id IN ($reservation_id_string)
 EOF
 
        my @selected_rows = database_select($select_statement);
 
        # Check to make sure 1 row was returned
        if (!@selected_rows) {
-               notify($ERRORS{'WARNING'}, 0, "failed to retrieve current 
reservation lastcheck for reservation $reservation_id");
+               notify($ERRORS{'WARNING'}, 0, "failed to retrieve current 
reservation lastcheck for reservations: $reservation_id_string");
                return;
        }
-       
-       my $row = $selected_rows[0];
-       return $row->{lastcheck};
+       elsif (scalar(@selected_rows) == 1) {
+               my $row = $selected_rows[0];
+               return $row->{lastcheck};
+       }
+       else {
+               my $reservation_lastcheck_info = {};
+               for my $row (@selected_rows) {
+                       $reservation_lastcheck_info->{$row->{id}} = 
$row->{lastcheck};
+               }
+               return $reservation_lastcheck_info;
+       }
 }
 
 #/////////////////////////////////////////////////////////////////////////////
@@ -4840,7 +4855,10 @@ sub run_ssh_command {
        my $ssh_output_formatted = '';
        my $attempts = 0;
        my $exit_status = 255;
-
+       
+       my $banner_exchange_error_count = 0;
+       my $banner_exchange_error_limit = 3;
+       
        # Make multiple attempts if failure occurs
        while ($attempts < $max_attempts) {
                $attempts++;
@@ -4851,10 +4869,10 @@ sub run_ssh_command {
                        notify($ERRORS{'DEBUG'}, 0, "sleeping for 
$delay_seconds seconds before making next SSH attempt") if $output_level;
                        sleep $delay_seconds;
                }
-
+               
                ## Add -v (verbose) argument to command if this is the 2nd 
attempt
                #$ssh_command =~ s/$ssh_path/$ssh_path -v/ if $attempts == 2;
-
+               
                # Print the SSH command, only display the attempt # if > 1
                if ($attempts == 1) {
                        notify($ERRORS{'DEBUG'}, 0, "executing SSH command on 
$node: '$command'") if $output_level;
@@ -4963,6 +4981,21 @@ sub run_ssh_command {
                        notify($ERRORS{'WARNING'}, 0, "attempt 
$attempts/$max_attempts: failed to execute SSH command on $node: '$command', 
exit status: $exit_status, output:\n$ssh_output_formatted") if $output_level;
                        next;
                }
+               elsif ($ssh_output_formatted =~ /(Connection timed out during 
banner exchange)/i) {
+                       $banner_exchange_error_count++;
+                       if ($banner_exchange_error_count >= 
$banner_exchange_error_limit) {
+                               notify($ERRORS{'WARNING'}, 0, "failed to 
execute SSH command on $node, encountered $banner_exchange_error_count banner 
exchange errors");
+                               return ();
+                       }
+                       else {
+                               # Don't count against attempt limit
+                               $attempts--;
+                               my $banner_exchange_delay_seconds = 
($banner_exchange_error_count * 2);
+                               notify($ERRORS{'DEBUG'}, 0, "encountered banner 
exchange error on $node, sleeping for $banner_exchange_delay_seconds seconds, 
command:\n$command\noutput:\n$ssh_output") if $output_level;
+                               sleep $banner_exchange_delay_seconds;
+                               next;
+                       }
+               }
                elsif ($exit_status == 255 && $ssh_command !~ 
/(vmware-cmd|vim-cmd|vmkfstools|vmrun)/i) {
                        notify($ERRORS{'WARNING'}, 0, "attempt 
$attempts/$max_attempts: failed to execute SSH command on $node: '$command', 
exit status: $exit_status, SSH exits with the exit status of the remote command 
or with 255 if an error occurred, output:\n$ssh_output_formatted") if 
$output_level;
                        next;
@@ -8729,7 +8762,7 @@ EOF
                $computerloadlog_string .= "\n";
        }
        
-       notify($ERRORS{'DEBUG'}, 0, "retrieved computerloadstate names for 
request $request_id:\n$computerloadlog_string");
+       #notify($ERRORS{'DEBUG'}, 0, "retrieved computerloadstate names for 
request $request_id:\n$computerloadlog_string");
        return $computerloadlog_info;
 }
 


Reply via email to