Author: arkurth
Date: Mon Feb  2 19:05:02 2009
New Revision: 740078

URL: http://svn.apache.org/viewvc?rev=740078&view=rev
Log:
VCL-75
Added check throughout utils.pm where $? is used. If an external command is 
executed and $? gets set to -1, the exit status is assumed to be 0. This is 
encountered with Perl 5.8.0 because a bug in SIGCHLD signal handling causes $? 
to be incorrectly set to -1.

VCL-20
Added conditions in utils.pm::run_ssh_command and utils.pm::run_scp_command to 
attempt the command a 4th time using port 24 if the command failed 3 times. The 
code assumes port 22 is used but some management nodes use port 24. This is a 
temporary fix until a better solution is designed.

VCL-56
Added 'use English qw( -no_match_vars );' to xCAT.pm. This allows the friendly 
$PID variable to be used.

Other - xCAT.pm
Changed a few WARNING notify() messages in xCAT.pm to OK messages. Warnings 
were not necessary. Added a CRITICAL notify() message to xCAT.pm if rinstall 
fails on the first attempt. This allows an administrator to watch the 2nd 
attempt and have a chance to troubleshoot. Also added an NCSU-specific fix to 
legacy code in xCAT.pm. One of NCSU's management nodes is configured 
differently for Linux images.

Other - new.pm
Changed 1 CRITICAL notify() message to WARNING. This reduces redundant messages 
being sent when a reservation fails.

Modified:
    incubator/vcl/trunk/managementnode/lib/VCL/Module/Provisioning/xCAT.pm
    incubator/vcl/trunk/managementnode/lib/VCL/new.pm
    incubator/vcl/trunk/managementnode/lib/VCL/utils.pm

Modified: incubator/vcl/trunk/managementnode/lib/VCL/Module/Provisioning/xCAT.pm
URL: 
http://svn.apache.org/viewvc/incubator/vcl/trunk/managementnode/lib/VCL/Module/Provisioning/xCAT.pm?rev=740078&r1=740077&r2=740078&view=diff
==============================================================================
--- incubator/vcl/trunk/managementnode/lib/VCL/Module/Provisioning/xCAT.pm 
(original)
+++ incubator/vcl/trunk/managementnode/lib/VCL/Module/Provisioning/xCAT.pm Mon 
Feb  2 19:05:02 2009
@@ -56,6 +56,7 @@
 use strict;
 use warnings;
 use diagnostics;
+use English qw( -no_match_vars );
 
 use VCL::utils;
 use Fcntl qw(:DEFAULT :flock);
@@ -103,11 +104,11 @@
                $XCAT_ROOT = $ENV{XCATROOT};
        }
        elsif (defined($ENV{XCATROOT})) {
-               notify($ERRORS{'WARNING'}, 0, "XCATROOT environment variable is 
not defined, using /opt/xcat");
+               notify($ERRORS{'OK'}, 0, "XCATROOT environment variable is not 
defined, using /opt/xcat");
                $XCAT_ROOT = '/opt/xcat';
        }
        else {
-               notify($ERRORS{'WARNING'}, 0, "XCATROOT environment variable is 
not set, using /opt/xcat");
+               notify($ERRORS{'OK'}, 0, "XCATROOT environment variable is not 
set, using /opt/xcat");
                $XCAT_ROOT = '/opt/xcat';
        }
 
@@ -117,13 +118,13 @@
        # Make sure the xCAT root path is valid
        if (!-d $XCAT_ROOT) {
                notify($ERRORS{'WARNING'}, 0, "unable to initialize xCAT 
module, $XCAT_ROOT directory does not exist");
-               return 0;
+               return;
        }
 
        # Check to make sure one of the expected executables is where it should 
be
        if (!-x "$XCAT_ROOT/bin/rpower") {
                notify($ERRORS{'WARNING'}, 0, "unable to initialize xCAT 
module, expected executable was not found: $XCAT_ROOT/bin/rpower");
-               return 0;
+               return;
        }
        notify($ERRORS{'DEBUG'}, 0, "xCAT root path found: $XCAT_ROOT");
 
@@ -176,7 +177,7 @@
          if (!defined($reservation_id));
        notify($ERRORS{'OK'}, 0, "architecture not set")
          if (!defined($image_architecture));
-
+       
        # Initialize some timer variables
        # Do this here in case goto passes over the declaration
        my $sshd_start_time;
@@ -771,7 +772,23 @@
                                        #need to check power, maybe reboot it. 
for now fail it
                                        #try to reinstall it once
                                        if ($rinstall_attempts < 2) {
-                                               notify($ERRORS{'WARNING'}, 0, 
"$computer_node_name starting rinstall again");
+                                               my $debugging_message = 
"*reservation has NOT failed yet*\n";
+                                               $debugging_message .= "this 
notice is for debugging purposes so that node can be watched during 2nd 
rinstall attempt\n";
+                                               $debugging_message .= "sshd did 
not become active on $computer_node_name after first rinstall attempt\n\n";
+                                               
+                                               $debugging_message .= 
"management node:     " . $self->data->get_management_node_hostname() . "\n";
+                                               $debugging_message .= "pid:     
            " . $PID . "\n";
+                                               $debugging_message .= "request: 
            " . $self->data->get_request_id() . "\n";
+                                               $debugging_message .= 
"reservation:         " . $self->data->get_reservation_id() . "\n";
+                                               $debugging_message .= 
"state/laststate:     " . $self->data->get_request_state_name() . "/" . 
$self->data->get_request_laststate_name() . "\n";
+                                               $debugging_message .= 
"computer:            " . $self->data->get_computer_host_name() . " (id: " . 
$self->data->get_computer_id() . ")\n";
+                                               $debugging_message .= "user:    
            " . $self->data->get_user_login_id() . " (id: " . 
$self->data->get_user_id() . ")\n";
+                                               $debugging_message .= "image:   
            " . $self->data->get_image_name() . " (id: " . 
$self->data->get_image_id() . ")\n";
+                                               $debugging_message .= "image 
prettyname:    " . $self->data->get_image_prettyname() . "\n";
+                                               $debugging_message .= "image 
size:          " . $self->data->get_image_size() . "\n";
+                                               $debugging_message .= "reload 
time:         " . $self->data->get_image_reload_time() . "\n";
+
+                                               notify($ERRORS{'CRITICAL'}, 0, 
"$debugging_message");
                                                insertloadlog($reservation_id, 
$computer_id, "repeat", "starting install process");
                                                close(TAIL);
                                                goto XCATRINSTALL;
@@ -2402,10 +2419,20 @@
 
        # Get the image repository path
        my $image_repository_path = $self->_get_image_repository_path();
+       my $image_repository_path_source = $image_repository_path;
        if (!$image_repository_path) {
                notify($ERRORS{'WARNING'}, 0, "image repository path could not 
be determined");
                return;
        }
+       
+       # Fix for Linux images on henry4
+       my $management_node_hostname = 
$self->data->get_management_node_hostname();
+       my $image_os_type            = $self->data->get_image_os_type();
+       my $image_os_source_path     = $self->data->get_image_os_source_path();
+       if ($management_node_hostname =~ /henry4/i && $image_os_type =~ 
/linux/i && $image_os_source_path eq 'image') {
+               $image_repository_path_source =~ s/linux_image/image/;
+               notify($ERRORS{'DEBUG'}, 0, "fixed retrieval Linux image path 
for henry4: linux_image --> image: $image_repository_path_source");
+       }
 
        # Attempt to copy image from other management nodes
        notify($ERRORS{'OK'}, 0, "attempting to copy $image_name from other 
management nodes");
@@ -2422,7 +2449,7 @@
                notify($ERRORS{'OK'}, 0, "checking if $partner has 
$image_name");
 
                # Use ssh to call ls on the partner management node
-               my ($ls_exit_status, $ls_output_array_ref) = 
run_ssh_command($partner, $image_lib_key, "ls -1 $image_repository_path", 
$image_lib_user, '', 1);
+               my ($ls_exit_status, $ls_output_array_ref) = 
run_ssh_command($partner, $image_lib_key, "ls -1 
$image_repository_path_source", $image_lib_user, '', 1);
 
                # Check if the ssh command failed
                if (!$ls_output_array_ref) {
@@ -2449,7 +2476,7 @@
                notify($ERRORS{'OK'}, 0, "$image_name exists on $partner, 
attempting to copy");
 
                # Attempt copy
-               if 
(run_scp_command("$image_lib_us...@$partner:$image_repository_path/$image_name*",
 $image_repository_path, $image_lib_key)) {
+               if 
(run_scp_command("$image_lib_us...@$partner:$image_repository_path_source/$image_name*",
 $image_repository_path, $image_lib_key)) {
                        notify($ERRORS{'OK'}, 0, "$image_name files copied via 
SCP");
                        last;
                }
@@ -2602,9 +2629,13 @@
        my $du_output = `$du_command`;
 
        # Save the exit status
-       my $du_exit_status = $?;
-
-       #notify($ERRORS{'DEBUG'}, 0, "du exit staus: $du_exit_status, 
output:\n$du_output");
+       my $du_exit_status = $? >> 8;
+       
+       # Check if $? = -1, this likely means a Perl CHLD signal bug was 
encountered
+       if ($? == -1) {
+               notify($ERRORS{'OK'}, 0, "\$? is set to $?, setting exit status 
to 0, Perl bug likely encountered");
+               $du_exit_status = 0;
+       }
 
        # Check the du command output
        if ($du_exit_status > 0) {
@@ -2838,6 +2869,13 @@
        # Make a copy of the base template file
        my $cp_output = `/bin/cp -fv  $tmpl_repository_path/$basetmpl 
$tmpl_repository_path/$image_name.tmpl 2>&1`;
        my $cp_exit_status = $? >> 8;
+       
+       # Check if $? = -1, this likely means a Perl CHLD signal bug was 
encountered
+       if ($? == -1) {
+               notify($ERRORS{'OK'}, 0, "\$? is set to $?, setting exit status 
to 0, Perl bug likely encountered");
+               $cp_exit_status = 0;
+       }
+       
        if ($cp_exit_status == 0) {
                notify($ERRORS{'DEBUG'}, 0, "copied $basetmpl to 
$tmpl_repository_path/$image_name.tmpl, output:\n$cp_output");
        }
@@ -2898,6 +2936,13 @@
        # Delete the template file
        my $rm_output = `/bin/rm -fv  $tmpl_repository_path/$image_name.tmpl 
2>&1`;
        my $rm_exit_status = $? >> 8;
+       
+       # Check if $? = -1, this likely means a Perl CHLD signal bug was 
encountered
+       if ($? == -1) {
+               notify($ERRORS{'OK'}, 0, "\$? is set to $?, setting exit status 
to 0, Perl bug likely encountered");
+               $rm_exit_status = 0;
+       }
+       
        if ($rm_exit_status == 0) {
                notify($ERRORS{'DEBUG'}, 0, "deleted 
$tmpl_repository_path/$image_name.tmpl, output:\n$rm_output");
        }

Modified: incubator/vcl/trunk/managementnode/lib/VCL/new.pm
URL: 
http://svn.apache.org/viewvc/incubator/vcl/trunk/managementnode/lib/VCL/new.pm?rev=740078&r1=740077&r2=740078&view=diff
==============================================================================
--- incubator/vcl/trunk/managementnode/lib/VCL/new.pm (original)
+++ incubator/vcl/trunk/managementnode/lib/VCL/new.pm Mon Feb  2 19:05:02 2009
@@ -654,7 +654,7 @@
                insertloadlog($reservation_id, $computer_id, 
"loadimagecomplete", "$image_name was successfully reloaded on 
$computer_short_name");
        }
        else {
-               notify($ERRORS{'CRITICAL'}, 0, "$image_name failed to load on 
$computer_short_name, returning");
+               notify($ERRORS{'WARNING'}, 0, "$image_name failed to load on 
$computer_short_name, returning");
                insertloadlog($reservation_id, $computer_id, "loadimagefailed", 
"$image_name failed to load on $computer_short_name");
                return;
        }

Modified: incubator/vcl/trunk/managementnode/lib/VCL/utils.pm
URL: 
http://svn.apache.org/viewvc/incubator/vcl/trunk/managementnode/lib/VCL/utils.pm?rev=740078&r1=740077&r2=740078&view=diff
==============================================================================
--- incubator/vcl/trunk/managementnode/lib/VCL/utils.pm (original)
+++ incubator/vcl/trunk/managementnode/lib/VCL/utils.pm Mon Feb  2 19:05:02 2009
@@ -5469,12 +5469,16 @@
        my $pkill_exit_status = $? >> 8;
        
        # Check the pgrep exit status
-       if ($pkill_exit_status == 0 || $? == -1) {
-               notify($ERRORS{'DEBUG'}, 0, "reservation $reservation_id 
process was killed, returning 1");
+       if ($pkill_exit_status == 0) {
+               notify($ERRORS{'OK'}, 0, "reservation $reservation_id process 
was killed, returning 1");
+               return 1;
+       }
+       elsif ($? == -1) {
+               notify($ERRORS{'OK'}, 0, "\$? is set to -1, Perl bug likely 
encountered, assuming reservation $reservation_id process was killed, returning 
1");
                return 1;
        }
        elsif ($pkill_exit_status == 1) {
-               notify($ERRORS{'WARNING'}, 0, "process was not found for 
reservation $reservation_id, returning 1");
+               notify($ERRORS{'OK'}, 0, "process was not found for reservation 
$reservation_id, returning 1");
                return 1;
        }
        else {
@@ -6768,14 +6772,16 @@
                # Bits 9-16 of $? contain the child process exit status
                $exit_status = $? >> 8;
                
+               # Ignore the returned value of $? if it is -1
+               # This likely means a Perl bug was encountered
+               # Assume command was successful
+               if ($? == -1) {
+                       notify($ERRORS{'OK'}, 0, "exit status changed from 
$exit_status to 0, Perl bug likely encountered");
+                       $exit_status = 0;
+               }
+               
                #notify($ERRORS{'DEBUG'}, 0, "\$?: $?, signal: $signal_number, 
core dump: $core_dump, exit status: $exit_status");
 
-               ## For some reason the SSH exit status is sometimes 
right-padded with 8 0's
-               ## Shift right 8 bits to get the real value if it's > 255
-               #if ($ssh_exit_status > 255) {
-               #       $ssh_exit_status = ($ssh_exit_status >> 8);
-               #}
-
                # Strip out the key warning message from the output
                $ssh_output =~ s/\...@{10,}.*man-in-the-middle attacks\.//igs;
                
@@ -6811,6 +6817,13 @@
                # Check the exit status
                # ssh exits with the exit status of the remote command or with 
255 if an error occurred.
                if ($exit_status == 255 || $ssh_output_formatted =~ /lost 
connection|reset by peer|no route to host|connection refused|connection timed 
out/i) {
+                       # Temporary fix for problem of nodes using different 
ports
+                       if ($attempts == 3) {
+                               $max_attempts++;
+                               notify($ERRORS{'OK'}, 0, "making 1 more attempt 
using port 24");
+                               $ssh_command = "$ssh_path $identity_paths -l 
$user -p 24 -x $node '$command' 2>&1";
+                       }
+                       
                        notify($ERRORS{'WARNING'}, 0, "attempt 
$attempts/$max_attempts: failed to execute SSH command on $node: $command, exit 
status: $exit_status, SSH exits with the exit status of the remote command or 
with 255 if an error occurred, output:\n$ssh_output_formatted");
                        next;
                }
@@ -6971,6 +6984,14 @@
                # scp exits with 0 on success or >0 if an error occurred
                if ($scp_exit_status > 0 || $scp_output =~ /lost 
connection|failed|reset by peer|no route to host/i) {
                        notify($ERRORS{'WARNING'}, 0, "scp error occurred: 
attempt $attempts/$max_attempts, command: $scp_command, exit status: 
$scp_exit_status, output: $scp_output");
+                       
+                       # Temporary fix for problem of nodes using different 
ports
+                       if ($attempts == 3) {
+                               $max_attempts++;
+                               notify($ERRORS{'OK'}, 0, "making 1 more attempt 
using port 24");
+                               $scp_command = "$scp_path -B $identity_paths-P 
24 -p -r $path1 $path2 2>&1";
+                       }
+                       
                        next;
                }
                else {
@@ -9936,6 +9957,11 @@
                # Save the exit status
                $exit_status = $? >> 8;
                
+               if ($? == -1) {
+                       notify($ERRORS{'OK'}, 0, "\$? is set to $?, setting 
exit status to 0, Perl bug likely encountered");
+                       $exit_status = 0;
+               }
+               
                # Close the command handle
                close(COMMAND);
        }


Reply via email to