Author: arkurth
Date: Mon Feb 2 19:05:02 2009
New Revision: 740078
URL: http://svn.apache.org/viewvc?rev=740078&view=rev
Log:
VCL-75
Added check throughout utils.pm where $? is used. If an external command is
executed and $? gets set to -1, the exit status is assumed to be 0. This is
encountered with Perl 5.8.0 because a bug in SIGCHLD signal handling causes $?
to be incorrectly set to -1.
VCL-20
Added conditions in utils.pm::run_ssh_command and utils.pm::run_scp_command to
attempt the command a 4th time using port 24 if the command failed 3 times. The
code assumes port 22 is used but some management nodes use port 24. This is a
temporary fix until a better solution is designed.
VCL-56
Added 'use English qw( -no_match_vars );' to xCAT.pm. This allows the friendly
$PID variable to be used.
Other - xCAT.pm
Changed a few WARNING notify() messages in xCAT.pm to OK messages. Warnings
were not necessary. Added a CRITICAL notify() message to xCAT.pm if rinstall
fails on the first attempt. This allows an administrator to watch the 2nd
attempt and have a chance to troubleshoot. Also added an NCSU-specific fix to
legacy code in xCAT.pm. One of NCSU's management nodes is configured
differently for Linux images.
Other - new.pm
Changed 1 CRITICAL notify() message to WARNING. This reduces redundant messages
being sent when a reservation fails.
Modified:
incubator/vcl/trunk/managementnode/lib/VCL/Module/Provisioning/xCAT.pm
incubator/vcl/trunk/managementnode/lib/VCL/new.pm
incubator/vcl/trunk/managementnode/lib/VCL/utils.pm
Modified: incubator/vcl/trunk/managementnode/lib/VCL/Module/Provisioning/xCAT.pm
URL:
http://svn.apache.org/viewvc/incubator/vcl/trunk/managementnode/lib/VCL/Module/Provisioning/xCAT.pm?rev=740078&r1=740077&r2=740078&view=diff
==============================================================================
--- incubator/vcl/trunk/managementnode/lib/VCL/Module/Provisioning/xCAT.pm
(original)
+++ incubator/vcl/trunk/managementnode/lib/VCL/Module/Provisioning/xCAT.pm Mon
Feb 2 19:05:02 2009
@@ -56,6 +56,7 @@
use strict;
use warnings;
use diagnostics;
+use English qw( -no_match_vars );
use VCL::utils;
use Fcntl qw(:DEFAULT :flock);
@@ -103,11 +104,11 @@
$XCAT_ROOT = $ENV{XCATROOT};
}
elsif (defined($ENV{XCATROOT})) {
- notify($ERRORS{'WARNING'}, 0, "XCATROOT environment variable is
not defined, using /opt/xcat");
+ notify($ERRORS{'OK'}, 0, "XCATROOT environment variable is not
defined, using /opt/xcat");
$XCAT_ROOT = '/opt/xcat';
}
else {
- notify($ERRORS{'WARNING'}, 0, "XCATROOT environment variable is
not set, using /opt/xcat");
+ notify($ERRORS{'OK'}, 0, "XCATROOT environment variable is not
set, using /opt/xcat");
$XCAT_ROOT = '/opt/xcat';
}
@@ -117,13 +118,13 @@
# Make sure the xCAT root path is valid
if (!-d $XCAT_ROOT) {
notify($ERRORS{'WARNING'}, 0, "unable to initialize xCAT
module, $XCAT_ROOT directory does not exist");
- return 0;
+ return;
}
# Check to make sure one of the expected executables is where it should
be
if (!-x "$XCAT_ROOT/bin/rpower") {
notify($ERRORS{'WARNING'}, 0, "unable to initialize xCAT
module, expected executable was not found: $XCAT_ROOT/bin/rpower");
- return 0;
+ return;
}
notify($ERRORS{'DEBUG'}, 0, "xCAT root path found: $XCAT_ROOT");
@@ -176,7 +177,7 @@
if (!defined($reservation_id));
notify($ERRORS{'OK'}, 0, "architecture not set")
if (!defined($image_architecture));
-
+
# Initialize some timer variables
# Do this here in case goto passes over the declaration
my $sshd_start_time;
@@ -771,7 +772,23 @@
#need to check power, maybe reboot it.
for now fail it
#try to reinstall it once
if ($rinstall_attempts < 2) {
- notify($ERRORS{'WARNING'}, 0,
"$computer_node_name starting rinstall again");
+ my $debugging_message =
"*reservation has NOT failed yet*\n";
+ $debugging_message .= "this
notice is for debugging purposes so that node can be watched during 2nd
rinstall attempt\n";
+ $debugging_message .= "sshd did
not become active on $computer_node_name after first rinstall attempt\n\n";
+
+ $debugging_message .=
"management node: " . $self->data->get_management_node_hostname() . "\n";
+ $debugging_message .= "pid:
" . $PID . "\n";
+ $debugging_message .= "request:
" . $self->data->get_request_id() . "\n";
+ $debugging_message .=
"reservation: " . $self->data->get_reservation_id() . "\n";
+ $debugging_message .=
"state/laststate: " . $self->data->get_request_state_name() . "/" .
$self->data->get_request_laststate_name() . "\n";
+ $debugging_message .=
"computer: " . $self->data->get_computer_host_name() . " (id: " .
$self->data->get_computer_id() . ")\n";
+ $debugging_message .= "user:
" . $self->data->get_user_login_id() . " (id: " .
$self->data->get_user_id() . ")\n";
+ $debugging_message .= "image:
" . $self->data->get_image_name() . " (id: " .
$self->data->get_image_id() . ")\n";
+ $debugging_message .= "image
prettyname: " . $self->data->get_image_prettyname() . "\n";
+ $debugging_message .= "image
size: " . $self->data->get_image_size() . "\n";
+ $debugging_message .= "reload
time: " . $self->data->get_image_reload_time() . "\n";
+
+ notify($ERRORS{'CRITICAL'}, 0,
"$debugging_message");
insertloadlog($reservation_id,
$computer_id, "repeat", "starting install process");
close(TAIL);
goto XCATRINSTALL;
@@ -2402,10 +2419,20 @@
# Get the image repository path
my $image_repository_path = $self->_get_image_repository_path();
+ my $image_repository_path_source = $image_repository_path;
if (!$image_repository_path) {
notify($ERRORS{'WARNING'}, 0, "image repository path could not
be determined");
return;
}
+
+ # Fix for Linux images on henry4
+ my $management_node_hostname =
$self->data->get_management_node_hostname();
+ my $image_os_type = $self->data->get_image_os_type();
+ my $image_os_source_path = $self->data->get_image_os_source_path();
+ if ($management_node_hostname =~ /henry4/i && $image_os_type =~
/linux/i && $image_os_source_path eq 'image') {
+ $image_repository_path_source =~ s/linux_image/image/;
+ notify($ERRORS{'DEBUG'}, 0, "fixed retrieval Linux image path
for henry4: linux_image --> image: $image_repository_path_source");
+ }
# Attempt to copy image from other management nodes
notify($ERRORS{'OK'}, 0, "attempting to copy $image_name from other
management nodes");
@@ -2422,7 +2449,7 @@
notify($ERRORS{'OK'}, 0, "checking if $partner has
$image_name");
# Use ssh to call ls on the partner management node
- my ($ls_exit_status, $ls_output_array_ref) =
run_ssh_command($partner, $image_lib_key, "ls -1 $image_repository_path",
$image_lib_user, '', 1);
+ my ($ls_exit_status, $ls_output_array_ref) =
run_ssh_command($partner, $image_lib_key, "ls -1
$image_repository_path_source", $image_lib_user, '', 1);
# Check if the ssh command failed
if (!$ls_output_array_ref) {
@@ -2449,7 +2476,7 @@
notify($ERRORS{'OK'}, 0, "$image_name exists on $partner,
attempting to copy");
# Attempt copy
- if
(run_scp_command("$image_lib_us...@$partner:$image_repository_path/$image_name*",
$image_repository_path, $image_lib_key)) {
+ if
(run_scp_command("$image_lib_us...@$partner:$image_repository_path_source/$image_name*",
$image_repository_path, $image_lib_key)) {
notify($ERRORS{'OK'}, 0, "$image_name files copied via
SCP");
last;
}
@@ -2602,9 +2629,13 @@
my $du_output = `$du_command`;
# Save the exit status
- my $du_exit_status = $?;
-
- #notify($ERRORS{'DEBUG'}, 0, "du exit staus: $du_exit_status,
output:\n$du_output");
+ my $du_exit_status = $? >> 8;
+
+ # Check if $? = -1, this likely means a Perl CHLD signal bug was
encountered
+ if ($? == -1) {
+ notify($ERRORS{'OK'}, 0, "\$? is set to $?, setting exit status
to 0, Perl bug likely encountered");
+ $du_exit_status = 0;
+ }
# Check the du command output
if ($du_exit_status > 0) {
@@ -2838,6 +2869,13 @@
# Make a copy of the base template file
my $cp_output = `/bin/cp -fv $tmpl_repository_path/$basetmpl
$tmpl_repository_path/$image_name.tmpl 2>&1`;
my $cp_exit_status = $? >> 8;
+
+ # Check if $? = -1, this likely means a Perl CHLD signal bug was
encountered
+ if ($? == -1) {
+ notify($ERRORS{'OK'}, 0, "\$? is set to $?, setting exit status
to 0, Perl bug likely encountered");
+ $cp_exit_status = 0;
+ }
+
if ($cp_exit_status == 0) {
notify($ERRORS{'DEBUG'}, 0, "copied $basetmpl to
$tmpl_repository_path/$image_name.tmpl, output:\n$cp_output");
}
@@ -2898,6 +2936,13 @@
# Delete the template file
my $rm_output = `/bin/rm -fv $tmpl_repository_path/$image_name.tmpl
2>&1`;
my $rm_exit_status = $? >> 8;
+
+ # Check if $? = -1, this likely means a Perl CHLD signal bug was
encountered
+ if ($? == -1) {
+ notify($ERRORS{'OK'}, 0, "\$? is set to $?, setting exit status
to 0, Perl bug likely encountered");
+ $rm_exit_status = 0;
+ }
+
if ($rm_exit_status == 0) {
notify($ERRORS{'DEBUG'}, 0, "deleted
$tmpl_repository_path/$image_name.tmpl, output:\n$rm_output");
}
Modified: incubator/vcl/trunk/managementnode/lib/VCL/new.pm
URL:
http://svn.apache.org/viewvc/incubator/vcl/trunk/managementnode/lib/VCL/new.pm?rev=740078&r1=740077&r2=740078&view=diff
==============================================================================
--- incubator/vcl/trunk/managementnode/lib/VCL/new.pm (original)
+++ incubator/vcl/trunk/managementnode/lib/VCL/new.pm Mon Feb 2 19:05:02 2009
@@ -654,7 +654,7 @@
insertloadlog($reservation_id, $computer_id,
"loadimagecomplete", "$image_name was successfully reloaded on
$computer_short_name");
}
else {
- notify($ERRORS{'CRITICAL'}, 0, "$image_name failed to load on
$computer_short_name, returning");
+ notify($ERRORS{'WARNING'}, 0, "$image_name failed to load on
$computer_short_name, returning");
insertloadlog($reservation_id, $computer_id, "loadimagefailed",
"$image_name failed to load on $computer_short_name");
return;
}
Modified: incubator/vcl/trunk/managementnode/lib/VCL/utils.pm
URL:
http://svn.apache.org/viewvc/incubator/vcl/trunk/managementnode/lib/VCL/utils.pm?rev=740078&r1=740077&r2=740078&view=diff
==============================================================================
--- incubator/vcl/trunk/managementnode/lib/VCL/utils.pm (original)
+++ incubator/vcl/trunk/managementnode/lib/VCL/utils.pm Mon Feb 2 19:05:02 2009
@@ -5469,12 +5469,16 @@
my $pkill_exit_status = $? >> 8;
# Check the pgrep exit status
- if ($pkill_exit_status == 0 || $? == -1) {
- notify($ERRORS{'DEBUG'}, 0, "reservation $reservation_id
process was killed, returning 1");
+ if ($pkill_exit_status == 0) {
+ notify($ERRORS{'OK'}, 0, "reservation $reservation_id process
was killed, returning 1");
+ return 1;
+ }
+ elsif ($? == -1) {
+ notify($ERRORS{'OK'}, 0, "\$? is set to -1, Perl bug likely
encountered, assuming reservation $reservation_id process was killed, returning
1");
return 1;
}
elsif ($pkill_exit_status == 1) {
- notify($ERRORS{'WARNING'}, 0, "process was not found for
reservation $reservation_id, returning 1");
+ notify($ERRORS{'OK'}, 0, "process was not found for reservation
$reservation_id, returning 1");
return 1;
}
else {
@@ -6768,14 +6772,16 @@
# Bits 9-16 of $? contain the child process exit status
$exit_status = $? >> 8;
+ # Ignore the returned value of $? if it is -1
+ # This likely means a Perl bug was encountered
+ # Assume command was successful
+ if ($? == -1) {
+ notify($ERRORS{'OK'}, 0, "exit status changed from
$exit_status to 0, Perl bug likely encountered");
+ $exit_status = 0;
+ }
+
#notify($ERRORS{'DEBUG'}, 0, "\$?: $?, signal: $signal_number,
core dump: $core_dump, exit status: $exit_status");
- ## For some reason the SSH exit status is sometimes
right-padded with 8 0's
- ## Shift right 8 bits to get the real value if it's > 255
- #if ($ssh_exit_status > 255) {
- # $ssh_exit_status = ($ssh_exit_status >> 8);
- #}
-
# Strip out the key warning message from the output
$ssh_output =~ s/\...@{10,}.*man-in-the-middle attacks\.//igs;
@@ -6811,6 +6817,13 @@
# Check the exit status
# ssh exits with the exit status of the remote command or with
255 if an error occurred.
if ($exit_status == 255 || $ssh_output_formatted =~ /lost
connection|reset by peer|no route to host|connection refused|connection timed
out/i) {
+ # Temporary fix for problem of nodes using different
ports
+ if ($attempts == 3) {
+ $max_attempts++;
+ notify($ERRORS{'OK'}, 0, "making 1 more attempt
using port 24");
+ $ssh_command = "$ssh_path $identity_paths -l
$user -p 24 -x $node '$command' 2>&1";
+ }
+
notify($ERRORS{'WARNING'}, 0, "attempt
$attempts/$max_attempts: failed to execute SSH command on $node: $command, exit
status: $exit_status, SSH exits with the exit status of the remote command or
with 255 if an error occurred, output:\n$ssh_output_formatted");
next;
}
@@ -6971,6 +6984,14 @@
# scp exits with 0 on success or >0 if an error occurred
if ($scp_exit_status > 0 || $scp_output =~ /lost
connection|failed|reset by peer|no route to host/i) {
notify($ERRORS{'WARNING'}, 0, "scp error occurred:
attempt $attempts/$max_attempts, command: $scp_command, exit status:
$scp_exit_status, output: $scp_output");
+
+ # Temporary fix for problem of nodes using different
ports
+ if ($attempts == 3) {
+ $max_attempts++;
+ notify($ERRORS{'OK'}, 0, "making 1 more attempt
using port 24");
+ $scp_command = "$scp_path -B $identity_paths-P
24 -p -r $path1 $path2 2>&1";
+ }
+
next;
}
else {
@@ -9936,6 +9957,11 @@
# Save the exit status
$exit_status = $? >> 8;
+ if ($? == -1) {
+ notify($ERRORS{'OK'}, 0, "\$? is set to $?, setting
exit status to 0, Perl bug likely encountered");
+ $exit_status = 0;
+ }
+
# Close the command handle
close(COMMAND);
}