Author: arkurth
Date: Thu Jun 13 16:37:14 2013
New Revision: 1492747
URL: http://svn.apache.org/r1492747
Log:
VCL-16
Fixed a few issues with reserved.pm and inuse.pm. The parent reservation's
reserved process could exit before all of the child reservations' reserved
processes had exited. Added a check to make sure a computerloadlog 'reserved'
entry exists for all child reservations before the parent exits.
Also added a check to make sure the reservation wasn't deleted and that none of
the child reserved processes failed.
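Fixed inuse.pm to properly handle the connecttimeout variable. If connecttimeout
was set to a long duration, the user connection check could run into the end
time countdown. The connection check is now skipped when the remaining time is
less than the connect timeout plus the end time notice period.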
Modified:
vcl/trunk/managementnode/lib/VCL/inuse.pm
vcl/trunk/managementnode/lib/VCL/reserved.pm
vcl/trunk/managementnode/lib/VCL/utils.pm
Modified: vcl/trunk/managementnode/lib/VCL/inuse.pm
URL:
http://svn.apache.org/viewvc/vcl/trunk/managementnode/lib/VCL/inuse.pm?rev=1492747&r1=1492746&r2=1492747&view=diff
==============================================================================
--- vcl/trunk/managementnode/lib/VCL/inuse.pm (original)
+++ vcl/trunk/managementnode/lib/VCL/inuse.pm Thu Jun 13 16:37:14 2013
@@ -79,7 +79,7 @@ use strict;
use warnings;
use diagnostics;
-use POSIX;
+use POSIX qw(ceil floor strftime);
use VCL::utils;
##############################################################################
@@ -101,31 +101,35 @@ use VCL::utils;
sub process {
my $self = shift;
- my $request_id = $self->data->get_request_id();
- my $request_state_name = $self->data->get_request_state_name();
- my $request_laststate_name = $self->data->get_request_laststate_name();
- my $request_start = $self->data->get_request_start_time();
- my $request_end = $self->data->get_request_end_time();
- my $request_data = $self->data->get_request_data();
- my $request_forimaging = $self->data->get_request_forimaging();
- my $request_checkuser = $self->data->get_request_checkuser();
- my $reservation_id = $self->data->get_reservation_id();
- my $reservation_count = $self->data->get_reservation_count();
- my $server_request_id = $self->data->get_server_request_id();
- my $imagemeta_checkuser = $self->data->get_imagemeta_checkuser();
- my $is_parent_reservation = $self->data->is_parent_reservation();
- my $computer_id = $self->data->get_computer_id();
- my $computer_short_name = $self->data->get_computer_short_name();
- my $connect_timeout_minutes =
$self->data->get_variable('connect_timeout_minutes') || 15;
+ my $request_id = $self->data->get_request_id();
+ my $request_state_name = $self->data->get_request_state_name();
+ my $request_laststate_name = $self->data->get_request_laststate_name();
+ my $request_start = $self->data->get_request_start_time();
+ my $request_end = $self->data->get_request_end_time();
+ my $request_data = $self->data->get_request_data();
+ my $request_forimaging = $self->data->get_request_forimaging();
+ my $request_checkuser = $self->data->get_request_checkuser();
+ my $reservation_id = $self->data->get_reservation_id();
+ my $reservation_count = $self->data->get_reservation_count();
+ my $server_request_id = $self->data->get_server_request_id();
+ my $imagemeta_checkuser = $self->data->get_imagemeta_checkuser();
+ my $is_parent_reservation = $self->data->is_parent_reservation();
+ my $computer_id = $self->data->get_computer_id();
+ my $computer_short_name = $self->data->get_computer_short_name();
+ my $connect_timeout_seconds =
$self->data->get_variable('connecttimeout') || (15 * 60);
# Make sure connect timeout is long enough
# It has to be a bit longer than the ~5 minute period between inuse
checks due to cluster reservations
# If too short, a user may be connected to one computer in a cluster
and another inuse process times out before the connected computer is checked
+ my $connect_timeout_minutes = ceil($connect_timeout_seconds / 60);
if ($connect_timeout_minutes < 10) {
notify($ERRORS{'WARNING'}, 0, "connect timeout is set to
$connect_timeout_minutes minutes, it must be 10 minutes or more");
$connect_timeout_minutes = 10;
}
+ # Connect timeout must be in whole minutes
+ $connect_timeout_seconds = ($connect_timeout_minutes * 60);
+
# Check if reboot operation was requested
if ($request_state_name =~ /reboot/) {
if ($self->os->can('reboot')) {
@@ -136,8 +140,7 @@ sub process {
else {
notify($ERRORS{'CRITICAL'}, 0, "'$request_state_name'
operation requested, " . ref($self->os) . " does not implement a 'reboot'
subroutine");
}
- update_request_state($request_id, "inuse", "inuse");
- notify($ERRORS{'OK'}, 0, "exiting");
+ switch_state($request_data, 'inuse', 'inuse');
exit;
}
@@ -146,7 +149,7 @@ sub process {
if (!$self->os->manage_server_access()) {
notify($ERRORS{'CRITICAL'}, 0, "failed to update server
access");
}
- update_request_state($request_id, "inuse", "inuse");
+ switch_state($request_data, 'inuse', 'inuse');
exit;
}
@@ -154,17 +157,27 @@ sub process {
delete_computerloadlog_reservation($reservation_id, '!begin');
my $now_epoch_seconds = time;
+
my $request_start_epoch_seconds =
convert_to_epoch_seconds($request_start);
my $request_end_epoch_seconds = convert_to_epoch_seconds($request_end);
+
my $request_remaining_seconds = ($request_end_epoch_seconds -
$now_epoch_seconds);
my $request_remaining_minutes = floor($request_remaining_seconds / 60);
+
my $request_duration_seconds = ($request_end_epoch_seconds -
$request_start_epoch_seconds);
my $request_duration_hours = floor($request_duration_seconds / 60 / 60);
my $end_time_notify_minutes = 10;
my $end_time_notify_seconds = ($end_time_notify_minutes * 60);
+ my $now_string = strftime('%H:%M:%S',
localtime($now_epoch_seconds));
+ my $request_end_string = strftime('%H:%M:%S',
localtime($request_end_epoch_seconds));
+ my $request_remaining_string = strftime('%H:%M:%S',
gmtime($request_remaining_seconds));
+ my $end_time_notify_string = strftime('%H:%M:%S',
gmtime($end_time_notify_seconds));
+ my $connect_timeout_string = strftime('%H:%M:%S',
gmtime($connect_timeout_seconds));
+
# Check if near the end time
+ # Compare remaining minutes to connect timeout minutes in case this is > 15 minutes
if ($request_remaining_minutes <= ($end_time_notify_minutes + 6)) {
# Only 1 reservation needs to handle the end time countdown
if (!$is_parent_reservation) {
@@ -172,11 +185,6 @@ sub process {
exit;
}
- my $now_string = strftime('%H:%M:%S',
localtime($now_epoch_seconds));
- my $request_end_string = strftime('%H:%M:%S',
localtime($request_end_epoch_seconds));
- my $request_remaining_string = strftime('%H:%M:%S',
gmtime($request_remaining_seconds));
- my $end_time_notify_string = strftime('%H:%M:%S',
gmtime($end_time_notify_seconds));
-
my $sleep_seconds = ($request_remaining_seconds -
$end_time_notify_seconds);
if ($sleep_seconds > 0) {
my $sleep_string = strftime('%H:%M:%S',
gmtime($sleep_seconds));
@@ -218,7 +226,7 @@ sub process {
# Check if the user extended the request
if ($current_request_end_epoch_seconds >
$request_end_epoch_seconds) {
notify($ERRORS{'OK'}, 0, "user extended
request, end time: $request_end --> $current_request_end, returning request to
inuse state");
- update_request_state($request_id, "inuse",
"inuse");
+ switch_state($request_data, 'inuse', 'inuse');
exit;
}
@@ -241,7 +249,7 @@ sub process {
notify($ERRORS{'OK'}, 0, "initiating image auto-capture
process");
if (!$self->_start_imaging_request()) {
notify($ERRORS{'CRITICAL'}, 0, "failed to
initiate image auto-capture process, changing request and computer state to
maintenance");
- update_request_state($request_id,
'maintenance', 'maintenance');
+ switch_state($request_data, 'maintenance',
'maintenance');
exit;
}
}
@@ -263,10 +271,27 @@ sub process {
notify($ERRORS{'DEBUG'}, 0, "skipping end time notice interval
check, request duration: $request_duration_hours hours, parent reservation:
$is_parent_reservation");
}
+ # Compare remaining minutes to connect timeout
+ # Connect timeout may be longer than 15 minutes
+ # Make sure connect timeout doesn't run into the end time notice
+ if ($request_remaining_minutes < ($connect_timeout_minutes +
$end_time_notify_minutes)) {
+ notify($ERRORS{'DEBUG'}, 0, "skipping user connection check,
connect timeout would run into the end time notice stage:\n" .
+ "current time : $now_string\n" .
+ "request end time : $request_end_string\n" .
+ "remaining time : $request_remaining_string\n" .
+ "notify time : $end_time_notify_string\n" .
+ "connect timeout : $connect_timeout_string"
+ );
+ switch_state($request_data, 'inuse', 'inuse');
+ exit;
+ }
+
# Check if the computer is responding to SSH
+ # Skip connection checks if the computer is not responding to SSH
+ # This prevents a reservatino from timing out if the user is actually
connected but SSH from the management node isn't working
if (!$self->os->is_ssh_responding()) {
notify($ERRORS{'OK'}, 0, "$computer_short_name is not
responding to SSH, skipping user connection check");
- update_request_state($request_id, "inuse", "inuse");
+ switch_state($request_data, 'inuse', 'inuse');
exit;
}
@@ -275,9 +300,7 @@ sub process {
if ($request_laststate_name ne 'reserved' &&
$self->os->can('firewall_compare_update')) {
$self->os->firewall_compare_update();
}
-
- # Skip connection checks if the computer is not responding to SSH
- # This prevents a reservatino from timing out if the user is actually
connected but SSH from the management node isn't working
+
# Wait for the user to acknowledge the request by clicking Connect
button or from API
if (!$self->code_loop_timeout(sub{$self->user_connected()}, [],
"waiting for user to connect to $computer_short_name",
($connect_timeout_minutes*60), 15)) {
if (!$imagemeta_checkuser || !$request_checkuser) {
@@ -321,7 +344,7 @@ sub process {
}
}
- update_request_state($request_id, "inuse", "inuse");
+ switch_state($request_data, 'inuse', 'inuse');
exit;
}
Modified: vcl/trunk/managementnode/lib/VCL/reserved.pm
URL:
http://svn.apache.org/viewvc/vcl/trunk/managementnode/lib/VCL/reserved.pm?rev=1492747&r1=1492746&r2=1492747&view=diff
==============================================================================
--- vcl/trunk/managementnode/lib/VCL/reserved.pm (original)
+++ vcl/trunk/managementnode/lib/VCL/reserved.pm Thu Jun 13 16:37:14 2013
@@ -94,25 +94,19 @@ sub process {
my $self = shift;
my $request_id = $self->data->get_request_id();
- my @reservation_ids = $self->data->get_reservation_ids();
my $request_data = $self->data->get_request_data();
my $request_logid = $self->data->get_request_log_id();
- my $request_forimaging = $self->data->get_request_forimaging;
my $reservation_id = $self->data->get_reservation_id();
my $reservation_count = $self->data->get_reservation_count();
my $computer_id = $self->data->get_computer_id();
my $computer_short_name =
$self->data->get_computer_short_name();
- my $imagemeta_checkuser =
$self->data->get_imagemeta_checkuser();
- my $server_request_id = $self->data->get_server_request_id();
- my $acknowledge_timeout_seconds =
$self->data->get_variable('acknowledgetimeout') || 900;
- my $connect_timeout_seconds =
$self->data->get_variable('connecttimeout') || 900;
my $is_parent_reservation = $self->data->is_parent_reservation();
+ my $server_request_id = $self->data->get_server_request_id();
+ my $acknowledge_timeout_seconds =
$self->data->get_variable('acknowledgetimeout') || 900;
# Update the log loaded time to now for this request
update_log_loaded_time($request_logid);
- insertloadlog($reservation_id, $computer_id, "reserved",
"$computer_short_name successfully reserved");
-
# Update the computer state to reserved
# This causes pending to change to the Connect button on the Current
Reservations page
update_computer_state($computer_id, 'reserved');
@@ -124,7 +118,6 @@ sub process {
}
# User acknowledged request
-
# Add the cluster information to the loaded computers if this is a
cluster reservation
if ($reservation_count > 1 && !update_cluster_info($request_data)) {
$self->reservation_failed("update_cluster_info failed");
@@ -144,14 +137,23 @@ sub process {
if ($self->os->can("post_reserve") && !$self->os->post_reserve()) {
$self->reservation_failed("OS module post_reserve failed");
}
+
+ # Add a 'reserved' computerloadlog entry
+ # Do this last - important for cluster reservation timing
+ # Parent's reserved process will loop until this exists for all child
reservations
+ insertloadlog($reservation_id, $computer_id, "reserved",
"$computer_short_name successfully reserved");
# For cluster reservations, the parent must wait until all child
reserved processes have exited
# Otherwise, the state will change to inuse while the child processes
are still finishing up the reserved state
# vcld will then fail to fork inuse processes for the child reservations
if ($reservation_count > 1 && $is_parent_reservation) {
- if
(!$self->code_loop_timeout(sub{$self->is_child_process_running()}, [], 'waiting
for child reserved processes to exit', 3*60, 5)) {
- $self->reservation_failed('child reservation reserved
processes did not exit');
+ if
(!$self->code_loop_timeout(sub{$self->wait_for_child_reservations()}, [],
"waiting for child reservation reserved processes to complete", 180, 5)) {
+ $self->reservation_failed('all child reservation
reserved processes did not complete');
}
+
+ # Parent can't tell if reserved processes on other management
nodes have terminated
+ # Wait a short time in case processes on other management nodes
are terminating
+ sleep 3;
}
# Change the request and computer state to inuse then exit
@@ -160,36 +162,66 @@ sub process {
#/////////////////////////////////////////////////////////////////////////////
-=head2 is_child_process_running
+=head2 wait_for_child_reservations
Parameters : none
Returns : boolean
- Description :
+ Description : Checks if all child reservation 'reserved' processes have
+ completed.
=cut
-sub is_child_process_running {
+sub wait_for_child_reservations {
my $self = shift;
- if (ref($self) !~ /VCL::reserved/) {
- notify($ERRORS{'CRITICAL'}, 0, "subroutine can only be called
as a class method of a VCL::reserved object");
+ my $request_id = $self->data->get_request_id();
+
+ exit if is_request_deleted($request_id);
+
+ # Check if 'reserved' computerloadlog entry exists for all reservations
+ my $request_loadstate_names = get_request_loadstate_names($request_id);
+ if (!$request_loadstate_names) {
+ notify($ERRORS{'WARNING'}, 0, "failed to retrieve request
loadstate names");
return;
}
- my $request_id = $self->data->get_request_id();
- my $reservation_id = $self->data->get_reservation_id();
+ my @reserved_exists;
+ my @reserved_does_not_exist;
+ my @failed;
+ for my $reservation_id (keys %$request_loadstate_names) {
+ my @loadstate_names =
@{$request_loadstate_names->{$reservation_id}};
+ if (grep { $_ eq 'reserved' } @loadstate_names) {
+ push @reserved_exists, $reservation_id;
+ }
+ else {
+ push @reserved_does_not_exist, $reservation_id;
+ }
+
+ if (grep { $_ eq 'failed' } @loadstate_names) {
+ push @failed, $reservation_id;
+ }
+ }
- my @reservation_ids = $self->data->get_reservation_ids();
- @reservation_ids = grep { $_ ne $reservation_id} @reservation_ids;
+ # Check if any child reservations failed
+ if (@failed) {
+ $self->reservation_failed("child reservation reserve process
failed: " . join(', ', @failed));
+ }
- my $pattern = "$request_id:(" . join('|', @reservation_ids) . ")";
- if (my @pids = is_management_node_process_running($pattern)) {
- notify($ERRORS{'DEBUG'}, 0, "child processes are running: " .
join(", ", @pids));
+ if (@reserved_does_not_exist) {
+ notify($ERRORS{'DEBUG'}, 0, "computerloadlog 'reserved' entry
does NOT exist for all reservations:\n" .
+ "exists for reservation IDs: " . join(', ',
@reserved_exists) . "\n" .
+ "does not exist for reservation IDs: " . join(', ',
@reserved_does_not_exist)
+ );
return 0;
}
else {
- notify($ERRORS{'DEBUG'}, 0, "no child processes running");
- return 1;
+ notify($ERRORS{'DEBUG'}, 0, "computerloadlog 'reserved' entry
exists for all reservations");
}
+
+ # Check if child reservation processes are running
+ return 0 unless $self->is_child_process_running();
+
+ notify($ERRORS{'DEBUG'}, 0, "all child reservation reserved processes
have completed");
+ return 1;
}
#/////////////////////////////////////////////////////////////////////////////
Modified: vcl/trunk/managementnode/lib/VCL/utils.pm
URL:
http://svn.apache.org/viewvc/vcl/trunk/managementnode/lib/VCL/utils.pm?rev=1492747&r1=1492746&r2=1492747&view=diff
==============================================================================
--- vcl/trunk/managementnode/lib/VCL/utils.pm (original)
+++ vcl/trunk/managementnode/lib/VCL/utils.pm Thu Jun 13 16:37:14 2013
@@ -144,10 +144,10 @@ our @EXPORT = qw(
get_production_imagerevision_info
get_random_mac_address
get_request_by_computerid
- get_request_computerloadstate_names
get_request_current_state_name
get_request_end
get_request_info
+ get_request_loadstate_names
get_reservation_accounts
get_resource_groups
get_managable_resource_groups
@@ -8597,7 +8597,7 @@ sub reservations_ready {
#/////////////////////////////////////////////////////////////////////////////
-=head2 get_request_computerloadstate_names
+=head2 get_request_loadstate_names
Parameters : $request_id
Returns : hash reference
@@ -8609,7 +8609,7 @@ sub reservations_ready {
=cut
-sub get_request_computerloadstate_names {
+sub get_request_loadstate_names {
my ($request_id) = @_;
if (!$request_id) {
notify($ERRORS{'WARNING'}, 0, "request ID argument was not
passed");