Author: arkurth
Date: Tue Mar 31 20:48:08 2015
New Revision: 1670477
URL: http://svn.apache.org/r1670477
Log:
VCL-845
Removed reference to blockRequest.admingroupid from
utils.pm::get_management_node_blockrequests and from DataStructure.pm.
VCL-847
Fixed problem in utils.pm::reservation_being_processed. The
$parent_reservation_id variable was always initialized to '<unknown>'. Later on
it is only set to the actual ID if it is not yet defined. As a result, it was
never set to the correct reservation ID.
Added State.pm::wait_for_reservation_loadstate. Updated reserved.pm so that
child reservations wait for the parent to generate an "acknowledgetimeout"
computerloadlog entry. The timestamp of this entry is used to determine when to
stop checking for acknowledgment.
Updated reclaim.pm to call state_exit instead of switch_state.
Commented out section in State.pm::wait_for_child_reservations_to_exit where it
exited immediately if the request was deleted. This causes problems with
cluster requests. Added a comment explaining the problem for future reference.
Updated how State.pm::state_exit handles setting request.state if the current
state is deleted. It was skipping the request.state update instead of changing
it to complete.
Other
Added code to new.pm::wait_for_child_reservations to use a short interval
between checks at the start, and then increase it. This speeds things up when
all child reservations are ready early on.
Modified:
vcl/trunk/managementnode/lib/VCL/DataStructure.pm
vcl/trunk/managementnode/lib/VCL/Module.pm
vcl/trunk/managementnode/lib/VCL/Module/State.pm
vcl/trunk/managementnode/lib/VCL/new.pm
vcl/trunk/managementnode/lib/VCL/reclaim.pm
vcl/trunk/managementnode/lib/VCL/reserved.pm
vcl/trunk/managementnode/lib/VCL/utils.pm
Modified: vcl/trunk/managementnode/lib/VCL/DataStructure.pm
URL:
http://svn.apache.org/viewvc/vcl/trunk/managementnode/lib/VCL/DataStructure.pm?rev=1670477&r1=1670476&r2=1670477&view=diff
==============================================================================
--- vcl/trunk/managementnode/lib/VCL/DataStructure.pm (original)
+++ vcl/trunk/managementnode/lib/VCL/DataStructure.pm Tue Mar 31 20:48:08 2015
@@ -108,7 +108,6 @@ $SUBROUTINE_MAPPINGS{blockrequest_group_
$SUBROUTINE_MAPPINGS{blockrequest_group_name} =
'$self->blockrequest_data->{BLOCKREQUEST_ID}{groupname}';
$SUBROUTINE_MAPPINGS{blockrequest_repeating} =
'$self->blockrequest_data->{BLOCKREQUEST_ID}{repeating}';
$SUBROUTINE_MAPPINGS{blockrequest_owner_id} =
'$self->blockrequest_data->{BLOCKREQUEST_ID}{ownerid}';
-$SUBROUTINE_MAPPINGS{blockrequest_admin_group_id} =
'$self->blockrequest_data->{BLOCKREQUEST_ID}{admingroupid}';
$SUBROUTINE_MAPPINGS{blockrequest_management_node_id} =
'$self->blockrequest_data->{BLOCKREQUEST_ID}{managementnodeid}';
$SUBROUTINE_MAPPINGS{blockrequest_expire} =
'$self->blockrequest_data->{BLOCKREQUEST_ID}{expireTime}';
$SUBROUTINE_MAPPINGS{blockrequest_processing} =
'$self->blockrequest_data->{BLOCKREQUEST_ID}{processing}';
Modified: vcl/trunk/managementnode/lib/VCL/Module.pm
URL:
http://svn.apache.org/viewvc/vcl/trunk/managementnode/lib/VCL/Module.pm?rev=1670477&r1=1670476&r2=1670477&view=diff
==============================================================================
--- vcl/trunk/managementnode/lib/VCL/Module.pm (original)
+++ vcl/trunk/managementnode/lib/VCL/Module.pm Tue Mar 31 20:48:08 2015
@@ -1316,7 +1316,7 @@ sub code_loop_timeout {
$attempt_delay_seconds = 15;
}
elsif (defined($attempt_delay_seconds) && $attempt_delay_seconds !~
/^\d+$/) {
- notify($ERRORS{'WARNING'}, 0, "5th argument (attempt delay) was
not passed correctly");
+ notify($ERRORS{'WARNING'}, 0, "5th argument (attempt delay) was
not passed correctly: $attempt_delay_seconds");
return;
}
Modified: vcl/trunk/managementnode/lib/VCL/Module/State.pm
URL:
http://svn.apache.org/viewvc/vcl/trunk/managementnode/lib/VCL/Module/State.pm?rev=1670477&r1=1670476&r2=1670477&view=diff
==============================================================================
--- vcl/trunk/managementnode/lib/VCL/Module/State.pm (original)
+++ vcl/trunk/managementnode/lib/VCL/Module/State.pm Tue Mar 31 20:48:08 2015
@@ -116,7 +116,7 @@ sub initialize {
# If this is a cluster request, wait for all reservations to begin
before proceeding
if ($reservation_count > 1) {
- if (!$self->wait_for_all_reservations_to_begin('begin', 300,
30)) {
+ if (!$self->wait_for_all_reservations_to_begin('begin', 300,
5)) {
$self->reservation_failed("failed to detect start of
processing for all reservation processes", 'available');
}
}
@@ -712,6 +712,61 @@ sub wait_for_all_reservations_to_begin {
#/////////////////////////////////////////////////////////////////////////////
+=head2 wait_for_reservation_loadstate
+
+ Parameters : $reservation_id, $loadstate_name, $total_wait_seconds
(optional), $attempt_delay_seconds (optional)
+ Returns : boolean
+ Description : Waits for a computerloadlog entry to exist for a particular
+ reservation.
+
+=cut
+
+sub wait_for_reservation_loadstate {
+ my $self = shift;
+ if (ref($self) !~ /VCL/) {
+ notify($ERRORS{'CRITICAL'}, 0, "subroutine can only be called
as a class method of a VCL object");
+ return;
+ }
+
+ my ($reservation_id, $loadstate_name, $total_wait_seconds,
$attempt_delay_seconds) = @_;
+ if (!$reservation_id) {
+ notify($ERRORS{'WARNING'}, 0, "computerloadlog loadstate name
argument was not supplied");
+ return;
+ }
+ elsif (!$loadstate_name) {
+ notify($ERRORS{'WARNING'}, 0, "computerloadlog loadstate name
argument was not supplied");
+ return;
+ }
+
+ $total_wait_seconds = 300 unless defined($total_wait_seconds);
+ $attempt_delay_seconds = 30 unless defined($attempt_delay_seconds);
+
+ my $request_id = $self->data->get_request_id();
+ my $request_state_name = $self->data->get_request_state_name();
+
+ my $result = $self->code_loop_timeout(
+ sub {
+ if ($request_state_name ne 'deleted' &&
is_request_deleted($request_id)) {
+ notify($ERRORS{'OK'}, 0, "request has been
deleted, exiting");
+ exit;
+ }
+ return
get_reservation_computerloadlog_time($reservation_id, $loadstate_name);
+ },
+ [],
+ "waiting for reservation $reservation_id to generate a
$loadstate_name computerloadlog entry", $total_wait_seconds,
$attempt_delay_seconds
+ );
+
+ if ($result) {
+ return $result;
+ }
+ else {
+ notify($ERRORS{'WARNING'}, 0, "computerloadlog
'$loadstate_name' entry does not exist for reservation $reservation_id, waited
$total_wait_seconds seconds");
+ return;
+ }
+}
+
+#/////////////////////////////////////////////////////////////////////////////
+
=head2 wait_for_child_reservations_to_exit
Parameters : $total_wait_seconds (optional), $attempt_delay_seconds
(optional)
@@ -732,7 +787,7 @@ sub wait_for_child_reservations_to_exit
}
my $total_wait_seconds = shift || 300;
- my $attempt_delay_seconds = shift || 30;
+ my $attempt_delay_seconds = shift || 15;
my $request_id = $self->data->get_request_id();
my $request_state_name = $self->data->get_request_state_name();
@@ -741,10 +796,18 @@ sub wait_for_child_reservations_to_exit
return $self->code_loop_timeout(
sub {
- if (is_request_deleted($request_id)) {
- notify($ERRORS{'OK'}, 0, "request has been
deleted, exiting");
- exit;
- }
+ # Commented out - causes problems for cluster requests
+ # Example: request deleted while in pending/reserved,
waiting for acknowledgement
+ # Parent sees state=deleted, and doesn't wait for child
reserved processes to exit
+ # Parent's deleted/reclaim.pm process starts up
+ # -Parent sees 'begin' entries for the child
reservations
+ # -Sets request state to pending/deleted
+ # -reclaim.pm processes are never created for children
+ # Child computer state gets left in 'reserved'
+ #if (is_request_deleted($request_id)) {
+ # notify($ERRORS{'OK'}, 0, "request has been
deleted, exiting");
+ # exit;
+ #}
my ($exited, $not_exited) =
$self->does_loadstate_exist_all_reservations('exited', 1);
# If no reservations are missing an 'exited' entry
return true
@@ -794,7 +857,11 @@ sub state_exit {
my ($request_state_name_new, $computer_state_name_new,
$request_log_ending) = @_;
- notify($ERRORS{'DEBUG'}, 0, "beginning state module exit tasks");
+ my $string = "beginning state module exit tasks\n";
+ $string .= "request state argument: " . ($request_state_name_new ?
$request_state_name_new : '<not specified>') . "\n";
+ $string .= "computer state argument: " . ($computer_state_name_new ?
$computer_state_name_new : '<not specified>') . "\n";
+ $string .= "log ending argument: " . ($request_log_ending ?
$request_log_ending : '<not specified>');
+ notify($ERRORS{'DEBUG'}, 0, $string);
my $calling_sub = get_calling_subroutine();
@@ -812,7 +879,13 @@ sub state_exit {
if ($is_parent_reservation) {
# If parent of a cluster request, wait for child processes to
exit before switching the state
if ($reservation_count > 1) {
- $self->wait_for_child_reservations_to_exit();
+ # Check frequently if reservation timed out to cause
Reservations page to remove the Connect button ASAP
+ if ($request_state_name_new && $request_state_name_new
=~ /(timeout)/) {
+ $self->wait_for_child_reservations_to_exit(300,
3);
+ }
+ else {
+ $self->wait_for_child_reservations_to_exit();
+ }
# Check if any reservations failed
my @failed_reservation_ids =
$self->does_loadstate_exist_any_reservation('failed');
@@ -868,19 +941,35 @@ sub state_exit {
}
# Update the request state
- if ($request_state_name_old ne 'deleted' &&
!is_request_deleted($request_id)) {
- # Check if the request state has already been
updated
- # This can occur if another reservation in a
cluster failed
- my ($request_state_name_current,
$request_laststate_name_current) = get_request_current_state_name($request_id);
- if ($request_state_name_current eq
$request_state_name_new && $request_laststate_name_current eq
$request_state_name_old) {
- notify($ERRORS{'OK'}, 0, "request has
NOT been deleted, current state already set to:
$request_state_name_current/$request_laststate_name_current");
+ if ($request_state_name_old ne 'deleted') {
+ if (is_request_deleted($request_id)) {
+ notify($ERRORS{'OK'}, 0, "request has
been deleted, request state not updated: $request_state_name_old -->
$request_state_name_new");
}
else {
- notify($ERRORS{'OK'}, 0, "request has
NOT been deleted, updating request state:
$request_state_name_old/$request_laststate_name_old -->
$request_state_name_new/$request_state_name_old");
+ # Check if the request state has
already been updated
+ # This can occur if another reservation
in a cluster failed
+ my ($request_state_name_current,
$request_laststate_name_current) = get_request_current_state_name($request_id);
+ if ($request_state_name_current eq
$request_state_name_new && $request_laststate_name_current eq
$request_state_name_old) {
+ notify($ERRORS{'OK'}, 0,
"request has NOT been deleted, current state already set to:
$request_state_name_current/$request_laststate_name_current");
+ }
+ else {
+ notify($ERRORS{'OK'}, 0,
"request has NOT been deleted, updating request state:
$request_state_name_old/$request_laststate_name_old -->
$request_state_name_new/$request_state_name_old");
+ if
(!update_request_state($request_id, $request_state_name_new,
$request_state_name_old)) {
+
notify($ERRORS{'WARNING'}, 0, "failed to change request state:
$request_state_name_old/$request_laststate_name_old -->
$request_state_name_new/$request_state_name_old");
+ }
+ }
+ }
+ }
+ else {
+ # Current request state = 'deleted'
+ if ($request_state_name_new =~ /(complete)/) {
if (!update_request_state($request_id,
$request_state_name_new, $request_state_name_old)) {
notify($ERRORS{'WARNING'}, 0,
"failed to change request state:
$request_state_name_old/$request_laststate_name_old -->
$request_state_name_new/$request_state_name_old");
}
}
+ else {
+ notify($ERRORS{'WARNING'}, 0, "request
state not updated: $request_state_name_old --> $request_state_name_new");
+ }
}
}
Modified: vcl/trunk/managementnode/lib/VCL/new.pm
URL:
http://svn.apache.org/viewvc/vcl/trunk/managementnode/lib/VCL/new.pm?rev=1670477&r1=1670476&r2=1670477&view=diff
==============================================================================
--- vcl/trunk/managementnode/lib/VCL/new.pm (original)
+++ vcl/trunk/managementnode/lib/VCL/new.pm Tue Mar 31 20:48:08 2015
@@ -1078,7 +1078,16 @@ sub wait_for_child_reservations {
$previous_request_loadstate_names =
$current_request_loadstate_names;
$previous_lastcheck_info = $current_lastcheck_info;
- sleep $monitor_delay_seconds;
+
+ if ($total_elapsed_seconds <= 30) {
+ sleep_uninterrupted(3);
+ }
+ elsif ($total_elapsed_seconds <= 60) {
+ sleep_uninterrupted(5);
+ }
+ else {
+ sleep_uninterrupted($monitor_delay_seconds);
+ }
}
# If out of main loop, waited maximum amount of time
Modified: vcl/trunk/managementnode/lib/VCL/reclaim.pm
URL:
http://svn.apache.org/viewvc/vcl/trunk/managementnode/lib/VCL/reclaim.pm?rev=1670477&r1=1670476&r2=1670477&view=diff
==============================================================================
--- vcl/trunk/managementnode/lib/VCL/reclaim.pm (original)
+++ vcl/trunk/managementnode/lib/VCL/reclaim.pm Tue Mar 31 20:48:08 2015
@@ -212,10 +212,10 @@ sub process {
# Set the computer state to available if it isn't in the maintenance or
reloading state
if ($computer_state_name =~ /maintenance|reloading/) {
notify($ERRORS{'OK'}, 0, "$computer_shortname in
$computer_state_name state, skipping state update to available");
- switch_state($request_data, 'complete', '', '', '1');
+ $self->state_exit('complete');
}
else {
- switch_state($request_data, 'complete', 'available', '', '1');
+ $self->state_exit('complete', 'available');
}
notify($ERRORS{'DEBUG'}, 0, "exiting");
@@ -245,8 +245,10 @@ sub insert_reload_and_exit {
# Run any vcl_post_reservation scripts (if exists)
if ($self->os->can("post_reservation")) {
- if ($self->os->post_reservation()) {
- notify($ERRORS{'OK'}, 0, "post_reservation script has
been executed on $computer_shortname prior to reloading");
+ if ($self->os->is_ssh_responding()) {
+ if ($self->os->post_reservation()) {
+ notify($ERRORS{'OK'}, 0, "post_reservation
script has been executed on $computer_shortname prior to reloading");
+ }
}
}
@@ -259,11 +261,9 @@ sub insert_reload_and_exit {
if (update_computer_imagename($computer_id,
'noimage')) {
notify($ERRORS{'DEBUG'}, 0, "set
computer $computer_shortname current image to 'noimage'");
}
-
- switch_state($request_data, 'complete',
'available', '', '1');
+ $self->state_exit('complete', 'available');
}
}
-
}
else {
#elsif ( $action =~ /reload/i ) {
@@ -285,13 +285,13 @@ sub insert_reload_and_exit {
notify($ERRORS{'OK'}, 0, "inserted reload request into
database for computer id=$computer_id, image=$next_image_name");
# Switch the request state to complete, the computer
state to reload
- switch_state($request_data, 'complete', 'reload', '',
'1');
+ $self->state_exit('complete', 'reload');
}
else {
notify($ERRORS{'CRITICAL'}, 0, "failed to insert reload
request into database for computer id=$computer_id image=$next_image_name");
# Switch the request and computer states to failed
- switch_state($request_data, 'failed', 'failed', '',
'1');
+ $self->state_exit('failed', 'failed');
}
}
Modified: vcl/trunk/managementnode/lib/VCL/reserved.pm
URL:
http://svn.apache.org/viewvc/vcl/trunk/managementnode/lib/VCL/reserved.pm?rev=1670477&r1=1670476&r2=1670477&view=diff
==============================================================================
--- vcl/trunk/managementnode/lib/VCL/reserved.pm (original)
+++ vcl/trunk/managementnode/lib/VCL/reserved.pm Tue Mar 31 20:48:08 2015
@@ -117,20 +117,52 @@ sub process {
update_computer_state($computer_id, 'reserved');
insertloadlog($reservation_id, $computer_id, "reserved",
"$computer_short_name successfully reserved");
- # Send an email and/or IM to the user
- # Do this after updating the computer state to reserved because this is
when the Connect button appears
- $self->_notify_user_ready();
-
- # Insert acknowledgetimeout immediately before beginning to check user
clicked Connect
- # Web uses timestamp of this to determine when next to refresh the page
- # Important because page should refresh as soon as possible to
reservation timing out
- insertloadlog($reservation_id, $computer_id, "acknowledgetimeout",
"begin acknowledge timeout ($acknowledge_timeout_seconds seconds)");
+
+ if ($is_parent_reservation) {
+ # Send an email and/or IM to the user
+ # Do this after updating the computer state to reserved because
this is when the Connect button appears
+ $self->_notify_user_ready();
+
+ # Insert acknowledgetimeout immediately before beginning to
check user clicked Connect
+ # Web uses timestamp of this to determine when next to refresh
the page
+ # Important because page should refresh as soon as possible to
reservation timing out
+ insertloadlog($reservation_id, $computer_id,
"acknowledgetimeout", "begin acknowledge timeout ($acknowledge_timeout_seconds
seconds)");
+ }
+
+ my $acknowledge_check_start_epoch_seconds =
$self->wait_for_reservation_loadstate($parent_reservation_id,
"acknowledgetimeout", $acknowledge_timeout_seconds, 5);
+ if (!$acknowledge_check_start_epoch_seconds) {
+ notify($ERRORS{'WARNING'}, 0, "failed to retrieve timestamp of
parent reservation $parent_reservation_id 'acknowledgetimeout' computerloadlog
entry");
+ return;
+ }
+
+ # Get the current time
+ my $now_epoch_seconds = time;
+
+ # Calculate the exact time when connection checking should end
+ my $acknowledge_check_end_epoch_seconds =
($acknowledge_check_start_epoch_seconds + $acknowledge_timeout_seconds);
+ my $acknowledge_timeout_remaining_seconds =
($acknowledge_check_end_epoch_seconds - $now_epoch_seconds);
+
+ my $now_string = strftime('%H:%M:%S',
localtime($now_epoch_seconds));
+ my $acknowledge_check_start_string = strftime('%H:%M:%S',
localtime($acknowledge_check_start_epoch_seconds));
+ my $acknowledge_check_end_string = strftime('%H:%M:%S',
localtime($acknowledge_check_end_epoch_seconds));
+ my $acknowledge_timeout_string = strftime('%H:%M:%S',
gmtime($acknowledge_timeout_seconds));
+ my $acknowledge_timeout_remaining_string = strftime('%H:%M:%S',
gmtime($acknowledge_timeout_remaining_seconds));
+
+ notify($ERRORS{'DEBUG'}, 0, "beginning to check for user
acknowledgement:\n" .
+ "acknowledge check start :
$acknowledge_check_start_string\n" .
+ "acknowledge timeout total : + $acknowledge_timeout_string\n" .
+ "--------------------------------------\n" .
+ "acknowledge check end : = $acknowledge_check_end_string\n"
.
+ "current time : - $now_string\n" .
+ "--------------------------------------\n" .
+ "acknowledge timeout remaining : =
$acknowledge_timeout_remaining_string ($acknowledge_timeout_remaining_seconds
seconds)\n"
+ );
# Wait for the user to acknowledge the request by clicking Connect
button or from API
- my $user_acknowledged =
$self->code_loop_timeout(sub{$self->user_acknowledged()}, [], 'waiting for user
acknowledgement', $acknowledge_timeout_seconds, 1, 10);
+ my $user_acknowledged =
$self->code_loop_timeout(sub{$self->user_acknowledged()}, [], 'waiting for user
acknowledgement', $acknowledge_timeout_remaining_seconds, 1, 10);
if (!$user_acknowledged) {
$self->_notify_user_timeout($request_data);
- $self->state_exit('timeout', 'reserved', 'noack');
+ $self->state_exit('timeout', 'available', 'noack');
}
# Add noinitialconnection and then delete acknowledgetimeout
@@ -174,13 +206,13 @@ sub process {
insertloadlog($reservation_id, $computer_id, "postreserve",
"$computer_short_name post reserve successful");
# Get the current time
- my $now_epoch_seconds = time;
+ $now_epoch_seconds = time;
# Calculate the exact time when connection checking should end
my $connection_check_end_epoch_seconds =
($connection_check_start_epoch_seconds + $initial_connect_timeout_seconds);
my $connect_timeout_remaining_seconds =
($connection_check_end_epoch_seconds - $now_epoch_seconds);
- my $now_string = strftime('%H:%M:%S',
localtime($now_epoch_seconds));
+ $now_string = strftime('%H:%M:%S',
localtime($now_epoch_seconds));
my $connection_check_start_string = strftime('%H:%M:%S',
localtime($connection_check_start_epoch_seconds));
my $connection_check_end_string = strftime('%H:%M:%S',
localtime($connection_check_end_epoch_seconds));
my $connect_timeout_string = strftime('%H:%M:%S',
gmtime($initial_connect_timeout_seconds));
@@ -336,7 +368,7 @@ sub user_acknowledged {
# Check if user deleted the request
if (is_request_deleted($request_id)) {
notify($ERRORS{'DEBUG'}, 0, "request deleted, exiting");
- exit;
+ $self->state_exit();
}
my $remote_ip = $self->data->get_reservation_remote_ip();
Modified: vcl/trunk/managementnode/lib/VCL/utils.pm
URL:
http://svn.apache.org/viewvc/vcl/trunk/managementnode/lib/VCL/utils.pm?rev=1670477&r1=1670476&r2=1670477&view=diff
==============================================================================
--- vcl/trunk/managementnode/lib/VCL/utils.pm (original)
+++ vcl/trunk/managementnode/lib/VCL/utils.pm Tue Mar 31 20:48:08 2015
@@ -642,7 +642,7 @@ sub notify {
if ($string !~ /[\'\"]/gs && $string !~ /\s:\s/gs) {
$string =~ s/[ \t]+/ /gs;
}
-
+
# Assemble the process identifier string
my $process_identifier;
$process_identifier .= "|$PID|";
@@ -6296,7 +6296,6 @@ sub get_management_node_blockrequests {
blockRequest.groupid AS blockRequest_groupid,
blockRequest.repeating AS blockRequest_repeating,
blockRequest.ownerid AS blockRequest_ownerid,
- blockRequest.admingroupid AS blockRequest_admingroupid,
blockRequest.managementnodeid AS blockRequest_managementnodeid,
blockRequest.expireTime AS blockRequest_expireTime,
blockRequest.processing AS blockRequest_processing,
@@ -6340,7 +6339,6 @@ sub get_management_node_blockrequests {
blockRequest.groupid AS blockRequest_groupid,
blockRequest.repeating AS blockRequest_repeating,
blockRequest.ownerid AS blockRequest_ownerid,
- blockRequest.admingroupid AS blockRequest_admingroupid,
blockRequest.managementnodeid AS
blockRequest_managementnodeid,
blockRequest.expireTime AS blockRequest_expireTime,
blockRequest.processing AS blockRequest_processing,
@@ -8801,7 +8799,7 @@ EOF
# Check if at least 1 row was returned
my $computerloadlog_exists = 0;
- my $parent_reservation_id = '<unknown>';
+ my $parent_reservation_id;
my $parent_computerloadlog_exists = 0;
@@ -8819,6 +8817,8 @@ EOF
}
}
+ $parent_reservation_id = '<unknown>' if
!defined($parent_reservation_id);
+
# Check if a vcld process is running matching for this reservation
my $reservation_process_name_regex =
get_reservation_vcld_process_name_regex($reservation_id);
my @processes_running =
is_management_node_process_running($reservation_process_name_regex);