Do not execute a manual user migration of an HA resource to a target node that is not one of the highest-priority nodes if the HA resource has failback set.
This prevents users from moving an HA resource to a node from which it would immediately be failed back to a higher-priority node of the strict or non-strict node affinity rule, which just wastes time and resources. Signed-off-by: Daniel Kral <[email protected]> --- I thought about exposing the service configurations hash ($sc) through $self->{sc} in the HA Manager instead, as we already did with $self->{groups} and $self->{rules} / $self->{compiled_rules}, but for now I kept passing it to the appropriate routines. It would be a rather nice cleanup in the future to have these parts separated into something like a Resources module, where the config and the state of HA resources are controlled, instead of having all of this logic in the Manager module, but there are a bit more important things to do right now. src/PVE/HA/Config.pm | 11 +++++-- src/PVE/HA/Helpers.pm | 6 ++-- src/PVE/HA/Manager.pm | 13 +++++--- .../test-node-affinity-nonstrict1/log.expect | 16 +--------- .../test-node-affinity-nonstrict7/log.expect | 32 +++---------------- .../test-node-affinity-strict7/log.expect | 18 ++--------- 6 files changed, 27 insertions(+), 69 deletions(-) diff --git a/src/PVE/HA/Config.pm b/src/PVE/HA/Config.pm index f8c5965e..fa14816c 100644 --- a/src/PVE/HA/Config.pm +++ b/src/PVE/HA/Config.pm @@ -382,22 +382,27 @@ sub service_is_configured { sub get_resource_motion_info { my ($sid) = @_; - my $resources = read_resources_config(); + my $conf = read_resources_config(); my $dependent_resources = []; my $blocking_resources_by_node = {}; - if (&$service_check_ha_state($resources, $sid)) { + if (&$service_check_ha_state($conf, $sid)) { my $manager_status = read_manager_status(); my $ss = $manager_status->{service_status}; my $ns = $manager_status->{node_status}; # get_resource_motion_info expects a hashset of all nodes with status 'online' my $online_nodes = { map { $ns->{$_} eq 'online' ? 
($_ => 1) : () } keys %$ns }; + # get_resource_motion_info expects a resource config with defaults set + my $resources = read_and_check_resources_config(); my $compiled_rules = read_and_compile_rules_config(); + my $cd = $resources->{$sid} // {}; ($dependent_resources, $blocking_resources_by_node) = - PVE::HA::Helpers::get_resource_motion_info($ss, $sid, $online_nodes, $compiled_rules); + PVE::HA::Helpers::get_resource_motion_info( + $ss, $sid, $cd, $online_nodes, $compiled_rules, + ); } return ($dependent_resources, $blocking_resources_by_node); diff --git a/src/PVE/HA/Helpers.pm b/src/PVE/HA/Helpers.pm index b160c541..a58b1e12 100644 --- a/src/PVE/HA/Helpers.pm +++ b/src/PVE/HA/Helpers.pm @@ -18,13 +18,13 @@ causes that make the node unavailable to C<$sid>. =cut -sub get_resource_motion_info($ss, $sid, $online_nodes, $compiled_rules) { +sub get_resource_motion_info($ss, $sid, $cd, $online_nodes, $compiled_rules) { my $dependent_resources = []; my $blocking_resources_by_node = {}; my ($node_affinity, $resource_affinity) = $compiled_rules->@{qw(node-affinity resource-affinity)}; - my ($allowed_nodes) = get_node_affinity($node_affinity, $sid, $online_nodes); + my ($allowed_nodes, $pri_nodes) = get_node_affinity($node_affinity, $sid, $online_nodes); my ($together, $separate) = get_affinitive_resources($resource_affinity, $sid); for my $csid (sort keys %$together) { @@ -35,7 +35,7 @@ sub get_resource_motion_info($ss, $sid, $online_nodes, $compiled_rules) { } for my $node (keys %$online_nodes) { - if (!$allowed_nodes->{$node}) { + if (!$allowed_nodes->{$node} || ($cd->{failback} && !$pri_nodes->{$node})) { push $blocking_resources_by_node->{$node}->@*, { sid => $sid, diff --git a/src/PVE/HA/Manager.pm b/src/PVE/HA/Manager.pm index d1ff9615..9067d27b 100644 --- a/src/PVE/HA/Manager.pm +++ b/src/PVE/HA/Manager.pm @@ -387,13 +387,15 @@ sub read_lrm_status { } sub execute_migration { - my ($self, $cmd, $task, $sid, $target) = @_; + my ($self, $cmd, $task, $sid, $cd, 
$target) = @_; my ($haenv, $ss, $ns, $compiled_rules) = $self->@{qw(haenv ss ns compiled_rules)}; my $online_nodes = { map { $_ => 1 } $self->{ns}->list_online_nodes()->@* }; my ($dependent_resources, $blocking_resources_by_node) = - PVE::HA::Helpers::get_resource_motion_info($ss, $sid, $online_nodes, $compiled_rules); + PVE::HA::Helpers::get_resource_motion_info( + $ss, $sid, $cd, $online_nodes, $compiled_rules, + ); if (my $blocking_resources = $blocking_resources_by_node->{$target}) { for my $blocking_resource (@$blocking_resources) { @@ -432,7 +434,7 @@ sub execute_migration { # read new crm commands and save them into crm master status sub update_crm_commands { - my ($self) = @_; + my ($self, $sc) = @_; my ($haenv, $ms, $ns, $ss) = ($self->{haenv}, $self->{ms}, $self->{ns}, $self->{ss}); @@ -453,7 +455,8 @@ sub update_crm_commands { "ignore crm command - service already on target node: $cmd", ); } else { - $self->execute_migration($cmd, $task, $sid, $node); + my $cd = $sc->{$sid} // {}; + $self->execute_migration($cmd, $task, $sid, $cd, $node); } } } else { @@ -707,7 +710,7 @@ sub manage { $self->{last_services_digest} = $services_digest; } - $self->update_crm_commands(); + $self->update_crm_commands($sc); for (;;) { my $repeat = 0; diff --git a/src/test/test-node-affinity-nonstrict1/log.expect b/src/test/test-node-affinity-nonstrict1/log.expect index d86c69de..ca2c40b3 100644 --- a/src/test/test-node-affinity-nonstrict1/log.expect +++ b/src/test/test-node-affinity-nonstrict1/log.expect @@ -22,19 +22,5 @@ info 25 node3/lrm: status change wait_for_agent_lock => active info 25 node3/lrm: starting service vm:101 info 25 node3/lrm: service status vm:101 started info 120 cmdlist: execute service vm:101 migrate node2 -info 120 node1/crm: got crm command: migrate vm:101 node2 -info 120 node1/crm: migrate service 'vm:101' to node 'node2' -info 120 node1/crm: service 'vm:101': state changed from 'started' to 'migrate' (node = node3, target = node2) -info 123 node2/lrm: 
got lock 'ha_agent_node2_lock' -info 123 node2/lrm: status change wait_for_agent_lock => active -info 125 node3/lrm: service vm:101 - start migrate to node 'node2' -info 125 node3/lrm: service vm:101 - end migrate to node 'node2' -info 140 node1/crm: service 'vm:101': state changed from 'migrate' to 'started' (node = node2) -info 140 node1/crm: migrate service 'vm:101' to node 'node3' (running) -info 140 node1/crm: service 'vm:101': state changed from 'started' to 'migrate' (node = node2, target = node3) -info 143 node2/lrm: service vm:101 - start migrate to node 'node3' -info 143 node2/lrm: service vm:101 - end migrate to node 'node3' -info 160 node1/crm: service 'vm:101': state changed from 'migrate' to 'started' (node = node3) -info 165 node3/lrm: starting service vm:101 -info 165 node3/lrm: service status vm:101 started +err 120 node1/crm: crm command 'migrate vm:101 node2' error - service 'vm:101' is not allowed on node 'node2' info 720 hardware: exit simulation - done diff --git a/src/test/test-node-affinity-nonstrict7/log.expect b/src/test/test-node-affinity-nonstrict7/log.expect index 31daa618..54e824ea 100644 --- a/src/test/test-node-affinity-nonstrict7/log.expect +++ b/src/test/test-node-affinity-nonstrict7/log.expect @@ -28,35 +28,9 @@ info 25 node3/lrm: status change wait_for_agent_lock => active info 25 node3/lrm: starting service vm:101 info 25 node3/lrm: service status vm:101 started info 120 cmdlist: execute service vm:101 migrate node1 -info 120 node1/crm: got crm command: migrate vm:101 node1 -info 120 node1/crm: migrate service 'vm:101' to node 'node1' -info 120 node1/crm: service 'vm:101': state changed from 'started' to 'migrate' (node = node3, target = node1) -info 121 node1/lrm: got lock 'ha_agent_node1_lock' -info 121 node1/lrm: status change wait_for_agent_lock => active -info 125 node3/lrm: service vm:101 - start migrate to node 'node1' -info 125 node3/lrm: service vm:101 - end migrate to node 'node1' -info 140 node1/crm: service 'vm:101': 
state changed from 'migrate' to 'started' (node = node1) -info 140 node1/crm: migrate service 'vm:101' to node 'node3' (running) -info 140 node1/crm: service 'vm:101': state changed from 'started' to 'migrate' (node = node1, target = node3) -info 141 node1/lrm: service vm:101 - start migrate to node 'node3' -info 141 node1/lrm: service vm:101 - end migrate to node 'node3' -info 160 node1/crm: service 'vm:101': state changed from 'migrate' to 'started' (node = node3) -info 165 node3/lrm: starting service vm:101 -info 165 node3/lrm: service status vm:101 started +err 120 node1/crm: crm command 'migrate vm:101 node1' error - service 'vm:101' is not allowed on node 'node1' info 220 cmdlist: execute service vm:101 migrate node2 -info 220 node1/crm: got crm command: migrate vm:101 node2 -info 220 node1/crm: migrate service 'vm:101' to node 'node2' -info 220 node1/crm: service 'vm:101': state changed from 'started' to 'migrate' (node = node3, target = node2) -info 225 node3/lrm: service vm:101 - start migrate to node 'node2' -info 225 node3/lrm: service vm:101 - end migrate to node 'node2' -info 240 node1/crm: service 'vm:101': state changed from 'migrate' to 'started' (node = node2) -info 240 node1/crm: migrate service 'vm:101' to node 'node3' (running) -info 240 node1/crm: service 'vm:101': state changed from 'started' to 'migrate' (node = node2, target = node3) -info 243 node2/lrm: service vm:101 - start migrate to node 'node3' -info 243 node2/lrm: service vm:101 - end migrate to node 'node3' -info 260 node1/crm: service 'vm:101': state changed from 'migrate' to 'started' (node = node3) -info 265 node3/lrm: starting service vm:101 -info 265 node3/lrm: service status vm:101 started +err 220 node1/crm: crm command 'migrate vm:101 node2' error - service 'vm:101' is not allowed on node 'node2' info 320 cmdlist: execute service vm:101 migrate node3 info 320 node1/crm: ignore crm command - service already on target node: migrate vm:101 node3 info 420 cmdlist: execute service 
vm:102 migrate node3 @@ -81,6 +55,8 @@ info 620 cmdlist: execute service vm:102 migrate node1 info 620 node1/crm: got crm command: migrate vm:102 node1 info 620 node1/crm: migrate service 'vm:102' to node 'node1' info 620 node1/crm: service 'vm:102': state changed from 'started' to 'migrate' (node = node2, target = node1) +info 621 node1/lrm: got lock 'ha_agent_node1_lock' +info 621 node1/lrm: status change wait_for_agent_lock => active info 623 node2/lrm: service vm:102 - start migrate to node 'node1' info 623 node2/lrm: service vm:102 - end migrate to node 'node1' info 640 node1/crm: service 'vm:102': state changed from 'migrate' to 'started' (node = node1) diff --git a/src/test/test-node-affinity-strict7/log.expect b/src/test/test-node-affinity-strict7/log.expect index 9c4e9f0b..ae8e43fb 100644 --- a/src/test/test-node-affinity-strict7/log.expect +++ b/src/test/test-node-affinity-strict7/log.expect @@ -28,21 +28,7 @@ info 25 node3/lrm: status change wait_for_agent_lock => active info 25 node3/lrm: starting service vm:101 info 25 node3/lrm: service status vm:101 started info 120 cmdlist: execute service vm:101 migrate node1 -info 120 node1/crm: got crm command: migrate vm:101 node1 -info 120 node1/crm: migrate service 'vm:101' to node 'node1' -info 120 node1/crm: service 'vm:101': state changed from 'started' to 'migrate' (node = node3, target = node1) -info 121 node1/lrm: got lock 'ha_agent_node1_lock' -info 121 node1/lrm: status change wait_for_agent_lock => active -info 125 node3/lrm: service vm:101 - start migrate to node 'node1' -info 125 node3/lrm: service vm:101 - end migrate to node 'node1' -info 140 node1/crm: service 'vm:101': state changed from 'migrate' to 'started' (node = node1) -info 140 node1/crm: migrate service 'vm:101' to node 'node3' (running) -info 140 node1/crm: service 'vm:101': state changed from 'started' to 'migrate' (node = node1, target = node3) -info 141 node1/lrm: service vm:101 - start migrate to node 'node3' -info 141 node1/lrm: 
service vm:101 - end migrate to node 'node3' -info 160 node1/crm: service 'vm:101': state changed from 'migrate' to 'started' (node = node3) -info 165 node3/lrm: starting service vm:101 -info 165 node3/lrm: service status vm:101 started +err 120 node1/crm: crm command 'migrate vm:101 node1' error - service 'vm:101' is not allowed on node 'node1' info 220 cmdlist: execute service vm:101 migrate node2 err 220 node1/crm: crm command 'migrate vm:101 node2' error - service 'vm:101' is not allowed on node 'node2' info 320 cmdlist: execute service vm:101 migrate node3 @@ -55,6 +41,8 @@ info 620 cmdlist: execute service vm:102 migrate node1 info 620 node1/crm: got crm command: migrate vm:102 node1 info 620 node1/crm: migrate service 'vm:102' to node 'node1' info 620 node1/crm: service 'vm:102': state changed from 'started' to 'migrate' (node = node2, target = node1) +info 621 node1/lrm: got lock 'ha_agent_node1_lock' +info 621 node1/lrm: status change wait_for_agent_lock => active info 623 node2/lrm: service vm:102 - start migrate to node 'node1' info 623 node2/lrm: service vm:102 - end migrate to node 'node1' info 640 node1/crm: service 'vm:102': state changed from 'migrate' to 'started' (node = node1) -- 2.47.3 _______________________________________________ pve-devel mailing list [email protected] https://lists.proxmox.com/cgi-bin/mailman/listinfo/pve-devel
