else we will try again endlessly if the service has no other possible node where it can run, e.g. if it's restricted.
This avoids various problems, especially if a service is configured to just one node we could never get the service out of the fence state again without manually hacking the manager status. Add a regression test for this. Signed-off-by: Thomas Lamprecht <t.lampre...@proxmox.com> --- new patch src/PVE/HA/Manager.pm | 3 ++- src/test/test-recovery1/README | 4 ++++ src/test/test-recovery1/cmdlist | 4 ++++ src/test/test-recovery1/groups | 4 ++++ src/test/test-recovery1/hardware_status | 5 +++++ src/test/test-recovery1/log.expect | 38 +++++++++++++++++++++++++++++++++ src/test/test-recovery1/manager_status | 1 + src/test/test-recovery1/service_config | 3 +++ 8 files changed, 61 insertions(+), 1 deletion(-) create mode 100644 src/test/test-recovery1/README create mode 100644 src/test/test-recovery1/cmdlist create mode 100644 src/test/test-recovery1/groups create mode 100644 src/test/test-recovery1/hardware_status create mode 100644 src/test/test-recovery1/log.expect create mode 100644 src/test/test-recovery1/manager_status create mode 100644 src/test/test-recovery1/service_config diff --git a/src/PVE/HA/Manager.pm b/src/PVE/HA/Manager.pm index e6dab7a..e58fc0b 100644 --- a/src/PVE/HA/Manager.pm +++ b/src/PVE/HA/Manager.pm @@ -292,9 +292,10 @@ my $recover_fenced_service = sub { $cd->{node} = $sd->{node} = $recovery_node; &$change_service_state($self, $sid, 'started', node => $recovery_node); } else { - # no node found, let the service in 'fence' state and try again + # no possible node found, cannot recover $haenv->log('err', "recovering service '$sid' from fenced node " . "'$fenced_node' failed, no recovery node found"); + &$change_service_state($self, $sid, 'error'); } }; diff --git a/src/test/test-recovery1/README b/src/test/test-recovery1/README new file mode 100644 index 0000000..8753ad2 --- /dev/null +++ b/src/test/test-recovery1/README @@ -0,0 +1,4 @@ +Test what happens if a service needs to get recovered but +select_service_node cannot return any possible node. 
+ +Avoid endless loops by placing the service in the error state. diff --git a/src/test/test-recovery1/cmdlist b/src/test/test-recovery1/cmdlist new file mode 100644 index 0000000..4e4f36d --- /dev/null +++ b/src/test/test-recovery1/cmdlist @@ -0,0 +1,4 @@ +[ + [ "power node1 on", "power node2 on", "power node3 on"], + [ "network node2 off" ] +] diff --git a/src/test/test-recovery1/groups b/src/test/test-recovery1/groups new file mode 100644 index 0000000..06c7f76 --- /dev/null +++ b/src/test/test-recovery1/groups @@ -0,0 +1,4 @@ +group: prefer_node2 + nodes node2 + restricted 1 + diff --git a/src/test/test-recovery1/hardware_status b/src/test/test-recovery1/hardware_status new file mode 100644 index 0000000..451beb1 --- /dev/null +++ b/src/test/test-recovery1/hardware_status @@ -0,0 +1,5 @@ +{ + "node1": { "power": "off", "network": "off" }, + "node2": { "power": "off", "network": "off" }, + "node3": { "power": "off", "network": "off" } +} diff --git a/src/test/test-recovery1/log.expect b/src/test/test-recovery1/log.expect new file mode 100644 index 0000000..ffd732a --- /dev/null +++ b/src/test/test-recovery1/log.expect @@ -0,0 +1,38 @@ +info 0 hardware: starting simulation +info 20 cmdlist: execute power node1 on +info 20 node1/crm: status change startup => wait_for_quorum +info 20 node1/lrm: status change startup => wait_for_agent_lock +info 20 cmdlist: execute power node2 on +info 20 node2/crm: status change startup => wait_for_quorum +info 20 node2/lrm: status change startup => wait_for_agent_lock +info 20 cmdlist: execute power node3 on +info 20 node3/crm: status change startup => wait_for_quorum +info 20 node3/lrm: status change startup => wait_for_agent_lock +info 20 node1/crm: got lock 'ha_manager_lock' +info 20 node1/crm: status change wait_for_quorum => master +info 20 node1/crm: node 'node1': state changed from 'unknown' => 'online' +info 20 node1/crm: node 'node2': state changed from 'unknown' => 'online' +info 20 node1/crm: node 'node3': state changed 
from 'unknown' => 'online' +info 20 node1/crm: adding new service 'vm:102' on node 'node2' +info 22 node2/crm: status change wait_for_quorum => slave +info 23 node2/lrm: got lock 'ha_agent_node2_lock' +info 23 node2/lrm: status change wait_for_agent_lock => active +info 23 node2/lrm: starting service vm:102 +info 23 node2/lrm: service status vm:102 started +info 24 node3/crm: status change wait_for_quorum => slave +info 120 cmdlist: execute network node2 off +info 120 node1/crm: node 'node2': state changed from 'online' => 'unknown' +info 122 node2/crm: status change slave => wait_for_quorum +info 123 node2/lrm: status change active => lost_agent_lock +info 160 node1/crm: service 'vm:102': state changed from 'started' to 'fence' +info 160 node1/crm: node 'node2': state changed from 'unknown' => 'fence' +info 164 watchdog: execute power node2 off +info 163 node2/crm: killed by poweroff +info 164 node2/lrm: killed by poweroff +info 164 hardware: server 'node2' stopped by poweroff (watchdog) +info 240 node1/crm: got lock 'ha_agent_node2_lock' +info 240 node1/crm: fencing: acknowleged - got agent lock for node 'node2' +info 240 node1/crm: node 'node2': state changed from 'fence' => 'unknown' +err 240 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found +info 240 node1/crm: service 'vm:102': state changed from 'fence' to 'error' +info 720 hardware: exit simulation - done diff --git a/src/test/test-recovery1/manager_status b/src/test/test-recovery1/manager_status new file mode 100644 index 0000000..0967ef4 --- /dev/null +++ b/src/test/test-recovery1/manager_status @@ -0,0 +1 @@ +{} diff --git a/src/test/test-recovery1/service_config b/src/test/test-recovery1/service_config new file mode 100644 index 0000000..39a05e5 --- /dev/null +++ b/src/test/test-recovery1/service_config @@ -0,0 +1,3 @@ +{ + "vm:102": { "node": "node2", "state": "enabled", "group": "prefer_node2" } +} -- 2.1.4 _______________________________________________ 
pve-devel mailing list pve-devel@pve.proxmox.com http://pve.proxmox.com/cgi-bin/mailman/listinfo/pve-devel