else we would try again endlessly if the service has no other
possible node where it can run, e.g. if it is restricted to a group.

This avoids various problems; in particular, if a service is
restricted to a single node, we could otherwise never get it out of
the 'fence' state again without manually editing the manager status.

Add a regression test for this.
Signed-off-by: Thomas Lamprecht
---
new patch
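
For reviewers, the resulting control flow in $recover_fenced_service is
roughly the following. This is a condensed sketch of the hunk below, not
the verbatim Manager.pm code; the select_service_node() call is
abbreviated, its arguments stay as they are:

    my $recovery_node = select_service_node(...);

    if ($recovery_node) {
        # a recovery node was found, relocate and restart the service there
        $cd->{node} = $sd->{node} = $recovery_node;
        &$change_service_state($self, $sid, 'started', node => $recovery_node);
    } else {
        # no possible node found, cannot recover: move the service to
        # 'error' instead of leaving it in 'fence', where recovery would
        # be retried on every manager round
        $haenv->log('err', "recovering service '$sid' from fenced node " .
            "'$fenced_node' failed, no recovery node found");
        &$change_service_state($self, $sid, 'error');
    }

The regression test triggers exactly this case: vm:102 is bound to node2
through a restricted group, so once node2 gets fenced there is no other
node select_service_node() may return.
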
 src/PVE/HA/Manager.pm                   |  3 ++-
 src/test/test-recovery1/README          |  4 ++++
 src/test/test-recovery1/cmdlist         |  4 ++++
 src/test/test-recovery1/groups          |  4 ++++
 src/test/test-recovery1/hardware_status |  5 +++++
 src/test/test-recovery1/log.expect      | 38 ++++++++++++++++++++++++++++++++++++++
 src/test/test-recovery1/manager_status  |  1 +
 src/test/test-recovery1/service_config  |  3 +++
8 files changed, 61 insertions(+), 1 deletion(-)
create mode 100644 src/test/test-recovery1/README
create mode 100644 src/test/test-recovery1/cmdlist
create mode 100644 src/test/test-recovery1/groups
create mode 100644 src/test/test-recovery1/hardware_status
create mode 100644 src/test/test-recovery1/log.expect
create mode 100644 src/test/test-recovery1/manager_status
create mode 100644 src/test/test-recovery1/service_config
diff --git a/src/PVE/HA/Manager.pm b/src/PVE/HA/Manager.pm
index e6dab7a..e58fc0b 100644
--- a/src/PVE/HA/Manager.pm
+++ b/src/PVE/HA/Manager.pm
@@ -292,9 +292,10 @@ my $recover_fenced_service = sub {
$cd->{node} = $sd->{node} = $recovery_node;
&$change_service_state($self, $sid, 'started', node => $recovery_node);
} else {
- # no node found, let the service in 'fence' state and try again
+ # no possible node found, cannot recover
$haenv->log('err', "recovering service '$sid' from fenced node " .
"'$fenced_node' failed, no recovery node found");
+ &$change_service_state($self, $sid, 'error');
}
};
diff --git a/src/test/test-recovery1/README b/src/test/test-recovery1/README
new file mode 100644
index 0000000..8753ad2
--- /dev/null
+++ b/src/test/test-recovery1/README
@@ -0,0 +1,4 @@
+Test what happens if a service needs to be recovered but
+select_service_node cannot return any possible node.
+
+Avoid endless loops by placing the service in the error state.
diff --git a/src/test/test-recovery1/cmdlist b/src/test/test-recovery1/cmdlist
new file mode 100644
index 0000000..4e4f36d
--- /dev/null
+++ b/src/test/test-recovery1/cmdlist
@@ -0,0 +1,4 @@
+[
+[ "power node1 on", "power node2 on", "power node3 on"],
+[ "network node2 off" ]
+]
diff --git a/src/test/test-recovery1/groups b/src/test/test-recovery1/groups
new file mode 100644
index 0000000..06c7f76
--- /dev/null
+++ b/src/test/test-recovery1/groups
@@ -0,0 +1,4 @@
+group: prefer_node2
+ nodes node2
+ restricted 1
+
diff --git a/src/test/test-recovery1/hardware_status b/src/test/test-recovery1/hardware_status
new file mode 100644
index 0000000..451beb1
--- /dev/null
+++ b/src/test/test-recovery1/hardware_status
@@ -0,0 +1,5 @@
+{
+ "node1": { "power": "off", "network": "off" },
+ "node2": { "power": "off", "network": "off" },
+ "node3": { "power": "off", "network": "off" }
+}
diff --git a/src/test/test-recovery1/log.expect b/src/test/test-recovery1/log.expect
new file mode 100644
index 0000000..ffd732a
--- /dev/null
+++ b/src/test/test-recovery1/log.expect
@@ -0,0 +1,38 @@
+info      0     hardware: starting simulation
+info     20      cmdlist: execute power node1 on
+info     20    node1/crm: status change startup => wait_for_quorum
+info     20    node1/lrm: status change startup => wait_for_agent_lock
+info     20      cmdlist: execute power node2 on
+info     20    node2/crm: status change startup => wait_for_quorum
+info     20    node2/lrm: status change startup => wait_for_agent_lock
+info     20      cmdlist: execute power node3 on
+info     20    node3/crm: status change startup => wait_for_quorum
+info     20    node3/lrm: status change startup => wait_for_agent_lock
+info     20    node1/crm: got lock 'ha_manager_lock'
+info     20    node1/crm: status change wait_for_quorum => master
+info     20    node1/crm: node 'node1': state changed from 'unknown' => 'online'
+info     20    node1/crm: node 'node2': state changed from 'unknown' => 'online'
+info     20    node1/crm: node 'node3': state changed from 'unknown' => 'online'
+info     20    node1/crm: adding new service 'vm:102' on node 'node2'
+info     22    node2/crm: status change wait_for_quorum => slave
+info     23    node2/lrm: got lock 'ha_agent_node2_lock'
+info     23    node2/lrm: status change wait_for_agent_lock => active
+info     23    node2/lrm: starting service vm:102
+info     23    node2/lrm: service status vm:102 started
+info     24    node3/crm: status change wait_for_quorum => slave
+info    120      cmdlist: execute network node2 off
+info    120    node1/crm: node 'node2': state changed from 'online' => 'unknown'