This is an automated email from the ASF dual-hosted git repository.
dahn pushed a commit to branch 4.11
in repository https://gitbox.apache.org/repos/asf/cloudstack.git
The following commit(s) were added to refs/heads/4.11 by this push:
new 023dcec CLOUDSTACK-10310 Fix KVM reboot on storage issue (#2722)
023dcec is described below
commit 023dcec5ef2e38091c0aacda1e0fae67fd6c4553
Author: Slair1 <[email protected]>
AuthorDate: Mon Aug 20 03:28:03 2018 -0500
CLOUDSTACK-10310 Fix KVM reboot on storage issue (#2722)
---
.../src/com/cloud/hypervisor/kvm/resource/KVMHABase.java | 3 ++-
.../com/cloud/hypervisor/kvm/resource/KVMHAMonitor.java | 14 +++++++++++---
scripts/vm/hypervisor/kvm/kvmheartbeat.sh | 4 ++--
3 files changed, 15 insertions(+), 6 deletions(-)
diff --git a/plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/KVMHABase.java b/plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/KVMHABase.java
index be5ab39..f180848 100644
--- a/plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/KVMHABase.java
+++ b/plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/KVMHABase.java
@@ -34,7 +34,8 @@ public class KVMHABase {
protected static String s_heartBeatPath;
protected long _heartBeatUpdateTimeout = 60000;
protected long _heartBeatUpdateFreq = 60000;
- protected long _heartBeatUpdateMaxRetry = 3;
+ protected long _heartBeatUpdateMaxTries = 5;
+ protected long _heartBeatUpdateRetrySleep = 15000;
public static enum PoolType {
PrimaryStorage, SecondaryStorage
diff --git a/plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/KVMHAMonitor.java b/plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/KVMHAMonitor.java
index 0cebb4c..8a11b7f 100644
--- a/plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/KVMHAMonitor.java
+++ b/plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/KVMHAMonitor.java
@@ -119,7 +119,8 @@ public class KVMHAMonitor extends KVMHABase implements Runnable {
}
String result = null;
- for (int i = 0; i < 5; i++) {
+ // Try multiple times, but sleep in between tries to ensure it isn't a short lived transient error
+ for (int i = 1; i <= _heartBeatUpdateMaxTries; i++) {
Script cmd = new Script(s_heartBeatPath,
_heartBeatUpdateTimeout, s_logger);
cmd.add("-i", primaryStoragePool._poolIp);
cmd.add("-p", primaryStoragePool._poolMountSourcePath);
@@ -127,14 +128,21 @@ public class KVMHAMonitor extends KVMHABase implements Runnable {
cmd.add("-h", _hostIP);
result = cmd.execute();
if (result != null) {
- s_logger.warn("write heartbeat failed: " + result + ", retry: " + i);
+ s_logger.warn("write heartbeat failed: " + result + ", try: " + i + " of " + _heartBeatUpdateMaxTries);
+ try {
+ Thread.sleep(_heartBeatUpdateRetrySleep);
+ } catch (InterruptedException e) {
+ s_logger.debug("[ignored] interupted between heartbeat retries.");
+ }
} else {
break;
}
}
if (result != null) {
- s_logger.warn("write heartbeat failed: " + result + "; reboot the host");
+ // Stop cloudstack-agent if can't write to heartbeat file.
+ // This will raise an alert on the mgmt server
+ s_logger.warn("write heartbeat failed: " + result + "; stopping cloudstack-agent");
Script cmd = new Script(s_heartBeatPath,
_heartBeatUpdateTimeout, s_logger);
cmd.add("-i", primaryStoragePool._poolIp);
cmd.add("-p", primaryStoragePool._poolMountSourcePath);
diff --git a/scripts/vm/hypervisor/kvm/kvmheartbeat.sh b/scripts/vm/hypervisor/kvm/kvmheartbeat.sh
index 7c8ee67..30ca72a 100755
--- a/scripts/vm/hypervisor/kvm/kvmheartbeat.sh
+++ b/scripts/vm/hypervisor/kvm/kvmheartbeat.sh
@@ -155,10 +155,10 @@ then
exit 0
elif [ "$cflag" == "1" ]
then
- /usr/bin/logger -t heartbeat "kvmheartbeat.sh rebooted system because it was unable to write the heartbeat to the storage."
+ /usr/bin/logger -t heartbeat "kvmheartbeat.sh stopped cloudstack-agent because it was unable to write the heartbeat to the storage."
sync &
sleep 5
- echo b > /proc/sysrq-trigger
+ service cloudstack-agent stop
exit $?
else
write_hbLog