GutoVeronezi commented on a change in pull request #5783:
URL: https://github.com/apache/cloudstack/pull/5783#discussion_r773186546



##########
File path: 
server/src/main/java/com/cloud/configuration/ConfigurationManagerImpl.java
##########
@@ -459,6 +463,20 @@
     public static final ConfigKey<Boolean> MIGRATE_VM_ACROSS_CLUSTERS = new 
ConfigKey<Boolean>(Boolean.class, "migrate.vm.across.clusters", "Advanced", 
"false",
             "Indicates whether the VM can be migrated to different cluster if 
no host is found in same cluster",true, ConfigKey.Scope.Zone, null);
 
+    public static final ConfigKey<Long> KVM_HEARTBEAT_UPDATE_MAX_RETRIES_CK =  
new ConfigKey<>("Advanced", Long.class, KVM_HEARTBEAT_UPDATE_MAX_RETRIES, "5",
+            "The maximum retries of kvm heartbeat to write to storage",

Review comment:
       ```suggestion
               "The maximum retries of KVM heartbeat to write to storage.",
   ```

##########
File path: 
server/src/main/java/com/cloud/configuration/ConfigurationManagerImpl.java
##########
@@ -459,6 +463,20 @@
     public static final ConfigKey<Boolean> MIGRATE_VM_ACROSS_CLUSTERS = new 
ConfigKey<Boolean>(Boolean.class, "migrate.vm.across.clusters", "Advanced", 
"false",
             "Indicates whether the VM can be migrated to different cluster if 
no host is found in same cluster",true, ConfigKey.Scope.Zone, null);
 
+    public static final ConfigKey<Long> KVM_HEARTBEAT_UPDATE_MAX_RETRIES_CK =  
new ConfigKey<>("Advanced", Long.class, KVM_HEARTBEAT_UPDATE_MAX_RETRIES, "5",
+            "The maximum retries of kvm heartbeat to write to storage",
+            true, ConfigKey.Scope.Global);
+
+    public static final ConfigKey<Long> KVM_HEARTBEAT_UPDATE_RETRY_SLEEP_CK =  
new ConfigKey<>("Advanced", Long.class, KVM_HEARTBEAT_UPDATE_RETRY_SLEEP, 
"10000",
+            "The sleep time, in milliseconds, between two kvm heartbeats to 
write to storage",
+            true, ConfigKey.Scope.Global);
+    public static final ConfigKey<Long> KVM_HEARTBEAT_UPDATE_TIMEOUT_CK = new 
ConfigKey<>("Advanced", Long.class, KVM_HEARTBEAT_UPDATE_TIMEOUT, "60000",
+            "Timeout(in milliseconds) that kvm heartbeat to write to storage",
+            true, ConfigKey.Scope.Global);
+    public static final ConfigKey<String> KVM_HEARTBEAT_FAILURE_ACTION_CK = 
new ConfigKey<>("Advanced", String.class, KVM_HEARTBEAT_FAILURE_ACTION, 
"hardreset",
+            "The action for heartbeat write failures on KVM host. The valid 
value are 'hardreset' (default), 'stopagent', 'destroyvms'",

Review comment:
       ```suggestion
               "The action for heartbeat write failures on KVM host. The valid 
values are 'hardreset' (default), 'stopagent', 'destroyvms'",
   ```

##########
File path: 
server/src/main/java/com/cloud/configuration/ConfigurationManagerImpl.java
##########
@@ -459,6 +463,20 @@
     public static final ConfigKey<Boolean> MIGRATE_VM_ACROSS_CLUSTERS = new 
ConfigKey<Boolean>(Boolean.class, "migrate.vm.across.clusters", "Advanced", 
"false",
             "Indicates whether the VM can be migrated to different cluster if 
no host is found in same cluster",true, ConfigKey.Scope.Zone, null);
 
+    public static final ConfigKey<Long> KVM_HEARTBEAT_UPDATE_MAX_RETRIES_CK =  
new ConfigKey<>("Advanced", Long.class, KVM_HEARTBEAT_UPDATE_MAX_RETRIES, "5",
+            "The maximum retries of kvm heartbeat to write to storage",
+            true, ConfigKey.Scope.Global);
+
+    public static final ConfigKey<Long> KVM_HEARTBEAT_UPDATE_RETRY_SLEEP_CK =  
new ConfigKey<>("Advanced", Long.class, KVM_HEARTBEAT_UPDATE_RETRY_SLEEP, 
"10000",
+            "The sleep time, in milliseconds, between two kvm heartbeats to 
write to storage",

Review comment:
       ```suggestion
               "The sleep time, in milliseconds, between two KVM heartbeats to 
write to storage.",
   ```

##########
File path: 
server/src/main/java/com/cloud/configuration/ConfigurationManagerImpl.java
##########
@@ -459,6 +463,20 @@
     public static final ConfigKey<Boolean> MIGRATE_VM_ACROSS_CLUSTERS = new 
ConfigKey<Boolean>(Boolean.class, "migrate.vm.across.clusters", "Advanced", 
"false",
             "Indicates whether the VM can be migrated to different cluster if 
no host is found in same cluster",true, ConfigKey.Scope.Zone, null);
 
+    public static final ConfigKey<Long> KVM_HEARTBEAT_UPDATE_MAX_RETRIES_CK =  
new ConfigKey<>("Advanced", Long.class, KVM_HEARTBEAT_UPDATE_MAX_RETRIES, "5",
+            "The maximum retries of kvm heartbeat to write to storage",
+            true, ConfigKey.Scope.Global);
+
+    public static final ConfigKey<Long> KVM_HEARTBEAT_UPDATE_RETRY_SLEEP_CK =  
new ConfigKey<>("Advanced", Long.class, KVM_HEARTBEAT_UPDATE_RETRY_SLEEP, 
"10000",
+            "The sleep time, in milliseconds, between two kvm heartbeats to 
write to storage",
+            true, ConfigKey.Scope.Global);
+    public static final ConfigKey<Long> KVM_HEARTBEAT_UPDATE_TIMEOUT_CK = new 
ConfigKey<>("Advanced", Long.class, KVM_HEARTBEAT_UPDATE_TIMEOUT, "60000",
+            "Timeout(in milliseconds) that kvm heartbeat to write to storage",
+            true, ConfigKey.Scope.Global);
+    public static final ConfigKey<String> KVM_HEARTBEAT_FAILURE_ACTION_CK = 
new ConfigKey<>("Advanced", String.class, KVM_HEARTBEAT_FAILURE_ACTION, 
"hardreset",
+            "The action for heartbeat write failures on KVM host. The valid 
value are 'hardreset' (default), 'stopagent', 'destroyvms'",

Review comment:
       The value `noaction` should be explicit here too.

##########
File path: 
server/src/main/java/com/cloud/configuration/ConfigurationManagerImpl.java
##########
@@ -459,6 +463,20 @@
     public static final ConfigKey<Boolean> MIGRATE_VM_ACROSS_CLUSTERS = new 
ConfigKey<Boolean>(Boolean.class, "migrate.vm.across.clusters", "Advanced", 
"false",
             "Indicates whether the VM can be migrated to different cluster if 
no host is found in same cluster",true, ConfigKey.Scope.Zone, null);
 
+    public static final ConfigKey<Long> KVM_HEARTBEAT_UPDATE_MAX_RETRIES_CK =  
new ConfigKey<>("Advanced", Long.class, KVM_HEARTBEAT_UPDATE_MAX_RETRIES, "5",
+            "The maximum retries of kvm heartbeat to write to storage",
+            true, ConfigKey.Scope.Global);
+
+    public static final ConfigKey<Long> KVM_HEARTBEAT_UPDATE_RETRY_SLEEP_CK =  
new ConfigKey<>("Advanced", Long.class, KVM_HEARTBEAT_UPDATE_RETRY_SLEEP, 
"10000",
+            "The sleep time, in milliseconds, between two kvm heartbeats to 
write to storage",
+            true, ConfigKey.Scope.Global);
+    public static final ConfigKey<Long> KVM_HEARTBEAT_UPDATE_TIMEOUT_CK = new 
ConfigKey<>("Advanced", Long.class, KVM_HEARTBEAT_UPDATE_TIMEOUT, "60000",
+            "Timeout(in milliseconds) that kvm heartbeat to write to storage",

Review comment:
       ```suggestion
               "Timeout, in milliseconds, to KVM heartbeat writes to storage.",
   ```

##########
File path: 
engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java
##########
@@ -1840,4 +1878,91 @@ public void propagateChangeToAgents(Map<String, String> 
params) {
             sendCommandToAgents(hostsPerZone, params);
         }
     }
+
+    protected class ScanDisconnectedHostsTask extends ManagedContextRunnable {
+
+        @Override
+        protected void runInContext() {
+            try {
+                ManagementServerHostVO msHost = 
_msHostDao.findOneInUpState(new Filter(ManagementServerHostVO.class, "id", 
true, 0L, 1L));
+                if (msHost == null || (msHost.getMsid() != _nodeId)) {
+                    s_logger.debug("Skipping disconnected hosts scan task");
+                    for (Long hostId : _investigateTasksMap.keySet()) {
+                        cancelInvestigationTask(hostId);
+                    }
+                    return;
+                }
+                for (HostVO host : _hostDao.listByType(Host.Type.Routing)) {
+                    if (host.getStatus() == Status.Disconnected) {
+                        scheduleInvestigationTask(host.getId());
+                    }
+                }
+            } catch (final Exception e) {
+                s_logger.error("Exception caught while scanning disconnected 
hosts : ", e);
+            }
+        }
+    }
+
+    protected class InvestigationTask extends ManagedContextRunnable {
+        Long _hostId;
+
+        InvestigationTask(final Long hostId) {
+            _hostId = hostId;
+        }
+
+        @Override
+        protected void runInContext() {
+            try {
+                final long hostId = _hostId;
+                s_logger.info("Investigating host " + hostId + " to determine 
its actual state");
+                HostVO host = _hostDao.findById(hostId);
+                if (host == null) {
+                    s_logger.info("Cancelling investigation on host " + hostId 
+ " which might has been removed");
+                    cancelInvestigationTask(hostId);
+                    return;
+                }
+                if (host.getStatus() != Status.Disconnected) {
+                    s_logger.info("Cancelling investigation on host " + hostId 
+ " in status " + host.getStatus());
+                    cancelInvestigationTask(hostId);
+                    return;
+                }
+                Status determinedState = _haMgr.investigate(hostId);
+                s_logger.info("Investigators determine the status of host " + 
hostId + " is " + determinedState);
+                if (determinedState == Status.Down) {
+                    agentStatusTransitTo(host, Status.Event.HostDown, _nodeId);
+                    s_logger.info("Scheduling VMs restart on host " + hostId + 
" which is Down");
+                    _haMgr.scheduleRestartForVmsOnHost(host, true);
+                    s_logger.info("Cancelling investigation on host " + hostId 
+ " which is Down");
+                    cancelInvestigationTask(hostId);
+                }
+            } catch (final Exception e) {

Review comment:
       Do we need a catch Pokémon (`catch them all`) here?

##########
File path: 
engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java
##########
@@ -1840,4 +1878,91 @@ public void propagateChangeToAgents(Map<String, String> 
params) {
             sendCommandToAgents(hostsPerZone, params);
         }
     }
+
+    protected class ScanDisconnectedHostsTask extends ManagedContextRunnable {
+
+        @Override
+        protected void runInContext() {
+            try {
+                ManagementServerHostVO msHost = 
_msHostDao.findOneInUpState(new Filter(ManagementServerHostVO.class, "id", 
true, 0L, 1L));
+                if (msHost == null || (msHost.getMsid() != _nodeId)) {
+                    s_logger.debug("Skipping disconnected hosts scan task");
+                    for (Long hostId : _investigateTasksMap.keySet()) {
+                        cancelInvestigationTask(hostId);
+                    }
+                    return;
+                }
+                for (HostVO host : _hostDao.listByType(Host.Type.Routing)) {
+                    if (host.getStatus() == Status.Disconnected) {
+                        scheduleInvestigationTask(host.getId());
+                    }
+                }
+            } catch (final Exception e) {
+                s_logger.error("Exception caught while scanning disconnected 
hosts : ", e);
+            }
+        }
+    }
+
+    protected class InvestigationTask extends ManagedContextRunnable {
+        Long _hostId;
+
+        InvestigationTask(final Long hostId) {
+            _hostId = hostId;
+        }
+
+        @Override
+        protected void runInContext() {
+            try {
+                final long hostId = _hostId;
+                s_logger.info("Investigating host " + hostId + " to determine 
its actual state");
+                HostVO host = _hostDao.findById(hostId);
+                if (host == null) {
+                    s_logger.info("Cancelling investigation on host " + hostId 
+ " which might has been removed");
+                    cancelInvestigationTask(hostId);
+                    return;
+                }
+                if (host.getStatus() != Status.Disconnected) {
+                    s_logger.info("Cancelling investigation on host " + hostId 
+ " in status " + host.getStatus());
+                    cancelInvestigationTask(hostId);
+                    return;
+                }
+                Status determinedState = _haMgr.investigate(hostId);
+                s_logger.info("Investigators determine the status of host " + 
hostId + " is " + determinedState);
+                if (determinedState == Status.Down) {
+                    agentStatusTransitTo(host, Status.Event.HostDown, _nodeId);
+                    s_logger.info("Scheduling VMs restart on host " + hostId + 
" which is Down");
+                    _haMgr.scheduleRestartForVmsOnHost(host, true);
+                    s_logger.info("Cancelling investigation on host " + hostId 
+ " which is Down");
+                    cancelInvestigationTask(hostId);
+                }
+            } catch (final Exception e) {
+                s_logger.error("Exception caught while handling investigation 
task: ", e);
+            }
+        }
+    }
+
+    private void scheduleInvestigationTask(final Long hostId) {
+        ScheduledFuture future = _investigateTasksMap.get(hostId);
+        if (future != null) {
+            s_logger.info("There is already a task to investigate host " + 
hostId);
+        } else {
+            ScheduledFuture scheduledFuture = 
_investigatorExecutor.scheduleWithFixedDelay(new InvestigationTask(hostId), 
InvestigateDisconnectedHostsInterval.value(),
+                    InvestigateDisconnectedHostsInterval.value(), 
TimeUnit.SECONDS);
+            _investigateTasksMap.put(hostId, scheduledFuture);
+            s_logger.info("Scheduled a task to investigate host " + hostId);
+        }
+    }
+
+    private void cancelInvestigationTask(final Long hostId) {
+        ScheduledFuture future = _investigateTasksMap.get(hostId);
+        if (future != null) {
+            try {
+                future.cancel(false);
+                s_logger.info("Cancelled a task to investigate host " + 
hostId);
+                _investigateTasksMap.remove(hostId);
+            } catch (Exception e) {

Review comment:
       Do we need a catch Pokémon (`catch them all`) here?

##########
File path: 
engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java
##########
@@ -1840,4 +1878,91 @@ public void propagateChangeToAgents(Map<String, String> 
params) {
             sendCommandToAgents(hostsPerZone, params);
         }
     }
+
+    protected class ScanDisconnectedHostsTask extends ManagedContextRunnable {
+
+        @Override
+        protected void runInContext() {
+            try {
+                ManagementServerHostVO msHost = 
_msHostDao.findOneInUpState(new Filter(ManagementServerHostVO.class, "id", 
true, 0L, 1L));
+                if (msHost == null || (msHost.getMsid() != _nodeId)) {

Review comment:
       ```suggestion
                   if (msHost == null || msHost.getMsid() != _nodeId) {
   ```

##########
File path: agent/src/main/java/com/cloud/agent/properties/AgentProperties.java
##########
@@ -39,14 +39,6 @@
      */
     public static final Property<Integer> VM_MIGRATE_DOMAIN_RETRIEVE_TIMEOUT = 
new Property<Integer>("vm.migrate.domain.retrieve.timeout", 10);
 
-    /**
-     * Reboot host and alert management on heartbeat timeout. <br>
-     * Data type: boolean.<br>
-     * Default value: true.
-     */
-    public static final Property<Boolean> 
REBOOT_HOST_AND_ALERT_MANAGEMENT_ON_HEARTBEAT_TIMEOUT
-        = new 
Property<Boolean>("reboot.host.and.alert.management.on.heartbeat.timeout", 
true);

Review comment:
       If we gonna remove this property, we need to explicit the option in the 
new feature that behave the same way and advise operators on how to configure 
it.
   
   Also, we should remove from `agent.properties` too and wherever it is used.

##########
File path: 
engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java
##########
@@ -1840,4 +1878,91 @@ public void propagateChangeToAgents(Map<String, String> 
params) {
             sendCommandToAgents(hostsPerZone, params);
         }
     }
+
+    protected class ScanDisconnectedHostsTask extends ManagedContextRunnable {
+
+        @Override
+        protected void runInContext() {
+            try {
+                ManagementServerHostVO msHost = 
_msHostDao.findOneInUpState(new Filter(ManagementServerHostVO.class, "id", 
true, 0L, 1L));
+                if (msHost == null || (msHost.getMsid() != _nodeId)) {
+                    s_logger.debug("Skipping disconnected hosts scan task");
+                    for (Long hostId : _investigateTasksMap.keySet()) {
+                        cancelInvestigationTask(hostId);
+                    }
+                    return;
+                }
+                for (HostVO host : _hostDao.listByType(Host.Type.Routing)) {
+                    if (host.getStatus() == Status.Disconnected) {
+                        scheduleInvestigationTask(host.getId());
+                    }
+                }
+            } catch (final Exception e) {

Review comment:
       Do we need a catch Pokémon (`catch them all`) here?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to