Greg Padgett has uploaded a new change for review. Change subject: WIP agent: update logic and use state machine ......................................................................
WIP agent: update logic and use state machine Contains some updates to make runtime smoother, as well as refactoring the code to control the vm using a state machine rather than procedural code. This makes the control flow more clear and makes logic changes easier to implement. Change-Id: Iad9906d50a5fffb4c9f362b43f0064bf6be4dbde Signed-off-by: Greg Padgett <[email protected]> --- M ovirt_hosted_engine_ha/agent/hosted_engine.py 1 file changed, 208 insertions(+), 70 deletions(-) git pull ssh://gerrit.ovirt.org:29418/ovirt-hosted-engine-ha refs/changes/51/18051/1 diff --git a/ovirt_hosted_engine_ha/agent/hosted_engine.py b/ovirt_hosted_engine_ha/agent/hosted_engine.py index 3fdeb9e..fab0a4f 100644 --- a/ovirt_hosted_engine_ha/agent/hosted_engine.py +++ b/ovirt_hosted_engine_ha/agent/hosted_engine.py @@ -46,6 +46,14 @@ 'vm-up good-health-status': 3, } + class States(object): + ENTRY = 'ENTRY' + OFF = 'OFF' + START = 'START' + ON = 'ON' + STOP = 'STOP' + MIGRATE = 'MIGRATE' + def __init__(self, shutdown_requested_callback): """ Initialize hosted engine monitoring logic. shutdown_requested_callback @@ -62,12 +70,22 @@ self._required_monitors = self._get_required_monitors() self._local_monitors = {} self._local_state = {} + self._init_local_state() self._all_host_stats = {} self._sd_path = None self._metadata_path = None self._sanlock_initialized = False + + self._vm_state_actions = { + self.States.ENTRY: self._handle_entry, + self.States.OFF: self._handle_off, + self.States.START: self._handle_start, + self.States.ON: self._handle_on, + self.States.STOP: self._handle_stop, + self.States.MIGRATE: self._handle_migrate, + } def _get_required_monitors(self): """ @@ -125,6 +143,23 @@ 'vm_uuid': self._config.get(config.VM, config.VM_UUID)} }) return req + + def _init_local_state(self): + """ + Initialize self._local_state dict (and document the entries). + """ + # Local timestamp of most recent engine vm startup attempt + self._local_state['last-engine-retry'] = 0 + + # Count of recent engine vm startup attempts + self._local_state['engine-retries'] = 0 + + # Host id of local host + self._local_state['host-id'] = int(self._config.get(config.ENGINE, + config.HOST_ID)) + + # Initial state to track engine vm status in state machine + self._local_state['current-state'] = self.States.ENTRY def _get_lf_args(self, lf_class): return {'lf_class': lf_class, @@ -260,7 +295,7 @@ def _initialize_sanlock(self): self._cond_start_service('sanlock') - host_id = int(self._config.get(config.ENGINE, config.HOST_ID)) + host_id = self._local_state['host-id'] self._metadata_dir = os.path.join(self._sd_path, constants.SD_METADATA_DIR) lease_file = os.path.join(self._metadata_dir, @@ -308,10 +343,11 @@ self._broker.get_monitor_status(v['id'])) self._log.debug("Refresh complete") - # (re-)initialize retry status variables if either a) they've not - # been initialized yet, or b) the retry window has expired. - if ('last-engine-retry' not in self._local_state - or self._local_state['last-engine-retry'] + time.time() + # re-initialize retry status variables if the retry window + # has expired. 
+ if ((self._local_state['last-engine-retry'] != 0 + or self._local_state['engine-retries'] != 0) + and self._local_state['last-engine-retry'] + time.time() < constants.ENGINE_RETRY_EXPIRATION_SECS): self._local_state['last-engine-retry'] = 0 self._local_state['engine-retries'] = 0 @@ -391,8 +427,7 @@ .format(md_parse_vers=constants.METADATA_PARSE_VERSION, md_feature_vers=constants.METADATA_FEATURE_VERSION, ts_int=ts, - host_id=self._config.get(config.ENGINE, - config.HOST_ID), + host_id=self._local_state['host-id'], score=score, engine_status=lm['engine-health']['status'], name=socket.gethostname())) @@ -409,8 +444,7 @@ md_feature_vers=constants.METADATA_FEATURE_VERSION, ts_int=ts, ts_str=time.ctime(ts), - host_id=self._config.get(config.ENGINE, - config.HOST_ID), + host_id=self._local_state['host-id'], score=score)) for (k, v) in sorted(lm.iteritems()): info += "{0}={1}\n".format(k, str(v['status'])) @@ -443,16 +477,6 @@ self._log.error("Malformed metadata:" " host id '%s' not an integer", host_id) continue - - if host_id not in self._all_host_stats: - self._all_host_stats[host_id] = { - 'first-update': True, - 'last-update-local-ts': None, - 'last-update-host-ts': None, - 'alive': False, - 'score': 0, - 'engine-status': None, - 'hostname': '(unknown)'} if len(data) < 512: self._log.error("Malformed metadata for host %d:" @@ -487,8 +511,21 @@ engine_status = str(tokens[5]) # convert from bytearray hostname = str(tokens[6]) # convert from bytearray + if host_id not in self._all_host_stats: + self._all_host_stats[host_id] = { + 'first-update': True, + 'last-update-local-ts': local_ts, + 'last-update-host-ts': None, + 'alive': 'unknown', + 'score': 0, + 'engine-status': None, + 'hostname': '(unknown)'} + if host_ts != self._all_host_stats[host_id]['last-update-host-ts']: - # Track first update in order to accurately judge liveness + # Track first update in order to accurately judge liveness. + # If last-update-host-ts is 0, then first-update stays True + # which indicates that we cannot use this last-update-local-ts + # as an indication of host liveness. if self._all_host_stats[host_id]['last-update-host-ts']: self._all_host_stats[host_id]['first-update'] = False @@ -501,17 +538,20 @@ # All updated, now determine if hosts are alive/updating for host_id, attr in self._all_host_stats.iteritems(): - if attr['first-update']: - self._log.info("Watching for update from host %s (id %d)", - attr['hostname'], host_id, - extra=self._get_lf_args(self.LF_HOST_UPDATE)) - elif (attr['last-update-local-ts'] - + constants.HOST_ALIVE_TIMEOUT_SECS) <= local_ts: + if (attr['last-update-local-ts'] + + constants.HOST_ALIVE_TIMEOUT_SECS) <= local_ts: + # Check for timeout regardless of the first-update flag status: + # a timeout in this case means we read stale data, but still + # must mark the host as dead. # TODO newer sanlocks can report this through get_hosts() self._log.error("Host %s (id %d) is no longer updating its" " metadata", attr['hostname'], host_id, extra=self._get_lf_args(self.LF_HOST_UPDATE)) self._all_host_stats[host_id]['alive'] = False + elif attr['first-update']: + self._log.info("Waiting for first update from host %s (id %d)", + attr['hostname'], host_id, + extra=self._get_lf_args(self.LF_HOST_UPDATE)) else: self._log.info("Host %s (id %d) metadata updated", attr['hostname'], host_id, @@ -527,60 +567,56 @@ """ Start or stop engine on current host based on hosts' statistics. 
""" - local_host_id = int(self._config.get(config.ENGINE, config.HOST_ID)) - engine_status = self._all_host_stats[local_host_id]['engine-status'] - engine_status_host_id = local_host_id - best_score = self._all_host_stats[local_host_id]['score'] - best_score_host_id = local_host_id + local_host_id = self._local_state['host-id'] - if engine_status == 'None': + if self._all_host_stats[local_host_id]['engine-status'] == 'None': self._log.info("Unknown local engine vm status, no actions taken") return + cur_stats = { + 'best-engine-status': + self._all_host_stats[local_host_id]['engine-status'], + 'best-engine-status-host-id': local_host_id, + 'best-score': self._all_host_stats[local_host_id]['score'], + 'best-score-host-id': local_host_id, + } + for host_id, stats in self._all_host_stats.iteritems(): - if self._engine_status_score(stats['engine-status']) \ - > self._engine_status_score(engine_status): - engine_status = stats['engine-status'] - engine_status_host_id = host_id + if stats['alive'] == 'unknown': + # TODO probably unnecessary to wait, sanlock will prevent + # 2 from starting up accidentally + self._log.info("Unknown host state for id %d," + " waiting for initialization", host_id) + return + elif not stats['alive']: + continue + + if self._get_engine_status_score(stats['engine-status']) \ + > self._get_engine_status_score( + cur_stats['best-engine-status']): + cur_stats['best-engine-status'] = stats['engine-status'] + cur_stats['best-engine-status-host-id'] = host_id # Prefer local score if equal to remote score - if stats['score'] > best_score: - best_score_host_id = host_id + if stats['score'] > cur_stats['best-score']: + cur_stats['best-score'] = stats['score'] + cur_stats['best-score-host-id'] = host_id - if engine_status[:5] == 'vm-up': - # FIXME timeout for bad-host-status: if up and no engine, try to - # migrate; if can't migrate, reduce local score and shut down - self._log.info( - "Engine vm is running on host %s (id %d)", - self._all_host_stats[engine_status_host_id]['hostname'], - engine_status_host_id, - extra=self._get_lf_args(self.LF_ENGINE_HEALTH) - ) - return + # FIXME set maintenance flag + cur_stats['maintenance'] = False - # FIXME remote db down, other statuses + self._cur_stats = cur_stats - # FIXME cluster-wide engine maintenance bit + state = self._local_state['current-state'] + yield_ = False + # Process the states until it's time to sleep, indicated by the + # state handler returning yield_ as True. + while not yield_: + self._log.debug("Processing engine state %s", state) + state, yield_ = self._vm_state_actions[state]() + self._log.debug("Next engine state %s", state) + self._local_state['current-state'] = state - if best_score_host_id != local_host_id: - self._log.info("Engine down, local host does not have best score", - extra=self._get_lf_args(self.LF_ENGINE_HEALTH)) - return - - self._log.error("Engine down and local host has best score (%d)," - " attempting to start engine VM", best_score, - extra=self._get_lf_args(self.LF_ENGINE_HEALTH)) - try: - self._start_engine_vm() - except Exception as e: - self._log.error("Failed to start engine VM: %s", str(e)) - self._local_state['last-engine-retry'] = time.time() - self._local_state['engine-retries'] += 1 - # TODO mail for error (each time, or after n retries?) 
- else: - self._local_state['last-engine-retry'] = 0 - self._local_state['engine-retries'] = 0 - - def _engine_status_score(self, status): + def _get_engine_status_score(self, status): """ Convert a string engine/vm status to a sortable numeric score; the highest score is a live vm with a healthy engine. @@ -590,6 +626,76 @@ except KeyError: self._log.error("Invalid engine status: %s", status, exc_info=True) return 0 + + def _handle_entry(self): + """ + ENTRY state. Determine current vm state and switch appropriately. + """ + local_host_id = self._local_state['host-id'] + if self._all_host_stats[local_host_id]['engine-status'][:5] == 'vm-up': + return self.States.ON, False + else: + return self.States.OFF, False + + def _handle_off(self): + """ + OFF state. Check if any conditions warrant starting the vm, and + check if it was started externally. + """ + local_host_id = self._local_state['host-id'] + + if self._cur_stats['best-engine-status'][:5] == 'vm-up': + # FIXME timeout for bad-host-status: if up and no engine, try to + # migrate; if can't migrate, reduce local score and shut down + engine_host_id = self._cur_stats['best-engine-status-host-id'] + if engine_host_id == local_host_id: + self._log.info("Engine vm unexpectedly running locally," + " monitoring vm") + # FIXME maintenance bit should prevent this transition; in + # fact, it should trigger STOP and then IDLE or similar + return self.States.ON, False + else: + self._log.info( + "Engine vm is running on host %s (id %d)", + self._all_host_stats[engine_host_id]['hostname'], + engine_host_id, + extra=self._get_lf_args(self.LF_ENGINE_HEALTH) + ) + return self.States.OFF, True + + # FIXME remote db down, other statuses + + # FIXME cluster-wide engine maintenance bit + + if self._cur_stats['best-score-host-id'] != local_host_id: + self._log.info("Engine down, local host does not have best score", + extra=self._get_lf_args(self.LF_ENGINE_HEALTH)) + return self.States.OFF, True + + self._log.error("Engine down and local host has best score (%d)," + " attempting to start engine VM", + self._cur_stats['best-score'], + extra=self._get_lf_args(self.LF_ENGINE_HEALTH)) + return self.States.START, False + + def _handle_start(self): + """ + START state. Power on VM. + """ + try: + self._start_engine_vm() + except Exception as e: + self._log.error("Failed to start engine VM: %s", str(e)) + self._local_state['last-engine-retry'] = time.time() + self._local_state['engine-retries'] += 1 + # TODO mail for error (each time, or after n retries?) + # OFF handler will retry based on host score + return self.States.OFF, True + else: + self._local_state['last-engine-retry'] = 0 + self._local_state['engine-retries'] = 0 + return self.States.ON, True + # TODO should we stay in START until success/timeout? def _start_engine_vm(self): self._log.info("Starting vm using `%s --vm-start`", @@ -614,3 +720,35 @@ self._log.error("Engine VM started on localhost") # FIXME record start time in order to track bad-health-status timeout + return + + def _handle_on(self): + """ + ON state. See if the VM was stopped or needs to be stopped. 
+        """
+        local_host_id = self._local_state['host-id']
+        if self._cur_stats['best-engine-status'][:5] != 'vm-up':
+            self._log.error("Engine vm died unexpectedly")
+            return self.States.OFF, False
+        elif self._cur_stats['best-engine-status-host-id'] != local_host_id:
+            self._log.error("Engine vm unexpectedly running on other host")
+            return self.States.OFF, True
+
+        # FIXME migration if other hosts are found to be significantly better
+        # TODO check for health, not just liveness
+        self._log.info("Engine vm running on localhost")
+        return self.States.ON, True
+
+    def _handle_stop(self):
+        """
+        STOP state. Shut down the locally-running vm.
+        """
+        # FIXME currently unused
+        return self.States.STOP, True
+
+    def _handle_migrate(self):
+        """
+        MIGRATE state. Move the VM to the destination host.
+        """
+        # FIXME currently unused
+        return self.States.MIGRATE, True

--
To view, visit http://gerrit.ovirt.org/18051
To unsubscribe, visit http://gerrit.ovirt.org/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Iad9906d50a5fffb4c9f362b43f0064bf6be4dbde
Gerrit-PatchSet: 1
Gerrit-Project: ovirt-hosted-engine-ha
Gerrit-Branch: master
Gerrit-Owner: Greg Padgett <[email protected]>
_______________________________________________
Engine-patches mailing list
[email protected]
http://lists.ovirt.org/mailman/listinfo/engine-patches
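A minimal standalone sketch of the dispatch pattern this change introduces: each state maps to a handler that returns the next state plus a yield flag telling the agent to sleep before processing further states. The state names and the (next_state, yield_) handler contract mirror the diff; EngineStateMachine, run_cycle(), its parameters, and the stub handler bodies are illustrative assumptions, not the actual agent code.

# Sketch of the state-machine dispatch used in the patch; handler bodies
# and the run_cycle() driver are illustrative only.


class EngineStateMachine(object):
    class States(object):
        ENTRY = 'ENTRY'
        OFF = 'OFF'
        START = 'START'
        ON = 'ON'

    def __init__(self):
        self._current_state = self.States.ENTRY
        self._actions = {
            self.States.ENTRY: self._handle_entry,
            self.States.OFF: self._handle_off,
            self.States.START: self._handle_start,
            self.States.ON: self._handle_on,
        }

    def run_cycle(self, vm_is_up, local_is_best):
        """Process states until a handler asks to yield (i.e. sleep)."""
        self._vm_is_up = vm_is_up
        self._local_is_best = local_is_best
        state = self._current_state
        yield_ = False
        while not yield_:
            state, yield_ = self._actions[state]()
        self._current_state = state
        return state

    def _handle_entry(self):
        # Pick the starting point from the observed vm status.
        if self._vm_is_up:
            return self.States.ON, False
        return self.States.OFF, False

    def _handle_off(self):
        # Move toward starting the vm only if this host is the best candidate.
        if self._local_is_best:
            return self.States.START, False
        return self.States.OFF, True

    def _handle_start(self):
        # A real implementation would attempt to power on the vm here.
        return self.States.ON, True

    def _handle_on(self):
        # Keep monitoring the running vm until the next cycle.
        return self.States.ON, True


if __name__ == '__main__':
    fsm = EngineStateMachine()
    print(fsm.run_cycle(vm_is_up=False, local_is_best=True))  # -> 'ON'

Keeping the per-state logic in small handlers that only return the next state makes each transition testable in isolation, which is the readability benefit the commit message refers to.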
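The host-liveness bookkeeping the patch adjusts (first-update, last-update-host-ts, last-update-local-ts, and the 'unknown' alive state) follows a common stale-data pattern: remember the local time of each host's last metadata change and declare the host dead once no change is seen within a timeout. A compact standalone sketch, assuming field names as in the diff; update_host_liveness() and the HOST_ALIVE_TIMEOUT_SECS value are illustrative, not taken from the source tree.

import time

HOST_ALIVE_TIMEOUT_SECS = 60  # illustrative value


def update_host_liveness(all_host_stats, host_id, host_ts, local_ts):
    """Record a metadata read for host_id and judge whether it is alive.

    host_ts is the timestamp the host wrote into its own metadata;
    local_ts is the local clock when we read it.  A host is only
    considered alive after we have seen its timestamp change at least
    once, and is declared dead when its timestamp has not changed
    within HOST_ALIVE_TIMEOUT_SECS of local time.
    """
    stats = all_host_stats.setdefault(host_id, {
        'first-update': True,
        'last-update-local-ts': local_ts,
        'last-update-host-ts': None,
        'alive': 'unknown',
    })

    if host_ts != stats['last-update-host-ts']:
        # The host wrote new metadata since our last read.
        if stats['last-update-host-ts']:
            stats['first-update'] = False
        stats['last-update-host-ts'] = host_ts
        stats['last-update-local-ts'] = local_ts

    if stats['last-update-local-ts'] + HOST_ALIVE_TIMEOUT_SECS <= local_ts:
        # Timed out: even before the first observed change, stale data
        # means the host is not updating and must be treated as dead.
        stats['alive'] = False
    elif not stats['first-update']:
        stats['alive'] = True
    return stats['alive']


if __name__ == '__main__':
    hosts = {}
    now = time.time()
    print(update_host_liveness(hosts, 1, host_ts=100, local_ts=now))        # unknown
    print(update_host_liveness(hosts, 1, host_ts=101, local_ts=now + 10))   # True
    print(update_host_liveness(hosts, 1, host_ts=101, local_ts=now + 120))  # False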
