Jiří Moskovčák has uploaded a new change for review. Change subject: try harder when initializing ......................................................................
try harder when initializing the storage initialization is async task, so sometimes we try to access some files before the file exists, in that case we should just wait a while and try again Change-Id: Iac39bedfa78479f71674c3f5e673ba3814f52279 Signed-off-by: Jiri Moskovcak <[email protected]> --- M ovirt_hosted_engine_ha/agent/constants.py.in M ovirt_hosted_engine_ha/agent/hosted_engine.py M ovirt_hosted_engine_ha/lib/exceptions.py 3 files changed, 67 insertions(+), 35 deletions(-) git pull ssh://gerrit.ovirt.org:29418/ovirt-hosted-engine-ha refs/changes/37/29937/1 diff --git a/ovirt_hosted_engine_ha/agent/constants.py.in b/ovirt_hosted_engine_ha/agent/constants.py.in index 869b833..df94c8e 100644 --- a/ovirt_hosted_engine_ha/agent/constants.py.in +++ b/ovirt_hosted_engine_ha/agent/constants.py.in @@ -88,3 +88,6 @@ # The length of history STATS_HISTORY_SECS = 15 * 60 + +WAIT_FOR_STORAGE_RETRY = 5 +WAIT_FOR_STORAGE_DELAY = 5 diff --git a/ovirt_hosted_engine_ha/agent/hosted_engine.py b/ovirt_hosted_engine_ha/agent/hosted_engine.py index 6078720..5b79950 100644 --- a/ovirt_hosted_engine_ha/agent/hosted_engine.py +++ b/ovirt_hosted_engine_ha/agent/hosted_engine.py @@ -295,10 +295,14 @@ error_count = 0 # make sure everything is initialized - self._initialize_broker() + # VDSM has to be initialized first, because it prepares the + # storage domain connection + # Broker then initializes the pieces needed for metadata and leases + # which are then used by sanlock self._initialize_vdsm() - self._initialize_sanlock() self._initialize_domain_monitor() + self._initialize_broker() + self._initialize_sanlock() for old_state, state, delay in self.fsm: if self._shutdown_requested_callback(): @@ -312,10 +316,10 @@ try: # make sure everything is still initialized - self._initialize_broker() self._initialize_vdsm() - self._initialize_sanlock() self._initialize_domain_monitor() + self._initialize_broker() + self._initialize_sanlock() # log state self._log.info("Current state %s (score: %d)", @@ -383,6 +387,7 @@ raise else: self._local_monitors[m['field']] = lm + self._log.info("Broker initialized, all submonitors started") def _initialize_vdsm(self): @@ -470,37 +475,53 @@ " is acquired (file: %s)", constants.LOCKSPACE_NAME, self.host_id, lease_file) - try: - sanlock.add_lockspace(constants.LOCKSPACE_NAME, - self.host_id, lease_file) - except sanlock.SanlockException as e: - acquired_lock = False - msg = None - if hasattr(e, 'errno'): - if e.errno == errno.EEXIST: - self._log.debug("Host already holds lock") - acquired_lock = True - elif e.errno == errno.EINVAL: - msg = ("cannot get lock on host id {0}:" - " host already holds lock on a different host id" - .format(self.host_id)) - elif e.errno == errno.EINTR: - msg = ("cannot get lock on host id {0}:" - " sanlock operation interrupted (will retry)" - .format(self.host_id)) - elif e.errno == errno.EINPROGRESS: - msg = ("cannot get lock on host id {0}:" - " sanlock operation in progress (will retry)" - .format(self.host_id)) - if not acquired_lock: - if not msg: - msg = ("cannot get lock on host id {0}: {1}" - .format(self.host_id, str(e))) - self._log.error(msg, exc_info=True) - raise Exception("Failed to initialize sanlock: {0}" - .format(msg)) - else: - self._log.info("Acquired lock on host id %d", self.host_id) + for attempt in range(constants.WAIT_FOR_STORAGE_RETRY): + try: + sanlock.add_lockspace(constants.LOCKSPACE_NAME, + self.host_id, lease_file) + except sanlock.SanlockException as e: + if hasattr(e, 'errno'): + if e.errno == errno.EEXIST: + self._log.debug("Host already holds lock") + break + elif e.errno == errno.EINVAL: + self._log.error( + "cannot get lock on host id {0}: " + "host already holds lock on a different" + " host id" + .format(self.host_id)) + raise # this shouldn't happen, so throw the exception + elif e.errno == errno.EINTR: + self._log.warn("cannot get lock on host id {0}:" + " sanlock operation interrupted" + " (will retry)" + .format(self.host_id)) + elif e.errno == errno.EINPROGRESS: + self._log.warn("cannot get lock on host id {0}:" + " sanlock operation in progress" + "(will retry)" + .format(self.host_id)) + elif e.errno == errno.ENOENT: + self._log.warn("cannot get lock on host id {0}:" + " the lock file '{1}' is missing" + "(will retry)" + .format(self.host_id, lease_file)) + else: # no exception, we acquired the lock + self._log.info("Acquired lock on host id %d", self.host_id) + break + + # some temporary problem has occurred (usually waiting for + # the storage), so wait a while and try again + self._log.info("Failed to acquire the lock. Waiting '{0}'s before" + " the next attempt". + format(constants.WAIT_FOR_STORAGE_DELAY)) + time.sleep(constants.WAIT_FOR_STORAGE_DELAY) + else: # happens only if all attempts are exhausted + raise ex.SanlockInitializationError( + "Failed to initialize sanlock, the number of errors has" + " exceeded the limit") + + # we get here only if the the lock is acquired self._sanlock_initialized = True def _initialize_domain_monitor(self): diff --git a/ovirt_hosted_engine_ha/lib/exceptions.py b/ovirt_hosted_engine_ha/lib/exceptions.py index 824b059..5eb6a83 100644 --- a/ovirt_hosted_engine_ha/lib/exceptions.py +++ b/ovirt_hosted_engine_ha/lib/exceptions.py @@ -45,3 +45,11 @@ class FatalMetadataError(Exception): pass + + +class SanlockInitializationError(Exception): + pass + + +class BrokerInitializationError(Exception): + pass -- To view, visit http://gerrit.ovirt.org/29937 To unsubscribe, visit http://gerrit.ovirt.org/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Iac39bedfa78479f71674c3f5e673ba3814f52279 Gerrit-PatchSet: 1 Gerrit-Project: ovirt-hosted-engine-ha Gerrit-Branch: ovirt-hosted-engine-ha-1.1 Gerrit-Owner: Jiří Moskovčák <[email protected]> _______________________________________________ Engine-patches mailing list [email protected] http://lists.ovirt.org/mailman/listinfo/engine-patches
