Repository: incubator-hawq
Updated Branches:
  refs/heads/2.0.0.0-incubating 3b54677d9 -> 49ceffaff
HAWQ-901 Add retries to standby master start check

Project: http://git-wip-us.apache.org/repos/asf/incubator-hawq/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hawq/commit/49ceffaf
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hawq/tree/49ceffaf
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hawq/diff/49ceffaf

Branch: refs/heads/2.0.0.0-incubating
Commit: 49ceffaffcd742dc9caf1716a91a9cbf6bc1d51e
Parents: 3b54677
Author: rlei <r...@pivotal.io>
Authored: Mon Jul 11 10:22:29 2016 +0800
Committer: rlei <r...@pivotal.io>
Committed: Mon Jul 11 16:14:04 2016 +0800

----------------------------------------------------------------------
 tools/bin/hawq_ctl             |  2 +-
 tools/sbin/hawqstandbywatch.py | 22 ++++++++++++++++------
 2 files changed, 17 insertions(+), 7 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/49ceffaf/tools/bin/hawq_ctl
----------------------------------------------------------------------
diff --git a/tools/bin/hawq_ctl b/tools/bin/hawq_ctl
index 50070f6..211f599 100755
--- a/tools/bin/hawq_ctl
+++ b/tools/bin/hawq_ctl
@@ -638,7 +638,7 @@ class HawqStart:
         cmd = self._start_standby_cmd()
         check_return_code(remote_ssh(cmd, self.standby_host_name, self.user))
         cmd = "%s; %s/sbin/hawqstandbywatch.py %s debug" % (source_hawq_env, self.GPHOME, self.master_data_directory)
-        result = remote_ssh(cmd, self.standby_host_name, self.user)
+        result = remote_ssh_nowait(cmd, self.standby_host_name, self.user)
         return result
 
     def _check_standby_sync(self):

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/49ceffaf/tools/sbin/hawqstandbywatch.py
----------------------------------------------------------------------
diff --git a/tools/sbin/hawqstandbywatch.py b/tools/sbin/hawqstandbywatch.py
index 82cf699..ca7ad1d 100755
--- a/tools/sbin/hawqstandbywatch.py
+++ b/tools/sbin/hawqstandbywatch.py
@@ -102,7 +102,7 @@ class SyncmasterWatcher:
 
         self.handles = {}
         self.maxlines = 1000
-        self.timelimit = 5
+        self.timelimit = 3
         self.delay = 0.1
 
 
@@ -188,10 +188,20 @@ class SyncmasterWatcher:
                 break
 
         logger.info("checking if syncmaster is running")
-        pid = gp.getSyncmasterPID('localhost', self.datadir)
-        if not pid > 0:
-            logger.warning("syncmaster not running")
-            return 1
+        count = 0
+        counter = 20
+        while True:
+            pid = gp.getSyncmasterPID('localhost', self.datadir)
+            if not pid > 0:
+                if count >= counter:
+                    logger.error("Standby master start timeout")
+                    return 1
+                else:
+                    logger.warning("syncmaster not running, waiting...")
+            else:
+                break
+            count += 1
+            time.sleep(3)
 
         # syncmaster is running and there are no obvious errors in the log
         logger.info("syncmaster appears ok, pid %s" % pid)
@@ -219,7 +229,7 @@ if __name__ == '__main__':
 
     # watch syncmaster logs
     if len(sys.argv) > 2 and sys.argv[2] == 'debug':
-        print "Checking standby master status"
+        logger.info("Checking standby master status")
         watcher = SyncmasterWatcher( sys.argv[1] )
         rc = watcher.monitor_logs()
         watcher.close()