Repository: incubator-hawq
Updated Branches:
  refs/heads/2.0.0.0-incubating 3b54677d9 -> 49ceffaff


HAWQ-901 Add retries to standby master start check


Project: http://git-wip-us.apache.org/repos/asf/incubator-hawq/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hawq/commit/49ceffaf
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hawq/tree/49ceffaf
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hawq/diff/49ceffaf

Branch: refs/heads/2.0.0.0-incubating
Commit: 49ceffaffcd742dc9caf1716a91a9cbf6bc1d51e
Parents: 3b54677
Author: rlei <r...@pivotal.io>
Authored: Mon Jul 11 10:22:29 2016 +0800
Committer: rlei <r...@pivotal.io>
Committed: Mon Jul 11 16:14:04 2016 +0800

----------------------------------------------------------------------
 tools/bin/hawq_ctl             |  2 +-
 tools/sbin/hawqstandbywatch.py | 22 ++++++++++++++++------
 2 files changed, 17 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/49ceffaf/tools/bin/hawq_ctl
----------------------------------------------------------------------
diff --git a/tools/bin/hawq_ctl b/tools/bin/hawq_ctl
index 50070f6..211f599 100755
--- a/tools/bin/hawq_ctl
+++ b/tools/bin/hawq_ctl
@@ -638,7 +638,7 @@ class HawqStart:
         cmd = self._start_standby_cmd()
         check_return_code(remote_ssh(cmd, self.standby_host_name, self.user))
         cmd = "%s; %s/sbin/hawqstandbywatch.py %s debug" % (source_hawq_env, self.GPHOME, self.master_data_directory)
-        result = remote_ssh(cmd, self.standby_host_name, self.user)
+        result = remote_ssh_nowait(cmd, self.standby_host_name, self.user)
         return result
 
     def _check_standby_sync(self):

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/49ceffaf/tools/sbin/hawqstandbywatch.py
----------------------------------------------------------------------
diff --git a/tools/sbin/hawqstandbywatch.py b/tools/sbin/hawqstandbywatch.py
index 82cf699..ca7ad1d 100755
--- a/tools/sbin/hawqstandbywatch.py
+++ b/tools/sbin/hawqstandbywatch.py
@@ -102,7 +102,7 @@ class SyncmasterWatcher:
 
         self.handles         = {}
         self.maxlines        = 1000
-        self.timelimit       = 5
+        self.timelimit       = 3
         self.delay           = 0.1
 
 
@@ -188,10 +188,20 @@ class SyncmasterWatcher:
                 break
 
         logger.info("checking if syncmaster is running")
-        pid = gp.getSyncmasterPID('localhost', self.datadir)
-        if not pid > 0:
-            logger.warning("syncmaster not running")
-            return 1
+        count = 0
+        counter = 20
+        while True:
+            pid = gp.getSyncmasterPID('localhost', self.datadir)
+            if not pid > 0:
+                if count >= counter:
+                    logger.error("Standby master start timeout")
+                    return 1
+                else:
+                    logger.warning("syncmaster not running, waiting...")
+            else:
+                break
+            count += 1
+            time.sleep(3)
 
         # syncmaster is running and there are no obvious errors in the log
         logger.info("syncmaster appears ok, pid %s" % pid)
@@ -219,7 +229,7 @@ if __name__ == '__main__':
 
     # watch syncmaster logs
     if len(sys.argv) > 2 and sys.argv[2] == 'debug':
-        print "Checking standby master status"
+        logger.info("Checking standby master status")
     watcher = SyncmasterWatcher( sys.argv[1] )
     rc = watcher.monitor_logs()
     watcher.close()

Reply via email to