[Autotest] [PATCH] Adds support to wait_down and wait_for_restart to watch for changed boot_id values

John Admanski Mon, 11 Jan 2010 15:35:29 -0800

Adds support to wait_down and wait_for_restart to watch for changed
boot_id values (using /proc/sys/kernel/random/boot_id). This avoids a
race condition: if a machine managed to shut down and boot up again
after a reboot was started but before we began polling for the
shutdown, it would simply appear that the machine had failed to shut
down when told.


As a result the Host.wait_down method now treats the case where a
machine is NOT down but has a new boot_id as being "down", because
this implies that the machine did shut down (and subsequently came
back up). This does mean that you cannot assume that a successful
wait_down implies that the machine is now down, but that was never
true anyway.

Risk: High
Visibility: We can now reliably detect the restart of machines with
    very fast restart times.

Signed-off-by: John Admanski <[email protected]>

--- autotest/client/common_lib/hosts/base_classes.py    2010-01-11 15:32:34.000000000 -0800
+++ autotest/client/common_lib/hosts/base_classes.py    2010-01-11 15:32:34.000000000 -0800
@@ -151,20 +151,36 @@
         return processes
 
 
+    def get_boot_id(self, timeout=60):
+        """ Get a unique ID associated with the current boot.
+
+        Should return a string with the semantics such that two separate
+        calls to Host.get_boot_id() return the same string if the host did
+        not reboot between the two calls, and two different strings if it
+        has rebooted at least once between the two calls.
+
+        @param timeout The number of seconds to wait before timing out.
+
+        @return A string unique to this boot."""
+        return self.run('cat /proc/sys/kernel/random/boot_id',
+                        timeout=timeout).stdout.strip()
+
+
     def wait_up(self, timeout=None):
         raise NotImplementedError('Wait up not implemented!')
 
 
-    def wait_down(self, timeout=None, warning_timer=None):
+    def wait_down(self, timeout=None, warning_timer=None, old_boot_id=None):
         raise NotImplementedError('Wait down not implemented!')
 
 
     def wait_for_restart(self, timeout=DEFAULT_REBOOT_TIMEOUT,
-                         log_failure=True, **dargs):
+                         log_failure=True, old_boot_id=None, **dargs):
         """ Wait for the host to come back from a reboot. This is a generic
         implementation based entirely on wait_up and wait_down. """
         if not self.wait_down(timeout=self.WAIT_DOWN_REBOOT_TIMEOUT,
-                              warning_timer=self.WAIT_DOWN_REBOOT_WARNING):
+                              warning_timer=self.WAIT_DOWN_REBOOT_WARNING,
+                              old_boot_id=old_boot_id):
             if log_failure:
                 self.record("ABORT", None, "reboot.verify", "shut down failed")
             raise error.AutoservShutdownError("Host did not shut down")
--- autotest/server/autotest.py 2010-01-07 15:31:13.000000000 -0800
+++ autotest/server/autotest.py 2010-01-11 15:32:34.000000000 -0800
@@ -664,10 +664,10 @@
             return stderr_redirector.last_line
 
 
-    def _wait_for_reboot(self):
+    def _wait_for_reboot(self, old_boot_id):
         logging.info("Client is rebooting")
         logging.info("Waiting for client to halt")
-        if not self.host.wait_down(HALT_TIME):
+        if not self.host.wait_down(HALT_TIME, old_boot_id=old_boot_id):
             err = "%s failed to shutdown after %d"
             err %= (self.host.hostname, HALT_TIME)
             raise error.AutotestRunError(err)
@@ -709,6 +709,7 @@
                     section_timeout = start_time + timeout - time.time()
                 else:
                     section_timeout = None
+                boot_id = self.host.get_boot_id()
                 last = self.execute_section(section, section_timeout,
                                             logger, client_disconnect_timeout)
                 if self.background:
@@ -719,7 +720,7 @@
                     return
                 elif self.is_client_job_rebooting(last):
                     try:
-                        self._wait_for_reboot()
+                        self._wait_for_reboot(boot_id)
                     except error.AutotestRunError, e:
                         self.host.job.record("ABORT", None, "reboot", str(e))
                         self.host.job.record("END ABORT", None, None, str(e))
--- autotest/server/hosts/abstract_ssh.py       2010-01-11 14:05:15.000000000 -0800
+++ autotest/server/hosts/abstract_ssh.py       2010-01-11 15:32:34.000000000 -0800
@@ -345,8 +345,7 @@
         """
         Check if the remote host is up.
 
-        Returns:
-                True if the remote host is up, False otherwise
+        @returns True if the remote host is up, False otherwise
         """
         try:
             self.ssh_ping()
@@ -363,12 +362,10 @@
         In fact, it will wait until an ssh connection to the remote
         host can be established, and getty is running.
 
-        Args:
-                timeout: time limit in seconds before returning even
-                        if the host is not up.
+        @param timeout time limit in seconds before returning even
+            if the host is not up.
 
-        Returns:
-                True if the host was found to be up, False otherwise
+        @returns True if the host was found to be up, False otherwise
         """
         if timeout:
             end_time = time.time() + timeout
@@ -385,21 +382,28 @@
         return False
 
 
-    def wait_down(self, timeout=None, warning_timer=None):
+    def wait_down(self, timeout=None, warning_timer=None, old_boot_id=None):
         """
         Wait until the remote host is down or the timeout expires.
 
-        In fact, it will wait until an ssh connection to the remote
-        host fails.
-
-        Args:
-            timeout: time limit in seconds before returning even
-                     if the host is still up.
-            warning_timer: time limit in seconds that will generate
-                     a warning if the host is not down yet.
+        If old_boot_id is provided, this will wait until either the machine
+        is unpingable or self.get_boot_id() returns a value different from
+        old_boot_id. If the boot_id value has changed then the function
+        returns true under the assumption that the machine has shut down
+        and has now already come back up.
+
+        If old_boot_id is None then until the machine becomes unreachable the
+        method assumes the machine has not yet shut down.
+
+        @param timeout Time limit in seconds before returning even
+            if the host is still up.
+        @param warning_timer Time limit in seconds that will generate
+            a warning if the host is not down yet.
+        @param old_boot_id A string containing the result of self.get_boot_id()
+            prior to the host being told to shut down. Can be None if this is
+            not available.
 
-        Returns:
-                True if the host was found to be down, False otherwise
+        @returns True if the host was found to be down, False otherwise
         """
         current_time = time.time()
         if timeout:
@@ -408,9 +412,25 @@
         if warning_timer:
             warn_time = current_time + warning_timer
 
+        if old_boot_id is not None:
+            logging.debug('Host %s pre-shutdown boot_id is %s',
+                          self.hostname, old_boot_id)
+
         while not timeout or current_time < end_time:
-            if not self.is_up():
+            try:
+                new_boot_id = self.get_boot_id()
+            except error.AutoservSSHTimeout:
+                logging.debug('Host %s is now unreachable over ssh, is down',
+                              self.hostname)
                 return True
+            else:
+                # if the machine is up but the boot_id value has changed from
+                # old boot id, then we can assume the machine has gone down
+                # and then already come back up
+                if old_boot_id is not None and old_boot_id != new_boot_id:
+                    logging.debug('Host %s now has boot_id %s and so must '
+                                  'have rebooted', self.hostname, new_boot_id)
+                    return True
 
             if warning_timer and current_time > warn_time:
                 self.record("WARN", None, "shutdown",
--- autotest/server/hosts/remote.py     2010-01-11 15:32:34.000000000 -0800
+++ autotest/server/hosts/remote.py     2010-01-11 15:32:34.000000000 -0800
@@ -129,6 +129,8 @@
         def reboot():
             self.record("GOOD", None, "reboot.start")
             try:
+                current_boot_id = self.get_boot_id()
+
                 # sync before starting the reboot, so that a long sync during
                 # shutdown isn't timed out by wait_down's short timeout
                 if not fastsync:
@@ -150,7 +152,8 @@
                               "reboot command failed")
                 raise
             if wait:
-                self.wait_for_restart(timeout, **dargs)
+                self.wait_for_restart(timeout, old_boot_id=current_boot_id,
+                                      **dargs)
 
         # if this is a full reboot-and-wait, run the reboot inside a group
         if wait:
--- autotest/server/hosts/serial.py     2010-01-11 15:32:34.000000000 -0800
+++ autotest/server/hosts/serial.py     2010-01-11 15:32:34.000000000 -0800
@@ -128,6 +128,15 @@
                 wait_for_restart()
         """
         conmux_command = "'~$%s'" % conmux_command
+
+        # if the machine is up, grab the old boot id, otherwise use a dummy
+        # string and NOT None to ensure that wait_down always returns True,
+        # even if the machine comes back up before it's called
+        try:
+            old_boot_id = self.get_boot_id()
+        except error.AutoservSSHTimeout:
+            old_boot_id = 'unknown boot_id prior to SerialHost.hardreset'
+
         def reboot():
             if not self.run_conmux(conmux_command):
                 self.record("ABORT", None, "reboot.start",
@@ -141,9 +150,12 @@
                 for attempt in xrange(num_attempts-1):
                     try:
                         self.wait_for_restart(timeout, log_failure=False,
+                                              old_boot_id=old_boot_id,
                                               **wait_for_restart_kwargs)
                     except error.AutoservShutdownError:
                         logging.warning(warning_msg, attempt+1, num_attempts)
+                        # re-send the hard reset command
+                        self.run_conmux(conmux_command)
                     else:
                         break
                 else:
_______________________________________________
Autotest mailing list
[email protected]
http://test.kernel.org/cgi-bin/mailman/listinfo/autotest

[Autotest] [PATCH] Adds support to wait_down and wait_for_restart to watch for changed boot_id values

Reply via email to