LGTM. Thanks for doing this.

On Mon, Jan 11, 2010 at 3:35 PM, John Admanski <[email protected]> wrote:
> Adds support to wait_down and wait_for_restart to watch for changed
> boot_id values (using /proc/sys/kernel/random/boot_id). This avoids a
> race condition where if a machine was able to successfully shutdown
> and boot up again after you started a reboot and before we started
> polling for the shutdown, it would just appear that the machine
> failed to shutdown when told.
>
> As a result the Host.wait_down method now treats the case where a
> machine is NOT down but has a new boot_id as being "down", because
> this implies that the machine did shut down (and subsequently came
> back up). This does mean that you cannot assume that a successful
> wait_down implies that the machine is now down, but that was never
> true anyway.
>
> Risk: High
> Visibility: We can now reliably detect the restart of machines with
>    very fast restart times.
>
> Signed-off-by: John Admanski <[email protected]>
>
> --- autotest/client/common_lib/hosts/base_classes.py    2010-01-11 
> 15:32:34.000000000 -0800
> +++ autotest/client/common_lib/hosts/base_classes.py    2010-01-11 
> 15:32:34.000000000 -0800
> @@ -151,20 +151,36 @@
>         return processes
>
>
> +    def get_boot_id(self, timeout=60):
> +        """ Get a unique ID associated with the current boot.
> +
> +        Should return a string with the semantics such that two separate
> +        calls to Host.get_boot_id() return the same string if the host did
> +        not reboot between the two calls, and two different strings if it
> +        has rebooted at least once between the two calls.
> +
> +       �...@param timeout The number of seconds to wait before timing out.
> +
> +       �...@return A string unique to this boot."""
> +        return self.run('cat /proc/sys/kernel/random/boot_id',
> +                        timeout=timeout).stdout.strip()
> +
> +
>     def wait_up(self, timeout=None):
>         raise NotImplementedError('Wait up not implemented!')
>
>
> -    def wait_down(self, timeout=None, warning_timer=None):
> +    def wait_down(self, timeout=None, warning_timer=None, old_boot_id=None):
>         raise NotImplementedError('Wait down not implemented!')
>
>
>     def wait_for_restart(self, timeout=DEFAULT_REBOOT_TIMEOUT,
> -                         log_failure=True, **dargs):
> +                         log_failure=True, old_boot_id=None, **dargs):
>         """ Wait for the host to come back from a reboot. This is a generic
>         implementation based entirely on wait_up and wait_down. """
>         if not self.wait_down(timeout=self.WAIT_DOWN_REBOOT_TIMEOUT,
> -                              warning_timer=self.WAIT_DOWN_REBOOT_WARNING):
> +                              warning_timer=self.WAIT_DOWN_REBOOT_WARNING,
> +                              old_boot_id=old_boot_id):
>             if log_failure:
>                 self.record("ABORT", None, "reboot.verify", "shut down 
> failed")
>             raise error.AutoservShutdownError("Host did not shut down")
> --- autotest/server/autotest.py 2010-01-07 15:31:13.000000000 -0800
> +++ autotest/server/autotest.py 2010-01-11 15:32:34.000000000 -0800
> @@ -664,10 +664,10 @@
>             return stderr_redirector.last_line
>
>
> -    def _wait_for_reboot(self):
> +    def _wait_for_reboot(self, old_boot_id):
>         logging.info("Client is rebooting")
>         logging.info("Waiting for client to halt")
> -        if not self.host.wait_down(HALT_TIME):
> +        if not self.host.wait_down(HALT_TIME, old_boot_id=old_boot_id):
>             err = "%s failed to shutdown after %d"
>             err %= (self.host.hostname, HALT_TIME)
>             raise error.AutotestRunError(err)
> @@ -709,6 +709,7 @@
>                     section_timeout = start_time + timeout - time.time()
>                 else:
>                     section_timeout = None
> +                boot_id = self.host.get_boot_id()
>                 last = self.execute_section(section, section_timeout,
>                                             logger, client_disconnect_timeout)
>                 if self.background:
> @@ -719,7 +720,7 @@
>                     return
>                 elif self.is_client_job_rebooting(last):
>                     try:
> -                        self._wait_for_reboot()
> +                        self._wait_for_reboot(boot_id)
>                     except error.AutotestRunError, e:
>                         self.host.job.record("ABORT", None, "reboot", str(e))
>                         self.host.job.record("END ABORT", None, None, str(e))
> --- autotest/server/hosts/abstract_ssh.py       2010-01-11 14:05:15.000000000 
> -0800
> +++ autotest/server/hosts/abstract_ssh.py       2010-01-11 15:32:34.000000000 
> -0800
> @@ -345,8 +345,7 @@
>         """
>         Check if the remote host is up.
>
> -        Returns:
> -                True if the remote host is up, False otherwise
> +       �...@returns True if the remote host is up, False otherwise
>         """
>         try:
>             self.ssh_ping()
> @@ -363,12 +362,10 @@
>         In fact, it will wait until an ssh connection to the remote
>         host can be established, and getty is running.
>
> -        Args:
> -                timeout: time limit in seconds before returning even
> -                        if the host is not up.
> +       �...@param timeout time limit in seconds before returning even
> +            if the host is not up.
>
> -        Returns:
> -                True if the host was found to be up, False otherwise
> +       �...@returns True if the host was found to be up, False otherwise
>         """
>         if timeout:
>             end_time = time.time() + timeout
> @@ -385,21 +382,28 @@
>         return False
>
>
> -    def wait_down(self, timeout=None, warning_timer=None):
> +    def wait_down(self, timeout=None, warning_timer=None, old_boot_id=None):
>         """
>         Wait until the remote host is down or the timeout expires.
>
> -        In fact, it will wait until an ssh connection to the remote
> -        host fails.
> -
> -        Args:
> -            timeout: time limit in seconds before returning even
> -                     if the host is still up.
> -            warning_timer: time limit in seconds that will generate
> -                     a warning if the host is not down yet.
> +        If old_boot_id is provided, this will wait until either the machine
> +        is unpingable or self.get_boot_id() returns a value different from
> +        old_boot_id. If the boot_id value has changed then the function
> +        returns true under the assumption that the machine has shut down
> +        and has now already come back up.
> +
> +        If old_boot_id is None then until the machine becomes unreachable the
> +        method assumes the machine has not yet shut down.
> +
> +       �...@param timeout Time limit in seconds before returning even
> +            if the host is still up.
> +       �...@param warning_timer Time limit in seconds that will generate
> +            a warning if the host is not down yet.
> +       �...@param old_boot_id A string containing the result of 
> self.get_boot_id()
> +            prior to the host being told to shut down. Can be None if this is
> +            not available.
>
> -        Returns:
> -                True if the host was found to be down, False otherwise
> +       �...@returns True if the host was found to be down, False otherwise
>         """
>         current_time = time.time()
>         if timeout:
> @@ -408,9 +412,25 @@
>         if warning_timer:
>             warn_time = current_time + warning_timer
>
> +        if old_boot_id is not None:
> +            logging.debug('Host %s pre-shutdown boot_id is %s',
> +                          self.hostname, old_boot_id)
> +
>         while not timeout or current_time < end_time:
> -            if not self.is_up():
> +            try:
> +                new_boot_id = self.get_boot_id()
> +            except error.AutoservSSHTimeout:
> +                logging.debug('Host %s is now unreachable over ssh, is down',
> +                              self.hostname)
>                 return True
> +            else:
> +                # if the machine is up but the boot_id value has changed from
> +                # old boot id, then we can assume the machine has gone down
> +                # and then already come back up
> +                if old_boot_id is not None and old_boot_id != new_boot_id:
> +                    logging.debug('Host %s now has boot_id %s and so must '
> +                                  'have rebooted', self.hostname, 
> new_boot_id)
> +                    return True
>
>             if warning_timer and current_time > warn_time:
>                 self.record("WARN", None, "shutdown",
> --- autotest/server/hosts/remote.py     2010-01-11 15:32:34.000000000 -0800
> +++ autotest/server/hosts/remote.py     2010-01-11 15:32:34.000000000 -0800
> @@ -129,6 +129,8 @@
>         def reboot():
>             self.record("GOOD", None, "reboot.start")
>             try:
> +                current_boot_id = self.get_boot_id()
> +
>                 # sync before starting the reboot, so that a long sync during
>                 # shutdown isn't timed out by wait_down's short timeout
>                 if not fastsync:
> @@ -150,7 +152,8 @@
>                               "reboot command failed")
>                 raise
>             if wait:
> -                self.wait_for_restart(timeout, **dargs)
> +                self.wait_for_restart(timeout, old_boot_id=current_boot_id,
> +                                      **dargs)
>
>         # if this is a full reboot-and-wait, run the reboot inside a group
>         if wait:
> --- autotest/server/hosts/serial.py     2010-01-11 15:32:34.000000000 -0800
> +++ autotest/server/hosts/serial.py     2010-01-11 15:32:34.000000000 -0800
> @@ -128,6 +128,15 @@
>                 wait_for_restart()
>         """
>         conmux_command = "'~$%s'" % conmux_command
> +
> +        # if the machine is up, grab the old boot id, otherwise use a dummy
> +        # string and NOT None to ensure that wait_down always returns True,
> +        # even if the machine comes back up before it's called
> +        try:
> +            old_boot_id = self.get_boot_id()
> +        except error.AutoservSSHTimeout:
> +            old_boot_id = 'unknown boot_id prior to SerialHost.hardreset'
> +
>         def reboot():
>             if not self.run_conmux(conmux_command):
>                 self.record("ABORT", None, "reboot.start",
> @@ -141,9 +150,12 @@
>                 for attempt in xrange(num_attempts-1):
>                     try:
>                         self.wait_for_restart(timeout, log_failure=False,
> +                                              old_boot_id=old_boot_id,
>                                               **wait_for_restart_kwargs)
>                     except error.AutoservShutdownError:
>                         logging.warning(warning_msg, attempt+1, num_attempts)
> +                        # re-send the hard reset command
> +                        self.run_conmux(conmux_command)
>                     else:
>                         break
>                 else:
> _______________________________________________
> Autotest mailing list
> [email protected]
> http://test.kernel.org/cgi-bin/mailman/listinfo/autotest
>
_______________________________________________
Autotest mailing list
[email protected]
http://test.kernel.org/cgi-bin/mailman/listinfo/autotest

Reply via email to