LGTM. Thanks for doing this.
On Mon, Jan 11, 2010 at 3:35 PM, John Admanski <[email protected]> wrote: > Adds support to wait_down and wait_for_restart to watch for changed > boot_id values (using /proc/sys/kernel/random/boot_id). This avoids a > race condition where if a machine was able to successfully shutdown > and boot up again after you started a reboot and before we started > polling for the shutdown, it would just appear that the machine > failed to shutdown when told. > > As a result the Host.wait_down method now treats the case where a > machine is NOT down but has a new boot_id as being "down", because > this implies that the machine did shut down (and subsequently came > back up). This does mean that you cannot assume that a successful > wait_down implies that the machine is now down, but that was never > true anyway. > > Risk: High > Visibility: We can now reliably detect the restart of machines with > very fast restart times. > > Signed-off-by: John Admanski <[email protected]> > > --- autotest/client/common_lib/hosts/base_classes.py 2010-01-11 > 15:32:34.000000000 -0800 > +++ autotest/client/common_lib/hosts/base_classes.py 2010-01-11 > 15:32:34.000000000 -0800 > @@ -151,20 +151,36 @@ > return processes > > > + def get_boot_id(self, timeout=60): > + """ Get a unique ID associated with the current boot. > + > + Should return a string with the semantics such that two separate > + calls to Host.get_boot_id() return the same string if the host did > + not reboot between the two calls, and two different strings if it > + has rebooted at least once between the two calls. > + > + �...@param timeout The number of seconds to wait before timing out. > + > + �...@return A string unique to this boot.""" > + return self.run('cat /proc/sys/kernel/random/boot_id', > + timeout=timeout).stdout.strip() > + > + > def wait_up(self, timeout=None): > raise NotImplementedError('Wait up not implemented!') > > > - def wait_down(self, timeout=None, warning_timer=None): > + def wait_down(self, timeout=None, warning_timer=None, old_boot_id=None): > raise NotImplementedError('Wait down not implemented!') > > > def wait_for_restart(self, timeout=DEFAULT_REBOOT_TIMEOUT, > - log_failure=True, **dargs): > + log_failure=True, old_boot_id=None, **dargs): > """ Wait for the host to come back from a reboot. This is a generic > implementation based entirely on wait_up and wait_down. """ > if not self.wait_down(timeout=self.WAIT_DOWN_REBOOT_TIMEOUT, > - warning_timer=self.WAIT_DOWN_REBOOT_WARNING): > + warning_timer=self.WAIT_DOWN_REBOOT_WARNING, > + old_boot_id=old_boot_id): > if log_failure: > self.record("ABORT", None, "reboot.verify", "shut down > failed") > raise error.AutoservShutdownError("Host did not shut down") > --- autotest/server/autotest.py 2010-01-07 15:31:13.000000000 -0800 > +++ autotest/server/autotest.py 2010-01-11 15:32:34.000000000 -0800 > @@ -664,10 +664,10 @@ > return stderr_redirector.last_line > > > - def _wait_for_reboot(self): > + def _wait_for_reboot(self, old_boot_id): > logging.info("Client is rebooting") > logging.info("Waiting for client to halt") > - if not self.host.wait_down(HALT_TIME): > + if not self.host.wait_down(HALT_TIME, old_boot_id=old_boot_id): > err = "%s failed to shutdown after %d" > err %= (self.host.hostname, HALT_TIME) > raise error.AutotestRunError(err) > @@ -709,6 +709,7 @@ > section_timeout = start_time + timeout - time.time() > else: > section_timeout = None > + boot_id = self.host.get_boot_id() > last = self.execute_section(section, section_timeout, > logger, client_disconnect_timeout) > if self.background: > @@ -719,7 +720,7 @@ > return > elif self.is_client_job_rebooting(last): > try: > - self._wait_for_reboot() > + self._wait_for_reboot(boot_id) > except error.AutotestRunError, e: > self.host.job.record("ABORT", None, "reboot", str(e)) > self.host.job.record("END ABORT", None, None, str(e)) > --- autotest/server/hosts/abstract_ssh.py 2010-01-11 14:05:15.000000000 > -0800 > +++ autotest/server/hosts/abstract_ssh.py 2010-01-11 15:32:34.000000000 > -0800 > @@ -345,8 +345,7 @@ > """ > Check if the remote host is up. > > - Returns: > - True if the remote host is up, False otherwise > + �...@returns True if the remote host is up, False otherwise > """ > try: > self.ssh_ping() > @@ -363,12 +362,10 @@ > In fact, it will wait until an ssh connection to the remote > host can be established, and getty is running. > > - Args: > - timeout: time limit in seconds before returning even > - if the host is not up. > + �...@param timeout time limit in seconds before returning even > + if the host is not up. > > - Returns: > - True if the host was found to be up, False otherwise > + �...@returns True if the host was found to be up, False otherwise > """ > if timeout: > end_time = time.time() + timeout > @@ -385,21 +382,28 @@ > return False > > > - def wait_down(self, timeout=None, warning_timer=None): > + def wait_down(self, timeout=None, warning_timer=None, old_boot_id=None): > """ > Wait until the remote host is down or the timeout expires. > > - In fact, it will wait until an ssh connection to the remote > - host fails. > - > - Args: > - timeout: time limit in seconds before returning even > - if the host is still up. > - warning_timer: time limit in seconds that will generate > - a warning if the host is not down yet. > + If old_boot_id is provided, this will wait until either the machine > + is unpingable or self.get_boot_id() returns a value different from > + old_boot_id. If the boot_id value has changed then the function > + returns true under the assumption that the machine has shut down > + and has now already come back up. > + > + If old_boot_id is None then until the machine becomes unreachable the > + method assumes the machine has not yet shut down. > + > + �...@param timeout Time limit in seconds before returning even > + if the host is still up. > + �...@param warning_timer Time limit in seconds that will generate > + a warning if the host is not down yet. > + �...@param old_boot_id A string containing the result of > self.get_boot_id() > + prior to the host being told to shut down. Can be None if this is > + not available. > > - Returns: > - True if the host was found to be down, False otherwise > + �...@returns True if the host was found to be down, False otherwise > """ > current_time = time.time() > if timeout: > @@ -408,9 +412,25 @@ > if warning_timer: > warn_time = current_time + warning_timer > > + if old_boot_id is not None: > + logging.debug('Host %s pre-shutdown boot_id is %s', > + self.hostname, old_boot_id) > + > while not timeout or current_time < end_time: > - if not self.is_up(): > + try: > + new_boot_id = self.get_boot_id() > + except error.AutoservSSHTimeout: > + logging.debug('Host %s is now unreachable over ssh, is down', > + self.hostname) > return True > + else: > + # if the machine is up but the boot_id value has changed from > + # old boot id, then we can assume the machine has gone down > + # and then already come back up > + if old_boot_id is not None and old_boot_id != new_boot_id: > + logging.debug('Host %s now has boot_id %s and so must ' > + 'have rebooted', self.hostname, > new_boot_id) > + return True > > if warning_timer and current_time > warn_time: > self.record("WARN", None, "shutdown", > --- autotest/server/hosts/remote.py 2010-01-11 15:32:34.000000000 -0800 > +++ autotest/server/hosts/remote.py 2010-01-11 15:32:34.000000000 -0800 > @@ -129,6 +129,8 @@ > def reboot(): > self.record("GOOD", None, "reboot.start") > try: > + current_boot_id = self.get_boot_id() > + > # sync before starting the reboot, so that a long sync during > # shutdown isn't timed out by wait_down's short timeout > if not fastsync: > @@ -150,7 +152,8 @@ > "reboot command failed") > raise > if wait: > - self.wait_for_restart(timeout, **dargs) > + self.wait_for_restart(timeout, old_boot_id=current_boot_id, > + **dargs) > > # if this is a full reboot-and-wait, run the reboot inside a group > if wait: > --- autotest/server/hosts/serial.py 2010-01-11 15:32:34.000000000 -0800 > +++ autotest/server/hosts/serial.py 2010-01-11 15:32:34.000000000 -0800 > @@ -128,6 +128,15 @@ > wait_for_restart() > """ > conmux_command = "'~$%s'" % conmux_command > + > + # if the machine is up, grab the old boot id, otherwise use a dummy > + # string and NOT None to ensure that wait_down always returns True, > + # even if the machine comes back up before it's called > + try: > + old_boot_id = self.get_boot_id() > + except error.AutoservSSHTimeout: > + old_boot_id = 'unknown boot_id prior to SerialHost.hardreset' > + > def reboot(): > if not self.run_conmux(conmux_command): > self.record("ABORT", None, "reboot.start", > @@ -141,9 +150,12 @@ > for attempt in xrange(num_attempts-1): > try: > self.wait_for_restart(timeout, log_failure=False, > + old_boot_id=old_boot_id, > **wait_for_restart_kwargs) > except error.AutoservShutdownError: > logging.warning(warning_msg, attempt+1, num_attempts) > + # re-send the hard reset command > + self.run_conmux(conmux_command) > else: > break > else: > _______________________________________________ > Autotest mailing list > [email protected] > http://test.kernel.org/cgi-bin/mailman/listinfo/autotest > _______________________________________________ Autotest mailing list [email protected] http://test.kernel.org/cgi-bin/mailman/listinfo/autotest
