Adds support to wait_down and wait_for_restart to watch for changed
boot_id values (using /proc/sys/kernel/random/boot_id). This avoids a
race condition where if a machine was able to successfully shutdown
and boot up again after you started a reboot and before we started
polling for the shutdown, it would just appear that the machine
failed to shutdown when told.
As a result the Host.wait_down method now treats the case where a
machine is NOT down but has a new boot_id as being "down", because
this implies that the machine did shut down (and subsequently came
back up). This does mean that you cannot assume that a successful
wait_down implies that the machine is now down, but that was never
true anyway.
Risk: High
Visibility: We can now reliably detect the restart of machines with
very fast restart times.
Signed-off-by: John Admanski <[email protected]>
--- autotest/client/common_lib/hosts/base_classes.py 2010-01-11
15:32:34.000000000 -0800
+++ autotest/client/common_lib/hosts/base_classes.py 2010-01-11
15:32:34.000000000 -0800
@@ -151,20 +151,36 @@
return processes
+ def get_boot_id(self, timeout=60):
+ """ Get a unique ID associated with the current boot.
+
+ Should return a string with the semantics such that two separate
+ calls to Host.get_boot_id() return the same string if the host did
+ not reboot between the two calls, and two different strings if it
+ has rebooted at least once between the two calls.
+
+ @param timeout The number of seconds to wait before timing out.
+
+ @return A string unique to this boot."""
+ return self.run('cat /proc/sys/kernel/random/boot_id',
+ timeout=timeout).stdout.strip()
+
+
def wait_up(self, timeout=None):
raise NotImplementedError('Wait up not implemented!')
- def wait_down(self, timeout=None, warning_timer=None):
+ def wait_down(self, timeout=None, warning_timer=None, old_boot_id=None):
raise NotImplementedError('Wait down not implemented!')
def wait_for_restart(self, timeout=DEFAULT_REBOOT_TIMEOUT,
- log_failure=True, **dargs):
+ log_failure=True, old_boot_id=None, **dargs):
""" Wait for the host to come back from a reboot. This is a generic
implementation based entirely on wait_up and wait_down. """
if not self.wait_down(timeout=self.WAIT_DOWN_REBOOT_TIMEOUT,
- warning_timer=self.WAIT_DOWN_REBOOT_WARNING):
+ warning_timer=self.WAIT_DOWN_REBOOT_WARNING,
+ old_boot_id=old_boot_id):
if log_failure:
self.record("ABORT", None, "reboot.verify", "shut down failed")
raise error.AutoservShutdownError("Host did not shut down")
--- autotest/server/autotest.py 2010-01-07 15:31:13.000000000 -0800
+++ autotest/server/autotest.py 2010-01-11 15:32:34.000000000 -0800
@@ -664,10 +664,10 @@
return stderr_redirector.last_line
- def _wait_for_reboot(self):
+ def _wait_for_reboot(self, old_boot_id):
logging.info("Client is rebooting")
logging.info("Waiting for client to halt")
- if not self.host.wait_down(HALT_TIME):
+ if not self.host.wait_down(HALT_TIME, old_boot_id=old_boot_id):
err = "%s failed to shutdown after %d"
err %= (self.host.hostname, HALT_TIME)
raise error.AutotestRunError(err)
@@ -709,6 +709,7 @@
section_timeout = start_time + timeout - time.time()
else:
section_timeout = None
+ boot_id = self.host.get_boot_id()
last = self.execute_section(section, section_timeout,
logger, client_disconnect_timeout)
if self.background:
@@ -719,7 +720,7 @@
return
elif self.is_client_job_rebooting(last):
try:
- self._wait_for_reboot()
+ self._wait_for_reboot(boot_id)
except error.AutotestRunError, e:
self.host.job.record("ABORT", None, "reboot", str(e))
self.host.job.record("END ABORT", None, None, str(e))
--- autotest/server/hosts/abstract_ssh.py 2010-01-11 14:05:15.000000000
-0800
+++ autotest/server/hosts/abstract_ssh.py 2010-01-11 15:32:34.000000000
-0800
@@ -345,8 +345,7 @@
"""
Check if the remote host is up.
- Returns:
- True if the remote host is up, False otherwise
+ @returns True if the remote host is up, False otherwise
"""
try:
self.ssh_ping()
@@ -363,12 +362,10 @@
In fact, it will wait until an ssh connection to the remote
host can be established, and getty is running.
- Args:
- timeout: time limit in seconds before returning even
- if the host is not up.
+ @param timeout time limit in seconds before returning even
+ if the host is not up.
- Returns:
- True if the host was found to be up, False otherwise
+ @returns True if the host was found to be up, False otherwise
"""
if timeout:
end_time = time.time() + timeout
@@ -385,21 +382,28 @@
return False
- def wait_down(self, timeout=None, warning_timer=None):
+ def wait_down(self, timeout=None, warning_timer=None, old_boot_id=None):
"""
Wait until the remote host is down or the timeout expires.
- In fact, it will wait until an ssh connection to the remote
- host fails.
-
- Args:
- timeout: time limit in seconds before returning even
- if the host is still up.
- warning_timer: time limit in seconds that will generate
- a warning if the host is not down yet.
+ If old_boot_id is provided, this will wait until either the machine
+ is unpingable or self.get_boot_id() returns a value different from
+ old_boot_id. If the boot_id value has changed then the function
+ returns true under the assumption that the machine has shut down
+ and has now already come back up.
+
+ If old_boot_id is None then until the machine becomes unreachable the
+ method assumes the machine has not yet shut down.
+
+ @param timeout Time limit in seconds before returning even
+ if the host is still up.
+ @param warning_timer Time limit in seconds that will generate
+ a warning if the host is not down yet.
+ @param old_boot_id A string containing the result of self.get_boot_id()
+ prior to the host being told to shut down. Can be None if this is
+ not available.
- Returns:
- True if the host was found to be down, False otherwise
+ @returns True if the host was found to be down, False otherwise
"""
current_time = time.time()
if timeout:
@@ -408,9 +412,25 @@
if warning_timer:
warn_time = current_time + warning_timer
+ if old_boot_id is not None:
+ logging.debug('Host %s pre-shutdown boot_id is %s',
+ self.hostname, old_boot_id)
+
while not timeout or current_time < end_time:
- if not self.is_up():
+ try:
+ new_boot_id = self.get_boot_id()
+ except error.AutoservSSHTimeout:
+ logging.debug('Host %s is now unreachable over ssh, is down',
+ self.hostname)
return True
+ else:
+ # if the machine is up but the boot_id value has changed from
+ # old boot id, then we can assume the machine has gone down
+ # and then already come back up
+ if old_boot_id is not None and old_boot_id != new_boot_id:
+ logging.debug('Host %s now has boot_id %s and so must '
+ 'have rebooted', self.hostname, new_boot_id)
+ return True
if warning_timer and current_time > warn_time:
self.record("WARN", None, "shutdown",
--- autotest/server/hosts/remote.py 2010-01-11 15:32:34.000000000 -0800
+++ autotest/server/hosts/remote.py 2010-01-11 15:32:34.000000000 -0800
@@ -129,6 +129,8 @@
def reboot():
self.record("GOOD", None, "reboot.start")
try:
+ current_boot_id = self.get_boot_id()
+
# sync before starting the reboot, so that a long sync during
# shutdown isn't timed out by wait_down's short timeout
if not fastsync:
@@ -150,7 +152,8 @@
"reboot command failed")
raise
if wait:
- self.wait_for_restart(timeout, **dargs)
+ self.wait_for_restart(timeout, old_boot_id=current_boot_id,
+ **dargs)
# if this is a full reboot-and-wait, run the reboot inside a group
if wait:
--- autotest/server/hosts/serial.py 2010-01-11 15:32:34.000000000 -0800
+++ autotest/server/hosts/serial.py 2010-01-11 15:32:34.000000000 -0800
@@ -128,6 +128,15 @@
wait_for_restart()
"""
conmux_command = "'~$%s'" % conmux_command
+
+ # if the machine is up, grab the old boot id, otherwise use a dummy
+ # string and NOT None to ensure that wait_down always returns True,
+ # even if the machine comes back up before it's called
+ try:
+ old_boot_id = self.get_boot_id()
+ except error.AutoservSSHTimeout:
+ old_boot_id = 'unknown boot_id prior to SerialHost.hardreset'
+
def reboot():
if not self.run_conmux(conmux_command):
self.record("ABORT", None, "reboot.start",
@@ -141,9 +150,12 @@
for attempt in xrange(num_attempts-1):
try:
self.wait_for_restart(timeout, log_failure=False,
+ old_boot_id=old_boot_id,
**wait_for_restart_kwargs)
except error.AutoservShutdownError:
logging.warning(warning_msg, attempt+1, num_attempts)
+ # re-send the hard reset command
+ self.run_conmux(conmux_command)
else:
break
else:
_______________________________________________
Autotest mailing list
[email protected]
http://test.kernel.org/cgi-bin/mailman/listinfo/autotest