This patch is a proposed solution for issue #255. One frequent problem for people using the autotest server is job aborts caused by client state mismatches between reboots, mainly differences in the list of partitions before and after a reboot. In all cases so far, the mismatch has been due to simple things like:
* Presence/absence of cdrom drivers among different kernels * Difference among device naming. We did actually try to fix that in commit 78a07d5e, but not all clients may have blkid installed. So far I haven't seen any case where autotest is aborting the job in a legitimate context (ie, a context where we might have a problem in the job itself). So let's make the abort part optional, with defaults to False. Signed-off-by: Lucas Meneghel Rodrigues <l...@redhat.com> --- client/bin/job.py | 22 ++++++++++++++++------ client/bin/job_unittest.py | 39 ++++++++++++++++++++++++++++++++++----- global_config.ini | 2 ++ 3 files changed, 52 insertions(+), 11 deletions(-) diff --git a/client/bin/job.py b/client/bin/job.py index ab6e7b7..9d9ec9b 100644 --- a/client/bin/job.py +++ b/client/bin/job.py @@ -745,6 +745,10 @@ class base_client_job(base_job.base_job): @raise JobError: Raised if the current configuration does not match the pre-reboot configuration. """ + abort_on_mismatch = GLOBAL_CONFIG.get_config_value('CLIENT', + 'abort_on_mismatch', + type=bool, + default=False) # check to see if any partitions have changed partition_list = partition_lib.get_partition_list(self, exclude_swap=False) @@ -756,9 +760,12 @@ class base_client_job(base_job.base_job): description = ("mounted partitions are different after reboot " "(old entries: %s, new entries: %s)" % (old_entries, new_entries)) - self._record_reboot_failure(subdir, "reboot.verify_config", - description, running_id=running_id) - raise error.JobError("Reboot failed: %s" % description) + if abort_on_mismatch: + self._record_reboot_failure(subdir, "reboot.verify_config", + description, running_id=running_id) + raise error.JobError("Reboot failed: %s" % description) + else: + logging.warning(description) # check to see if any CPUs have changed cpu_count = utils.count_cpus() @@ -767,9 +774,12 @@ class base_client_job(base_job.base_job): description = ('Number of CPUs changed after reboot ' '(old count: %d, new count: %d)' % 
(old_count, cpu_count)) - self._record_reboot_failure(subdir, 'reboot.verify_config', - description, running_id=running_id) - raise error.JobError('Reboot failed: %s' % description) + if abort_on_mismatch: + self._record_reboot_failure(subdir, 'reboot.verify_config', + description, running_id=running_id) + raise error.JobError('Reboot failed: %s' % description) + else: + logging.warning(description) def end_reboot(self, subdir, kernel, patches, running_id=None): diff --git a/client/bin/job_unittest.py b/client/bin/job_unittest.py index 6549db3..e42af41 100755 --- a/client/bin/job_unittest.py +++ b/client/bin/job_unittest.py @@ -8,7 +8,8 @@ except ImportError: from autotest_lib.client.bin import job, boottool, config, sysinfo, harness from autotest_lib.client.bin import test, xen, kernel, utils -from autotest_lib.client.common_lib import packages, error, log +from autotest_lib.client.common_lib import packages, error, log, global_config +from autotest_lib.client.common_lib.global_config import global_config from autotest_lib.client.common_lib import logging_manager, logging_config from autotest_lib.client.common_lib import base_job_unittest from autotest_lib.client.common_lib.test_utils import mock, unittest @@ -574,16 +575,21 @@ class test_base_job(unittest.TestCase): self.god.check_playback() - def _setup_check_post_reboot(self, mount_info, cpu_count): + def _setup_check_post_reboot(self, mount_info, cpu_count, abort_value): # setup self.god.stub_function(job.partition_lib, "get_partition_list") self.god.stub_function(utils, "count_cpus") + self.god.stub_function(global_config, "get_config_value") part_list = [self.get_partition_mock("/dev/hda1"), self.get_partition_mock("/dev/hdb1")] mount_list = ["/mnt/hda1", "/mnt/hdb1"] # record + global_config.get_config_value.expect_call('CLIENT', + 'abort_on_mismatch', + default=False, + type=bool).and_return(abort_value) job.partition_lib.get_partition_list.expect_call( self.job, exclude_swap=False).and_return(part_list) 
for i in xrange(len(part_list)): @@ -599,7 +605,18 @@ class test_base_job(unittest.TestCase): mount_info = set([("/dev/hda1", "/mnt/hda1"), ("/dev/hdb1", "/mnt/hdb1")]) - self._setup_check_post_reboot(mount_info, 8) + self._setup_check_post_reboot(mount_info, 8, False) + + # playback + self.job._check_post_reboot("sub") + self.god.check_playback() + + + def test_check_post_reboot_mounts_warning(self): + self.construct_job(True) + + mount_info = set([("/dev/hda1", "/mnt/hda1")]) + self._setup_check_post_reboot(mount_info, 8, False) # playback self.job._check_post_reboot("sub") @@ -610,7 +627,7 @@ class test_base_job(unittest.TestCase): self.construct_job(True) mount_info = set([("/dev/hda1", "/mnt/hda1")]) - self._setup_check_post_reboot(mount_info, None) + self._setup_check_post_reboot(mount_info, None, True) self.god.stub_function(self.job, "_record_reboot_failure") self.job._record_reboot_failure.expect_call("sub", @@ -623,12 +640,24 @@ class test_base_job(unittest.TestCase): self.god.check_playback() + def test_check_post_reboot_cpu_warning(self): + self.construct_job(True) + + mount_info = set([("/dev/hda1", "/mnt/hda1"), + ("/dev/hdb1", "/mnt/hdb1")]) + self._setup_check_post_reboot(mount_info, 4, False) + + # playback + self.job._check_post_reboot("sub") + self.god.check_playback() + + def test_check_post_reboot_cpu_failure(self): self.construct_job(True) mount_info = set([("/dev/hda1", "/mnt/hda1"), ("/dev/hdb1", "/mnt/hdb1")]) - self._setup_check_post_reboot(mount_info, 4) + self._setup_check_post_reboot(mount_info, 4, True) self.god.stub_function(self.job, "_record_reboot_failure") self.job._record_reboot_failure.expect_call( diff --git a/global_config.ini b/global_config.ini index 8e75e42..f676ab8 100644 --- a/global_config.ini +++ b/global_config.ini @@ -64,6 +64,8 @@ drop_caches_between_iterations: False output_dir: # Log installed packages (recommended setting to True on server setups) log_installed_packages = False +# Abort on client state mismatches 
post reboot (!= list of devices or CPUs) +abort_on_mismatch = False [SERVER] # Hostname of the autotest RPC server. You should set a different hostname -- 1.7.9.3 _______________________________________________ Autotest mailing list Autotest@test.kernel.org http://test.kernel.org/cgi-bin/mailman/listinfo/autotest