This patch is a proposed solution for issue #255.

A frequent problem for people using the autotest server
is jobs aborting due to client state mismatches across
reboots, mainly differences in the list of partitions
before and after a reboot. In all cases so far, the mismatch
has been due to simple things like:

 * Presence/absence of cdrom drivers in different
   kernels
 * Differences in device naming. We did try to fix
   that in commit 78a07d5e, but not all clients
   may have blkid installed.

So far I haven't seen any case where autotest aborted
the job in a legitimate context (i.e., a context where we
might have a problem in the job itself). So let's make
the abort optional, defaulting to False.

Signed-off-by: Lucas Meneghel Rodrigues <l...@redhat.com>
---
 client/bin/job.py          |   22 ++++++++++++++++------
 client/bin/job_unittest.py |   39 ++++++++++++++++++++++++++++++++++-----
 global_config.ini          |    2 ++
 3 files changed, 52 insertions(+), 11 deletions(-)

diff --git a/client/bin/job.py b/client/bin/job.py
index ab6e7b7..9d9ec9b 100644
--- a/client/bin/job.py
+++ b/client/bin/job.py
@@ -745,6 +745,10 @@ class base_client_job(base_job.base_job):
         @raise JobError: Raised if the current configuration does not match the
             pre-reboot configuration.
         """
+        abort_on_mismatch = GLOBAL_CONFIG.get_config_value('CLIENT',
+                                                           'abort_on_mismatch',
+                                                           type=bool,
+                                                           default=False)
         # check to see if any partitions have changed
         partition_list = partition_lib.get_partition_list(self,
                                                           exclude_swap=False)
@@ -756,9 +760,12 @@ class base_client_job(base_job.base_job):
             description = ("mounted partitions are different after reboot "
                            "(old entries: %s, new entries: %s)" %
                            (old_entries, new_entries))
-            self._record_reboot_failure(subdir, "reboot.verify_config",
-                                        description, running_id=running_id)
-            raise error.JobError("Reboot failed: %s" % description)
+            if abort_on_mismatch:
+                self._record_reboot_failure(subdir, "reboot.verify_config",
+                                            description, running_id=running_id)
+                raise error.JobError("Reboot failed: %s" % description)
+            else:
+                logging.warning(description)
 
         # check to see if any CPUs have changed
         cpu_count = utils.count_cpus()
@@ -767,9 +774,12 @@ class base_client_job(base_job.base_job):
             description = ('Number of CPUs changed after reboot '
                            '(old count: %d, new count: %d)' %
                            (old_count, cpu_count))
-            self._record_reboot_failure(subdir, 'reboot.verify_config',
-                                        description, running_id=running_id)
-            raise error.JobError('Reboot failed: %s' % description)
+            if abort_on_mismatch:
+                self._record_reboot_failure(subdir, 'reboot.verify_config',
+                                            description, running_id=running_id)
+                raise error.JobError('Reboot failed: %s' % description)
+            else:
+                logging.warning(description)
 
 
     def end_reboot(self, subdir, kernel, patches, running_id=None):
diff --git a/client/bin/job_unittest.py b/client/bin/job_unittest.py
index 6549db3..e42af41 100755
--- a/client/bin/job_unittest.py
+++ b/client/bin/job_unittest.py
@@ -8,7 +8,8 @@ except ImportError:
 
 from autotest_lib.client.bin import job, boottool, config, sysinfo, harness
 from autotest_lib.client.bin import test, xen, kernel, utils
-from autotest_lib.client.common_lib import packages, error, log
+from autotest_lib.client.common_lib import packages, error, log, global_config
+from autotest_lib.client.common_lib.global_config import global_config
 from autotest_lib.client.common_lib import logging_manager, logging_config
 from autotest_lib.client.common_lib import base_job_unittest
 from autotest_lib.client.common_lib.test_utils import mock, unittest
@@ -574,16 +575,21 @@ class test_base_job(unittest.TestCase):
         self.god.check_playback()
 
 
-    def _setup_check_post_reboot(self, mount_info, cpu_count):
+    def _setup_check_post_reboot(self, mount_info, cpu_count, abort_value):
         # setup
         self.god.stub_function(job.partition_lib, "get_partition_list")
         self.god.stub_function(utils, "count_cpus")
+        self.god.stub_function(global_config, "get_config_value")
 
         part_list = [self.get_partition_mock("/dev/hda1"),
                      self.get_partition_mock("/dev/hdb1")]
         mount_list = ["/mnt/hda1", "/mnt/hdb1"]
 
         # record
+        global_config.get_config_value.expect_call('CLIENT',
+                                                   'abort_on_mismatch',
+                                                   default=False,
+                                                   type=bool).and_return(abort_value)
         job.partition_lib.get_partition_list.expect_call(
                 self.job, exclude_swap=False).and_return(part_list)
         for i in xrange(len(part_list)):
@@ -599,7 +605,18 @@ class test_base_job(unittest.TestCase):
 
         mount_info = set([("/dev/hda1", "/mnt/hda1"),
                           ("/dev/hdb1", "/mnt/hdb1")])
-        self._setup_check_post_reboot(mount_info, 8)
+        self._setup_check_post_reboot(mount_info, 8, False)
+
+        # playback
+        self.job._check_post_reboot("sub")
+        self.god.check_playback()
+
+
+    def test_check_post_reboot_mounts_warning(self):
+        self.construct_job(True)
+
+        mount_info = set([("/dev/hda1", "/mnt/hda1")])
+        self._setup_check_post_reboot(mount_info, 8, False)
 
         # playback
         self.job._check_post_reboot("sub")
@@ -610,7 +627,7 @@ class test_base_job(unittest.TestCase):
         self.construct_job(True)
 
         mount_info = set([("/dev/hda1", "/mnt/hda1")])
-        self._setup_check_post_reboot(mount_info, None)
+        self._setup_check_post_reboot(mount_info, None, True)
 
         self.god.stub_function(self.job, "_record_reboot_failure")
         self.job._record_reboot_failure.expect_call("sub",
@@ -623,12 +640,24 @@ class test_base_job(unittest.TestCase):
         self.god.check_playback()
 
 
+    def test_check_post_reboot_cpu_warning(self):
+        self.construct_job(True)
+
+        mount_info = set([("/dev/hda1", "/mnt/hda1"),
+                          ("/dev/hdb1", "/mnt/hdb1")])
+        self._setup_check_post_reboot(mount_info, 4, False)
+
+        # playback
+        self.job._check_post_reboot("sub")
+        self.god.check_playback()
+
+
     def test_check_post_reboot_cpu_failure(self):
         self.construct_job(True)
 
         mount_info = set([("/dev/hda1", "/mnt/hda1"),
                           ("/dev/hdb1", "/mnt/hdb1")])
-        self._setup_check_post_reboot(mount_info, 4)
+        self._setup_check_post_reboot(mount_info, 4, True)
 
         self.god.stub_function(self.job, "_record_reboot_failure")
         self.job._record_reboot_failure.expect_call(
diff --git a/global_config.ini b/global_config.ini
index 8e75e42..f676ab8 100644
--- a/global_config.ini
+++ b/global_config.ini
@@ -64,6 +64,8 @@ drop_caches_between_iterations: False
 output_dir:
 # Log installed packages (recommended setting to True on server setups)
 log_installed_packages = False
+# Abort on client state mismatches post reboot (!= list of devices or CPUs)
+abort_on_mismatch = False
 
 [SERVER]
 # Hostname of the autotest RPC server. You should set a different hostname
-- 
1.7.9.3

_______________________________________________
Autotest mailing list
Autotest@test.kernel.org
http://test.kernel.org/cgi-bin/mailman/listinfo/autotest

Reply via email to