From: Yann Argotti <[email protected]> Date: Tue, 24 Nov 2015 12:16:34 +0100
This adds a policy which advises when user should reboot system to avoid noisy test results due to system becoming unstable, for instance, and therefore continues testing successfully. To do this, a new Dmesg class is proposed which is not filtering dmesg and monitors whether or not one of the following event occurs: - gpu reset failed (not just gpu reset happened, that happens way too often and many tests even provoke hangs intentionally) - gpu crash, - Oops: - BUG - lockdep splat that causes the locking validator to get disabled If one of these issues happen, piglit test execution is stopped -terminating test thread pool- and exit with code 3 to inform that reboot is advised. Then test execution resume, after rebooting system or not, is done like usually with command line parameter "resume". To call it, use command line parameter: --dmesg monitored (this is original patch for review and therefore didn't apply any proposed change yet) Signed-off-by: Yann Argotti <[email protected]> --- framework/dmesg.py | 118 +++++++++++++++++++++++++++++++++++++++++++++- framework/exceptions.py | 6 ++- framework/profile.py | 35 ++++++++++++-- framework/programs/run.py | 45 ++++++++++++++---- 4 files changed, 185 insertions(+), 19 deletions(-) diff --git a/framework/dmesg.py b/framework/dmesg.py index daeeeae..66c95f5 100644 --- a/framework/dmesg.py +++ b/framework/dmesg.py @@ -46,6 +46,7 @@ import gzip __all__ = [ 'BaseDmesg', 'LinuxDmesg', + 'LinuxMonitoredDmesg', 'DummyDmesg', 'get_dmesg', ] @@ -212,6 +213,116 @@ class LinuxDmesg(BaseDmesg): # Attempt to store the last element of dmesg, unless there was no dmesg self._last_message = dmesg[-1] if dmesg else None +class LinuxMonitoredDmesg(BaseDmesg): + """ Read and monitor dmesg on posix systems + + This reads the dmesg ring buffer, stores the contents of the buffer, and + then compares it whenever called, returning the difference between the + calls. It requires timestamps to be enabled. + + """ + # We are considering all levels of dmesg to avoid missing + # any important ones and with an inaccurate reporting level + DMESG_COMMAND = ['dmesg'] + # With _first_dmesg_process, we are not checking dmesg at test execution + # starting time: user must know current machine/GPU state before + # starting piglit: we can resume test execution based on last stopping + # if we consider that our system is still in good shape (ie not blocking + # run forever) + _first_dmesg_process = True + _reboot_needed = False + + def __init__(self): + """ Create a dmesg instance """ + # _last_message store the contents of the last entry in dmesg the last + # time it was read. Because dmesg is a ringbuffer old entries can fall + # off the end if it becomes too full. To track new entries search for + # this entry and take any entries that were added after it. This works + # on Linux because each entry is guaranteed unique by timestamps. + self._last_message = None + super(LinuxMonitoredDmesg, self).__init__() + self._reboot_needed = False + + if not self._last_message: + # We need to ensure that there is something in dmesg to check + # timestamps. + warnings.warn("Cannot check dmesg for timestamp support. If you " + "do not have timestamps enabled in your kernel you " + "get incomplete dmesg captures", RuntimeWarning) + elif not re.match(r'\[\s*\d+\.\d+\]', self._last_message): + # Do an initial check to ensure that dmesg has timestamps, we need + # timestamps to work correctly. A proper linux dmesg timestamp + # looks like this: [ 0.00000] + raise DmesgError( + "Your kernel does not seem to support timestamps, which " + "are required by the --dmesg option") + + def reboot_is_needed(self): + """ Simply return the reboot_needed variable state """ + return self._reboot_needed + + def update_dmesg(self): + """ Call dmesg using subprocess.check_output + + Get the contents of dmesg, then calculate new messages, finally set + self.dmesg equal to the just read contents of dmesg. + + """ + dmesg = subprocess.check_output(self.DMESG_COMMAND).strip().splitlines() + + # Find all new entries, do this by slicing the list of dmesg to only + # returns elements after the last element stored. If there are not + # matches a value error is raised, that means all of dmesg is new + l = 0 + for index, item in enumerate(reversed(dmesg)): + if item == self._last_message: + l = len(dmesg) - index # don't include the matched element + break + self._new_messages = dmesg[l:] + + if (not self._first_dmesg_process): + if (any("[drm] Failed to reset chip" in msg for msg in self._new_messages) or + any("[drm] GPU crash" in msg for msg in self._new_messages) or + any("Oops:" in msg for msg in self._new_messages) or + any("BUG:" in msg for msg in self._new_messages) or + any("turning off the locking correctness validator" in msg for msg in self._new_messages)): + self._reboot_needed = True + else: + self._first_dmesg_process = False + + # Attempt to store the last element of dmesg, unless there was no dmesg + self._last_message = dmesg[-1] if dmesg else None + + def update_result(self, result): + """ Takes a TestResult object and updates it with dmesg statuses + + There is no filtering/change of status in dmesg and return the + original result passed in. + + Arguments: + result -- A TestResult instance + + """ + + # Get a new snapshot of dmesg + self.update_dmesg() + + # if update_dmesg() found new entries replace the results of the test + # and subtests + if self._new_messages: + + if self.regex: + for line in self._new_messages: + if self.regex.search(line): + break + else: + return result + + # Add the dmesg values to the result + result.dmesg = "\n".join(self._new_messages) + + return result + class DummyDmesg(BaseDmesg): """ An dummy class for dmesg on non unix-like systems @@ -239,7 +350,7 @@ class DmesgError(EnvironmentError): pass -def get_dmesg(not_dummy=True): +def get_dmesg(not_dummy=True, monitored=False): """ Return a Dmesg type instance Normally this does a system check, and returns the type that proper for @@ -248,5 +359,8 @@ def get_dmesg(not_dummy=True): """ if sys.platform.startswith('linux') and not_dummy: - return LinuxDmesg() + if monitored: + return LinuxMonitoredDmesg() + else: + return LinuxDmesg() return DummyDmesg() diff --git a/framework/exceptions.py b/framework/exceptions.py index fdd1f1f..c6adfc3 100644 --- a/framework/exceptions.py +++ b/framework/exceptions.py @@ -46,8 +46,9 @@ def handler(func): @functools.wraps(func) def _inner(*args, **kwargs): - try: - func(*args, **kwargs) + returncode = 0 + try: + returncode = func(*args, **kwargs) except PiglitFatalError as e: print('Fatal Error: {}'.format(str(e)), file=sys.stderr) sys.exit(1) @@ -67,6 +68,7 @@ def handler(func): if _DEBUG: raise sys.exit(1) + return returncode return _inner diff --git a/framework/profile.py b/framework/profile.py index 32ed759..165d043 100644 --- a/framework/profile.py +++ b/framework/profile.py @@ -34,6 +34,8 @@ import importlib import contextlib import itertools +from functools import partial + from framework import grouptools, exceptions, options from framework.dmesg import get_dmesg from framework.log import LogManager @@ -165,15 +167,25 @@ class TestProfile(object): return self._dmesg @dmesg.setter - def dmesg(self, not_dummy): + def dmesg(self, vals): """ Set dmesg - Arguments: + Arguments: (contain in vals) + not_dummy -- if Truthy dmesg will try to get a PosixDmesg, if Falsy it will get a DummyDmesg + monitored -- if True, invoke also that dmesg are monitored to stop test + execution and advise to reboot (exit code 3) + + if vals is not iterable, it assume this is a signle value """ - self._dmesg = get_dmesg(not_dummy) + try: + not_dummy, monitored = vals + except TypeError, ValueError: + not_dummy = vals + monitored = False + self._dmesg = get_dmesg(not_dummy, monitored) def _prepare_test_list(self): """ Prepare tests for running @@ -253,16 +265,19 @@ class TestProfile(object): self._prepare_test_list() log = LogManager(logger, len(self.test_list)) - def test(pair): + def test(pair, this_pool=None): """Function to call test.execute from map""" name, test = pair with backend.write_test(name) as w: test.execute(name, log.get(), self.dmesg) w(test.result) + # in case current dmesg is supporting this property, check it + if self.dmesg.reboot_is_needed(): + this_pool.terminate() def run_threads(pool, testlist): """ Open a pool, close it, and join it """ - pool.imap(test, testlist, chunksize) + pool.imap(partial(test, this_pool=pool), testlist, chunksize) pool.close() pool.join() @@ -289,6 +304,16 @@ class TestProfile(object): self._post_run_hook() + # return code 3 is to say a reboot is required + try: + if self.dmesg.reboot_is_needed(): + return 3 + except AttributeError: + pass + + return 0 + + def filter_tests(self, function): """Filter out tests that return false from the supplied function diff --git a/framework/programs/run.py b/framework/programs/run.py index 328e0b0..f4f82dd 100644 --- a/framework/programs/run.py +++ b/framework/programs/run.py @@ -137,9 +137,17 @@ def _run_parser(input_): action="store_true", help="Run tests in valgrind's memcheck") parser.add_argument("--dmesg", - action="store_true", - help="Capture a difference in dmesg before and " - "after each test. Implies -1/--no-concurrency") + dest="dmesg", + action="store", + choices=['none', 'simple', 'monitored'], + default='none', + help="Set level of dmesg usage: " + "none={no dmesg use - default}, " + "simple={Capture a difference in dmesg before and " + "after each test. Implies -1/--no-concurrency}, " + "monitored={simple mode + monitor dmesg and stop " + "test execution when a reboot may be required " + "-exit code 3}") parser.add_argument("-s", "--sync", action="store_true", help="Sync results to disk after every test") @@ -223,7 +231,7 @@ def run(input_): # If dmesg is requested we must have serial run, this is becasue dmesg # isn't reliable with threaded run - if args.dmesg: + if args.dmesg != 'none': args.concurrency = "none" # build up the include filter based on test_list @@ -268,9 +276,16 @@ def run(input_): results.time_elapsed.start = time.time() # Set the dmesg type - if args.dmesg: - profile.dmesg = args.dmesg - profile.run(args.log_level, backend) + if args.dmesg != 'none': + profile.dmesg = (True, args.dmesg=='monitored') + + returncode = profile.run(args.log_level, backend) + # in case something went wrong (returncode is not 0) + if returncode != 0: + print('Aborting Piglit execution: code (', returncode,')') + if returncode == 3: + print('~> Advising to reboot\n') + return returncode results.time_elapsed.end = time.time() backend.finalize({'time_elapsed': results.time_elapsed}) @@ -278,6 +293,8 @@ def run(input_): print('Thank you for running Piglit!\n' 'Results have been written to ' + args.results_path) + return 0 + @exceptions.handler def resume(input_): @@ -328,13 +345,21 @@ def resume(input_): profile = framework.profile.merge_test_profiles(results.options['profile']) profile.results_dir = args.results_path - if options.OPTIONS.dmesg: - profile.dmesg = options.OPTIONS.dmesg + if options.OPTIONS.dmesg != 'none': + profile.dmesg = (True, options.OPTIONS.dmesg=='monitored') # This is resumed, don't bother with time since it wont be accurate anyway - profile.run(results.options['log_level'], backend) + returncode = profile.run(results.options['log_level'], backend) + # in case something went wrong (returncode is not 0) + if returncode != 0: + print('Aborting Piglit execution: code (', returncode,')') + if returncode == 3: + print('~> Advising to reboot\n') + return returncode backend.finalize() print("Thank you for running Piglit!\n" "Results have been written to {0}".format(args.results_path)) + + return 0 -- 1.9.1 _______________________________________________ Piglit mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/piglit
