Module: Mesa
Branch: main
Commit: 201b0b6d2948eedfd1821e5173ecba09f0d453c6
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=201b0b6d2948eedfd1821e5173ecba09f0d453c6
Author: Guilherme Gallo <[email protected]>
Date:   Wed Apr 20 20:55:47 2022 -0300

ci/lava: Retry when data fetching log RPC call is corrupted

Rarely, the jobs.logs RPC call can return corrupted data, such as
malformed YAML. Since this is expected but very rare, let's retry the
RPC call several times to give it a chance to recover. Retrying does
not swallow log lines, because we keep track of how many log lines each
job has already delivered.

Signed-off-by: Guilherme Gallo <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15938>

---
 .gitlab-ci/lava/exceptions.py               |  6 ++++-
 .gitlab-ci/lava/lava_job_submitter.py       | 39 ++++++++++++++++++++++-------
 .gitlab-ci/tests/test_lava_job_submitter.py | 31 +++++++++++++++++++++--
 3 files changed, 64 insertions(+), 12 deletions(-)

diff --git a/.gitlab-ci/lava/exceptions.py b/.gitlab-ci/lava/exceptions.py
index 576924f2b7d..1c1db591897 100644
--- a/.gitlab-ci/lava/exceptions.py
+++ b/.gitlab-ci/lava/exceptions.py
@@ -14,4 +14,8 @@ class MesaCITimeoutError(MesaCIException):
 class MesaCIRetryError(MesaCIException):
     def __init__(self, *args, retry_count: int) -> None:
         super().__init__(*args)
-        self.retry_count = retry_count
\ No newline at end of file
+        self.retry_count = retry_count
+
+
+class MesaCIParseException(MesaCIException):
+    pass

diff --git a/.gitlab-ci/lava/lava_job_submitter.py b/.gitlab-ci/lava/lava_job_submitter.py
index cc5063b2727..fb4b04420c2 100755
--- a/.gitlab-ci/lava/lava_job_submitter.py
+++ b/.gitlab-ci/lava/lava_job_submitter.py
@@ -24,7 +24,9 @@
 
 """Send a job to LAVA, track it and collect log back"""
 
+
 import argparse
+import contextlib
 import pathlib
 import re
 import sys
@@ -38,7 +40,12 @@ from typing import Any, Optional
 
 import lavacli
 import yaml
-from lava.exceptions import MesaCIException, MesaCIRetryError, MesaCITimeoutError
+from lava.exceptions import (
+    MesaCIException,
+    MesaCIParseException,
+    MesaCIRetryError,
+    MesaCITimeoutError,
+)
 from lavacli.utils import loader
 
 # Timeout in seconds to decide if the device from the dispatched LAVA job has
@@ -264,20 +271,26 @@ class LAVAJob:
         )
         return job_state["job_state"] not in waiting_states
 
+    def _load_log_from_data(self, data) -> list[str]:
+        lines = []
+        # When there is no new log data, the YAML is empty
+        if loaded_lines := yaml.load(str(data), Loader=loader(False)):
+            lines = loaded_lines
+            # If we had non-empty log data, we can assure that the device is alive.
+            self.heartbeat()
+        self.last_log_line += len(lines)
+        return lines
+
     def get_logs(self) -> list[str]:
         try:
             (finished, data) = _call_proxy(
                 self.proxy.scheduler.jobs.logs, self.job_id, self.last_log_line
             )
-            lines = yaml.load(str(data), Loader=loader(False))
             self.is_finished = finished
-            if not lines:
-                return []
-            self.heartbeat()
-            self.last_log_line += len(lines)
-            return lines
+            return self._load_log_from_data(data)
+
         except Exception as mesa_ci_err:
-            raise MesaCIException(
+            raise MesaCIParseException(
                 f"Could not get LAVA job logs. Reason: {mesa_ci_err}"
             ) from mesa_ci_err
 
@@ -381,7 +394,15 @@ def fetch_logs(job, max_idle_time):
 
     time.sleep(LOG_POLLING_TIME_SEC)
 
-    new_lines = job.get_logs()
+    # The XMLRPC binary packet may be corrupted, causing a YAML scanner error.
+    # Retry the log fetching several times before exposing the error.
+    for _ in range(5):
+        with contextlib.suppress(MesaCIParseException):
+            new_lines = job.get_logs()
+            break
+    else:
+        raise MesaCIParseException
+
     parsed_lines = parse_lava_lines(new_lines)
 
     for line in parsed_lines:
diff --git a/.gitlab-ci/tests/test_lava_job_submitter.py b/.gitlab-ci/tests/test_lava_job_submitter.py
index d2f900a2794..49ad219d3e4 100644
--- a/.gitlab-ci/tests/test_lava_job_submitter.py
+++ b/.gitlab-ci/tests/test_lava_job_submitter.py
@@ -168,7 +168,9 @@ PROXY_SCENARIOS = {
         {},
     ),
     "timed out more times than retry attempts": (
-        generate_n_logs(n=4, tick_fn=DEVICE_HANGING_TIMEOUT_SEC + 1),
+        generate_n_logs(
+            n=NUMBER_OF_MAX_ATTEMPTS + 1, tick_fn=DEVICE_HANGING_TIMEOUT_SEC + 1
+        ),
         pytest.raises(MesaCIRetryError),
         False,
         {},
@@ -316,4 +318,29 @@ def test_hide_sensitive_data(input, expectation, tag):
 
 def test_get_job_results(mock_proxy):
     proxy = mock_proxy()
-    get_job_results(proxy, 1, "0_mesa")
\ No newline at end of file
+    get_job_results(proxy, 1, "0_mesa")
+
+
+CORRUPTED_LOG_SCENARIOS = {
+    "too much subsequent corrupted data": (
+        [(False, "{'msg': 'Incomplete}")] * 100 + [jobs_logs_response(True)],
+        pytest.raises((MesaCIRetryError)),
+    ),
+    "one subsequent corrupted data": (
+        [(False, "{'msg': 'Incomplete}")] * 2 + [jobs_logs_response(True)],
+        does_not_raise(),
+    ),
+}
+
+
[email protected](
+    "data_sequence, expected_exception",
+    CORRUPTED_LOG_SCENARIOS.values(),
+    ids=CORRUPTED_LOG_SCENARIOS.keys(),
+)
+def test_log_corruption(mock_sleep, data_sequence, expected_exception, mock_proxy):
+    proxy_mock = mock_proxy()
+    proxy_logs_mock = proxy_mock.scheduler.jobs.logs
+    proxy_logs_mock.side_effect = data_sequence
+    with expected_exception:
+        retriable_follow_job(proxy_mock, "")
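
For context, below is a minimal, self-contained sketch of the for/else +
contextlib.suppress retry idiom that fetch_logs adopts above. It is not part
of the commit; fetch_with_retries, ParseError and flaky_get_logs are
hypothetical names used only for illustration.

import contextlib


class ParseError(Exception):
    """Stand-in for MesaCIParseException."""


def fetch_with_retries(get_logs, attempts=5):
    # Suppress only the parse error on each attempt; any other exception
    # propagates immediately.
    for _ in range(attempts):
        with contextlib.suppress(ParseError):
            new_lines = get_logs()
            break
    else:
        # The loop never reached `break`, so every attempt hit corrupted data.
        raise ParseError(f"log data still corrupted after {attempts} attempts")
    return new_lines


if __name__ == "__main__":
    replies = iter([ParseError(), ParseError(), ["line 1", "line 2"]])

    def flaky_get_logs():
        reply = next(replies)
        if isinstance(reply, Exception):
            raise reply
        return reply

    print(fetch_with_retries(flaky_get_logs))  # ['line 1', 'line 2']

The else branch of a for loop runs only when the loop finishes without
hitting break, which is exactly the "all attempts failed" case.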

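The commit message's claim that retrying "does not swallow log lines" rests on
the offset bookkeeping in _load_log_from_data: last_log_line only advances
after a reply has been parsed, so a retried call re-requests the same window.
Below is a rough standalone sketch of that idea, with json standing in for the
YAML parsing and LogFollower/rpc_call as hypothetical names, not Mesa code.

import json


class ParseError(Exception):
    """Stand-in for MesaCIParseException."""


class LogFollower:
    """Advances its read offset only after a reply parses successfully."""

    def __init__(self, rpc_call):
        # rpc_call(offset) is a hypothetical stand-in for the
        # scheduler.jobs.logs XML-RPC call: it returns raw log data
        # starting at line `offset`.
        self.rpc_call = rpc_call
        self.offset = 0  # plays the role of LAVAJob.last_log_line

    def poll(self):
        raw = self.rpc_call(self.offset)
        try:
            lines = json.loads(raw)  # may fail on a corrupted reply
        except json.JSONDecodeError as err:
            # The offset is untouched, so the caller can simply retry
            # and will re-request the very same lines.
            raise ParseError(str(err)) from err
        self.offset += len(lines)  # advance only after a good parse
        return lines


if __name__ == "__main__":
    follower = LogFollower(lambda offset: '["a", "b"]' if offset == 0 else "[]")
    print(follower.poll())  # ['a', 'b']
    print(follower.offset)  # 2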