This is an automated email from the ASF dual-hosted git repository.
potiuk pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow-steward.git
The following commit(s) were added to refs/heads/main by this push:
new a316f73 feat(tools/skill-evals): add --fail-fast flag to stop on
first failure (#408)
a316f73 is described below
commit a316f7380362894a32576707e108e5d11b5c37c6
Author: Aayush Rajput <[email protected]>
AuthorDate: Sun May 31 17:42:15 2026 +0530
feat(tools/skill-evals): add --fail-fast flag to stop on first failure
(#408)
Why: Large skill-evals suites can take minutes to surface a single failure
because the harness always runs the full suite. The new --fail-fast flag (a
convention already offered by pytest and prek) stops execution after the first
FAIL or ERROR in --cli mode, matching user expectations and reducing feedback
latency.
Fixes #376
---
tools/skill-evals/src/skill_evals/runner.py | 11 +++++++
tools/skill-evals/tests/test_runner.py | 45 +++++++++++++++++++++++++++++
2 files changed, 56 insertions(+)
diff --git a/tools/skill-evals/src/skill_evals/runner.py
b/tools/skill-evals/src/skill_evals/runner.py
index 42ed0eb..ed80837 100644
--- a/tools/skill-evals/src/skill_evals/runner.py
+++ b/tools/skill-evals/src/skill_evals/runner.py
@@ -759,6 +759,14 @@ def main(argv: list[str] | None = None) -> int:
action="store_true",
help="In --cli mode, also print the prompts and the model's raw stdout
per case.",
)
+ parser.add_argument(
+ "--fail-fast",
+ action="store_true",
+ help=(
+ "Stops on the first failure instead of running all cases."
+ " Only applies in --cli mode; "
+ ),
+ )
parser.add_argument(
"--tag",
action="append",
@@ -796,6 +804,9 @@ def main(argv: list[str] | None = None) -> int:
passed = failed = manual = errored = 0
for case_dir, fixtures_dir in cases:
+ if (args.cli is not None) and args.fail_fast and (failed or errored):
+ print("Fail-fast enabled; stopping on first failure or error.")
+ break
if fixtures_dir not in _step_config_cache:
_step_config_cache[fixtures_dir] = load_step_config(fixtures_dir)
system_prompt, user_prompt_template = _step_config_cache[fixtures_dir]
diff --git a/tools/skill-evals/tests/test_runner.py
b/tools/skill-evals/tests/test_runner.py
index dd95a44..f29960e 100644
--- a/tools/skill-evals/tests/test_runner.py
+++ b/tools/skill-evals/tests/test_runner.py
@@ -751,6 +751,51 @@ def test_cli_mode_fail_with_wrong_json(tmp_path: Path,
capsys: pytest.CaptureFix
assert "FAIL" in stdout
assert "1 failed" in stdout
+def test_cli_mode_fail_with_wrong_jsons(tmp_path: Path, capsys:
pytest.CaptureFixture[str]):
+ """A CLI that returns the wrong JSONS should FAIL with multiple failures
and exit non-zero."""
+ fixtures_dir, _ = _make_cli_case(tmp_path, expected={"verdict": "ok"})
+ _make_case(fixtures_dir, "case-2", report="another report",
expected={"verdict": "ok"})
+ rc, stdout, _ = _run_main(
+ capsys,
+ ["--cli", 'echo \'{"verdict": "wrong"}\'', str(fixtures_dir)],
+ )
+ assert rc == 1
+ assert "FAIL" in stdout
+ assert "2 failed" in stdout # asserts that behaviour doesn't changes and
outputs exactly 2 failures instead of stopping at the first one, which is
tested in the next test case
+
+def test_cli_model_with_fail_fast_and_wrong_json(tmp_path: Path, capsys:
pytest.CaptureFixture[str]):
+ """With --fail-fast, the runner should stop at the first failure and not
run further cases."""
+ fixtures_dir, _ = _make_cli_case(tmp_path, expected={"verdict": "ok"})
+ # Add a second case that would FAIL if it ran, but should be skipped due
to fail-fast.
+ _make_case(fixtures_dir, "case-2", report="another report",
expected={"verdict": "ok"})
+ rc, stdout, _ = _run_main(
+ capsys,
+ ["--cli", 'echo \'{"verdict": "wrong"}\'', "--fail-fast",
str(fixtures_dir)],
+ )
+ assert rc == 1
+ assert "FAIL" in stdout
+ assert "1 failed" in stdout
+ assert "CASE: case-2" not in stdout # second case should not run at all
+
+def test_cli_model_with_fail_fast_and_error_json(tmp_path: Path, capsys:
pytest.CaptureFixture[str]):
+ """In timeout being negative raise internal error when run_cli is called
and --fail-fast is used."""
+ fixtures_dir, _ = _make_cli_case(tmp_path, expected={"verdict": "wrong"})
+ _make_case(fixtures_dir, "case-2", report="another report",
expected={"verdict": "ok"})
+ rc, stdout, _ = _run_main(
+ capsys,
+ [
+ "--cli",
+ 'echo \'{"verdict": "wrong"}\'',
+ '--timeout',"-1", # force an error (timeout) instead of a fail,
to check that fail-fast also applies to errors
+ "--exact",
+ "--fail-fast",
+ str(fixtures_dir),
+ ],
+ )
+ assert rc == 1
+ assert "ERROR" in stdout
+ assert "1 errored" in stdout
+ assert "CASE: case-2" not in stdout # second case should not run at all
def test_cli_mode_manual_skips_structural(tmp_path: Path, capsys:
pytest.CaptureFixture[str]):
"""Structural expected.json (has_* / mention_*) is reported MANUAL, not
auto-compared."""