(airflow-steward) branch main updated: feat(tools/skill-evals): add --fail-fast flag to stop on first failure (#408)

potiuk Sun, 31 May 2026 05:13:16 -0700

This is an automated email from the ASF dual-hosted git repository.

potiuk pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow-steward.git



The following commit(s) were added to refs/heads/main by this push:
     new a316f73  feat(tools/skill-evals): add --fail-fast flag to stop on 
first failure (#408)
a316f73 is described below

commit a316f7380362894a32576707e108e5d11b5c37c6
Author: Aayush Rajput <[email protected]>
AuthorDate: Sun May 31 17:42:15 2026 +0530

    feat(tools/skill-evals): add --fail-fast flag to stop on first failure 
(#408)
    
    Why: Large skill-evals suites can take minutes to surface a single failure
    because the harness runs the full suite. The --fail-fast flag (already
    supported by pytest and prek) stops execution after the first FAIL or ERROR
    in --cli mode, matching user expectations and reducing feedback latency.
    
    Fixes #376
---
 tools/skill-evals/src/skill_evals/runner.py | 11 +++++++
 tools/skill-evals/tests/test_runner.py      | 45 +++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+)

diff --git a/tools/skill-evals/src/skill_evals/runner.py 
b/tools/skill-evals/src/skill_evals/runner.py
index 42ed0eb..ed80837 100644
--- a/tools/skill-evals/src/skill_evals/runner.py
+++ b/tools/skill-evals/src/skill_evals/runner.py
@@ -759,6 +759,14 @@ def main(argv: list[str] | None = None) -> int:
         action="store_true",
         help="In --cli mode, also print the prompts and the model's raw stdout 
per case.",
     )
+    parser.add_argument(
+        "--fail-fast",
+        action="store_true",
+        help=(
+            "Stops on the first failure instead of running all cases."
+            " Only applies in --cli mode; "
+        ),
+    )
     parser.add_argument(
         "--tag",
         action="append",
@@ -796,6 +804,9 @@ def main(argv: list[str] | None = None) -> int:
     passed = failed = manual = errored = 0
 
     for case_dir, fixtures_dir in cases:
+        if (args.cli is not None) and args.fail_fast and (failed or errored):
+            print("Fail-fast enabled; stopping on first failure or error.")
+            break
         if fixtures_dir not in _step_config_cache:
             _step_config_cache[fixtures_dir] = load_step_config(fixtures_dir)
         system_prompt, user_prompt_template = _step_config_cache[fixtures_dir]
diff --git a/tools/skill-evals/tests/test_runner.py 
b/tools/skill-evals/tests/test_runner.py
index dd95a44..f29960e 100644
--- a/tools/skill-evals/tests/test_runner.py
+++ b/tools/skill-evals/tests/test_runner.py
@@ -751,6 +751,51 @@ def test_cli_mode_fail_with_wrong_json(tmp_path: Path, 
capsys: pytest.CaptureFix
     assert "FAIL" in stdout
     assert "1 failed" in stdout
 
+def test_cli_mode_fail_with_wrong_jsons(tmp_path: Path, capsys: 
pytest.CaptureFixture[str]):
+    """A CLI that returns the wrong JSON for every case should report multiple 
FAILs and exit non-zero."""
+    fixtures_dir, _ = _make_cli_case(tmp_path, expected={"verdict": "ok"})
+    _make_case(fixtures_dir, "case-2", report="another report", 
expected={"verdict": "ok"})
+    rc, stdout, _ = _run_main(
+        capsys,
+        ["--cli", 'echo \'{"verdict": "wrong"}\'', str(fixtures_dir)],
+    )
+    assert rc == 1
+    assert "FAIL" in stdout
+    assert "2 failed" in stdout # without --fail-fast the behaviour is unchanged: 
both cases run and are reported as failures; stopping at the first failure is 
covered by the next test case
+
+def test_cli_model_with_fail_fast_and_wrong_json(tmp_path: Path, capsys: 
pytest.CaptureFixture[str]):
+    """With --fail-fast, the runner should stop at the first failure and not 
run further cases."""
+    fixtures_dir, _ = _make_cli_case(tmp_path, expected={"verdict": "ok"})
+    # Add a second case that would FAIL if it ran, but should be skipped due 
to fail-fast.
+    _make_case(fixtures_dir, "case-2", report="another report", 
expected={"verdict": "ok"})
+    rc, stdout, _ = _run_main(
+        capsys,
+        ["--cli", 'echo \'{"verdict": "wrong"}\'', "--fail-fast", 
str(fixtures_dir)],
+    )
+    assert rc == 1
+    assert "FAIL" in stdout
+    assert "1 failed" in stdout
+    assert "CASE: case-2" not in stdout  # second case should not run at all
+
+def test_cli_model_with_fail_fast_and_error_json(tmp_path: Path, capsys: 
pytest.CaptureFixture[str]):
+    """With --fail-fast, the runner should stop at the first ERROR (forced here 
by a negative --timeout) and not run further cases."""
+    fixtures_dir, _ = _make_cli_case(tmp_path, expected={"verdict": "wrong"})
+    _make_case(fixtures_dir, "case-2", report="another report", 
expected={"verdict": "ok"})
+    rc, stdout, _ = _run_main(
+        capsys,
+        [
+            "--cli",
+            'echo \'{"verdict": "wrong"}\'',
+            '--timeout',"-1",  # force an error (timeout) instead of a fail, 
to check that fail-fast also applies to errors
+            "--exact",
+            "--fail-fast",
+            str(fixtures_dir),
+        ],
+    )
+    assert rc == 1
+    assert "ERROR" in stdout
+    assert "1 errored" in stdout
+    assert "CASE: case-2" not in stdout  # second case should not run at all
 
 def test_cli_mode_manual_skips_structural(tmp_path: Path, capsys: 
pytest.CaptureFixture[str]):
     """Structural expected.json (has_* / mention_*) is reported MANUAL, not 
auto-compared."""

(airflow-steward) branch main updated: feat(tools/skill-evals): add --fail-fast flag to stop on first failure (#408)

Reply via email to