This is an automated email from the ASF dual-hosted git repository.
potiuk pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow-steward.git
The following commit(s) were added to refs/heads/main by this push:
new d634318 add support for llama and what tests it can run (#338)
d634318 is described below
commit d634318950fdaf014cdff1902b375665ad6fa2f4
Author: Justin Mclean <[email protected]>
AuthorDate: Wed May 27 20:32:34 2026 +1000
add support for llama and what tests it can run (#338)
---
tools/skill-evals/README.md | 22 ++++++++++++
.../fixtures/case-1-clean-commit/case-meta.json | 6 ++++
.../fixtures/case-2-all-fixed/case-meta.json | 6 ++++
.../fixtures/case-2-no-baselines/case-meta.json | 6 ++++
.../fixtures/case-1-explicit-key/case-meta.json | 6 ++++
.../fixtures/case-2-component/case-meta.json | 6 ++++
.../fixtures/case-1-post-all/case-meta.json | 6 ++++
.../fixtures/case-4-cancel/case-meta.json | 6 ++++
.../fixtures/case-2-verbose-request/case-meta.json | 6 ++++
.../fixtures/case-6-empty-diff/case-meta.json | 6 ++++
.../fixtures/case-3-no-images/case-meta.json | 6 ++++
.../fixtures/case-1-untriaged/case-meta.json | 6 ++++
.../case-meta.json | 6 ++++
.../case-4-triaged-responded-commit/case-meta.json | 6 ++++
.../fixtures/case-5-stale-marker/case-meta.json | 6 ++++
.../fixtures/case-1-clean-pass/case-meta.json | 6 ++++
.../fixtures/case-1-apply-all/case-meta.json | 6 ++++
.../fixtures/case-3-cancel/case-meta.json | 6 ++++
.../fixtures/case-1-clean-pass/case-meta.json | 6 ++++
.../fixtures/case-3-cancel/case-meta.json | 6 ++++
.../fixtures/case-3-cancel/case-meta.json | 6 ++++
.../fixtures/case-1-apply-all/case-meta.json | 6 ++++
.../fixtures/case-3-cancel/case-meta.json | 6 ++++
.../fixtures/case-4-cancel/case-meta.json | 6 ++++
.../case-1-override-yaml-found/case-meta.json | 6 ++++
.../case-2-no-yaml-fallback/case-meta.json | 6 ++++
.../fixtures/case-3-quarterly-arg/case-meta.json | 6 ++++
.../fixtures/case-4-output-path-arg/case-meta.json | 6 ++++
tools/skill-evals/src/skill_evals/runner.py | 35 +++++++++++++++++-
tools/skill-evals/tests/test_runner.py | 41 ++++++++++++++++++++++
30 files changed, 259 insertions(+), 1 deletion(-)
diff --git a/tools/skill-evals/README.md b/tools/skill-evals/README.md
index 4bba6dd..d3cfd66 100644
--- a/tools/skill-evals/README.md
+++ b/tools/skill-evals/README.md
@@ -68,6 +68,11 @@ PYTHONPATH=tools/skill-evals/src python3 -m skill_evals.runner --cli "llm -m gpt
# Add --verbose to also print prompts and the model's raw stdout per case.
PYTHONPATH=tools/skill-evals/src python3 -m skill_evals.runner --cli "claude -p" --verbose \
tools/skill-evals/evals/issue-triage/step-3-classify/fixtures/case-1-clear-bug
+
+# Run only cases tagged as useful smoke tests for local llama3.1:8b.
+PYTHONPATH=tools/skill-evals/src python3 -m skill_evals.runner --tag llama \
+ --cli "ollama run llama3.1:8b --nowordwrap --format json" \
+ tools/skill-evals/evals/
```
**JSON extraction** tries three strategies in order: parse the whole
@@ -88,6 +93,22 @@ comparison is a self-eval pass — useful as a smoke test for prompt /
output-shape regressions, but weaker than a cross-model run. For
substantive changes, also run against a different model class.
+### Case tags
+
+Cases can opt into runner filters with a `case-meta.json` file next to
+`report.md` and `expected.json`:
+
+```json
+{
+ "tags": ["llama", "smoke"]
+}
+```
+
+Use `--tag <name>` to run only matching cases. Tags are intentionally
+conservative: for example, `llama` means the case is known to be useful
+as a local `llama3.1:8b` smoke signal, not that the whole suite is
+expected to pass on that model.
+
## Structure
```text
@@ -103,6 +124,7 @@ evals/
case-N-<name>/
report.md # mock tool call outputs for this case
expected.json # ground-truth JSON the model should produce
+ case-meta.json # optional runner tags, e.g. {"tags":["llama"]}
```
The runner resolves the system prompt in order: `step-config.json` →
`system-prompt.md` → error. When `step-config.json` is present the system
prompt is assembled at run time by extracting the relevant section directly
from the skill's `SKILL.md` and appending `output-spec.md`. This means a change
to `SKILL.md` is immediately reflected in the prompt — if the change would
cause the model to produce different output, the test fails.
diff --git
a/tools/skill-evals/evals/issue-fix-workflow/step-7-compose-commit/fixtures/case-1-clean-commit/case-meta.json
b/tools/skill-evals/evals/issue-fix-workflow/step-7-compose-commit/fixtures/case-1-clean-commit/case-meta.json
new file mode 100644
index 0000000..aa8c78e
--- /dev/null
+++
b/tools/skill-evals/evals/issue-fix-workflow/step-7-compose-commit/fixtures/case-1-clean-commit/case-meta.json
@@ -0,0 +1,6 @@
+{
+ "tags": [
+ "llama",
+ "smoke"
+ ]
+}
diff --git
a/tools/skill-evals/evals/issue-reassess-stats/step-2-classify/fixtures/case-2-all-fixed/case-meta.json
b/tools/skill-evals/evals/issue-reassess-stats/step-2-classify/fixtures/case-2-all-fixed/case-meta.json
new file mode 100644
index 0000000..aa8c78e
--- /dev/null
+++
b/tools/skill-evals/evals/issue-reassess-stats/step-2-classify/fixtures/case-2-all-fixed/case-meta.json
@@ -0,0 +1,6 @@
+{
+ "tags": [
+ "llama",
+ "smoke"
+ ]
+}
diff --git
a/tools/skill-evals/evals/issue-reproducer/step-8-baselines/fixtures/case-2-no-baselines/case-meta.json
b/tools/skill-evals/evals/issue-reproducer/step-8-baselines/fixtures/case-2-no-baselines/case-meta.json
new file mode 100644
index 0000000..aa8c78e
--- /dev/null
+++
b/tools/skill-evals/evals/issue-reproducer/step-8-baselines/fixtures/case-2-no-baselines/case-meta.json
@@ -0,0 +1,6 @@
+{
+ "tags": [
+ "llama",
+ "smoke"
+ ]
+}
diff --git
a/tools/skill-evals/evals/issue-triage/step-1-resolve-selector/fixtures/case-1-explicit-key/case-meta.json
b/tools/skill-evals/evals/issue-triage/step-1-resolve-selector/fixtures/case-1-explicit-key/case-meta.json
new file mode 100644
index 0000000..aa8c78e
--- /dev/null
+++
b/tools/skill-evals/evals/issue-triage/step-1-resolve-selector/fixtures/case-1-explicit-key/case-meta.json
@@ -0,0 +1,6 @@
+{
+ "tags": [
+ "llama",
+ "smoke"
+ ]
+}
diff --git
a/tools/skill-evals/evals/issue-triage/step-1-resolve-selector/fixtures/case-2-component/case-meta.json
b/tools/skill-evals/evals/issue-triage/step-1-resolve-selector/fixtures/case-2-component/case-meta.json
new file mode 100644
index 0000000..aa8c78e
--- /dev/null
+++
b/tools/skill-evals/evals/issue-triage/step-1-resolve-selector/fixtures/case-2-component/case-meta.json
@@ -0,0 +1,6 @@
+{
+ "tags": [
+ "llama",
+ "smoke"
+ ]
+}
diff --git
a/tools/skill-evals/evals/issue-triage/step-5-confirm/fixtures/case-1-post-all/case-meta.json
b/tools/skill-evals/evals/issue-triage/step-5-confirm/fixtures/case-1-post-all/case-meta.json
new file mode 100644
index 0000000..aa8c78e
--- /dev/null
+++
b/tools/skill-evals/evals/issue-triage/step-5-confirm/fixtures/case-1-post-all/case-meta.json
@@ -0,0 +1,6 @@
+{
+ "tags": [
+ "llama",
+ "smoke"
+ ]
+}
diff --git
a/tools/skill-evals/evals/issue-triage/step-5-confirm/fixtures/case-4-cancel/case-meta.json
b/tools/skill-evals/evals/issue-triage/step-5-confirm/fixtures/case-4-cancel/case-meta.json
new file mode 100644
index 0000000..aa8c78e
--- /dev/null
+++
b/tools/skill-evals/evals/issue-triage/step-5-confirm/fixtures/case-4-cancel/case-meta.json
@@ -0,0 +1,6 @@
+{
+ "tags": [
+ "llama",
+ "smoke"
+ ]
+}
diff --git
a/tools/skill-evals/evals/list-steward-skills/step-1-command/fixtures/case-2-verbose-request/case-meta.json
b/tools/skill-evals/evals/list-steward-skills/step-1-command/fixtures/case-2-verbose-request/case-meta.json
new file mode 100644
index 0000000..aa8c78e
--- /dev/null
+++
b/tools/skill-evals/evals/list-steward-skills/step-1-command/fixtures/case-2-verbose-request/case-meta.json
@@ -0,0 +1,6 @@
+{
+ "tags": [
+ "llama",
+ "smoke"
+ ]
+}
diff --git
a/tools/skill-evals/evals/pairing-self-review/step-2-classify-findings/fixtures/case-6-empty-diff/case-meta.json
b/tools/skill-evals/evals/pairing-self-review/step-2-classify-findings/fixtures/case-6-empty-diff/case-meta.json
new file mode 100644
index 0000000..aa8c78e
--- /dev/null
+++
b/tools/skill-evals/evals/pairing-self-review/step-2-classify-findings/fixtures/case-6-empty-diff/case-meta.json
@@ -0,0 +1,6 @@
+{
+ "tags": [
+ "llama",
+ "smoke"
+ ]
+}
diff --git
a/tools/skill-evals/evals/pr-management-code-review/step-4-image-ip/fixtures/case-3-no-images/case-meta.json
b/tools/skill-evals/evals/pr-management-code-review/step-4-image-ip/fixtures/case-3-no-images/case-meta.json
new file mode 100644
index 0000000..aa8c78e
--- /dev/null
+++
b/tools/skill-evals/evals/pr-management-code-review/step-4-image-ip/fixtures/case-3-no-images/case-meta.json
@@ -0,0 +1,6 @@
+{
+ "tags": [
+ "llama",
+ "smoke"
+ ]
+}
diff --git
a/tools/skill-evals/evals/pr-management-stats/classify/fixtures/case-1-untriaged/case-meta.json
b/tools/skill-evals/evals/pr-management-stats/classify/fixtures/case-1-untriaged/case-meta.json
new file mode 100644
index 0000000..aa8c78e
--- /dev/null
+++
b/tools/skill-evals/evals/pr-management-stats/classify/fixtures/case-1-untriaged/case-meta.json
@@ -0,0 +1,6 @@
+{
+ "tags": [
+ "llama",
+ "smoke"
+ ]
+}
diff --git
a/tools/skill-evals/evals/pr-management-stats/classify/fixtures/case-3-triaged-responded-comment/case-meta.json
b/tools/skill-evals/evals/pr-management-stats/classify/fixtures/case-3-triaged-responded-comment/case-meta.json
new file mode 100644
index 0000000..aa8c78e
--- /dev/null
+++
b/tools/skill-evals/evals/pr-management-stats/classify/fixtures/case-3-triaged-responded-comment/case-meta.json
@@ -0,0 +1,6 @@
+{
+ "tags": [
+ "llama",
+ "smoke"
+ ]
+}
diff --git
a/tools/skill-evals/evals/pr-management-stats/classify/fixtures/case-4-triaged-responded-commit/case-meta.json
b/tools/skill-evals/evals/pr-management-stats/classify/fixtures/case-4-triaged-responded-commit/case-meta.json
new file mode 100644
index 0000000..aa8c78e
--- /dev/null
+++
b/tools/skill-evals/evals/pr-management-stats/classify/fixtures/case-4-triaged-responded-commit/case-meta.json
@@ -0,0 +1,6 @@
+{
+ "tags": [
+ "llama",
+ "smoke"
+ ]
+}
diff --git
a/tools/skill-evals/evals/pr-management-stats/classify/fixtures/case-5-stale-marker/case-meta.json
b/tools/skill-evals/evals/pr-management-stats/classify/fixtures/case-5-stale-marker/case-meta.json
new file mode 100644
index 0000000..aa8c78e
--- /dev/null
+++
b/tools/skill-evals/evals/pr-management-stats/classify/fixtures/case-5-stale-marker/case-meta.json
@@ -0,0 +1,6 @@
+{
+ "tags": [
+ "llama",
+ "smoke"
+ ]
+}
diff --git
a/tools/skill-evals/evals/security-cve-allocate/step-1-blocker-checks/fixtures/case-1-clean-pass/case-meta.json
b/tools/skill-evals/evals/security-cve-allocate/step-1-blocker-checks/fixtures/case-1-clean-pass/case-meta.json
new file mode 100644
index 0000000..aa8c78e
--- /dev/null
+++
b/tools/skill-evals/evals/security-cve-allocate/step-1-blocker-checks/fixtures/case-1-clean-pass/case-meta.json
@@ -0,0 +1,6 @@
+{
+ "tags": [
+ "llama",
+ "smoke"
+ ]
+}
diff --git
a/tools/skill-evals/evals/security-cve-allocate/step-5-confirm/fixtures/case-1-apply-all/case-meta.json
b/tools/skill-evals/evals/security-cve-allocate/step-5-confirm/fixtures/case-1-apply-all/case-meta.json
new file mode 100644
index 0000000..aa8c78e
--- /dev/null
+++
b/tools/skill-evals/evals/security-cve-allocate/step-5-confirm/fixtures/case-1-apply-all/case-meta.json
@@ -0,0 +1,6 @@
+{
+ "tags": [
+ "llama",
+ "smoke"
+ ]
+}
diff --git
a/tools/skill-evals/evals/security-cve-allocate/step-5-confirm/fixtures/case-3-cancel/case-meta.json
b/tools/skill-evals/evals/security-cve-allocate/step-5-confirm/fixtures/case-3-cancel/case-meta.json
new file mode 100644
index 0000000..aa8c78e
--- /dev/null
+++
b/tools/skill-evals/evals/security-cve-allocate/step-5-confirm/fixtures/case-3-cancel/case-meta.json
@@ -0,0 +1,6 @@
+{
+ "tags": [
+ "llama",
+ "smoke"
+ ]
+}
diff --git
a/tools/skill-evals/evals/security-issue-deduplicate/step-1-classify/fixtures/case-1-clean-pass/case-meta.json
b/tools/skill-evals/evals/security-issue-deduplicate/step-1-classify/fixtures/case-1-clean-pass/case-meta.json
new file mode 100644
index 0000000..aa8c78e
--- /dev/null
+++
b/tools/skill-evals/evals/security-issue-deduplicate/step-1-classify/fixtures/case-1-clean-pass/case-meta.json
@@ -0,0 +1,6 @@
+{
+ "tags": [
+ "llama",
+ "smoke"
+ ]
+}
diff --git
a/tools/skill-evals/evals/security-issue-deduplicate/step-5-confirm/fixtures/case-3-cancel/case-meta.json
b/tools/skill-evals/evals/security-issue-deduplicate/step-5-confirm/fixtures/case-3-cancel/case-meta.json
new file mode 100644
index 0000000..aa8c78e
--- /dev/null
+++
b/tools/skill-evals/evals/security-issue-deduplicate/step-5-confirm/fixtures/case-3-cancel/case-meta.json
@@ -0,0 +1,6 @@
+{
+ "tags": [
+ "llama",
+ "smoke"
+ ]
+}
diff --git
a/tools/skill-evals/evals/security-issue-import/step-6-confirm/fixtures/case-3-cancel/case-meta.json
b/tools/skill-evals/evals/security-issue-import/step-6-confirm/fixtures/case-3-cancel/case-meta.json
new file mode 100644
index 0000000..aa8c78e
--- /dev/null
+++
b/tools/skill-evals/evals/security-issue-import/step-6-confirm/fixtures/case-3-cancel/case-meta.json
@@ -0,0 +1,6 @@
+{
+ "tags": [
+ "llama",
+ "smoke"
+ ]
+}
diff --git
a/tools/skill-evals/evals/security-issue-sync/step-3-confirm/fixtures/case-1-apply-all/case-meta.json
b/tools/skill-evals/evals/security-issue-sync/step-3-confirm/fixtures/case-1-apply-all/case-meta.json
new file mode 100644
index 0000000..aa8c78e
--- /dev/null
+++
b/tools/skill-evals/evals/security-issue-sync/step-3-confirm/fixtures/case-1-apply-all/case-meta.json
@@ -0,0 +1,6 @@
+{
+ "tags": [
+ "llama",
+ "smoke"
+ ]
+}
diff --git
a/tools/skill-evals/evals/security-issue-sync/step-3-confirm/fixtures/case-3-cancel/case-meta.json
b/tools/skill-evals/evals/security-issue-sync/step-3-confirm/fixtures/case-3-cancel/case-meta.json
new file mode 100644
index 0000000..aa8c78e
--- /dev/null
+++
b/tools/skill-evals/evals/security-issue-sync/step-3-confirm/fixtures/case-3-cancel/case-meta.json
@@ -0,0 +1,6 @@
+{
+ "tags": [
+ "llama",
+ "smoke"
+ ]
+}
diff --git
a/tools/skill-evals/evals/security-issue-triage/step-5-confirm/fixtures/case-4-cancel/case-meta.json
b/tools/skill-evals/evals/security-issue-triage/step-5-confirm/fixtures/case-4-cancel/case-meta.json
new file mode 100644
index 0000000..aa8c78e
--- /dev/null
+++
b/tools/skill-evals/evals/security-issue-triage/step-5-confirm/fixtures/case-4-cancel/case-meta.json
@@ -0,0 +1,6 @@
+{
+ "tags": [
+ "llama",
+ "smoke"
+ ]
+}
diff --git
a/tools/skill-evals/evals/security-tracker-stats-dashboard/step-1-resolve-config/fixtures/case-1-override-yaml-found/case-meta.json
b/tools/skill-evals/evals/security-tracker-stats-dashboard/step-1-resolve-config/fixtures/case-1-override-yaml-found/case-meta.json
new file mode 100644
index 0000000..aa8c78e
--- /dev/null
+++
b/tools/skill-evals/evals/security-tracker-stats-dashboard/step-1-resolve-config/fixtures/case-1-override-yaml-found/case-meta.json
@@ -0,0 +1,6 @@
+{
+ "tags": [
+ "llama",
+ "smoke"
+ ]
+}
diff --git
a/tools/skill-evals/evals/security-tracker-stats-dashboard/step-1-resolve-config/fixtures/case-2-no-yaml-fallback/case-meta.json
b/tools/skill-evals/evals/security-tracker-stats-dashboard/step-1-resolve-config/fixtures/case-2-no-yaml-fallback/case-meta.json
new file mode 100644
index 0000000..aa8c78e
--- /dev/null
+++
b/tools/skill-evals/evals/security-tracker-stats-dashboard/step-1-resolve-config/fixtures/case-2-no-yaml-fallback/case-meta.json
@@ -0,0 +1,6 @@
+{
+ "tags": [
+ "llama",
+ "smoke"
+ ]
+}
diff --git
a/tools/skill-evals/evals/security-tracker-stats-dashboard/step-1-resolve-config/fixtures/case-3-quarterly-arg/case-meta.json
b/tools/skill-evals/evals/security-tracker-stats-dashboard/step-1-resolve-config/fixtures/case-3-quarterly-arg/case-meta.json
new file mode 100644
index 0000000..aa8c78e
--- /dev/null
+++
b/tools/skill-evals/evals/security-tracker-stats-dashboard/step-1-resolve-config/fixtures/case-3-quarterly-arg/case-meta.json
@@ -0,0 +1,6 @@
+{
+ "tags": [
+ "llama",
+ "smoke"
+ ]
+}
diff --git
a/tools/skill-evals/evals/security-tracker-stats-dashboard/step-1-resolve-config/fixtures/case-4-output-path-arg/case-meta.json
b/tools/skill-evals/evals/security-tracker-stats-dashboard/step-1-resolve-config/fixtures/case-4-output-path-arg/case-meta.json
new file mode 100644
index 0000000..aa8c78e
--- /dev/null
+++
b/tools/skill-evals/evals/security-tracker-stats-dashboard/step-1-resolve-config/fixtures/case-4-output-path-arg/case-meta.json
@@ -0,0 +1,6 @@
+{
+ "tags": [
+ "llama",
+ "smoke"
+ ]
+}
diff --git a/tools/skill-evals/src/skill_evals/runner.py
b/tools/skill-evals/src/skill_evals/runner.py
index 286077a..5da34e9 100644
--- a/tools/skill-evals/src/skill_evals/runner.py
+++ b/tools/skill-evals/src/skill_evals/runner.py
@@ -202,6 +202,22 @@ def load_case(case_dir: Path) -> tuple[list[dict], dict, str, dict]:
return corpus, roster, report, expected
+def load_case_tags(case_dir: Path) -> set[str]:
+ """Return optional runner-selection tags for a case.
+
+ Tags live in ``case-meta.json`` so expected.json stays focused on the
+ behavioral assertion. Unknown metadata keys are ignored.
+ """
+ meta_path = case_dir / "case-meta.json"
+ if not meta_path.exists():
+ return set()
+ meta = json.loads(meta_path.read_text())
+ tags = meta.get("tags", [])
+ if not isinstance(tags, list) or not all(isinstance(tag, str) for tag in tags):
+ raise ValueError(f"{meta_path} must contain a string-list 'tags' field")
+ return set(tags)
+
+
# ---------------------------------------------------------------------------
# Automated comparison (--cli mode)
# ---------------------------------------------------------------------------
@@ -399,11 +415,28 @@ def main(argv: list[str] | None = None) -> int:
action="store_true",
help="In --cli mode, also print the prompts and the model's raw stdout per case.",
)
+ parser.add_argument(
+ "--tag",
+ action="append",
+ default=[],
+ help=(
+ "Run only cases tagged in case-meta.json. May be passed multiple "
+ "times; a case is included if it has all requested tags."
+ ),
+ )
args = parser.parse_args(argv)
cases = find_cases(args.path)
+ if args.tag:
+ requested_tags = set(args.tag)
+ cases = [
+ (case_dir, fixtures_dir)
+ for case_dir, fixtures_dir in cases
+ if requested_tags.issubset(load_case_tags(case_dir))
+ ]
if not cases:
- print(f"No eval cases found under {args.path}", file=sys.stderr)
+ tag_suffix = f" matching tag(s): {', '.join(args.tag)}" if args.tag else ""
+ print(f"No eval cases found under {args.path}{tag_suffix}", file=sys.stderr)
return 1
# Cache loaded step configs so we don't re-read prompts for every case in
diff --git a/tools/skill-evals/tests/test_runner.py
b/tools/skill-evals/tests/test_runner.py
index 3b9c65a..f6dbc14 100644
--- a/tools/skill-evals/tests/test_runner.py
+++ b/tools/skill-evals/tests/test_runner.py
@@ -34,6 +34,7 @@ from skill_evals.runner import (
find_repo_root,
is_structural_expected,
load_case,
+ load_case_tags,
load_step_config,
main,
)
@@ -378,6 +379,30 @@ def test_load_case_loads_optional_roster(tmp_path: Path):
assert roster == roster_data
+def test_load_case_tags_missing_meta_returns_empty_set(tmp_path: Path):
+ fixtures_dir = tmp_path / "fixtures"
+ fixtures_dir.mkdir()
+ case_dir = _make_case(fixtures_dir, "case-1")
+ assert load_case_tags(case_dir) == set()
+
+
+def test_load_case_tags_reads_case_meta(tmp_path: Path):
+ fixtures_dir = tmp_path / "fixtures"
+ fixtures_dir.mkdir()
+ case_dir = _make_case(fixtures_dir, "case-1")
+ (case_dir / "case-meta.json").write_text(json.dumps({"tags": ["llama", "smoke"]}))
+ assert load_case_tags(case_dir) == {"llama", "smoke"}
+
+
+def test_load_case_tags_rejects_non_string_tags(tmp_path: Path):
+ fixtures_dir = tmp_path / "fixtures"
+ fixtures_dir.mkdir()
+ case_dir = _make_case(fixtures_dir, "case-1")
+ (case_dir / "case-meta.json").write_text(json.dumps({"tags": ["llama", 3]}))
+ with pytest.raises(ValueError, match="tags"):
+ load_case_tags(case_dir)
+
+
# ---------------------------------------------------------------------------
# find_cases
# ---------------------------------------------------------------------------
@@ -769,3 +794,19 @@ def test_cli_mode_summary_counts(tmp_path: Path, capsys:
pytest.CaptureFixture[s
assert rc == 1 # because one case FAILs
assert "1 passed" in stdout
assert "1 failed" in stdout
+
+
+def test_tag_filter_runs_only_matching_cases(tmp_path: Path, capsys: pytest.CaptureFixture[str]):
+ """--tag should restrict discovered cases before invoking the CLI."""
+ expected = {"verdict": "ok"}
+ fixtures_dir, case_dir = _make_cli_case(tmp_path, expected=expected)
+ _make_case(fixtures_dir, "case-2-untagged", report="x",
expected={"verdict": "different"})
+ (case_dir / "case-meta.json").write_text(json.dumps({"tags": ["llama"]}))
+
+ rc, stdout, _ = _run_main(
+ capsys,
+ ["--tag", "llama", "--cli", f"echo '{json.dumps(expected)}'",
str(fixtures_dir)],
+ )
+ assert rc == 0
+ assert "1 passed" in stdout
+ assert "case-2-untagged" not in stdout