This is an automated email from the ASF dual-hosted git repository.
potiuk pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow-steward.git
The following commit(s) were added to refs/heads/main by this push:
new 68660702 feat(validator): add SOFT eval-coverage check (check #8) (#481)
68660702 is described below
commit 68660702fb75be4e38f75f9374064bb5244ecf8d
Author: Justin Mclean <[email protected]>
AuthorDate: Thu Jun 11 19:18:14 2026 +1000
feat(validator): add SOFT eval-coverage check (check #8) (#481)
Every skill under skills/ must ship a matching behavioural eval suite
under tools/skill-evals/evals/<slug>/. The new validate_eval_coverage
function surfaces missing suites as SOFT advisory violations so that
in-flight eval PRs do not fail the gate while their branches are pending
review.
Against the live repo the check correctly flags the two skills that
currently have in-flight eval branches (pr-management-quick-merge and
setup-status) and is silent on all others. 8 new test cases cover the
happy path, the missing-eval path, missing-both-dirs paths, the
soft-category membership, and the non-directory skip.
Addresses the Known Gap in specs/meta-and-quality-tooling.md:
"Eval coverage is incomplete — skills added before the per-skill-eval
convention have no suite." The check prevents future regressions.
Generated-by: Claude (Opus 4.7)
---
.../src/skill_and_tool_validator/__init__.py | 48 ++++++++++++-
.../tests/test_validator.py | 83 ++++++++++++++++++++++
2 files changed, 130 insertions(+), 1 deletion(-)
diff --git a/tools/skill-and-tool-validator/src/skill_and_tool_validator/__init__.py b/tools/skill-and-tool-validator/src/skill_and_tool_validator/__init__.py
index 1d12cbf0..4b617d71 100644
--- a/tools/skill-and-tool-validator/src/skill_and_tool_validator/__init__.py
+++ b/tools/skill-and-tool-validator/src/skill_and_tool_validator/__init__.py
@@ -17,7 +17,7 @@
"""Validate framework skill definitions.
-This module validates eight aspects of every skill under
+This module validates nine aspects of every skill under
skills/:
1. YAML frontmatter — every SKILL.md must have a valid frontmatter
@@ -49,6 +49,11 @@ skills/:
Apache Software Foundation license preamble. Skill ``.md`` files
declare their license via the required ``license:`` frontmatter key
(checked by aspect 1), so they need no separate header.
+9. Eval-coverage (SOFT) — every skill directory under ``skills/``
+ must have a matching behavioural eval suite under
+ ``tools/skill-evals/evals/<slug>/``. Missing suites are
+ advisories so in-flight eval PRs do not block the gate while
+ their branches are pending review.
SOFT categories surface as advisory warnings (stderr) without
failing the run unless ``--strict`` is passed.
@@ -74,6 +79,7 @@ from pathlib import Path
SKILLS_DIR = Path("skills")
TOOLS_DIR = Path("tools")
DOCS_DIR = Path("docs")
+SKILL_EVALS_DIR = Path("tools/skill-evals/evals")
PROJECTS_TEMPLATE_DIR = Path("projects/_template")
# Categories for the tool-validator block. Both HARD by default — every
@@ -89,6 +95,8 @@ TOOL_CAPABILITY_RE = re.compile(r"^\*\*Capability:\*\*[ \t]+(.+)$", re.MULTILINE
# with live skill frontmatter + tool README declarations.
DOCS_LABELS_AND_CAPABILITIES = Path("docs/labels-and-capabilities.md")
CAPABILITY_SYNC_CATEGORY = "capability-sync"
+# Eval-coverage check: every skill must have a matching eval suite.
+EVAL_COVERAGE_CATEGORY = "eval-coverage"
_SKILL_TABLE_HEADER = "## Capability to skill map"
_TOOL_TABLE_HEADER = "## Capability to tool map"
# Tokens like `capability:setup`. Optional backticks around the token.
@@ -262,6 +270,7 @@ SOFT_CATEGORIES: frozenset[str] = frozenset(
GH_LIST_CATEGORY,
PRIVACY_CATEGORY,
LOWERCASE_F_FIELD_CATEGORY,
+ EVAL_COVERAGE_CATEGORY,
}
)
HARD_CATEGORIES: frozenset[str] = frozenset(
@@ -1728,6 +1737,40 @@ def collect_doc_files(root: Path | None = None) -> set[Path]:
return files
+# ---------------------------------------------------------------------------
+# Eval-coverage check (check #9, SOFT)
+# ---------------------------------------------------------------------------
+
+
+def validate_eval_coverage(root: Path | None = None) -> Iterable[Violation]:
+ """Warn when a skill directory has no matching eval suite.
+
+ Every skill under skills/ must have a behavioural eval suite under
+ tools/skill-evals/evals/<slug>/. Missing suites surface as SOFT
+ advisories so in-flight eval PRs do not fail the gate while their
+ branches are pending review.
+ """
+ repo_root = root or find_repo_root()
+ skills_base = repo_root / SKILLS_DIR
+ evals_base = repo_root / SKILL_EVALS_DIR
+ if not skills_base.exists():
+ return
+ eval_slugs: set[str] = set()
+ if evals_base.exists():
+ eval_slugs = {p.name for p in evals_base.iterdir() if p.is_dir()}
+ for skill_dir in sorted(skills_base.iterdir()):
+ if not skill_dir.is_dir():
+ continue
+ slug = skill_dir.name
+ if slug not in eval_slugs:
+ yield Violation(
+ skill_dir / "SKILL.md",
+ None,
+ f"eval-coverage: no eval suite at tools/skill-evals/evals/{slug}/ — add one before shipping",
+ category=EVAL_COVERAGE_CATEGORY,
+ )
+
+
def run_validation(root: Path | None = None) -> list[Violation]:
"""Run the full validation suite and return all violations."""
repo_root = root or find_repo_root()
@@ -1774,6 +1817,9 @@ def run_validation(root: Path | None = None) -> list[Violation]:
# Capability-sync check: the doc tables and the source must agree.
violations.extend(validate_capability_sync(repo_root))
+ # Eval-coverage check: every skill must have a matching eval suite.
+ violations.extend(validate_eval_coverage(repo_root))
+
return violations
diff --git a/tools/skill-and-tool-validator/tests/test_validator.py b/tools/skill-and-tool-validator/tests/test_validator.py
index 2e5430b3..3fda038f 100644
--- a/tools/skill-and-tool-validator/tests/test_validator.py
+++ b/tools/skill-and-tool-validator/tests/test_validator.py
@@ -30,6 +30,7 @@ from skill_and_tool_validator import (
_PRIVACY_EXTERNAL_CONTENT_MODES,
ALL_CATEGORIES,
ALLOWED_MODES,
+ EVAL_COVERAGE_CATEGORY,
FORBIDDEN_PATTERNS,
GH_LIST_CATEGORY,
HARD_CATEGORIES,
@@ -61,6 +62,7 @@ from skill_and_tool_validator import (
run_validation,
slugify,
validate_capability_sync,
+ validate_eval_coverage,
validate_frontmatter,
validate_gh_list_limit,
validate_injection_guard,
@@ -2399,3 +2401,84 @@ class TestValidateCapabilitySync:
# The parenthetical capability:reconciliation must NOT be flagged as a doc-side declared capability;
# the row's authoritative capability is just intake, which matches the live skill.
assert violations == [], [v.message for v in violations]
+
+
+# ---------------------------------------------------------------------------
+# Eval-coverage check
+# ---------------------------------------------------------------------------
+
+
+class TestValidateEvalCoverage:
+ """Tests for validate_eval_coverage (check #9 — SOFT)."""
+
+ def _make_skill(self, root: Path, slug: str) -> None:
+ skill_dir = root / "skills" / slug
+ skill_dir.mkdir(parents=True, exist_ok=True)
+ (skill_dir / "SKILL.md").write_text(
+ f"---\nname: magpie-{slug}\ndescription: test\ncapability: capability:triage\nlicense: Apache-2.0\n---\n"
+ )
+
+ def _make_eval(self, root: Path, slug: str) -> None:
+ eval_dir = root / "tools" / "skill-evals" / "evals" / slug
+ eval_dir.mkdir(parents=True, exist_ok=True)
+ (eval_dir / "README.md").write_text(f"# {slug} evals\n")
+
+ def test_skill_with_matching_eval_passes(self, tmp_path: Path) -> None:
+ self._make_skill(tmp_path, "issue-triage")
+ self._make_eval(tmp_path, "issue-triage")
+ violations = list(validate_eval_coverage(tmp_path))
+ assert violations == []
+
+ def test_skill_without_eval_yields_soft_violation(self, tmp_path: Path) -> None:
+ self._make_skill(tmp_path, "new-skill")
+ # No matching eval directory.
+ violations = list(validate_eval_coverage(tmp_path))
+ assert len(violations) == 1
+ v = violations[0]
+ assert v.category == EVAL_COVERAGE_CATEGORY
+ assert "new-skill" in v.message
+ assert "tools/skill-evals/evals/new-skill/" in v.message
+
+ def test_multiple_skills_some_missing_evals(self, tmp_path: Path) -> None:
+ self._make_skill(tmp_path, "alpha")
+ self._make_skill(tmp_path, "beta")
+ self._make_skill(tmp_path, "gamma")
+ self._make_eval(tmp_path, "alpha")
+ # beta and gamma have no evals.
+ violations = list(validate_eval_coverage(tmp_path))
+ assert len(violations) == 2
+ slugs = {v.path.parent.name for v in violations}
+ assert slugs == {"beta", "gamma"}
+ assert all(v.category == EVAL_COVERAGE_CATEGORY for v in violations)
+
+ def test_no_skills_dir_returns_no_violations(self, tmp_path: Path) -> None:
+ # skills/ does not exist at all.
+ violations = list(validate_eval_coverage(tmp_path))
+ assert violations == []
+
+ def test_no_evals_dir_all_skills_flagged(self, tmp_path: Path) -> None:
+ self._make_skill(tmp_path, "alpha")
+ self._make_skill(tmp_path, "beta")
+ # tools/skill-evals/evals/ does not exist.
+ violations = list(validate_eval_coverage(tmp_path))
+ assert len(violations) == 2
+ assert all(v.category == EVAL_COVERAGE_CATEGORY for v in violations)
+
+ def test_eval_coverage_is_soft_category(self) -> None:
+ assert EVAL_COVERAGE_CATEGORY in SOFT_CATEGORIES
+ assert EVAL_COVERAGE_CATEGORY not in ALL_CATEGORIES - SOFT_CATEGORIES
+
+ def test_violation_path_points_to_skill_md(self, tmp_path: Path) -> None:
+ self._make_skill(tmp_path, "orphan")
+ violations = list(validate_eval_coverage(tmp_path))
+ assert len(violations) == 1
+ assert violations[0].path.name == "SKILL.md"
+ assert violations[0].path.parent.name == "orphan"
+
+ def test_non_directory_entries_in_skills_are_skipped(self, tmp_path: Path) -> None:
+ skills_dir = tmp_path / "skills"
+ skills_dir.mkdir(parents=True)
+ # A plain file (not a directory) must not be treated as a skill.
+ (skills_dir / "README.md").write_text("# skills\n")
+ violations = list(validate_eval_coverage(tmp_path))
+ assert violations == []