Copilot commented on code in PR #64008: URL: https://github.com/apache/airflow/pull/64008#discussion_r3066489669
########## scripts/ci/pre_commit/extract_agent_skills.py: ########## @@ -0,0 +1,220 @@ +#!/usr/bin/env python +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use it except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Extract and validate .. agent-skill:: blocks from AGENTS.md and contributing-docs/*.rst into skills.json.""" +from __future__ import annotations + +import argparse +import json +import re +import sys +from pathlib import Path + + +def collect_source_texts(root: Path, docs_file: str, docs_dir: str) -> list[tuple[Path, str]]: + """Return list of (path, text) for AGENTS.md and all .rst under docs_dir.""" + result: list[tuple[Path, str]] = [] + agents_path = Path(docs_file) if Path(docs_file).is_absolute() else root / docs_file + if agents_path.exists(): + result.append((agents_path, agents_path.read_text(encoding="utf-8"))) + contrib_dir = Path(docs_dir) if Path(docs_dir).is_absolute() else root / docs_dir + if contrib_dir.is_dir(): + for rst_path in sorted(contrib_dir.rglob("*.rst")): + result.append((rst_path, rst_path.read_text(encoding="utf-8"))) + return result + +VALID_CONTEXTS = {"host", "breeze", "either"} +VALID_CATEGORIES = {"environment", "testing", "linting", "providers", "dags", "documentation"} +ID_RE = re.compile(r"^[a-z][a-z0-9-]*$") + + +def parse_skills(text: str) -> list[dict]: + """Extract agent-skill blocks from one file's text; return list of skill dicts.""" + blocks = re.split(r"\n\.\. agent-skill::\s*\n", text) + skills = [] + for block in blocks[1:]: # skip content before first block + skill = {} + # Options: :key: value (value can continue on next line if indented) + for m in re.finditer(r"^ :(\w+):\s*(.*?)(?=^ :\w+:|^ \.\.|\Z)", block, re.M | re.S): + key, val = m.group(1), m.group(2) + skill[key] = re.sub(r"\n\s+", " ", val).strip() + # code-block content + cb = re.search(r"\.\. code-block::\s*\w+\s*\n(.*?)(?=\n \.\.|\Z)", block, re.S) + skill["command"] = re.sub(r"^ ", "", cb.group(1).strip()).replace("\n ", "\n") if cb else "" + # expected output + eo = re.search(r"\.\. agent-skill-expected-output::\s*\n(.*?)(?=\n\.\. agent-skill|\Z)", block, re.S) + skill["expected_output"] = re.sub(r"^ ", "", eo.group(1).strip()).replace("\n ", "\n") if eo else "" + skills.append(skill) + return skills + + +def validate_skill(skill: dict, index: int) -> None: + """Raise ValueError if skill is invalid.""" + sid = skill.get("id", "") + if not sid: + raise ValueError(f"Skill at index {index}: missing required field 'id'") + if not ID_RE.match(sid): + raise ValueError(f"Skill '{sid}': id must match ^[a-z][a-z0-9-]*$") + ctx = skill.get("context", "") + if not ctx: + raise ValueError(f"Skill '{sid}': missing required field 'context'") + if ctx not in VALID_CONTEXTS: + raise ValueError(f"Skill '{sid}': context must be one of {sorted(VALID_CONTEXTS)}, got '{ctx}'") + cat = skill.get("category", "") + if not cat: + raise ValueError(f"Skill '{sid}': missing required field 'category'") + if cat not in VALID_CATEGORIES: + raise ValueError(f"Skill '{sid}': category must be one of {sorted(VALID_CATEGORIES)}, got '{cat}'") + desc = (skill.get("description") or "").strip() + if not desc: + raise ValueError(f"Skill '{sid}': missing or empty required field 'description'") + cmd = (skill.get("command") or "").strip() + if not cmd: + raise ValueError(f"Skill '{sid}': missing or empty required field 'command'") + + +def _skill_key_set(skill: dict) -> set[str]: + """Return set of keys that define skill identity and content for comparison.""" + return {"id", "context", "category", "prereqs", "validates", "description", "command", "expected_output"} Review Comment: The drift comparison key set omits fields that are present in skills (e.g. `fallback` / `fallback_condition`). As a result, `--check` will not detect drift when those fields change, even though they affect behavior/UX for agents. Please include all supported fields in `_skill_key_set()` (or explicitly document which fields are intentionally ignored) so drift detection is complete. ```suggestion return { "id", "context", "category", "prereqs", "validates", "description", "command", "expected_output", "fallback", "fallback_condition", } ``` ########## contributing-docs/agent_skills/skill_graph.json: ########## @@ -0,0 +1,71 @@ +{ + "nodes": [ + { + "id": "setup-breeze-environment", + "category": "environment", + "context": "host" + }, + { + "id": "run-single-test", + "category": "testing", + "context": "host" + }, + { + "id": "run-static-checks", + "category": "linting", + "context": "host" + }, + { + "id": "run-manual-checks", + "category": "linting", + "context": "host" + }, + { + "id": "build-docs", + "category": "documentation", + "context": "host" + }, + { + "id": "run-provider-tests", + "category": "testing", + "context": "host" + }, + { + "id": "run-type-check", + "category": "linting", + "context": "host" + } + ], Review Comment: `skill_graph.json` appears out of sync with `skills.json`: it lists only 7 nodes and is missing the newly added skills (e.g. `run-tests-uv-first`, `run-static-checks-prek`). Please regenerate `skill_graph.json` from the current `skills.json` (and ensure the pre-commit hook updates it deterministically). ########## scripts/ci/pre_commit/extract_agent_skills.py: ########## @@ -0,0 +1,220 @@ +#!/usr/bin/env python +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use it except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Extract and validate .. agent-skill:: blocks from AGENTS.md and contributing-docs/*.rst into skills.json.""" +from __future__ import annotations + +import argparse +import json +import re +import sys +from pathlib import Path + + +def collect_source_texts(root: Path, docs_file: str, docs_dir: str) -> list[tuple[Path, str]]: + """Return list of (path, text) for AGENTS.md and all .rst under docs_dir.""" + result: list[tuple[Path, str]] = [] + agents_path = Path(docs_file) if Path(docs_file).is_absolute() else root / docs_file + if agents_path.exists(): + result.append((agents_path, agents_path.read_text(encoding="utf-8"))) + contrib_dir = Path(docs_dir) if Path(docs_dir).is_absolute() else root / docs_dir + if contrib_dir.is_dir(): + for rst_path in sorted(contrib_dir.rglob("*.rst")): + result.append((rst_path, rst_path.read_text(encoding="utf-8"))) + return result + +VALID_CONTEXTS = {"host", "breeze", "either"} +VALID_CATEGORIES = {"environment", "testing", "linting", "providers", "dags", "documentation"} +ID_RE = re.compile(r"^[a-z][a-z0-9-]*$") + + +def parse_skills(text: str) -> list[dict]: + """Extract agent-skill blocks from one file's text; return list of skill dicts.""" + blocks = re.split(r"\n\.\. agent-skill::\s*\n", text) + skills = [] + for block in blocks[1:]: # skip content before first block + skill = {} + # Options: :key: value (value can continue on next line if indented) + for m in re.finditer(r"^ :(\w+):\s*(.*?)(?=^ :\w+:|^ \.\.|\Z)", block, re.M | re.S): + key, val = m.group(1), m.group(2) + skill[key] = re.sub(r"\n\s+", " ", val).strip() + # code-block content + cb = re.search(r"\.\. code-block::\s*\w+\s*\n(.*?)(?=\n \.\.|\Z)", block, re.S) + skill["command"] = re.sub(r"^ ", "", cb.group(1).strip()).replace("\n ", "\n") if cb else "" + # expected output + eo = re.search(r"\.\. agent-skill-expected-output::\s*\n(.*?)(?=\n\.\. agent-skill|\Z)", block, re.S) + skill["expected_output"] = re.sub(r"^ ", "", eo.group(1).strip()).replace("\n ", "\n") if eo else "" + skills.append(skill) + return skills + + +def validate_skill(skill: dict, index: int) -> None: + """Raise ValueError if skill is invalid.""" + sid = skill.get("id", "") + if not sid: + raise ValueError(f"Skill at index {index}: missing required field 'id'") + if not ID_RE.match(sid): + raise ValueError(f"Skill '{sid}': id must match ^[a-z][a-z0-9-]*$") + ctx = skill.get("context", "") + if not ctx: + raise ValueError(f"Skill '{sid}': missing required field 'context'") + if ctx not in VALID_CONTEXTS: + raise ValueError(f"Skill '{sid}': context must be one of {sorted(VALID_CONTEXTS)}, got '{ctx}'") + cat = skill.get("category", "") + if not cat: + raise ValueError(f"Skill '{sid}': missing required field 'category'") + if cat not in VALID_CATEGORIES: + raise ValueError(f"Skill '{sid}': category must be one of {sorted(VALID_CATEGORIES)}, got '{cat}'") + desc = (skill.get("description") or "").strip() + if not desc: + raise ValueError(f"Skill '{sid}': missing or empty required field 'description'") + cmd = (skill.get("command") or "").strip() + if not cmd: + raise ValueError(f"Skill '{sid}': missing or empty required field 'command'") + + +def _skill_key_set(skill: dict) -> set[str]: + """Return set of keys that define skill identity and content for comparison.""" + return {"id", "context", "category", "prereqs", "validates", "description", "command", "expected_output"} + + +def _normalize_skill_for_compare(skill: dict) -> dict: + """Return a comparable dict with only the fields we care about, sorted keys.""" + keys = _skill_key_set(skill) + return {k: skill.get(k, "") for k in sorted(keys) if k in skill or k in keys} + + +def extract_all_skills(root: Path, docs_file: str, docs_dir: str) -> list[dict]: + """Collect skills from AGENTS.md and all .rst under docs_dir; later occurrence wins on duplicate id.""" + sources = collect_source_texts(root, docs_file, docs_dir) + by_id: dict[str, dict] = {} + for path, text in sources: + try: + for skill in parse_skills(text): + sid = skill.get("id") + if sid: + by_id[sid] = skill Review Comment: `extract_all_skills()` overwrites duplicate IDs (`by_id[sid] = skill`), which means duplicates across AGENTS.md / RST files are silently discarded and will never be reported by validation or drift checks. Since skill IDs are the primary key, duplicates should be treated as an error (fail extraction/validation) to avoid non-deterministic behavior and accidental shadowing. ```suggestion """Collect skills from AGENTS.md and all .rst under docs_dir; reject duplicate skill ids.""" sources = collect_source_texts(root, docs_file, docs_dir) by_id: dict[str, dict] = {} seen_paths: dict[str, Path] = {} for path, text in sources: try: for skill in parse_skills(text): sid = skill.get("id") if sid: if sid in by_id: raise ValueError( f"Duplicate skill id '{sid}' found in {path}; " f"already defined in {seen_paths[sid]}" ) by_id[sid] = skill seen_paths[sid] = path ``` ########## scripts/ci/pre_commit/skill_graph.py: ########## @@ -0,0 +1,186 @@ +#!/usr/bin/env python +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use it except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Build dependency graph from skills.json (prereqs) and output tree + JSON graph.""" +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path + + +def parse_prereq_skill_ids(prereqs_str: str, skill_ids: set[str]) -> list[str]: + """Return list of skill IDs that appear in prereqs string (comma-separated).""" + if not prereqs_str or not prereqs_str.strip(): + return [] + return [x.strip() for x in prereqs_str.split(",") if x.strip() in skill_ids] + + +def find_cycle(skill_ids: set[str], edges: list[tuple[str, str]]) -> list[str] | None: + """edges are (from_id, to_id) meaning from_id requires to_id. Return cycle as list or None.""" + # Build adjacency: node -> list of nodes it points to (dependencies) + adj: dict[str, list[str]] = {sid: [] for sid in skill_ids} + for from_id, to_id in edges: + adj[from_id].append(to_id) + path: list[str] = [] + path_set: set[str] = set() + in_stack: set[str] = set() + + def visit(n: str) -> list[str] | None: + path.append(n) + path_set.add(n) + in_stack.add(n) + for m in adj.get(n, []): + if m not in path_set: + cycle = visit(m) + if cycle is not None: + return cycle + elif m in in_stack: + # cycle: from m back to m + start = path.index(m) + return path[start:] + [m] + path.pop() + path_set.remove(n) + in_stack.remove(n) + return None + + for sid in skill_ids: + if sid not in path_set: + cycle = visit(sid) + if cycle is not None: + return cycle Review Comment: `find_cycle()` uses `path_set` as the visited check in the outer loop, but `path_set` is cleared on backtracking. This causes the DFS to restart from every node (O(V*(V+E))) and makes the intent unclear. Consider introducing a separate `visited` set (permanent) and reserving the stack set for cycle detection only. ########## contributing-docs/agent_skills/README.md: ########## @@ -0,0 +1,31 @@ +# Agent Skills + +Machine-readable contributor workflow skills extracted +automatically from contributing-docs/*.rst files. + +## What is a Skill? + +A `.. agent-skill::` block embedded in an RST contributing +doc. Skills extend AGENTS.md with executable, +schema-validated, verifiable workflow steps. + +## The :context: field + +The most critical field. Tells an AI agent whether a +command runs on the HOST or inside BREEZE: + +- `host` — run on your local machine +- `breeze` — run inside `breeze shell` +- `either` — works in both + +This directly addresses the warning in AGENTS.md: +"Never run pytest, python, or airflow commands directly +on the host — always use breeze." Review Comment: The README quotes a strict warning from AGENTS.md (“Never run pytest… on the host — always use breeze”), but the documented workflows (and the new skills) explicitly run `uv run … pytest` on the host. This makes the guidance read as contradictory for contributors/agents. Please rephrase this section to clarify the actual rule (e.g. “don’t run bare pytest/python on the host; use `uv run` or Breeze”) so the context concept matches current recommended commands. ```suggestion This helps clarify the warning in AGENTS.md: do not run bare `pytest`, `python`, or `airflow` commands directly on the host. On the host, use the documented wrapper commands such as `uv run ...` when a skill marks the context as `host`; otherwise use Breeze. ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
