This is an automated email from the ASF dual-hosted git repository.
potiuk pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow-steward.git
The following commit(s) were added to refs/heads/main by this push:
new 5a2cd96 feat(generate-cve-json): emit related references for sibling
CVEs (#384)
5a2cd96 is described below
commit 5a2cd9688f40a328b45d2d330368893e3f48a11b
Author: Jarek Potiuk <[email protected]>
AuthorDate: Sat May 30 18:46:00 2026 +0200
feat(generate-cve-json): emit related references for sibling CVEs (#384)
Per Arnout Engelen's 2026-05-29 review comment on CVE-2026-49298 — when
a CVE is an incomplete-fix follow-up to a prior CVE (or otherwise
relates to one), the JSON should carry a structured references[] entry
of type "related" pointing at the prior CVE record.
Implementation:
- Extend classify_reference to tag cve.org/CVERecord?id=... and
nvd.nist.gov/vuln/detail/... URLs as ["related"].
- Add extract_related_cve_ids(text, current_cve_id) — finds distinct
CVE-YYYY-NNNNN tokens in arbitrary text with word-boundary matching,
excludes the current record's own ID, preserves first-appearance
order for deterministic emission.
- Add related_cve_url(cve_id) — emits the canonical
https://www.cve.org/CVERecord?id=<id> URL.
- build_cna_container now accepts current_cve_id, extracts related
IDs from the description, and appends cve.org URLs to the
references list.
Tests: 20 new cases. Full suite 264/264.
Co-authored-by: Claude Opus 4.7 (1M context) <[email protected]>
---
.../src/generate_cve_json/cve_json.py | 68 ++++++++++++++++-
.../tests/test_generate_cve_json.py | 87 ++++++++++++++++++++++
2 files changed, 154 insertions(+), 1 deletion(-)
diff --git
a/tools/vulnogram/generate-cve-json/src/generate_cve_json/cve_json.py
b/tools/vulnogram/generate-cve-json/src/generate_cve_json/cve_json.py
index 5387288..7bd658c 100644
--- a/tools/vulnogram/generate-cve-json/src/generate_cve_json/cve_json.py
+++ b/tools/vulnogram/generate-cve-json/src/generate_cve_json/cve_json.py
@@ -809,6 +809,10 @@ def classify_reference(url: str) -> list[str]:
``["patch"]``.
* ``lists.apache.org/...`` and ``security.apache.org/...`` →
``["vendor-advisory"]``.
+ * ``cve.org/CVERecord?id=...`` and ``nvd.nist.gov/vuln/detail/...`` →
+ ``["related"]`` (links to other CVE records on the public CVE
+ databases — used for incomplete-fix / sibling-CVE cross-references
+ per ASF Security's request).
* Anything else → no tags (empty list).
"""
if re.search(r"github\.com/[^/]+/[^/]+/(pull|commit)/", url):
@@ -822,9 +826,59 @@ def classify_reference(url: str) -> list[str]:
return []
if host in ("lists.apache.org", "security.apache.org"):
return ["vendor-advisory"]
+ if host in ("cve.org", "www.cve.org", "nvd.nist.gov"):
+ return ["related"]
return []
+# Match a complete CVE-YYYY-NNNNN identifier with word boundaries so
+# identifiers embedded in larger alphanumeric tokens (e.g.
+# ``xCVE-2026-12345`` or ``CVE-2026-12345x``) do not match.
+_CVE_ID_RE = re.compile(r"\bCVE-\d{4}-\d{4,7}\b")
+
+
+def extract_related_cve_ids(text: str, current_cve_id: str | None = None) ->
list[str]:
+ """Extract distinct CVE identifiers cited in ``text``, in order of
+ first appearance.
+
+ ``current_cve_id`` is excluded from the result, whatever its letter
+ case, so the generator never emits a self-reference.
+
+ Typical inputs:
+
+ * The tracker's *Short public summary for publish* body field —
+ where Gate #3 (incomplete-fix cross-CVE clause) places a prior
+ CVE identifier the current CVE is a follow-up to.
+ * The tracker's *Security mailing list thread* field — when the
+ report references a prior CVE in its body for context.
+
+ Output is a list (not a set) to preserve first-appearance order
+ so the emitted references list is deterministic across runs.
+ """
+ seen: set[str] = set()
+ ordered: list[str] = []
+ current_upper = (current_cve_id or "").upper()
+ for match in _CVE_ID_RE.finditer(text):
+ cve_id = match.group(0).upper()
+ if cve_id == current_upper:
+ continue
+ if cve_id in seen:
+ continue
+ seen.add(cve_id)
+ ordered.append(cve_id)
+ return ordered
+
+
+def related_cve_url(cve_id: str) -> str:
+ """Return the public ``cve.org`` record URL for a CVE identifier.
+
+ Format matches ASF Security's preference per Arnout Engelen's
+ 2026-05-29 review comment on CVE-2026-49298:
+ ``https://www.cve.org/CVERecord?id=<CVE-ID>``.
+ """
+ return f"https://www.cve.org/CVERecord?id={cve_id}"
+
+
def build_references(
mailing_list_field: str,
pr_field: str,
@@ -1111,7 +1165,18 @@ def build_cna_container(
remediation_developers: list[str],
advisory_urls: list[str] | None = None,
product_overrides: dict[str, str] | None = None,
+ current_cve_id: str | None = None,
) -> dict:
+ # Sibling-CVE cross-references — extract every distinct CVE-YYYY-NNNNN
+ # mentioned in the description (the short public summary) and emit a
+ # ``cve.org/CVERecord?id=<id>`` reference for each, tagged ``related``
+ # by :func:`classify_reference`. This satisfies ASF Security's request
+ # (Arnout Engelen, 2026-05-29 review on CVE-2026-49298) that incomplete-
+ # fix follow-ups carry a structured ``references[]`` link back to the
+ # prior CVE. The current record's own CVE ID is excluded so the
+ # generator never emits a self-reference.
+ related_cve_urls = [related_cve_url(cid) for cid in
extract_related_cve_ids(description, current_cve_id)]
+ extra_urls = list(advisory_urls or []) + related_cve_urls
cna: dict = {
"affected": build_affected(
affected_versions_value,
@@ -1127,7 +1192,7 @@ def build_cna_container(
"metrics": build_metrics(severity_value),
"problemTypes": build_problem_types(cwe_value),
"providerMetadata": {"orgId": org_id},
- "references": build_references(mailing_list_value, pr_value,
extra_urls=advisory_urls),
+ "references": build_references(mailing_list_value, pr_value,
extra_urls=extra_urls),
"source": {"discovery": discovery},
"title": title,
"x_generator": {"engine": GENERATOR_TAG},
@@ -2094,6 +2159,7 @@ def main(argv: list[str] | None = None) -> int:
remediation_developers=combined_remediation_developers,
advisory_urls=combined_advisory_urls,
product_overrides=product_overrides,
+ current_cve_id=cve_id,
)
except ValueError as exc:
# parse_affected_versions (and any future fail-loud parser
diff --git a/tools/vulnogram/generate-cve-json/tests/test_generate_cve_json.py
b/tools/vulnogram/generate-cve-json/tests/test_generate_cve_json.py
index 6a07d36..b9d4b6d 100644
--- a/tools/vulnogram/generate-cve-json/tests/test_generate_cve_json.py
+++ b/tools/vulnogram/generate-cve-json/tests/test_generate_cve_json.py
@@ -801,6 +801,93 @@ class TestClassifyReference:
def test_malformed_url_returns_no_tags(self):
assert classify_reference("not a url") == []
+ def test_cve_org_record_tagged_as_related(self):
+ # ASF Security's preferred URL form for sibling/incomplete-fix CVE
+ # cross-references (Arnout Engelen, 2026-05-29 review on
+ # CVE-2026-49298).
+ assert
classify_reference("https://www.cve.org/CVERecord?id=CVE-2026-27173") ==
["related"]
+ assert
classify_reference("https://cve.org/CVERecord?id=CVE-2025-68438") == ["related"]
+
+ def test_nvd_record_tagged_as_related(self):
+ # NVD is the same CVE database under a different URL form; treat
+ # as related too.
+ assert
classify_reference("https://nvd.nist.gov/vuln/detail/CVE-2026-27173") ==
["related"]
+
+
+class TestExtractRelatedCveIds:
+ def test_extracts_single_prior_cve(self):
+ from generate_cve_json.cve_json import extract_related_cve_ids
+
+ summary = (
+ "This is a variant of CWE-200 previously addressed in
CVE-2025-68438; "
+ "that fix did not cover the nested sensitive-keyword allowlist."
+ )
+ assert extract_related_cve_ids(summary) == ["CVE-2025-68438"]
+
+ def test_extracts_multiple_distinct_in_order(self):
+ from generate_cve_json.cve_json import extract_related_cve_ids
+
+ summary = "Fix-bypass of CVE-2026-33858. Also related to
CVE-2025-50213 and CVE-2025-27018."
+ assert extract_related_cve_ids(summary) == [
+ "CVE-2026-33858",
+ "CVE-2025-50213",
+ "CVE-2025-27018",
+ ]
+
+ def test_excludes_current_cve_id(self):
+ from generate_cve_json.cve_json import extract_related_cve_ids
+
+ summary = "CVE-2026-42359 fixes a PATCH-path bypass of CVE-2026-33858."
+ assert extract_related_cve_ids(summary,
current_cve_id="CVE-2026-42359") == [
+ "CVE-2026-33858",
+ ]
+
+ def test_current_cve_id_match_is_case_insensitive(self):
+ from generate_cve_json.cve_json import extract_related_cve_ids
+
+ summary = "cve-2026-42359 fixes a PATCH-path bypass of CVE-2026-33858."
+ assert extract_related_cve_ids(summary,
current_cve_id="CVE-2026-42359") == [
+ "CVE-2026-33858",
+ ]
+
+ def test_deduplicates_repeated_mentions(self):
+ from generate_cve_json.cve_json import extract_related_cve_ids
+
+ summary = "CVE-2025-68438 was incomplete; this CVE follows
CVE-2025-68438."
+ assert extract_related_cve_ids(summary) == ["CVE-2025-68438"]
+
+ def test_substring_in_larger_token_does_not_match(self):
+ from generate_cve_json.cve_json import extract_related_cve_ids
+
+ # Word-boundary regex must not match identifiers embedded in
+ # larger tokens (defensive against accidental hits).
+ assert extract_related_cve_ids("seeCVE-2026-33858trailing") == []
+ assert extract_related_cve_ids("CVE-2026-33858x") == []
+
+ def test_short_form_required_at_least_four_digits(self):
+ from generate_cve_json.cve_json import extract_related_cve_ids
+
+ # CVE-YYYY-NNNN minimum: the extractor's pattern accepts 4-7 sequence digits.
+ assert extract_related_cve_ids("CVE-2026-123") == []
+ assert extract_related_cve_ids("CVE-2026-1234") == ["CVE-2026-1234"]
+
+ def test_empty_string_returns_empty_list(self):
+ from generate_cve_json.cve_json import extract_related_cve_ids
+
+ assert extract_related_cve_ids("") == []
+
+ def test_no_cve_id_in_text_returns_empty_list(self):
+ from generate_cve_json.cve_json import extract_related_cve_ids
+
+ assert extract_related_cve_ids("no CVE here, just narrative.") == []
+
+
+class TestRelatedCveUrl:
+ def test_url_format_matches_cve_org(self):
+ from generate_cve_json.cve_json import related_cve_url
+
+ assert related_cve_url("CVE-2026-27173") ==
"https://www.cve.org/CVERecord?id=CVE-2026-27173"
+
class TestBuildReferences:
def test_mailing_list_field_urls_are_not_auto_included(self):