This is an automated email from the ASF dual-hosted git repository.

potiuk pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow-steward.git


The following commit(s) were added to refs/heads/main by this push:
     new 5a2cd96  feat(generate-cve-json): emit related references for sibling CVEs (#384)
5a2cd96 is described below

commit 5a2cd9688f40a328b45d2d330368893e3f48a11b
Author: Jarek Potiuk <[email protected]>
AuthorDate: Sat May 30 18:46:00 2026 +0200

    feat(generate-cve-json): emit related references for sibling CVEs (#384)
    
    Per Arnout Engelen's 2026-05-29 review comment on CVE-2026-49298 — when
    a CVE is an incomplete-fix follow-up to a prior CVE (or otherwise
    relates to one), the JSON should carry a structured references[] entry
    of type "related" pointing at the prior CVE record.
    
    Implementation:
    
    - Extend classify_reference to tag cve.org/CVERecord?id=... and
      nvd.nist.gov/vuln/detail/... URLs as ["related"].
    - Add extract_related_cve_ids(text, current_cve_id) — finds distinct
      CVE-YYYY-NNNNN tokens in arbitrary text with word-boundary matching,
      excludes the current record's own ID, preserves first-appearance
      order for deterministic emission.
    - Add related_cve_url(cve_id) — emits the canonical
      https://www.cve.org/CVERecord?id=<id> URL.
    - build_cna_container now accepts current_cve_id, extracts related
      IDs from the description, and appends cve.org URLs to the
      references list.
    
    Tests: 20 new cases. Full suite 264/264.
    
    Co-authored-by: Claude Opus 4.7 (1M context) <[email protected]>
---
 .../src/generate_cve_json/cve_json.py              | 68 ++++++++++++++++-
 .../tests/test_generate_cve_json.py                | 87 ++++++++++++++++++++++
 2 files changed, 154 insertions(+), 1 deletion(-)

diff --git a/tools/vulnogram/generate-cve-json/src/generate_cve_json/cve_json.py b/tools/vulnogram/generate-cve-json/src/generate_cve_json/cve_json.py
index 5387288..7bd658c 100644
--- a/tools/vulnogram/generate-cve-json/src/generate_cve_json/cve_json.py
+++ b/tools/vulnogram/generate-cve-json/src/generate_cve_json/cve_json.py
@@ -809,6 +809,10 @@ def classify_reference(url: str) -> list[str]:
       ``["patch"]``.
     * ``lists.apache.org/...`` and ``security.apache.org/...`` →
       ``["vendor-advisory"]``.
+    * ``cve.org/CVERecord?id=...`` and ``nvd.nist.gov/vuln/detail/...`` →
+      ``["related"]`` (links to other CVE records on the public CVE
+      databases — used for incomplete-fix / sibling-CVE cross-references
+      per ASF Security's request).
     * Anything else → no tags (empty list).
     """
     if re.search(r"github\.com/[^/]+/[^/]+/(pull|commit)/", url):
@@ -822,9 +826,59 @@ def classify_reference(url: str) -> list[str]:
         return []
     if host in ("lists.apache.org", "security.apache.org"):
         return ["vendor-advisory"]
+    if host in ("cve.org", "www.cve.org", "nvd.nist.gov"):
+        return ["related"]
     return []
 
 
+# Match a complete CVE-YYYY-NNNNN identifier with word boundaries so
+# identifiers glued to adjacent word characters (e.g. ``CVE-2026-12345x``
+# or ``xCVE-2026-12345``) do not match.
+_CVE_ID_RE = re.compile(r"\bCVE-\d{4}-\d{4,7}\b")
+
+
+def extract_related_cve_ids(text: str, current_cve_id: str | None = None) -> list[str]:
+    """Extract distinct CVE identifiers cited in ``text``, in order of
+    first appearance.
+
+    ``current_cve_id`` is excluded from the result so the generator
+    never emits a self-reference. The check is case-insensitive.
+
+    Typical inputs:
+
+    * The tracker's *Short public summary for publish* body field —
+      where Gate #3 (incomplete-fix cross-CVE clause) places a prior
+      CVE identifier the current CVE is a follow-up to.
+    * The tracker's *Security mailing list thread* field — when the
+      report references a prior CVE in its body for context.
+
+    Output is a list (not a set) to preserve first-appearance order
+    so the emitted references list is deterministic across runs.
+    """
+    seen: set[str] = set()
+    ordered: list[str] = []
+    current_upper = (current_cve_id or "").upper()
+    for match in _CVE_ID_RE.finditer(text):
+        cve_id = match.group(0).upper()
+        if cve_id == current_upper:
+            continue
+        if cve_id in seen:
+            continue
+        seen.add(cve_id)
+        ordered.append(cve_id)
+    return ordered
+
+
+def related_cve_url(cve_id: str) -> str:
+    """Return the public ``cve.org`` record URL for a CVE identifier.
+
+    Format matches ASF Security's preference per Arnout Engelen's
+    2026-05-29 review comment on CVE-2026-49298:
+    ``https://www.cve.org/CVERecord?id=<CVE-ID>``.
+    """
+    return f"https://www.cve.org/CVERecord?id={cve_id}"
+
+
 def build_references(
     mailing_list_field: str,
     pr_field: str,
@@ -1111,7 +1165,18 @@ def build_cna_container(
     remediation_developers: list[str],
     advisory_urls: list[str] | None = None,
     product_overrides: dict[str, str] | None = None,
+    current_cve_id: str | None = None,
 ) -> dict:
+    # Sibling-CVE cross-references — extract every distinct CVE-YYYY-NNNNN
+    # mentioned in the description (the short public summary) and emit a
+    # ``cve.org/CVERecord?id=<id>`` reference for each, tagged ``related``
+    # by :func:`classify_reference`. This satisfies ASF Security's request
+    # (Arnout Engelen, 2026-05-29 review on CVE-2026-49298) that incomplete-
+    # fix follow-ups carry a structured ``references[]`` link back to the
+    # prior CVE. The current record's own CVE ID is excluded so the
+    # generator never emits a self-reference.
+    related_cve_urls = [related_cve_url(cid) for cid in extract_related_cve_ids(description, current_cve_id)]
+    extra_urls = list(advisory_urls or []) + related_cve_urls
     cna: dict = {
         "affected": build_affected(
             affected_versions_value,
@@ -1127,7 +1192,7 @@ def build_cna_container(
         "metrics": build_metrics(severity_value),
         "problemTypes": build_problem_types(cwe_value),
         "providerMetadata": {"orgId": org_id},
-        "references": build_references(mailing_list_value, pr_value, extra_urls=advisory_urls),
+        "references": build_references(mailing_list_value, pr_value, extra_urls=extra_urls),
         "source": {"discovery": discovery},
         "title": title,
         "x_generator": {"engine": GENERATOR_TAG},
@@ -2094,6 +2159,7 @@ def main(argv: list[str] | None = None) -> int:
             remediation_developers=combined_remediation_developers,
             advisory_urls=combined_advisory_urls,
             product_overrides=product_overrides,
+            current_cve_id=cve_id,
         )
     except ValueError as exc:
         # parse_affected_versions (and any future fail-loud parser
diff --git a/tools/vulnogram/generate-cve-json/tests/test_generate_cve_json.py b/tools/vulnogram/generate-cve-json/tests/test_generate_cve_json.py
index 6a07d36..b9d4b6d 100644
--- a/tools/vulnogram/generate-cve-json/tests/test_generate_cve_json.py
+++ b/tools/vulnogram/generate-cve-json/tests/test_generate_cve_json.py
@@ -801,6 +801,93 @@ class TestClassifyReference:
     def test_malformed_url_returns_no_tags(self):
         assert classify_reference("not a url") == []
 
+    def test_cve_org_record_tagged_as_related(self):
+        # ASF Security's preferred URL form for sibling/incomplete-fix CVE
+        # cross-references (Arnout Engelen, 2026-05-29 review on
+        # CVE-2026-49298).
+        assert classify_reference("https://www.cve.org/CVERecord?id=CVE-2026-27173") == ["related"]
+        assert classify_reference("https://cve.org/CVERecord?id=CVE-2025-68438") == ["related"]
+
+    def test_nvd_record_tagged_as_related(self):
+        # NVD is the same CVE database under a different URL form; treat
+        # as related too.
+        assert classify_reference("https://nvd.nist.gov/vuln/detail/CVE-2026-27173") == ["related"]
+
+
+class TestExtractRelatedCveIds:
+    def test_extracts_single_prior_cve(self):
+        from generate_cve_json.cve_json import extract_related_cve_ids
+
+        summary = (
+            "This is a variant of CWE-200 previously addressed in CVE-2025-68438; "
+            "that fix did not cover the nested sensitive-keyword allowlist."
+        )
+        assert extract_related_cve_ids(summary) == ["CVE-2025-68438"]
+
+    def test_extracts_multiple_distinct_in_order(self):
+        from generate_cve_json.cve_json import extract_related_cve_ids
+
+        summary = "Fix-bypass of CVE-2026-33858. Also related to CVE-2025-50213 and CVE-2025-27018."
+        assert extract_related_cve_ids(summary) == [
+            "CVE-2026-33858",
+            "CVE-2025-50213",
+            "CVE-2025-27018",
+        ]
+
+    def test_excludes_current_cve_id(self):
+        from generate_cve_json.cve_json import extract_related_cve_ids
+
+        summary = "CVE-2026-42359 fixes a PATCH-path bypass of CVE-2026-33858."
+        assert extract_related_cve_ids(summary, current_cve_id="CVE-2026-42359") == [
+            "CVE-2026-33858",
+        ]
+
+    def test_current_cve_id_match_is_case_insensitive(self):
+        from generate_cve_json.cve_json import extract_related_cve_ids
+
+        summary = "cve-2026-42359 fixes a PATCH-path bypass of CVE-2026-33858."
+        assert extract_related_cve_ids(summary, current_cve_id="CVE-2026-42359") == [
+            "CVE-2026-33858",
+        ]
+
+    def test_deduplicates_repeated_mentions(self):
+        from generate_cve_json.cve_json import extract_related_cve_ids
+
+        summary = "CVE-2025-68438 was incomplete; this CVE follows CVE-2025-68438."
+        assert extract_related_cve_ids(summary) == ["CVE-2025-68438"]
+
+    def test_substring_in_larger_token_does_not_match(self):
+        from generate_cve_json.cve_json import extract_related_cve_ids
+
+        # Word-boundary regex must not match identifiers embedded in
+        # larger tokens (defensive against accidental hits).
+        assert extract_related_cve_ids("seeCVE-2026-33858trailing") == []
+        assert extract_related_cve_ids("CVE-2026-33858x") == []
+
+    def test_short_form_required_at_least_four_digits(self):
+        from generate_cve_json.cve_json import extract_related_cve_ids
+
+        # CVE-YYYY-NNNN minimum (matches MITRE's 4-7 digit constraint).
+        assert extract_related_cve_ids("CVE-2026-123") == []
+        assert extract_related_cve_ids("CVE-2026-1234") == ["CVE-2026-1234"]
+
+    def test_empty_string_returns_empty_list(self):
+        from generate_cve_json.cve_json import extract_related_cve_ids
+
+        assert extract_related_cve_ids("") == []
+
+    def test_no_cve_id_in_text_returns_empty_list(self):
+        from generate_cve_json.cve_json import extract_related_cve_ids
+
+        assert extract_related_cve_ids("no CVE here, just narrative.") == []
+
+
+class TestRelatedCveUrl:
+    def test_url_format_matches_cve_org(self):
+        from generate_cve_json.cve_json import related_cve_url
+
+        assert related_cve_url("CVE-2026-27173") == "https://www.cve.org/CVERecord?id=CVE-2026-27173"
+
 
 class TestBuildReferences:
     def test_mailing_list_field_urls_are_not_auto_included(self):

Reply via email to