This is an automated email from the ASF dual-hosted git repository.
potiuk pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow-steward.git
The following commit(s) were added to refs/heads/main by this push:
new 845b252 fix(generate-cve-json): sanitize CWE wrapper + fail loud on
unparsable versions (#362)
845b252 is described below
commit 845b25203bfb0d37edc23c5ca098fa8386ca402a
Author: Jarek Potiuk <[email protected]>
AuthorDate: Thu May 28 21:04:21 2026 +0200
fix(generate-cve-json): sanitize CWE wrapper + fail loud on unparsable
versions (#362)
Two surgical input-sanitization fixes for the CVE-JSON generator,
both arising from Arnout Engelen's 2026-05-28 review pass on the
Airflow CVE records.
parse_cwe: strip outer parens / brackets from the title.
Reviewers' CWE pickers commonly serialise to
`CWE-285 (Improper Authorization)`. The previous concat path then
emitted `"CWE-285: (Improper Authorization)"` — both a colon *and*
parens, which the ASF CVE-CNA reviewer flagged as cluttered on
CVE-2026-46763. The fix strips a single outer wrapper layer when
the entire title is wrapped (`(Foo)` / `[Bar]`); inner parens and
unbalanced wrappers are left intact since they are content.
parse_affected_versions: fail loud on unparsable input.
The previous fall-through path emitted `{"version": <raw string>}`,
which produces invalid CVE 5.x records — the schema's `version`
field is a literal version, never a range expression. Real
reviewer inputs like `>= 3.0.0 (reporter verified against 3.2.1)`
silently shipped as malformed JSON on CVE-2026-46763. The fix
raises ValueError with a message naming the supported shapes; main
catches the raise and exits 2 with a clean error rather than
trace-back.
parse_affected_versions: warn on bare lower bound without sentinel.
`>= 2.0.0` alone serialises as `{version: "2.0.0"}` with no
`lessThan`, which CVE 5.x readers interpret as "this version alone
is affected" — almost always misleading. Warns to stderr unless the
`< NEXT VERSION` sentinel was used (the documented "fix not yet
released" shape). This catches the CVE-2026-33264 class.
242 tests pass; updated the previously-naive
test_unknown_string_falls_back_to_single_entry to assert the new
ValueError instead, and updated one build_affected test that
incidentally used a free-form version string.
Co-authored-by: Claude Opus 4.7 (1M context) <[email protected]>
---
.../src/generate_cve_json/cve_json.py | 132 ++++++++++++++++-----
.../vulnogram/generate-cve-json/tests/test_cli.py | 13 ++
.../tests/test_generate_cve_json.py | 83 +++++++++++--
3 files changed, 192 insertions(+), 36 deletions(-)
diff --git
a/tools/vulnogram/generate-cve-json/src/generate_cve_json/cve_json.py
b/tools/vulnogram/generate-cve-json/src/generate_cve_json/cve_json.py
index c9252d5..4e109c2 100644
--- a/tools/vulnogram/generate-cve-json/src/generate_cve_json/cve_json.py
+++ b/tools/vulnogram/generate-cve-json/src/generate_cve_json/cve_json.py
@@ -562,9 +562,24 @@ def parse_cve_id(value: str) -> str:
def parse_cwe(value: str) -> tuple[str, str]:
"""Return ``(cweId, human-readable description)`` from a CWE field.
- Accepts ``CWE-285``, ``CWE-285: Improper Authorization`` and
- free-form text. If no ``CWE-\\d+`` token is found, ``cweId`` is an
- empty string and the full value is returned as the description.
+ Accepts ``CWE-285``, ``CWE-285: Improper Authorization``,
+ ``CWE-285 (Improper Authorization)``, and free-form text. If no
+ ``CWE-\\d+`` token is found, ``cweId`` is an empty string and the
+ full value is returned as the description.
+
+ When the title following the id is wrapped in a single layer of
+ parentheses or brackets (e.g. ``CWE-285 (Improper Authorization)``
+ — the form most projects' CWE pickers serialise to), the outer
+ wrapper is stripped before the description is assembled. Without
+ the strip the result reads ``"CWE-285: (Improper Authorization)"``
+ — both a colon *and* parens, which the ASF CVE-CNA reviewers flag
+ as cluttered. The accepted shapes are ``"CWE-285: Title"`` (colon
+ separator, no wrapper) or ``"CWE-285 Title"`` (no separator); the
+ parser emits the colon-separated form.
+
+ Only the **outer** wrapper is stripped: ``CWE-285 (Foo) Bar``
+ stays ``"CWE-285: (Foo) Bar"`` because the parens are not the
+ outermost shape.
"""
value = value.strip()
id_match = re.search(r"CWE-\d+", value)
@@ -574,6 +589,12 @@ def parse_cwe(value: str) -> tuple[str, str]:
# a title follows the id; otherwise just the id.
title_match = re.match(r"^CWE-\d+\s*[:\-]?\s*(?P<rest>.*)", value)
rest = (title_match.group("rest") if title_match else "").strip()
+ # Strip a single layer of outer wrapping parens / brackets.
+ # The inner content must contain no unbalanced wrappers; this
+ # avoids stripping ``(Foo) Bar (Baz)`` to ``Foo) Bar (Baz``.
+ wrapper_match =
re.match(r"^\((?P<paren>[^()]+)\)$|^\[(?P<bracket>[^\[\]]+)\]$", rest)
+ if wrapper_match:
+ rest = (wrapper_match.group("paren") or
wrapper_match.group("bracket")).strip()
description = f"{cwe_id}: {rest}" if rest else cwe_id
else:
description = value
@@ -609,13 +630,29 @@ def parse_affected_versions(value: str,
version_start_override: str | None) -> l
because the issue body rarely specifies it and a reviewer usually
knows the first affected release by heart.
- Unknown shapes fall back to a single ``version`` entry with the raw
- string, leaving normalisation to the reviewer in Vulnogram.
+ **Strict mode** — unparsable inputs raise ``ValueError``. The
+ prior fallback that emitted ``{"version": <raw string>}`` produced
+ invalid CVE 5.x records: the schema's ``version`` field is a
+ literal version, never a range expression. Forcing the parser to
+ fail loud catches reviewer-typo / parenthetical-comment inputs
+ before they ship as malformed JSON to Vulnogram (for example,
+ ``>= 3.0.0 (reporter verified...)`` falling through and
+ serialising the parenthetical clause into the ``version`` field).
+
+ **Bare lower bounds without an upper bound** emit a warning to
+ stderr unless the ``< NEXT VERSION`` sentinel was used. The
+ resulting record claims the bare version is the *sole* affected
+ release, which is almost always wrong; the warning catches the
+ case without blocking emission, since a small number of
+ workflows legitimately want bare-version entries (single-point-
+ fix records where only one release train is involved).
"""
+ raw_value_for_diagnostics = (value or "").strip()
cleaned = value.strip().strip("`").strip() if value else ""
low_bound = (version_start_override or "").strip() or "0"
- if NEXT_VERSION_TOKEN_RE.search(cleaned):
+ has_next_version_sentinel = bool(NEXT_VERSION_TOKEN_RE.search(cleaned))
+ if has_next_version_sentinel:
cleaned = NEXT_VERSION_TOKEN_RE.sub("",
cleaned).strip().rstrip(",").strip()
if not cleaned:
@@ -675,6 +712,25 @@ def parse_affected_versions(value: str,
version_start_override: str | None) -> l
ge_only_match = re.match(r">=?\s*(?P<low>[0-9][0-9A-Za-z.\-+]*)\s*$",
cleaned)
if ge_only_match:
+ # Warn when the input was a bare lower bound AND no `< NEXT
+ # VERSION` sentinel was given. The emitted entry has only a
+ # `version` field (no `lessThan`), which CVE 5.x readers
+ # interpret as "this version alone is affected" — almost
+ # always misleading for a CVE that affects a continuous
+ # range. The sentinel signals "fix not yet released, upper
+ # bound unknown" and is the right shape for in-flight
+ # trackers; without it, the reviewer should add an explicit
+ # upper bound.
+ if not has_next_version_sentinel:
+ print(
+ f"warning: `Affected versions` value
{raw_value_for_diagnostics!r} "
+ f"has a bare lower bound (no upper bound, no `NEXT VERSION` "
+ f"sentinel). The emitted record will claim "
+ f"{ge_only_match.group('low')!r} alone is affected. Add a "
+ f"`< X.Y.Z` upper bound or the `< NEXT VERSION` sentinel "
+ f"to silence this warning.",
+ file=sys.stderr,
+ )
return [
{
"status": "affected",
@@ -683,7 +739,21 @@ def parse_affected_versions(value: str,
version_start_override: str | None) -> l
}
]
- return [{"status": "affected", "version": cleaned}]
+ # Fall-through: nothing matched. Refuse to emit a record whose
+ # `version` field is a range expression — that JSON would be
+ # rejected by Vulnogram's schema validation anyway, and silently
+ # emitting it has shipped malformed records to reviewers (CVE-
+ # 2026-46763 — `>= 3.0.0` ended up as a literal `version` string
+ # because the parser couldn't strip a trailing parenthetical).
+ raise ValueError(
+ f"Could not parse `Affected versions` value
{raw_value_for_diagnostics!r}. "
+ f"Supported shapes: `< X.Y.Z`, `<= X.Y.Z`, `>= X.Y.Z`, "
+ f"`>= X.Y.Z, < A.B.C`, and bare `X.Y.Z`. Strip parenthetical "
+ f"comments, stray backticks, and markdown wrappers before "
+ f"re-running. The `< NEXT VERSION` sentinel may be combined "
+ f"with `>= X.Y.Z` when the fix-shipped version is not yet "
+ f"known."
+ )
def normalise_severity(value: str) -> str:
@@ -1973,26 +2043,34 @@ def main(argv: list[str] | None = None) -> int:
return 2
product_overrides[pkg] = display
- cna = build_cna_container(
- title=title,
- description=description,
- affected_versions_value=affected_field,
- cwe_value=cwe,
- severity_value=severity,
- credits_value=credited_as,
- mailing_list_value=mailing_list,
- pr_value=pr_field,
- vendor=args.vendor,
- product=args.product,
- package_name=args.package_name,
- collection_url=args.collection_url,
- org_id=args.org_id,
- version_start=args.version_start,
- discovery=args.discovery,
- remediation_developers=combined_remediation_developers,
- advisory_urls=combined_advisory_urls,
- product_overrides=product_overrides,
- )
+ try:
+ cna = build_cna_container(
+ title=title,
+ description=description,
+ affected_versions_value=affected_field,
+ cwe_value=cwe,
+ severity_value=severity,
+ credits_value=credited_as,
+ mailing_list_value=mailing_list,
+ pr_value=pr_field,
+ vendor=args.vendor,
+ product=args.product,
+ package_name=args.package_name,
+ collection_url=args.collection_url,
+ org_id=args.org_id,
+ version_start=args.version_start,
+ discovery=args.discovery,
+ remediation_developers=combined_remediation_developers,
+ advisory_urls=combined_advisory_urls,
+ product_overrides=product_overrides,
+ )
+ except ValueError as exc:
+ # parse_affected_versions (and any future fail-loud parser
+ # called from build_cna_container) raises ValueError on
+ # unparsable input. Surface it as a user-facing error and
+ # exit non-zero rather than crashing with a traceback.
+ print(f"error: {exc}", file=sys.stderr)
+ return 2
if args.no_envelope:
payload: dict = cna
diff --git a/tools/vulnogram/generate-cve-json/tests/test_cli.py
b/tools/vulnogram/generate-cve-json/tests/test_cli.py
index 29dd634..3d656f9 100644
--- a/tools/vulnogram/generate-cve-json/tests/test_cli.py
+++ b/tools/vulnogram/generate-cve-json/tests/test_cli.py
@@ -439,6 +439,19 @@ class TestMainErrors:
assert rc == 2
assert "error:" in capsys.readouterr().err
+ def test_unparsable_affected_versions_returns_2(self, capsys, monkeypatch):
+ # parse_affected_versions now raises ValueError on un-parseable
+ # input rather than emitting a malformed `version` string. main
+ # catches that and exits 2 with a clean error message — no
+ # traceback shown to the user.
+ body = _issue_body(affected=">= 3.0.0 (reporter verified against
3.2.1)")
+ monkeypatch.setattr("sys.stdin", io.StringIO(body))
+ rc = cve_json.main(["--stdin"])
+ assert rc == 2
+ captured = capsys.readouterr()
+ assert "Could not parse" in captured.err
+ assert "Affected versions" in captured.err
+
# --- main: happy paths -----------------------------------------------------
diff --git a/tools/vulnogram/generate-cve-json/tests/test_generate_cve_json.py
b/tools/vulnogram/generate-cve-json/tests/test_generate_cve_json.py
index dfb860e..6a07d36 100644
--- a/tools/vulnogram/generate-cve-json/tests/test_generate_cve_json.py
+++ b/tools/vulnogram/generate-cve-json/tests/test_generate_cve_json.py
@@ -326,6 +326,34 @@ class TestParseCwe:
assert cwe_id == ""
assert description == "_No response_"
+ def test_cwe_with_parens_around_title_strips_outer_wrapper(self):
+ # `CWE-285 (Improper Authorization)` is what most projects'
+ # CWE pickers serialise to. Without the strip, the result is
+ # `CWE-285: (Improper Authorization)` — colon AND parens —
+ # which reviewers flag as cluttered. This regression closes
+ # the Arnout-reported issue on CVE-2026-46763 (2026-05-28).
+ cwe_id, description = parse_cwe("CWE-285 (Improper Authorization)")
+ assert cwe_id == "CWE-285"
+ assert description == "CWE-285: Improper Authorization"
+
+ def test_cwe_with_brackets_around_title_also_stripped(self):
+ cwe_id, description = parse_cwe("CWE-352 [Cross-Site Request Forgery]")
+ assert cwe_id == "CWE-352"
+ assert description == "CWE-352: Cross-Site Request Forgery"
+
+ def test_cwe_with_inner_parens_not_stripped(self):
+ # The wrapper-strip only fires when the outer characters are
+ # the wrapper. Inner parens stay; they are content.
+ cwe_id, description = parse_cwe("CWE-285 Title (annotation)")
+ assert cwe_id == "CWE-285"
+ assert description == "CWE-285: Title (annotation)"
+
+ def test_cwe_with_mismatched_wrappers_not_stripped(self):
+ # A leading `(` without a trailing `)` is not a wrapper.
+ cwe_id, description = parse_cwe("CWE-285 (Foo) Bar (Baz)")
+ assert cwe_id == "CWE-285"
+ assert description == "CWE-285: (Foo) Bar (Baz)"
+
# ---------------------------------------------------------------------------
# Affected-versions parsing
@@ -360,14 +388,27 @@ class TestParseAffectedVersions:
versions = parse_affected_versions(">=3.0.0, <3.2.0", "2.10.5")
assert versions[0]["version"] == "2.10.5"
- def test_unknown_string_falls_back_to_single_entry(self):
- versions = parse_affected_versions("all versions", None)
- assert versions
- assert versions[0]["status"] == "affected"
+ def test_unknown_string_raises_value_error(self):
+ # The parser now refuses to emit garbage: an unparsable
+ # `Affected versions` field is a hard error rather than a
+ # bare-string fallback. This regression closes the issue on
+ # CVE-2026-46763 (2026-05-28) where the field `>= 3.0.0 (...)`
+ # serialised the parenthetical clause into the JSON `version`
+ # field.
+ with pytest.raises(ValueError, match="Could not parse"):
+ parse_affected_versions("all versions", None)
+
+ def test_unparsable_input_error_mentions_the_value(self):
+ # The error message includes the offending value so the
+ # reviewer can find it in the issue body.
+ with pytest.raises(ValueError, match="all versions"):
+ parse_affected_versions("all versions", None)
def test_lower_bound_only_emits_open_ended_entry(self):
# `>=2.0.0` (no upper) — useful for trackers whose fix-shipped
- # version isn't known yet (typically providers).
+ # version isn't known yet (typically providers). Still emits
+ # (does not raise) but issues a warning to stderr — see the
+ # warning test below.
versions = parse_affected_versions(">=2.0.0", None)
assert versions == [
{"status": "affected", "version": "2.0.0", "versionType":
"semver"},
@@ -379,6 +420,27 @@ class TestParseAffectedVersions:
{"status": "affected", "version": "2.0.0", "versionType":
"semver"},
]
+ def test_lower_bound_only_warns_about_missing_upper_bound(self, capsys):
+ # A bare lower bound has the misleading shape where CVE 5.x
+ # readers interpret it as "this version alone is affected".
+ # Warn so a reviewer notices and fills in `< X.Y.Z` or the
+ # `< NEXT VERSION` sentinel.
+ parse_affected_versions(">= 2.0.0", None)
+ captured = capsys.readouterr()
+ assert "bare lower bound" in captured.err
+ assert "'2.0.0'" in captured.err or '"2.0.0"' in captured.err
+ # No stdout chatter; warnings go to stderr only.
+ assert captured.out == ""
+
+ def test_lower_bound_with_next_version_sentinel_does_not_warn(self,
capsys):
+ # `< NEXT VERSION` is the documented "fix not yet released,
+ # upper bound unknown" form — emitting the bare-version entry
+ # is intentional, no warning.
+ parse_affected_versions(">= 2.0.0, < NEXT VERSION", None)
+ captured = capsys.readouterr()
+ assert captured.err == ""
+ assert captured.out == ""
+
class TestParseAffectedVersionsNextVersionPlaceholder:
"""The `< NEXT VERSION` sentinel: fix not yet released, upper bound
unknown.
@@ -631,11 +693,14 @@ class TestBuildAffectedMultiProduct:
assert entries[0]["product"] == "Apache Example Project BRAND"
def test_line_without_prefix_falls_back_to_defaults(self):
- # A single line that is not a version range and doesn't carry a
- # recognisable package prefix stays in the legacy single-entry
- # path and takes the explicit product / packageName args.
+ # A single line that does not carry a recognisable package
+ # prefix stays in the legacy single-entry path and takes the
+ # explicit product / packageName args. (The version-range
+ # shape itself is incidental to what this test exercises;
+ # use a parseable shape since the parser no longer accepts
+ # arbitrary free-form text.)
entries = build_affected(
- "all versions",
+ "<= 1.0.0",
vendor="Apache Software Foundation",
product="Apache Example Helm Chart",
package_name="apache-example-helm-chart",