This is an automated email from the ASF dual-hosted git repository.
sbp pushed a commit to branch sbp
in repository https://gitbox.apache.org/repos/asf/tooling-trusted-releases.git
The following commit(s) were added to refs/heads/sbp by this push:
new 03067776 Use extracted archives in license file checks
03067776 is described below
commit 030677769c6de91ad162284f31d8dd37ee7ed022
Author: Sean B. Palmer <[email protected]>
AuthorDate: Wed Mar 11 14:31:07 2026 +0000
Use extracted archives in license file checks
---
atr/tasks/checks/license.py | 81 +++++++++++++++++----------------
tests/unit/test_archive_member_limit.py | 18 --------
2 files changed, 42 insertions(+), 57 deletions(-)
diff --git a/atr/tasks/checks/license.py b/atr/tasks/checks/license.py
index 449445ac..86fe9f27 100644
--- a/atr/tasks/checks/license.py
+++ b/atr/tasks/checks/license.py
@@ -82,7 +82,7 @@ INCLUDED_PATTERNS: Final[list[str]] = [
# Release policy fields which this check relies on - used for result caching
INPUT_POLICY_KEYS: Final[list[str]] = ["license_check_mode",
"source_excludes_lightweight"]
INPUT_EXTRA_ARGS: Final[list[str]] = ["is_podling"]
-CHECK_VERSION: Final[str] = "1"
+CHECK_VERSION: Final[str] = "2"
# Types
@@ -141,10 +141,18 @@ async def files(args: checks.FunctionArguments) -> results.Results | None:
is_podling = args.extra_args.get("is_podling", False)
+ cache_dir = await checks.resolve_cache_dir(args)
+ if cache_dir is None:
+ await recorder.failure(
+ "Extracted archive tree is not available",
+ {"rel_path": args.primary_rel_path},
+ )
+ return None
+
log.info(f"Checking license files for {artifact_abs_path} (rel:
{args.primary_rel_path})")
try:
- for result in await asyncio.to_thread(_files_check_core_logic,
str(artifact_abs_path), is_podling):
+ for result in await asyncio.to_thread(_files_check_core_logic,
cache_dir, is_podling):
match result:
case ArtifactResult():
await _record_artifact(recorder, result)
@@ -220,42 +228,44 @@ def headers_validate(content: bytes, _filename: str) -> tuple[bool, str | None]:
return False, "Could not find Apache License header"
-def _files_check_core_logic(artifact_path: str, is_podling: bool) ->
Iterator[Result]:
+def _files_check_core_logic(cache_dir: pathlib.Path, is_podling: bool) ->
Iterator[Result]:
"""Verify that LICENSE and NOTICE files exist and are placed and formatted
correctly."""
license_results: dict[str, str | None] = {}
notice_results: dict[str, tuple[bool, list[str], str]] = {}
disclaimer_found = False
# Check for license files in the root directory
- try:
- with tarzip.open_archive(artifact_path) as archive:
- for member in archive:
- if member.name and member.name.split("/")[-1].startswith("._"):
- # Metadata convention
- continue
-
- if member.name.count("/") > 1:
- # Skip files in subdirectories
- continue
-
- filename = os.path.basename(member.name)
- if filename == "LICENSE":
- # TODO: Check length, should be 11,358 bytes
- license_diff = _files_check_core_logic_license(archive,
member)
- license_results[filename] = license_diff
- elif filename == "NOTICE":
- # TODO: Check length doesn't exceed some preset
- notice_ok, notice_issues, notice_preamble =
_files_check_core_logic_notice(archive, member)
- notice_results[filename] = (notice_ok, notice_issues,
notice_preamble)
- elif filename in {"DISCLAIMER", "DISCLAIMER-WIP"}:
- disclaimer_found = True
- except tarzip.ArchiveMemberLimitExceededError as e:
+ top_entries = sorted(e for e in os.listdir(cache_dir) if not
e.startswith("._"))
+ root_dirs = [e for e in top_entries if (cache_dir / e).is_dir()]
+ if len(root_dirs) != 1:
yield ArtifactResult(
status=sql.CheckResultStatus.FAILURE,
- message=f"Archive has too many members: {e}",
- data={"error": str(e)},
+ message=f"Expected single root directory, found {len(root_dirs)}",
+ data=None,
)
return
+ root_path = cache_dir / root_dirs[0]
+
+ for entry in sorted(os.listdir(root_path)):
+ if entry.startswith("._"):
+ # Metadata convention
+ continue
+
+ entry_path = root_path / entry
+ if not entry_path.is_file():
+ # Skip subdirectories
+ continue
+
+ if entry == "LICENSE":
+ # TODO: Check length, should be 11,358 bytes
+ license_diff = _files_check_core_logic_license(entry_path)
+ license_results[entry] = license_diff
+ elif entry == "NOTICE":
+ # TODO: Check length doesn't exceed some preset
+ notice_ok, notice_issues, notice_preamble =
_files_check_core_logic_notice(entry_path)
+ notice_results[entry] = (notice_ok, notice_issues, notice_preamble)
+ elif entry in {"DISCLAIMER", "DISCLAIMER-WIP"}:
+ disclaimer_found = True
yield from _license_results(license_results)
yield from _notice_results(notice_results)
@@ -267,11 +277,9 @@ def _files_check_core_logic(artifact_path: str, is_podling: bool) -> Iterator[Re
)
-def _files_check_core_logic_license(archive: tarzip.Archive, member:
tarzip.Member) -> str | None:
+def _files_check_core_logic_license(file_path: pathlib.Path) -> str | None:
"""Verify that the start of the LICENSE file matches the Apache 2.0
license."""
- f = archive.extractfile(member)
- if not f:
- return None
+ package_license_bytes = file_path.read_bytes()
sha3e = hashlib.sha3_256()
sha3e.update(constants.APACHE_LICENSE_2_0.encode("utf-8"))
@@ -280,7 +288,6 @@ def _files_check_core_logic_license(archive: tarzip.Archive, member: tarzip.Memb
if sha3_expected !=
"5efa4839f385df309ffc022ca5ce9763c4bc709dab862ca77d9a894db6598456":
log.error("SHA3 expected value is incorrect, please update the
static.LICENSE constant")
- package_license_bytes = f.read()
package_license = package_license_bytes.decode("utf-8", errors="replace")
# Some whitespace variations are permitted:
@@ -302,14 +309,10 @@ def _files_check_core_logic_license(archive: tarzip.Archive, member: tarzip.Memb
return None
-def _files_check_core_logic_notice(archive: tarzip.Archive, member:
tarzip.Member) -> tuple[bool, list[str], str]:
+def _files_check_core_logic_notice(file_path: pathlib.Path) -> tuple[bool,
list[str], str]:
"""Verify that the NOTICE file follows the required format."""
- f = archive.extractfile(member)
- if not f:
- return False, ["the NOTICE file is missing or could not be read"], ""
-
try:
- content = f.read().decode("utf-8")
+ content = file_path.read_bytes().decode("utf-8")
except UnicodeDecodeError:
return False, ["the NOTICE file is not valid UTF-8"], ""
preamble = "".join(content.splitlines(keepends=True)[:3])
diff --git a/tests/unit/test_archive_member_limit.py b/tests/unit/test_archive_member_limit.py
index e22e50cb..8b2c6bd4 100644
--- a/tests/unit/test_archive_member_limit.py
+++ b/tests/unit/test_archive_member_limit.py
@@ -49,24 +49,6 @@ def test_extract_wraps_member_limit(tmp_path, monkeypatch):
assert "too many members" in str(excinfo.value).lower()
-def test_license_files_reports_member_limit(tmp_path, monkeypatch):
- archive_path = tmp_path / "sample.tar"
- _make_tar(archive_path, ["LICENSE", "NOTICE", "README.txt"])
-
- original_open = tarzip.open_archive
-
- def limited_open(path: str, *args, **kwargs):
- return original_open(path, max_members=2)
-
- monkeypatch.setattr(tarzip, "open_archive", limited_open)
-
- results = list(license_checks._files_check_core_logic(str(archive_path),
is_podling=False))
- assert any(
- isinstance(result, license_checks.ArtifactResult) and ("too many
members" in result.message.lower())
- for result in results
- )
-
-
def test_license_headers_reports_member_limit(tmp_path, monkeypatch):
archive_path = tmp_path / "sample.tar"
_make_tar(archive_path, ["main.py", "README.txt", "extra.txt"])
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]