(tooling-trusted-releases) 05/06: Use extracted archive trees in RAT checks

sbp Wed, 11 Mar 2026 09:57:59 -0700

This is an automated email from the ASF dual-hosted git repository.

sbp pushed a commit to branch sbp
in repository https://gitbox.apache.org/repos/asf/tooling-trusted-releases.git


commit b0643597fee391aec9179230ad9e87fc2ee5889b
Author: Sean B. Palmer <[email protected]>
AuthorDate: Wed Mar 11 15:41:36 2026 +0000

    Use extracted archive trees in RAT checks
---
 atr/tasks/checks/rat.py       | 128 ++++++++++++++++++++----------------------
 tests/unit/test_checks_rat.py |  50 ++++++++++++-----
 2 files changed, 98 insertions(+), 80 deletions(-)

diff --git a/atr/tasks/checks/rat.py b/atr/tasks/checks/rat.py
index 461e59f9..16df15b3 100644
--- a/atr/tasks/checks/rat.py
+++ b/atr/tasks/checks/rat.py
@@ -25,7 +25,6 @@ from typing import Final
 
 import defusedxml.ElementTree as ElementTree
 
-import atr.archives as archives
 import atr.config as config
 import atr.constants as constants
 import atr.log as log
@@ -69,7 +68,7 @@ _STD_EXCLUSIONS_EXTENDED: Final[list[str]] = [
 # Release policy fields which this check relies on - used for result caching
 INPUT_POLICY_KEYS: Final[list[str]] = ["license_check_mode", 
"source_excludes_rat"]
 INPUT_EXTRA_ARGS: Final[list[str]] = []
-CHECK_VERSION: Final[str] = "1"
+CHECK_VERSION: Final[str] = "2"
 
 
 class RatError(RuntimeError):
@@ -90,13 +89,21 @@ async def check(args: checks.FunctionArguments) -> 
results.Results | None:
         log.info(f"Skipping RAT check for {artifact_abs_path} (mode is 
LIGHTWEIGHT)")
         return None
 
+    cache_dir = await checks.resolve_cache_dir(args)
+    if cache_dir is None:
+        await recorder.failure(
+            "Extracted archive tree is not available",
+            {"rel_path": args.primary_rel_path},
+        )
+        return None
+
     log.info(f"Checking RAT licenses for {artifact_abs_path} (rel: 
{args.primary_rel_path})")
 
     is_source = await recorder.primary_path_is_source()
     policy_excludes = project.policy_source_excludes_rat if is_source else []
 
     try:
-        await _check_core(args, recorder, artifact_abs_path, policy_excludes)
+        await _check_core(args, recorder, cache_dir, policy_excludes)
     except Exception as e:
         # TODO: Or bubble for task failure?
         await recorder.failure("Error running Apache RAT check", {"error": 
str(e)})
@@ -154,16 +161,14 @@ def _build_rat_command(
 async def _check_core(
     args: checks.FunctionArguments,
     recorder: checks.Recorder,
-    artifact_abs_path: pathlib.Path,
+    cache_dir: pathlib.Path,
     policy_excludes: list[str],
 ) -> None:
     result = await asyncio.to_thread(
         _synchronous,
-        artifact_path=str(artifact_abs_path),
+        cache_dir=str(cache_dir),
         policy_excludes=policy_excludes,
         rat_jar_path=args.extra_args.get("rat_jar_path", 
_CONFIG.APACHE_RAT_JAR_PATH),
-        max_extract_size=args.extra_args.get("max_extract_size", 
_CONFIG.MAX_EXTRACT_SIZE),
-        chunk_size=args.extra_args.get("chunk_size", 
_CONFIG.EXTRACT_CHUNK_SIZE),
     )
 
     # Record individual file failures before the overall result
@@ -284,19 +289,23 @@ def _count_files_outside_directory(temp_dir: str, 
scan_root: str) -> int:
 
 
 def _get_command_and_xml_output_path(
-    temp_dir: str, excludes_file_path: str | None, apply_extended_std: bool, 
scan_root: str, rat_jar_path: str
+    scratch_dir: str, excludes_file_path: str | None, apply_extended_std: 
bool, scan_root: str, rat_jar_path: str
 ) -> tuple[list[str], str]:
-    xml_output_path = os.path.join(temp_dir, _RAT_REPORT_FILENAME)
+    xml_output_path = os.path.join(scratch_dir, _RAT_REPORT_FILENAME)
     log.info(f"XML output will be written to: {xml_output_path}")
 
-    # Convert exclusion file path from temp_dir relative to scan_root relative
+    # Convert exclusion file path to scan_root relative if inside the tree, 
else absolute
     excludes_file: str | None = None
     if excludes_file_path is not None:
-        abs_path = os.path.join(temp_dir, excludes_file_path)
-        if not (os.path.exists(abs_path) and os.path.isfile(abs_path)):
-            log.error(f"Exclusion file not found or not a regular file: 
{abs_path}")
-            raise RatError(f"Exclusion file is not a regular file: 
{excludes_file_path}({abs_path})")
-        excludes_file = os.path.relpath(abs_path, scan_root)
+        if not (os.path.exists(excludes_file_path) and 
os.path.isfile(excludes_file_path)):
+            log.error(f"Exclusion file not found or not a regular file: 
{excludes_file_path}")
+            raise RatError(f"Exclusion file is not a regular file: 
{excludes_file_path}")
+        abs_excludes = os.path.abspath(excludes_file_path)
+        abs_scan = os.path.abspath(scan_root)
+        if abs_excludes.startswith(abs_scan + os.sep):
+            excludes_file = os.path.relpath(abs_excludes, abs_scan)
+        else:
+            excludes_file = abs_excludes
         log.info(f"Using exclusion file: {excludes_file}")
     command = _build_rat_command(rat_jar_path, xml_output_path, excludes_file, 
apply_extended_std)
     log.info(f"Running Apache RAT: {' '.join(command)}")
@@ -314,7 +323,7 @@ def _is_inside_directory(path: str, directory: str) -> bool:
 
 def _sanitise_command_for_storage(command: list[str]) -> list[str]:
     """Replace absolute paths with filenames for known arguments."""
-    path_args = {"-jar", "--output-file"}
+    path_args = {"-jar", "--input-exclude", "--input-exclude-file", 
"--output-file"}
     result: list[str] = []
     for i, arg in enumerate(command):
         if (i > 0) and (command[i - 1] in path_args) and os.path.isabs(arg):
@@ -338,14 +347,12 @@ def _summary_message(valid: bool, unapproved_licenses: 
int, unknown_licenses: in
 
 
 def _synchronous(
-    artifact_path: str,
+    cache_dir: str,
     policy_excludes: list[str],
     rat_jar_path: str = _CONFIG.APACHE_RAT_JAR_PATH,
-    max_extract_size: int = _CONFIG.MAX_EXTRACT_SIZE,
-    chunk_size: int = _CONFIG.EXTRACT_CHUNK_SIZE,
 ) -> checkdata.Rat:
     """Verify license headers using Apache RAT."""
-    log.info(f"Verifying licenses with Apache RAT for {artifact_path}")
+    log.info(f"Verifying licenses with Apache RAT for {cache_dir}")
     log.info(f"PATH environment variable: {os.environ.get('PATH', 'PATH not 
found')}")
 
     java_check = _synchronous_check_java_installed()
@@ -358,13 +365,9 @@ def _synchronous(
         return jar_error
 
     try:
-        # Create a temporary directory for extraction
-        # TODO: We could extract to somewhere in "state/" instead
-        with tempfile.TemporaryDirectory(prefix="rat_verify_") as temp_dir:
-            log.info(f"Created temporary directory: {temp_dir}")
-            return _synchronous_extract(
-                artifact_path, temp_dir, max_extract_size, chunk_size, 
policy_excludes, rat_jar_path
-            )
+        with tempfile.TemporaryDirectory(prefix="rat_scratch_") as scratch_dir:
+            log.info(f"Created scratch directory: {scratch_dir}")
+            return _synchronous_core(cache_dir, scratch_dir, policy_excludes, 
rat_jar_path)
     except Exception as e:
         import traceback
 
@@ -455,24 +458,17 @@ def _synchronous_check_java_installed() -> checkdata.Rat 
| None:
         )
 
 
-def _synchronous_extract(
-    artifact_path: str,
-    temp_dir: str,
-    max_extract_size: int,
-    chunk_size: int,
+def _synchronous_core(  # noqa: C901
+    cache_dir: str,
+    scratch_dir: str,
     policy_excludes: list[str],
     rat_jar_path: str,
 ) -> checkdata.Rat:
-    # Extract the archive to the temporary directory
-    log.info(f"Extracting {artifact_path} to {temp_dir}")
-    extracted_size, exclude_file_paths = archives.extract(
-        artifact_path,
-        temp_dir,
-        max_size=max_extract_size,
-        chunk_size=chunk_size,
-        track_files={_RAT_EXCLUDES_FILENAME},
-    )
-    log.info(f"Extracted {extracted_size} bytes")
+    exclude_file_paths: list[str] = []
+    for dirpath, _dirnames, filenames in os.walk(cache_dir):
+        for filename in filenames:
+            if filename == _RAT_EXCLUDES_FILENAME:
+                
exclude_file_paths.append(os.path.relpath(os.path.join(dirpath, filename), 
cache_dir))
     log.info(f"Found {len(exclude_file_paths)} {_RAT_EXCLUDES_FILENAME} 
file(s): {exclude_file_paths}")
 
     # Validate that we found at most one exclusion file
@@ -486,12 +482,12 @@ def _synchronous_extract(
     # Narrow to single path after validation
     archive_excludes_path: str | None = exclude_file_paths[0] if 
exclude_file_paths else None
 
-    excludes_source, effective_excludes_path = 
_synchronous_extract_excludes_source(
-        archive_excludes_path, policy_excludes, temp_dir
+    excludes_source, effective_excludes_path = 
_synchronous_core_excludes_source(
+        archive_excludes_path, policy_excludes, cache_dir, scratch_dir
     )
 
     try:
-        scan_root = _synchronous_extract_scan_root(archive_excludes_path, 
temp_dir)
+        scan_root = _synchronous_core_scan_root(archive_excludes_path, 
cache_dir)
     except RatError as e:
         return checkdata.Rat(
             message=f"Failed to determine scan root: {e}",
@@ -504,14 +500,14 @@ def _synchronous_extract(
     apply_extended_std = excludes_source != "archive"
     try:
         command, xml_output_path = _get_command_and_xml_output_path(
-            temp_dir, effective_excludes_path, apply_extended_std, scan_root, 
rat_jar_path
+            scratch_dir, effective_excludes_path, apply_extended_std, 
scan_root, rat_jar_path
         )
     except RatError as e:
         return checkdata.Rat(
             message=f"Failed to build RAT command: {e}",
             errors=[str(e)],
         )
-    error_result, xml_output_path = _check_core_logic_execute_rat(command, 
scan_root, temp_dir, xml_output_path)
+    error_result, xml_output_path = _check_core_logic_execute_rat(command, 
scan_root, scratch_dir, xml_output_path)
     if error_result is not None:
         return error_result
 
@@ -521,12 +517,12 @@ def _synchronous_extract(
     if xml_output_path is None:
         raise ValueError("XML output path is None")
 
-    result = _synchronous_extract_parse_output(xml_output_path, scan_root)
+    result = _synchronous_core_parse_output(xml_output_path, scan_root)
     log.info(f"Successfully parsed RAT output with 
{util.plural(result.total_files, 'file')}")
 
     # The unknown_license_files and unapproved_files contain FileEntry objects
     # The path is relative to scan_root, so we prepend the scan_root relative 
path
-    scan_root_rel = os.path.relpath(scan_root, temp_dir)
+    scan_root_rel = os.path.relpath(scan_root, cache_dir)
     result.directory = scan_root_rel
     if scan_root_rel != ".":
         for file in result.unknown_license_files:
@@ -540,8 +536,8 @@ def _synchronous_extract(
     return result
 
 
-def _synchronous_extract_excludes_source(
-    archive_excludes_path: str | None, policy_excludes: list[str], temp_dir: 
str
+def _synchronous_core_excludes_source(
+    archive_excludes_path: str | None, policy_excludes: list[str], cache_dir: 
str, scratch_dir: str
 ) -> tuple[str, str | None]:
     # Determine excludes_source and effective excludes file
     excludes_source: str
@@ -549,14 +545,14 @@ def _synchronous_extract_excludes_source(
 
     if archive_excludes_path is not None:
         excludes_source = "archive"
-        effective_excludes_path = archive_excludes_path
+        effective_excludes_path = os.path.join(cache_dir, 
archive_excludes_path)
         log.info(f"Using archive {_RAT_EXCLUDES_FILENAME}: 
{archive_excludes_path}")
     elif policy_excludes:
         excludes_source = "policy"
-        policy_excludes_file = os.path.join(temp_dir, 
_POLICY_EXCLUDES_FILENAME)
+        policy_excludes_file = os.path.join(scratch_dir, 
_POLICY_EXCLUDES_FILENAME)
         with open(policy_excludes_file, "w") as f:
             f.write("\n".join(policy_excludes))
-        effective_excludes_path = os.path.relpath(policy_excludes_file, 
temp_dir)
+        effective_excludes_path = policy_excludes_file
         log.info(f"Using policy excludes written to: {policy_excludes_file}")
     else:
         excludes_source = "none"
@@ -565,10 +561,10 @@ def _synchronous_extract_excludes_source(
     return excludes_source, effective_excludes_path
 
 
-def _synchronous_extract_parse_output(xml_file: str, base_dir: str) -> 
checkdata.Rat:
+def _synchronous_core_parse_output(xml_file: str, base_dir: str) -> 
checkdata.Rat:
     """Parse the XML output from Apache RAT safely."""
     try:
-        return _synchronous_extract_parse_output_core(xml_file, base_dir)
+        return _synchronous_core_parse_output_core(xml_file, base_dir)
     except Exception as e:
         log.error(f"Error parsing RAT output: {e}")
         return checkdata.Rat(
@@ -577,7 +573,7 @@ def _synchronous_extract_parse_output(xml_file: str, 
base_dir: str) -> checkdata
         )
 
 
-def _synchronous_extract_parse_output_core(xml_file: str, base_dir: str) -> 
checkdata.Rat:
+def _synchronous_core_parse_output_core(xml_file: str, base_dir: str) -> 
checkdata.Rat:
     """Parse the XML output from Apache RAT."""
     tree = ElementTree.parse(xml_file)
 
@@ -646,27 +642,27 @@ def _synchronous_extract_parse_output_core(xml_file: str, 
base_dir: str) -> chec
     )
 
 
-def _synchronous_extract_scan_root(archive_excludes_path: str | None, 
temp_dir: str) -> str:
+def _synchronous_core_scan_root(archive_excludes_path: str | None, cache_dir: 
str) -> str:
     # Determine scan root based on archive .rat-excludes location
     if archive_excludes_path is not None:
-        scan_root = os.path.dirname(os.path.join(temp_dir, 
archive_excludes_path))
+        scan_root = os.path.dirname(os.path.join(cache_dir, 
archive_excludes_path))
 
-        # Verify that scan_root is inside temp_dir
+        # Verify that scan_root is inside cache_dir
         abs_scan_root = os.path.abspath(scan_root)
-        abs_temp_dir = os.path.abspath(temp_dir)
-        scan_root_is_inside = (abs_scan_root == abs_temp_dir) or 
abs_scan_root.startswith(abs_temp_dir + os.sep)
+        abs_cache_dir = os.path.abspath(cache_dir)
+        scan_root_is_inside = (abs_scan_root == abs_cache_dir) or 
abs_scan_root.startswith(abs_cache_dir + os.sep)
         if not scan_root_is_inside:
-            log.error(f"Scan root {scan_root} is outside temp_dir {temp_dir}")
+            log.error(f"Scan root {scan_root} is outside cache_dir 
{cache_dir}")
             raise RatError("Invalid archive structure: exclusion file path 
escapes extraction directory")
 
         log.info(f"Using {_RAT_EXCLUDES_FILENAME} directory as scan root: 
{scan_root}")
 
-        untracked_count = _count_files_outside_directory(temp_dir, scan_root)
+        untracked_count = _count_files_outside_directory(cache_dir, scan_root)
         if untracked_count > 0:
             log.error(f"Found {untracked_count} file(s) outside 
{_RAT_EXCLUDES_FILENAME} directory")
             raise RatError(f"Files exist outside {_RAT_EXCLUDES_FILENAME} 
directory ({untracked_count} found)")
     else:
-        scan_root = temp_dir
-        log.info(f"No archive {_RAT_EXCLUDES_FILENAME} found, using temp_dir 
as scan root: {scan_root}")
+        scan_root = cache_dir
+        log.info(f"No archive {_RAT_EXCLUDES_FILENAME} found, using cache_dir 
as scan root: {scan_root}")
 
     return scan_root
diff --git a/tests/unit/test_checks_rat.py b/tests/unit/test_checks_rat.py
index 60390840..ebe67458 100644
--- a/tests/unit/test_checks_rat.py
+++ b/tests/unit/test_checks_rat.py
@@ -17,6 +17,7 @@
 
 import pathlib
 import shlex
+import tarfile
 
 import pytest
 
@@ -41,9 +42,10 @@ def rat_available() -> tuple[bool, bool]:
     return (java_ok, jar_ok)
 
 
-def test_check_includes_command(rat_available: tuple[bool, bool]):
+def test_check_includes_command(rat_available: tuple[bool, bool], tmp_path: 
pathlib.Path):
     _skip_if_unavailable(rat_available)
-    result = rat._synchronous(str(TEST_ARCHIVE), [])
+    cache_dir = _extract_test_archive(tmp_path, TEST_ARCHIVE)
+    result = rat._synchronous(str(cache_dir), [])
     command = _command_args(result.command)
     assert len(command) > 0
     assert "java" in command
@@ -53,32 +55,36 @@ def test_check_includes_command(rat_available: tuple[bool, 
bool]):
     assert result.directory == "."
 
 
-def test_check_includes_excludes_source_none(rat_available: tuple[bool, bool]):
+def test_check_includes_excludes_source_none(rat_available: tuple[bool, bool], 
tmp_path: pathlib.Path):
     _skip_if_unavailable(rat_available)
-    result = rat._synchronous(str(TEST_ARCHIVE), [])
+    cache_dir = _extract_test_archive(tmp_path, TEST_ARCHIVE)
+    result = rat._synchronous(str(cache_dir), [])
     assert result.excludes_source == "none"
 
 
-def test_check_includes_excludes_source_policy(rat_available: tuple[bool, 
bool]):
+def test_check_includes_excludes_source_policy(rat_available: tuple[bool, 
bool], tmp_path: pathlib.Path):
     _skip_if_unavailable(rat_available)
-    result = rat._synchronous(str(TEST_ARCHIVE), ["*.py"])
+    cache_dir = _extract_test_archive(tmp_path, TEST_ARCHIVE)
+    result = rat._synchronous(str(cache_dir), ["*.py"])
     assert result.excludes_source == "policy"
 
 
-def test_excludes_archive_ignores_policy_when_file_exists(rat_available: 
tuple[bool, bool]):
+def test_excludes_archive_ignores_policy_when_file_exists(rat_available: 
tuple[bool, bool], tmp_path: pathlib.Path):
     """When archive has .rat-excludes, ignore policy patterns even if 
provided."""
     _skip_if_unavailable(rat_available)
-    result = rat._synchronous(str(TEST_ARCHIVE_WITH_RAT_EXCLUDES), ["*.py", 
"*.txt"])
+    cache_dir = _extract_test_archive(tmp_path, TEST_ARCHIVE_WITH_RAT_EXCLUDES)
+    result = rat._synchronous(str(cache_dir), ["*.py", "*.txt"])
     assert result.excludes_source == "archive"
     # Should NOT use the RAT policy file
     command = _command_args(result.command)
     assert rat._POLICY_EXCLUDES_FILENAME not in command
 
 
-def test_excludes_archive_uses_rat_excludes_file(rat_available: tuple[bool, 
bool]):
+def test_excludes_archive_uses_rat_excludes_file(rat_available: tuple[bool, 
bool], tmp_path: pathlib.Path):
     """When archive has .rat-excludes, use it and set source to archive."""
     _skip_if_unavailable(rat_available)
-    result = rat._synchronous(str(TEST_ARCHIVE_WITH_RAT_EXCLUDES), [])
+    cache_dir = _extract_test_archive(tmp_path, TEST_ARCHIVE_WITH_RAT_EXCLUDES)
+    result = rat._synchronous(str(cache_dir), [])
     assert result.excludes_source == "archive"
     command = _command_args(result.command)
     assert "--input-exclude-file" in command
@@ -88,10 +94,11 @@ def 
test_excludes_archive_uses_rat_excludes_file(rat_available: tuple[bool, bool
     assert result.directory == "apache-test-0.2"
 
 
-def test_excludes_none_has_no_exclude_file(rat_available: tuple[bool, bool]):
+def test_excludes_none_has_no_exclude_file(rat_available: tuple[bool, bool], 
tmp_path: pathlib.Path):
     """When neither archive nor policy, no exclude file should be used."""
     _skip_if_unavailable(rat_available)
-    result = rat._synchronous(str(TEST_ARCHIVE), [])
+    cache_dir = _extract_test_archive(tmp_path, TEST_ARCHIVE)
+    result = rat._synchronous(str(cache_dir), [])
     assert result.excludes_source == "none"
     command = _command_args(result.command)
     assert "--input-exclude-file" not in command
@@ -100,11 +107,12 @@ def test_excludes_none_has_no_exclude_file(rat_available: 
tuple[bool, bool]):
     assert rat._POLICY_EXCLUDES_FILENAME not in command
 
 
-def test_excludes_policy_uses_atr_rat_excludes(rat_available: tuple[bool, 
bool]):
+def test_excludes_policy_uses_atr_rat_excludes(rat_available: tuple[bool, 
bool], tmp_path: pathlib.Path):
     """When no archive .rat-excludes but policy exists, use policy file."""
     _skip_if_unavailable(rat_available)
+    cache_dir = _extract_test_archive(tmp_path, TEST_ARCHIVE)
     # The second argument to rat._synchronous is a list of exclusions from 
policy
-    result = rat._synchronous(str(TEST_ARCHIVE), ["*.py"])
+    result = rat._synchronous(str(cache_dir), ["*.py"])
     assert result.excludes_source == "policy"
     command = _command_args(result.command)
     assert "--input-exclude-file" in command
@@ -124,6 +132,10 @@ def test_sanitise_command_replaces_absolute_paths():
         "/fake/path/rat_verify_abc123/rat-report.xml",
         "--input-exclude",
         ".rat-excludes",
+        "--input-exclude",
+        "/fake/rat_scratch_abc/.atr-policy-rat-excludes",
+        "--input-exclude-file",
+        "/fake/rat_scratch_abc/.atr-policy-rat-excludes",
         "--",
         ".",
     ]
@@ -131,12 +143,22 @@ def test_sanitise_command_replaces_absolute_paths():
     assert result[2] == "apache-rat-0.17.jar"
     assert result[4] == "rat-report.xml"
     assert result[6] == ".rat-excludes"
+    assert result[8] == ".atr-policy-rat-excludes"
+    assert result[10] == ".atr-policy-rat-excludes"
 
 
 def _command_args(command: str) -> list[str]:
     return shlex.split(command)
 
 
+def _extract_test_archive(tmp_path: pathlib.Path, archive: pathlib.Path) -> 
pathlib.Path:
+    cache_dir = tmp_path / "cache"
+    cache_dir.mkdir()
+    with tarfile.open(archive) as tf:
+        tf.extractall(cache_dir, filter="data")
+    return cache_dir
+
+
 def _skip_if_unavailable(rat_available: tuple[bool, bool]) -> None:
     java_ok, jar_ok = rat_available
     if not java_ok:


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(tooling-trusted-releases) 05/06: Use extracted archive trees in RAT checks

Reply via email to