This is an automated email from the ASF dual-hosted git repository. sbp pushed a commit to branch sbp in repository https://gitbox.apache.org/repos/asf/tooling-trusted-releases.git
commit b0643597fee391aec9179230ad9e87fc2ee5889b Author: Sean B. Palmer <[email protected]> AuthorDate: Wed Mar 11 15:41:36 2026 +0000 Use extracted archive trees in RAT checks --- atr/tasks/checks/rat.py | 128 ++++++++++++++++++++---------------------- tests/unit/test_checks_rat.py | 50 ++++++++++++----- 2 files changed, 98 insertions(+), 80 deletions(-) diff --git a/atr/tasks/checks/rat.py b/atr/tasks/checks/rat.py index 461e59f9..16df15b3 100644 --- a/atr/tasks/checks/rat.py +++ b/atr/tasks/checks/rat.py @@ -25,7 +25,6 @@ from typing import Final import defusedxml.ElementTree as ElementTree -import atr.archives as archives import atr.config as config import atr.constants as constants import atr.log as log @@ -69,7 +68,7 @@ _STD_EXCLUSIONS_EXTENDED: Final[list[str]] = [ # Release policy fields which this check relies on - used for result caching INPUT_POLICY_KEYS: Final[list[str]] = ["license_check_mode", "source_excludes_rat"] INPUT_EXTRA_ARGS: Final[list[str]] = [] -CHECK_VERSION: Final[str] = "1" +CHECK_VERSION: Final[str] = "2" class RatError(RuntimeError): @@ -90,13 +89,21 @@ async def check(args: checks.FunctionArguments) -> results.Results | None: log.info(f"Skipping RAT check for {artifact_abs_path} (mode is LIGHTWEIGHT)") return None + cache_dir = await checks.resolve_cache_dir(args) + if cache_dir is None: + await recorder.failure( + "Extracted archive tree is not available", + {"rel_path": args.primary_rel_path}, + ) + return None + log.info(f"Checking RAT licenses for {artifact_abs_path} (rel: {args.primary_rel_path})") is_source = await recorder.primary_path_is_source() policy_excludes = project.policy_source_excludes_rat if is_source else [] try: - await _check_core(args, recorder, artifact_abs_path, policy_excludes) + await _check_core(args, recorder, cache_dir, policy_excludes) except Exception as e: # TODO: Or bubble for task failure? await recorder.failure("Error running Apache RAT check", {"error": str(e)}) @@ -154,16 +161,14 @@ def _build_rat_command( async def _check_core( args: checks.FunctionArguments, recorder: checks.Recorder, - artifact_abs_path: pathlib.Path, + cache_dir: pathlib.Path, policy_excludes: list[str], ) -> None: result = await asyncio.to_thread( _synchronous, - artifact_path=str(artifact_abs_path), + cache_dir=str(cache_dir), policy_excludes=policy_excludes, rat_jar_path=args.extra_args.get("rat_jar_path", _CONFIG.APACHE_RAT_JAR_PATH), - max_extract_size=args.extra_args.get("max_extract_size", _CONFIG.MAX_EXTRACT_SIZE), - chunk_size=args.extra_args.get("chunk_size", _CONFIG.EXTRACT_CHUNK_SIZE), ) # Record individual file failures before the overall result @@ -284,19 +289,23 @@ def _count_files_outside_directory(temp_dir: str, scan_root: str) -> int: def _get_command_and_xml_output_path( - temp_dir: str, excludes_file_path: str | None, apply_extended_std: bool, scan_root: str, rat_jar_path: str + scratch_dir: str, excludes_file_path: str | None, apply_extended_std: bool, scan_root: str, rat_jar_path: str ) -> tuple[list[str], str]: - xml_output_path = os.path.join(temp_dir, _RAT_REPORT_FILENAME) + xml_output_path = os.path.join(scratch_dir, _RAT_REPORT_FILENAME) log.info(f"XML output will be written to: {xml_output_path}") - # Convert exclusion file path from temp_dir relative to scan_root relative + # Convert exclusion file path to scan_root relative if inside the tree, else absolute excludes_file: str | None = None if excludes_file_path is not None: - abs_path = os.path.join(temp_dir, excludes_file_path) - if not (os.path.exists(abs_path) and os.path.isfile(abs_path)): - log.error(f"Exclusion file not found or not a regular file: {abs_path}") - raise RatError(f"Exclusion file is not a regular file: {excludes_file_path}({abs_path})") - excludes_file = os.path.relpath(abs_path, scan_root) + if not (os.path.exists(excludes_file_path) and os.path.isfile(excludes_file_path)): + log.error(f"Exclusion file not found or not a regular file: {excludes_file_path}") + raise RatError(f"Exclusion file is not a regular file: {excludes_file_path}") + abs_excludes = os.path.abspath(excludes_file_path) + abs_scan = os.path.abspath(scan_root) + if abs_excludes.startswith(abs_scan + os.sep): + excludes_file = os.path.relpath(abs_excludes, abs_scan) + else: + excludes_file = abs_excludes log.info(f"Using exclusion file: {excludes_file}") command = _build_rat_command(rat_jar_path, xml_output_path, excludes_file, apply_extended_std) log.info(f"Running Apache RAT: {' '.join(command)}") @@ -314,7 +323,7 @@ def _is_inside_directory(path: str, directory: str) -> bool: def _sanitise_command_for_storage(command: list[str]) -> list[str]: """Replace absolute paths with filenames for known arguments.""" - path_args = {"-jar", "--output-file"} + path_args = {"-jar", "--input-exclude", "--input-exclude-file", "--output-file"} result: list[str] = [] for i, arg in enumerate(command): if (i > 0) and (command[i - 1] in path_args) and os.path.isabs(arg): @@ -338,14 +347,12 @@ def _summary_message(valid: bool, unapproved_licenses: int, unknown_licenses: in def _synchronous( - artifact_path: str, + cache_dir: str, policy_excludes: list[str], rat_jar_path: str = _CONFIG.APACHE_RAT_JAR_PATH, - max_extract_size: int = _CONFIG.MAX_EXTRACT_SIZE, - chunk_size: int = _CONFIG.EXTRACT_CHUNK_SIZE, ) -> checkdata.Rat: """Verify license headers using Apache RAT.""" - log.info(f"Verifying licenses with Apache RAT for {artifact_path}") + log.info(f"Verifying licenses with Apache RAT for {cache_dir}") log.info(f"PATH environment variable: {os.environ.get('PATH', 'PATH not found')}") java_check = _synchronous_check_java_installed() @@ -358,13 +365,9 @@ def _synchronous( return jar_error try: - # Create a temporary directory for extraction - # TODO: We could extract to somewhere in "state/" instead - with tempfile.TemporaryDirectory(prefix="rat_verify_") as temp_dir: - log.info(f"Created temporary directory: {temp_dir}") - return _synchronous_extract( - artifact_path, temp_dir, max_extract_size, chunk_size, policy_excludes, rat_jar_path - ) + with tempfile.TemporaryDirectory(prefix="rat_scratch_") as scratch_dir: + log.info(f"Created scratch directory: {scratch_dir}") + return _synchronous_core(cache_dir, scratch_dir, policy_excludes, rat_jar_path) except Exception as e: import traceback @@ -455,24 +458,17 @@ def _synchronous_check_java_installed() -> checkdata.Rat | None: ) -def _synchronous_extract( - artifact_path: str, - temp_dir: str, - max_extract_size: int, - chunk_size: int, +def _synchronous_core( # noqa: C901 + cache_dir: str, + scratch_dir: str, policy_excludes: list[str], rat_jar_path: str, ) -> checkdata.Rat: - # Extract the archive to the temporary directory - log.info(f"Extracting {artifact_path} to {temp_dir}") - extracted_size, exclude_file_paths = archives.extract( - artifact_path, - temp_dir, - max_size=max_extract_size, - chunk_size=chunk_size, - track_files={_RAT_EXCLUDES_FILENAME}, - ) - log.info(f"Extracted {extracted_size} bytes") + exclude_file_paths: list[str] = [] + for dirpath, _dirnames, filenames in os.walk(cache_dir): + for filename in filenames: + if filename == _RAT_EXCLUDES_FILENAME: + exclude_file_paths.append(os.path.relpath(os.path.join(dirpath, filename), cache_dir)) log.info(f"Found {len(exclude_file_paths)} {_RAT_EXCLUDES_FILENAME} file(s): {exclude_file_paths}") # Validate that we found at most one exclusion file @@ -486,12 +482,12 @@ def _synchronous_extract( # Narrow to single path after validation archive_excludes_path: str | None = exclude_file_paths[0] if exclude_file_paths else None - excludes_source, effective_excludes_path = _synchronous_extract_excludes_source( - archive_excludes_path, policy_excludes, temp_dir + excludes_source, effective_excludes_path = _synchronous_core_excludes_source( + archive_excludes_path, policy_excludes, cache_dir, scratch_dir ) try: - scan_root = _synchronous_extract_scan_root(archive_excludes_path, temp_dir) + scan_root = _synchronous_core_scan_root(archive_excludes_path, cache_dir) except RatError as e: return checkdata.Rat( message=f"Failed to determine scan root: {e}", @@ -504,14 +500,14 @@ def _synchronous_extract( apply_extended_std = excludes_source != "archive" try: command, xml_output_path = _get_command_and_xml_output_path( - temp_dir, effective_excludes_path, apply_extended_std, scan_root, rat_jar_path + scratch_dir, effective_excludes_path, apply_extended_std, scan_root, rat_jar_path ) except RatError as e: return checkdata.Rat( message=f"Failed to build RAT command: {e}", errors=[str(e)], ) - error_result, xml_output_path = _check_core_logic_execute_rat(command, scan_root, temp_dir, xml_output_path) + error_result, xml_output_path = _check_core_logic_execute_rat(command, scan_root, scratch_dir, xml_output_path) if error_result is not None: return error_result @@ -521,12 +517,12 @@ def _synchronous_extract( if xml_output_path is None: raise ValueError("XML output path is None") - result = _synchronous_extract_parse_output(xml_output_path, scan_root) + result = _synchronous_core_parse_output(xml_output_path, scan_root) log.info(f"Successfully parsed RAT output with {util.plural(result.total_files, 'file')}") # The unknown_license_files and unapproved_files contain FileEntry objects # The path is relative to scan_root, so we prepend the scan_root relative path - scan_root_rel = os.path.relpath(scan_root, temp_dir) + scan_root_rel = os.path.relpath(scan_root, cache_dir) result.directory = scan_root_rel if scan_root_rel != ".": for file in result.unknown_license_files: @@ -540,8 +536,8 @@ def _synchronous_extract( return result -def _synchronous_extract_excludes_source( - archive_excludes_path: str | None, policy_excludes: list[str], temp_dir: str +def _synchronous_core_excludes_source( + archive_excludes_path: str | None, policy_excludes: list[str], cache_dir: str, scratch_dir: str ) -> tuple[str, str | None]: # Determine excludes_source and effective excludes file excludes_source: str @@ -549,14 +545,14 @@ def _synchronous_extract_excludes_source( if archive_excludes_path is not None: excludes_source = "archive" - effective_excludes_path = archive_excludes_path + effective_excludes_path = os.path.join(cache_dir, archive_excludes_path) log.info(f"Using archive {_RAT_EXCLUDES_FILENAME}: {archive_excludes_path}") elif policy_excludes: excludes_source = "policy" - policy_excludes_file = os.path.join(temp_dir, _POLICY_EXCLUDES_FILENAME) + policy_excludes_file = os.path.join(scratch_dir, _POLICY_EXCLUDES_FILENAME) with open(policy_excludes_file, "w") as f: f.write("\n".join(policy_excludes)) - effective_excludes_path = os.path.relpath(policy_excludes_file, temp_dir) + effective_excludes_path = policy_excludes_file log.info(f"Using policy excludes written to: {policy_excludes_file}") else: excludes_source = "none" @@ -565,10 +561,10 @@ def _synchronous_extract_excludes_source( return excludes_source, effective_excludes_path -def _synchronous_extract_parse_output(xml_file: str, base_dir: str) -> checkdata.Rat: +def _synchronous_core_parse_output(xml_file: str, base_dir: str) -> checkdata.Rat: """Parse the XML output from Apache RAT safely.""" try: - return _synchronous_extract_parse_output_core(xml_file, base_dir) + return _synchronous_core_parse_output_core(xml_file, base_dir) except Exception as e: log.error(f"Error parsing RAT output: {e}") return checkdata.Rat( @@ -577,7 +573,7 @@ def _synchronous_extract_parse_output(xml_file: str, base_dir: str) -> checkdata ) -def _synchronous_extract_parse_output_core(xml_file: str, base_dir: str) -> checkdata.Rat: +def _synchronous_core_parse_output_core(xml_file: str, base_dir: str) -> checkdata.Rat: """Parse the XML output from Apache RAT.""" tree = ElementTree.parse(xml_file) @@ -646,27 +642,27 @@ def _synchronous_extract_parse_output_core(xml_file: str, base_dir: str) -> chec ) -def _synchronous_extract_scan_root(archive_excludes_path: str | None, temp_dir: str) -> str: +def _synchronous_core_scan_root(archive_excludes_path: str | None, cache_dir: str) -> str: # Determine scan root based on archive .rat-excludes location if archive_excludes_path is not None: - scan_root = os.path.dirname(os.path.join(temp_dir, archive_excludes_path)) + scan_root = os.path.dirname(os.path.join(cache_dir, archive_excludes_path)) - # Verify that scan_root is inside temp_dir + # Verify that scan_root is inside cache_dir abs_scan_root = os.path.abspath(scan_root) - abs_temp_dir = os.path.abspath(temp_dir) - scan_root_is_inside = (abs_scan_root == abs_temp_dir) or abs_scan_root.startswith(abs_temp_dir + os.sep) + abs_cache_dir = os.path.abspath(cache_dir) + scan_root_is_inside = (abs_scan_root == abs_cache_dir) or abs_scan_root.startswith(abs_cache_dir + os.sep) if not scan_root_is_inside: - log.error(f"Scan root {scan_root} is outside temp_dir {temp_dir}") + log.error(f"Scan root {scan_root} is outside cache_dir {cache_dir}") raise RatError("Invalid archive structure: exclusion file path escapes extraction directory") log.info(f"Using {_RAT_EXCLUDES_FILENAME} directory as scan root: {scan_root}") - untracked_count = _count_files_outside_directory(temp_dir, scan_root) + untracked_count = _count_files_outside_directory(cache_dir, scan_root) if untracked_count > 0: log.error(f"Found {untracked_count} file(s) outside {_RAT_EXCLUDES_FILENAME} directory") raise RatError(f"Files exist outside {_RAT_EXCLUDES_FILENAME} directory ({untracked_count} found)") else: - scan_root = temp_dir - log.info(f"No archive {_RAT_EXCLUDES_FILENAME} found, using temp_dir as scan root: {scan_root}") + scan_root = cache_dir + log.info(f"No archive {_RAT_EXCLUDES_FILENAME} found, using cache_dir as scan root: {scan_root}") return scan_root diff --git a/tests/unit/test_checks_rat.py b/tests/unit/test_checks_rat.py index 60390840..ebe67458 100644 --- a/tests/unit/test_checks_rat.py +++ b/tests/unit/test_checks_rat.py @@ -17,6 +17,7 @@ import pathlib import shlex +import tarfile import pytest @@ -41,9 +42,10 @@ def rat_available() -> tuple[bool, bool]: return (java_ok, jar_ok) -def test_check_includes_command(rat_available: tuple[bool, bool]): +def test_check_includes_command(rat_available: tuple[bool, bool], tmp_path: pathlib.Path): _skip_if_unavailable(rat_available) - result = rat._synchronous(str(TEST_ARCHIVE), []) + cache_dir = _extract_test_archive(tmp_path, TEST_ARCHIVE) + result = rat._synchronous(str(cache_dir), []) command = _command_args(result.command) assert len(command) > 0 assert "java" in command @@ -53,32 +55,36 @@ def test_check_includes_command(rat_available: tuple[bool, bool]): assert result.directory == "." -def test_check_includes_excludes_source_none(rat_available: tuple[bool, bool]): +def test_check_includes_excludes_source_none(rat_available: tuple[bool, bool], tmp_path: pathlib.Path): _skip_if_unavailable(rat_available) - result = rat._synchronous(str(TEST_ARCHIVE), []) + cache_dir = _extract_test_archive(tmp_path, TEST_ARCHIVE) + result = rat._synchronous(str(cache_dir), []) assert result.excludes_source == "none" -def test_check_includes_excludes_source_policy(rat_available: tuple[bool, bool]): +def test_check_includes_excludes_source_policy(rat_available: tuple[bool, bool], tmp_path: pathlib.Path): _skip_if_unavailable(rat_available) - result = rat._synchronous(str(TEST_ARCHIVE), ["*.py"]) + cache_dir = _extract_test_archive(tmp_path, TEST_ARCHIVE) + result = rat._synchronous(str(cache_dir), ["*.py"]) assert result.excludes_source == "policy" -def test_excludes_archive_ignores_policy_when_file_exists(rat_available: tuple[bool, bool]): +def test_excludes_archive_ignores_policy_when_file_exists(rat_available: tuple[bool, bool], tmp_path: pathlib.Path): """When archive has .rat-excludes, ignore policy patterns even if provided.""" _skip_if_unavailable(rat_available) - result = rat._synchronous(str(TEST_ARCHIVE_WITH_RAT_EXCLUDES), ["*.py", "*.txt"]) + cache_dir = _extract_test_archive(tmp_path, TEST_ARCHIVE_WITH_RAT_EXCLUDES) + result = rat._synchronous(str(cache_dir), ["*.py", "*.txt"]) assert result.excludes_source == "archive" # Should NOT use the RAT policy file command = _command_args(result.command) assert rat._POLICY_EXCLUDES_FILENAME not in command -def test_excludes_archive_uses_rat_excludes_file(rat_available: tuple[bool, bool]): +def test_excludes_archive_uses_rat_excludes_file(rat_available: tuple[bool, bool], tmp_path: pathlib.Path): """When archive has .rat-excludes, use it and set source to archive.""" _skip_if_unavailable(rat_available) - result = rat._synchronous(str(TEST_ARCHIVE_WITH_RAT_EXCLUDES), []) + cache_dir = _extract_test_archive(tmp_path, TEST_ARCHIVE_WITH_RAT_EXCLUDES) + result = rat._synchronous(str(cache_dir), []) assert result.excludes_source == "archive" command = _command_args(result.command) assert "--input-exclude-file" in command @@ -88,10 +94,11 @@ def test_excludes_archive_uses_rat_excludes_file(rat_available: tuple[bool, bool assert result.directory == "apache-test-0.2" -def test_excludes_none_has_no_exclude_file(rat_available: tuple[bool, bool]): +def test_excludes_none_has_no_exclude_file(rat_available: tuple[bool, bool], tmp_path: pathlib.Path): """When neither archive nor policy, no exclude file should be used.""" _skip_if_unavailable(rat_available) - result = rat._synchronous(str(TEST_ARCHIVE), []) + cache_dir = _extract_test_archive(tmp_path, TEST_ARCHIVE) + result = rat._synchronous(str(cache_dir), []) assert result.excludes_source == "none" command = _command_args(result.command) assert "--input-exclude-file" not in command @@ -100,11 +107,12 @@ def test_excludes_none_has_no_exclude_file(rat_available: tuple[bool, bool]): assert rat._POLICY_EXCLUDES_FILENAME not in command -def test_excludes_policy_uses_atr_rat_excludes(rat_available: tuple[bool, bool]): +def test_excludes_policy_uses_atr_rat_excludes(rat_available: tuple[bool, bool], tmp_path: pathlib.Path): """When no archive .rat-excludes but policy exists, use policy file.""" _skip_if_unavailable(rat_available) + cache_dir = _extract_test_archive(tmp_path, TEST_ARCHIVE) # The second argument to rat._synchronous is a list of exclusions from policy - result = rat._synchronous(str(TEST_ARCHIVE), ["*.py"]) + result = rat._synchronous(str(cache_dir), ["*.py"]) assert result.excludes_source == "policy" command = _command_args(result.command) assert "--input-exclude-file" in command @@ -124,6 +132,10 @@ def test_sanitise_command_replaces_absolute_paths(): "/fake/path/rat_verify_abc123/rat-report.xml", "--input-exclude", ".rat-excludes", + "--input-exclude", + "/fake/rat_scratch_abc/.atr-policy-rat-excludes", + "--input-exclude-file", + "/fake/rat_scratch_abc/.atr-policy-rat-excludes", "--", ".", ] @@ -131,12 +143,22 @@ def test_sanitise_command_replaces_absolute_paths(): assert result[2] == "apache-rat-0.17.jar" assert result[4] == "rat-report.xml" assert result[6] == ".rat-excludes" + assert result[8] == ".atr-policy-rat-excludes" + assert result[10] == ".atr-policy-rat-excludes" def _command_args(command: str) -> list[str]: return shlex.split(command) +def _extract_test_archive(tmp_path: pathlib.Path, archive: pathlib.Path) -> pathlib.Path: + cache_dir = tmp_path / "cache" + cache_dir.mkdir() + with tarfile.open(archive) as tf: + tf.extractall(cache_dir, filter="data") + return cache_dir + + def _skip_if_unavailable(rat_available: tuple[bool, bool]) -> None: java_ok, jar_ok = rat_available if not java_ok: --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
