This is an automated email from the ASF dual-hosted git repository. sbp pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tooling-trusted-releases.git
commit fa2ec3002f6febd0944c7b5c02147db67750547d Author: Sean B. Palmer <[email protected]> AuthorDate: Mon Mar 16 16:48:39 2026 +0000 Add file classifications to attestable data --- atr/attestable.py | 86 ++++++++++++++++++++++++++++++++--------- atr/detection.py | 2 +- atr/models/attestable.py | 15 +++++++ atr/storage/writers/revision.py | 11 +++--- atr/tasks/checks/__init__.py | 2 +- atr/tasks/quarantine.py | 2 +- tests/unit/test_attestable.py | 84 ++++++++++++++++++++++++++++++++++++++++ 7 files changed, 176 insertions(+), 26 deletions(-) diff --git a/atr/attestable.py b/atr/attestable.py index a020776d..1cf40c08 100644 --- a/atr/attestable.py +++ b/atr/attestable.py @@ -18,12 +18,14 @@ from __future__ import annotations import json -from typing import TYPE_CHECKING, Any +import pathlib +from typing import Any import aiofiles import aiofiles.os import pydantic +import atr.classify as classify import atr.hashes as hashes import atr.log as log import atr.models.attestable as models @@ -31,9 +33,6 @@ import atr.models.safe as safe import atr.paths as paths import atr.util as util -if TYPE_CHECKING: - import pathlib - def attestable_checks_path( project_name: safe.ProjectName, version_name: safe.VersionName, revision_number: safe.RevisionNumber @@ -73,14 +72,14 @@ async def load( project_name: safe.ProjectName, version_name: safe.VersionName, revision_number: safe.RevisionNumber, -) -> models.AttestableV1 | None: +) -> models.Attestable | None: file_path = attestable_path(project_name, version_name, revision_number) if not await aiofiles.os.path.isfile(file_path): return None try: async with aiofiles.open(file_path, encoding="utf-8") as f: data = json.loads(await f.read()) - return models.AttestableV1.model_validate(data) + return _parse_attestable(data) except (json.JSONDecodeError, pydantic.ValidationError) as e: log.warning(f"Could not parse {file_path}, starting fresh: {e}") return None @@ -124,7 +123,7 @@ async def load_paths( log.warning(f"Could not parse {file_path}: {e}") # combined = await load(project_name, version_name, revision_number) # if combined is not None: - # return combined.paths + # return path_hashes(combined) return None @@ -148,8 +147,8 @@ def migrate_to_paths_files() -> int: try: with open(json_file, encoding="utf-8") as f: data = json.loads(f.read()) - validated = models.AttestableV1.model_validate(data) - paths_result = models.AttestablePathsV1(paths=validated.paths) + validated = _parse_attestable(data) + paths_result = models.AttestablePathsV1(paths=path_hashes(validated)) tmp = target.with_suffix(".tmp") with open(tmp, "w", encoding="utf-8") as f: f.write(paths_result.model_dump_json(indent=2)) @@ -160,6 +159,26 @@ def migrate_to_paths_files() -> int: return count +def path_classification(attestable: models.Attestable, path_key: str) -> str | None: + if isinstance(attestable, models.AttestableV2): + entry = attestable.paths.get(path_key) + return entry.classification if (entry is not None) else None + return None + + +def path_hash(attestable: models.Attestable, path_key: str) -> str | None: + if isinstance(attestable, models.AttestableV2): + entry = attestable.paths.get(path_key) + return entry.content_hash if (entry is not None) else None + return attestable.paths.get(path_key) + + +def path_hashes(attestable: models.Attestable) -> dict[str, str]: + if isinstance(attestable, models.AttestableV2): + return {path_key: entry.content_hash for path_key, entry in attestable.paths.items()} + return dict(attestable.paths) + + async def paths_to_hashes_and_sizes(directory: pathlib.Path) -> tuple[dict[str, str], dict[str, int]]: path_to_hash: dict[str, str] = {} path_to_size: dict[str, int] = {} @@ -204,14 +223,17 @@ async def write_files_data( revision_number: safe.RevisionNumber, release_policy: dict[str, Any] | None, uploader_uid: str, - previous: models.AttestableV1 | None, + previous: models.Attestable | None, path_to_hash: dict[str, str], path_to_size: dict[str, int], + base_path: pathlib.Path, ) -> None: - result = _generate_files_data(path_to_hash, path_to_size, revision_number, release_policy, uploader_uid, previous) + result = _generate_files_data( + path_to_hash, path_to_size, revision_number, release_policy, uploader_uid, previous, base_path + ) file_path = attestable_path(project_name, version_name, revision_number) await util.atomic_write_file(file_path, result.model_dump_json(indent=2)) - paths_result = models.AttestablePathsV1(paths=result.paths) + paths_result = models.AttestablePathsV1(paths=path_hashes(result)) paths_file_path = attestable_paths_path(project_name, version_name, revision_number) await util.atomic_write_file(paths_file_path, paths_result.model_dump_json(indent=2)) checks_file_path = attestable_checks_path(project_name, version_name, revision_number) @@ -220,16 +242,33 @@ async def write_files_data( await f.write(models.AttestableChecksV2().model_dump_json(indent=2)) +def _compute_classifications( + path_to_hash: dict[str, str], + release_policy: dict[str, Any] | None, + base_path: pathlib.Path, +) -> dict[str, str]: + policy = release_policy or {} + source_matcher, binary_matcher = classify.matchers_from_policy( + policy.get("source_artifact_paths", []), + policy.get("binary_artifact_paths", []), + base_path, + ) + return { + path_key: classify.classify(pathlib.Path(path_key), base_path, source_matcher, binary_matcher).value + for path_key in path_to_hash + } + + def _compute_hashes_with_attribution( # noqa: C901 current_hash_to_paths: dict[str, set[str]], path_to_size: dict[str, int], - previous: models.AttestableV1 | None, + previous: models.Attestable | None, uploader_uid: str, revision_number: safe.RevisionNumber, ) -> dict[str, models.HashEntry]: previous_hash_to_paths: dict[str, set[str]] = {} if previous is not None: - for path_key, hash_ref in previous.paths.items(): + for path_key, hash_ref in path_hashes(previous).items(): previous_hash_to_paths.setdefault(hash_ref, set()).add(path_key) new_hashes: dict[str, models.HashEntry] = {} @@ -271,8 +310,9 @@ def _generate_files_data( revision_number: safe.RevisionNumber, release_policy: dict[str, Any] | None, uploader_uid: str, - previous: models.AttestableV1 | None, -) -> models.AttestableV1: + previous: models.Attestable | None, + base_path: pathlib.Path, +) -> models.AttestableV2: current_hash_to_paths: dict[str, set[str]] = {} for path_key, hash_ref in path_to_hash.items(): current_hash_to_paths.setdefault(hash_ref, set()).add(path_key) @@ -281,12 +321,22 @@ def _generate_files_data( current_hash_to_paths, path_to_size, previous, uploader_uid, revision_number ) - return models.AttestableV1( - paths=dict(path_to_hash), + classifications = _compute_classifications(path_to_hash, release_policy, base_path) + return models.AttestableV2( hashes=dict(new_hashes), + paths={ + path_key: models.PathEntryV2(content_hash=hash_ref, classification=classifications[path_key]) + for path_key, hash_ref in path_to_hash.items() + }, policy=release_policy or {}, ) +def _parse_attestable(data: dict[str, object]) -> models.Attestable: + if data.get("version") == 2: + return models.AttestableV2.model_validate(data) + return models.AttestableV1.model_validate(data) + + def _path_basename(path_key: str) -> str: return path_key.rsplit("/", maxsplit=1)[-1] diff --git a/atr/detection.py b/atr/detection.py index 7b13efe5..22e73c5c 100644 --- a/atr/detection.py +++ b/atr/detection.py @@ -100,7 +100,7 @@ def deduplicate_quarantine_archives(archive_paths: list[str], path_to_hash: dict def detect_archives_requiring_quarantine( - path_to_hash: dict[str, str], previous_attestable: models.AttestableV1 | None + path_to_hash: dict[str, str], previous_attestable: models.Attestable | None ) -> list[str]: quarantine_paths: list[str] = [] for path_key, hash_ref in path_to_hash.items(): diff --git a/atr/models/attestable.py b/atr/models/attestable.py index 1e1bd7bb..bf1b5866 100644 --- a/atr/models/attestable.py +++ b/atr/models/attestable.py @@ -48,3 +48,18 @@ class AttestableV1(schema.Strict): paths: dict[str, str] = schema.factory(dict) hashes: dict[str, HashEntry] = schema.factory(dict) policy: dict[str, Any] = schema.factory(dict) + + +class PathEntryV2(schema.Strict): + content_hash: str + classification: str + + +class AttestableV2(schema.Strict): + version: Literal[2] = 2 + hashes: dict[str, HashEntry] = schema.factory(dict) + paths: dict[str, PathEntryV2] = schema.factory(dict) + policy: dict[str, Any] = schema.factory(dict) + + +type Attestable = AttestableV1 | AttestableV2 diff --git a/atr/storage/writers/revision.py b/atr/storage/writers/revision.py index d5c02dfa..0acf071a 100644 --- a/atr/storage/writers/revision.py +++ b/atr/storage/writers/revision.py @@ -82,7 +82,7 @@ async def finalise_revision( old_revision: sql.Revision | None, path_to_hash: dict[str, str], path_to_size: dict[str, int], - previous_attestable: atr.models.attestable.AttestableV1 | None, + previous_attestable: atr.models.attestable.Attestable | None, project_name: safe.ProjectName, release: sql.Release, release_name: safe.ReleaseName, @@ -137,7 +137,7 @@ async def _commit_new_revision( merge_base_revision_name: str | None, path_to_hash: dict[str, str], path_to_size: dict[str, int], - previous_attestable: atr.models.attestable.AttestableV1 | None, + previous_attestable: atr.models.attestable.Attestable | None, project_name: safe.ProjectName, release: sql.Release, release_name: str, @@ -202,6 +202,7 @@ async def _commit_new_revision( previous_attestable, path_to_hash, path_to_size, + new_revision_dir, ) # Commit to end the transaction started by data.begin_immediate @@ -241,13 +242,13 @@ async def _lock_and_merge( old_revision: sql.Revision | None, path_to_hash: dict[str, str], path_to_size: dict[str, int], - previous_attestable: atr.models.attestable.AttestableV1 | None, + previous_attestable: atr.models.attestable.Attestable | None, project_name: safe.ProjectName, release: sql.Release, _release_name: safe.ReleaseName, temp_dir_path: pathlib.Path, version_name: safe.VersionName, -) -> tuple[atr.models.attestable.AttestableV1 | None, str | None, str | None, sql.Release]: +) -> tuple[atr.models.attestable.Attestable | None, str | None, str | None, sql.Release]: # Acquire the write lock # We need this write lock for moving the directory afterwards atomically # But it also helps to make models.populate_revision_sequence_and_name safe against races @@ -434,7 +435,7 @@ class CommitteeParticipant(FoundationCommitter): if merge_enabled and (old_revision is not None): base_dir = old_release_dir base_inodes = await asyncio.to_thread(util.paths_to_inodes, base_dir) - base_hashes = dict(previous_attestable.paths) if (previous_attestable is not None) else {} + base_hashes = attestable.path_hashes(previous_attestable) if (previous_attestable is not None) else {} n_inodes = await asyncio.to_thread(util.paths_to_inodes, temp_dir_path) except Exception: await aioshutil.rmtree(temp_dir) diff --git a/atr/tasks/checks/__init__.py b/atr/tasks/checks/__init__.py index 16f1bb8c..d10dd4a8 100644 --- a/atr/tasks/checks/__init__.py +++ b/atr/tasks/checks/__init__.py @@ -357,7 +357,7 @@ async def resolve_cache_key( policy_dict = _coerce_policy_nulls(attestable_data.policy) policy = sql.ReleasePolicy.model_validate(policy_dict) if not ignore_path: - file_hash = attestable_data.paths.get(file) if file else None + file_hash = attestable.path_hash(attestable_data, file) if file else None else: # TODO: Is this fallback valid / necessary? Or should we bail out if there's no attestable data? policy = release.release_policy or release.project.release_policy diff --git a/atr/tasks/quarantine.py b/atr/tasks/quarantine.py index 643e1af8..d042df0c 100644 --- a/atr/tasks/quarantine.py +++ b/atr/tasks/quarantine.py @@ -326,7 +326,7 @@ async def _promote( if old_revision is not None: old_release_dir = paths.release_directory_base(release) / old_revision.number base_inodes = await asyncio.to_thread(util.paths_to_inodes, old_release_dir) - base_hashes = dict(previous_attestable.paths) if (previous_attestable is not None) else {} + base_hashes = attestable.path_hashes(previous_attestable) if (previous_attestable is not None) else {} n_inodes = await asyncio.to_thread(util.paths_to_inodes, quarantine_dir_path) async with revision.SafeSession(quarantine_dir) as data: diff --git a/tests/unit/test_attestable.py b/tests/unit/test_attestable.py index 24bd4a6a..0d8619c8 100644 --- a/tests/unit/test_attestable.py +++ b/tests/unit/test_attestable.py @@ -15,10 +15,51 @@ # specific language governing permissions and limitations # under the License. +import pathlib + import atr.attestable as attestable import atr.models.attestable as models +def test_attestable_v2_round_trip(): + original = models.AttestableV2( + hashes={"h1": models.HashEntry(size=100, uploaders=[("alice", "00001")])}, + paths={ + "a.tar.gz": models.PathEntryV2(content_hash="h1", classification="source"), + "a.tar.gz.sha512": models.PathEntryV2(content_hash="h2", classification="metadata"), + }, + policy={"min_hours": 72}, + ) + + loaded = models.AttestableV2.model_validate_json(original.model_dump_json()) + + assert loaded == original + assert loaded.version == 2 + assert loaded.paths["a.tar.gz"].content_hash == "h1" + assert loaded.paths["a.tar.gz"].classification == "source" + assert loaded.paths["a.tar.gz.sha512"].content_hash == "h2" + assert loaded.paths["a.tar.gz.sha512"].classification == "metadata" + + +def test_generate_files_data_returns_attestable_v2(): + data = attestable._generate_files_data( + path_to_hash={"apache-widget-1.0-src.tar.gz": "h1", "apache-widget-1.0-src.tar.gz.sha512": "h2"}, + path_to_size={"apache-widget-1.0-src.tar.gz": 100, "apache-widget-1.0-src.tar.gz.sha512": 64}, + revision_number="00001", + release_policy=None, + uploader_uid="alice", + previous=None, + base_path=pathlib.Path("/test"), + ) + + assert isinstance(data, models.AttestableV2) + assert data.version == 2 + assert data.paths["apache-widget-1.0-src.tar.gz"].content_hash == "h1" + assert data.paths["apache-widget-1.0-src.tar.gz"].classification == "source" + assert data.paths["apache-widget-1.0-src.tar.gz.sha512"].content_hash == "h2" + assert data.paths["apache-widget-1.0-src.tar.gz.sha512"].classification == "metadata" + + def test_hash_entry_basenames_round_trip(): entry = models.HashEntry( size=123, @@ -64,8 +105,51 @@ def test_hash_metadata_basenames_are_cumulative_and_unique(): release_policy=None, uploader_uid="bob", previous=previous, + base_path=pathlib.Path("/test"), ) assert data.hashes["h1"].basenames == ["apache-widget-1.0-src.tar.gz", "apache-widget-1.0.zip"] assert data.hashes["h1"].uploaders == [("alice", "00001"), ("bob", "00002")] assert data.hashes["h2"].basenames == ["readme.txt"] + + +def test_parse_attestable_v1(): + data = {"version": 1, "paths": {"a.tar.gz": "h1"}, "hashes": {}, "policy": {}} + + result = attestable._parse_attestable(data) + + assert isinstance(result, models.AttestableV1) + assert result.version == 1 + assert result.paths == {"a.tar.gz": "h1"} + + +def test_parse_attestable_v2(): + data = { + "version": 2, + "paths": { + "a.tar.gz": {"content_hash": "h1", "classification": "source"}, + }, + "hashes": {}, + "policy": {}, + } + + result = attestable._parse_attestable(data) + + assert isinstance(result, models.AttestableV2) + assert result.version == 2 + assert result.paths["a.tar.gz"].content_hash == "h1" + assert result.paths["a.tar.gz"].classification == "source" + + +def test_path_hashes_support_v1_and_v2(): + v1 = models.AttestableV1(paths={"a.tar.gz": "h1"}, hashes={}, policy={}) + v2 = models.AttestableV2( + paths={"a.tar.gz": models.PathEntryV2(content_hash="h1", classification="source")}, + hashes={}, + policy={}, + ) + + assert attestable.path_hashes(v1) == {"a.tar.gz": "h1"} + assert attestable.path_hashes(v2) == {"a.tar.gz": "h1"} + assert attestable.path_hash(v2, "a.tar.gz") == "h1" + assert attestable.path_classification(v2, "a.tar.gz") == "source" --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
