From: Daniel Robbins <drobb...@funtoo.org>

The content-hash layout is identical to the filename-hash layout,
except for these two differences:

1) A content digest is used instead of a filename digest.
2) The final element of the path returned from the get_path method
   corresponds to the complete content digest. The path is a
   function of the content digest alone.

Motivations to use the content-hash layout instead of the
filename-hash layout may include:

1) Since the file path is independent of the file name, file
name collisions cannot occur. This makes the content-hash
layout suitable for storage of multiple types of files (not
only gentoo distfiles). For example, it can be used to store
distfiles for multiple linux distros within the same tree,
with automatic deduplication based on content digest. This
layout can be used to store and distribute practically anything
(including binary packages for example).

2) Allows multiple revisions for the same distfile name. An
existing distfile can be updated, and if a user still has an
older copy of an ebuild repository (or an overlay), then a user
can successfully fetch a desired revision of the distfile as
long as it has not been purged from the mirror.

3) File integrity data is integrated into the layout itself,
making it very simple to verify the integrity of any file that
it contains. The only tool required is an implementation of
the chosen hash algorithm.

Bug: https://bugs.gentoo.org/756778
Signed-off-by: Zac Medico <zmed...@gentoo.org>
---
 lib/portage/package/ebuild/fetch.py    | 160 +++++++++++++++++++++++--
 lib/portage/tests/ebuild/test_fetch.py |  40 ++++++-
 2 files changed, 184 insertions(+), 16 deletions(-)

diff --git a/lib/portage/package/ebuild/fetch.py b/lib/portage/package/ebuild/fetch.py
index e0fecaf23..7d2ef93bf 100644
--- a/lib/portage/package/ebuild/fetch.py
+++ b/lib/portage/package/ebuild/fetch.py
@@ -344,6 +344,31 @@ _size_suffix_map = {
 }
 
 
+class DistfileName(str):
+       def __new__(cls, s, digests=None):
+               return super().__new__(cls, s)
+
+       def __init__(self, s, digests=None):
+               super().__init__()
+               self.digests = digests if digests is not None else {}
+
+       def digests_equal(self, other):
+               """
+               True if other shares an algorithm and no shared digest differs.
+               """
+               if not isinstance(other, DistfileName):
+                       return False
+               shared = False
+               for algo, digest in self.digests.items():
+                       theirs = other.digests.get(algo)
+                       if theirs is None:
+                               continue
+                       if theirs != digest:
+                               return False
+                       shared = True
+               return shared
+
+
 class FlatLayout:
        def get_path(self, filename):
                return filename
@@ -413,6 +438,90 @@ class FilenameHashLayout:
                return False
 
 
+class ContentHashLayout(FilenameHashLayout):
+       """
+       The content-hash layout is identical to the filename-hash layout,
+       except for these two differences:
+
+       1) A content digest is used instead of a filename digest.
+       2) The final element of the path returned from the get_path method
+          corresponds to the complete content digest. The path is a
+          function of the content digest alone.
+
+       Motivations to use the content-hash layout instead of the
+       filename-hash layout may include:
+
+       1) Since the file path is independent of the file name, file
+       name collisions cannot occur. This makes the content-hash
+       layout suitable for storage of multiple types of files (not
+       only gentoo distfiles). For example, it can be used to store
+       distfiles for multiple linux distros within the same tree,
+       with automatic deduplication based on content digest. This
+       layout can be used to store and distribute practically anything
+       (including binary packages for example).
+
+       2) Allows multiple revisions for the same distfile name. An
+       existing distfile can be updated, and if a user still has an
+       older copy of an ebuild repository (or an overlay), then a user
+       can successfully fetch a desired revision of the distfile as
+       long as it has not been purged from the mirror.
+
+       3) File integrity data is integrated into the layout itself,
+       making it very simple to verify the integrity of any file that
+       it contains. The only tool required is an implementation of
+       the chosen hash algorithm.
+       """
+
+       def get_path(self, filename):
+               """
+               Return the path for filename, which is a function of
+               filename.digests[self.algo] alone: one directory of c // 4 digest
+               characters per cutoff c, followed by the complete digest.
+               """
+               digest = remaining = filename.digests[self.algo]
+               ret = ""
+               for c in self.cutoffs:
+                       assert c % 4 == 0
+                       c = c // 4
+                       ret += remaining[:c] + "/"
+                       remaining = remaining[c:]
+               return ret + digest
+
+       def get_filenames(self, distdir):
+               """
+               Yields DistfileName instances each with filename corresponding
+               to a digest value for self.algo. These can be compared to other
+               DistfileName instances with their digests_equal method.
+               """
+               for filename in super().get_filenames(distdir):
+                       yield DistfileName(
+                               filename, digests={self.algo: os.path.basename(filename)}
+                       )
+
+       @staticmethod
+       def verify_args(args, filename=None):
+               """
+               If the filename argument is given, then supported hash
+               algorithms are constrained by digests available in the filename
+               digests attribute.
+
+               @param args: layout.conf entry args
+               @param filename: filename with digests attribute
+               @return: True if args are valid for available digest algorithms,
+                               and False otherwise
+               """
+               if len(args) != 3:
+                       return False
+               if filename is None:
+                       supported_algos = get_valid_checksum_keys()
+               else:
+                       supported_algos = filename.digests
+               algo = args[1].upper()
+               if algo not in supported_algos:
+                       return False
+               return FilenameHashLayout.verify_args(args)
+
+
 class MirrorLayoutConfig:
        """
        Class to read layout.conf from a mirror.
@@ -439,20 +548,41 @@ class MirrorLayoutConfig:
                self.structure = data
 
        @staticmethod
-       def validate_structure(val):
+       def validate_structure(val, filename=None):
+               """
+               If the filename argument is given, then supported hash
+               algorithms are constrained by digests available in the filename
+               digests attribute.
+
+               @param val: layout.conf entry args
+               @param filename: filename with digests attribute
+               @return: True if args are valid for available digest algorithms,
+                       and False otherwise
+               """
                if val[0] == 'flat':
                        return FlatLayout.verify_args(val)
-               if val[0] == 'filename-hash':
+               elif val[0] == 'filename-hash':
                        return FilenameHashLayout.verify_args(val)
+               elif val[0] == 'content-hash':
+                       return ContentHashLayout.verify_args(val, 
filename=filename)
                return False
 
-       def get_best_supported_layout(self):
+       def get_best_supported_layout(self, filename=None):
+               """
+               If the filename argument is given, then acceptable hash
+               algorithms are constrained by digests available in the filename
+               digests attribute.
+
+               @param filename: filename with digests attribute
+               """
                for val in self.structure:
-                       if self.validate_structure(val):
+                       if self.validate_structure(val, filename=filename):
                                if val[0] == 'flat':
                                        return FlatLayout(*val[1:])
-                               if val[0] == 'filename-hash':
+                               elif val[0] == 'filename-hash':
                                        return FilenameHashLayout(*val[1:])
+                               elif val[0] == 'content-hash':
+                                       return ContentHashLayout(*val[1:])
                # fallback
                return FlatLayout()
 
@@ -465,6 +595,8 @@ class MirrorLayoutConfig:
                                ret.append(FlatLayout(*val[1:]))
                        elif val[0] == 'filename-hash':
                                ret.append(FilenameHashLayout(*val[1:]))
+                       elif val[0] == 'content-hash':
+                               ret.append(ContentHashLayout(*val[1:]))
                if not ret:
                        ret.append(FlatLayout())
                return ret
@@ -515,7 +647,7 @@ def get_mirror_url(mirror_url, filename, mysettings, cache_path=None):
 
        # For some protocols, urlquote is required for correct behavior,
        # and it must not be used for other protocols like rsync and sftp.
-       path = mirror_conf.get_best_supported_layout().get_path(filename)
+       path = 
mirror_conf.get_best_supported_layout(filename=filename).get_path(filename)
        if urlparse(mirror_url).scheme in ('ftp', 'http', 'https'):
                path = urlquote(path)
        return mirror_url + "/distfiles/" + path
@@ -722,15 +854,23 @@ def fetch(myuris, mysettings, listonly=0, fetchonly=0,
        if hasattr(myuris, 'items'):
                for myfile, uri_set in myuris.items():
                        for myuri in uri_set:
-                               file_uri_tuples.append((myfile, myuri))
+                               file_uri_tuples.append(
+                                       (DistfileName(myfile, 
digests=mydigests.get(myfile)), myuri)
+                               )
                        if not uri_set:
-                               file_uri_tuples.append((myfile, None))
+                               file_uri_tuples.append(
+                                       (DistfileName(myfile, 
digests=mydigests.get(myfile)), None)
+                               )
        else:
                for myuri in myuris:
+                       # A plain sequence of URIs carries no file names, so derive
+                       # each distfile name from the basename of its own URI.
+                       uri_basename = os.path.basename(myuri)
+                       distfile = DistfileName(uri_basename, digests=mydigests.get(uri_basename))
                        if urlparse(myuri).scheme:
-                               file_uri_tuples.append((os.path.basename(myuri), myuri))
+                               file_uri_tuples.append((distfile, myuri))
                        else:
-                               file_uri_tuples.append((os.path.basename(myuri), None))
+                               file_uri_tuples.append((distfile, None))
 
        filedict = OrderedDict()
        primaryuri_dict = {}
diff --git a/lib/portage/tests/ebuild/test_fetch.py b/lib/portage/tests/ebuild/test_fetch.py
index c5ea8253b..73ae45ebf 100644
--- a/lib/portage/tests/ebuild/test_fetch.py
+++ b/lib/portage/tests/ebuild/test_fetch.py
@@ -7,7 +7,8 @@ import tempfile
 
 import portage
 from portage import shutil, os
-from portage.const import BASH_BINARY, PORTAGE_PYM_PATH
+from portage.checksum import checksum_str
+from portage.const import BASH_BINARY, MANIFEST2_HASH_DEFAULTS, 
PORTAGE_PYM_PATH
 from portage.tests import TestCase
 from portage.tests.resolver.ResolverPlayground import ResolverPlayground
 from portage.tests.util.test_socks5 import AsyncHTTPServer
@@ -18,8 +19,15 @@ from portage.util._async.SchedulerInterface import 
SchedulerInterface
 from portage.util._eventloop.global_event_loop import global_event_loop
 from portage.package.ebuild.config import config
 from portage.package.ebuild.digestgen import digestgen
-from portage.package.ebuild.fetch import (_download_suffix, fetch, FlatLayout,
-               FilenameHashLayout, MirrorLayoutConfig)
+from portage.package.ebuild.fetch import (
+       ContentHashLayout,
+       DistfileName,
+       _download_suffix,
+       fetch,
+       FilenameHashLayout,
+       FlatLayout,
+       MirrorLayoutConfig,
+)
 from _emerge.EbuildFetcher import EbuildFetcher
 from _emerge.Package import Package
 
@@ -102,6 +110,11 @@ class EbuildFetchTestCase(TestCase):
                                "1=filename-hash BLAKE2B 8",
                                "0=flat",
                        ),
+                       (
+                               "[structure]",
+                               "0=content-hash SHA512 8:8:8",
+                               "1=flat",
+                       ),
                )
 
                fetchcommand = 
portage.util.shlex_split(playground.settings["FETCHCOMMAND"])
@@ -142,9 +155,14 @@ class EbuildFetchTestCase(TestCase):
                                content["/distfiles/layout.conf"] = 
layout_data.encode("utf8")
 
                                for k, v in distfiles.items():
+                                       filename = DistfileName(
+                                               k,
+                                               digests=dict((algo, 
checksum_str(v, hashname=algo)) for algo in MANIFEST2_HASH_DEFAULTS),
+                                       )
+
                                        # mirror path
                                        for layout in layouts:
-                                               content["/distfiles/" + 
layout.get_path(k)] = v
+                                               content["/distfiles/" + 
layout.get_path(filename)] = v
                                        # upstream path
                                        content["/distfiles/{}.txt".format(k)] 
= v
 
@@ -499,14 +517,18 @@ class EbuildFetchTestCase(TestCase):
                                io.StringIO(conf))
 
        def test_filename_hash_layout_get_filenames(self):
+               filename = DistfileName(
+                       'foo-1.tar.gz',
+                       digests=dict((algo, checksum_str(b'', hashname=algo)) 
for algo in MANIFEST2_HASH_DEFAULTS),
+               )
                layouts = (
                        FlatLayout(),
                        FilenameHashLayout('SHA1', '4'),
                        FilenameHashLayout('SHA1', '8'),
                        FilenameHashLayout('SHA1', '8:16'),
                        FilenameHashLayout('SHA1', '8:16:24'),
+                       ContentHashLayout('SHA512', '8:8:8'),
                )
-               filename = 'foo-1.tar.gz'
 
                for layout in layouts:
                        distdir = tempfile.mkdtemp()
@@ -520,6 +542,12 @@ class EbuildFetchTestCase(TestCase):
                                with open(path, 'wb') as f:
                                        pass
 
-                               self.assertEqual([filename], 
list(layout.get_filenames(distdir)))
+                               file_list = list(layout.get_filenames(distdir))
+                               self.assertTrue(len(file_list) > 0)
+                               for filename_result in file_list:
+                                       if isinstance(filename_result, 
DistfileName):
+                                               
self.assertTrue(filename_result.digests_equal(filename))
+                                       else:
+                                               
self.assertEqual(filename_result, str(filename))
                        finally:
                                shutil.rmtree(distdir)
-- 
2.26.2


Reply via email to