Add a --content-db option which is required for the content-hash
layout because its file listings return content digests instead of
distfile names.

The content db serves to translate content digests to distfile
names, and distfile names to content digests. All keys have a
prefix separated by a colon. For digest keys, the prefix is the
hash algorithm name. For filename keys, the prefix is "filename".

The value associated with a digest key is a set of file names. The
value associated with a distfile key is a set of content revisions.
Each content revision is expressed as a dictionary of digests which
is suitable for construction of a DistfileName instance.

Bug: https://bugs.gentoo.org/756778
Signed-off-by: Zac Medico <zmed...@gentoo.org>
---
[PATCH v3] changed the value associated with a digest key to a set
of file names, and fixed ContentDB.remove to preserve independent
references to identical content (like removing one of multiple
hardlinks).

 lib/portage/_emirrordist/Config.py           |   8 +-
 lib/portage/_emirrordist/ContentDB.py        | 178 +++++++++++++++++++
 lib/portage/_emirrordist/DeletionIterator.py |  25 ++-
 lib/portage/_emirrordist/DeletionTask.py     |   8 +
 lib/portage/_emirrordist/FetchTask.py        |   5 +-
 lib/portage/_emirrordist/main.py             |  15 +-
 lib/portage/package/ebuild/fetch.py          |   8 +-
 lib/portage/tests/ebuild/test_fetch.py       |  14 ++
 man/emirrordist.1                            |   6 +-
 9 files changed, 256 insertions(+), 11 deletions(-)
 create mode 100644 lib/portage/_emirrordist/ContentDB.py

diff --git a/lib/portage/_emirrordist/Config.py 
b/lib/portage/_emirrordist/Config.py
index 4bee4f45e..cfe944040 100644
--- a/lib/portage/_emirrordist/Config.py
+++ b/lib/portage/_emirrordist/Config.py
@@ -1,4 +1,4 @@
-# Copyright 2013-2020 Gentoo Authors
+# Copyright 2013-2021 Gentoo Authors
 # Distributed under the terms of the GNU General Public License v2
 
 import copy
@@ -10,6 +10,7 @@ import time
 from portage import os
 from portage.package.ebuild.fetch import MirrorLayoutConfig
 from portage.util import grabdict, grablines
+from .ContentDB import ContentDB
 
 class Config:
        def __init__(self, options, portdb, event_loop):
@@ -65,6 +66,11 @@ class Config:
                        self.distfiles_db = self._open_shelve(
                                options.distfiles_db, 'distfiles')
 
+               self.content_db = None
+               if options.content_db is not None:
+                       self.content_db = ContentDB(self._open_shelve(
+                               options.content_db, 'content'))
+
                self.deletion_db = None
                if options.deletion_db is not None:
                        self.deletion_db = self._open_shelve(
diff --git a/lib/portage/_emirrordist/ContentDB.py 
b/lib/portage/_emirrordist/ContentDB.py
new file mode 100644
index 000000000..7084cecff
--- /dev/null
+++ b/lib/portage/_emirrordist/ContentDB.py
@@ -0,0 +1,178 @@
+# Copyright 2021 Gentoo Authors
+# Distributed under the terms of the GNU General Public License v2
+
+import logging
+import operator
+import shelve
+import typing
+
+from portage.package.ebuild.fetch import DistfileName
+
+
+class ContentDB:
+       """
+       The content db serves to translate content digests to distfiles
+       names, and distfiles names to content digests. All keys have a
+       prefix separated by a colon. For digest keys, the prefix is the
+       hash algorithm name. For filename keys, the prefix is "filename".
+
+       The value associated with a digest key is a set of file names. The
+       value associated with a distfile key is a set of content revisions.
+       Each content revision is expressed as a dictionary of digests which
+       is suitable for construction of a DistfileName instance.
+       """
+
+       def __init__(self, shelve_instance: shelve.Shelf):
+               self._shelve = shelve_instance
+
+       def add(self, filename: DistfileName):
+               """
+               Add file name and digests.
+
+               @param filename: file name with digests attribute
+               """
+               distfile_str = str(filename)
+               distfile_key = "filename:{}".format(distfile_str)
+               for k, v in filename.digests.items():
+                       if k != "size":
+                               digest_key = "{}:{}".format(k, v).lower()
+                               try:
+                                       digest_files = self._shelve[digest_key]
+                               except KeyError:
+                                       digest_files = set()
+                               digest_files.add(distfile_str)
+                               self._shelve[digest_key] = digest_files
+               try:
+                       content_revisions = self._shelve[distfile_key]
+               except KeyError:
+                       content_revisions = set()
+
+               revision_key = tuple(
+                       sorted(
+                               (
+                                       (algo.lower(), 
filename.digests[algo].lower())
+                                       for algo in filename.digests
+                                       if algo != "size"
+                               ),
+                               key=operator.itemgetter(0),
+                       )
+               )
+               content_revisions.add(revision_key)
+               self._shelve[distfile_key] = content_revisions
+
+       def remove(self, filename: DistfileName):
+               """
+               Remove a file name from the database. If identical content is 
still
+               referenced by one or more other file names, then those 
references
+               are preserved (like removing one of many hardlinks).
+
+               @param filename: file name with digests attribute
+               """
+               distfile_key = "filename:{}".format(filename)
+               try:
+                       content_revisions = self._shelve[distfile_key]
+               except KeyError:
+                       pass
+               else:
+                       for revision_key in content_revisions:
+                               for k, v in revision_key:
+                                       digest_key = "{}:{}".format(k, v)
+                                       try:
+                                               digest_files = 
self._shelve[digest_key]
+                                       except KeyError:
+                                               digest_files = set()
+
+                                       try:
+                                               digest_files.remove(filename)
+                                       except KeyError:
+                                               pass
+                                       else:
+                                               if digest_files:
+                                                       
self._shelve[digest_key] = digest_files
+                                               else:
+                                                       try:
+                                                               del 
self._shelve[digest_key]
+                                                       except KeyError:
+                                                               pass
+
+                       logging.debug(("drop '%s' from content db") % filename)
+                       try:
+                               del self._shelve[distfile_key]
+                       except KeyError:
+                               pass
+
+       def get_filenames_translate(
+               self, filename: typing.Union[str, DistfileName]
+       ) -> typing.Generator[DistfileName, None, None]:
+               """
+               Translate distfiles content digests to distfile names.
+               If filename is already a distfile name, then it will pass
+               through unchanged.
+
+               @param filename: A filename listed by layout get_filenames
+               @return: The distfile name, translated from the corresponding
+                               content digest when necessary
+               """
+               if not isinstance(filename, DistfileName):
+                       filename = DistfileName(filename)
+
+               # Match content digests with zero or more content revisions.
+               matched_revisions = {}
+
+               for k, v in filename.digests.items():
+                       digest_item = (k.lower(), v.lower())
+                       digest_key = "{}:{}".format(*digest_item)
+                       try:
+                               digest_files = self._shelve[digest_key]
+                       except KeyError:
+                               continue
+
+                       for distfile_str in digest_files:
+                               matched_revisions.setdefault(distfile_str, 
set())
+                               try:
+                                       content_revisions = 
self._shelve["filename:{}".format(distfile_str)]
+                               except KeyError:
+                                       pass
+                               else:
+                                       for revision_key in content_revisions:
+                                               if (
+                                                       digest_item in 
revision_key
+                                                       and revision_key not in 
matched_revisions[distfile_str]
+                                               ):
+                                                       
matched_revisions[distfile_str].add(revision_key)
+                                                       yield 
DistfileName(distfile_str, digests=dict(revision_key))
+
+               if not any(matched_revisions.values()):
+                       # Since filename matched zero content revisions, allow
+                       # it to pass through unchanged (on the path toward 
deletion).
+                       yield filename
+
+       def __len__(self):
+               return len(self._shelve)
+
+       def __contains__(self, k):
+               return k in self._shelve
+
+       def __iter__(self):
+               return self._shelve.__iter__()
+
+       def items(self):
+               return self._shelve.items()
+
+       def __setitem__(self, k, v):
+               self._shelve[k] = v
+
+       def __getitem__(self, k):
+               return self._shelve[k]
+
+       def __delitem__(self, k):
+               del self._shelve[k]
+
+       def get(self, k, *args):
+               return self._shelve.get(k, *args)
+
+       def close(self):
+               self._shelve.close()
+
+       def clear(self):
+               self._shelve.clear()
diff --git a/lib/portage/_emirrordist/DeletionIterator.py 
b/lib/portage/_emirrordist/DeletionIterator.py
index 08985ed6c..ab4309f9a 100644
--- a/lib/portage/_emirrordist/DeletionIterator.py
+++ b/lib/portage/_emirrordist/DeletionIterator.py
@@ -1,10 +1,12 @@
-# Copyright 2013-2019 Gentoo Authors
+# Copyright 2013-2021 Gentoo Authors
 # Distributed under the terms of the GNU General Public License v2
 
+import itertools
 import logging
 import stat
 
 from portage import os
+from portage.package.ebuild.fetch import DistfileName
 from .DeletionTask import DeletionTask
 
 class DeletionIterator:
@@ -21,8 +23,25 @@ class DeletionIterator:
                deletion_delay = self._config.options.deletion_delay
                start_time = self._config.start_time
                distfiles_set = set()
-               for layout in self._config.layouts:
-                       distfiles_set.update(layout.get_filenames(distdir))
+               distfiles_set.update(
+                       (
+                               filename
+                               if isinstance(filename, DistfileName)
+                               else DistfileName(filename)
+                               for filename in itertools.chain.from_iterable(
+                                       layout.get_filenames(distdir) for 
layout in self._config.layouts
+                               )
+                       )
+                       if self._config.content_db is None
+                       else itertools.chain.from_iterable(
+                               (
+                                       
self._config.content_db.get_filenames_translate(filename)
+                                       for filename in 
itertools.chain.from_iterable(
+                                               layout.get_filenames(distdir) 
for layout in self._config.layouts
+                                       )
+                               )
+                       )
+               )
                for filename in distfiles_set:
                        # require at least one successful stat()
                        exceptions = []
diff --git a/lib/portage/_emirrordist/DeletionTask.py 
b/lib/portage/_emirrordist/DeletionTask.py
index 5eb01d840..73493c5a1 100644
--- a/lib/portage/_emirrordist/DeletionTask.py
+++ b/lib/portage/_emirrordist/DeletionTask.py
@@ -5,6 +5,7 @@ import errno
 import logging
 
 from portage import os
+from portage.package.ebuild.fetch import ContentHashLayout
 from portage.util._async.FileCopier import FileCopier
 from _emerge.CompositeTask import CompositeTask
 
@@ -99,6 +100,10 @@ class DeletionTask(CompositeTask):
        def _delete_links(self):
                success = True
                for layout in self.config.layouts:
+                       if isinstance(layout, ContentHashLayout) and not 
self.distfile.digests:
+                               logging.debug(("_delete_links: '%s' has "
+                                       "no digests") % self.distfile)
+                               continue
                        distfile_path = os.path.join(
                                self.config.options.distfiles,
                                layout.get_path(self.distfile))
@@ -134,6 +139,9 @@ class DeletionTask(CompositeTask):
                                logging.debug(("drop '%s' from "
                                        "distfiles db") % self.distfile)
 
+               if self.config.content_db is not None:
+                       self.config.content_db.remove(self.distfile)
+
                if self.config.deletion_db is not None:
                        try:
                                del self.config.deletion_db[self.distfile]
diff --git a/lib/portage/_emirrordist/FetchTask.py 
b/lib/portage/_emirrordist/FetchTask.py
index 997762082..5a48f91cd 100644
--- a/lib/portage/_emirrordist/FetchTask.py
+++ b/lib/portage/_emirrordist/FetchTask.py
@@ -1,4 +1,4 @@
-# Copyright 2013-2020 Gentoo Authors
+# Copyright 2013-2021 Gentoo Authors
 # Distributed under the terms of the GNU General Public License v2
 
 import collections
@@ -47,6 +47,9 @@ class FetchTask(CompositeTask):
                        # Convert _pkg_str to str in order to prevent pickle 
problems.
                        self.config.distfiles_db[self.distfile] = str(self.cpv)
 
+               if self.config.content_db is not None:
+                       self.config.content_db.add(self.distfile)
+
                if not self._have_needed_digests():
                        msg = "incomplete digests: %s" % " ".join(self.digests)
                        self.scheduler.output(msg, background=self.background,
diff --git a/lib/portage/_emirrordist/main.py b/lib/portage/_emirrordist/main.py
index 8d00a05f5..2200ec715 100644
--- a/lib/portage/_emirrordist/main.py
+++ b/lib/portage/_emirrordist/main.py
@@ -1,4 +1,4 @@
-# Copyright 2013-2020 Gentoo Authors
+# Copyright 2013-2021 Gentoo Authors
 # Distributed under the terms of the GNU General Public License v2
 
 import argparse
@@ -7,6 +7,7 @@ import sys
 
 import portage
 from portage import os
+from portage.package.ebuild.fetch import ContentHashLayout
 from portage.util import normalize_path, _recursive_file_list
 from portage.util._async.run_main_scheduler import run_main_scheduler
 from portage.util._async.SchedulerInterface import SchedulerInterface
@@ -151,6 +152,12 @@ common_options = (
                        "distfile belongs to",
                "metavar"  : "FILE"
        },
+       {
+               "longopt"  : "--content-db",
+               "help"     : "database file used to map content digests to "
+                       "distfiles names (required for content-hash layout)",
+               "metavar"  : "FILE"
+       },
        {
                "longopt"  : "--recycle-dir",
                "help"     : "directory for extended retention of files that "
@@ -441,6 +448,12 @@ def emirrordist_main(args):
                if not options.mirror:
                        parser.error('No action specified')
 
+               if options.delete and config.content_db is None:
+                       for layout in config.layouts:
+                               if isinstance(layout, ContentHashLayout):
+                                       parser.error("content-hash layout 
requires "
+                                               "--content-db to be specified")
+
                returncode = os.EX_OK
 
                if options.mirror:
diff --git a/lib/portage/package/ebuild/fetch.py 
b/lib/portage/package/ebuild/fetch.py
index a683793f0..73abec595 100644
--- a/lib/portage/package/ebuild/fetch.py
+++ b/lib/portage/package/ebuild/fetch.py
@@ -365,10 +365,10 @@ class DistfileName(str):
        In order to prepare for a migration from filename-hash to
        content-hash layout, all consumers of the layout get_filenames
        method need to be updated to work with content digests as a
-       substitute for distfile names. For example, in order to prepare
-       emirrordist for content-hash, a key-value store needs to be
-       added as a means to associate distfile names with content
-       digest values yielded by the content-hash get_filenames
+       substitute for distfile names. For example, emirrordist requires
+       the --content-db option when working with a content-hash layout,
+       which serves as a means to associate distfile names
+       with content digest values yielded by the content-hash get_filenames
        implementation.
        """
        def __new__(cls, s, digests=None):
diff --git a/lib/portage/tests/ebuild/test_fetch.py 
b/lib/portage/tests/ebuild/test_fetch.py
index d50a4cbfc..881288cdc 100644
--- a/lib/portage/tests/ebuild/test_fetch.py
+++ b/lib/portage/tests/ebuild/test_fetch.py
@@ -172,6 +172,16 @@ class EbuildFetchTestCase(TestCase):
                                with open(os.path.join(settings['DISTDIR'], 
'layout.conf'), 'wt') as f:
                                        f.write(layout_data)
 
+                               if any(isinstance(layout, ContentHashLayout) 
for layout in layouts):
+                                       content_db = 
os.path.join(playground.eprefix, 'var/db/emirrordist/content.db')
+                                       
os.makedirs(os.path.dirname(content_db), exist_ok=True)
+                                       try:
+                                               os.unlink(content_db)
+                                       except OSError:
+                                               pass
+                               else:
+                                       content_db = None
+
                                # Demonstrate that fetch preserves a stale file 
in DISTDIR when no digests are given.
                                foo_uri = {'foo': 
('{scheme}://{host}:{port}/distfiles/foo'.format(scheme=scheme, host=host, 
port=server.server_port),)}
                                foo_path = os.path.join(settings['DISTDIR'], 
'foo')
@@ -233,9 +243,13 @@ class EbuildFetchTestCase(TestCase):
                                        os.path.join(self.bindir, 
'emirrordist'),
                                        '--distfiles', settings['DISTDIR'],
                                        '--config-root', settings['EPREFIX'],
+                                       '--delete',
                                        '--repositories-configuration', 
settings.repositories.config_string(),
                                        '--repo', 'test_repo', '--mirror')
 
+                               if content_db is not None:
+                                       emirrordist_cmd = emirrordist_cmd + 
('--content-db', content_db,)
+
                                env = settings.environ()
                                env['PYTHONPATH'] = ':'.join(
                                        filter(None, [PORTAGE_PYM_PATH] + 
os.environ.get('PYTHONPATH', '').split(':')))
diff --git a/man/emirrordist.1 b/man/emirrordist.1
index 45108ef8c..7ad10dfd0 100644
--- a/man/emirrordist.1
+++ b/man/emirrordist.1
@@ -1,4 +1,4 @@
-.TH "EMIRRORDIST" "1" "Dec 2015" "Portage VERSION" "Portage"
+.TH "EMIRRORDIST" "1" "Feb 2021" "Portage VERSION" "Portage"
 .SH "NAME"
 emirrordist \- a fetch tool for mirroring of package distfiles
 .SH SYNOPSIS
@@ -66,6 +66,10 @@ reporting purposes. Opened in append mode.
 Log file for scheduled deletions, with tab\-delimited output, for
 reporting purposes. Overwritten with each run.
 .TP
+\fB\-\-content\-db\fR=\fIFILE\fR
+Database file used to pair content digests with distfiles names
+(required for content\-hash layout).
+.TP
 \fB\-\-delete\fR
 Enable deletion of unused distfiles.
 .TP
-- 
2.26.2


Reply via email to