commit: 8b1f9dbd925ccf9c23909116c56eaa4d4f996474 Author: Daniel Robbins <drobbins <AT> funtoo <DOT> org> AuthorDate: Fri Oct 13 21:33:19 2017 +0000 Commit: Zac Medico <zmedico <AT> gentoo <DOT> org> CommitDate: Sat Oct 14 04:44:29 2017 +0000 URL: https://gitweb.gentoo.org/proj/portage.git/commit/?id=8b1f9dbd
portdbapi: cache catpkg to repository mappings In order to avoid performance problems as the number of repositories increases, use a cache of catpkg to repository mappings to optimize findname2, cp_list, and getRepositories methods. Bug: https://bugs.gentoo.org/634210 Closes: https://github.com/gentoo/portage/pull/218 NEWS | 7 +++ RELEASE-NOTES | 6 +++ pym/_emerge/depgraph.py | 4 +- pym/portage/dbapi/porttree.py | 103 +++++++++++++++++++++++++++++++++++++----- 4 files changed, 107 insertions(+), 13 deletions(-) diff --git a/NEWS b/NEWS index 60a436522..c773530e3 100644 --- a/NEWS +++ b/NEWS @@ -1,5 +1,12 @@ News (mainly features/major bug fixes) +portage-2.3.12 +---------------- +* better_cache implemented to use less expensive os.listdir() instead of + os.stat() operations to scan for ebuilds. Avoids exhaustively scanning + overlays for all ebuilds which allows Portage to not slow down significantly + with lots of overlays enabled. (Daniel Robbins) + portage-2.3.7 ----------------- * eapply_user combines patch basenames from all matched directories into a diff --git a/RELEASE-NOTES b/RELEASE-NOTES index 81c54e550..749322aaa 100644 --- a/RELEASE-NOTES +++ b/RELEASE-NOTES @@ -1,6 +1,12 @@ Release Notes; upgrade information mainly. 
Features/major bugfixes are listed in NEWS +portage-2.3.12 +================================== +* Bug Fixes: + - Bug 634210 optimize portdbapi performance to handle large numbers + of repositories (Daniel Robbins) + portage-2.3.11 ================================== * Bug Fixes: diff --git a/pym/_emerge/depgraph.py b/pym/_emerge/depgraph.py index 751111fb3..f54acdc26 100644 --- a/pym/_emerge/depgraph.py +++ b/pym/_emerge/depgraph.py @@ -5039,7 +5039,7 @@ class depgraph(object): if atom.soname: repo_list = [None] elif atom.repo is None and hasattr(db, "getRepositories"): - repo_list = db.getRepositories() + repo_list = db.getRepositories(catpkg=atom.cp) else: repo_list = [atom.repo] @@ -5490,7 +5490,7 @@ class depgraph(object): atom_set = InternalPackageSet(initial_atoms=(atom,), allow_repo=True) if atom.repo is None and hasattr(db, "getRepositories"): - repo_list = db.getRepositories() + repo_list = db.getRepositories(catpkg=atom_exp.cp) else: repo_list = [atom.repo] diff --git a/pym/portage/dbapi/porttree.py b/pym/portage/dbapi/porttree.py index a3254d017..53edcd18f 100644 --- a/pym/portage/dbapi/porttree.py +++ b/pym/portage/dbapi/porttree.py @@ -43,6 +43,8 @@ import os as _os import sys import traceback import warnings +import errno +import collections try: from urllib.parse import urlparse @@ -253,6 +255,7 @@ class portdbapi(dbapi): "RESTRICT", "SLOT", "DEFINED_PHASES", "REQUIRED_USE"]) self._aux_cache = {} + self._better_cache = None self._broken_ebuilds = set() @property @@ -342,12 +345,21 @@ class portdbapi(dbapi): except KeyError: return None - def getRepositories(self): + def getRepositories(self, catpkg=None): """ - This function is required for GLEP 42 compliance; it will return a list of - repository IDs - TreeMap = {id: path} + With catpkg=None, this will return a complete list of repositories in this dbapi. With catpkg set to a value, + this method will return a short-list of repositories that contain this catpkg. 
Use this second approach if + possible, to avoid exhaustively searching all repos for a particular catpkg. It's faster for this method to + find the catpkg than for you to do it yourself. + + This function is required for GLEP 42 compliance. + + @param catpkg: catpkg for which we want a list of repositories; we'll get a list of all repos containing this + catpkg; if None, return a list of all repositories in this dbapi. + @return: a list of repositories. """ + if catpkg is not None and self._better_cache is not None and catpkg in self._better_cache: + return [repo.name for repo in self._better_cache[catpkg]] return self._ordered_repo_name_list def getMissingRepoNames(self): @@ -363,7 +375,7 @@ class portdbapi(dbapi): """ return self.settings.repositories.ignored_repos - def findname2(self, mycpv, mytree=None, myrepo = None): + def findname2(self, mycpv, mytree=None, myrepo=None): """ Returns the location of the CPV, and what overlay it was in. Searches overlays first, then PORTDIR; this allows us to return the first @@ -385,16 +397,33 @@ class portdbapi(dbapi): if psplit is None or len(mysplit) != 2: raise InvalidPackageName(mycpv) + try: + cp = mycpv.cp + except AttributeError: + cp = mysplit[0] + "/" + psplit[0] + + if self._better_cache is None: + if mytree: + mytrees = [mytree] + else: + mytrees = reversed(self.porttrees) + else: + try: + repos = self._better_cache[cp] + except KeyError: + return (None, 0) + + mytrees = [] + for repo in repos: + if mytree is not None and mytree != repo.location: + continue + mytrees.append(repo.location) + # For optimal performace in this hot spot, we do manual unicode # handling here instead of using the wrapped os module. 
encoding = _encodings['fs'] errors = 'strict' - if mytree: - mytrees = [mytree] - else: - mytrees = reversed(self.porttrees) - relative_path = mysplit[0] + _os.sep + psplit[0] + _os.sep + \ mysplit[1] + ".ebuild" @@ -764,8 +793,15 @@ class portdbapi(dbapi): else: # assume it's iterable mytrees = mytree - else: + elif self._better_cache is None: mytrees = self.porttrees + else: + try: + repos = self._better_cache[mycp] + except KeyError: + mytrees = [] + else: + mytrees = [repo.location for repo in repos] for oroot in mytrees: try: file_list = os.listdir(os.path.join(oroot, mycp)) @@ -814,10 +850,55 @@ class portdbapi(dbapi): "minimum-all-ignore-profile", "minimum-visible"): self.xcache[x]={} self.frozen=1 + self._better_cache = better_cache = collections.defaultdict(list) + + # The purpose of self._better_cache is to perform an initial quick scan of all repositories + # using os.listdir(), which is less expensive IO-wise than exhaustively doing a stat on each + # repo. self._better_cache stores a list of repos in which particular catpkgs appear. + # + # For example, better_cache data may look like this: + # + # { "sys-apps/portage" : [ repo1, repo2 ] } + # + # Without this tweak, Portage will get slower and slower as more overlays are added. + # + # Also note that it is OK if this cache has some 'false positive' catpkgs in it. We use it + # to search for specific catpkgs listed in ebuilds. The likelihood of a false positive catpkg + # in our cache causing a problem is extremely low. Thus, the code below is optimized for + # speed rather than painstaking correctness. 
+ + valid_categories = self.settings.categories + for repo_loc in reversed(self.porttrees): + repo = self.repositories.get_repo_for_location(repo_loc) + try: + categories = os.listdir(repo_loc) + except OSError as e: + if e.errno not in (errno.ENOTDIR, errno.ENOENT, errno.ESTALE): + raise + continue + + for cat in categories: + if cat not in valid_categories: + continue + cat_dir = repo_loc + "/" + cat + try: + pkg_list = os.listdir(cat_dir) + except OSError as e: + if e.errno != errno.ENOTDIR: + raise + continue + + for p in pkg_list: + catpkg_dir = cat_dir + "/" + p + if not os.path.isdir(catpkg_dir): + continue + catpkg = cat + "/" + p + better_cache[catpkg].append(repo) def melt(self): self.xcache = {} self._aux_cache = {} + self._better_cache = None self.frozen = 0 def xmatch(self,level,origdep,mydep=None,mykey=None,mylist=None):
