For scalability, use a tree structure to index patterns that are anchored with a leading slash.
Patterns anchored with leading slash are indexed by leading non-glob components, making it possible to minimize the number of fnmatch calls. For example: /foo*/bar -> {'.': ['/foo*/bar']} /foo/bar* -> {'foo': {'.': ['/foo/bar*']}} /foo/bar/ -> {'foo': {'bar': {'.': ['/foo/bar/']}}} Bug: https://bugs.gentoo.org/675826 Signed-off-by: Zac Medico <zmed...@gentoo.org> --- lib/portage/util/install_mask.py | 78 +++++++++++++++++++++++++++++--- 1 file changed, 72 insertions(+), 6 deletions(-) diff --git a/lib/portage/util/install_mask.py b/lib/portage/util/install_mask.py index 32627eb05..84305644c 100644 --- a/lib/portage/util/install_mask.py +++ b/lib/portage/util/install_mask.py @@ -3,8 +3,11 @@ __all__ = ['install_mask_dir', 'InstallMask'] +import collections import errno import fnmatch +import functools +import operator import sys from portage import os, _unicode_decode @@ -18,13 +21,76 @@ else: _unicode = unicode +def _defaultdict_tree(): + return collections.defaultdict(_defaultdict_tree) + + +_pattern = collections.namedtuple('_pattern', ('orig_index', 'is_inclusive', 'pattern')) + + class InstallMask(object): def __init__(self, install_mask): """ @param install_mask: INSTALL_MASK value @type install_mask: str """ - self._install_mask = install_mask.split() + # Patterns not anchored with leading slash + self._unanchored = [] + + # Patterns anchored with leading slash are indexed by leading + # non-glob components, making it possible to minimize the + # number of fnmatch calls. For example: + # /foo*/bar -> {'.': ['/foo*/bar']} + # /foo/bar* -> {'foo': {'.': ['/foo/bar*']}} + # /foo/bar/ -> {'foo': {'bar': {'.': ['/foo/bar/']}}} + self._anchored = _defaultdict_tree() + for orig_index, pattern in enumerate(install_mask.split()): + # if pattern starts with -, possibly exclude this path + is_inclusive = not pattern.startswith('-') + if not is_inclusive: + pattern = pattern[1:] + pattern_obj = _pattern(orig_index, is_inclusive, pattern) + # absolute path pattern + if pattern.startswith('/'): + current_dir = self._anchored + for component in list(filter(None, pattern.split('/'))): + if '*' in component: + break + else: + current_dir = current_dir[component] + current_dir.setdefault('.', []).append(pattern_obj) + + # filename + else: + self._unanchored.append(pattern_obj) + + def _iter_relevant_patterns(self, path): + """ + Iterate over patterns that may be relevant for the given path. + + Patterns anchored with leading / are indexed by leading + non-glob components, making it possible to minimize the + number of fnmatch calls. + """ + current_dir = self._anchored + components = list(filter(None, path.split('/'))) + patterns = [] + patterns.extend(current_dir.get('.', [])) + for component in components: + next_dir = current_dir.get(component, None) + if next_dir is None: + break + current_dir = next_dir + patterns.extend(current_dir.get('.', [])) + + if patterns: + # Sort by original pattern index, since order matters for + # non-inclusive patterns. + patterns.extend(self._unanchored) + patterns.sort(key=operator.attrgetter('orig_index')) + return iter(patterns) + + return iter(self._unanchored) def match(self, path): """ @@ -34,11 +100,10 @@ class InstallMask(object): @return: True if path matches INSTALL_MASK, False otherwise """ ret = False - for pattern in self._install_mask: - # if pattern starts with -, possibly exclude this path - is_inclusive = not pattern.startswith('-') - if not is_inclusive: - pattern = pattern[1:] + + for pattern_obj in self._iter_relevant_patterns(path): + is_inclusive = pattern_obj.is_inclusive + pattern = pattern_obj.pattern # absolute path pattern if pattern.startswith('/'): # handle trailing slash for explicit directory match @@ -49,6 +114,7 @@ class InstallMask(object): if (fnmatch.fnmatch(path, pattern[1:]) or fnmatch.fnmatch(path, pattern[1:].rstrip('/') + '/*')): ret = is_inclusive + # filename else: if fnmatch.fnmatch(os.path.basename(path), pattern): -- 2.18.1