Dnia June 30, 2020 2:13:43 AM UTC, Sid Spry <s...@aeam.us> napisał(a): >Hello, > >I have some runnable pseudocode outlining a faster tree verification >algorithm. >Before I create patches I'd like to see if there is any guidance on >making the >changes as unobtrusive as possible. If the radical change in algorithm >is >acceptable I can work on adding the changes. > >Instead of composing any kind of structured data out of the portage >tree my >algorithm just lists all files and then optionally batches them out to >threads. >There is a noticeable speedup by eliding the tree traversal operations >which >can be seen when running the algorithm with a single thread and >comparing it to >the current algorithm in gemato (which should still be discussed >here?).
Without reading the code: does your algorithm correctly detect extraneous files? > >Some simple tests like counting all objects traversed and verified >return the >same(ish). Once it is put into portage it could be tested in detail. > >There is also my partial attempt at removing the brittle interface to >GnuPG >(it's not as if the current code is badly designed, just that parsing >the >output of GnuPG directly is likely not the best idea). The 'brittle interface' is well-defined machine-readable output. > >Needs gemato, dnspython, and requests. Slightly better than random code >because >I took inspiration from the existing gemato classes. The code makes a lot of brittle assumptions about the structure. The GLEP was specifically designed to avoid that and let us adjust the structure in the future to meet our needs. > >```python (veriftree.py) >#!/usr/bin/env python3 >import os, sys, zlib, hashlib, tempfile, shutil, timeit >import subprocess >from typing import List >from pprint import pprint > >from gemato.manifest import ( > ManifestFile, > ManifestFileEntry, >) >from wkd import ( > check_domain_signature, > hash_localpart, > build_web_key_uri, > stream_to_file >) >from fetchmedia import ( > OpenPGPEnvironment, > setup_verification_environment >) > ># 0. Top level directory (repository) contains Manifest, a PGP >signature of ># blake2b and sha512 hashes of Manifest.files.gz. ># 1. Manifest.files contains hashes of each category Manifest.gz. ># 2. The category Manifest contains hashes of each package Manifest. ># 3. The package Manifest contains hashes of each package file. ># Must be aware of PMS, e.g. aux tag specifies a file in files/. > ># 0. Check signature of repo Manifest. ># 1. Merge items in Manifest.files, each category Manifest, and each >package ># Manifest into one big list. The path must be made absolute. ># 2. Distribute items to threads. > ># To check operation compare directory tree to files appearing in all ># ManifestRecords. 
> >class ManifestTree(object): > __slots__ = ['_directory', '_manifest_list', '_manifest_records', > '_manifest_results'] > > def __init__(self, directory: str): > self._directory = directory > # Tuples of (base_path, full_path). > self._manifest_list = [] > self._manifest_records = [] > self._manifest_results = [] > > def build_manifest_list(self): > for path, dirs, files in os.walk(self._directory): > #if 'glsa' in path or 'news' in path: > #if 'metadata' in path: > # continue # Skip the metadata directory for now. > # It contains a repository. Current algo barfs on Manifest > # containing only sig. > > if 'Manifest.files.gz' in files: > self._manifest_list += [(path, path + '/Manifest.files.gz')] > if 'Manifest.gz' in files: > self._manifest_list += [(path, path + '/Manifest.gz')] > > if path == self._directory: > continue # Skip the repo manifest. Order matters, fix eventually. > if 'Manifest' in files: > self._manifest_list += [(path, path + '/Manifest')] > > def parse_manifests(self): > td = tempfile.TemporaryDirectory(dir='./') > for manifest in self._manifest_list: > def inner(): > if manifest[1].endswith('.gz'): > name = 'Manifest.files' # Need to also handle Manifest.gz. > path = '{0}/{1}'.format(td.name, name) > subprocess.run(['sh', '-c', 'gunzip -c {0} > {1}' > .format(manifest[1], path)]) > for line in open(path): > mr = ManifestRecord(line) > mr.make_absolute(manifest[0]) > self._manifest_records += [mr] > else: > for line in open(manifest[1]): > if line.startswith('-'): > return # Skip the signed manifest. 
> mr = ManifestRecord(line) > mr.make_absolute(manifest[0]) > self._manifest_records += [mr] > inner() > > def verify_manifests(self): > for record in self._manifest_records: > self._manifest_results += [record.verify()] > > >class ManifestRecord(object): > __slots__ = ['_tag', '_abs_path', '_path', '_size', '_hashes'] > > def __init__(self, line: str=None): > self._tag = None > self._abs_path = None > self._path = None > self._size = None > self._hashes = [] > if line: > self.from_string(line) > > def from_string(self, line: str) -> None: > parts = line.split() > if len(parts) == 2: > self._tag = 'ignore' > return > self._tag = parts[0] > self._path = parts[1] > self._size = parts[2] > self._hashes = parts[3:] > > def make_absolute(self, abs_path: str) -> None: > self._abs_path = abs_path > try: > pass > #if 'md5-cache' in abs_path: > # print(abs_path + '/' + self._path) > except TypeError as exc: > return > > def verify(self) -> bool: > if self._tag == 'ignore': > return None > > # Where is best place to do this? Before? > if self._tag.lower() == 'aux': > self._path = self._abs_path + '/files/' + self._path > elif self._abs_path: > self._path = self._abs_path + '/' + self._path > > # Distfiles will not be present. > if self._tag.lower() == 'dist': > return None > > if not os.path.exists(self._path): > return False > > fd = open(self._path, 'rb') > sha512 = hashlib.sha512() > blake2b = hashlib.blake2b() > while True: > d = fd.read(8192) > if not d: > break > sha512.update(d) > blake2b.update(d) > rsha512 = sha512.hexdigest() > rblake2b = blake2b.hexdigest() > > if rblake2b != self._hashes[1]: > return False > > if rsha512 != self._hashes[3]: > return False > > return True > > def __repr__(self) -> str: > #return repr(self._hashes) > return '\t'.join([self._tag, self._size, self._path]) > >def main() -> int: > # Step 0: verify the repo manifest. 
> #publishers = ['infrastruct...@gentoo.org'] > #ev = setup_verification_environment(publishers) > #mf = ManifestFile() > #mf.load(open('/var/db/repos/gentoo/Manifest'), > # verify_openpgp=True, openpgp_env=ev) > #pprint(mf) > #pprint(mf.openpgp_signed) > #pprint(mf.openpgp_signature) > > # Step 1: merge manifests. > #mt = ManifestTree('/var/db/repos/gentoo') > #mt.build_manifest_list() > #mt.parse_manifests() > #mt.verify_manifests() > > glsa = ManifestTree('/var/db/repos/gentoo') > glsa.build_manifest_list() > glsa.parse_manifests() > > start = timeit.default_timer() > glsa.verify_manifests() > end = timeit.default_timer() > pprint(end - start) > > # Handled by checking for None. >#no_ignore = filter(lambda x: x._tag != 'ignore', >glsa_manifest_results) > > > #pprint(glsa._manifest_results) >real_files = [x for x in filter(lambda x: x is not None, >glsa._manifest_results)] > #pprint(real_files) > pprint(len(glsa._manifest_results)) > pprint(len(real_files)) > > all_files = [] > for path, dirs, files in os.walk('/var/db/repos/gentoo'): > pass > > return 0 > >if __name__ == '__main__': > sys.exit(main()) >``` > >```python (wkd.py, likely unneeded but I didn't want to redo these >files yet) >#!/usr/bin/env python3 >import sys, hashlib >import dns >from dns import ( > name, query, dnssec, > message, resolver, rdatatype >) >import shutil, requests > >def check_domain_signature(domain: str) -> bool: > response = dns.resolver.query(domain, dns.rdatatype.NS) > nsname = response.rrset[0] > response = dns.resolver.query(str(nsname), dns.rdatatype.A) > nsaddr = response.rrset[0].to_text() > > # DNSKEY > request = dns.message.make_query(domain, > dns.rdatatype.DNSKEY, want_dnssec=True) > response = dns.query.udp(request, nsaddr) > if response.rcode() != 0: > raise Exception('Unable to request dnskey.') > > answer = response.answer > if len(answer) != 2: > raise Exception('Malformed answer to dnskey query.') > > name = dns.name.from_text(domain) > try: > 
dns.dnssec.validate(answer[0], answer[1], {name: answer[0]}) > except dns.dnssec.ValidationFailure as exc: > # Validation failed. The raise causes python to abort with status 1. > #raise exc > return False > except AttributeError as exc: ># Validation may have failed; DNSKEY missing signer attribute. dig may >report > # domain as valid. > # ># TODO: Additional state where subdomain of valid domain may fail with >3 nested ># KeyErrors. Avoid temptation to wildcard catch. Safer to put in >process? > #raise exc > return False > else: > return True > >def hash_localpart(incoming: bytes) -> str: > '''Z-base32 the localpart of an e-mail address > >https://tools.ietf.org/html/draft-koch-openpgp-webkey-service-08#section-3.1 > describes why this is needed. > > See https://tools.ietf.org/html/rfc6189#section-5.1.6 for a > description of the z-base32 scheme. > ''' > zb32 = "ybndrfg8ejkmcpqxot1uwisza345h769" > > b = hashlib.sha1(incoming).digest() > ret = "" > assert(len(b) * 8 == 160) > for i in range(0, 160, 5): > byte = i // 8 > offset = i - byte * 8 > # offset | bits remaining in k+1 | right-shift k+1 > # 3 | 0 | x > # 4 | 1 | 7 > # 5 | 2 | 6 > # 6 | 3 | 5 > # 7 | 4 | 4 > if offset < 4: > n = (b[byte] >> (3 - offset)) > else: > n = (b[byte] << (offset - 3)) + (b[byte + 1] >> (11 - offset)) > > ret += zb32[n & 0b11111] > return ret > >def build_web_key_uri(address: str) -> str: > local, remote = address.split('@') > local = hash_localpart(local.encode('utf-8')) > return 'https://' + remote + '/.well-known/openpgpkey/hu/' + \ > local > >def stream_to_file(uri: str, fname: str) -> None: > with requests.get(uri, verify=True, stream=True) as r: > from pprint import pprint > pprint(r.headers) > with open(fname, 'wb') as f: > shutil.copyfileobj(r.raw, f) >``` -- Best regards, Michał Górny