Dnia June 30, 2020 6:20:37 AM UTC, Fabian Groffen <grob...@gentoo.org> napisał(a): >Hi, > >On 29-06-2020 21:13:43 -0500, Sid Spry wrote: >> Hello, >> >> I have some runnable pseudocode outlining a faster tree verification >algorithm. >> Before I create patches I'd like to see if there is any guidance on >making the >> changes as unobtrusive as possible. If the radical change in >algorithm is >> acceptable I can work on adding the changes. >> >> Instead of composing any kind of structured data out of the portage >tree my >> algorithm just lists all files and then optionally batches them out >to threads. >> There is a noticeable speedup by eliding the tree traversal >operations which >> can be seen when running the algorithm with a single thread and >comparing it to >> the current algorithm in gemato (which should still be discussed >here?). > >I remember something that gemato used to use multiple threads, but >because it totally saturated disk-IO, it was brought back to a single >thread. People were complaining about unusable systems.
No, it gave significant speedup even on spinning HDDs. However, it would hang for some people due to some bug that I couldn't reproduce. > >In any case, can you share your performance results? What speedup did >you see, on warm and hot FS caches? Which type of disk do you use? > >You could compare against qmanifest, which uses OpenMP-based >parallelism while verifying the tree. On SSDs this does help. > >Thanks, >Fabian > >> >> Some simple tests like counting all objects traversed and verified >returns the >> same(ish). Once it is put into portage it could be tested in detail. >> >> There is also my partial attempt at removing the brittle interface to >GnuPG >> (it's not as if the current code is badly designed, just that parsing >the >> output of GnuPG directly is likely not the best idea). >> >> Needs gemato, dnspython, and requests. Slightly better than random >code because >> I took inspiration from the existing gemato classes. >> >> ```python (veriftree.py) >> #!/usr/bin/env python3 >> import os, sys, zlib, hashlib, tempfile, shutil, timeit >> import subprocess >> from typing import List >> from pprint import pprint >> >> from gemato.manifest import ( >> ManifestFile, >> ManifestFileEntry, >> ) >> from wkd import ( >> check_domain_signature, >> hash_localpart, >> build_web_key_uri, >> stream_to_file >> ) >> from fetchmedia import ( >> OpenPGPEnvironment, >> setup_verification_environment >> ) >> >> # 0. Top level directory (repository) contains Manifest, a PGP >signature of >> # blake2b and sha512 hashes of Manifest.files.gz. >> # 1. Manifest.files contains hashes of each category Manifest.gz. >> # 2. The category Manifest contains hashes of each package Manifest. >> # 3. The package Manifest contains hashes of each package file. >> # Must be aware of PMS, e.g. aux tag specifies a file in files/. >> >> # 0. Check signature of repo Manifest. >> # 1. Merge items in Manifest.files, each category Manifest, and each >package >> # Manifest into one big list. 
The path must be made absolute. >> # 2. Distribute items to threads. >> >> # To check operation compare directory tree to files appearing in all >> # ManifestRecords. >> >> class ManifestTree(object): >> __slots__ = ['_directory', '_manifest_list', '_manifest_records', >> '_manifest_results'] >> >> def __init__(self, directory: str): >> self._directory = directory >> # Tuples of (base_path, full_path). >> self._manifest_list = [] >> self._manifest_records = [] >> self._manifest_results = [] >> >> def build_manifest_list(self): >> for path, dirs, files in os.walk(self._directory): >> #if 'glsa' in path or 'news' in path: >> #if 'metadata' in path: >> # continue # Skip the metadata directory for now. >> # It contains a repository. Current algo barfs on >Manifest >> # containing only sig. >> >> if 'Manifest.files.gz' in files: >> self._manifest_list += [(path, path + >'/Manifest.files.gz')] >> if 'Manifest.gz' in files: >> self._manifest_list += [(path, path + >'/Manifest.gz')] >> >> if path == self._directory: >> continue # Skip the repo manifest. Order matters, fix >eventually. >> if 'Manifest' in files: >> self._manifest_list += [(path, path + '/Manifest')] >> >> def parse_manifests(self): >> td = tempfile.TemporaryDirectory(dir='./') >> for manifest in self._manifest_list: >> def inner(): >> if manifest[1].endswith('.gz'): >> name = 'Manifest.files' # Need to also handle >Manifest.gz. >> path = '{0}/{1}'.format(td.name, name) >> subprocess.run(['sh', '-c', 'gunzip -c {0} > {1}' >> .format(manifest[1], path)]) >> for line in open(path): >> mr = ManifestRecord(line) >> mr.make_absolute(manifest[0]) >> self._manifest_records += [mr] >> else: >> for line in open(manifest[1]): >> if line.startswith('-'): >> return # Skip the signed manifest. 
>> mr = ManifestRecord(line) >> mr.make_absolute(manifest[0]) >> self._manifest_records += [mr] >> inner() >> >> def verify_manifests(self): >> for record in self._manifest_records: >> self._manifest_results += [record.verify()] >> >> >> class ManifestRecord(object): >> __slots__ = ['_tag', '_abs_path', '_path', '_size', '_hashes'] >> >> def __init__(self, line: str=None): >> self._tag = None >> self._abs_path = None >> self._path = None >> self._size = None >> self._hashes = [] >> if line: >> self.from_string(line) >> >> def from_string(self, line: str) -> None: >> parts = line.split() >> if len(parts) == 2: >> self._tag = 'ignore' >> return >> self._tag = parts[0] >> self._path = parts[1] >> self._size = parts[2] >> self._hashes = parts[3:] >> >> def make_absolute(self, abs_path: str) -> None: >> self._abs_path = abs_path >> try: >> pass >> #if 'md5-cache' in abs_path: >> # print(abs_path + '/' + self._path) >> except TypeError as exc: >> return >> >> def verify(self) -> bool: >> if self._tag == 'ignore': >> return None >> >> # Where is best place to do this? Before? >> if self._tag.lower() == 'aux': >> self._path = self._abs_path + '/files/' + self._path >> elif self._abs_path: >> self._path = self._abs_path + '/' + self._path >> >> # Distfiles will not be present. >> if self._tag.lower() == 'dist': >> return None >> >> if not os.path.exists(self._path): >> return False >> >> fd = open(self._path, 'rb') >> sha512 = hashlib.sha512() >> blake2b = hashlib.blake2b() >> while True: >> d = fd.read(8192) >> if not d: >> break >> sha512.update(d) >> blake2b.update(d) >> rsha512 = sha512.hexdigest() >> rblake2b = blake2b.hexdigest() >> >> if rblake2b != self._hashes[1]: >> return False >> >> if rsha512 != self._hashes[3]: >> return False >> >> return True >> >> def __repr__(self) -> str: >> #return repr(self._hashes) >> return '\t'.join([self._tag, self._size, self._path]) >> >> def main() -> int: >> # Step 0: verify the repo manifest. 
>> #publishers = ['infrastruct...@gentoo.org'] >> #ev = setup_verification_environment(publishers) >> #mf = ManifestFile() >> #mf.load(open('/var/db/repos/gentoo/Manifest'), >> # verify_openpgp=True, openpgp_env=ev) >> #pprint(mf) >> #pprint(mf.openpgp_signed) >> #pprint(mf.openpgp_signature) >> >> # Step 1: merge manifests. >> #mt = ManifestTree('/var/db/repos/gentoo') >> #mt.build_manifest_list() >> #mt.parse_manifests() >> #mt.verify_manifests() >> >> glsa = ManifestTree('/var/db/repos/gentoo') >> glsa.build_manifest_list() >> glsa.parse_manifests() >> >> start = timeit.default_timer() >> glsa.verify_manifests() >> end = timeit.default_timer() >> pprint(end - start) >> >> # Handled by checking for None. >> #no_ignore = filter(lambda x: x._tag != 'ignore', >glsa_manifest_results) >> >> >> #pprint(glsa._manifest_results) >> real_files = [x for x in filter(lambda x: x is not None, >glsa._manifest_results)] >> #pprint(real_files) >> pprint(len(glsa._manifest_results)) >> pprint(len(real_files)) >> >> all_files = [] >> for path, dirs, files in os.walk('/var/db/repos/gentoo'): >> pass >> >> return 0 >> >> if __name__ == '__main__': >> sys.exit(main()) >> ``` >> >> ```python (wkd.py, likely unneeded but I didn't want to redo these >files yet) >> #!/usr/bin/env python3 >> import sys, hashlib >> import dns >> from dns import ( >> name, query, dnssec, >> message, resolver, rdatatype >> ) >> import shutil, requests >> >> def check_domain_signature(domain: str) -> bool: >> response = dns.resolver.query(domain, dns.rdatatype.NS) >> nsname = response.rrset[0] >> response = dns.resolver.query(str(nsname), dns.rdatatype.A) >> nsaddr = response.rrset[0].to_text() >> >> # DNSKEY >> request = dns.message.make_query(domain, >> dns.rdatatype.DNSKEY, want_dnssec=True) >> response = dns.query.udp(request, nsaddr) >> if response.rcode() != 0: >> raise Exception('Unable to request dnskey.') >> >> answer = response.answer >> if len(answer) != 2: >> raise Exception('Malformed answer to 
dnskey query.') >> >> name = dns.name.from_text(domain) >> try: >> dns.dnssec.validate(answer[0], answer[1], {name: answer[0]}) >> except dns.dnssec.ValidationFailure as exc: >> # Validation failed. The raise causes python to abort with >status 1. >> #raise exc >> return False >> except AttributeError as exc: >> # Validation may have failed; DNSKEY missing signer >attribute. dig may report >> # domain as valid. >> # >> # TODO: Additional state where subdomain of valid domain may >fail with 3 nested >> # KeyErrors. Avoid temptation to wildcard catch. Safer to put >in process? >> #raise exc >> return False >> else: >> return True >> >> def hash_localpart(incoming: bytes) -> str: >> '''Z-base32 the localpart of an e-mail address >> >> >https://tools.ietf.org/html/draft-koch-openpgp-webkey-service-08#section-3.1 >> describes why this is needed. >> >> See https://tools.ietf.org/html/rfc6189#section-5.1.6 for a >> description of the z-base32 scheme. >> ''' >> zb32 = "ybndrfg8ejkmcpqxot1uwisza345h769" >> >> b = hashlib.sha1(incoming).digest() >> ret = "" >> assert(len(b) * 8 == 160) >> for i in range(0, 160, 5): >> byte = i // 8 >> offset = i - byte * 8 >> # offset | bits remaining in k+1 | right-shift k+1 >> # 3 | 0 | x >> # 4 | 1 | 7 >> # 5 | 2 | 6 >> # 6 | 3 | 5 >> # 7 | 4 | 4 >> if offset < 4: >> n = (b[byte] >> (3 - offset)) >> else: >> n = (b[byte] << (offset - 3)) + (b[byte + 1] >> (11 - >offset)) >> >> ret += zb32[n & 0b11111] >> return ret >> >> def build_web_key_uri(address: str) -> str: >> local, remote = address.split('@') >> local = hash_localpart(local.encode('utf-8')) >> return 'https://' + remote + '/.well-known/openpgpkey/hu/' + \ >> local >> >> def stream_to_file(uri: str, fname: str) -> None: >> with requests.get(uri, verify=True, stream=True) as r: >> from pprint import pprint >> pprint(r.headers) >> with open(fname, 'wb') as f: >> shutil.copyfileobj(r.raw, f) >> ``` >> -- Best regards, Michał Górny