Hi,

On 29-06-2020 21:13:43 -0500, Sid Spry wrote:
> Hello,
> 
> I have some runnable pseudocode outlining a faster tree verification 
> algorithm.
> Before I create patches I'd like to see if there is any guidance on making the
> changes as unobtrusive as possible. If the radical change in algorithm is
> acceptable I can work on adding the changes.
> 
> Instead of composing any kind of structured data out of the portage tree my
> algorithm just lists all files and then optionally batches them out to 
> threads.
> There is a noticeable speedup by eliding the tree traversal operations which
> can be seen when running the algorithm with a single thread and comparing it 
> to
> the current algorithm in gemato (which should still be discussed here?).

I remember something that gemato used to use multiple threads, but
because it totally saturated disk-IO, it was brought back to a single
thread.  People were complaining about unusable systems.

In any case, can you share your performance results?  What speedup did
you see, on warm and hot FS caches?  Which type of disk do you use?

You could compare against qmanifest, which uses OpenMP-based
paralllelism while verifying the tree.  On SSDs this does help.

Thanks,
Fabian

> 
> Some simple tests like counting all objects traversed and verified returns the
> same(ish). Once it is put into portage it could be tested in detail.
> 
> There is also my partial attempt at removing the brittle interface to GnuPG
> (it's not as if the current code is badly designed, just that parsing the
> output of GnuPG directly is likely not the best idea).
> 
> Needs gemato, dnspython, and requests. Slightly better than random code 
> because
> I took inspiration from the existing gemato classes.
> 
> ```python (veriftree.py)
> #!/usr/bin/env python3
> import os, sys, zlib, hashlib, tempfile, shutil, timeit
> import subprocess
> from typing import List
> from pprint import pprint
> 
> from gemato.manifest import (
>     ManifestFile,
>     ManifestFileEntry,
> )
> from wkd import (
>     check_domain_signature,
>     hash_localpart,
>     build_web_key_uri,
>     stream_to_file
> )
> from fetchmedia import (
>     OpenPGPEnvironment,
>     setup_verification_environment
> )
> 
> # 0. Top level directory (repository) contains Manifest, a PGP signature of
> #    blake2b and sha512 hashes of Manifest.files.gz.
> # 1. Manifest.files contains hashes of each category Manifest.gz.
> # 2. The category Manifest contains hashes of each package Manifest.
> # 3. The package Manifest contains hashes of each package file.
> #    Must be aware of PMS, e.g. aux tag specifies a file in files/.
> 
> # 0. Check signature of repo Manifest.
> # 1. Merge items in Manifest.files, each category Manifest, and each package
> #    Manifest into one big list. The path must be made absolute.
> # 2. Distribute items to threads.
> 
> # To check operation compare directory tree to files appearing in all
> # ManifestRecords.
> 
> class ManifestTree(object):
>     __slots__ = ['_directory', '_manifest_list', '_manifest_records',
>         '_manifest_results']
> 
>     def __init__(self, directory: str):
>         self._directory = directory
>         # Tuples of (base_path, full_path).
>         self._manifest_list = []
>         self._manifest_records = []
>         self._manifest_results = []
> 
>     def build_manifest_list(self):
>         for path, dirs, files in os.walk(self._directory):
>             #if 'glsa' in path or 'news' in path:
>             #if 'metadata' in path:
>             #    continue # Skip the metadata directory for now.
>                 # It contains a repository. Current algo barfs on Manifest
>                 # containing only sig.
> 
>             if 'Manifest.files.gz' in files:
>                 self._manifest_list += [(path, path + '/Manifest.files.gz')]
>             if 'Manifest.gz' in files:
>                 self._manifest_list += [(path, path + '/Manifest.gz')]
>             
>             if path == self._directory:
>                 continue # Skip the repo manifest. Order matters, fix 
> eventually.
>             if 'Manifest' in files:
>                 self._manifest_list += [(path, path + '/Manifest')]
> 
>     def parse_manifests(self):
>         td = tempfile.TemporaryDirectory(dir='./')
>         for manifest in self._manifest_list:
>             def inner():
>                 if manifest[1].endswith('.gz'):
>                     name = 'Manifest.files' # Need to also handle Manifest.gz.
>                     path = '{0}/{1}'.format(td.name, name)
>                     subprocess.run(['sh', '-c', 'gunzip -c {0} > {1}'
>                         .format(manifest[1], path)])
>                     for line in open(path):
>                         mr = ManifestRecord(line)
>                         mr.make_absolute(manifest[0])
>                         self._manifest_records += [mr]
>                 else:
>                     for line in open(manifest[1]):
>                         if line.startswith('-'):
>                             return # Skip the signed manifest.
>                         mr = ManifestRecord(line)
>                         mr.make_absolute(manifest[0])
>                         self._manifest_records += [mr]
>             inner()
> 
>     def verify_manifests(self):
>         for record in self._manifest_records:
>             self._manifest_results += [record.verify()]
> 
> 
> class ManifestRecord(object):
>     __slots__ = ['_tag', '_abs_path', '_path', '_size', '_hashes']
> 
>     def __init__(self, line: str=None):
>         self._tag = None
>         self._abs_path = None
>         self._path = None
>         self._size = None
>         self._hashes = []
>         if line:
>             self.from_string(line)
> 
>     def from_string(self, line: str) -> None:
>         parts = line.split()
>         if len(parts) == 2:
>             self._tag = 'ignore'
>             return
>         self._tag = parts[0]
>         self._path = parts[1]
>         self._size = parts[2]
>         self._hashes = parts[3:]
> 
>     def make_absolute(self, abs_path: str) -> None:
>         self._abs_path = abs_path
>         try:
>             pass
>             #if 'md5-cache' in abs_path:
>             #    print(abs_path + '/' + self._path)
>         except TypeError as exc:
>             return
> 
>     def verify(self) -> bool:
>         if self._tag == 'ignore':
>             return None
> 
>         # Where is best place to do this? Before?
>         if self._tag.lower() == 'aux':
>             self._path = self._abs_path + '/files/' + self._path
>         elif self._abs_path:
>             self._path = self._abs_path + '/' + self._path
> 
>         # Distfiles will not be present.
>         if self._tag.lower() == 'dist':
>             return None
> 
>         if not os.path.exists(self._path):
>             return False
>         
>         fd = open(self._path, 'rb')
>         sha512 = hashlib.sha512()
>         blake2b = hashlib.blake2b()
>         while True:
>             d = fd.read(8192)
>             if not d:
>                 break
>             sha512.update(d)
>             blake2b.update(d)
>         rsha512 = sha512.hexdigest()
>         rblake2b = blake2b.hexdigest()
> 
>         if rblake2b != self._hashes[1]:
>             return False
> 
>         if rsha512 != self._hashes[3]:
>             return False
> 
>         return True
> 
>     def __repr__(self) -> str:
>         #return repr(self._hashes)
>         return '\t'.join([self._tag, self._size, self._path])
> 
> def main() -> int:
>     # Step 0: verify the repo manifest.
>     #publishers = ['infrastruct...@gentoo.org']
>     #ev = setup_verification_environment(publishers)
>     #mf = ManifestFile()
>     #mf.load(open('/var/db/repos/gentoo/Manifest'),
>     #    verify_openpgp=True, openpgp_env=ev)
>     #pprint(mf)
>     #pprint(mf.openpgp_signed)
>     #pprint(mf.openpgp_signature)
> 
>     # Step 1: merge manifests.
>     #mt = ManifestTree('/var/db/repos/gentoo')
>     #mt.build_manifest_list()
>     #mt.parse_manifests()
>     #mt.verify_manifests()
> 
>     glsa = ManifestTree('/var/db/repos/gentoo')
>     glsa.build_manifest_list()
>     glsa.parse_manifests()
>     
>     start = timeit.default_timer()
>     glsa.verify_manifests()
>     end = timeit.default_timer()
>     pprint(end - start)
>     
>     # Handled by checking for None.
>     #no_ignore = filter(lambda x: x._tag != 'ignore', glsa_manifest_results)
> 
> 
>     #pprint(glsa._manifest_results)
>     real_files = [x for x in filter(lambda x: x is not None, 
> glsa._manifest_results)]
>     #pprint(real_files)
>     pprint(len(glsa._manifest_results))
>     pprint(len(real_files))
> 
>     all_files = []
>     for path, dirs, files in os.walk('/var/db/repos/gentoo'):
>         pass
> 
>     return 0
> 
> if __name__ == '__main__':
>     sys.exit(main())
> ```
> 
> ```python (wkd.py, likely unneeded but I didn't want to redo these files yet)
> #!/usr/bin/env python3
> import sys, hashlib
> import dns
> from dns import (
>     name, query, dnssec,
>     message, resolver, rdatatype
> )
> import shutil, requests
> 
> def check_domain_signature(domain: str) -> bool:
>     response = dns.resolver.query(domain, dns.rdatatype.NS)
>     nsname = response.rrset[0]
>     response = dns.resolver.query(str(nsname), dns.rdatatype.A)
>     nsaddr = response.rrset[0].to_text()
>     
>     # DNSKEY
>     request = dns.message.make_query(domain,
>         dns.rdatatype.DNSKEY, want_dnssec=True)
>     response = dns.query.udp(request, nsaddr)
>     if response.rcode() != 0:
>         raise Exception('Unable to request dnskey.')
> 
>     answer = response.answer
>     if len(answer) != 2:
>         raise Exception('Malformed answer to dnskey query.')
> 
>     name = dns.name.from_text(domain)
>     try:
>         dns.dnssec.validate(answer[0], answer[1], {name: answer[0]})
>     except dns.dnssec.ValidationFailure as exc:
>         # Validation failed. The raise causes python to abort with status 1.
>         #raise exc
>         return False
>     except AttributeError as exc:
>         # Validation may have failed; DNSKEY missing signer attribute. dig 
> may report
>         # domain as valid.
>         #
>         # TODO: Additional state where subdomain of valid domain may fail 
> with 3 nested
>         # KeyErrors. Avoid temptation to wildcard catch. Safer to put in 
> process?
>         #raise exc
>         return False
>     else:
>         return True
> 
> def hash_localpart(incoming: bytes) -> str:
>     '''Z-base32 the localpart of an e-mail address
> 
>     
> https://tools.ietf.org/html/draft-koch-openpgp-webkey-service-08#section-3.1
>     describes why this is needed.
> 
>     See https://tools.ietf.org/html/rfc6189#section-5.1.6 for a
>     description of the z-base32 scheme.
>     '''
>     zb32 = "ybndrfg8ejkmcpqxot1uwisza345h769"
> 
>     b = hashlib.sha1(incoming).digest()
>     ret = ""
>     assert(len(b) * 8 == 160)
>     for i in range(0, 160, 5):
>         byte = i // 8
>         offset = i - byte * 8
>         # offset | bits remaining in k+1 | right-shift k+1
>         # 3 | 0 | x
>         # 4 | 1 | 7
>         # 5 | 2 | 6
>         # 6 | 3 | 5
>         # 7 | 4 | 4
>         if offset < 4:
>             n = (b[byte] >> (3 - offset))
>         else:
>             n = (b[byte] << (offset - 3)) + (b[byte + 1] >> (11 - offset))
> 
>         ret += zb32[n & 0b11111]
>     return ret
> 
> def build_web_key_uri(address: str) -> str:
>     local, remote = address.split('@')
>     local = hash_localpart(local.encode('utf-8'))
>     return 'https://' + remote + '/.well-known/openpgpkey/hu/' + \
>         local
> 
> def stream_to_file(uri: str, fname: str) -> None:
>     with requests.get(uri, verify=True, stream=True) as r:
>         from pprint import pprint
>         pprint(r.headers)
>         with open(fname, 'wb') as f:
>             shutil.copyfileobj(r.raw, f)
> ```
> 

-- 
Fabian Groffen
Gentoo on a different level

Attachment: signature.asc
Description: PGP signature

Reply via email to