Dnia June 30, 2020 6:20:37 AM UTC, Fabian Groffen <grob...@gentoo.org> 
napisał(a):
>Hi,
>
>On 29-06-2020 21:13:43 -0500, Sid Spry wrote:
>> Hello,
>> 
>> I have some runnable pseudocode outlining a faster tree verification
>algorithm.
>> Before I create patches I'd like to see if there is any guidance on
>making the
>> changes as unobtrusive as possible. If the radical change in
>algorithm is
>> acceptable I can work on adding the changes.
>> 
>> Instead of composing any kind of structured data out of the portage
>tree my
>> algorithm just lists all files and then optionally batches them out
>to threads.
>> There is a noticeable speedup by eliding the tree traversal
>operations which
>> can be seen when running the algorithm with a single thread and
>comparing it to
>> the current algorithm in gemato (which should still be discussed
>here?).
>
>I remember something that gemato used to use multiple threads, but
>because it totally saturated disk-IO, it was brought back to a single
>thread.  People were complaining about unusable systems.

No, it gave significant speedup even on spinning HDDs. However, it hang on some 
people due to some bug that I couldn't reproduce.

>
>In any case, can you share your performance results?  What speedup did
>you see, on warm and hot FS caches?  Which type of disk do you use?
>
>You could compare against qmanifest, which uses OpenMP-based
>paralllelism while verifying the tree.  On SSDs this does help.
>
>Thanks,
>Fabian
>
>> 
>> Some simple tests like counting all objects traversed and verified
>returns the
>> same(ish). Once it is put into portage it could be tested in detail.
>> 
>> There is also my partial attempt at removing the brittle interface to
>GnuPG
>> (it's not as if the current code is badly designed, just that parsing
>the
>> output of GnuPG directly is likely not the best idea).
>> 
>> Needs gemato, dnspython, and requests. Slightly better than random
>code because
>> I took inspiration from the existing gemato classes.
>> 
>> ```python (veriftree.py)
>> #!/usr/bin/env python3
>> import os, sys, zlib, hashlib, tempfile, shutil, timeit
>> import subprocess
>> from typing import List
>> from pprint import pprint
>> 
>> from gemato.manifest import (
>>     ManifestFile,
>>     ManifestFileEntry,
>> )
>> from wkd import (
>>     check_domain_signature,
>>     hash_localpart,
>>     build_web_key_uri,
>>     stream_to_file
>> )
>> from fetchmedia import (
>>     OpenPGPEnvironment,
>>     setup_verification_environment
>> )
>> 
>> # 0. Top level directory (repository) contains Manifest, a PGP
>signature of
>> #    blake2b and sha512 hashes of Manifest.files.gz.
>> # 1. Manifest.files contains hashes of each category Manifest.gz.
>> # 2. The category Manifest contains hashes of each package Manifest.
>> # 3. The package Manifest contains hashes of each package file.
>> #    Must be aware of PMS, e.g. aux tag specifies a file in files/.
>> 
>> # 0. Check signature of repo Manifest.
>> # 1. Merge items in Manifest.files, each category Manifest, and each
>package
>> #    Manifest into one big list. The path must be made absolute.
>> # 2. Distribute items to threads.
>> 
>> # To check operation compare directory tree to files appearing in all
>> # ManifestRecords.
>> 
>> class ManifestTree(object):
>>     __slots__ = ['_directory', '_manifest_list', '_manifest_records',
>>         '_manifest_results']
>> 
>>     def __init__(self, directory: str):
>>         self._directory = directory
>>         # Tuples of (base_path, full_path).
>>         self._manifest_list = []
>>         self._manifest_records = []
>>         self._manifest_results = []
>> 
>>     def build_manifest_list(self):
>>         for path, dirs, files in os.walk(self._directory):
>>             #if 'glsa' in path or 'news' in path:
>>             #if 'metadata' in path:
>>             #    continue # Skip the metadata directory for now.
>>                 # It contains a repository. Current algo barfs on
>Manifest
>>                 # containing only sig.
>> 
>>             if 'Manifest.files.gz' in files:
>>                 self._manifest_list += [(path, path +
>'/Manifest.files.gz')]
>>             if 'Manifest.gz' in files:
>>                 self._manifest_list += [(path, path +
>'/Manifest.gz')]
>>             
>>             if path == self._directory:
>>                 continue # Skip the repo manifest. Order matters, fix
>eventually.
>>             if 'Manifest' in files:
>>                 self._manifest_list += [(path, path + '/Manifest')]
>> 
>>     def parse_manifests(self):
>>         td = tempfile.TemporaryDirectory(dir='./')
>>         for manifest in self._manifest_list:
>>             def inner():
>>                 if manifest[1].endswith('.gz'):
>>                     name = 'Manifest.files' # Need to also handle
>Manifest.gz.
>>                     path = '{0}/{1}'.format(td.name, name)
>>                     subprocess.run(['sh', '-c', 'gunzip -c {0} > {1}'
>>                         .format(manifest[1], path)])
>>                     for line in open(path):
>>                         mr = ManifestRecord(line)
>>                         mr.make_absolute(manifest[0])
>>                         self._manifest_records += [mr]
>>                 else:
>>                     for line in open(manifest[1]):
>>                         if line.startswith('-'):
>>                             return # Skip the signed manifest.
>>                         mr = ManifestRecord(line)
>>                         mr.make_absolute(manifest[0])
>>                         self._manifest_records += [mr]
>>             inner()
>> 
>>     def verify_manifests(self):
>>         for record in self._manifest_records:
>>             self._manifest_results += [record.verify()]
>> 
>> 
>> class ManifestRecord(object):
>>     __slots__ = ['_tag', '_abs_path', '_path', '_size', '_hashes']
>> 
>>     def __init__(self, line: str=None):
>>         self._tag = None
>>         self._abs_path = None
>>         self._path = None
>>         self._size = None
>>         self._hashes = []
>>         if line:
>>             self.from_string(line)
>> 
>>     def from_string(self, line: str) -> None:
>>         parts = line.split()
>>         if len(parts) == 2:
>>             self._tag = 'ignore'
>>             return
>>         self._tag = parts[0]
>>         self._path = parts[1]
>>         self._size = parts[2]
>>         self._hashes = parts[3:]
>> 
>>     def make_absolute(self, abs_path: str) -> None:
>>         self._abs_path = abs_path
>>         try:
>>             pass
>>             #if 'md5-cache' in abs_path:
>>             #    print(abs_path + '/' + self._path)
>>         except TypeError as exc:
>>             return
>> 
>>     def verify(self) -> bool:
>>         if self._tag == 'ignore':
>>             return None
>> 
>>         # Where is best place to do this? Before?
>>         if self._tag.lower() == 'aux':
>>             self._path = self._abs_path + '/files/' + self._path
>>         elif self._abs_path:
>>             self._path = self._abs_path + '/' + self._path
>> 
>>         # Distfiles will not be present.
>>         if self._tag.lower() == 'dist':
>>             return None
>> 
>>         if not os.path.exists(self._path):
>>             return False
>>         
>>         fd = open(self._path, 'rb')
>>         sha512 = hashlib.sha512()
>>         blake2b = hashlib.blake2b()
>>         while True:
>>             d = fd.read(8192)
>>             if not d:
>>                 break
>>             sha512.update(d)
>>             blake2b.update(d)
>>         rsha512 = sha512.hexdigest()
>>         rblake2b = blake2b.hexdigest()
>> 
>>         if rblake2b != self._hashes[1]:
>>             return False
>> 
>>         if rsha512 != self._hashes[3]:
>>             return False
>> 
>>         return True
>> 
>>     def __repr__(self) -> str:
>>         #return repr(self._hashes)
>>         return '\t'.join([self._tag, self._size, self._path])
>> 
>> def main() -> int:
>>     # Step 0: verify the repo manifest.
>>     #publishers = ['infrastruct...@gentoo.org']
>>     #ev = setup_verification_environment(publishers)
>>     #mf = ManifestFile()
>>     #mf.load(open('/var/db/repos/gentoo/Manifest'),
>>     #    verify_openpgp=True, openpgp_env=ev)
>>     #pprint(mf)
>>     #pprint(mf.openpgp_signed)
>>     #pprint(mf.openpgp_signature)
>> 
>>     # Step 1: merge manifests.
>>     #mt = ManifestTree('/var/db/repos/gentoo')
>>     #mt.build_manifest_list()
>>     #mt.parse_manifests()
>>     #mt.verify_manifests()
>> 
>>     glsa = ManifestTree('/var/db/repos/gentoo')
>>     glsa.build_manifest_list()
>>     glsa.parse_manifests()
>>     
>>     start = timeit.default_timer()
>>     glsa.verify_manifests()
>>     end = timeit.default_timer()
>>     pprint(end - start)
>>     
>>     # Handled by checking for None.
>>     #no_ignore = filter(lambda x: x._tag != 'ignore',
>glsa_manifest_results)
>> 
>> 
>>     #pprint(glsa._manifest_results)
>>     real_files = [x for x in filter(lambda x: x is not None,
>glsa._manifest_results)]
>>     #pprint(real_files)
>>     pprint(len(glsa._manifest_results))
>>     pprint(len(real_files))
>> 
>>     all_files = []
>>     for path, dirs, files in os.walk('/var/db/repos/gentoo'):
>>         pass
>> 
>>     return 0
>> 
>> if __name__ == '__main__':
>>     sys.exit(main())
>> ```
>> 
>> ```python (wkd.py, likely unneeded but I didn't want to redo these
>files yet)
>> #!/usr/bin/env python3
>> import sys, hashlib
>> import dns
>> from dns import (
>>     name, query, dnssec,
>>     message, resolver, rdatatype
>> )
>> import shutil, requests
>> 
>> def check_domain_signature(domain: str) -> bool:
>>     response = dns.resolver.query(domain, dns.rdatatype.NS)
>>     nsname = response.rrset[0]
>>     response = dns.resolver.query(str(nsname), dns.rdatatype.A)
>>     nsaddr = response.rrset[0].to_text()
>>     
>>     # DNSKEY
>>     request = dns.message.make_query(domain,
>>         dns.rdatatype.DNSKEY, want_dnssec=True)
>>     response = dns.query.udp(request, nsaddr)
>>     if response.rcode() != 0:
>>         raise Exception('Unable to request dnskey.')
>> 
>>     answer = response.answer
>>     if len(answer) != 2:
>>         raise Exception('Malformed answer to dnskey query.')
>> 
>>     name = dns.name.from_text(domain)
>>     try:
>>         dns.dnssec.validate(answer[0], answer[1], {name: answer[0]})
>>     except dns.dnssec.ValidationFailure as exc:
>>         # Validation failed. The raise causes python to abort with
>status 1.
>>         #raise exc
>>         return False
>>     except AttributeError as exc:
>>         # Validation may have failed; DNSKEY missing signer
>attribute. dig may report
>>         # domain as valid.
>>         #
>>         # TODO: Additional state where subdomain of valid domain may
>fail with 3 nested
>>         # KeyErrors. Avoid temptation to wildcard catch. Safer to put
>in process?
>>         #raise exc
>>         return False
>>     else:
>>         return True
>> 
>> def hash_localpart(incoming: bytes) -> str:
>>     '''Z-base32 the localpart of an e-mail address
>> 
>>    
>https://tools.ietf.org/html/draft-koch-openpgp-webkey-service-08#section-3.1
>>     describes why this is needed.
>> 
>>     See https://tools.ietf.org/html/rfc6189#section-5.1.6 for a
>>     description of the z-base32 scheme.
>>     '''
>>     zb32 = "ybndrfg8ejkmcpqxot1uwisza345h769"
>> 
>>     b = hashlib.sha1(incoming).digest()
>>     ret = ""
>>     assert(len(b) * 8 == 160)
>>     for i in range(0, 160, 5):
>>         byte = i // 8
>>         offset = i - byte * 8
>>         # offset | bits remaining in k+1 | right-shift k+1
>>         # 3 | 0 | x
>>         # 4 | 1 | 7
>>         # 5 | 2 | 6
>>         # 6 | 3 | 5
>>         # 7 | 4 | 4
>>         if offset < 4:
>>             n = (b[byte] >> (3 - offset))
>>         else:
>>             n = (b[byte] << (offset - 3)) + (b[byte + 1] >> (11 -
>offset))
>> 
>>         ret += zb32[n & 0b11111]
>>     return ret
>> 
>> def build_web_key_uri(address: str) -> str:
>>     local, remote = address.split('@')
>>     local = hash_localpart(local.encode('utf-8'))
>>     return 'https://' + remote + '/.well-known/openpgpkey/hu/' + \
>>         local
>> 
>> def stream_to_file(uri: str, fname: str) -> None:
>>     with requests.get(uri, verify=True, stream=True) as r:
>>         from pprint import pprint
>>         pprint(r.headers)
>>         with open(fname, 'wb') as f:
>>             shutil.copyfileobj(r.raw, f)
>> ```
>> 


--
Best regards, 
Michał Górny

Reply via email to