On June 30, 2020 2:13:43 AM UTC, Sid Spry <s...@aeam.us> wrote:
>Hello,
>
>I have some runnable pseudocode outlining a faster tree verification
>algorithm. Before I create patches I'd like to see if there is any
>guidance on making the changes as unobtrusive as possible. If the
>radical change in algorithm is acceptable I can work on adding the
>changes.
>
>Instead of composing any kind of structured data out of the portage
>tree, my algorithm just lists all files and then optionally batches
>them out to threads. There is a noticeable speedup by eliding the tree
>traversal operations, which can be seen when running the algorithm
>with a single thread and comparing it to the current algorithm in
>gemato (which should still be discussed here?).

Without reading the code: does your algorithm correctly detect extraneous files?
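For reference, the kind of check I mean is a plain set difference between what is on disk and what the Manifests cover. A minimal sketch, assuming the flat record list your ManifestTree builds (with paths already made absolute); names are illustrative only, and IGNORE entries that cover whole directories would need extra handling:

```python
# Sketch only: report files on disk that no Manifest entry covers.
# Assumes `records` is the flat list of parsed entries with absolute
# paths and `top` is the repository root.
import os

def find_extraneous(top, records):
    covered = {r._path for r in records
               if r._tag.lower() not in ('ignore', 'dist')}
    on_disk = set()
    for path, dirs, files in os.walk(top):
        for f in files:
            on_disk.add(os.path.join(path, f))
    # Anything on disk but absent from every Manifest must be treated
    # as a verification failure, not silently skipped.
    return on_disk - covered
```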

>
>Some simple tests, like counting all objects traversed and verified,
>return the same(ish) results. Once it is put into portage it could be
>tested in detail.
>
>There is also my partial attempt at removing the brittle interface to
>GnuPG (it's not as if the current code is badly designed, just that
>parsing the output of GnuPG directly is likely not the best idea).

The 'brittle interface' is well-defined machine-readable output.
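What gemato consumes is the `--status-fd` stream, which GnuPG documents as a stable, machine-readable interface (doc/DETAILS in the GnuPG sources). Roughly like this (a sketch to illustrate the interface, not what gemato literally does):

```python
# Sketch: verify a detached signature and read GnuPG's machine-readable
# status lines instead of scraping its human-oriented output.
import subprocess

def verify_detached(sig_path, data_path):
    proc = subprocess.run(
        ['gpg', '--batch', '--status-fd', '1', '--verify',
         sig_path, data_path],
        capture_output=True, text=True)
    status = [line.split() for line in proc.stdout.splitlines()
              if line.startswith('[GNUPG:] ')]
    # GOODSIG alone is not sufficient in practice (key validity,
    # expiry etc. also matter); this only illustrates the interface.
    return any(fields[1] == 'GOODSIG' for fields in status)
```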

>
>Needs gemato, dnspython, and requests. Slightly better than random
>code because I took inspiration from the existing gemato classes.

The code makes a lot of brittle assumptions about the structure. The GLEP was 
specifically designed to avoid that and let us adjust the structure in the 
future to meet our needs.
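For instance, a Manifest entry is 'TYPE path size' followed by an arbitrary set of hash-name/value pairs, so hashes have to be looked up by name; indexing _hashes[1] and _hashes[3] as verify() does below breaks as soon as the hash set changes. A sketch of name-based parsing (illustrative only):

```python
# Sketch: treat the hash fields of a Manifest entry as name/value pairs
# rather than assuming BLAKE2B and SHA512 at fixed positions.
def parse_entry(line):
    parts = line.split()
    tag, path, size = parts[0], parts[1], int(parts[2])
    # Remaining fields alternate: hash name, hash value.
    hashes = dict(zip(parts[3::2], parts[4::2]))
    return tag, path, size, hashes
```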

>
>```python (veriftree.py)
>#!/usr/bin/env python3
>import os, sys, zlib, hashlib, tempfile, shutil, timeit
>import subprocess
>from typing import List
>from pprint import pprint
>
>from gemato.manifest import (
>    ManifestFile,
>    ManifestFileEntry,
>)
>from wkd import (
>    check_domain_signature,
>    hash_localpart,
>    build_web_key_uri,
>    stream_to_file
>)
>from fetchmedia import (
>    OpenPGPEnvironment,
>    setup_verification_environment
>)
>
># 0. Top level directory (repository) contains Manifest, a PGP signature
>#    of blake2b and sha512 hashes of Manifest.files.gz.
># 1. Manifest.files contains hashes of each category Manifest.gz.
># 2. The category Manifest contains hashes of each package Manifest.
># 3. The package Manifest contains hashes of each package file.
>#    Must be aware of PMS, e.g. aux tag specifies a file in files/.
>
># 0. Check signature of repo Manifest.
># 1. Merge items in Manifest.files, each category Manifest, and each
>#    package Manifest into one big list. The path must be made absolute.
># 2. Distribute items to threads.
>
># To check operation compare directory tree to files appearing in all
># ManifestRecords.
>
>class ManifestTree(object):
>    __slots__ = ['_directory', '_manifest_list', '_manifest_records',
>        '_manifest_results']
>
>    def __init__(self, directory: str):
>        self._directory = directory
>        # Tuples of (base_path, full_path).
>        self._manifest_list = []
>        self._manifest_records = []
>        self._manifest_results = []
>
>    def build_manifest_list(self):
>        for path, dirs, files in os.walk(self._directory):
>            # Skipping the metadata directory for now: it contains a
>            # repository, and the current algo barfs on a Manifest
>            # containing only a sig.
>            #if 'glsa' in path or 'news' in path:
>            #if 'metadata' in path:
>            #    continue
>
>            if 'Manifest.files.gz' in files:
>                self._manifest_list += [(path, path + '/Manifest.files.gz')]
>            if 'Manifest.gz' in files:
>                self._manifest_list += [(path, path + '/Manifest.gz')]
>
>            if path == self._directory:
>                # Skip the repo manifest. Order matters, fix eventually.
>                continue
>            if 'Manifest' in files:
>                self._manifest_list += [(path, path + '/Manifest')]
>
>    def parse_manifests(self):
>        td = tempfile.TemporaryDirectory(dir='./')
>        for manifest in self._manifest_list:
>            def inner():
>                if manifest[1].endswith('.gz'):
>                    # Need to also handle Manifest.gz.
>                    name = 'Manifest.files'
>                    path = '{0}/{1}'.format(td.name, name)
>                    subprocess.run(['sh', '-c', 'gunzip -c {0} > {1}'
>                        .format(manifest[1], path)])
>                    for line in open(path):
>                        mr = ManifestRecord(line)
>                        mr.make_absolute(manifest[0])
>                        self._manifest_records += [mr]
>                else:
>                    for line in open(manifest[1]):
>                        if line.startswith('-'):
>                            return # Skip the signed manifest.
>                        mr = ManifestRecord(line)
>                        mr.make_absolute(manifest[0])
>                        self._manifest_records += [mr]
>            inner()
>
>    def verify_manifests(self):
>        for record in self._manifest_records:
>            self._manifest_results += [record.verify()]
>
>
>class ManifestRecord(object):
>    __slots__ = ['_tag', '_abs_path', '_path', '_size', '_hashes']
>
>    def __init__(self, line: str=None):
>        self._tag = None
>        self._abs_path = None
>        self._path = None
>        self._size = None
>        self._hashes = []
>        if line:
>            self.from_string(line)
>
>    def from_string(self, line: str) -> None:
>        parts = line.split()
>        if len(parts) < 3:
>            # Blank lines and hash-less entries (e.g. IGNORE).
>            self._tag = 'ignore'
>            return
>        self._tag = parts[0]
>        self._path = parts[1]
>        self._size = parts[2]
>        self._hashes = parts[3:]
>
>    def make_absolute(self, abs_path: str) -> None:
>        self._abs_path = abs_path
>        try:
>            pass
>            #if 'md5-cache' in abs_path:
>            #    print(abs_path + '/' + self._path)
>        except TypeError as exc:
>            return
>
>    def verify(self) -> bool:
>        if self._tag == 'ignore':
>            return None
>
>        # Where is best place to do this? Before?
>        if self._tag.lower() == 'aux':
>            self._path = self._abs_path + '/files/' + self._path
>        elif self._abs_path:
>            self._path = self._abs_path + '/' + self._path
>
>        # Distfiles will not be present.
>        if self._tag.lower() == 'dist':
>            return None
>
>        if not os.path.exists(self._path):
>            return False
>        
>        fd = open(self._path, 'rb')
>        sha512 = hashlib.sha512()
>        blake2b = hashlib.blake2b()
>        while True:
>            d = fd.read(8192)
>            if not d:
>                break
>            sha512.update(d)
>            blake2b.update(d)
>        rsha512 = sha512.hexdigest()
>        rblake2b = blake2b.hexdigest()
>
>        if rblake2b != self._hashes[1]:
>            return False
>
>        if rsha512 != self._hashes[3]:
>            return False
>
>        return True
>
>    def __repr__(self) -> str:
>        #return repr(self._hashes)
>        return '\t'.join([self._tag, self._size, self._path])
>
>def main() -> int:
>    # Step 0: verify the repo manifest.
>    #publishers = ['infrastruct...@gentoo.org']
>    #ev = setup_verification_environment(publishers)
>    #mf = ManifestFile()
>    #mf.load(open('/var/db/repos/gentoo/Manifest'),
>    #    verify_openpgp=True, openpgp_env=ev)
>    #pprint(mf)
>    #pprint(mf.openpgp_signed)
>    #pprint(mf.openpgp_signature)
>
>    # Step 1: merge manifests.
>    #mt = ManifestTree('/var/db/repos/gentoo')
>    #mt.build_manifest_list()
>    #mt.parse_manifests()
>    #mt.verify_manifests()
>
>    glsa = ManifestTree('/var/db/repos/gentoo')
>    glsa.build_manifest_list()
>    glsa.parse_manifests()
>    
>    start = timeit.default_timer()
>    glsa.verify_manifests()
>    end = timeit.default_timer()
>    pprint(end - start)
>    
>    # Handled by checking for None.
>    #no_ignore = filter(lambda x: x._tag != 'ignore',
>    #                   glsa_manifest_results)
>
>
>    #pprint(glsa._manifest_results)
>    real_files = [x for x in filter(lambda x: x is not None,
>                                    glsa._manifest_results)]
>    #pprint(real_files)
>    pprint(len(glsa._manifest_results))
>    pprint(len(real_files))
>
>    all_files = []
>    for path, dirs, files in os.walk('/var/db/repos/gentoo'):
>        pass
>
>    return 0
>
>if __name__ == '__main__':
>    sys.exit(main())
>```
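As an aside, the "distribute items to threads" step from the comments isn't actually shown above; hashing the tree is CPU-bound, so a process pool is one simple way to spread it over cores. A minimal sketch (not part of your code; it only reuses the ManifestRecord name from it):

```python
# Sketch: fan the flat record list out to worker processes.  chunksize
# keeps per-task overhead low; records must be picklable.
from multiprocessing import Pool

def verify_all(records, jobs=4):
    with Pool(processes=jobs) as pool:
        return pool.map(ManifestRecord.verify, records, chunksize=64)
```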
>
>```python (wkd.py, likely unneeded but I didn't want to redo these files yet)
>#!/usr/bin/env python3
>import sys, hashlib
>import dns
>from dns import (
>    name, query, dnssec,
>    message, resolver, rdatatype
>)
>import shutil, requests
>
>def check_domain_signature(domain: str) -> bool:
>    response = dns.resolver.query(domain, dns.rdatatype.NS)
>    nsname = response.rrset[0]
>    response = dns.resolver.query(str(nsname), dns.rdatatype.A)
>    nsaddr = response.rrset[0].to_text()
>    
>    # DNSKEY
>    request = dns.message.make_query(domain,
>        dns.rdatatype.DNSKEY, want_dnssec=True)
>    response = dns.query.udp(request, nsaddr)
>    if response.rcode() != 0:
>        raise Exception('Unable to request dnskey.')
>
>    answer = response.answer
>    if len(answer) != 2:
>        raise Exception('Malformed answer to dnskey query.')
>
>    name = dns.name.from_text(domain)
>    try:
>        dns.dnssec.validate(answer[0], answer[1], {name: answer[0]})
>    except dns.dnssec.ValidationFailure as exc:
>        # Validation failed. The raise causes python to abort with
>        # status 1.
>        #raise exc
>        return False
>    except AttributeError as exc:
>        # Validation may have failed; DNSKEY missing signer attribute.
>        # dig may report the domain as valid.
>        #
>        # TODO: Additional state where a subdomain of a valid domain may
>        # fail with 3 nested KeyErrors. Avoid the temptation to wildcard
>        # catch. Safer to put in a separate process?
>        #raise exc
>        return False
>    else:
>        return True
>
>def hash_localpart(incoming: bytes) -> str:
>    '''Z-base32 the localpart of an e-mail address
>
>https://tools.ietf.org/html/draft-koch-openpgp-webkey-service-08#section-3.1
>    describes why this is needed.
>
>    See https://tools.ietf.org/html/rfc6189#section-5.1.6 for a
>    description of the z-base32 scheme.
>    '''
>    zb32 = "ybndrfg8ejkmcpqxot1uwisza345h769"
>
>    b = hashlib.sha1(incoming).digest()
>    ret = ""
>    assert(len(b) * 8 == 160)
>    for i in range(0, 160, 5):
>        byte = i // 8
>        offset = i - byte * 8
>        # offset | bits remaining in k+1 | right-shift k+1
>        # 3 | 0 | x
>        # 4 | 1 | 7
>        # 5 | 2 | 6
>        # 6 | 3 | 5
>        # 7 | 4 | 4
>        if offset < 4:
>            n = (b[byte] >> (3 - offset))
>        else:
>            n = (b[byte] << (offset - 3)) + \
>                (b[byte + 1] >> (11 - offset))
>
>        ret += zb32[n & 0b11111]
>    return ret
>
>def build_web_key_uri(address: str) -> str:
>    local, remote = address.split('@')
>    local = hash_localpart(local.encode('utf-8'))
>    return 'https://' + remote + '/.well-known/openpgpkey/hu/' + \
>        local
>
>def stream_to_file(uri: str, fname: str) -> None:
>    with requests.get(uri, verify=True, stream=True) as r:
>        from pprint import pprint
>        pprint(r.headers)
>        with open(fname, 'wb') as f:
>            shutil.copyfileobj(r.raw, f)
>```


--
Best regards, 
Michał Górny
