Author: adc Date: Mon Jun 23 05:17:16 2014 New Revision: 1604694 URL: http://svn.apache.org/r1604694 Log: Added some release checking utilities
Added: labs/panopticon/pan-utils/src/asf/data/releases.py labs/panopticon/pan-utils/tests/test_releases.py Modified: labs/panopticon/pan-utils/requirements.txt labs/panopticon/pan-utils/src/asf/utils/test.py Modified: labs/panopticon/pan-utils/requirements.txt URL: http://svn.apache.org/viewvc/labs/panopticon/pan-utils/requirements.txt?rev=1604694&r1=1604693&r2=1604694&view=diff ============================================================================== --- labs/panopticon/pan-utils/requirements.txt (original) +++ labs/panopticon/pan-utils/requirements.txt Mon Jun 23 05:17:16 2014 @@ -1,5 +1,7 @@ -brownie +beautifulsoup4==4.3.2 +brownie==0.5.1 keyring==1.6.1 -PyCrypto -python-ldap -restkit +python-gnupg==0.3.6 +pycrypto==2.6.1 +python-ldap==2.4.15 +restkit==4.2.2 \ No newline at end of file Added: labs/panopticon/pan-utils/src/asf/data/releases.py URL: http://svn.apache.org/viewvc/labs/panopticon/pan-utils/src/asf/data/releases.py?rev=1604694&view=auto ============================================================================== --- labs/panopticon/pan-utils/src/asf/data/releases.py (added) +++ labs/panopticon/pan-utils/src/asf/data/releases.py Mon Jun 23 05:17:16 2014 @@ -0,0 +1,176 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
# See the License for the
# specific language governing permissions and limitations
# under the License.
#
import collections
import hashlib
import logging
import os
import urlparse

import bs4
import gnupg
import restkit

from asf.utils.file import temp_directory


log = logging.getLogger(__name__)


def scrape_release_url(release_url, ignore=None):
    """ Recursively scrape file listing HTML pages for resources that are part of a release

    Every ``href`` found on a page is resolved against the URL of that page, so
    listings that use relative links are handled the same way as listings that
    use absolute links.  Only links located below ``release_url`` are followed
    or collected; each listing page is fetched at most once.

    :param str release_url: the root URL from which to start recursively scraping,
                            expected to be a directory listing (trailing ``/``)
    :param set ignore: hrefs or absolute URLs to ignore when scraping; ``'../'``
                       is always ignored
    :return: a dictionary of paths and files within those paths; the path of
             files that sit directly under ``release_url`` is the empty string

    .. code-block:: json

        {
            'path': {
                'filename' : 'url'
                ...
            }
        }
    """
    # prefix that every collected URL must share with the release root
    root = release_url if release_url.endswith('/') else release_url + '/'

    # accept any iterable for ``ignore``, not only a set
    ignore = set(ignore or ()) | set(['../'])

    resources = collections.defaultdict(dict)
    visited = set()

    def _scrape(scanning_url):
        # guard against listings that link back to pages already processed
        if scanning_url in visited:
            return
        visited.add(scanning_url)

        log.debug('scraping %s', scanning_url)
        response = restkit.request(scanning_url, follow_redirect=True)
        soup = bs4.BeautifulSoup(response.body_string())

        for link in soup.find_all('a'):
            href = link.get('href')
            if not href:
                # anchors without an href (e.g. named anchors) carry no resource
                continue

            # resolve relative links against the page they were found on
            target = urlparse.urldefrag(urlparse.urljoin(scanning_url, href))[0]

            if href in ignore or target in ignore:
                log.debug('ignored href %s in %s', href, scanning_url)
                continue

            # never leave the release tree (parent-directory links, external
            # sites) and skip listing controls such as column-sort links
            if not target.startswith(root) or urlparse.urlsplit(target).query:
                log.debug('skipped href %s in %s', href, scanning_url)
                continue

            if target.endswith('/'):
                _scrape(target)
            else:
                # '/a/b/file' -> ('/a/b', 'file'); files in the root get path ''
                path, filename = target[len(root) - 1:].rsplit('/', 1)
                resources[path][filename] = target

    _scrape(release_url)

    return resources


BLOCK_SIZE = 65536

# characters that may appear in a hexadecimal digest
_HEX_DIGITS = frozenset('0123456789abcdefABCDEF')


def _parse_reported_hash(text, resource_file, hex_length):
    """ Extract the hex digest from the contents of a hash file

    Handles a bare digest, the ``<file>: AB CD ...`` layout (digest split into
    groups, possibly over several lines) and the ``<digest>  <file>`` layout.

    :param str text: the full contents of the hash file
    :param str resource_file: the file name of the resource the hash belongs to
    :param int hex_length: number of hex characters a digest of the algorithm has
    :return: the reported digest in lower case, or '' if none could be found
    """
    text = text.strip()

    # sometimes hash files contain the name of the file suffixed with a colon
    prefix = resource_file + ':'
    if text.startswith(prefix):
        text = text[len(prefix):]

    # coalesce the leading hex groups; stop once the digest is complete so a
    # trailing file name is never mistaken for part of the digest
    digest = ''
    for token in text.split():
        if len(digest) >= hex_length or not _HEX_DIGITS.issuperset(token):
            break
        digest += token

    return digest.lower()


def verify_hash(resource_path, hash_path, algorithm):
    """ Verify that the hash of a resource matches the hash stored in the hash file

    :param str resource_path: the location of the resource file to hash
    :param str hash_path: the location of the hash file
    :param str algorithm: the algorithm to use to hash the resource file,
                          any name accepted by ``hashlib.new``
    :return: True if the hash of the resource matches the hash stored in the hash file
    :raises ValueError: if ``algorithm`` is not known to ``hashlib``
    """
    hasher = hashlib.new(algorithm)

    with open(hash_path, 'r') as f:
        reported_hash = _parse_reported_hash(f.read(),
                                             os.path.basename(resource_path),
                                             hasher.digest_size * 2)

    # hash the resource in fixed-size blocks so large artifacts are never
    # loaded into memory at once
    with open(resource_path, 'rb') as f:
        for block in iter(lambda: f.read(BLOCK_SIZE), b''):
            hasher.update(block)

    return reported_hash == hasher.hexdigest().lower()


def verify_signature(resource_file, signature_file, gpg):
    """ Verify signed file

    :param unicode resource_file: the location of the signed resource file
    :param unicode signature_file: the location of the detached signature file
    :param gpg: GPG instance used to verify signature
    :return: True if signature is valid and False otherwise
    """
    # binary mode: the signature bytes are handed to gpg untouched
    with open(signature_file, 'rb') as f:
        result = gpg.verify_file(f, resource_file)
    return result.valid


def _download_resources(files, target_dir):
    """ Download every ``filename -> url`` entry of ``files`` into ``target_dir`` """
    for resource, resource_url in files.iteritems():
        with open(os.path.join(target_dir, resource), 'wb') as f:
            for chunk in restkit.request(resource_url, follow_redirect=True).tee():
                f.write(chunk)


def verify_hashes(resources_url, keys_url, allowed_algorithms=None):
    """ Verify the declared hashes and signatures of resources that are stored at a particular URL

    The declared hashes of the resources will be in the same "directory" but have the
    algorithm suffixed at the end.  Detached signatures are looked up the same
    way, using the ``.asc`` suffix.  Resources without a hash or signature file
    are not treated as failures.

    :param str resources_url: the URL used to locate the resources
    :param str keys_url: the URL used to locate the KEYS file that is used to verify signatures
    :param set allowed_algorithms: the set of allowed algorithms to use to hash, default: sha1 and md5
    :return bool: True if every declared hash and signature verifies; False if
                  one does not, or if no resources were found at all
    """
    resources = scrape_release_url(resources_url, ignore=set(['http://subversion.apache.org/']))

    if not resources:
        return False

    keys_data = ''.join(restkit.request(keys_url, follow_redirect=True).tee())
    allowed_algorithms = allowed_algorithms or set(['sha1', 'md5'])

    # the keyring lives in a throw-away directory for the duration of the check
    with temp_directory() as temp_gpg_dir:
        gpg = gnupg.GPG(gnupghome=temp_gpg_dir)
        gpg.import_keys(keys_data)

        for path, files in resources.iteritems():
            with temp_directory() as temp_dir:

                # download resources
                _download_resources(files, temp_dir)

                # verify hashes of resources
                for resource in files:
                    if resource.rsplit('.', 1)[-1] in allowed_algorithms:
                        # this is a hash file itself, not a resource to check
                        continue
                    resource_file = os.path.join(temp_dir, resource)
                    for algorithm in allowed_algorithms:
                        hash_file = resource_file + '.' + algorithm
                        if os.path.exists(hash_file):
                            if not verify_hash(resource_file, hash_file, algorithm):
                                log.warning('Resource %s does not match reported %s hash', path + '/' + resource, algorithm)
                                return False

                # verify signatures
                for resource in files:
                    resource_file = os.path.join(temp_dir, resource)
                    signature_file = resource_file + '.asc'
                    if os.path.exists(signature_file):
                        if not verify_signature(resource_file, signature_file, gpg):
                            log.warning('Resource %s signature does not verify correctly', path + '/' + resource)
                            return False

    return True


# ---- remainder of the commit e-mail: diff header of the next file ----
# Modified: labs/panopticon/pan-utils/src/asf/utils/test.py
# URL: http://svn.apache.org/viewvc/labs/panopticon/pan-utils/src/asf/utils/test.py?rev=1604694&r1=1604693&r2=1604694&view=diff
# ==============================================================================
# --- labs/panopticon/pan-utils/src/asf/utils/test.py (original)
# +++ labs/panopticon/pan-utils/src/asf/utils/test.py Mon Jun 23 05:17:16 2014
# @@ -16,11 +16,12 @@
# # specific language governing permissions and limitations
# # under the License.
# -import pytest +import gnupg +import pytest from asf.data import ldap - from asf.utils.auth import get_stored_credentials +from asf.utils.file import temp_directory def test_credentials_stored(): @@ -42,3 +43,16 @@ def test_ldap(): ensure_ldap = pytest.mark.skipif(test_ldap(), reason="Need to ensure that LDAP is available") + + +def test_gpg(): + try: + with temp_directory() as temp_gpg_dir: + gnupg.GPG(gnupghome=temp_gpg_dir) + return False + except Exception: + return True + + +ensure_gpg = pytest.mark.skipif(test_gpg(), + reason="Need to ensure that gpg is available") Added: labs/panopticon/pan-utils/tests/test_releases.py URL: http://svn.apache.org/viewvc/labs/panopticon/pan-utils/tests/test_releases.py?rev=1604694&view=auto ============================================================================== --- labs/panopticon/pan-utils/tests/test_releases.py (added) +++ labs/panopticon/pan-utils/tests/test_releases.py Mon Jun 23 05:17:16 2014 @@ -0,0 +1,30 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
#
from asf.data import releases
from asf.utils.test import ensure_gpg


@ensure_gpg
def test_scrape_release_url():
    """ Run verify_hashes against published release candidates and their KEYS files

    Each entry pairs the URL of a release listing with the URL of the KEYS
    file passed alongside it; every pair must verify.
    """
    mrql_keys = 'http://www.apache.org/dist/incubator/mrql/KEYS'
    storm_keys = 'https://git-wip-us.apache.org/repos/asf?p=incubator-storm.git;a=blob_plain;f=KEYS;hb=22b832708295fa2c15c4f3c70ac0d2bc6fded4bd'

    candidates = [
        ('https://repository.apache.org/content/repositories/orgapachemrql-1001/', mrql_keys),
        ('https://dist.apache.org/repos/dist/dev/incubator/mrql/0.9.2-incubating-RC2/', mrql_keys),
        ('https://people.apache.org/~ptgoetz/storm-0.9.2-incubating/', storm_keys),
        ('https://repository.apache.org/content/repositories/orgapachestorm-1008/', storm_keys),
    ]

    for listing_url, keys_location in candidates:
        assert releases.verify_hashes(listing_url, keys_location)


# ---- remainder of the commit e-mail: mailing list footer ----
# ---------------------------------------------------------------------
# To unsubscribe, e-mail: commits-unsubscr...@labs.apache.org
# For additional commands, e-mail: commits-h...@labs.apache.org