Author: adc Date: Tue Jun 24 06:40:21 2014 New Revision: 1605014 URL: http://svn.apache.org/r1605014 Log: Fixed various bugs
- some resource pages have links that wander off the original site - sometimes things recursively arrive back to the original URL Modified: labs/panopticon/pan-utils/src/asf/data/releases.py labs/panopticon/pan-utils/tests/test_releases.py Modified: labs/panopticon/pan-utils/src/asf/data/releases.py URL: http://svn.apache.org/viewvc/labs/panopticon/pan-utils/src/asf/data/releases.py?rev=1605014&r1=1605013&r2=1605014&view=diff ============================================================================== --- labs/panopticon/pan-utils/src/asf/data/releases.py (original) +++ labs/panopticon/pan-utils/src/asf/data/releases.py Tue Jun 24 06:40:21 2014 @@ -51,24 +51,42 @@ def scrape_release_url(release_url, igno ignore = (ignore or set()) | set(['../']) + visited = set() + def _scrape(scanning_url): + if scanning_url in visited: + return {} + else: + visited.add(scanning_url) + log.debug('scraping %s', scanning_url) request = restkit.request(scanning_url, follow_redirect=True) soup = bs4.BeautifulSoup(request.body_string()) - resources = collections.defaultdict(dict) + # let's put the protocol in canonical form so that it's easily compared + canonical_url = scanning_url.replace('https://', 'http://', count=1) + + resources = collections.defaultdict(dict) for link in soup.find_all('a'): href = link.get('href') - if href in ignore: + if not href or href in ignore: log.debug('ignored href %s in %s', href, scanning_url) continue full_href = urlparse.urljoin(scanning_url, href) + # put the protocol in canonical form so that it's easily compared + if not full_href.replace('https://', 'http://', count=1).startswith(canonical_url): + # if we're wandering off the original release URL then we've + # accidentally hit a link that goes off-site + log.debug('ignored off-site href %s in %s', href, scanning_url) + continue + if href.endswith('/'): resources.update(_scrape(full_href)) else: - text = link.get_text() - resources[full_href[original_len:-len(text) - 1].strip('/')][text] = 
full_href + resource_name = link.get_text() + resource_path = full_href[original_len:-len(resource_name) - 1].strip('/') + resources[resource_path][resource_name] = full_href return resources @@ -105,7 +123,10 @@ def verify_hash(resource_path, hash_path hasher.update(buf) buf = f.read(BLOCK_SIZE) - return reported_hash.strip().lower() == hasher.hexdigest().strip().lower() + result = reported_hash.strip().lower() == hasher.hexdigest().strip().lower() + if not result: + log.warning('Bad hash "%s" != "%s"', reported_hash.strip().lower(), hasher.hexdigest().strip().lower()) + return result def verify_signature(resource_file, signature_file, gpg): Modified: labs/panopticon/pan-utils/tests/test_releases.py URL: http://svn.apache.org/viewvc/labs/panopticon/pan-utils/tests/test_releases.py?rev=1605014&r1=1605013&r2=1605014&view=diff ============================================================================== --- labs/panopticon/pan-utils/tests/test_releases.py (original) +++ labs/panopticon/pan-utils/tests/test_releases.py Tue Jun 24 06:40:21 2014 @@ -22,7 +22,7 @@ from asf.utils.test import ensure_gpg @ensure_gpg def test_scrape_release_url(): - assert releases.verify_hashes('https://repository.apache.org/content/repositories/orgapachesirona-1000//', 'https://dist.apache.org/repos/dist/release/incubator/sirona/KEYS') + assert releases.verify_hashes('https://repository.apache.org/content/repositories/orgapachesirona-1000/', 'https://dist.apache.org/repos/dist/release/incubator/sirona/KEYS') assert releases.verify_hashes('https://dist.apache.org/repos/dist/dev/incubator/sirona/0.2-incubating/', 'https://dist.apache.org/repos/dist/release/incubator/sirona/KEYS') assert releases.verify_hashes('https://repository.apache.org/content/repositories/orgapachemrql-1001/', 'http://www.apache.org/dist/incubator/mrql/KEYS') --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@labs.apache.org For additional commands, e-mail: 
commits-h...@labs.apache.org