Source: python-popcon
Version: 1.3
Severity: serious
Tags: patch
X-Debbugs-Cc: reproducible-bui...@lists.alioth.debian.org
Hi,

Since (at least) 07-Aug-2016 14:04, the all-popcon-results.txt.gz file
contains an invalid line:

      Package: libfyba0-dbg 0 2 0 0
  --> Package: libf erdp-ommon 0 0 0 1
      Package: libg++2.8.1.3-dbg 0 0 0 1

That's (currently) line 54869. The "space" on the second line is actually
a 0xa0 character.

This makes the package unusable:

  $ python3
  Python 3.5.2+ (default, Aug 5 2016, 08:07:14)
  [GCC 6.1.1 20160724] on linux
  Type "help", "copyright", "credits" or "license" for more information.
  >>> import popcon
  >>> popcon.package("foo")
  Traceback (most recent call last):
    File "<stdin>", line 1, in <module>
    File "/usr/lib/python3/dist-packages/popcon.py", line 145, in package
      raw = package_raw(*packages)
    File "/usr/lib/python3/dist-packages/popcon.py", line 189, in package_raw
      data = _fetch()
    File "/usr/lib/python3/dist-packages/popcon.py", line 108, in _fetch
      txt = _decompress(txt)
    File "/usr/lib/python3/dist-packages/popcon.py", line 134, in _decompress
      data = data.decode()
  UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa0 in position 3574857: invalid start byte

You can see this happening here:

  https://jenkins.debian.net/view/reproducible/view/problems/job/reproducible_html_notes/

Patch attached. NB: see the comment in the lookup section of the patch.

Regards,

-- 
      ,''`.
     : :'  :     Chris Lamb
     `. `'`      la...@debian.org / chris-lamb.co.uk
       `-
diff --git a/src/popcon.py b/src/popcon.py
index 4f089f0..670c514 100644
--- a/src/popcon.py
+++ b/src/popcon.py
@@ -115,7 +115,7 @@ def _parse(results):
     results = results.splitlines()
     for line in results:
         elems = line.split()
-        if elems[0] != "Package:":
+        if elems[0] != b"Package:":
             continue
         ans[elems[1]] = Package(*(int(i) for i in elems[2:]))
     return ans
@@ -131,7 +131,6 @@ def _decompress(compressed):
     gzippedstream = io.BytesIO(compressed)
     gzipper = gzip.GzipFile(fileobj=gzippedstream)
     data = gzipper.read()
-    data = data.decode()
     return data
 
 
@@ -206,8 +205,11 @@ def package_raw(*packages):
         cached_timestamp = time.time()
     ans = dict()
     for pkg in packages:
-        if pkg in data:
-            ans[pkg] = data[pkg]
+        # Lookup using bytestrings, but always index results by the original so
+        # that callsites can look it up.
+        lookup = pkg if isinstance(pkg, bytes) else pkg.encode('utf-8')
+        if lookup in data:
+            ans[pkg] = data[lookup]
     if KEEP_DATA:
         cached_data = data
     return ans