Bug#861198: Shutting down public FTP services
On Mon, May 15, 2017 at 01:29:54PM -0700, Vagrant Cascadian wrote: > Seems to work! Any obvious problem with going that route? No obvious problem, go for it. It's purely a matter of preference. string.format is more python3, but %-formatting is in line with what the logging module does. I'm fine %either way. Enrico -- GPG key: 4096R/634F4BD1E7AD5568 2009-05-08 Enrico Zinisignature.asc Description: PGP signature
Bug#861198: Shutting down public FTP services
On 2017-05-15, Vagrant Cascadian wrote: > On 2017-05-15, Enrico Zini wrote: >> On Mon, May 15, 2017 at 10:18:22AM -0700, Vagrant Cascadian wrote: >> >>> File "/home/vagrant/simple-cdd/simple_cdd/utils.py", line 207, in >>> verify_file >>> raise Fail("Invalid checksum for {}: expected {}, got {}", absname, >>> hashsum, hasher.hexdigest()) >>> simple_cdd.exceptions.Fail: >>> >>> During handling of the above exception, another exception occurred: >>> >>> Traceback (most recent call last): >>> File "/home/vagrant/simple-cdd/simple_cdd/log.py", line 85, in emit >>> record.message = record.getMessage() >>> File "/usr/lib/python3.5/logging/__init__.py", line 331, in getMessage >>> msg = msg % self.args >>> TypeError: not all arguments converted during string formatting >> >> Fail used %-formatting internally, but it was sometimes called with >> printf-style format strings and sometimes with string.format style >> format strings. I'm attaching a patch that unifies everything to >> string.format-style strings. 
Much shorter patch that switches the two cases of string.format style format strings to printf-style %s/%d formatting: diff --git a/simple_cdd/utils.py b/simple_cdd/utils.py index e3a7370..df0ec50 100644 --- a/simple_cdd/utils.py +++ b/simple_cdd/utils.py @@ -183,7 +183,7 @@ class Checksums: if expected_size is not None: real_size = os.stat(absname).st_size if real_size != expected_size: -raise Fail("Invalid size for {}: expected {}, got {}", absname, expected_size, real_size) +raise Fail("Invalid size for %s: expected %d, got %d", absname, expected_size, real_size) # Check the file against the checksums that we have for hashtype in self.FIELDS: @@ -204,7 +204,7 @@ class Checksums: # Verify if hasher.hexdigest() != hashsum: -raise Fail("Invalid checksum for {}: expected {}, got {}", absname, hashsum, hasher.hexdigest()) +raise Fail("Invalid checksum for %s: expected %s, got %s", absname, hashsum, hasher.hexdigest()) def parse_release_file(self, pathname): """ Seems to work! Any obvious problem with going that route? live well, vagrant signature.asc Description: PGP signature
Bug#861198: Shutting down public FTP services
On 2017-05-15, Enrico Zini wrote: > On Mon, May 15, 2017 at 10:18:22AM -0700, Vagrant Cascadian wrote: > >> File "/home/vagrant/simple-cdd/simple_cdd/utils.py", line 207, in >> verify_file >> raise Fail("Invalid checksum for {}: expected {}, got {}", absname, >> hashsum, hasher.hexdigest()) >> simple_cdd.exceptions.Fail: >> >> During handling of the above exception, another exception occurred: >> >> Traceback (most recent call last): >> File "/home/vagrant/simple-cdd/simple_cdd/log.py", line 85, in emit >> record.message = record.getMessage() >> File "/usr/lib/python3.5/logging/__init__.py", line 331, in getMessage >> msg = msg % self.args >> TypeError: not all arguments converted during string formatting > > Fail used %-formatting internally, but it was sometimes called with > printf-style format strings and sometimes with string.format style > format strings. I'm attaching a patch that unifies everything to > string.format-style strings. With that patch applied, still getting similar tracebacks when the checksum doesn't match: --- Logging error --- Traceback (most recent call last): File "./build-simple-cdd", line 658, in scdd.build_mirror() File "./build-simple-cdd", line 270, in build_mirror self.run_tool("mirror", tool) File "./build-simple-cdd", line 367, in run_tool tool.run() File "/home/vagrant/simple-cdd/simple_cdd/tools/mirror_wget.py", line 95, in run _download(x["url"], x["absname"], checksums=extrafile_sums, relname=x["relname"]) File "/home/vagrant/simple-cdd/simple_cdd/tools/mirror_wget.py", line 59, in _download checksums.verify_file(output, relname) File "/home/vagrant/simple-cdd/simple_cdd/utils.py", line 207, in verify_file raise Fail("Invalid checksum for {}: expected {}, got {}", absname, hashsum, hasher.hexdigest()) simple_cdd.exceptions.Fail: During handling of the above exception, another exception occurred: Traceback (most recent call last): File "/usr/lib/python3.5/logging/__init__.py", line 981, in emit msg = self.format(record) File 
"/usr/lib/python3.5/logging/__init__.py", line 831, in format return fmt.format(record) File "/usr/lib/python3.5/logging/__init__.py", line 568, in format record.message = record.getMessage() File "/usr/lib/python3.5/logging/__init__.py", line 331, in getMessage msg = msg % self.args TypeError: not all arguments converted during string formatting Call stack: File "./build-simple-cdd", line 674, in log.error(*e.args) Message: 'Invalid checksum for {}: expected {}, got {}' Arguments: ('/home/vagrant/simple-cdd/tmp/mirror/doc/constitution.txt', '34b19d4f013dcc45630156a0bf794ed385a0ab995e5c99553cc94a6956be7804', '8b08096e9e4a33d13eb83261649bc5ec153ba24618ef4b578b5ad7f6aea0c7bf') It seems the upstream python portion of the code works better with %-formatting, as this seems to fix/workaround the issue: diff --git a/simple_cdd/utils.py b/simple_cdd/utils.py index 9cceba2..05104ac 100644 --- a/simple_cdd/utils.py +++ b/simple_cdd/utils.py @@ -204,7 +204,7 @@ class Checksums: # Verify if hasher.hexdigest() != hashsum: -raise Fail("Invalid checksum for {}: expected {}, got {}", absname, hashsum, hasher.hexdigest()) +raise Fail("Invalid checksum for %s: expected %s, got %s", absname, hashsum, hasher.hexdigest()) def parse_release_file(self, pathname): """ Does this suggest that the inverse of your patch, converting everything to %-formatting consistently should work? live well, vagrant signature.asc Description: PGP signature
Bug#861198: Shutting down public FTP services
On Mon, May 15, 2017 at 11:25:06AM -0700, Vagrant Cascadian wrote: > >> > -prefix_len = 7 + len(env.get("DI_CODENAME")) # > >> > dists/{DI_CODENAME}/ > >> > -relname = file[prefix_len:] > >> > +separator = os.path.join('dists/', > >> > env.get("DI_CODENAME"), '') > >> > +separator, relname = file.split(separator) > >> If I remember correctly, it is trying to split a filename based on > >> dists/CODENAME, so maybe os.path.split would be appropriate here; will > >> look into it. > > os.path.split only takes a single argument, and splits in a > predetermined way. e.g. /foo/bar/baz is split into /foo/bar and baz. Ah right. I keep instinctively thinking that os.path.split splits a path into a list of path components. > > os.path.relpath might also help. > > This also takes a single argument, turning a full path into a relative > path. e.g. /foo/bar/baz gives ../../../foo/bar/baz It should take an optional start= argument. I was thinking something like this: os.path.relpath(path, start=os.path.join("dists", env.get("DI_CODENAME"))) > So, I'm not sure either of those will be helpful here. To keep the diff > small, I could just revert to your original code, as it works and my > change there isn't strictly necessary; it's just easier for me to see > what's going on. If it's easier to see what's going on, by all means keep your code. Enrico -- GPG key: 4096R/634F4BD1E7AD5568 2009-05-08 Enrico Zinisignature.asc Description: PGP signature
Bug#861198: Shutting down public FTP services
On Mon, May 15, 2017 at 10:18:22AM -0700, Vagrant Cascadian wrote: > File "/home/vagrant/simple-cdd/simple_cdd/utils.py", line 207, in > verify_file > raise Fail("Invalid checksum for {}: expected {}, got {}", absname, > hashsum, hasher.hexdigest()) > simple_cdd.exceptions.Fail: > > During handling of the above exception, another exception occurred: > > Traceback (most recent call last): > File "/home/vagrant/simple-cdd/simple_cdd/log.py", line 85, in emit > record.message = record.getMessage() > File "/usr/lib/python3.5/logging/__init__.py", line 331, in getMessage > msg = msg % self.args > TypeError: not all arguments converted during string formatting Fail used %-formatting internally, but it was sometimes called with printf-style format strings and sometimes with string.format style format strings. I'm attaching a patch that unifies everything to string.format-style strings. Enrico -- GPG key: 4096R/634F4BD1E7AD5568 2009-05-08 Enrico ZiniFrom 1b2196f7c9c3607c347a4be76a2b4c4cee1c7c7b Mon Sep 17 00:00:00 2001 From: Enrico Zini Date: Mon, 15 May 2017 20:29:40 +0200 Subject: [PATCH] Standardised Fail behaviour on string.format --- build-simple-cdd| 10 +- simple_cdd/exceptions.py| 2 +- simple_cdd/gnupg.py | 6 +++--- simple_cdd/tools/base.py| 4 ++-- simple_cdd/tools/mirror_reprepro.py | 8 simple_cdd/utils.py | 2 +- 6 files changed, 16 insertions(+), 16 deletions(-) diff --git a/build-simple-cdd b/build-simple-cdd index 5da6284..f629e58 100755 --- a/build-simple-cdd +++ b/build-simple-cdd @@ -175,7 +175,7 @@ class SimpleCDD: if self.args.force_preseed: log.warn("preseed file invalid: %s", p) else: -raise Fail("preseed file invalid: %s", p) +raise Fail("preseed file invalid: {}", p) def paranoid_checks(self): @@ -280,7 +280,7 @@ class SimpleCDD: log.info("re-setting mirror for new debpartial-mirror dir: %s", self.env.get("MIRROR")) if not os.path.isdir(self.env.get("MIRROR")): -raise Fail("mirror dir is not a directory: %s", self.env.get("MIRROR")) +raise Fail("mirror 
dir is not a directory: {}", self.env.get("MIRROR")) log.debug("Checking if the mirror is complete...") self.checkpackages() @@ -380,7 +380,7 @@ class SimpleCDD: elif isinstance(val, int): val = str(val) else: -raise Fail("Unknown variable type for value %s=%r", name, val) +raise Fail("Unknown variable type for value {}={!r}", name, val) log.info("export %s=%s", name, shell_quote(val)) os.environ[name] = val @@ -408,7 +408,7 @@ class SimpleCDD: if len(candidates) == 1: return candidates[0] -raise Fail("Cannot find built ISO in %s", outdir) +raise Fail("Cannot find built ISO in {}", outdir) def checkpackages(self): @@ -501,7 +501,7 @@ class SimpleCDD: qemu = "qemu-system-" + arch qemu_bin = shell_which(qemu) if qemu_bin is None: -raise Fail("Cannot find qemu executable %s", qemu) +raise Fail("Cannot find qemu executable {}", qemu) qemu_opts = [] diff --git a/simple_cdd/exceptions.py b/simple_cdd/exceptions.py index 9eab13f..b83da59 100644 --- a/simple_cdd/exceptions.py +++ b/simple_cdd/exceptions.py @@ -4,4 +4,4 @@ class Fail(BaseException): cannot proceed. 
""" def __str__(self): -return self.args[0] % self.args[1:] +return self.args[0].format(self.args[1:]) diff --git a/simple_cdd/gnupg.py b/simple_cdd/gnupg.py index 78ffe7e..2f3c4f5 100644 --- a/simple_cdd/gnupg.py +++ b/simple_cdd/gnupg.py @@ -45,14 +45,14 @@ class Gnupg: for line in lines: x.write(line.decode('utf-8')) else: -raise Fail("Unable to extract data from %s to %s, returned %d", sigpathname, pathname, proc.returncode) +raise Fail("Unable to extract data from {} to {}, returned {}", sigpathname, pathname, proc.returncode) def verify_gpg_sig(self, *extra_args): args = self.common_gpg_args() args.extend(extra_args) retval = run_command("verify gpg signature", args) if retval != 0: -raise Fail("Signature verification failed on %s", pathname) +raise Fail("Signature verification failed on {}", pathname) def verify_detached_sig(self, pathname, sigpathname): return self.verify_gpg_sig("--verify", sigpathname, pathname) @@ -72,7 +72,7 @@ class Gnupg: if retval != 0: for line in stderr.decode("utf-8").split("\n"): log.error("GPG standard error: %s", line) -raise Fail("Importing %s into %s failed, gpg error code %s",
Bug#861198: Shutting down public FTP services
On 2017-05-15, Enrico Zini wrote: > On Mon, May 15, 2017 at 09:46:08AM -0700, Vagrant Cascadian wrote: > >> > -prefix_len = 7 + len(env.get("DI_CODENAME")) # >> > dists/{DI_CODENAME}/ >> > -relname = file[prefix_len:] >> > +separator = os.path.join('dists/', >> > env.get("DI_CODENAME"), '') >> > +separator, relname = file.split(separator) >> If I remember correctly, it is trying to split a filename based on >> dists/CODENAME, so maybe os.path.split would be appropriate here; will >> look into it. os.path.split only takes a single argument, and splits in a predetermined way. e.g. /foo/bar/baz is split into /foo/bar and baz. > os.path.relpath might also help. This also takes a single argument, turning a full path into a relative path. e.g. /foo/bar/baz gives ../../../foo/bar/baz So, I'm not sure either of those will be helpful here. To keep the diff small, I could just revert to your original code, as it works and my change there isn't strictly necessary; it's just easier for me to see what's going on. live well, vagrant signature.asc Description: PGP signature
Bug#861198: Shutting down public FTP services
On Mon, May 15, 2017 at 10:18:22AM -0700, Vagrant Cascadian wrote: > I apparently get *two* tracebacks for the price of one: Eek! Fail fails in failing. Let me fix it so it fails successfully. Enrico -- GPG key: 4096R/634F4BD1E7AD5568 2009-05-08 Enrico Zini signature.asc Description: PGP signature
Bug#861198: Shutting down public FTP services
On 2017-05-15, Enrico Zini wrote: > On Mon, May 15, 2017 at 09:46:08AM -0700, Vagrant Cascadian wrote: >> > I noticed that verify_file in simple_cdd/utils.py already throws Fail if >> > something is wrong. I would get rid of the try/except block altogether, >> > and just call verify_file. >> I think with these I was getting an ugly traceback when the file failed >> to verify, so wanted to actually give the user something more freindly >> than a traceback... will try your patch again, tweak the file between >> download and verify, and see if it still spits out the tracebacks. > > If what is thrown is Fail, that shouldn't happen: build-simple-cdd has > this that catches it and only logs the error message instead of the > traceback: > > except Fail as e: > #import traceback > #traceback.print_stack() > log.error(*e.args) > scdd.paranoid_checks() > result = 1 > isoname = None > > If it spits out a traceback instead, send it to me and I'll see if > there's somewhere where some other exception should be turned into Fail. 
With your patch applied to git master, and the following patch to make sure it fails: diff --git a/simple_cdd/tools/mirror_wget.py b/simple_cdd/tools/mirror_wget.py index dc914cf..73bece9 100644 --- a/simple_cdd/tools/mirror_wget.py +++ b/simple_cdd/tools/mirror_wget.py @@ -54,6 +54,8 @@ class ToolMirrorWget(Tool): log.debug("downloading: %s", output) request.urlretrieve(url, filename=output) if checksums: +with open(output, 'w') as x: +x.write('dslkfgjsdlkfjsdf') checksums.verify_file(output, relname) if env.get("mirror_files"): I apparently get *two* tracebacks for the price of one: $ ./build-simple-cdd --verbose --mirror-tools wget 2017-05-15 10:08:58 INFO Reading configuration file /home/vagrant/simple-cdd/profiles/default.conf 2017-05-15 10:08:58 INFO /home/vagrant/simple-cdd/profiles/default.conf: new var debian_mirror=http://cdn-fastly.deb.debian.org/debian/ 2017-05-15 10:08:58 INFO /home/vagrant/simple-cdd/profiles/default.conf: new var http_proxy=http://192.168.122.1:8000 --- Logging error ---g: /home/vagrant/simple-cdd/tmp/mirror/dists/stretch/main/installer-amd64/current/images/cdrom/gtk Traceback (most recent call last): File "./build-simple-cdd", line 658, in scdd.build_mirror() File "./build-simple-cdd", line 270, in build_mirror self.run_tool("mirror", tool) File "./build-simple-cdd", line 367, in run_tool tool.run() File "/home/vagrant/simple-cdd/simple_cdd/tools/mirror_wget.py", line 155, in run _download(f["url"], f["absname"], checksums=file_sums, relname=f["relname"]) File "/home/vagrant/simple-cdd/simple_cdd/tools/mirror_wget.py", line 59, in _download checksums.verify_file(output, relname) File "/home/vagrant/simple-cdd/simple_cdd/utils.py", line 207, in verify_file raise Fail("Invalid checksum for {}: expected {}, got {}", absname, hashsum, hasher.hexdigest()) simple_cdd.exceptions.Fail: During handling of the above exception, another exception occurred: Traceback (most recent call last): File "/home/vagrant/simple-cdd/simple_cdd/log.py", 
line 85, in emit record.message = record.getMessage() File "/usr/lib/python3.5/logging/__init__.py", line 331, in getMessage msg = msg % self.args TypeError: not all arguments converted during string formatting Call stack: File "./build-simple-cdd", line 674, in log.error(*e.args) File "/usr/lib/python3.5/logging/__init__.py", line 1309, in error self._log(ERROR, msg, args, **kwargs) File "/usr/lib/python3.5/logging/__init__.py", line 1416, in _log self.handle(record) File "/usr/lib/python3.5/logging/__init__.py", line 1426, in handle self.callHandlers(record) File "/usr/lib/python3.5/logging/__init__.py", line 1488, in callHandlers hdlr.handle(record) File "/usr/lib/python3.5/logging/__init__.py", line 856, in handle self.emit(record) File "/home/vagrant/simple-cdd/simple_cdd/log.py", line 127, in emit self.handleError(record) Message: 'Invalid checksum for {}: expected {}, got {}' Arguments: ('/home/vagrant/simple-cdd/tmp/mirror/dists/stretch/main/installer-amd64/current/images/cdrom/gtk/initrd.gz', 'becc9447c6a59e4d1f870911999efae179612cd9eefc383cdc2c5762f6f35c39', '8b08096e9e4a33d13eb83261649bc5ec153ba24618ef4b578b5ad7f6aea0c7bf') Thanks! live well, vagrant signature.asc Description: PGP signature
Bug#861198: Shutting down public FTP services
On Mon, May 15, 2017 at 09:46:08AM -0700, Vagrant Cascadian wrote: > > -prefix_len = 7 + len(env.get("DI_CODENAME")) # > > dists/{DI_CODENAME}/ > > -relname = file[prefix_len:] > > +separator = os.path.join('dists/', > > env.get("DI_CODENAME"), '') > > +separator, relname = file.split(separator) > If I remember correctly, it is trying to split a filename based on > dists/CODENAME, so maybe os.path.split would be appropriate here; will > look into it. os.path.relpath might also help. > > I noticed that verify_file in simple_cdd/utils.py already throws Fail if > > something is wrong. I would get rid of the try/except block altogether, > > and just call verify_file. > I think with these I was getting an ugly traceback when the file failed > to verify, so wanted to actually give the user something more freindly > than a traceback... will try your patch again, tweak the file between > download and verify, and see if it still spits out the tracebacks. If what is thrown is Fail, that shouldn't happen: build-simple-cdd has this that catches it and only logs the error message instead of the traceback: except Fail as e: #import traceback #traceback.print_stack() log.error(*e.args) scdd.paranoid_checks() result = 1 isoname = None If it spits out a traceback instead, send it to me and I'll see if there's somewhere where some other exception should be turned into Fail. Enrico -- GPG key: 4096R/634F4BD1E7AD5568 2009-05-08 Enrico Zinisignature.asc Description: PGP signature
Bug#861198: Shutting down public FTP services
On 2017-05-15, Enrico Zini wrote: > On Sun, May 07, 2017 at 09:58:12PM -0700, Vagrant Cascadian wrote: > >> Any chance you could review the urllib implementation that is in git >> soonish? :) > > I've been adventurously busy, I hope it's not too late. Thanks for the review! :) > In bfc0329060e262bfd889b5e40c32ad9099197dcb I have mixed feelings about > this, as it uses string split on a path while python has os.path.split. > However, I couldn't at a glance follow the code to see what kind of > input it works on, and if it's something like > http://ftp.fr.debian.org/debian/dists/jessie/Release > then the lines are not really paths: > > -prefix_len = 7 + len(env.get("DI_CODENAME")) # > dists/{DI_CODENAME}/ > -relname = file[prefix_len:] > +separator = os.path.join('dists/', > env.get("DI_CODENAME"), '') > +separator, relname = file.split(separator) If I remember correctly, it is trying to split a filename based on dists/CODENAME, so maybe os.path.split would be appropriate here; will look into it. > In b164d54b912a8d702576c191a821ec2f374642cf I agree that except without > an exception is too broad, like in here: > > try: > checksums.verify_file(output, relname) > except: > raise Fail("Checksum invalid: %s", output) > > I noticed that verify_file in simple_cdd/utils.py already throws Fail if > something is wrong. I would get rid of the try/except block altogether, > and just call verify_file. I think with these I was getting an ugly traceback when the file failed to verify, so wanted to actually give the user something more friendly than a traceback... will try your patch again, tweak the file between download and verify, and see if it still spits out the tracebacks. live well, vagrant signature.asc Description: PGP signature
Bug#861198: Shutting down public FTP services
On Sun, May 07, 2017 at 09:58:12PM -0700, Vagrant Cascadian wrote: > Any chance you could review the urllib implementation that is in git > soonish? :) I've been adventurously busy, I hope it's not too late. In bfc0329060e262bfd889b5e40c32ad9099197dcb I have mixed feelings about this, as it uses string split on a path while python has os.path.split. However, I couldn't at a glance follow the code to see what kind of input it works on, and if it's something like http://ftp.fr.debian.org/debian/dists/jessie/Release then the lines are not really paths: -prefix_len = 7 + len(env.get("DI_CODENAME")) # dists/{DI_CODENAME}/ -relname = file[prefix_len:] +separator = os.path.join('dists/', env.get("DI_CODENAME"), '') +separator, relname = file.split(separator) About 6717a008549cd792a4c08d7c21d09cbd8b82a67a: COOL! In b164d54b912a8d702576c191a821ec2f374642cf I agree that except without an exception is too broad, like in here: try: checksums.verify_file(output, relname) except: raise Fail("Checksum invalid: %s", output) I noticed that verify_file in simple_cdd/utils.py already throws Fail if something is wrong. I would get rid of the try/except block altogether, and just call verify_file. 1faddb9cfe4992c36bca7e502f5190e96de74f77 looks good, I can't think of a better way of doing it. I'm attaching a proposed patch for the over-broad excepts. 
Enrico -- GPG key: 4096R/634F4BD1E7AD5568 2009-05-08 Enrico ZiniFrom c84da64e7485cc89d0c72ab5d15c3df9f58945b5 Mon Sep 17 00:00:00 2001 From: Enrico Zini Date: Mon, 15 May 2017 12:41:33 +0200 Subject: [PATCH] Avoid except: that might catch too much --- simple_cdd/tools/mirror_wget.py | 7 ++- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/simple_cdd/tools/mirror_wget.py b/simple_cdd/tools/mirror_wget.py index 73999a3..dc914cf 100644 --- a/simple_cdd/tools/mirror_wget.py +++ b/simple_cdd/tools/mirror_wget.py @@ -46,7 +46,7 @@ class ToolMirrorWget(Tool): checksums.verify_file(output, relname) log.debug("skipping download: %s checksum matched", output) return -except: +except Fail: log.debug("re-downloading: %s checksum invalid", output) pass if not os.path.isdir(os.path.dirname(output)): @@ -54,10 +54,7 @@ class ToolMirrorWget(Tool): log.debug("downloading: %s", output) request.urlretrieve(url, filename=output) if checksums: -try: -checksums.verify_file(output, relname) -except: -raise Fail("Checksum invalid: %s", output) +checksums.verify_file(output, relname) if env.get("mirror_files"): # Download the checksums present in the archive "extrafiles" and verify -- 2.11.0 signature.asc Description: PGP signature
Bug#861198: Shutting down public FTP services
On 2017-04-28, Vagrant Cascadian wrote: > On 2017-04-28, Vagrant Cascadian wrote: >> On 2017-04-28, Vagrant Cascadian wrote: >>> On 2017-04-28, Enrico Zini wrote: Would you like me to try and provide a version which uses urlretrieve? >>> >>> All these ideas sound good to me, so please take a shot at it! >> >> Actually, being able to verify the checksums instead of relying on >> timestamps got me excited enough to try this myself... > > And pushed to git, review appreciated. Any chance you could review the urllib implementation that is in git soonish? :) In particular: 1faddb9cfe4992c36bca7e502f5190e96de74f77 Explicitly set http_proxy in the environment. b164d54b912a8d702576c191a821ec2f374642cf Only re-download files if known checksums do not match. 6717a008549cd792a4c08d7c21d09cbd8b82a67a Switch to urllib. I suspect my exception handling is a bit over-broad and there are better ways to handle it, at the very least. Thanks for all your help on this! live well, vagrant signature.asc Description: PGP signature
Bug#861198: Shutting down public FTP services
On 2017-04-28, Vagrant Cascadian wrote: > On 2017-04-28, Vagrant Cascadian wrote: >> On 2017-04-28, Enrico Zini wrote: >>> Would you like me to try and provide a version which uses urlretrieve? >> >> All these ideas sound good to me, so please take a shot at it! > > Actually, being able to verify the checksums instead of relying on > timestamps got me excited enough to try this myself... And pushed to git, review appreciated. > So we may need to set that with os.environ or something? I went ahead and added: if env.get("http_proxy"): os.environ.setdefault('http_proxy', env.get("http_proxy")) Seemed to work. Might be a better way. live well, vagrant signature.asc Description: PGP signature
Bug#861198: Shutting down public FTP services
On 2017-04-28, Vagrant Cascadian wrote: > On 2017-04-28, Enrico Zini wrote: >> Would you like me to try and provide a version which uses urlretrieve? > > All these ideas sound good to me, so please take a shot at it! Actually, being able to verify the checksums instead of relying on timestamps got me excited enough to try this myself... The http_proxy environment variable isn't set from profiles/*.conf, but works if you expressly call it like so: http_proxy=http://127.0.0.1:8000 ./build-simple-cdd So we may need to set that with os.environ or something? diff --git a/simple_cdd/tools/mirror_wget.py b/simple_cdd/tools/mirror_wget.py index b05c813..8bec6d0 100644 --- a/simple_cdd/tools/mirror_wget.py +++ b/simple_cdd/tools/mirror_wget.py @@ -3,6 +3,7 @@ from simple_cdd.utils import run_command, Checksums from simple_cdd.gnupg import Gnupg from .base import Tool from urllib.parse import urlparse, urljoin +from urllib import request import os import re import logging @@ -35,13 +36,25 @@ class ToolMirrorWget(Tool): baseurl = env.get("wget_debian_mirror") path_depth = urlparse(baseurl).path.strip("/").count("/") + 1 -def _download(url, output): +def _download(url, output, checksums=None, relname=None): +if checksums: +if os.path.exists(output): +try: +checksums.verify_file(output, relname) +log.debug("skipping download: %s checksum matched", output) +return +except: +log.debug("re-downloading: %s checksum invalid", output) +pass if not os.path.isdir(os.path.dirname(output)): os.makedirs(os.path.dirname(output)) -args = ["wget", "--output-document="+output, "--timestamping", url] -retval = run_command("wget {}".format(url), args, logfd=logfd, env=wget_env) -if retval != 0: -raise Fail("wget exited with code %s, see %s for full output log", retval, logfilename) +log.debug("downloading: %s", output) +request.urlretrieve(url, filename=output) +if checksums: +try: +checksums.verify_file(output, relname) +except: +raise Fail("Checksum invalid: %s", output) # Build the 
environment for running reprepro wget_env = {} @@ -82,9 +95,7 @@ class ToolMirrorWget(Tool): }) for x in ef_files: -_download(x["url"], x["absname"]) -extrafile_sums.verify_file(x["absname"], x["relname"]) - +_download(x["url"], x["absname"], checksums=extrafile_sums, relname=x["relname"]) checksum_files = env.get("checksum_files") @@ -92,11 +103,6 @@ class ToolMirrorWget(Tool): files = [] files.extend(checksum_files) -for x in files: -p = os.path.join(env.get("MIRROR"), x) -d = os.path.join(env.get("wget_debian_mirror"), x) -_download(d, p) - if checksum_files: # Get the release file and verify that it is valid release_file = os.path.join(env.get("simple_cdd_temp"), env.format("{DI_CODENAME}_Release")) @@ -125,8 +131,9 @@ class ToolMirrorWget(Tool): prefix_len = 7 + len(env.get("DI_CODENAME")) # dists/{DI_CODENAME}/ relname = file[prefix_len:] absname = os.path.join(env.get("MIRROR"), file) +url = os.path.join(env.get("wget_debian_mirror"), file) # Validate the file -sums.verify_file(absname, relname) +_download(url, absname, checksums=sums, relname=relname) # Get the list of extra files to download: those whose # pathname matches di_match @@ -148,7 +155,4 @@ class ToolMirrorWget(Tool): file_sums.parse_checksums_file(absname, hashtype) for f in extra_files: # Download the extra files -_download(f["url"], f["absname"]) -file_sums.verify_file(f["absname"], f["relname"]) - - +_download(f["url"], f["absname"], checksums=file_sums, relname=f["relname"]) signature.asc Description: PGP signature
Bug#861198: Shutting down public FTP services
On 2017-04-28, Enrico Zini wrote: > On Thu, Apr 27, 2017 at 08:26:13AM -0700, Vagrant Cascadian wrote: >> Below is a crude implementation using python3-urllib3. Maybe it's good >> enough. > > I would be tempted to rebase the download code on > urllib.request.urlretrieve, because it avoids an external dependency and > it does it all in one shot: > https://docs.python.org/3/library/urllib.request.html#urllib.request.urlretrieve One thing just occurred to me, does urllib.request.urlretrieve open a new server connection for every download? I saw a significant speed improvement using urllib3 once I figured out how to get it to re-use an already established connection (or rather, how to use it correctly, it does that by default). I've also just noticed that "wget -O" isn't compatible with timestamping, so continuing to use wget isn't really an option... unless we use the checksums method for that too. live well, vagrant signature.asc Description: PGP signature
Bug#861198: Shutting down public FTP services
On 2017-04-28, Enrico Zini wrote: > On Thu, Apr 27, 2017 at 08:26:13AM -0700, Vagrant Cascadian wrote: > >> I want to keep an eye to a patch that would be acceptible to the release >> team for stretch at this point, so maybe only refactoring this new code >> for a smaller, more readable diff would be best. > >> Below is a crude implementation using python3-urllib3. Maybe it's good >> enough. > > I would be tempted to rebase the download code on > urllib.request.urlretrieve, because it avoids an external dependency and > it does it all in one shot: > https://docs.python.org/3/library/urllib.request.html#urllib.request.urlretrieve > > The only thing it does not do is verifying the Last-Modified header. > However, since we already have extrafiles, we could use the checksums in > there to see if the files need redownloading. That would avoid the > worries about parsing timestamps, too. Sounds good! >> It also supports using a proxy, although it seems to require a bit of >> code duplication. More elegant support for proxies would be nice. > > The standard library's urllib is supposed to look at $http_proxy and use > it, according to https://docs.python.org/3/library/urllib.request.html > so at least in theory proxies are a non issues. I wasn't quite sure how to pass around environment variables if set from the configuration (e.g. profiles/*.conf). I guess we could check for the *_proxy environment variables with env.get("http_proxy") and set with os.environ (if not already present)? >> Ironically, there are no remaining calls to wget in mirror_wget.py. >> Probably should deprecate and/or remove wget_debian_mirror and other >> variables that reference wget... > > I would do that, leaving the variable so that nothing explodes on old > config files, but marking it as deprecated/ignored. I could even add an > "ignored" flag to variables that makes it throw an exception if the code > uses them. > > Would you like me to try and provide a version which uses urlretrieve? 
All these ideas sound good to me, so please take a shot at it! live well, vagrant signature.asc Description: PGP signature
Bug#861198: Shutting down public FTP services
On Thu, Apr 27, 2017 at 08:26:13AM -0700, Vagrant Cascadian wrote: > I want to keep an eye to a patch that would be acceptable to the release > team for stretch at this point, so maybe only refactoring this new code > for a smaller, more readable diff would be best. > Below is a crude implementation using python3-urllib3. Maybe it's good > enough. I would be tempted to rebase the download code on urllib.request.urlretrieve, because it avoids an external dependency and it does it all in one shot: https://docs.python.org/3/library/urllib.request.html#urllib.request.urlretrieve The only thing it does not do is verifying the Last-Modified header. However, since we already have extrafiles, we could use the checksums in there to see if the files need redownloading. That would avoid the worries about parsing timestamps, too. > It also supports using a proxy, although it seems to require a bit of > code duplication. More elegant support for proxies would be nice. The standard library's urllib is supposed to look at $http_proxy and use it, according to https://docs.python.org/3/library/urllib.request.html so at least in theory proxies are a non-issue. > Ironically, there are no remaining calls to wget in mirror_wget.py. > Probably should deprecate and/or remove wget_debian_mirror and other variables that reference wget... I would do that, leaving the variable so that nothing explodes on old config files, but marking it as deprecated/ignored. I could even add an "ignored" flag to variables that makes it throw an exception if the code uses them. Would you like me to try and provide a version which uses urlretrieve? Enrico -- GPG key: 4096R/634F4BD1E7AD5568 2009-05-08 Enrico Zinisignature.asc Description: PGP signature
Bug#861198: Shutting down public FTP services
Any review, refactoring, etc. would be great! I want to keep an eye to a patch that would be acceptable to the release team for stretch at this point, so maybe only refactoring this new code for a smaller, more readable diff would be best. On 2017-04-25, Enrico Zini wrote: > simple-cdd currently depends on ftp to be able to get files in > http://ftp.de.debian.org/debian/doc/ that are needed by debian-cd. It > needs ftp because with http there is no reliable way to enumerate the > list of files in doc/ to be downloaded. Solved by the earlier patch, and pushed that to git. > That is the only reason there's still a need of the mirror_wget tool > and all the download can't just be done via python's http client > libraries. Below is a crude implementation using python3-urllib3. Maybe it's good enough. I'm not entirely sure the timestamp checking is effective; it may be downloading the whole file on the header request. Particularly terrified if the date string handling and timezones madness are good enough to reasonably handle debian mirrors in the wild... It also supports using a proxy, although it seems to require a bit of code duplication. More elegant support for proxies would be nice. Ironically, there are no remaining calls to wget in mirror_wget.py. Probably should deprecate and/or remove wget_debian_mirror and other variables that reference wget... 
diff --git a/simple_cdd/tools/mirror_wget.py b/simple_cdd/tools/mirror_wget.py index 5ccbbca..06e6537 100644 --- a/simple_cdd/tools/mirror_wget.py +++ b/simple_cdd/tools/mirror_wget.py @@ -6,6 +6,9 @@ from urllib.parse import urlparse, urljoin import os import re import logging +import urllib3 +import time +from dateutil import parser log = logging.getLogger() @@ -35,25 +38,43 @@ class ToolMirrorWget(Tool): baseurl = env.get("wget_debian_mirror") path_depth = urlparse(baseurl).path.strip("/").count("/") + 1 -def _wget_one(url, output): +def _download(url, output): if not os.path.isdir(os.path.dirname(output)): os.makedirs(os.path.dirname(output)) -args = ["wget", "--output-document="+output, "--timestamping", url] -retval = run_command("wget {}".format(url), args, logfd=logfd, env=wget_env) -if retval != 0: -raise Fail("wget exited with code %s, see %s for full output log", retval, logfilename) -# Build the environment for running reprepro -wget_env = {} -for name, val, changed in self.env.export_iter(): -wget_env[name] = str(val) +r = http.request('GET', url) +url_modified_string = r.getheader('Last-Modified') +url_modified = parser.parse(url_modified_string) +log.info(url + 'Last Modified' + url_modified_string) +if os.path.exists(output): +file_modified = os.path.getmtime(output) +log.info(output + 'Last Modified' + time.ctime(file_modified)) +if file_modified > url_modified.timestamp(): +log.info('newer file %s than url %s, skipping download', output, url) +return +y = open(output, 'wb') +y.write(r.data) +y.close() +os.utime(output, times=(time.time(), url_modified.timestamp())) + +if env.get("http_proxy"): +http = urllib3.ProxyManager( +env.get("http_proxy"), +cert_reqs='CERT_REQUIRED', +ca_certs='/etc/ssl/certs/ca-certificates.crt' +) +else: +http = urllib3.PoolManager( +cert_reqs='CERT_REQUIRED', +ca_certs='/etc/ssl/certs/ca-certificates.crt' +) if env.get("mirror_files"): # Download the checksums present in the archive "extrafiles" and verify 
extrafiles_file_inlinesig = os.path.join(env.get("MIRROR"), "extrafiles") extrafiles_file= os.path.join(env.get("simple_cdd_temp"), "extrafiles.unsigned") download_extrafiles_file = os.path.join(env.get("wget_debian_mirror"), "extrafiles") -_wget_one(download_extrafiles_file, extrafiles_file_inlinesig) +_download(download_extrafiles_file, extrafiles_file_inlinesig) self.gnupg.verify_inline_sig(extrafiles_file_inlinesig) self.gnupg.extract_inline_contents(extrafiles_file, extrafiles_file_inlinesig) @@ -81,7 +102,7 @@ class ToolMirrorWget(Tool): }) for x in ef_files: -_wget_one(x["url"], x["absname"]) +_download(x["url"], x["absname"]) extrafile_sums.verify_file(x["absname"], x["relname"])