On Thursday 22 August 2013 12:23:00 Ángel González wrote:
> Tim Ruehsen wrote:
> > Hi,
> >
> > this patch adds a new option to Wget to follow HTTPS only when using
> > recursive mode. It includes a new test and a bit of documentation.
> >
> > Daniel Kahn Gillmor brought this into discussion.
> > It is a very small change in Wget and IMHO brings more control to the
> > user.
> >
> > Regards, Tim
>
> You didn't add Test--httpsonly-r.px in the commit.
Thank you. Here is the patch with Test--httpsonly-r.px.
Regards, Tim
From 6a0960d49adbcc113949720df6ac78d61e2c4c1d Mon Sep 17 00:00:00 2001
From: Tim Ruehsen <[email protected]>
Date: Thu, 22 Aug 2013 12:28:11 +0200
Subject: [PATCH] added option --https-only
---
doc/ChangeLog | 4 +++
doc/wget.texi | 3 ++
src/ChangeLog | 5 +++
src/init.c | 3 ++
src/main.c | 3 ++
src/options.h | 2 +-
src/recur.c | 23 +++++++++-----
tests/ChangeLog | 6 ++++
tests/Makefile.am | 1 +
tests/Test--httpsonly-r.px | 79 ++++++++++++++++++++++++++++++++++++++++++++++
tests/run-px | 1 +
11 files changed, 121 insertions(+), 9 deletions(-)
create mode 100755 tests/Test--httpsonly-r.px
diff --git a/doc/ChangeLog b/doc/ChangeLog
index bc0fb79..d283055 100644
--- a/doc/ChangeLog
+++ b/doc/ChangeLog
@@ -1,3 +1,7 @@
+2013-08-22 Tim Ruehsen <[email protected]>
+
+ * wget.texi: Add description for --https-only.
+
2013-08-13 Hrvoje Niksic <[email protected]>
* wget.texi (Download Options): Fix misspelling.
diff --git a/doc/wget.texi b/doc/wget.texi
index ba4612d..cced7ed 100644
--- a/doc/wget.texi
+++ b/doc/wget.texi
@@ -1606,6 +1606,9 @@ buggy SSL server implementations that make it hard for OpenSSL to
choose the correct protocol version. Fortunately, such servers are
quite rare.
+@item --https-only
+When in recursive mode, only HTTPS links are followed.
+
@cindex SSL certificate, check
@item --no-check-certificate
Don't check the server certificate against the available certificate
diff --git a/src/ChangeLog b/src/ChangeLog
index edfb80f..9fb6b97 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,8 @@
+2013-08-22 Tim Ruehsen <[email protected]>
+
+ * init.c, main.c, options.h: Add new option --https-only.
+ * recur.c (download_child_p): Skip non-HTTPS links when --https-only is set.
+
2013-08-09 Tim Ruehsen <[email protected]>
* gnutls.c (ssl_init): Prevent CA files from being loaded twice
diff --git a/src/init.c b/src/init.c
index 1c4432b..033da4f 100644
--- a/src/init.c
+++ b/src/init.c
@@ -194,6 +194,9 @@ static const struct {
{ "httppasswd", &opt.http_passwd, cmd_string }, /* deprecated */
{ "httppassword", &opt.http_passwd, cmd_string },
{ "httpproxy", &opt.http_proxy, cmd_string },
+#ifdef HAVE_SSL
+ { "httpsonly", &opt.https_only, cmd_boolean },
+#endif
{ "httpsproxy", &opt.https_proxy, cmd_string },
{ "httpuser", &opt.http_user, cmd_string },
{ "ignorecase", &opt.ignore_case, cmd_boolean },
diff --git a/src/main.c b/src/main.c
index 6b71a20..8414f5e 100644
--- a/src/main.c
+++ b/src/main.c
@@ -217,6 +217,7 @@ static struct cmdline_option option_data[] =
{ "http-passwd", 0, OPT_VALUE, "httppassword", -1 }, /* deprecated */
{ "http-password", 0, OPT_VALUE, "httppassword", -1 },
{ "http-user", 0, OPT_VALUE, "httpuser", -1 },
+ { IF_SSL ("https-only"), 0, OPT_BOOLEAN, "httpsonly", -1 },
{ "ignore-case", 0, OPT_BOOLEAN, "ignorecase", -1 },
{ "ignore-length", 0, OPT_BOOLEAN, "ignorelength", -1 },
{ "ignore-tags", 0, OPT_VALUE, "ignoretags", -1 },
@@ -636,6 +637,8 @@ HTTPS (SSL/TLS) options:\n"),
--secure-protocol=PR choose secure protocol, one of auto, SSLv2,\n\
SSLv3, and TLSv1.\n"),
N_("\
+ --https-only only follow secure HTTPS links\n"),
+ N_("\
--no-check-certificate don't validate the server's certificate.\n"),
N_("\
--certificate=FILE client certificate file.\n"),
diff --git a/src/options.h b/src/options.h
index 0a10c9b..4460c6c 100644
--- a/src/options.h
+++ b/src/options.h
@@ -215,9 +215,9 @@ struct options
char *ca_directory; /* CA directory (hash files) */
char *ca_cert; /* CA certificate file to use */
-
char *random_file; /* file with random data to seed the PRNG */
char *egd_file; /* file name of the egd daemon socket */
+ bool https_only; /* whether to follow HTTPS only */
#endif /* HAVE_SSL */
bool cookies; /* whether cookies are used. */
diff --git a/src/recur.c b/src/recur.c
index b6ba1d9..edf34d4 100644
--- a/src/recur.c
+++ b/src/recur.c
@@ -505,15 +505,16 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
}
/* Several things to check for:
- 1. if scheme is not http, and we don't load it
- 2. check for relative links (if relative_only is set)
- 3. check for domain
- 4. check for no-parent
- 5. check for excludes && includes
- 6. check for suffix
- 7. check for same host (if spanhost is unset), with possible
+ 1. if scheme is not https and https_only requested
+ 2. if scheme is not http, and we don't load it
+ 3. check for relative links (if relative_only is set)
+ 4. check for domain
+ 5. check for no-parent
+ 6. check for excludes && includes
+ 7. check for suffix
+ 8. check for same host (if spanhost is unset), with possible
gethostbyname baggage
- 8. check for robots.txt
+ 9. check for robots.txt
Addendum: If the URL is FTP, and it is to be loaded, only the
domain and suffix settings are "stronger".
@@ -525,6 +526,12 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
More time- and memory- consuming tests should be put later on
the list. */
+ if (opt.https_only && u->scheme != SCHEME_HTTPS)
+ {
+ DEBUGP (("Not following non-HTTPS links.\n"));
+ goto out;
+ }
+
/* Determine whether URL under consideration has a HTTP-like scheme. */
u_scheme_like_http = schemes_are_similar_p (u->scheme, SCHEME_HTTP);
diff --git a/tests/ChangeLog b/tests/ChangeLog
index 8cd4864..9a58797 100644
--- a/tests/ChangeLog
+++ b/tests/ChangeLog
@@ -1,3 +1,9 @@
+2013-08-22 Tim Ruehsen <[email protected]>
+
+ * Makefile.am (EXTRA_DIST): Add Test--httpsonly-r.px.
+ * run-px (tests): Likewise.
+ * Test--httpsonly-r.px: New file.
+
2013-03-12 Darshit Shah <[email protected]>
* Makefile.am (EXTRA_DIST): Add Test--post-file.px.
diff --git a/tests/Makefile.am b/tests/Makefile.am
index ac6a663..a494787 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -132,6 +132,7 @@ EXTRA_DIST = FTPServer.pm FTPTest.pm HTTPServer.pm HTTPTest.pm \
Test--spider-r--no-content-disposition.px \
Test--spider-r--no-content-disposition-trivial.px \
Test--spider-r.px \
+ Test--httpsonly-r.px \
run-px certs
check_PROGRAMS = unit-tests
diff --git a/tests/Test--httpsonly-r.px b/tests/Test--httpsonly-r.px
new file mode 100755
index 0000000..019df1a
--- /dev/null
+++ b/tests/Test--httpsonly-r.px
@@ -0,0 +1,79 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+use HTTPTest;
+
+# Verify that `wget --https-only -r` saves the start page but does not follow its http:// link.
+###############################################################################
+
+my $mainpage = <<EOF;
+<html>
+<head>
+ <title>Main Page</title>
+</head>
+<body>
+ <p>
+ Some text and a link to a <a href="http://localhost:{{port}}/secondpage.html">second page</a>.
+ </p>
+</body>
+</html>
+EOF
+
+my $secondpage = <<EOF;
+<html>
+<head>
+ <title>Second Page</title>
+</head>
+<body>
+ <p>
+ Anything.
+ </p>
+</body>
+</html>
+EOF
+
+# URL map handed to HTTPTest as input; each path gives code, msg, headers, content.
+my %urls = (
+ '/index.html' => {
+ code => "200",
+ msg => "Dontcare",
+ headers => {
+ "Content-type" => "text/html",
+ },
+ content => $mainpage,
+ },
+ '/secondpage.html' => {
+ code => "200",
+ msg => "Dontcare",
+ headers => {
+ "Content-type" => "text/html",
+ },
+ content => $secondpage,
+ }
+);
+# NOTE(review): {{port}} is a placeholder, presumably substituted by the test harness -- confirm in HTTPTest.
+my $cmdline = $WgetTest::WGETPATH . " --https-only -r -nH http://localhost:{{port}}/";
+
+my $expected_error_code = 0;
+# Only index.html may be saved: secondpage.html is linked via http:// and must be skipped.
+my %expected_downloaded_files = (
+ 'index.html' => {
+ content => $mainpage,
+ },
+);
+
+###############################################################################
+
+my $the_test = HTTPTest->new (name => "Test--httpsonly-r",
+ input => \%urls,
+ cmdline => $cmdline,
+ errcode => $expected_error_code,
+ output => \%expected_downloaded_files);
+print $expected_error_code."\n";
+
+exit $the_test->run();
+
+# vim: et ts=4 sw=4
+
diff --git a/tests/run-px b/tests/run-px
index 3c35d6f..14f5e7c 100755
--- a/tests/run-px
+++ b/tests/run-px
@@ -81,6 +81,7 @@ my @tests = (
'Test--spider-r--no-content-disposition.px',
'Test--spider-r--no-content-disposition-trivial.px',
'Test--spider-r.px',
+ 'Test--httpsonly-r.px',
);
foreach my $var (qw(SYSTEM_WGETRC WGETRC)) {
--
1.8.4.rc3