Hi,

this patch adds a new option to Wget, --https-only, which makes it follow only
HTTPS links in recursive mode. It includes a new test and a bit of documentation.

Daniel Kahn Gillmor brought this up for discussion.
It is a very small change to Wget and IMHO gives the user more control.

Regards, Tim
From ce186a5ea664a951faf3ebada399db2dc79d61e6 Mon Sep 17 00:00:00 2001
From: Tim Ruehsen <[email protected]>
Date: Thu, 22 Aug 2013 09:51:23 +0200
Subject: [PATCH] added option --https-only

---
 doc/ChangeLog     |  4 ++++
 doc/wget.texi     |  3 +++
 src/ChangeLog     |  5 +++++
 src/init.c        |  3 +++
 src/main.c        |  3 +++
 src/options.h     |  2 +-
 src/recur.c       | 23 +++++++++++++++--------
 tests/ChangeLog   |  6 ++++++
 tests/Makefile.am |  1 +
 tests/run-px      |  1 +
 10 files changed, 42 insertions(+), 9 deletions(-)

diff --git a/doc/ChangeLog b/doc/ChangeLog
index bc0fb79..d283055 100644
--- a/doc/ChangeLog
+++ b/doc/ChangeLog
@@ -1,3 +1,7 @@
+2013-08-22  Tim Ruehsen  <[email protected]>
+
+	* wget.texi: added description for --https-only
+
 2013-08-13  Hrvoje Niksic  <[email protected]>
 
 	* wget.texi (Download Options): Fix misspelling.
diff --git a/doc/wget.texi b/doc/wget.texi
index ba4612d..cced7ed 100644
--- a/doc/wget.texi
+++ b/doc/wget.texi
@@ -1606,6 +1606,9 @@ buggy SSL server implementations that make it hard for OpenSSL to
 choose the correct protocol version.  Fortunately, such servers are
 quite rare.
 
+@item --https-only
+When in recursive mode, only HTTPS links are followed.
+
 @cindex SSL certificate, check
 @item --no-check-certificate
 Don't check the server certificate against the available certificate
diff --git a/src/ChangeLog b/src/ChangeLog
index edfb80f..9fb6b97 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,8 @@
+2013-08-22  Tim Ruehsen  <[email protected]>
+
+	* added new option --https-only (init.c, main.c, options.h)
+	* recur.c (download_child_p): checking for HTTPS
+
 2013-08-09  Tim Ruehsen  <[email protected]>
 
 	* gnutls.c (ssl_init): Prevent CA files from being loaded twice
diff --git a/src/init.c b/src/init.c
index 1c4432b..033da4f 100644
--- a/src/init.c
+++ b/src/init.c
@@ -194,6 +194,9 @@ static const struct {
   { "httppasswd",       &opt.http_passwd,       cmd_string }, /* deprecated */
   { "httppassword",     &opt.http_passwd,       cmd_string },
   { "httpproxy",        &opt.http_proxy,        cmd_string },
+#ifdef HAVE_SSL
+  { "httpsonly",        &opt.https_only,        cmd_boolean },
+#endif
   { "httpsproxy",       &opt.https_proxy,       cmd_string },
   { "httpuser",         &opt.http_user,         cmd_string },
   { "ignorecase",       &opt.ignore_case,       cmd_boolean },
diff --git a/src/main.c b/src/main.c
index 6b71a20..8414f5e 100644
--- a/src/main.c
+++ b/src/main.c
@@ -217,6 +217,7 @@ static struct cmdline_option option_data[] =
     { "http-passwd", 0, OPT_VALUE, "httppassword", -1 }, /* deprecated */
     { "http-password", 0, OPT_VALUE, "httppassword", -1 },
     { "http-user", 0, OPT_VALUE, "httpuser", -1 },
+    { IF_SSL ("https-only"), 0, OPT_BOOLEAN, "httpsonly", -1 },
     { "ignore-case", 0, OPT_BOOLEAN, "ignorecase", -1 },
     { "ignore-length", 0, OPT_BOOLEAN, "ignorelength", -1 },
     { "ignore-tags", 0, OPT_VALUE, "ignoretags", -1 },
@@ -636,6 +637,8 @@ HTTPS (SSL/TLS) options:\n"),
        --secure-protocol=PR     choose secure protocol, one of auto, SSLv2,\n\
                                 SSLv3, and TLSv1.\n"),
     N_("\
+       --https-only             only follow secure HTTPS links\n"),
+    N_("\
        --no-check-certificate   don't validate the server's certificate.\n"),
     N_("\
        --certificate=FILE       client certificate file.\n"),
diff --git a/src/options.h b/src/options.h
index 0a10c9b..4460c6c 100644
--- a/src/options.h
+++ b/src/options.h
@@ -215,9 +215,9 @@ struct options
   char *ca_directory;		/* CA directory (hash files) */
   char *ca_cert;		/* CA certificate file to use */
 
-
   char *random_file;		/* file with random data to seed the PRNG */
   char *egd_file;		/* file name of the egd daemon socket */
+  bool https_only;		/* whether to follow HTTPS only */
 #endif /* HAVE_SSL */
 
   bool cookies;			/* whether cookies are used. */
diff --git a/src/recur.c b/src/recur.c
index b6ba1d9..edf34d4 100644
--- a/src/recur.c
+++ b/src/recur.c
@@ -505,15 +505,16 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
     }
 
   /* Several things to check for:
-     1. if scheme is not http, and we don't load it
-     2. check for relative links (if relative_only is set)
-     3. check for domain
-     4. check for no-parent
-     5. check for excludes && includes
-     6. check for suffix
-     7. check for same host (if spanhost is unset), with possible
+     1. if scheme is not https and https_only requested
+     2. if scheme is not http, and we don't load it
+     3. check for relative links (if relative_only is set)
+     4. check for domain
+     5. check for no-parent
+     6. check for excludes && includes
+     7. check for suffix
+     8. check for same host (if spanhost is unset), with possible
      gethostbyname baggage
-     8. check for robots.txt
+     9. check for robots.txt
 
      Addendum: If the URL is FTP, and it is to be loaded, only the
      domain and suffix settings are "stronger".
@@ -525,6 +526,12 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
      More time- and memory- consuming tests should be put later on
      the list.  */
 
+  if (opt.https_only && u->scheme != SCHEME_HTTPS)
+    {
+      DEBUGP (("Not following non-HTTPS links.\n"));
+      goto out;
+    }
+
   /* Determine whether URL under consideration has a HTTP-like scheme. */
   u_scheme_like_http = schemes_are_similar_p (u->scheme, SCHEME_HTTP);
 
diff --git a/tests/ChangeLog b/tests/ChangeLog
index 8cd4864..9a58797 100644
--- a/tests/ChangeLog
+++ b/tests/ChangeLog
@@ -1,3 +1,9 @@
+2013-08-22  Tim Ruehsen <[email protected]>
+
+	* Makefile.am (EXTRA_DIST): Add Test--httpsonly-r.px.
+	* run-px (tests): Likewise.
+	* Test--httpsonly-r.px: New file.
+
 2013-03-12  Darshit Shah <[email protected]>
 
 	* Makefile.am (EXTRA_DIST): Add Test--post-file.px.
diff --git a/tests/Makefile.am b/tests/Makefile.am
index ac6a663..a494787 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -132,6 +132,7 @@ EXTRA_DIST = FTPServer.pm FTPTest.pm HTTPServer.pm HTTPTest.pm \
              Test--spider-r--no-content-disposition.px \
              Test--spider-r--no-content-disposition-trivial.px \
              Test--spider-r.px \
+             Test--httpsonly-r.px \
              run-px certs
 
 check_PROGRAMS = unit-tests
diff --git a/tests/run-px b/tests/run-px
index 3c35d6f..14f5e7c 100755
--- a/tests/run-px
+++ b/tests/run-px
@@ -81,6 +81,7 @@ my @tests = (
     'Test--spider-r--no-content-disposition.px',
     'Test--spider-r--no-content-disposition-trivial.px',
     'Test--spider-r.px',
+    'Test--httpsonly-r.px',
 );
 
 foreach my $var (qw(SYSTEM_WGETRC WGETRC)) {
-- 
1.8.4.rc3

Reply via email to