On Thursday 22 August 2013 12:23:00 Ángel González wrote:
> Tim Ruehsen wrote:
> > Hi,
> > 
> > this patch adds a new option to Wget to follow HTTPS only when using
> > recursive mode. It includes a new test and a bit of documentation.
> > 
> > Daniel Kahn Gillmor brought this into discussion.
> > It is a very small change in Wget and IMHO brings more control to the
> > user.
> > 
> > Regards, Tim
> 
> You didn't add Test--httpsonly-r.px in the commit.

Thank you. Here is the patch with Test--httpsonly-r.px.

Regards, Tim
From 6a0960d49adbcc113949720df6ac78d61e2c4c1d Mon Sep 17 00:00:00 2001
From: Tim Ruehsen <[email protected]>
Date: Thu, 22 Aug 2013 12:28:11 +0200
Subject: [PATCH] added option --https-only

---
 doc/ChangeLog              |  4 +++
 doc/wget.texi              |  3 ++
 src/ChangeLog              |  5 +++
 src/init.c                 |  3 ++
 src/main.c                 |  3 ++
 src/options.h              |  2 +-
 src/recur.c                | 23 +++++++++-----
 tests/ChangeLog            |  6 ++++
 tests/Makefile.am          |  1 +
 tests/Test--httpsonly-r.px | 79 ++++++++++++++++++++++++++++++++++++++++++++++
 tests/run-px               |  1 +
 11 files changed, 121 insertions(+), 9 deletions(-)
 create mode 100755 tests/Test--httpsonly-r.px

diff --git a/doc/ChangeLog b/doc/ChangeLog
index bc0fb79..d283055 100644
--- a/doc/ChangeLog
+++ b/doc/ChangeLog
@@ -1,3 +1,7 @@
+2013-08-22  Tim Ruehsen  <[email protected]>
+
+	* wget.texi: added description for --https-only
+
 2013-08-13  Hrvoje Niksic  <[email protected]>
 
 	* wget.texi (Download Options): Fix misspelling.
diff --git a/doc/wget.texi b/doc/wget.texi
index ba4612d..cced7ed 100644
--- a/doc/wget.texi
+++ b/doc/wget.texi
@@ -1606,6 +1606,9 @@ buggy SSL server implementations that make it hard for OpenSSL to
 choose the correct protocol version.  Fortunately, such servers are
 quite rare.
 
+@item --https-only
+When in recursive mode, only HTTPS links are followed.
+
 @cindex SSL certificate, check
 @item --no-check-certificate
 Don't check the server certificate against the available certificate
diff --git a/src/ChangeLog b/src/ChangeLog
index edfb80f..9fb6b97 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,8 @@
+2013-08-22  Tim Ruehsen  <[email protected]>
+
+	* added new option --https-only (init.c, main.c, options.h)
+	* recur.c (download_child_p): checking for HTTPS
+
 2013-08-09  Tim Ruehsen  <[email protected]>
 
 	* gnutls.c (ssl_init): Prevent CA files from being loaded twice
diff --git a/src/init.c b/src/init.c
index 1c4432b..033da4f 100644
--- a/src/init.c
+++ b/src/init.c
@@ -194,6 +194,9 @@ static const struct {
   { "httppasswd",       &opt.http_passwd,       cmd_string }, /* deprecated */
   { "httppassword",     &opt.http_passwd,       cmd_string },
   { "httpproxy",        &opt.http_proxy,        cmd_string },
+#ifdef HAVE_SSL
+  { "httpsonly",        &opt.https_only,        cmd_boolean },
+#endif
   { "httpsproxy",       &opt.https_proxy,       cmd_string },
   { "httpuser",         &opt.http_user,         cmd_string },
   { "ignorecase",       &opt.ignore_case,       cmd_boolean },
diff --git a/src/main.c b/src/main.c
index 6b71a20..8414f5e 100644
--- a/src/main.c
+++ b/src/main.c
@@ -217,6 +217,7 @@ static struct cmdline_option option_data[] =
     { "http-passwd", 0, OPT_VALUE, "httppassword", -1 }, /* deprecated */
     { "http-password", 0, OPT_VALUE, "httppassword", -1 },
     { "http-user", 0, OPT_VALUE, "httpuser", -1 },
+    { IF_SSL ("https-only"), 0, OPT_BOOLEAN, "httpsonly", -1 },
     { "ignore-case", 0, OPT_BOOLEAN, "ignorecase", -1 },
     { "ignore-length", 0, OPT_BOOLEAN, "ignorelength", -1 },
     { "ignore-tags", 0, OPT_VALUE, "ignoretags", -1 },
@@ -636,6 +637,8 @@ HTTPS (SSL/TLS) options:\n"),
        --secure-protocol=PR     choose secure protocol, one of auto, SSLv2,\n\
                                 SSLv3, and TLSv1.\n"),
     N_("\
+       --https-only             only follow secure HTTPS links\n"),
+    N_("\
        --no-check-certificate   don't validate the server's certificate.\n"),
     N_("\
        --certificate=FILE       client certificate file.\n"),
diff --git a/src/options.h b/src/options.h
index 0a10c9b..4460c6c 100644
--- a/src/options.h
+++ b/src/options.h
@@ -215,9 +215,9 @@ struct options
   char *ca_directory;		/* CA directory (hash files) */
   char *ca_cert;		/* CA certificate file to use */
 
-
   char *random_file;		/* file with random data to seed the PRNG */
   char *egd_file;		/* file name of the egd daemon socket */
+  bool https_only;		/* whether to follow HTTPS only */
 #endif /* HAVE_SSL */
 
   bool cookies;			/* whether cookies are used. */
diff --git a/src/recur.c b/src/recur.c
index b6ba1d9..edf34d4 100644
--- a/src/recur.c
+++ b/src/recur.c
@@ -505,15 +505,16 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
     }
 
   /* Several things to check for:
-     1. if scheme is not http, and we don't load it
-     2. check for relative links (if relative_only is set)
-     3. check for domain
-     4. check for no-parent
-     5. check for excludes && includes
-     6. check for suffix
-     7. check for same host (if spanhost is unset), with possible
+     1. if scheme is not https and https_only requested
+     2. if scheme is not http, and we don't load it
+     3. check for relative links (if relative_only is set)
+     4. check for domain
+     5. check for no-parent
+     6. check for excludes && includes
+     7. check for suffix
+     8. check for same host (if spanhost is unset), with possible
      gethostbyname baggage
-     8. check for robots.txt
+     9. check for robots.txt
 
      Addendum: If the URL is FTP, and it is to be loaded, only the
      domain and suffix settings are "stronger".
@@ -525,6 +526,12 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
      More time- and memory- consuming tests should be put later on
      the list.  */
 
+  if (opt.https_only && u->scheme != SCHEME_HTTPS)
+    {
+      DEBUGP (("Not following non-HTTPS links.\n"));
+      goto out;
+    }
+
   /* Determine whether URL under consideration has a HTTP-like scheme. */
   u_scheme_like_http = schemes_are_similar_p (u->scheme, SCHEME_HTTP);
 
diff --git a/tests/ChangeLog b/tests/ChangeLog
index 8cd4864..9a58797 100644
--- a/tests/ChangeLog
+++ b/tests/ChangeLog
@@ -1,3 +1,9 @@
+2013-08-22  Tim Ruehsen <[email protected]>
+
+	* Makefile.am (EXTRA_DIST): Add Test--httpsonly-r.px.
+	* run-px (tests): Likewise.
+	* Test--httpsonly-r.px: New file.
+
 2013-03-12  Darshit Shah <[email protected]>
 
 	* Makefile.am (EXTRA_DIST): Add Test--post-file.px.
diff --git a/tests/Makefile.am b/tests/Makefile.am
index ac6a663..a494787 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -132,6 +132,7 @@ EXTRA_DIST = FTPServer.pm FTPTest.pm HTTPServer.pm HTTPTest.pm \
              Test--spider-r--no-content-disposition.px \
              Test--spider-r--no-content-disposition-trivial.px \
              Test--spider-r.px \
+             Test--httpsonly-r.px \
              run-px certs
 
 check_PROGRAMS = unit-tests
diff --git a/tests/Test--httpsonly-r.px b/tests/Test--httpsonly-r.px
new file mode 100755
index 0000000..019df1a
--- /dev/null
+++ b/tests/Test--httpsonly-r.px
@@ -0,0 +1,79 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+use HTTPTest;
+
+
+###############################################################################
+
+my $mainpage = <<EOF;
+<html>
+<head>
+  <title>Main Page</title>
+</head>
+<body>
+  <p>
+    Some text and a link to a <a href="http://localhost:{{port}}/secondpage.html">second page</a>.
+  </p>
+</body>
+</html>
+EOF
+
+my $secondpage = <<EOF;
+<html>
+<head>
+  <title>Second Page</title>
+</head>
+<body>
+  <p>
+    Anything.
+  </p>
+</body>
+</html>
+EOF
+
+# code, msg, headers, content
+my %urls = (
+    '/index.html' => {
+        code => "200",
+        msg => "Dontcare",
+        headers => {
+            "Content-type" => "text/html",
+        },
+        content => $mainpage,
+    },
+    '/secondpage.html' => {
+        code => "200",
+        msg => "Dontcare",
+        headers => {
+            "Content-type" => "text/html",
+        },
+        content => $secondpage,
+    }
+);
+
+my $cmdline = $WgetTest::WGETPATH . " --https-only -r -nH http://localhost:{{port}}/";
+
+my $expected_error_code = 0;
+
+my %expected_downloaded_files = (
+    'index.html' => {
+        content => $mainpage,
+    },
+);
+
+###############################################################################
+
+my $the_test = HTTPTest->new (name => "Test--httpsonly-r",
+                              input => \%urls,
+                              cmdline => $cmdline,
+                              errcode => $expected_error_code,
+                              output => \%expected_downloaded_files);
+print $expected_error_code."\n";
+
+exit $the_test->run();
+
+# vim: et ts=4 sw=4
+
diff --git a/tests/run-px b/tests/run-px
index 3c35d6f..14f5e7c 100755
--- a/tests/run-px
+++ b/tests/run-px
@@ -81,6 +81,7 @@ my @tests = (
     'Test--spider-r--no-content-disposition.px',
     'Test--spider-r--no-content-disposition-trivial.px',
     'Test--spider-r.px',
+    'Test--httpsonly-r.px',
 );
 
 foreach my $var (qw(SYSTEM_WGETRC WGETRC)) {
-- 
1.8.4.rc3

Reply via email to