From 4fc87a1346ed2bb6ebfb63fc0894a3f590627cba Mon Sep 17 00:00:00 2001
From: John Sebastian Peterson <john.s.peterson@live.com>
Date: Tue, 6 Jan 2015 19:50:10 +0100
Subject: [PATCH 1/2] Add option to download links directly after their parent
 page when downloading recursively

downloading links directly after their parent page is closer to the browsing experience, which makes it more likely that temporary links are downloaded before they expire
---
 doc/wget.texi                     |  10 ++++
 src/init.c                        |  21 ++++++++
 src/main.c                        |   3 ++
 src/options.h                     |   4 ++
 src/recur.c                       |  29 +++++++---
 testenv/Makefile.am               |   1 +
 testenv/Test--spider-r-browser.py | 110 ++++++++++++++++++++++++++++++++++++++
 7 files changed, 171 insertions(+), 7 deletions(-)
 create mode 100755 testenv/Test--spider-r-browser.py

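Note (not part of the commit): a minimal Python sketch of the two queue
disciplines, to illustrate the intended ordering; the helper name
"enqueue" and the URLs are hypothetical.

#!/usr/bin/env python3
# Model of the two queue types (hypothetical helper, not patch code):
# FIFO appends every link; "browser" appends HTML links but prepends
# non-HTML links so they are fetched directly after their parent page.
from collections import deque

def enqueue(queue, url, html_allowed, queue_type="fifo"):
    if queue_type == "fifo" or (queue_type == "browser" and html_allowed):
        queue.append(url)       # tail, like queue->tail in recur.c
    else:
        queue.appendleft(url)   # head, fetched next

q = deque()
for url, is_html in [("a.html", True), ("tmp-link", False), ("b.html", True)]:
    enqueue(q, url, is_html, queue_type="browser")
print(list(q))  # ['tmp-link', 'a.html', 'b.html'] -- tmp-link is fetched first
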
diff --git a/doc/wget.texi b/doc/wget.texi
index d9ed17d..2fb1e1e 100644
--- a/doc/wget.texi
+++ b/doc/wget.texi
@@ -1916,6 +1916,10 @@ case.
 Turn on recursive retrieving.  @xref{Recursive Download}, for more
 details.  The default maximum depth is 5.
 
+@item --queue-type=@var{queuetype}
+Specify the queue type (@pxref{Recursive Download}).  Accepted values
+are @samp{fifo} (the default) and @samp{browser}.
+
 @item -l @var{depth}
 @itemx --level=@var{depth}
 Specify recursion maximum depth level @var{depth} (@pxref{Recursive
@@ -2296,6 +2300,12 @@ documents linked by them, and so on.  In other words, Wget first
 downloads the documents at depth 1, then those at depth 2, and so on
 until the specified maximum depth.
 
+The @dfn{queue type} is FIFO (the default) or browser.  FIFO retrieves
+links in the order they were enqueued.  Browser retrieves links directly
+after their parent page.  When the parent page contains temporary links,
+this can prevent them from expiring before they are downloaded; pages
+sometimes use temporary links to discourage direct linking to files.
+
 The maximum @dfn{depth} to which the retrieval may descend is specified
 with the @samp{-l} option.  The default maximum depth is five layers.
 
diff --git a/src/init.c b/src/init.c
index 569b25b..ad72441 100644
--- a/src/init.c
+++ b/src/init.c
@@ -104,6 +104,7 @@ CMD_DECLARE (cmd_spec_htmlify);
 CMD_DECLARE (cmd_spec_mirror);
 CMD_DECLARE (cmd_spec_prefer_family);
 CMD_DECLARE (cmd_spec_progress);
+CMD_DECLARE (cmd_spec_queue_type);
 CMD_DECLARE (cmd_spec_recursive);
 CMD_DECLARE (cmd_spec_regex_type);
 CMD_DECLARE (cmd_spec_restrict_file_names);
@@ -247,6 +248,7 @@ static const struct {
   { "proxypasswd",      &opt.proxy_passwd,      cmd_string }, /* deprecated */
   { "proxypassword",    &opt.proxy_passwd,      cmd_string },
   { "proxyuser",        &opt.proxy_user,        cmd_string },
+  { "queuetype",        &opt.queue_type,        cmd_spec_queue_type },
   { "quiet",            &opt.quiet,             cmd_boolean },
   { "quota",            &opt.quota,             cmd_bytes_sum },
 #ifdef HAVE_SSL
@@ -403,6 +405,8 @@ defaults (void)
   opt.restrict_files_nonascii = false;
   opt.restrict_files_case = restrict_no_case_restriction;
 
+  opt.queue_type = queue_type_fifo;
+
   opt.regex_type = regex_type_posix;
 
   opt.max_redirect = 20;
@@ -1441,6 +1445,23 @@ cmd_spec_recursive (const char *com, const char *val, void *place_ignored _GL_UN
   return true;
 }
 
+/* Validate --queue-type and set the choice.  */
+
+static bool
+cmd_spec_queue_type (const char *com, const char *val, void *place_ignored _GL_UNUSED)
+{
+  static const struct decode_item choices[] = {
+    { "fifo", queue_type_fifo },
+    { "browser", queue_type_browser },
+  };
+  int queue_type = queue_type_fifo;
+  int ok = decode_string (val, choices, countof (choices), &queue_type);
+  if (!ok)
+    fprintf (stderr, _("%s: %s: Invalid value %s.\n"), exec_name, com, quote (val));
+  opt.queue_type = queue_type;
+  return ok;
+}
+
 /* Validate --regex-type and set the choice.  */
 
 static bool
diff --git a/src/main.c b/src/main.c
index 6feb140..048ffeb 100644
--- a/src/main.c
+++ b/src/main.c
@@ -272,6 +272,7 @@ static struct cmdline_option option_data[] =
     { "proxy-passwd", 0, OPT_VALUE, "proxypassword", -1 }, /* deprecated */
     { "proxy-password", 0, OPT_VALUE, "proxypassword", -1 },
     { "proxy-user", 0, OPT_VALUE, "proxyuser", -1 },
+    { "queue-type", 0, OPT_VALUE, "queuetype", -1 },
     { "quiet", 'q', OPT_BOOLEAN, "quiet", -1 },
     { "quota", 'Q', OPT_VALUE, "quota", -1 },
     { "random-file", 0, OPT_VALUE, "randomfile", -1 },
@@ -737,6 +738,8 @@ Recursive download:\n"),
     N_("\
   -r,  --recursive                 specify recursive download\n"),
     N_("\
       --queue-type=TYPE           queue type (fifo or browser)\n"),
+    N_("\
   -l,  --level=NUMBER              maximum recursion depth (inf or 0 for infinite)\n"),
     N_("\
        --delete-after              delete files locally after downloading them\n"),
diff --git a/src/options.h b/src/options.h
index b995126..5a4435d 100644
--- a/src/options.h
+++ b/src/options.h
@@ -46,6 +46,10 @@ struct options
   bool relative_only;           /* Follow only relative links. */
   bool no_parent;               /* Restrict access to the parent
                                    directory.  */
+  enum {
+    queue_type_fifo,
+    queue_type_browser
+  } queue_type;                 /* Recursion queue type */
   int reclevel;                 /* Maximum level of recursion */
   bool dirstruct;               /* Do we build the directory structure
                                    as we go along? */
diff --git a/src/recur.c b/src/recur.c
index b6b9dc6..0c06f14 100644
--- a/src/recur.c
+++ b/src/recur.c
@@ -90,13 +90,17 @@ url_queue_delete (struct url_queue *queue)
 
 /* Enqueue a URL in the queue.  The queue is FIFO: the items will be
    retrieved ("dequeued") from the queue in the order they were placed
-   into it.  */
+   into it.  With the browser queue type, non-HTML items are instead
+   retrieved directly after their parent page.  */
 
 static void
 url_enqueue (struct url_queue *queue, struct iri *i,
              const char *url, const char *referer, int depth,
              bool html_allowed, bool css_allowed)
 {
+  bool append = (opt.queue_type == queue_type_fifo
+                 || (opt.queue_type == queue_type_browser && html_allowed));
+
   struct queue_element *qel = xnew (struct queue_element);
   qel->iri = i;
   qel->url = url;
@@ -110,20 +114,31 @@ url_enqueue (struct url_queue *queue, struct iri *i,
   if (queue->count > queue->maxcount)
     queue->maxcount = queue->count;
 
-  DEBUGP (("Enqueuing %s at depth %d\n",
+  DEBUGP (("%s %s at depth %d\n", append ? "Appending" : "Prepending",
            quotearg_n_style (0, escape_quoting_style, url), depth));
   DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
 
   if (i)
-    DEBUGP (("[IRI Enqueuing %s with %s\n", quote_n (0, url),
-             i->uri_encoding ? quote_n (1, i->uri_encoding) : "None"));
+    DEBUGP (("[IRI %s %s with %s\n", append ? "Appending" : "Prepending",
+            quote_n (0, url), i->uri_encoding ? quote_n (1, i->uri_encoding) : "None"));
 
-  if (queue->tail)
-    queue->tail->next = qel;
-  queue->tail = qel;
+  if (append)
+    {
+      if (queue->tail)
+        queue->tail->next = qel;
+      queue->tail = qel;
+    }
+  else
+    {
+      if (queue->head)
+        qel->next = queue->head;
+      queue->head = qel;
+    }
 
   if (!queue->head)
     queue->head = queue->tail;
+  if (!queue->tail)
+    queue->tail = queue->head;
 }
 
 /* Take a URL out of the queue.  Return true if this operation
diff --git a/testenv/Makefile.am b/testenv/Makefile.am
index 39ea76c..2c3b774 100644
--- a/testenv/Makefile.am
+++ b/testenv/Makefile.am
@@ -52,6 +52,7 @@ if HAVE_PYTHON3
     Test-Post.py                            \
     Test-504.py                             \
     Test--spider-r.py                       \
+    Test--spider-r-browser.py               \
     Test-redirect-crash.py
 
   # added test cases expected to fail here and under TESTS
diff --git a/testenv/Test--spider-r-browser.py b/testenv/Test--spider-r-browser.py
new file mode 100755
index 0000000..9c9fd4e
--- /dev/null
+++ b/testenv/Test--spider-r-browser.py
@@ -0,0 +1,110 @@
+#!/usr/bin/env python3
+from sys import exit
+from test.http_test import HTTPTest
+from misc.wget_file import WgetFile
+
+"""
+    This test executes Wget in spider mode with recursive retrieval and the browser queue type.
+"""
+TEST_NAME = "Recursive Spider (browser queue)"
+############# File Definitions ###############################################
+mainpage = """
+<html>
+<head>
+  <title>Main Page</title>
+</head>
+<body>
+  <p>
+    Some text and a link to a <a href="http://127.0.0.1:{{port}}/secondpage.html">second page</a>.
+    Also, an image <img src="http://127.0.0.1:{{port}}/image.svg">
+    Also, a <a href="http://127.0.0.1:{{port}}/nonexistent">broken link</a>.
+  </p>
+</body>
+</html>
+"""
+
+image = "Don't care."
+
+secondpage = """
+<html>
+<head>
+  <title>Second Page</title>
+</head>
+<body>
+  <p>
+    Some text and a link to a <a href="http://127.0.0.1:{{port}}/thirdpage.html">third page</a>.
+    Also, a <a href="http://127.0.0.1:{{port}}/nonexistent">broken link</a>.
+  </p>
+</body>
+</html>
+"""
+
+thirdpage = """
+<html>
+<head>
+  <title>Third Page</title>
+</head>
+<body>
+  <p>
+    Some text and a link to a <a href="http://127.0.0.1:{{port}}/dummy.txt">text file</a>.
+    Also, another <a href="http://127.0.0.1:{{port}}/againnonexistent">broken link</a>.
+  </p>
+</body>
+</html>
+"""
+
+dummyfile = "Don't care."
+
+
+index_html = WgetFile ("index.html", mainpage)
+image_svg = WgetFile ("image.svg", image)
+secondpage_html = WgetFile ("secondpage.html", secondpage)
+thirdpage_html = WgetFile ("thirdpage.html", thirdpage)
+dummy_txt = WgetFile ("dummy.txt", dummyfile)
+
+Request_List = [
+    [
+        "HEAD /",
+        "GET /",
+        "GET /robots.txt",
+        "HEAD /image.svg",
+        "HEAD /secondpage.html",
+        "GET /secondpage.html",
+        "HEAD /nonexistent",
+        "HEAD /thirdpage.html",
+        "GET /thirdpage.html",
+        "HEAD /dummy.txt",
+        "HEAD /againnonexistent"
+    ]
+]
+
+WGET_OPTIONS = "--spider -r --queue-type=browser"
+WGET_URLS = [[""]]
+
+Files = [[index_html, image_svg, secondpage_html, thirdpage_html, dummy_txt]]
+
+ExpectedReturnCode = 8
+ExpectedDownloadedFiles = []
+
+################ Pre and Post Test Hooks #####################################
+pre_test = {
+    "ServerFiles"       : Files
+}
+test_options = {
+    "WgetCommands"      : WGET_OPTIONS,
+    "Urls"              : WGET_URLS
+}
+post_test = {
+    "ExpectedFiles"     : ExpectedDownloadedFiles,
+    "ExpectedRetcode"   : ExpectedReturnCode,
+    "FilesCrawled"      : Request_List
+}
+
+err = HTTPTest (
+                name=TEST_NAME,
+                pre_hook=pre_test,
+                test_params=test_options,
+                post_hook=post_test
+).begin ()
+
+exit (err)

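Note (not part of the commit): the Request_List above encodes the
browser-queue ordering -- HEAD /image.svg is expected before
secondpage.html because the non-HTML image is prepended to the queue.
A hand simulation in Python, assuming the queue discipline from the
recur.c hunk (the link table below just restates the test pages):

from collections import deque

links = {  # page -> [(url, is_html), ...], as in the test's HTML files
    "/": [("/secondpage.html", True), ("/image.svg", False), ("/nonexistent", True)],
    "/secondpage.html": [("/thirdpage.html", True), ("/nonexistent", True)],
    "/thirdpage.html": [("/dummy.txt", False), ("/againnonexistent", True)],
}

queue, seen, order = deque(["/"]), {"/"}, []
while queue:
    url = queue.popleft()
    order.append(url)
    for child, is_html in links.get(url, []):
        if child not in seen:
            seen.add(child)
            if is_html:
                queue.append(child)      # HTML: back of the queue
            else:
                queue.appendleft(child)  # non-HTML: fetched next

print(order)
# ['/', '/image.svg', '/secondpage.html', '/nonexistent',
#  '/thirdpage.html', '/dummy.txt', '/againnonexistent']
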
From 5cb85cfd41e085b1a9af457af7c87477a5bd6a51 Mon Sep 17 00:00:00 2001
From: John Sebastian Peterson <john.s.peterson@live.com>
Date: Thu, 29 Jan 2015 13:59:02 +0100
Subject: [PATCH 2/2] Fix FilesCrawled test to consider the element order

it's supposed to test the element order, but symmetric_difference ignores it
---
 testenv/conf/files_crawled.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

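Note (not part of the commit): a short illustration of why the old
check passes order-swapped request lists; the header strings are made up.

expected = ["HEAD /", "HEAD /image.svg", "HEAD /secondpage.html"]
got      = ["HEAD /", "HEAD /secondpage.html", "HEAD /image.svg"]

print(set(expected).symmetric_difference(set(got)))  # set() -> old check passes
print(expected != got)                               # True  -> new check fails it
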
diff --git a/testenv/conf/files_crawled.py b/testenv/conf/files_crawled.py
index 334e596..d471048 100644
--- a/testenv/conf/files_crawled.py
+++ b/testenv/conf/files_crawled.py
@@ -1,4 +1,4 @@
-from misc.colour_terminal import print_red
+from misc.colour_terminal import print_green, print_red
 from conf import hook
 from exc.test_failed import TestFailed
 
@@ -18,10 +18,8 @@ def __init__(self, request_headers):
         self.request_headers = request_headers
 
     def __call__(self, test_obj):
-        for headers, remaining in zip(map(set, self.request_headers),
-                                      test_obj.request_remaining()):
-            diff = headers.symmetric_difference(remaining)
+        if self.request_headers != test_obj.request_remaining():
+            print_green ('Expected: %s' % self.request_headers)
+            print_red ('Got: %s' % test_obj.request_remaining())
 
-            if diff:
-                print_red (str(diff))
-                raise TestFailed('Not all files were crawled correctly.')
+            raise TestFailed('Not all files were crawled correctly.')
