This is the change that I'm interested in. I don't expect this to be put into the distribution without a lot of discussion.
This version changes the behavior of --recurse: If a file is downloaded, it will be scanned for links to follow. This differs from the current behavior, in which the URL from which the contents were obtained (after any redirections) is further checked to see if that URL passes the recursion limitations. This patch also includes a test to verify the new behavior. I worry that this is a substantial change of behavior. OTOH, the current behavior seems to be very unintuitive. And the fact that there is no test for this behavior suggests that people have not been depending on it. Comments? Dale
>From fe409b1447da28f2c5677ee2d8114e83a19c75f1 Mon Sep 17 00:00:00 2001 From: "Dale R. Worley" <[email protected]> Date: Tue, 23 Aug 2016 23:07:23 -0400 Subject: [PATCH] Patch to change behavior with redirects under --recurse. Add test for this change. --- src/recur.c | 55 ++------------------------------ testenv/Makefile.am | 1 + testenv/Test-recursive-redirect.py | 64 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 68 insertions(+), 52 deletions(-) create mode 100644 testenv/Test-recursive-redirect.py diff --git a/src/recur.c b/src/recur.c index 2b17e72..72059b4 100644 --- a/src/recur.c +++ b/src/recur.c @@ -191,8 +191,6 @@ typedef enum static reject_reason download_child (const struct urlpos *, struct url *, int, struct url *, struct hash_table *, struct iri *); -static reject_reason descend_redirect (const char *, struct url *, int, - struct url *, struct hash_table *, struct iri *); static void write_reject_log_header (FILE *); static void write_reject_log_reason (FILE *, reject_reason, const struct url *, const struct url *); @@ -358,19 +356,9 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi) want to follow it. */ if (descend) { - reject_reason r = descend_redirect (redirected, url_parsed, - depth, start_url_parsed, blacklist, i); - if (r == WG_RR_SUCCESS) - { - /* Make sure that the old pre-redirect form gets - blacklisted. */ - blacklist_add (blacklist, url); - } - else - { - write_reject_log_reason (rejectedlog, r, url_parsed, start_url_parsed); - descend = false; - } + /* Make sure that the old pre-redirect form gets + blacklisted. */ + blacklist_add (blacklist, url); } xfree (url); @@ -774,43 +762,6 @@ download_child (const struct urlpos *upos, struct url *parent, int depth, return reason; } -/* This function determines whether we will consider downloading the - children of a URL whose download resulted in a redirection, - possibly to another host, etc. It is needed very rarely, and thus - it is merely a simple-minded wrapper around download_child. */ - -static reject_reason -descend_redirect (const char *redirected, struct url *orig_parsed, int depth, - struct url *start_url_parsed, struct hash_table *blacklist, - struct iri *iri) -{ - struct url *new_parsed; - struct urlpos *upos; - reject_reason reason; - - assert (orig_parsed != NULL); - - new_parsed = url_parse (redirected, NULL, NULL, false); - assert (new_parsed != NULL); - - upos = xnew0 (struct urlpos); - upos->url = new_parsed; - - reason = download_child (upos, orig_parsed, depth, - start_url_parsed, blacklist, iri); - - if (reason == WG_RR_SUCCESS) - blacklist_add (blacklist, upos->url->url); - else - DEBUGP (("Redirection \"%s\" failed the test.\n", redirected)); - - url_free (new_parsed); - xfree (upos); - - return reason; -} - - /* This function writes the rejected log header. */ static void write_reject_log_header (FILE *f) diff --git a/testenv/Makefile.am b/testenv/Makefile.am index deef18e..036b91c 100644 --- a/testenv/Makefile.am +++ b/testenv/Makefile.am @@ -75,6 +75,7 @@ if HAVE_PYTHON3 Test-Post.py \ Test-recursive-basic.py \ Test-recursive-include.py \ + Test-recursive-redirect.py \ Test-redirect.py \ Test-redirect-crash.py \ Test--rejected-log.py \ diff --git a/testenv/Test-recursive-redirect.py b/testenv/Test-recursive-redirect.py new file mode 100644 index 0000000..8a114a5 --- /dev/null +++ b/testenv/Test-recursive-redirect.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +from sys import exit +from test.http_test import HTTPTest +from test.base_test import HTTP, HTTPS +from misc.wget_file import WgetFile + +""" + Basic test of --recursive. +""" +############# File Definitions ############################################### +File1 = """<html><body> +<a href=\"/a/File2.html\">text</a> +<a href=\"/b/File3.html\">text</a> +</body></html>""" +File2 = "With lemon or cream?" +File3 = "Surely you're joking Mr. Feynman" + +File1_rules = { + "Response" : 301, + "SendHeader" : {"Location" : "/b/File1.html"} +} + +File1_File = WgetFile ("a/File1.html", "", rules=File1_rules) +File1_Redirected = WgetFile ("b/File1.html", File1) +File1_Retrieved = WgetFile ("a/File1.html", File1) +File2_File = WgetFile ("a/File2.html", File2) +File3_File = WgetFile ("b/File3.html", File3) + +WGET_OPTIONS = "--recursive --no-host-directories --include-directories=a" +WGET_URLS = [["a/File1.html"]] + +Servers = [HTTP] + +Files = [[File1_Redirected, File1_File, File2_File, File3_File]] +Existing_Files = [] + +ExpectedReturnCode = 0 +ExpectedDownloadedFiles = [File1_Retrieved, File2_File] +Request_List = [["GET /a/File1.html", + "GET /a/File2.html", + "GET /b/File3.html"]] + +################ Pre and Post Test Hooks ##################################### +pre_test = { + "ServerFiles" : Files, + "LocalFiles" : Existing_Files +} +test_options = { + "WgetCommands" : WGET_OPTIONS, + "Urls" : WGET_URLS +} +post_test = { + "ExpectedFiles" : ExpectedDownloadedFiles, + "ExpectedRetcode" : ExpectedReturnCode +} + +err = HTTPTest ( + pre_hook=pre_test, + test_params=test_options, + post_hook=post_test, + protocols=Servers +).begin () + +exit (err) -- 2.10.0.rc0.17.gd63263a
