[Bug-wget] Filtering for requisites and redirections

Dale R. Worley Thu, 13 Oct 2016 15:29:38 -0700

If --page-requisites is specified along with --no-parent, then requisite
files will be downloaded even if their URLs would normally be suppressed
by --no-parent.  This is implemented by a test in section 4 of
download_child in recur.c, and a flag in struct urlpos, link_inline_p,
which says that the *context* of that URL is as a page requisite.


This suggests that the exceptional processing we want to implement for
redirections might be more systematically implemented by using the above
processing as a model, and not by testing the value returned by
download_child.  This involves adding a flag link_redirect_p to struct
urlpos; this flag functions as an alternative to the additional argument
to download_child that I previously suggested.

In addition, this approach avoids the problem of ensuring that
download_child returns the correct value if a URL fails more than one
test, e.g., --accept-regex and robots, because any tests that are to be
ignored in the context are not executed and do not affect the return
value.

It also suggests that we may want to define that --no-parent does not
apply to redirections, in the same way that it does not apply to page
requisites when --page-requisites is set.

I've also updated the TEXI file to describe the functional changes, and
also the previously-undocumented behavior of --page-requisites
overriding --no-parent.  The changes are in the attached diff.

However, looking at the documentation for --no-parent:

       -np
       --no-parent
           Do not ever ascend to the parent directory when retrieving
           recursively.  This is a useful option, since it guarantees that
           only the files below a certain hierarchy will be downloaded.

           Note that the effect of --no-parent is suppressed for fetching
           redirected URLs and for fetching page requisite URLs if
           --page-requisites is specified.

Perhaps we do not want to have --no-parent suppressed by
--page-requisites.  It seems that --no-parent is intended as a security
measure, and the existing code (as well as this proposal) violates its
fundamental premise.

Dale

diff --git a/doc/wget.texi b/doc/wget.texi
index f42773e..2990408 100644
--- a/doc/wget.texi
+++ b/doc/wget.texi
@@ -2357,6 +2357,11 @@ your shell from expanding it, like in @samp{-A "*.mp3"} or @samp{-A '*.mp3'}.
 @itemx --reject-regex @var{urlregex}
 Specify a regular expression to accept or reject the complete URL.
 
+@strong{Note} that the effect of @samp{--accept-regex} and
+@samp{--reject-regex} is suppressed for
+fetching redirected URLs and for fetching page requisite URLs if
+@samp{--page-requisites} is specified.
+
 @item --regex-type @var{regextype}
 Specify the regular expression type.  Possible types are @samp{posix} or
 @samp{pcre}.  Note that to be able to use @samp{pcre} type, wget has to be
@@ -2437,12 +2442,21 @@ Specify a comma-separated list of directories you wish to exclude from
 download (@pxref{Directory-Based Limits}).  Elements of
 @var{list} may contain wildcards.
 
+@strong{Note} that the effect of @samp{--include-directories} and
+@samp{--exclude-directories} is suppressed for
+fetching redirected URLs and for fetching page requisite URLs if
+@samp{--page-requisites} is specified.
+
 @item -np
 @item --no-parent
 Do not ever ascend to the parent directory when retrieving recursively.
 This is a useful option, since it guarantees that only the files
 @emph{below} a certain hierarchy will be downloaded.
 @xref{Directory-Based Limits}, for more details.
+
+@strong{Note} that the effect of @samp{--no-parent} is suppressed for
+fetching redirected URLs and for fetching page requisite URLs if
+@samp{--page-requisites} is specified.
 @end table
 
 @c man end
diff --git a/src/convert.h b/src/convert.h
index e3ff6f0..af0ab79 100644
--- a/src/convert.h
+++ b/src/convert.h
@@ -72,6 +72,7 @@ struct urlpos {
   unsigned int link_noquote_html_p :1; /* from HTML, but doesn't need " */
   unsigned int link_expect_html :1; /* expected to contain HTML */
   unsigned int link_expect_css  :1; /* expected to contain CSS */
+  unsigned int link_redirect_p  :1; /* the url comes from a redirection */
 
   unsigned int link_refresh_p   :1; /* link was received from
                                        <meta http-equiv=refresh content=...> */
diff --git a/src/recur.c b/src/recur.c
index 1469e31..7bbcd44 100644
--- a/src/recur.c
+++ b/src/recur.c
@@ -651,13 +651,14 @@ download_child (const struct urlpos *upos, struct url *parent, int depth,
 
      If we descended to a different host or changed the scheme, ignore
      opt.no_parent.  Also ignore it for documents needed to display
-     the parent page when in -p mode.  */
+     the parent page when in -p mode or redirections.  */
   if (opt.no_parent
       && schemes_are_similar_p (u->scheme, start_url_parsed->scheme)
       && 0 == strcasecmp (u->host, start_url_parsed->host)
       && (u->scheme != start_url_parsed->scheme
           || u->port == start_url_parsed->port)
-      && !(opt.page_requisites && upos->link_inline_p))
+      && !(opt.page_requisites && upos->link_inline_p)
+      && !upos->link_redirect_p)
     {
       if (!subdir_p (start_url_parsed->dir, u->dir))
         {
@@ -670,21 +671,28 @@ download_child (const struct urlpos *upos, struct url *parent, int depth,
 
   /* 5. If the file does not match the acceptance list, or is on the
      rejection list, chuck it out.  The same goes for the directory
-     exclusion and inclusion lists.  */
-  if (opt.includes || opt.excludes)
-    {
-      if (!accdir (u->dir))
-        {
-          DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
-          reason = WG_RR_LIST;
-          goto out;
-        }
-    }
-  if (!accept_url (url))
+     exclusion and inclusion lists.
+
+     Ignore this test for documents needed to display the parent page
+     when in -p mode or redirections.  */
+  if (!(opt.page_requisites && upos->link_inline_p)
+      && !upos->link_redirect_p)
     {
-      DEBUGP (("%s is excluded/not-included through regex.\n", url));
-      reason = WG_RR_REGEX;
-      goto out;
+      if (opt.includes || opt.excludes)
+	{
+	  if (!accdir (u->dir))
+	    {
+	      DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
+	      reason = WG_RR_LIST;
+	      goto out;
+	    }
+	}
+      if (!accept_url (url))
+	{
+	  DEBUGP (("%s is excluded/not-included through regex.\n", url));
+	  reason = WG_RR_REGEX;
+	  goto out;
+	}
     }
 
   /* 6. Check for acceptance/rejection rules.  We ignore these rules
@@ -800,18 +808,13 @@ descend_redirect (const char *redirected, struct url *orig_parsed, int depth,
 
   upos = xnew0 (struct urlpos);
   upos->url = new_parsed;
+  upos->link_redirect_p = 1;
 
   reason = download_child (upos, orig_parsed, depth,
                               start_url_parsed, blacklist, iri);
 
   if (reason == WG_RR_SUCCESS)
     blacklist_add (blacklist, upos->url->url);
-  else if (reason == WG_RR_LIST || reason == WG_RR_REGEX)
-    {
-      DEBUGP (("Ignoring decision for redirects, decided to load it.\n"));
-      blacklist_add (blacklist, upos->url->url);
-      reason = WG_RR_SUCCESS;
-    }
   else
     DEBUGP (("Redirection \"%s\" failed the test.\n", redirected));

[Bug-wget] Filtering for requisites and redirections

Reply via email to