Hi Gabriel,

Thanks for your helpful comments.

I've corrected the patch.

For me, the most embarrassing thing was having forgotten the commit description 
:D

On 09/24/2015 04:13 PM, Gabriel L. Somlo wrote:
Hi AJ,

Thanks for implementing this! I tested it, and the functionality
appears correct.

I've added a more detailed review inline, below.
+static char *
+convert_basename (const char *p, const struct urlpos *link)
+{
+  int len = link->size;
+  char *url = NULL;
+  char *org_basename = NULL, *local_basename = NULL;
+  char *result = url;

None of the string variables above really need to be initialized,
since you're going to assign them unconditionally below regardless.


I always initialize local variables by default. For me it's good practice.


Consider inverting the test. If basenames are *equal*, return 'url'
immediately, and save an unnecessary xstrdup() + xfree().

Otherwise, call uri_merge() and xfree(url) before returning the
result.


Done.

Finally, I also updated the documentation at doc/wget.texi.


Thanks much,
--Gabriel



Regards,
- AJ
>From 9b4a835af24ed420f18c6531098d823c98bfa74d Mon Sep 17 00:00:00 2001
From: Ander Juaristi <[email protected]>
Date: Tue, 22 Sep 2015 21:10:38 +0200
Subject: [PATCH] Added --convert-file-only option

 * src/convert.c (convert_links_in_hashtable, convert_links):
   test for CO_CONVERT_BASENAME_ONLY.
   (convert_basename): new function.
 * src/convert.h: new constant CO_CONVERT_BASENAME_ONLY.
 * src/init.c, src/main.c, src/options.h: new option "--convert-file-only".
 * doc/wget.texi: updated documentation.

 Reported-By: Gabriel L. Somlo <[email protected]>

---
 doc/wget.texi | 17 +++++++++++
 src/convert.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 src/convert.h |  2 ++
 src/init.c    |  2 ++
 src/main.c    | 24 +++++++++++----
 src/options.h |  3 ++
 6 files changed, 136 insertions(+), 8 deletions(-)

diff --git a/doc/wget.texi b/doc/wget.texi
index f1244aa..0a139e3 100644
--- a/doc/wget.texi
+++ b/doc/wget.texi
@@ -2123,6 +2123,23 @@ Note that only at the end of the download can Wget know which links have
 been downloaded.  Because of that, the work done by @samp{-k} will be
 performed at the end of all the downloads.
 
+@item --convert-file-only
+This option converts only the filename part of the URLs, leaving the rest
+of the URLs untouched. This filename part is sometimes referred to as the
+"basename", although we avoid that term here in order not to cause confusion.
+
+It works particularly well in conjunction with @samp{--convert-links}, although
+this coupling is not enforced. It proves useful to populate Internet caches
+with files downloaded from different hosts.
+
+Example: if some link points to @file{//foo.com/bar.cgi?xyz} with
+@samp{--convert-links} asserted and its local destination is intended to be
+@file{./foo.com/bar.cgi?xyz.css}, then the link would be converted to
+@file{//foo.com/bar.cgi?xyz.css}. Note that only the filename part has been
+modified. The rest of the URL has been left untouched, including the net path
+(@code{//}) which would otherwise be processed by Wget and converted to the
+effective scheme (ie. @code{http://}).
+
 @cindex backing up converted files
 @item -K
 @itemx --backup-converted
diff --git a/src/convert.c b/src/convert.c
index f0df9a0..8e9aa60 100644
--- a/src/convert.c
+++ b/src/convert.c
@@ -46,6 +46,7 @@ as that of the covered work.  */
 #include "html-url.h"
 #include "css-url.h"
 #include "iri.h"
+#include "xstrndup.h"
 
 static struct hash_table *dl_file_url_map;
 struct hash_table *dl_url_file_map;
@@ -136,8 +137,9 @@ convert_links_in_hashtable (struct hash_table *downloaded_set,
                  form.  We do this even if the URL already is in
                  relative form, because our directory structure may
                  not be identical to that on the server (think `-nd',
-                 `--cut-dirs', etc.)  */
-              cur_url->convert = CO_CONVERT_TO_RELATIVE;
+                 `--cut-dirs', etc.). If --convert-file-only was passed,
+                 we only convert the basename portion of the URL.  */
+              cur_url->convert = (opt.convert_file_only ? CO_CONVERT_BASENAME_ONLY : CO_CONVERT_TO_RELATIVE);
               cur_url->local_name = xstrdup (local_name);
               DEBUGP (("will convert url %s to local %s\n", u->url, local_name));
             }
@@ -206,6 +208,7 @@ static const char *replace_attr_refresh_hack (const char *, int, FILE *,
                                               const char *, int);
 static char *local_quote_string (const char *, bool);
 static char *construct_relative (const char *, const char *);
+static char *convert_basename (const char *, const struct urlpos *);
 
 /* Change the links in one file.  LINKS is a list of links in the
    document, along with their positions and the desired direction of
@@ -315,11 +318,34 @@ convert_links (const char *file, struct urlpos *links)
 
             DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
                      link->url->url, newname, link->pos, file));
+
             xfree (newname);
             xfree (quoted_newname);
             ++to_file_count;
             break;
           }
+        case CO_CONVERT_BASENAME_ONLY:
+          {
+            char *newname = convert_basename (p, link);
+            char *quoted_newname = local_quote_string (newname, link->link_css_p);
+
+            if (link->link_css_p)
+              p = replace_plain (p, link->size, fp, quoted_newname);
+            else if (!link->link_refresh_p)
+              p = replace_attr (p, link->size, fp, quoted_newname);
+            else
+              p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
+                                             link->refresh_timeout);
+
+            DEBUGP (("Converted file part only: %s to %s at position %d in %s.\n",
+                     link->url->url, newname, link->pos, file));
+
+            xfree (newname);
+            xfree (quoted_newname);
+            ++to_file_count;
+
+            break;
+          }
         case CO_CONVERT_TO_COMPLETE:
           /* Convert the link to absolute URL. */
           {
@@ -336,6 +362,7 @@ convert_links (const char *file, struct urlpos *links)
 
             DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
                      newlink, link->pos, file));
+
             xfree (quoted_newlink);
             ++to_url_count;
             break;
@@ -422,6 +449,71 @@ construct_relative (const char *basefile, const char *linkfile)
   return link;
 }
 
+/* Construct and return a "transparent proxy" URL
+   reflecting changes made by --adjust-extension to the file component
+   (i.e., "basename") of the original URL, but leaving the "dirname"
+   of the URL (protocol://hostname... portion) untouched.
+
+   Think: populating a squid cache via a recursive wget scrape, where
+   changing URLs to work locally with "file://..." is NOT desirable.
+
+   Example:
+
+   if
+                     p = "//foo.com/bar.cgi?xyz"
+   and
+      link->local_name = "docroot/foo.com/bar.cgi?xyz.css"
+   then
+
+      new_construct_func(p, link);
+   will return
+      "//foo.com/bar.cgi?xyz.css"
+
+   Essentially, we do s/$(basename orig_url)/$(basename link->local_name)/
+*/
+static char *
+convert_basename (const char *p, const struct urlpos *link)
+{
+  int len = link->size;
+  char *url = NULL;
+  char *org_basename = NULL, *local_basename = NULL;
+  char *result = NULL;
+
+  if (*p == '"' || *p == '\'')
+    {
+      len -= 2;
+      p++;
+    }
+
+  url = xstrndup (p, len);
+
+  org_basename = strrchr (url, '/');
+  if (org_basename)
+    org_basename++;
+  else
+    org_basename = url;
+
+  local_basename = strrchr (link->local_name, '/');
+  if (local_basename)
+    local_basename++;
+  else
+    local_basename = url;
+
+  /*
+   * If the basenames differ, graft the adjusted basename (local_basename)
+   * onto the original URL.
+   */
+  if (strcmp (org_basename, local_basename) == 0)
+    result = url;
+  else
+    {
+      result = uri_merge (url, local_basename);
+      xfree (url);
+    }
+
+  return result;
+}
+
 /* Used by write_backup_file to remember which files have been
    written. */
 static struct hash_table *converted_files;
diff --git a/src/convert.h b/src/convert.h
index 3105db1..b3cd196 100644
--- a/src/convert.h
+++ b/src/convert.h
@@ -40,6 +40,8 @@ enum convert_options {
   CO_NOCONVERT = 0,             /* don't convert this URL */
   CO_CONVERT_TO_RELATIVE,       /* convert to relative, e.g. to
                                    "../../otherdir/foo.gif" */
+  CO_CONVERT_BASENAME_ONLY,     /* convert the file portion only (basename)
+                                   leaving the rest of the URL unchanged */
   CO_CONVERT_TO_COMPLETE,       /* convert to absolute, e.g. to
                                    "http://orighost/somedir/bar.jpg";. */
   CO_NULLIFY_BASE               /* change to empty string. */
diff --git a/src/init.c b/src/init.c
index dd1506c..67c94b9 100644
--- a/src/init.c
+++ b/src/init.c
@@ -159,6 +159,7 @@ static const struct {
   { "contentdisposition", &opt.content_disposition, cmd_boolean },
   { "contentonerror",   &opt.content_on_error,  cmd_boolean },
   { "continue",         &opt.always_rest,       cmd_boolean },
+  { "convertfileonly",  &opt.convert_file_only, cmd_boolean },
   { "convertlinks",     &opt.convert_links,     cmd_boolean },
   { "cookies",          &opt.cookies,           cmd_boolean },
 #ifdef HAVE_SSL
@@ -377,6 +378,7 @@ defaults (void)
   opt.htmlify = true;
   opt.http_keep_alive = true;
   opt.use_proxy = true;
+  opt.convert_file_only = false;
   tmp = getenv ("no_proxy");
   if (tmp)
     opt.no_proxy = sepstring (tmp);
diff --git a/src/main.c b/src/main.c
index 92d1bee..cfdc361 100644
--- a/src/main.c
+++ b/src/main.c
@@ -264,6 +264,7 @@ static struct cmdline_option option_data[] =
     { "config", 0, OPT_VALUE, "chooseconfig", -1 },
     { "connect-timeout", 0, OPT_VALUE, "connecttimeout", -1 },
     { "continue", 'c', OPT_BOOLEAN, "continue", -1 },
+    { "convert-file-only", 0, OPT_BOOLEAN, "convertfileonly", -1 },
     { "convert-links", 'k', OPT_BOOLEAN, "convertlinks", -1 },
     { "content-disposition", 0, OPT_BOOLEAN, "contentdisposition", -1 },
     { "content-on-error", 0, OPT_BOOLEAN, "contentonerror", -1 },
@@ -877,6 +878,8 @@ Recursive download:\n"),
   -k,  --convert-links             make links in downloaded HTML or CSS point to\n\
                                      local files\n"),
     N_("\
+       --convert-file-only         convert the file part of the URLs only (usually known as the basename)\n"),
+    N_("\
        --backups=N                 before writing file X, rotate up to N backup files\n"),
 
 #ifdef __VMS
@@ -1387,11 +1390,14 @@ main (int argc, char **argv)
   /* All user options have now been processed, so it's now safe to do
      interoption dependency checks. */
 
-  if (opt.noclobber && opt.convert_links)
+  if (opt.noclobber && (opt.convert_links || opt.convert_file_only))
     {
       fprintf (stderr,
-               _("Both --no-clobber and --convert-links were specified,"
-                 " only --convert-links will be used.\n"));
+               opt.convert_links ?
+                   _("Both --no-clobber and --convert-links were specified,"
+                     " only --convert-links will be used.\n") :
+                    _("Both --no-clobber and --convert-file-only were specified,"
+                      " only --convert-file-only will be used.\n"));
       opt.noclobber = false;
     }
 
@@ -1445,11 +1451,11 @@ Can't timestamp and not clobber old files at the same time.\n"));
 #endif
   if (opt.output_document)
     {
-      if (opt.convert_links
+      if ((opt.convert_links || opt.convert_file_only)
           && (nurl > 1 || opt.page_requisites || opt.recursive))
         {
           fputs (_("\
-Cannot specify both -k and -O if multiple URLs are given, or in combination\n\
+Cannot specify both -k or --convert-file-only and -O if multiple URLs are given, or in combination\n\
 with -p or -r. See the manual for details.\n\n"), stderr);
           print_usage (1);
           exit (WGET_EXIT_GENERIC_ERROR);
@@ -1760,6 +1766,12 @@ for details.\n\n"));
 outputting to a regular file.\n"));
           exit (WGET_EXIT_GENERIC_ERROR);
         }
+      if (!output_stream_regular && (opt.convert_links || opt.convert_file_only))
+        {
+          fprintf (stderr, _("--convert-links or --convert-file-only can be used together \
+only if outputting to a regular file.\n"));
+          exit (WGET_EXIT_GENERIC_ERROR);
+        }
     }
 
 #ifdef __VMS
@@ -1970,7 +1982,7 @@ outputting to a regular file.\n"));
     save_hsts ();
 #endif
 
-  if (opt.convert_links && !opt.delete_after)
+  if ((opt.convert_links || opt.convert_file_only) && !opt.delete_after)
     convert_all_links ();
 
   cleanup ();
diff --git a/src/options.h b/src/options.h
index 0bb7d71..dad08c1 100644
--- a/src/options.h
+++ b/src/options.h
@@ -182,6 +182,9 @@ struct options
                                    NULL. */
   bool convert_links;           /* Will the links be converted
                                    locally? */
+  bool convert_file_only;       /* Convert only the file portion of the URI (i.e. basename).
+                                   Leave everything else untouched. */
+
   bool remove_listing;          /* Do we remove .listing files
                                    generated by FTP? */
   bool htmlify;                 /* Do we HTML-ify the OS-dependent
-- 
1.9.1

Reply via email to