Hi AJ,

Thanks for implementing this! I tested it, and the functionality appears
correct.
I've added a more detailed review inline, below.

On Wed, Sep 23, 2015 at 11:20:23PM +0200, Ander Juaristi wrote:
> During the last days Gabriel and I have been working on a new feature called
> --convert-file-only. You can find Gabriel's original message to the mailing
> list and some additional replies below. I now want to introduce you to the
> final result :D.
>
> We already had --convert-links, which prepended '../'-es to the links in
> order to keep them valid. However, this assumes that the downloaded files
> will be visited locally, i.e. with the 'file://' scheme.
>
> What this option does is convert the file portion of the URL (sometimes
> called the 'basename') only, leaving the rest of the URL intact.
> Basically, it replaces the basename of the original URL with the basename of
> the expected local file. It is especially useful in combination with
> --adjust-extension, and will probably be used with it in most cases,
> although this is not enforced. The main rationale for this was to be able to
> mirror a site into a web cache, such as Squid [1]. You would just
> `wget -r --convert-file-only --adjust-extension blah` and expect it to work.
>
> So if we execute:
>
>     $ wget --recursive --convert-file-only --adjust-extension ...
>
> all the links to the remote file
>
>     //foo.com/bar.cgi?xyz
>
> that would have been rewritten to
>
>     docroot/foo.com/bar.cgi?xyz.css
>
> will instead appear as
>
>     //foo.com/bar.cgi?xyz.css
>
> This also works for CSS, so this
>
>     background:url(//mirror.ini.cmu.edu/foo.png);
>
> would result in
>
>     background:url(//mirror.ini.cmu.edu/foo.png)
>       (<-- default behavior: Wget converts net paths)
>
> instead of either
>
>     background:url(http://mirror.ini.cmu.edu/foo.png)
>
> or
>
>     background:url(../mirror.ini.cmu.edu/foo.png)
>       (<-- this will clearly not work on a server)
>
> We had doubts about what to call this new option. I initially proposed
> something like '--basename-only', but Gabriel was concerned that although in
> the Unix world the term 'basename' correctly refers to the file portion of a
> URI (just what we want), this might not be the case in the Wget universe.

I ended up reading RFC 3986, "Uniform Resource Identifier (URI): Generic
Syntax", for inspiration, though it was of little help. The term "base" is
used there mostly in the context of "base URL", rather than "basename" as in
the Unix command. "--convert-filename-only" is the only slight improvement I
can come up with (my original idea was "--convert-tpu", for "transparent
proxy URL", but I like AJ's naming idea better).

> ...
>
> Regards,
> - AJ
>
> From af5170772750451d3d67028430f1d21075f6bfb1 Mon Sep 17 00:00:00 2001
> From: Ander Juaristi <[email protected]>
> Date: Tue, 22 Sep 2015 21:10:38 +0200
> Subject: [PATCH] Added --convert-file-only option
>

There probably should be a commit blurb here (maybe start by moving the
description of the change from above into git?).

> ---
>  src/convert.c | 95 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
>  src/convert.h |  2 ++
>  src/init.c    |  2 ++
>  src/main.c    | 24 +++++++++++----
>  src/options.h |  3 ++
>  5 files changed, 118 insertions(+), 8 deletions(-)
>
> diff --git a/src/convert.c b/src/convert.c
> index f0df9a0..35854cd 100644
> --- a/src/convert.c
> +++ b/src/convert.c
> @@ -46,6 +46,7 @@ as that of the covered work.  */
>  #include "html-url.h"
>  #include "css-url.h"
>  #include "iri.h"
> +#include "xstrndup.h"
>
>  static struct hash_table *dl_file_url_map;
>  struct hash_table *dl_url_file_map;
> @@ -136,8 +137,9 @@ convert_links_in_hashtable (struct hash_table *downloaded_set,
>                  form.  We do this even if the URL already is in
>                  relative form, because our directory structure may
>                  not be identical to that on the server (think `-nd',
> -                `--cut-dirs', etc.)  */
> -              cur_url->convert = CO_CONVERT_TO_RELATIVE;
> +                `--cut-dirs', etc.).  If --convert-file-only was passed,
> +                we only convert the basename portion of the URL.  */
> +              cur_url->convert = (opt.convert_file_only ? CO_CONVERT_BASENAME_ONLY : CO_CONVERT_TO_RELATIVE);
>                cur_url->local_name = xstrdup (local_name);
>                DEBUGP (("will convert url %s to local %s\n", u->url, local_name));
>              }
> @@ -206,6 +208,7 @@ static const char *replace_attr_refresh_hack (const char *, int, FILE *,
>                                                const char *, int);
>  static char *local_quote_string (const char *, bool);
>  static char *construct_relative (const char *, const char *);
> +static char *convert_basename (const char *, const struct urlpos *);
>
>  /* Change the links in one file.  LINKS is a list of links in the
>     document, along with their positions and the desired direction of
> @@ -315,9 +318,32 @@ convert_links (const char *file, struct urlpos *links)
>
>              DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
>                       link->url->url, newname, link->pos, file));
> +
> +            xfree (newname);
> +            xfree (quoted_newname);
> +            ++to_file_count;
> +            break;
> +          }
> +        case CO_CONVERT_BASENAME_ONLY:
> +          {
> +            char *newname = convert_basename (p, link);
> +            char *quoted_newname = local_quote_string (newname, link->link_css_p);
> +
> +            if (link->link_css_p)
> +              p = replace_plain (p, link->size, fp, quoted_newname);
> +            else if (!link->link_refresh_p)
> +              p = replace_attr (p, link->size, fp, quoted_newname);
> +            else
> +              p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
> +                                             link->refresh_timeout);
> +
> +            DEBUGP (("Converted file part only: %s to %s at position %d in %s.\n",
> +                     link->url->url, newname, link->pos, file));
> +
>              xfree (newname);
>              xfree (quoted_newname);
>              ++to_file_count;
> +
>              break;
>            }
>          case CO_CONVERT_TO_COMPLETE:
> @@ -336,6 +362,7 @@ convert_links (const char *file, struct urlpos *links)
>
>              DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
>                       newlink, link->pos, file));
> +
>              xfree (quoted_newlink);
>              ++to_url_count;
>              break;
> @@ -422,6 +449,70 @@ construct_relative (const char *basefile, const char *linkfile)
>    return link;
>  }
>
> +/* Construct and return a "transparent proxy" URL
> +   reflecting changes made by --adjust-extension to the file component
> +   (i.e., "basename") of the original URL, but leaving the "dirname"
> +   of the URL (protocol://hostname... portion) untouched.
> +
> +   Think: populating a squid cache via a recursive wget scrape, where
> +   changing URLs to work locally with "file://..." is NOT desirable.
> +
> +   Example:
> +
> +   if
> +        p = "//foo.com/bar.cgi?xyz"
> +   and
> +        link->local_name = "docroot/foo.com/bar.cgi?xyz.css"
> +   then
> +
> +        new_construct_func(p, link);
> +   will return
> +        "//foo.com/bar.cgi?xyz.css"
> +
> +   Essentially, we do s/$(basename orig_url)/$(basename link->local_name)/
> +*/
> +static char *
> +convert_basename (const char *p, const struct urlpos *link)
> +{
> +  int len = link->size;
> +  char *url = NULL;
> +  char *org_basename = NULL, *local_basename = NULL;
> +  char *result = url;

None of the string variables above really need to be initialized, since
you're going to assign them unconditionally below regardless.

> +
> +  if (*p == '"' || *p == '\'')
> +    {
> +      len -= 2;
> +      p++;
> +    }
> +
> +  url = xstrndup (p, len);
> +
> +  org_basename = strrchr (url, '/');
> +  if (org_basename)
> +    org_basename++;
> +  else
> +    org_basename = url;
> +
> +  local_basename = strrchr (link->local_name, '/');
> +  if (local_basename)
> +    local_basename++;
> +  else
> +    local_basename = url;
> +
> +  /*
> +   * If the basenames differ, graft the adjusted basename (local_basename)
> +   * onto the original URL.
> +   */
> +  if (strcmp (org_basename, local_basename))
> +    result = uri_merge (url, local_basename);
> +  else
> +    result = xstrdup (url);

Consider inverting the test. If basenames are *equal*, return 'url'
immediately, and save an unnecessary xstrdup() + xfree(). Otherwise, call
uri_merge() and xfree(url) before returning the result. (There's a rough
sketch of both of these cleanups in the P.S. at the end of this mail.)

> +
> +  xfree (url);
> +
> +  return result;
> +}
> +
>  /* Used by write_backup_file to remember which files have been
>     written. */
>  static struct hash_table *converted_files;
> diff --git a/src/convert.h b/src/convert.h
> index 3105db1..b3cd196 100644
> --- a/src/convert.h
> +++ b/src/convert.h
> @@ -40,6 +40,8 @@ enum convert_options {
>    CO_NOCONVERT = 0,             /* don't convert this URL */
>    CO_CONVERT_TO_RELATIVE,       /* convert to relative, e.g. to
>                                     "../../otherdir/foo.gif" */
> +  CO_CONVERT_BASENAME_ONLY,     /* convert the file portion only (basename)
> +                                   leaving the rest of the URL unchanged */
>    CO_CONVERT_TO_COMPLETE,       /* convert to absolute, e.g. to
>                                     "http://orighost/somedir/bar.jpg". */
>    CO_NULLIFY_BASE               /* change to empty string.  */
> diff --git a/src/init.c b/src/init.c
> index dd1506c..67c94b9 100644
> --- a/src/init.c
> +++ b/src/init.c
> @@ -159,6 +159,7 @@ static const struct {
>    { "contentdisposition", &opt.content_disposition, cmd_boolean },
>    { "contentonerror",   &opt.content_on_error, cmd_boolean },
>    { "continue",         &opt.always_rest, cmd_boolean },
> +  { "convertfileonly",  &opt.convert_file_only, cmd_boolean },
>    { "convertlinks",     &opt.convert_links, cmd_boolean },
>    { "cookies",          &opt.cookies, cmd_boolean },
>  #ifdef HAVE_SSL
> @@ -377,6 +378,7 @@ defaults (void)
>    opt.htmlify = true;
>    opt.http_keep_alive = true;
>    opt.use_proxy = true;
> +  opt.convert_file_only = false;
>    tmp = getenv ("no_proxy");
>    if (tmp)
>      opt.no_proxy = sepstring (tmp);
> diff --git a/src/main.c b/src/main.c
> index 92d1bee..cfdc361 100644
> --- a/src/main.c
> +++ b/src/main.c
> @@ -264,6 +264,7 @@ static struct cmdline_option option_data[] =
>      { "config", 0, OPT_VALUE, "chooseconfig", -1 },
>      { "connect-timeout", 0, OPT_VALUE, "connecttimeout", -1 },
>      { "continue", 'c', OPT_BOOLEAN, "continue", -1 },
> +    { "convert-file-only", 0, OPT_BOOLEAN, "convertfileonly", -1 },
>      { "convert-links", 'k', OPT_BOOLEAN, "convertlinks", -1 },
>      { "content-disposition", 0, OPT_BOOLEAN, "contentdisposition", -1 },
>      { "content-on-error", 0, OPT_BOOLEAN, "contentonerror", -1 },
> @@ -877,6 +878,8 @@ Recursive download:\n"),
>    -k,  --convert-links             make links in downloaded HTML or CSS point to\n\
>                                     local files\n"),
>    N_("\
> +       --convert-file-only         convert the file part of the URLs only (usually known as the basename)\n"),
> +  N_("\
>         --backups=N                 before writing file X, rotate up to N backup files\n"),
>
>  #ifdef __VMS
> @@ -1387,11 +1390,14 @@ main (int argc, char **argv)
>    /* All user options have now been processed, so it's now safe to do
>       interoption dependency checks. */
>
> -  if (opt.noclobber && opt.convert_links)
> +  if (opt.noclobber && (opt.convert_links || opt.convert_file_only))
>      {
>        fprintf (stderr,
> -               _("Both --no-clobber and --convert-links were specified,"
> -                 " only --convert-links will be used.\n"));
> +               opt.convert_links ?
> +               _("Both --no-clobber and --convert-links were specified,"
> +                 " only --convert-links will be used.\n") :
> +               _("Both --no-clobber and --convert-file-only were specified,"
> +                 " only --convert-file-only will be used.\n"));
>        opt.noclobber = false;
>      }
>
> @@ -1445,11 +1451,11 @@ Can't timestamp and not clobber old files at the same time.\n"));
>  #endif
>    if (opt.output_document)
>      {
> -      if (opt.convert_links
> +      if ((opt.convert_links || opt.convert_file_only)
>            && (nurl > 1 || opt.page_requisites || opt.recursive))
>          {
>            fputs (_("\
> -Cannot specify both -k and -O if multiple URLs are given, or in combination\n\
> +Cannot specify both -k or --convert-file-only and -O if multiple URLs are given, or in combination\n\
>  with -p or -r.  See the manual for details.\n\n"), stderr);
>            print_usage (1);
>            exit (WGET_EXIT_GENERIC_ERROR);
> @@ -1760,6 +1766,12 @@ for details.\n\n"));
>                       outputting to a regular file.\n"));
>            exit (WGET_EXIT_GENERIC_ERROR);
>          }
> +      if (!output_stream_regular && (opt.convert_links || opt.convert_file_only))
> +        {
> +          fprintf (stderr, _("--convert-links or --convert-file-only can be used together \
> +only if outputting to a regular file.\n"));
> +          exit (WGET_EXIT_GENERIC_ERROR);
> +        }
>      }
>
>  #ifdef __VMS
> @@ -1970,7 +1982,7 @@ outputting to a regular file.\n"));
>        save_hsts ();
>  #endif
>
> -  if (opt.convert_links && !opt.delete_after)
> +  if ((opt.convert_links || opt.convert_file_only) && !opt.delete_after)
>      convert_all_links ();
>
>    cleanup ();
> diff --git a/src/options.h b/src/options.h
> index 0bb7d71..dad08c1 100644
> --- a/src/options.h
> +++ b/src/options.h
> @@ -182,6 +182,9 @@ struct options
>                                     NULL. */
>    bool convert_links;           /* Will the links be converted
>                                     locally? */
> +  bool convert_file_only;       /* Convert only the file portion of the URI
> +                                   (i.e. basename). Leave everything else untouched. */
> +
>    bool remove_listing;          /* Do we remove .listing files
>                                     generated by FTP? */
>    bool htmlify;                 /* Do we HTML-ify the OS-dependent
> --
> 2.1.4
>

Consider also adding a blurb describing the new option to wget.texi, for the
man page.

Last, but not least, I'm not 100% sure what the style guide says for wget,
but consider sticking to a <= 80 character width limit.

Thanks much,
--Gabriel
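P.S. To make the two convert_basename() comments above concrete, here is a
rough, untested sketch of how the function could look with both cleanups
applied -- the initializers dropped and the equal-basenames case returning
early. Everything else is meant to stay exactly as in your patch; treat it
as an illustration, not a drop-in replacement:

  static char *
  convert_basename (const char *p, const struct urlpos *link)
  {
    int len = link->size;
    char *url, *org_basename, *local_basename, *result;

    /* Strip the surrounding quotes, if any (as in the original).  */
    if (*p == '"' || *p == '\'')
      {
        len -= 2;
        p++;
      }

    url = xstrndup (p, len);

    org_basename = strrchr (url, '/');
    if (org_basename)
      org_basename++;
    else
      org_basename = url;

    local_basename = strrchr (link->local_name, '/');
    if (local_basename)
      local_basename++;
    else
      local_basename = url;

    /* Basenames already match: the URL can be returned as-is, with no
       extra xstrdup()/xfree() round trip.  */
    if (strcmp (org_basename, local_basename) == 0)
      return url;

    /* Otherwise, graft the adjusted basename onto the original URL.  */
    result = uri_merge (url, local_basename);
    xfree (url);
    return result;
  }

No functional change is intended; it only removes the dead stores and the
extra allocation in the common case where --adjust-extension did not rename
the file.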
