This patch adds an option that allows the user to specify a perl
expression used to modify the target filenames of a call to wget. It
works similarly to perl's "rename" script, in terms of how perl is used
to modify the filename string. That is, the original filename is stored
in the perl variable $_, which the user-supplied code can modify; the
value left in $_ is used instead of the original.
Perl treats $_ as the default variable for regular expressions (among
other operations), so that the user can specify a regular expression
without (having to know) any perl code (other than perl-compatible
regexes), and that will work fine.
I implemented this feature back in August or so, in order to mirror
thepiratebay.org with wget. By default, wget would have put 1M files
into a single directory in order to mirror that site, which (with ext3)
would have destroyed filesystem performance, to say the least.
Since there are many other sites whose visible directory structure is
inappropriate for direct representation in an actual filesystem, I
imagine this patch could be generally useful.
Example usage:
$ wget -x --rename 's?/?%2f?g'
http://www.gnu.org/software/wget/manual/html_node/index.html
--2010-01-15 23:01:23--
http://www.gnu.org/software/wget/manual/html_node/index.html
Resolving www.gnu.org... 199.232.41.10
Connecting to www.gnu.org|199.232.41.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8545 (8.3K) [text/html]
Saving to: "www.gnu.org%2fsoftware%2fwget%2fmanual%2fhtml_node%2findex.html"
100%[===========================================>] 8,545 --.-K/s in
0s
2010-01-15 23:01:23 (134 MB/s) -
"www.gnu.org%2fsoftware%2fwget%2fmanual%2fhtml_node%2findex.html" saved
[8545/8545]
This also works exactly how one would want it to work:
$ wget -q --rename 's?/?%2f?g' -r --no-parent -k
http://www.gnu.org/software/wget/manual/html_node/index.html
I.e., you get the site saved without any of the directory structure, and
all the internal links still work.
It is also possible to create directory structure by adding slashes.
(That is how I dealt with thepiratebay.org).
Regexes are probably the most useful thing to use with this option,
but since arbitrary perl is allowed, quite a lot more could be done.
(An example is generalizing the regex above, to translate some larger
set of characters to %hex codes.) I originally wanted to use PCRE for
this, but (amazingly) it doesn't directly provide any facility for
substitution -- only matching. I couldn't find such a facility in C
library form anywhere on the internet. Rather than (re)implement it, I
just called perl. I thought it was terribly hackish at the time, but
now I like it. It actually adds much less to the binary (when you don't
use it) than the PCRE approach would have.
diff -r 2340fa0d1b78 src/Makefile.am
--- a/src/Makefile.am Wed Jan 13 20:41:15 2010 -0800
+++ b/src/Makefile.am Fri Jan 15 23:52:47 2010 -0500
@@ -44,13 +44,13 @@
ftp-basic.c ftp-ls.c hash.c host.c html-parse.c html-url.c \
http.c init.c log.c main.c netrc.c progress.c ptimer.c \
recur.c res.c retr.c snprintf.c spider.c url.c \
- utils.c exits.c build_info.c $(IRI_OBJ) \
+ utils.c exits.c build_info.c perlfilter.c $(IRI_OBJ) \
css-url.h css-tokens.h connect.h convert.h cookies.h \
ftp.h gen-md5.h hash.h host.h html-parse.h html-url.h \
http.h http-ntlm.h init.h log.h mswindows.h netrc.h \
options.h progress.h ptimer.h recur.h res.h retr.h \
spider.h ssl.h sysdep.h url.h utils.h wget.h iri.h \
- exits.h gettext.h
+ exits.h gettext.h perlfilter.h
nodist_wget_SOURCES = version.c
EXTRA_wget_SOURCES = iri.c
LDADD = $(LIBOBJS) ../lib/libgnu.a @MD5_LDADD@
diff -r 2340fa0d1b78 src/http.c
--- a/src/http.c Wed Jan 13 20:41:15 2010 -0800
+++ b/src/http.c Fri Jan 15 23:52:47 2010 -0500
@@ -69,6 +69,7 @@
#ifdef __VMS
# include "vms.h"
#endif /* def __VMS */
+#include "perlfilter.h"
extern char *version_string;
@@ -2567,6 +2568,19 @@
{
hstat.local_file = url_file_name (u);
got_name = true;
+
+ if (opt.rename_output)
+ {
+ static pipe2_t *filter = 0;
+ if (!filter)
+ {
+ filter = malloc(sizeof(*filter));
+ *filter = init_perl_filter(opt.rename_output);
+ }
+ char *p = hstat.local_file;
+ hstat.local_file = apply_perl_filter(*filter, hstat.local_file);
+ free(p);
+ }
}
/* TODO: Ick! This code is now in both gethttp and http_loop, and is
diff -r 2340fa0d1b78 src/init.c
--- a/src/init.c Wed Jan 13 20:41:15 2010 -0800
+++ b/src/init.c Fri Jan 15 23:52:47 2010 -0500
@@ -236,6 +236,7 @@
{ "relativeonly", &opt.relative_only, cmd_boolean },
{ "remoteencoding", &opt.encoding_remote, cmd_string },
{ "removelisting", &opt.remove_listing, cmd_boolean },
+ { "renameoutput", &opt.rename_output, cmd_string },
{ "restrictfilenames", NULL, cmd_spec_restrict_file_names },
{ "retrsymlinks", &opt.retr_symlinks, cmd_boolean },
{ "retryconnrefused", &opt.retry_connrefused, cmd_boolean },
diff -r 2340fa0d1b78 src/main.c
--- a/src/main.c Wed Jan 13 20:41:15 2010 -0800
+++ b/src/main.c Fri Jan 15 23:52:47 2010 -0500
@@ -253,6 +253,7 @@
{ "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 },
{ "remote-encoding", 0, OPT_VALUE, "remoteencoding", -1 },
{ "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 },
+ { "rename-output", 0, OPT_VALUE, "renameoutput", -1 },
{ "restrict-file-names", 0, OPT_BOOLEAN, "restrictfilenames", -1 },
{ "retr-symlinks", 0, OPT_BOOLEAN, "retrsymlinks", -1 },
{ "retry-connrefused", 0, OPT_BOOLEAN, "retryconnrefused", -1 },
@@ -534,53 +535,55 @@
N_("\
HTTP options:\n"),
N_("\
- --http-user=USER set http user to USER.\n"),
+ --http-user=USER set http user to USER.\n"),
N_("\
- --http-password=PASS set http password to PASS.\n"),
+ --http-password=PASS set http password to PASS.\n"),
N_("\
- --no-cache disallow server-cached data.\n"),
+ --no-cache disallow server-cached data.\n"),
N_ ("\
- --default-page=NAME Change the default page name (normally\n\
- this is `index.html'.).\n"),
+ --default-page=NAME Change the default page name (normally\n\
+ this is `index.html'.).\n"),
N_("\
- -E, --adjust-extension save HTML/CSS documents with proper extensions.\n"),
+ -E, --adjust-extension save HTML/CSS documents with proper extensions.\n"),
N_("\
- --ignore-length ignore `Content-Length' header field.\n"),
+ --ignore-length ignore `Content-Length' header field.\n"),
N_("\
- --header=STRING insert STRING among the headers.\n"),
+ --header=STRING insert STRING among the headers.\n"),
N_("\
- --max-redirect maximum redirections allowed per page.\n"),
+ --max-redirect maximum redirections allowed per page.\n"),
N_("\
- --proxy-user=USER set USER as proxy username.\n"),
+ --proxy-user=USER set USER as proxy username.\n"),
N_("\
- --proxy-password=PASS set PASS as proxy password.\n"),
+ --proxy-password=PASS set PASS as proxy password.\n"),
N_("\
- --referer=URL include `Referer: URL' header in HTTP request.\n"),
+ --referer=URL include `Referer: URL' header in HTTP request.\n"),
N_("\
- --save-headers save the HTTP headers to file.\n"),
+ --save-headers save the HTTP headers to file.\n"),
N_("\
- -U, --user-agent=AGENT identify as AGENT instead of Wget/VERSION.\n"),
+ -U, --user-agent=AGENT identify as AGENT instead of Wget/VERSION.\n"),
N_("\
- --no-http-keep-alive disable HTTP keep-alive (persistent connections).\n"),
+ --no-http-keep-alive disable HTTP keep-alive (persistent connections).\n"),
N_("\
- --no-cookies don't use cookies.\n"),
+ --no-cookies don't use cookies.\n"),
N_("\
- --load-cookies=FILE load cookies from FILE before session.\n"),
+ --load-cookies=FILE load cookies from FILE before session.\n"),
N_("\
- --save-cookies=FILE save cookies to FILE after session.\n"),
+ --save-cookies=FILE save cookies to FILE after session.\n"),
N_("\
- --keep-session-cookies load and save session (non-permanent) cookies.\n"),
+ --keep-session-cookies load and save session (non-permanent) cookies.\n"),
N_("\
- --post-data=STRING use the POST method; send STRING as the data.\n"),
+ --post-data=STRING use the POST method; send STRING as the data.\n"),
N_("\
- --post-file=FILE use the POST method; send contents of FILE.\n"),
+ --post-file=FILE use the POST method; send contents of FILE.\n"),
N_("\
- --content-disposition honor the Content-Disposition header when\n\
- choosing local file names (EXPERIMENTAL).\n"),
+ --content-disposition honor the Content-Disposition header when\n\
+ choosing local file names (EXPERIMENTAL).\n"),
N_("\
- --auth-no-challenge send Basic HTTP authentication information\n\
- without first waiting for the server's\n\
- challenge.\n"),
+ --auth-no-challenge send Basic HTTP authentication information\n\
+ without first waiting for the server's\n\
+ challenge.\n"),
+ N_("\
+ --rename-output=PERL-CODE rename output file(s) with perl.\n"),
"\n",
#ifdef HAVE_SSL
diff -r 2340fa0d1b78 src/options.h
--- a/src/options.h Wed Jan 13 20:41:15 2010 -0800
+++ b/src/options.h Fri Jan 15 23:52:47 2010 -0500
@@ -246,6 +246,8 @@
int ftp_stmlf; /* Force Stream_LF format for binary FTP. */
#endif /* def __VMS */
+ char *rename_output; /* Rename output file(s) using this perl code. */
+
bool useservertimestamps; /* Update downloaded files' timestamps to
match those on server? */
diff -r 2340fa0d1b78 src/perlfilter.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/perlfilter.c Fri Jan 15 23:52:47 2010 -0500
@@ -0,0 +1,103 @@
+#include <sys/wait.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <error.h>
+#include <errno.h>
+#include <unistd.h>
+#include "perlfilter.h"
+
+/* Spawn ARGV[0] (with ARGV[1..] as the child's own argv) connected to
+   the parent by two pipes.  On success *RESULT_IN reads from the
+   child's stdout, *RESULT_OUT writes to the child's stdin, and
+   *RESULT_CHILD (if non-NULL) receives the child's pid.  Returns 0 on
+   success, -1 on error.  */
+static int
+open2(char *argv[], int *result_in, int *result_out, pid_t *result_child)
+{
+  int pipe_out[2]; /* out of parent */
+  int pipe_in[2]; /* into parent */
+  pid_t cpid;
+
+  if (pipe(pipe_out) < 0)
+    return -1;
+  if (pipe(pipe_in) < 0)
+    return -1;
+
+  cpid = fork();
+  if (cpid < 0)
+    return -1;
+
+  if (cpid == 0) { /* child */
+    int *input = pipe_out; /* for sanity */
+    int *output = pipe_in;
+    close(output[0]);
+    close(input[1]);
+
+    /* If a pipe end already sits on fd 0 or 1, move it out of the way
+       first so the dup2 calls below cannot clobber it before it has
+       been duplicated.  dup2(input[0], 0) runs second, so output[1]==0
+       must be relocated; dup2(output[1], 1) runs first, so input[0]==1
+       must be relocated.
+       See http://unixwiz.net/techtips/remap-pipe-fds.c.txt */
+    if (output[1] == 0)
+      if ((output[1] = dup(output[1])) < 0)
+        error(1, errno, "in child process: dup");
+    if (input[0] == 1)
+      if ((input[0] = dup(input[0])) < 0)
+        error(1, errno, "in child process: dup");
+
+    if (dup2(output[1], 1) < 0)
+      error(1, errno, "in child process: dup2");
+    if (dup2(input[0], 0) < 0)
+      error(1, errno, "in child process: dup2");
+
+    if (output[1] != 1)
+      close(output[1]);
+    if (input[0] != 0)
+      close(input[0]);
+
+    execv(argv[0], &argv[1]);
+    error(1, errno, "in child process: exec");
+    return -1; /* not reached; silences missing-return warning */
+
+  } else { /* parent */
+    close(pipe_out[0]);
+    close(pipe_in[1]);
+
+    *result_in = pipe_in[0];
+    *result_out = pipe_out[1];
+    if (result_child)
+      *result_child = cpid;
+    return 0;
+  }
+}
+
+/* Start a perl coprocess that applies the user-supplied expression SRC
+   to each NUL-terminated record read on its stdin, writing the result,
+   again NUL-terminated, to its stdout.  Exits via error() on failure.  */
+pipe2_t init_perl_filter(char *src)
+{
+  pipe2_t res = { 0, 0, 0 }; /* initialize to silence warning */
+  /* With "-0", NUL is perl's record separator; -p hands the user code
+     each record *including* its trailing NUL, so strip that byte first
+     and append a fresh one ("chr 0") after the user code runs.  "$|++"
+     unbuffers perl's output so each reply arrives immediately.  */
+  char *prefix = "$|++;substr $_, -1, 1, '';";
+  char *postfix = ";$_.=chr 0";
+  char *perlcode = malloc(strlen(prefix)+strlen(src)+strlen(postfix)+1);
+  if (!perlcode)
+    error(1, errno, "malloc");
+  strcpy(perlcode, prefix);
+  strcat(perlcode, src);
+  strcat(perlcode, postfix);
+  /* cmd[0] is the path handed to execv; the child's own argv begins
+     at "perl".  */
+  char *cmd[] = {
+    "/usr/bin/perl", "perl", "-0", "-pe", perlcode, 0
+  };
+
+  int in = -1, out = -1; /* initialize to silence warning */
+  if (open2(cmd, &in, &out, &res.pid) < 0)
+    error(1, errno, "open2");
+  res.out = fdopen(out, "w");
+  res.in = fdopen(in, "r");
+  if (!res.out || !res.in)
+    error(1, errno, "fdopen");
+  free(perlcode); /* the child got its own copy at fork time */
+
+  return res;
+}
+
+/* Push the NUL-terminated string S through FILTER and return a freshly
+   malloc'd copy of the rewritten name, or NULL on error.  S itself is
+   never freed here; the caller retains ownership of it.  */
+char *apply_perl_filter(pipe2_t filter, char *s)
+{
+  /* Write the string *including* its terminating NUL: that NUL is the
+     record separator the perl side (started with -0) waits for.  A
+     plain fflush suffices -- fsync on a pipe fd just fails EINVAL.  */
+  fwrite(s, 1, strlen(s)+1, filter.out);
+  fflush(filter.out);
+
+  /* Read the reply up to its terminating NUL.  "%a" is the glibc
+     allocating scanset ("%m" in POSIX.1-2008): on a match, RESULT
+     points into fresh malloc'd memory.  The scanset excludes NUL, so
+     the following %c must yield the NUL terminator; anything else
+     means the reply was truncated.  Note fscanf returns EOF (-1) on
+     failure, so test the count explicitly rather than for truthiness
+     -- otherwise S would be freed without ever having been assigned.  */
+  char *result = 0;
+  char c = 1;
+  int n = fscanf(filter.in, "%a[\001-\377]%c", &result, &c);
+  if (n == 2 && c == 0)
+    return result;
+  if (n >= 1)
+    free(result); /* partial reply -> report error */
+  return 0;
+}
diff -r 2340fa0d1b78 src/perlfilter.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/perlfilter.h Fri Jan 15 23:52:47 2010 -0500
@@ -0,0 +1,10 @@
+#ifndef _PERLFILTER_H
+#define _PERLFILTER_H
+#include <stdio.h>
+typedef struct {
+ FILE *in, *out;
+ pid_t pid;
+} pipe2_t;
+pipe2_t init_perl_filter(char *src);
+char *apply_perl_filter(pipe2_t filter, char *s);
+#endif