This patch adds an option that allows the user to specify a perl
expression used to modify the target filenames of a call to wget.  It
works similarly to perl's "rename" script, in terms of how perl is used
to modify the filename string.  That is, the original filename is stored
in the perl variable $_, which the user-supplied code can modify; the
value left in $_ is used instead of the original.

Perl treats $_ as the default variable for regular-expression operations
(among others), so a user who knows perl-compatible regexes but no other
perl can simply pass a substitution such as 's/foo/bar/' and it will work.

I implemented this feature back in August or so, in order to mirror
thepiratebay.org with wget.  By default, wget would have put 1M files
into a single directory in order to mirror that site, which (with ext3)
would have destroyed filesystem performance, to say the least.

Since there are many other sites whose visible directory structure is
inappropriate for direct representation in an actual filesystem, I
imagine this patch could be generally useful.

Example usage:


  $ wget -x --rename 's?/?%2f?g' http://www.gnu.org/software/wget/manual/html_node/index.html

  --2010-01-15 23:01:23--  http://www.gnu.org/software/wget/manual/html_node/index.html
  Resolving www.gnu.org... 199.232.41.10
  Connecting to www.gnu.org|199.232.41.10|:80... connected.
  HTTP request sent, awaiting response... 200 OK
  Length: 8545 (8.3K) [text/html]
  Saving to: "www.gnu.org%2fsoftware%2fwget%2fmanual%2fhtml_node%2findex.html"

  100%[===========================================>] 8,545       --.-K/s   in 0s

  2010-01-15 23:01:23 (134 MB/s) - "www.gnu.org%2fsoftware%2fwget%2fmanual%2fhtml_node%2findex.html" saved [8545/8545]


This also works exactly how one would want it to work:


  $ wget -q --rename 's?/?%2f?g' -r --no-parent -k http://www.gnu.org/software/wget/manual/html_node/index.html


I.e., you get the site saved without any of the directory structure, and
all the internal links still work.

It is also possible to create directory structure by adding slashes.
(That is how I dealt with thepiratebay.org).

Regexes are probably the most useful thing to use with this option,
but since arbitrary perl is allowed, quite a lot more could be done.
(An example is generalizing the regex above, to translate some larger
set of characters to %hex codes.)  I originally wanted to use PCRE for
this, but (amazingly) it doesn't directly provide any facility for
substitution -- only matching.  I couldn't find such a facility in C
library form anywhere on the internet.  Rather than (re)implement it, I
just called perl.  I thought it was terribly hackish at the time, but
now I like it.  It actually adds much less to the binary (when you don't
use it) than the PCRE approach would have.
diff -r 2340fa0d1b78 src/Makefile.am
--- a/src/Makefile.am	Wed Jan 13 20:41:15 2010 -0800
+++ b/src/Makefile.am	Fri Jan 15 23:52:47 2010 -0500
@@ -44,13 +44,13 @@
 	       ftp-basic.c ftp-ls.c hash.c host.c html-parse.c html-url.c \
 	       http.c init.c log.c main.c netrc.c progress.c ptimer.c     \
 	       recur.c res.c retr.c snprintf.c spider.c url.c	          \
-	       utils.c exits.c build_info.c $(IRI_OBJ)			  \
+	       utils.c exits.c build_info.c perlfilter.c $(IRI_OBJ)	  \
 	       css-url.h css-tokens.h connect.h convert.h cookies.h       \
 	       ftp.h gen-md5.h hash.h host.h html-parse.h html-url.h      \
 	       http.h http-ntlm.h init.h log.h mswindows.h netrc.h        \
 	       options.h progress.h ptimer.h recur.h res.h retr.h         \
 	       spider.h ssl.h sysdep.h url.h utils.h wget.h iri.h 	  \
-	       exits.h gettext.h
+	       exits.h gettext.h perlfilter.h
 nodist_wget_SOURCES = version.c
 EXTRA_wget_SOURCES = iri.c
 LDADD = $(LIBOBJS) ../lib/libgnu.a @MD5_LDADD@
diff -r 2340fa0d1b78 src/http.c
--- a/src/http.c	Wed Jan 13 20:41:15 2010 -0800
+++ b/src/http.c	Fri Jan 15 23:52:47 2010 -0500
@@ -69,6 +69,7 @@
 #ifdef __VMS
 # include "vms.h"
 #endif /* def __VMS */
+#include "perlfilter.h"
 
 extern char *version_string;
 
@@ -2567,6 +2568,19 @@
     {
       hstat.local_file = url_file_name (u);
       got_name = true;
+
+      if (opt.rename_output)
+	{
+	  static pipe2_t *filter = 0;
+	  if (!filter)
+	    {
+	      filter = malloc(sizeof(*filter));
+	      *filter = init_perl_filter(opt.rename_output);
+	    }
+	  char *p = hstat.local_file;
+	  hstat.local_file = apply_perl_filter(*filter, hstat.local_file);
+	  free(p);
+	}
     }
 
   /* TODO: Ick! This code is now in both gethttp and http_loop, and is
diff -r 2340fa0d1b78 src/init.c
--- a/src/init.c	Wed Jan 13 20:41:15 2010 -0800
+++ b/src/init.c	Fri Jan 15 23:52:47 2010 -0500
@@ -236,6 +236,7 @@
   { "relativeonly",     &opt.relative_only,     cmd_boolean },
   { "remoteencoding",   &opt.encoding_remote,   cmd_string },
   { "removelisting",    &opt.remove_listing,    cmd_boolean },
+  { "renameoutput",     &opt.rename_output,     cmd_string },
   { "restrictfilenames", NULL,                  cmd_spec_restrict_file_names },
   { "retrsymlinks",     &opt.retr_symlinks,     cmd_boolean },
   { "retryconnrefused", &opt.retry_connrefused, cmd_boolean },
diff -r 2340fa0d1b78 src/main.c
--- a/src/main.c	Wed Jan 13 20:41:15 2010 -0800
+++ b/src/main.c	Fri Jan 15 23:52:47 2010 -0500
@@ -253,6 +253,7 @@
     { "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 },
     { "remote-encoding", 0, OPT_VALUE, "remoteencoding", -1 },
     { "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 },
+    { "rename-output", 0, OPT_VALUE, "renameoutput", -1 },
     { "restrict-file-names", 0, OPT_BOOLEAN, "restrictfilenames", -1 },
     { "retr-symlinks", 0, OPT_BOOLEAN, "retrsymlinks", -1 },
     { "retry-connrefused", 0, OPT_BOOLEAN, "retryconnrefused", -1 },
@@ -534,53 +535,55 @@
     N_("\
 HTTP options:\n"),
     N_("\
-       --http-user=USER        set http user to USER.\n"),
+       --http-user=USER		  set http user to USER.\n"),
     N_("\
-       --http-password=PASS    set http password to PASS.\n"),
+       --http-password=PASS	  set http password to PASS.\n"),
     N_("\
-       --no-cache              disallow server-cached data.\n"),
+       --no-cache		  disallow server-cached data.\n"),
     N_ ("\
-       --default-page=NAME     Change the default page name (normally\n\
-                               this is `index.html'.).\n"),
+       --default-page=NAME	  Change the default page name (normally\n\
+				  this is `index.html'.).\n"),
     N_("\
-  -E,  --adjust-extension      save HTML/CSS documents with proper extensions.\n"),
+  -E,  --adjust-extension	  save HTML/CSS documents with proper extensions.\n"),
     N_("\
-       --ignore-length         ignore `Content-Length' header field.\n"),
+       --ignore-length		  ignore `Content-Length' header field.\n"),
     N_("\
-       --header=STRING         insert STRING among the headers.\n"),
+       --header=STRING		  insert STRING among the headers.\n"),
     N_("\
-       --max-redirect          maximum redirections allowed per page.\n"),
+       --max-redirect		  maximum redirections allowed per page.\n"),
     N_("\
-       --proxy-user=USER       set USER as proxy username.\n"),
+       --proxy-user=USER	  set USER as proxy username.\n"),
     N_("\
-       --proxy-password=PASS   set PASS as proxy password.\n"),
+       --proxy-password=PASS	  set PASS as proxy password.\n"),
     N_("\
-       --referer=URL           include `Referer: URL' header in HTTP request.\n"),
+       --referer=URL		  include `Referer: URL' header in HTTP request.\n"),
     N_("\
-       --save-headers          save the HTTP headers to file.\n"),
+       --save-headers		  save the HTTP headers to file.\n"),
     N_("\
-  -U,  --user-agent=AGENT      identify as AGENT instead of Wget/VERSION.\n"),
+  -U,  --user-agent=AGENT	  identify as AGENT instead of Wget/VERSION.\n"),
     N_("\
-       --no-http-keep-alive    disable HTTP keep-alive (persistent connections).\n"),
+       --no-http-keep-alive	  disable HTTP keep-alive (persistent connections).\n"),
     N_("\
-       --no-cookies            don't use cookies.\n"),
+       --no-cookies		  don't use cookies.\n"),
     N_("\
-       --load-cookies=FILE     load cookies from FILE before session.\n"),
+       --load-cookies=FILE	  load cookies from FILE before session.\n"),
     N_("\
-       --save-cookies=FILE     save cookies to FILE after session.\n"),
+       --save-cookies=FILE	  save cookies to FILE after session.\n"),
     N_("\
-       --keep-session-cookies  load and save session (non-permanent) cookies.\n"),
+       --keep-session-cookies	  load and save session (non-permanent) cookies.\n"),
     N_("\
-       --post-data=STRING      use the POST method; send STRING as the data.\n"),
+       --post-data=STRING	  use the POST method; send STRING as the data.\n"),
     N_("\
-       --post-file=FILE        use the POST method; send contents of FILE.\n"),
+       --post-file=FILE		  use the POST method; send contents of FILE.\n"),
     N_("\
-       --content-disposition   honor the Content-Disposition header when\n\
-                               choosing local file names (EXPERIMENTAL).\n"),
+       --content-disposition	  honor the Content-Disposition header when\n\
+				  choosing local file names (EXPERIMENTAL).\n"),
     N_("\
-       --auth-no-challenge     send Basic HTTP authentication information\n\
-                               without first waiting for the server's\n\
-                               challenge.\n"),
+       --auth-no-challenge	  send Basic HTTP authentication information\n\
+				  without first waiting for the server's\n\
+				  challenge.\n"),
+    N_("\
+       --rename-output=PERL-CODE  rename output file(s) with perl.\n"),
     "\n",
 
 #ifdef HAVE_SSL
diff -r 2340fa0d1b78 src/options.h
--- a/src/options.h	Wed Jan 13 20:41:15 2010 -0800
+++ b/src/options.h	Fri Jan 15 23:52:47 2010 -0500
@@ -246,6 +246,8 @@
   int ftp_stmlf;                /* Force Stream_LF format for binary FTP. */
 #endif /* def __VMS */
 
+  char *rename_output;          /* Rename output file(s) using this perl code. */
+
   bool useservertimestamps;  	/* Update downloaded files' timestamps to
 				   match those on server? */
 
diff -r 2340fa0d1b78 src/perlfilter.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/perlfilter.c	Fri Jan 15 23:52:47 2010 -0500
@@ -0,0 +1,170 @@
+#ifndef _GNU_SOURCE
+# define _GNU_SOURCE 1 /* for getdelim() and <error.h> */
+#endif
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <error.h>
+#include <errno.h>
+#include "perlfilter.h"
+
+/* Report the failed call WHAT (with errno) from the forked child and
+   leave through _exit(), so that exit handlers registered by the parent
+   do not run a second time in the child.  */
+static void
+child_die(const char *what)
+{
+  error(0, errno, "in child process: %s", what);
+  _exit(1);
+}
+
+/* Close both descriptors of PIPEFD without disturbing errno.  */
+static void
+close_pipe(int pipefd[2])
+{
+  int saved_errno = errno;
+  close(pipefd[0]);
+  close(pipefd[1]);
+  errno = saved_errno;
+}
+
+/* Run the program ARGV[0] with argument vector &ARGV[1] (so ARGV[1] is
+   the child's own argv[0]), its stdin and stdout connected to the
+   caller through two pipes.
+
+   On success return 0, store in *RESULT_IN the descriptor from which the
+   child's stdout can be read and in *RESULT_OUT the descriptor that
+   feeds the child's stdin; the child's pid is stored in *RESULT_CHILD
+   unless RESULT_CHILD is NULL.  If pipe() or fork() fails, return -1
+   with errno from the failing call and with every descriptor closed.
+   Failures inside the child (dup, dup2, exec) terminate only the child.  */
+static int
+open2(char *argv[], int *result_in, int *result_out, pid_t *result_child)
+{
+  int pipe_out[2]; /* out of parent: becomes the child's stdin */
+  int pipe_in[2];  /* into parent: becomes the child's stdout */
+  pid_t cpid;
+
+  if (pipe(pipe_out) < 0)
+    return -1;
+  if (pipe(pipe_in) < 0) {
+    close_pipe(pipe_out);
+    return -1;
+  }
+
+  cpid = fork();
+  if (cpid < 0) {
+    close_pipe(pipe_out);
+    close_pipe(pipe_in);
+    return -1;
+  }
+
+  if (cpid == 0) { /* child */
+    int from_parent = pipe_out[0];
+    int to_parent = pipe_in[1];
+
+    close(pipe_out[1]);
+    close(pipe_in[0]);
+
+    /* If one end already occupies the descriptor the other end has to
+       be moved to, duplicate it elsewhere first; otherwise the dup2
+       calls below would destroy it.
+       See http://unixwiz.net/techtips/remap-pipe-fds.c.txt */
+    if (to_parent == 0 && (to_parent = dup(to_parent)) < 0)
+      child_die("dup");
+    if (from_parent == 1 && (from_parent = dup(from_parent)) < 0)
+      child_die("dup");
+
+    if (dup2(to_parent, 1) < 0)
+      child_die("dup2");
+    if (dup2(from_parent, 0) < 0)
+      child_die("dup2");
+
+    if (to_parent != 1)
+      close(to_parent);
+    if (from_parent != 0)
+      close(from_parent);
+
+    execv(argv[0], &argv[1]);
+    child_die("exec"); /* only reached when execv failed */
+  }
+
+  /* parent: keep only the ends the child does not use */
+  close(pipe_out[0]);
+  close(pipe_in[1]);
+
+  *result_in = pipe_in[0];
+  *result_out = pipe_out[1];
+  if (result_child)
+    *result_child = cpid;
+  return 0;
+}
+
+/* Start a perl child that rewrites file names with the perl code SRC.
+
+   perl is run as "perl -0 -pe CODE": it reads NUL-terminated records
+   and prints $_ after running CODE on each one.  CODE wraps SRC so that
+   perl's output is unbuffered ($|++), the record's trailing NUL is
+   removed before SRC sees $_, and a NUL is appended again afterwards to
+   delimit the reply.
+
+   Return the stdio streams and the pid of the child.  Any failure to
+   set the child up is fatal: the process exits through error().  */
+pipe2_t init_perl_filter(char *src)
+{
+  pipe2_t res = { 0, 0, 0 }; /* initialize to silence warning */
+  const char *prefix = "$|++;substr $_, -1, 1, '';";
+  const char *postfix = ";$_.=chr 0";
+  char *perlcode = malloc(strlen(prefix)+strlen(src)+strlen(postfix)+1);
+  if (!perlcode)
+    error(1, errno, "malloc");
+  strcpy(perlcode, prefix);
+  strcat(perlcode, src);
+  strcat(perlcode, postfix);
+  char *cmd[] = {
+   "/usr/bin/perl", "perl", "-0", "-pe", perlcode, 0
+  };
+
+  int in = -1, out = -1; /* initialize to silence warning */
+  if (open2(cmd, &in, &out, &res.pid) < 0)
+    error(1, errno, "open2");
+  free(perlcode); /* the forked child holds its own copy */
+
+  res.out = fdopen(out, "w");
+  res.in  = fdopen(in, "r");
+  if (!res.out || !res.in)
+    error(1, errno, "fdopen");
+
+  return res;
+}
+
+/* Pass the file name S through the perl child FILTER.
+
+   S is written to the child followed by a NUL, and the reply is read up
+   to the next NUL.  Return the rewritten name in freshly allocated
+   memory that the caller must free(); S itself is neither modified nor
+   freed.  Return NULL if the request could not be written, if the child
+   produced an empty name, or if its output ended before a terminating
+   NUL was seen.  */
+char *apply_perl_filter(pipe2_t filter, char *s)
+{
+  size_t len = strlen(s) + 1; /* the NUL doubles as record separator */
+  char *reply = NULL;
+  size_t cap = 0;
+  ssize_t got;
+
+  if (fwrite(s, 1, len, filter.out) != len || fflush(filter.out) != 0)
+    return NULL;
+
+  /* getdelim() returns the number of bytes read including the delimiter,
+     so a usable reply is at least one name byte plus the final NUL.  */
+  got = getdelim(&reply, &cap, '\0', filter.in);
+  if (got < 2 || reply[got - 1] != '\0') {
+    free(reply);
+    return NULL;
+  }
+  return reply;
+}
diff -r 2340fa0d1b78 src/perlfilter.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/perlfilter.h	Fri Jan 15 23:52:47 2010 -0500
@@ -0,0 +1,18 @@
+#ifndef PERLFILTER_H
+#define PERLFILTER_H
+#include <stdio.h>
+#include <sys/types.h> /* pid_t */
+
+/* Handle to a running filter child process.  */
+typedef struct {
+  FILE *in;   /* stream for reading the child's stdout */
+  FILE *out;  /* stream for writing to the child's stdin */
+  pid_t pid;  /* pid of the child, if recorded */
+} pipe2_t;
+
+/* Start a perl child that applies the perl code SRC to each file name.  */
+pipe2_t init_perl_filter(char *src);
+
+/* Send S through FILTER; return the rewritten name, or NULL on failure.  */
+char *apply_perl_filter(pipe2_t filter, char *s);
+#endif /* PERLFILTER_H */

Reply via email to