Attached (9586 bytes before MIME) please find wget-1.7.diff; this adds 
two options to wget 1.7, namely --only=command and --not=command which 
filter files before fetching. Three escapes are processed in the 
commands: \u (URL), \f (local file) and \r (referer URL) are each replaced 
with the shell-escaped value they stand for.

I chose to implement this using resource-expensive external commands 
rather than internal regexes for the following reasons:

* more flexible (at least on Unix) than regex
* MUCH more flexible than fixed strings
* fits the Unix everything-is-a-component ethos
* leaves wget itself light
* no need to include a regexp for every occasion
* when I have to leave my BASH for very long I get all shakey (-:

The idea was to add the feature with minimal impact on wget itself. 
Anyone who wants performance is more than welcome to add a regexp 
library or interface for themselves.

This patch touches all necessary source files plus configure.in (adds 
two tests for stpcpy() and waitpid() availability). It does not touch 
AUTHORS, ChangeLog or anything else.

It could still use an option to disable escaping of the strings 
substituted into the commands for those cases when you wish to drop the 
strings into quotes or something. I also considered a stdin template but 
that was less portable and I'm sure the Windows people in particular 
wouldn't thank me for it.

The Windows implementation of the exec piece has not been done as I have 
no Windows machines, let alone a Windows development environment. 
Windows would also require fewer shell escapes simply because the shell 
(more of a cuttlebone, really) doesn't do as much. This does not impair 
normal wget operation on Windows and should not impair compilation (but 
this I cannot test).

The reason for me doing this is I wanted to mirror some stuff without 
fetching a gazillion helpful copies of files called ?D=A and the like. A 
command to do this using the patched wget is:

     wget -m --no-parent -nH "--not=echo \u|grep '?'" \
       http://the.site.name/path/to/files/

Another possible use for it is remote progress reporting (good for ISO 
images trickling down a slow link, for example):

     wget -m --no-parent -nH -nd \
       "--only=echo \\u|mail -s \"up to \$(basename \\f)\" [EMAIL PROTECTED]" \
       http://the.site.name/path/to/ISO/images/

Enjoy. Please let me know if it gets rejected so I can post it somewhere 
for others who need this functionality. (-:

Cheers; Leon
diff -cdr wget-1.7-orig/configure.in wget-1.7/configure.in
*** wget-1.7-orig/configure.in  Tue May 29 06:02:47 2001
--- wget-1.7/configure.in       Sun Jul 22 16:16:23 2001
***************
*** 171,180 ****
  dnl
  AC_FUNC_ALLOCA
  AC_FUNC_MMAP
! AC_CHECK_FUNCS(strdup strstr strcasecmp strncasecmp)
  AC_CHECK_FUNCS(gettimeofday mktime strptime)
  AC_CHECK_FUNCS(strerror snprintf vsnprintf select signal symlink access isatty)
  AC_CHECK_FUNCS(uname gethostname)
  
  AC_CHECK_FUNCS(gethostbyname, [], [
    AC_CHECK_LIB(nsl, gethostbyname)
--- 171,181 ----
  dnl
  AC_FUNC_ALLOCA
  AC_FUNC_MMAP
! AC_CHECK_FUNCS(strdup strstr strcasecmp strncasecmp stpcpy)
  AC_CHECK_FUNCS(gettimeofday mktime strptime)
  AC_CHECK_FUNCS(strerror snprintf vsnprintf select signal symlink access isatty)
  AC_CHECK_FUNCS(uname gethostname)
+ AC_CHECK_FUNCS(waitpid)
  
  AC_CHECK_FUNCS(gethostbyname, [], [
    AC_CHECK_LIB(nsl, gethostbyname)
diff -cdr wget-1.7-orig/src/init.c wget-1.7/src/init.c
*** wget-1.7-orig/src/init.c    Mon May 28 03:35:04 2001
--- wget-1.7/src/init.c Sun Jul 22 14:49:21 2001
***************
*** 149,155 ****
--- 149,157 ----
    { "noclobber",      &opt.noclobber,         cmd_boolean },
    { "noparent",               &opt.no_parent,         cmd_boolean },
    { "noproxy",                &opt.no_proxy,          cmd_vector },
+   { "not",            &opt.not_filter,        cmd_string },
    { "numtries",               &opt.ntry,              cmd_number_inf },/* deprecated*/
+   { "only",           &opt.only_filter,       cmd_string },
    { "outputdocument", &opt.output_document,   cmd_file },
    { "pagerequisites", &opt.page_requisites,   cmd_boolean },
    { "passiveftp",     &opt.ftp_pasv,          cmd_lockable_boolean },
diff -cdr wget-1.7-orig/src/main.c wget-1.7/src/main.c
*** wget-1.7-orig/src/main.c    Mon May 28 03:35:05 2001
--- wget-1.7/src/main.c Sun Jul 22 19:22:53 2001
***************
*** 171,176 ****
--- 171,178 ----
         --waitretry=SECONDS      wait 1...SECONDS between retries of a retrieval.\n\
    -Y,  --proxy=on/off           turn proxy on or off.\n\
    -Q,  --quota=NUMBER           set retrieval quota to NUMBER.\n\
+        --only=command         only fetch on cmd success (\\f is filename,)\n\
+        --not=command          fetch on cmd failure (\\u is URL, \\r is referer)\n\
  \n"), stdout);
    fputs (_("\
  Directories:\n\
***************
*** 308,313 ****
--- 310,317 ----
      { "level", required_argument, NULL, 'l' },
      { "load-cookies", required_argument, NULL, 161 },
      { "no", required_argument, NULL, 'n' },
+     { "not", required_argument, NULL, 163 },
+     { "only", required_argument, NULL, 164 },
      { "output-document", required_argument, NULL, 'O' },
      { "output-file", required_argument, NULL, 'o' },
      { "proxy", required_argument, NULL, 'Y' },
***************
*** 528,533 ****
--- 532,543 ----
          break;
        case 162:
          setval ("savecookies", optarg);
+         break;
+       case 163:
+         setval ("not", optarg);
+         break;
+       case 164:
+         setval ("only", optarg);
          break;
        case 157:
          setval ("referer", optarg);
diff -cdr wget-1.7-orig/src/mswindows.c wget-1.7/src/mswindows.c
*** wget-1.7-orig/src/mswindows.c       Mon May 28 03:35:06 2001
--- wget-1.7/src/mswindows.c    Sun Jul 22 15:40:08 2001
***************
*** 112,117 ****
--- 112,129 ----
      FreeConsole ();
  }
  
+ int
+ filter_url (const char *command, struct urlinfo * url)
+ {
+   /* *** IMPLEMENT ME! ***
+    *
+    * Stub: ignores COMMAND and URL and always returns 1.  It really needs
+    * to replicate for Windows the functionality of the Unix version in utils.c
+    * but I don't have any Windows machines... Leon Brooks <[EMAIL PROTECTED]>
+    */
+   return 1;  /* NOTE(review): the retr.c caller treats a non-zero result from the only-filter as FILTERED - confirm 1 is intended */
+ }
+ 
  static BOOL WINAPI
  ws_handler (DWORD dwEvent)
  {
diff -cdr wget-1.7-orig/src/options.h wget-1.7/src/options.h
*** wget-1.7-orig/src/options.h Mon May 28 03:35:08 2001
--- wget-1.7/src/options.h      Sun Jul 22 15:49:09 2001
***************
*** 73,78 ****
--- 73,80 ----
                                   FTP. */
    char *output_document;      /* The output file to which the
                                   documents will be printed.  */
+   char *only_filter;          /* only-fetch-this filter command */
+   char *not_filter;           /* dont-fetch-this filter command */
    int od_known_regular;               /* whether output_document is a
                                     regular file we can manipulate,
                                     i.e. not `-' or a device file. */
diff -cdr wget-1.7-orig/src/retr.c wget-1.7/src/retr.c
*** wget-1.7-orig/src/retr.c    Mon May 28 03:35:09 2001
--- wget-1.7/src/retr.c Sun Jul 22 16:40:28 2001
***************
*** 397,402 ****
--- 397,418 ----
    assert (u->proto != URLFILE);       /* #### Implement me!  */
    mynewloc = NULL;
  
+   if (opt.only_filter != NULL)
+   {
+ printf ("only-filter: [%s]\n", opt.only_filter);
+     if (filter_url (opt.only_filter, u) != 0)
+       return FILTERED;
+ printf ("allowing\n");
+   }
+ 
+   if (opt.not_filter != NULL)
+   {
+ printf ("not-filter: [%s]\n", opt.not_filter);
+     if (filter_url (opt.not_filter, u) <= 0)
+       return FILTERED;
+ printf ("allowing\n");
+   }
+ 
    if (u->proto == URLHTTP
  #ifdef HAVE_SSL
        || u->proto == URLHTTPS
diff -cdr wget-1.7-orig/src/utils.c wget-1.7/src/utils.c
*** wget-1.7-orig/src/utils.c   Mon May 28 03:35:12 2001
--- wget-1.7/src/utils.c        Sun Jul 22 19:20:16 2001
***************
*** 23,33 ****
--- 23,37 ----
  #include <stdio.h>
  #include <stdlib.h>
  #ifdef HAVE_STRING_H
+ #define __USE_GNU 1   /* for stpcpy() */
  # include <string.h>
  #else  /* not HAVE_STRING_H */
  # include <strings.h>
  #endif /* not HAVE_STRING_H */
  #include <sys/types.h>
+ #ifdef HAVE_WAITPID
+ # include <sys/wait.h>
+ #endif
  #ifdef HAVE_UNISTD_H
  # include <unistd.h>
  #endif
***************
*** 55,60 ****
--- 59,65 ----
  #include "utils.h"
  #include "fnmatch.h"
  #include "hash.h"
+ #include "url.h"
  
  #ifndef errno
  extern int errno;
***************
*** 429,435 ****
      }
  }
  
! /* The Windows versions of the following two functions are defined in
     mswindows.c.  */
  
  #ifndef WINDOWS
--- 434,440 ----
      }
  }
  
! /* The Windows versions of the following three functions are defined in
     mswindows.c.  */
  
  #ifndef WINDOWS
***************
*** 461,466 ****
--- 466,592 ----
        exit (0);
      }
    /* child: keep running */
+ }
+ #endif /* not WINDOWS */
+ 
+ #ifndef WINDOWS
+ extern char ** environ;
+ 
+ static char *  /* Copy URL, inserting a backslash before each character found in the metacharacter set below. */
+ shell_escape (const char *url)  /* Returns a buffer obtained from xmalloc, or NULL when URL is NULL. */
+ {
+   const char *p;
+   char *escaped, *p2;
+   int len;
+ 
+   if (url == NULL)
+     return NULL;
+   p = url;
+   len = 1;  /* room for the terminating NUL */
+   while (*p)  /* pass 1: compute the size of the escaped copy */
+   {
+     if (strchr ("&\"'`$*?[;:<>()|\\", *p++) != NULL)  /* NOTE(review): space, newline, !, #, {, ] and ~ are not in this set - confirm that is acceptable */
+       ++len;  /* extra byte for the inserted backslash */
+     ++len;
+   }
+   escaped = xmalloc (len);
+   p2 = escaped;
+   while (*url)  /* pass 2: copy, escaping as we go (same set as pass 1) */
+   {
+     if (strchr ("&\"'`$*?[;:<>()|\\", *url) != NULL)
+       *p2++ = '\\';
+     *p2++ = *url++;
+   }
+   *p2 = '\0';
+   return escaped;    
+ }
+ 
+ int   /* Expand the escapes in COMMAND and run it via /bin/sh -c; -1 == error, other == filter cmd status */
+ filter_url (const char *command, struct urlinfo *ui)  /* UI supplies the url, local and referer strings substituted below */
+ {
+   char *parsed_cmd, *p2;      /* modified command and pointer into it */
+   char *escurl = NULL, *escfile = NULL, *escref = NULL; /* shell-escaped copies, built on first use */
+   const char *p;              /* pointer into unmod cmd */
+   int newlen, status;         /* size of mod cmd, status of cmd */
+   pid_t pid;                  /* cmd's PID after fork() */
+ 
+   newlen = strlen (command);  /* pass 1: compute the length of the expanded command */
+   p = command;
+   while ((p = strchr (p, '\\')) != NULL)
+   {
+     switch (*++p)  /* NOTE(review): a backslash pair followed by u/f/r is counted here but copied literally by the second pass */
+       {
+         case 'u':
+         if (escurl == NULL)
+           escurl = shell_escape (ui->url);  /* NOTE(review): shell_escape returns NULL for NULL input; the strlen below is unguarded */
+         newlen += strlen (escurl) - 2;  /* the two-character escape is replaced by the escaped text */
+         break;
+         case 'f':
+         if (escfile == NULL)
+           escfile = shell_escape (ui->local);  /* NOTE(review): if ui->local is NULL this stays NULL and the strlen below is unguarded */
+         newlen += strlen (escfile) - 2;
+         break;
+         case 'r':
+         if (escref == NULL)
+           escref = shell_escape (ui->referer);
+         newlen += ((escref == NULL) ? 0 : strlen (escref)) - 2;  /* a NULL referer is sized as an empty expansion */
+         break;
+       }
+   }
+   p2 = parsed_cmd = xmalloc (++ newlen);  /* +1 for the NUL; NOTE(review): no free() of parsed_cmd or the esc* strings appears in this function */
+   p = command;
+   while (*p)  /* pass 2: build the expanded command */
+   {
+     if (*p == '\\')
+     {
+       switch (*++p)
+       {
+       case 'u':
+         p2 = stpcpy (p2, escurl);
+         break;
+       case 'f':
+         p2 = stpcpy (p2, escfile);
+         break;
+       case 'r':
+         p2 = stpcpy (p2, escref);  /* NOTE(review): escref can be NULL here (the sizing pass allows it) - unguarded */
+         break;
+       default:  /* unrecognised escape: copy the backslash and the following character unchanged */
+         *p2++ = '\\';
+         *p2++ = *p;  /* NOTE(review): if COMMAND ends in a lone backslash, *p is NUL here and the ++p below steps past it */
+       }
+       ++p;
+     }
+       else
+       *p2++ = *p++;
+   }
+   *p2++ = '\0';
+ fprintf (stderr, "[Command (%d/%d): `%s']\n", newlen, (p2 - parsed_cmd), parsed_cmd);  /* trace output, written unconditionally; NOTE(review): second %d receives a pointer difference */
+   pid = fork ();
+   if (pid == -1)
+     return -1;
+   if (pid == 0)  /* child: hand the expanded command to the shell */
+   {
+     char *localargv[4];
+     localargv[0] = "sh";
+     localargv[1] = "-c";
+     localargv[2] = parsed_cmd;
+     localargv[3] = NULL;
+     execve ("/bin/sh", localargv, environ);
+     exit (127);  /* reached only if execve failed; NOTE(review): exit rather than _exit in a forked child - confirm */
+   }
+   for (;;)  /* parent: wait for the child, retrying while waitpid fails with EINTR */
+   {
+     if (waitpid (pid, &status, 0) == -1)
+     {
+       if (errno != EINTR)
+       return -1;
+     }
+       else
+     {
+ fprintf (stderr, "[Status %d]\n", WEXITSTATUS(status));  /* trace output, written unconditionally */
+       return WEXITSTATUS(status);  /* NOTE(review): WIFEXITED is not checked before WEXITSTATUS */
+     }
+   }
  }
  #endif /* not WINDOWS */
  
diff -cdr wget-1.7-orig/src/wget.h wget-1.7/src/wget.h
*** wget-1.7-orig/src/wget.h    Mon May 28 03:35:15 2001
--- wget-1.7/src/wget.h Sun Jul 22 15:05:43 2001
***************
*** 294,300 ****
    URLBADPATTERN, FILEBADFILE, RANGEERR, RETRBADPATTERN,
    RETNOTSUP, ROBOTSOK, NOROBOTS, PROXERR, AUTHFAILED,
    QUOTEXC, WRITEFAILED,
!   SSLERRCERTFILE,SSLERRCERTKEY,SSLERRCTXCREATE
  } uerr_t;
  
  typedef unsigned char  boolean;
--- 294,301 ----
    URLBADPATTERN, FILEBADFILE, RANGEERR, RETRBADPATTERN,
    RETNOTSUP, ROBOTSOK, NOROBOTS, PROXERR, AUTHFAILED,
    QUOTEXC, WRITEFAILED,
!   SSLERRCERTFILE, SSLERRCERTKEY, SSLERRCTXCREATE,
!   FILTERED
  } uerr_t;
  
  typedef unsigned char  boolean;

Reply via email to