Hello,

after reading so much about regex support for wget (especially the lack
of it) and experiencing myself how annoying it can be if you have
downloaded a hundred /thumbs/ directories, I tried to implement regex
support myself.
I used the PCRE library from http://www.pcre.org, which was pretty easy to
use, given the fact that I had never touched a single line of C (or
C++) code before.
Unfortunately I don't know jack about autoconf, makefiles etc.
The patch in its current form is only useful with MSVC as I didn't alter
any other makefiles.
I hope someone can do that for me and include the pcre license from
http://www.pcre.org/license.txt

As you can see, pcre.h and pcre.lib need to be somewhere the compiler can
find them, and HAVE_REGEX needs to be defined.
Files and directories are ignored if they match the regex given on the
command line. For syntax, see wget --help.
The patch was made against current cvs code.
Hope this helps somehow.

Tobias
diff -ruwb wget-regex2/src/ftp.c wget-regex3/src/ftp.c
--- wget-regex2/src/ftp.c       Sat Apr 02 02:41:04 2005
+++ wget-regex3/src/ftp.c       Wed Apr 06 18:55:24 2005
@@ -1749,7 +1749,11 @@
     return res;
   /* First: weed out that do not conform the global rules given in
      opt.accepts and opt.rejects.  */
+#ifdef HAVE_REGEX     
+  if (opt.accepts || opt.rejects || opt.exclregfile)
+#else
   if (opt.accepts || opt.rejects)
+#endif /* HAVE_REGEX */
     {
       f = start;
       while (f)
diff -ruwb wget-regex2/src/init.c wget-regex3/src/init.c
--- wget-regex2/src/init.c      Sun Mar 20 17:07:38 2005
+++ wget-regex3/src/init.c      Wed Apr 06 19:37:13 2005
@@ -137,6 +137,10 @@
 #endif
   { "excludedirectories", &opt.excludes,       cmd_directory_vector },
   { "excludedomains",  &opt.exclude_domains,   cmd_vector },
+#ifdef HAVE_REGEX  
+  { "excluderegexdir", &opt.exclregdir,        cmd_string },
+  { "excluderegexfile", &opt.exclregfile,      cmd_string },
+#endif /* HAVE_REGEX */
   { "followftp",       &opt.follow_ftp,        cmd_boolean },
   { "followtags",      &opt.follow_tags,       cmd_vector },
   { "forcehtml",       &opt.force_html,        cmd_boolean },
@@ -1367,6 +1371,12 @@
   xfree_null (opt.sslcertkey);
   xfree_null (opt.sslcertfile);
 #endif /* HAVE_SSL */
+#ifdef HAVE_REGEX
+  if (opt.exclregdir_c) pcre_free (opt.exclregdir_c);   /* from pcre_compile() */
+  if (opt.exclregfile_c) pcre_free (opt.exclregfile_c); /* from pcre_compile() */
+  xfree_null (opt.exclregdir);
+  xfree_null (opt.exclregfile);
+#endif /* HAVE_REGEX */
   xfree_null (opt.bind_address);
   xfree_null (opt.cookies_input);
   xfree_null (opt.cookies_output);
diff -ruwb wget-regex2/src/main.c wget-regex3/src/main.c
--- wget-regex2/src/main.c      Tue Mar 22 15:20:02 2005
+++ wget-regex3/src/main.c      Wed Apr 06 19:03:56 2005
@@ -68,6 +68,10 @@
 /* On GNU system this will include system-wide getopt.h. */
 #include "getopt.h"
 
+#ifdef HAVE_REGEX
+#include <pcre.h>
+#endif /* HAVE_REGEX */
+
 #ifndef PATH_SEPARATOR
 # define PATH_SEPARATOR '/'
 #endif
@@ -176,6 +180,10 @@
     { "egd-file", 0, OPT_VALUE, "egdfile", -1 },
     { "exclude-directories", 'X', OPT_VALUE, "excludedirectories", -1 },
     { "exclude-domains", 0, OPT_VALUE, "excludedomains", -1 },
+#ifdef HAVE_REGEX
+    { "exclude-regex-dirs", 0, OPT_VALUE, "excluderegexdir", -1 },
+    { "exclude-regex-files", 0, OPT_VALUE, "excluderegexfile", -1 },
+#endif
     { "execute", 'e', OPT__EXECUTE, NULL, required_argument },
     { "follow-ftp", 0, OPT_BOOLEAN, "followftp", -1 },
     { "follow-tags", 0, OPT_VALUE, "followtags", -1 },
@@ -591,6 +599,12 @@
   -D,  --domains=LIST              comma-separated list of accepted domains.\n"),
     N_("\
        --exclude-domains=LIST      comma-separated list of rejected domains.\n"),
+#ifdef HAVE_REGEX      
+    N_("\
+       --exclude-regex-dirs=PATTERN   pattern of directories to reject.\n"),
+       N_("\
+       --exclude-regex-files=PATTERN  pattern of files to reject.\n"),
+#endif /* HAVE_REGEX */
     N_("\
        --follow-ftp                follow FTP links from HTML documents.\n"),
     N_("\
@@ -647,6 +661,7 @@
   int i, ret, longindex;
   int nurl, status;
   int append_to_log = 0;
+  const char *error;  
 
   i18n_initialize ();
 
@@ -819,6 +834,40 @@
       exit (1);
     }
 #endif
+
+#ifdef HAVE_REGEX
+  if (opt.exclregdir)
+    {          
+      opt.exclregdir_c = pcre_compile(
+        opt.exclregdir,       /* the pattern */
+        0,                    /* default options */
+        &error,               /* for error message */
+        &i,                   /* for error offset */
+        NULL);                /* use default character tables */       
+      
+      if (opt.exclregdir_c == NULL)
+        {              
+          printf (_("Directory RegEx compilation failed at offset %d: %s\n"), i, error);
+          exit (1);
+        }
+    }
+    
+    if (opt.exclregfile)
+    {          
+      opt.exclregfile_c = pcre_compile(
+        opt.exclregfile,       /* the pattern */
+        0,                    /* default options */
+        &error,               /* for error message */
+        &i,                   /* for error offset */
+        NULL);                /* use default character tables */       
+      
+      if (opt.exclregfile_c == NULL)
+        {              
+          printf (_("File RegEx compilation failed at offset %d: %s\n"), i, error);
+          exit (1);
+        }
+    }
+#endif /* HAVE_REGEX */
 
   nurl = argc - optind;
   if (!nurl && !opt.input_filename)
diff -ruwb wget-regex2/src/options.h wget-regex3/src/options.h
--- wget-regex2/src/options.h   Sat Mar 19 19:29:24 2005
+++ wget-regex3/src/options.h   Wed Apr 06 18:15:55 2005
@@ -27,6 +27,10 @@
 file, but you are not obligated to do so.  If you do not wish to do
 so, delete this exception statement from your version.  */
 
+#ifdef HAVE_REGEX
+#include <pcre.h>
+#endif /* HAVE_REGEX */
+
 struct options
 {
   int verbose;                 /* Are we verbose? */
@@ -65,6 +69,12 @@
   char **excludes;             /* List of excluded FTP directories. */
   char **includes;             /* List of FTP directories to
                                   follow. */
+#ifdef HAVE_REGEX                                 
+  char *exclregdir;            /* Pattern for regex exclusion */
+  char *exclregfile;           /* Pattern for regex exclusion */
+  pcre *exclregdir_c;
+  pcre *exclregfile_c;
+#endif /* HAVE_REGEX */
 
   char **domains;              /* See host.c */
   char **exclude_domains;
diff -ruwb wget-regex2/src/recur.c wget-regex3/src/recur.c
--- wget-regex2/src/recur.c     Sun Nov 02 21:56:36 2003
+++ wget-regex3/src/recur.c     Wed Apr 06 19:03:52 2005
@@ -506,7 +506,11 @@
   /* 5. If the file does not match the acceptance list, or is on the
      rejection list, chuck it out.  The same goes for the directory
      exclusion and inclusion lists.  */
+#ifdef HAVE_REGEX     
+  if (opt.includes || opt.excludes || opt.exclregdir)
+#else
   if (opt.includes || opt.excludes)
+#endif /* HAVE_REGEX */
     {
       if (!accdir (u->dir, ALLABS))
        {
diff -ruwb wget-regex2/src/utils.c wget-regex3/src/utils.c
--- wget-regex2/src/utils.c     Fri Apr 01 20:22:38 2005
+++ wget-regex3/src/utils.c     Wed Apr 06 18:41:45 2005
@@ -104,6 +104,10 @@
 #include "utils.h"
 #include "hash.h"
 
+#ifdef HAVE_REGEX
+#include <pcre.h>
+#endif /* HAVE_REGEX */
+
 #ifndef errno
 extern int errno;
 #endif
@@ -649,6 +653,15 @@
     --l;
   if (s[l] == '/')
     s += (l + 1);
+    
+#ifdef HAVE_REGEX
+  /* Reject S when it matches the --exclude-regex-files pattern.  */
+  if (opt.exclregfile && !accregex (opt.exclregfile_c, s))
+    {
+      return 0;
+    }
+#endif /* HAVE_REGEX */
+    
   if (opt.accepts)
     {
       if (opt.rejects)
@@ -662,6 +675,31 @@
   return 1;
 }
 
+#ifdef HAVE_REGEX
+
+/* Return 1 (accept) when the compiled pattern RE does not match S, and
+   0 (reject) when it does.  pcre_exec() reports "no match" and errors
+   as negative values, so an error also results in S being accepted.
+   NOTE(review): the patch as shown adds no prototype for accregex(),
+   yet it is called earlier in this file -- consider declaring it.  */
+int
+accregex (const pcre *re, const char *s)
+{
+  /* pcre_exec() wants a vector whose size is a multiple of three; one
+     triple is enough as only the overall match result is used.  */
+  int ovector[3];
+  int rc = pcre_exec (re,               /* the compiled pattern */
+                      NULL,             /* no study data */
+                      s,                /* the subject string */
+                      (int) strlen (s), /* the length of the subject */
+                      0, 0,             /* start offset, default options */
+                      ovector, 3);      /* match offsets and their count */
+
+  return rc < 0;
+}
+
+#endif /* HAVE_REGEX */
+
 /* Compare S1 and S2 frontally; S2 must begin with S1.  E.g. if S1 is
    `/something', frontcmp() will return 1 only if S2 begins with
    `/something'.  Otherwise, 0 is returned.  */
@@ -705,6 +743,16 @@
   /* Remove starting '/'.  */
   if (flags & ALLABS && *directory == '/')
     ++directory;
+
+#ifdef HAVE_REGEX
+  /* Reject DIRECTORY when it matches the --exclude-regex-dirs pattern;
+     the check itself produces no output.  */
+  if (opt.exclregdir && !accregex (opt.exclregdir_c, directory))
+    {
+      return 0;
+    }
+#endif /* HAVE_REGEX */
+    
   if (opt.includes)
     {
       if (!proclist (opt.includes, directory, flags))
diff -ruwb wget-regex2\windows\Makefile.src wget-regex3\windows\Makefile.src
--- wget-regex2\windows\Makefile.src    Sat Feb 26 02:23:22 2005
+++ wget-regex3\windows\Makefile.src    Wed Apr 06 22:11:01 2005
@@ -36,14 +36,28 @@
 SSLOBJ = gen_sslfunc$o
 !endif
 
+# RegEx support requires the PCRE library (see http://www.pcre.org/).
+
+# If you do not have PCRE installed or wish to build Wget without RegEx
+
+# support, either comment-out the following lines or define NO_REGEX.
+
+!ifndef NO_REGEX
+
+REGDEFS        = /DHAVE_REGEX
+
+REGLIBS        = pcre.lib
+
+!endif 
+
 o = .obj
 
 CC     = cl
 LD     = link
 RM     = -del
 
-DEFS   = /DWINDOWS /D_CONSOLE /DHAVE_CONFIG_H $(SSLDEFS)
-LIBS   = kernel32.lib advapi32.lib wsock32.lib user32.lib gdi32.lib $(SSLLIBS)
+DEFS   = /DWINDOWS /D_CONSOLE /DHAVE_CONFIG_H $(SSLDEFS) $(REGDEFS)
+LIBS   = kernel32.lib advapi32.lib wsock32.lib user32.lib gdi32.lib $(SSLLIBS) $(REGLIBS)
 
 !ifdef DEBUG
 CFLAGS = /nologo /MTd /Od /Zi /I. $(DEFS)
diff -ruwb wget-regex2\windows\README wget-regex3\windows\README
--- wget-regex2\windows\README  Thu Feb 12 20:09:10 2004
+++ wget-regex3\windows\README  Wed Apr 06 22:08:14 2005
@@ -32,6 +32,12 @@
 lines in windows\Makefile.src; then follow the normal instructions
 (configure.bat and so on).
 
+By default (for MSVC), wget is built with RegEx support, using the PCRE Library.
+You need PCRE from http://www.pcre.org. Get the source and compile it.
+Place pcre.h and pcre.lib somewhere your compiler can find them.
+pcre.h to "C:\Program Files\Microsoft Visual Studio\VC98\include"
+pcre.lib to "C:\Program Files\Microsoft Visual Studio\VC98\lib"
+
 If you want to build the help file you will need a copy of makeinfo to
 convert wget.texi to rtf and html.  I've made a copy available at
 <URL:ftp://sunsite.dk/projects/wget/makeinfo.zip>.  This copy of

Reply via email to