Hello, after reading so much about regex support for wget (especially the lack of it) and experiencing myself how annoying it can be when you have downloaded a hundred /thumbs/ directories, I tried to implement regex support myself. I used the PCRE library from http://www.pcre.org, which was pretty easy to use, given the fact that I had never touched a single line of C (or C++) code before. Unfortunately I don't know jack about autoconf, makefiles etc. The patch in its current form is only useful with MSVC, as I didn't alter any other makefiles. I hope someone can do that for me and include the PCRE license from http://www.pcre.org/license.txt
As you can see, pcre.h and pcre.lib need to be somewhere the compiler can find them, and HAVE_REGEX needs to be defined. Files and directories are ignored if the regex given on the command line matches. For syntax see wget --help. The patch was made against the current CVS code. Hope this helps somehow. Tobias
diff -ruwb wget-regex2/src/ftp.c wget-regex3/src/ftp.c --- wget-regex2/src/ftp.c Sat Apr 02 02:41:04 2005 +++ wget-regex3/src/ftp.c Wed Apr 06 18:55:24 2005 @@ -1749,7 +1749,11 @@ return res; /* First: weed out that do not conform the global rules given in opt.accepts and opt.rejects. */ +#ifdef HAVE_REGEX + if (opt.accepts || opt.rejects || opt.exclregfile) +#else if (opt.accepts || opt.rejects) +#endif /* HAVE_REGEX */ { f = start; while (f) diff -ruwb wget-regex2/src/init.c wget-regex3/src/init.c --- wget-regex2/src/init.c Sun Mar 20 17:07:38 2005 +++ wget-regex3/src/init.c Wed Apr 06 19:37:13 2005 @@ -137,6 +137,10 @@ #endif { "excludedirectories", &opt.excludes, cmd_directory_vector }, { "excludedomains", &opt.exclude_domains, cmd_vector }, +#ifdef HAVE_REGEX + { "excluderegexdir", &opt.exclregdir, cmd_string }, + { "excluderegexfile", &opt.exclregfile, cmd_string }, +#endif /* HAVE_REGEX */ { "followftp", &opt.follow_ftp, cmd_boolean }, { "followtags", &opt.follow_tags, cmd_vector }, { "forcehtml", &opt.force_html, cmd_boolean }, @@ -1367,6 +1371,12 @@ xfree_null (opt.sslcertkey); xfree_null (opt.sslcertfile); #endif /* HAVE_SSL */ +#ifdef HAVE_REGEX + xfree_null (opt.exclregdir_c) + xfree_null (opt.exclregfile_c) + xfree_null (opt.exclregdir); + xfree_null (opt.exclregfile); +#endif /* HAVE_REGEX */ xfree_null (opt.bind_address); xfree_null (opt.cookies_input); xfree_null (opt.cookies_output); diff -ruwb wget-regex2/src/main.c wget-regex3/src/main.c --- wget-regex2/src/main.c Tue Mar 22 15:20:02 2005 +++ wget-regex3/src/main.c Wed Apr 06 19:03:56 2005 @@ -68,6 +68,10 @@ /* On GNU system this will include system-wide getopt.h. 
*/ #include "getopt.h" +#ifdef HAVE_REGEX +#include <pcre.h> +#endif /* HAVE_REGEX */ + #ifndef PATH_SEPARATOR # define PATH_SEPARATOR '/' #endif @@ -176,6 +180,10 @@ { "egd-file", 0, OPT_VALUE, "egdfile", -1 }, { "exclude-directories", 'X', OPT_VALUE, "excludedirectories", -1 }, { "exclude-domains", 0, OPT_VALUE, "excludedomains", -1 }, +#ifdef HAVE_REGEX + { "exclude-regex-dirs", 0, OPT_VALUE, "excluderegexdir", -1 }, + { "exclude-regex-files", 0, OPT_VALUE, "excluderegexfile", -1 }, +#endif { "execute", 'e', OPT__EXECUTE, NULL, required_argument }, { "follow-ftp", 0, OPT_BOOLEAN, "followftp", -1 }, { "follow-tags", 0, OPT_VALUE, "followtags", -1 }, @@ -591,6 +599,12 @@ -D, --domains=LIST comma-separated list of accepted domains.\n"), N_("\ --exclude-domains=LIST comma-separated list of rejected domains.\n"), +#ifdef HAVE_REGEX + N_("\ + --exclude-regex-dirs=PATTERN pattern of directories to reject.\n"), + N_("\ + --exclude-regex-files=PATTERN pattern of files to reject.\n"), +#endif /* HAVE_REGEX */ N_("\ --follow-ftp follow FTP links from HTML documents.\n"), N_("\ @@ -647,6 +661,7 @@ int i, ret, longindex; int nurl, status; int append_to_log = 0; + const char *error; i18n_initialize (); @@ -819,6 +834,40 @@ exit (1); } #endif + +#ifdef HAVE_REGEX + if (opt.exclregdir) + { + opt.exclregdir_c = pcre_compile( + opt.exclregdir, /* the pattern */ + 0, /* default options */ + &error, /* for error message */ + &i, /* for error offset */ + NULL); /* use default character tables */ + + if (opt.exclregdir_c == NULL) + { + printf (_("Directory RegEx compilation failed at offset %d: %s\n"), i, error); + exit (1); + } + } + + if (opt.exclregfile) + { + opt.exclregfile_c = pcre_compile( + opt.exclregfile, /* the pattern */ + 0, /* default options */ + &error, /* for error message */ + &i, /* for error offset */ + NULL); /* use default character tables */ + + if (opt.exclregfile_c == NULL) + { + printf (_("File RegEx compilation failed at offset %d: %s\n"), i, error); + exit 
(1); + } + } +#endif /* HAVE_REGEX */ nurl = argc - optind; if (!nurl && !opt.input_filename) diff -ruwb wget-regex2/src/options.h wget-regex3/src/options.h --- wget-regex2/src/options.h Sat Mar 19 19:29:24 2005 +++ wget-regex3/src/options.h Wed Apr 06 18:15:55 2005 @@ -27,6 +27,10 @@ file, but you are not obligated to do so. If you do not wish to do so, delete this exception statement from your version. */ +#ifdef HAVE_REGEX +#include <pcre.h> +#endif /* HAVE_REGEX */ + struct options { int verbose; /* Are we verbose? */ @@ -65,6 +69,12 @@ char **excludes; /* List of excluded FTP directories. */ char **includes; /* List of FTP directories to follow. */ +#ifdef HAVE_REGEX + char *exclregdir; /* Pattern for regex exclusion */ + char *exclregfile; /* Pattern for regex exclusion */ + pcre *exclregdir_c; + pcre *exclregfile_c; +#endif /* HAVE_REGEX */ char **domains; /* See host.c */ char **exclude_domains; diff -ruwb wget-regex2/src/recur.c wget-regex3/src/recur.c --- wget-regex2/src/recur.c Sun Nov 02 21:56:36 2003 +++ wget-regex3/src/recur.c Wed Apr 06 19:03:52 2005 @@ -506,7 +506,11 @@ /* 5. If the file does not match the acceptance list, or is on the rejection list, chuck it out. The same goes for the directory exclusion and inclusion lists. 
*/ +#ifdef HAVE_REGEX + if (opt.includes || opt.excludes || opt.exclregdir) +#else if (opt.includes || opt.excludes) +#endif /* HAVE_REGEX */ { if (!accdir (u->dir, ALLABS)) { diff -ruwb wget-regex2/src/utils.c wget-regex3/src/utils.c --- wget-regex2/src/utils.c Fri Apr 01 20:22:38 2005 +++ wget-regex3/src/utils.c Wed Apr 06 18:41:45 2005 @@ -104,6 +104,10 @@ #include "utils.h" #include "hash.h" +#ifdef HAVE_REGEX +#include <pcre.h> +#endif /* HAVE_REGEX */ + #ifndef errno extern int errno; #endif @@ -649,6 +653,15 @@ --l; if (s[l] == '/') s += (l + 1); + +#ifdef HAVE_REGEX + if (opt.exclregfile) { + if (!accregex((const pcre *const *)opt.exclregfile_c, s)) { + return 0; + } + } +#endif /* HAVE_REGEX */ + if (opt.accepts) { if (opt.rejects) @@ -662,6 +675,31 @@ return 1; } +#ifdef HAVE_REGEX + +int +accregex(const pcre *re, const char *s) +{ + int rc; + int ovector[3]; + + rc = pcre_exec( + re, /* the compiled pattern */ + NULL, /* no extra data - we didn't study the pattern */ + s, /* the subject string */ + (int)strlen(s), /* the length of the subject */ + 0, /* start at offset 0 in the subject */ + 0, /* default options */ + ovector, /* output vector for substring information */ + 3); /* number of elements in the output vector */ + + // if rc < 0, the pattern did match -> accept + if (rc < 0) return 1; + else return 0; +} + +#endif /* HAVE_REGEX */ + /* Compare S1 and S2 frontally; S2 must begin with S1. E.g. if S1 is `/something', frontcmp() will return 1 only if S2 begins with `/something'. Otherwise, 0 is returned. */ @@ -705,6 +743,16 @@ /* Remove starting '/'. 
*/ if (flags & ALLABS && *directory == '/') ++directory; + +#ifdef HAVE_REGEX + if (opt.exclregdir) { + printf("Applying RegEx to %s\n",directory); + if (!accregex((const pcre *const *)opt.exclregdir_c, directory)) { + return 0; + } + } +#endif /* HAVE_REGEX */ + if (opt.includes) { if (!proclist (opt.includes, directory, flags)) diff -ruwb wget-regex2\windows\Makefile.src wget-regex3\windows\Makefile.src --- wget-regex2\windows\Makefile.src Sat Feb 26 02:23:22 2005 +++ wget-regex3\windows\Makefile.src Wed Apr 06 22:11:01 2005 @@ -36,14 +36,28 @@ SSLOBJ = gen_sslfunc$o !endif +# RegEx support requires the PCRE library (see http://www.pcre.org/). + +# If you do not have PCRE installed or wish to build Wget without RegEx + +# support, either comment-out the following lines or define NO_REGEX. + +!ifndef NO_REGEX + +REGDEFS = /DHAVE_REGEX + +REGLIBS = pcre.lib + +!endif + o = .obj CC = cl LD = link RM = -del -DEFS = /DWINDOWS /D_CONSOLE /DHAVE_CONFIG_H $(SSLDEFS) -LIBS = kernel32.lib advapi32.lib wsock32.lib user32.lib gdi32.lib $(SSLLIBS) +DEFS = /DWINDOWS /D_CONSOLE /DHAVE_CONFIG_H $(SSLDEFS) $(REGDEFS) +LIBS = kernel32.lib advapi32.lib wsock32.lib user32.lib gdi32.lib $(SSLLIBS) $(REGLIBS) !ifdef DEBUG CFLAGS = /nologo /MTd /Od /Zi /I. $(DEFS) diff -ruwb wget-regex2\windows\README wget-regex3\windows\README --- wget-regex2\windows\README Thu Feb 12 20:09:10 2004 +++ wget-regex3\windows\README Wed Apr 06 22:08:14 2005 @@ -32,6 +32,12 @@ lines in windows\Makefile.src; then follow the normal instructions (configure.bat and so on). +By default (for MSVC), wget is built with RegEx support, using the PCRE Library. +You need PCRE from http://www.pcre.org. Get the source and compile it. +Place pcre.h and pcre.lib somewhere your compiler can find them. 
+pcre.h to "C:\Program Files\Microsoft Visual Studio\VC98\include" +pcre.lib to "C:\Program Files\Microsoft Visual Studio\VC98\lib" + If you want to build the help file you will need a copy of makeinfo to convert wget.texi to rtf and html. I've made a copy available at <URL:ftp://sunsite.dk/projects/wget/makeinfo.zip>. This copy of