Hello, here are two new options to accept or reject a URL with a regular expression.
--regex-accept --regex-reject I have included #ifdef conditionals in order to make it optional; I plan to use an autoconf macro to detect whether the libc regex is usable. Do you find this patch useful? Nicolas. ChangeLog: * configure.in: Check for regex feature. doc/ChangeLog: * wget.texi (Recursive Accept/Reject Options): Document `--regex-accept' and `--regex-reject'. (Url-Based Limits): Ditto. (Wgetrc Commands): Ditto. src/ChangeLog: * init.c: New options `--regex-accept' and `--regex-reject'. * main.c: Ditto. * options.h: Ditto. * recur.c (download_child_p): Take opt.regex_accepts and opt.regex_rejects into account. * utils.c (regex_match): New function. (regex_accurl): Ditto. (free_regex_vec): Ditto. (append_regex_vec): Ditto. Index: configure.in =================================================================== RCS file: /pack/anoncvs/wget/configure.in,v retrieving revision 1.73 diff -u -r1.73 configure.in --- configure.in 2003/11/26 22:46:13 1.73 +++ configure.in 2004/02/08 22:42:03 @@ -74,6 +74,13 @@ test x"${ENABLE_DEBUG}" = xyes && AC_DEFINE([ENABLE_DEBUG], 1, [Define if you want the debug output support compiled in.]) +AC_ARG_ENABLE(regex, +[ --disable-regex disable support for regular expression url + matching], +ENABLE_REGEX=$enableval, ENABLE_REGEX=yes) +test x"${ENABLE_REGEX}" = xyes && AC_DEFINE([ENABLE_REGEX], 1, + [Define if you want the regex support compiled in.]) + wget_need_md5=no case "${USE_OPIE}${USE_DIGEST}" in Index: doc/wget.texi =================================================================== RCS file: /pack/anoncvs/wget/doc/wget.texi,v retrieving revision 1.97 diff -u -r1.97 wget.texi --- doc/wget.texi 2004/02/08 10:50:13 1.97 +++ doc/wget.texi 2004/02/08 22:42:08 @@ -1575,6 +1575,13 @@ download (@pxref{Directory-Based Limits} for more details.) Elements of @var{list} may contain wildcards. [EMAIL PROTECTED] [EMAIL PROTECTED] [EMAIL PROTECTED] [EMAIL PROTECTED] +Specify a regular expression used to accept or reject urls. 
Each use of these +options adds a regular expression to the corresponding list. To be accepted, +a URL must match at least one expression of the accept list and none of the reject +list. + @item -np @item --no-parent Do not ever ascend to the parent directory when retrieving recursively. @@ -1672,6 +1679,7 @@ * Spanning Hosts:: (Un)limiting retrieval based on host name. * Types of Files:: Getting only certain files. * Directory-Based Limits:: Getting only certain directories. +* Url-Based Limits:: Getting only certain urls. * Relative Links:: Follow relative links only. * FTP Links:: Following FTP links. @end menu @@ -1873,6 +1881,37 @@ intelligent fashion. @end table [EMAIL PROTECTED] Url-Based Limits [EMAIL PROTECTED] Url-Based Limits [EMAIL PROTECTED] url-based limits + +Some websites require clever rules to decide whether a file must be downloaded or +not, for example when all the information is included in the request part of a +URL. In such cases, directory or file type limits are not powerful enough. + +Wget offers two options to deal with this problem. Each option +description lists a long name and the equivalent command in @file{.wgetrc}. + [EMAIL PROTECTED] accept urls [EMAIL PROTECTED] urls, accept [EMAIL PROTECTED] @samp [EMAIL PROTECTED] --regex-accept @var{regex} [EMAIL PROTECTED] regex_accept = @var{regex} +The argument to @samp{--regex-accept} is a regular expression, like the ones used +by grep. This expression is added to a list of acceptable url patterns. To be +accepted, a URL must match at least one pattern in the list. + + + [EMAIL PROTECTED] reject urls [EMAIL PROTECTED] urls, reject [EMAIL PROTECTED] --regex-reject @var{regex} [EMAIL PROTECTED] regex_reject = @var{regex} +The @samp{--regex-reject} option works the same way as @samp{--regex-accept}, only +its logic is the reverse; Wget will download all urls @emph{except} the +ones matching any pattern in the list. 
[EMAIL PROTECTED] table + @node Relative Links @section Relative Links @cindex relative links @@ -2416,6 +2455,10 @@ Set HTTP @samp{Referer:} header just like @samp{--referer}. (Note it was the folks who wrote the @sc{http} spec who got the spelling of ``referrer'' wrong.) + [EMAIL PROTECTED] regex_accept/regex_reject = @var{string} +Same as @samp{--regex-accept}/@samp{--regex-reject} (@pxref{Url-Based +Limits}). @item quiet = on/off Quiet mode---the same as @samp{-q}. Index: src/init.c =================================================================== RCS file: /pack/anoncvs/wget/src/init.c,v retrieving revision 1.91 diff -u -r1.91 init.c --- src/init.c 2003/12/14 13:35:27 1.91 +++ src/init.c 2004/02/08 22:42:09 @@ -85,6 +85,9 @@ CMD_DECLARE (cmd_directory); CMD_DECLARE (cmd_time); CMD_DECLARE (cmd_vector); +#ifdef ENABLE_REGEX +CMD_DECLARE (cmd_regex_vector); +#endif CMD_DECLARE (cmd_spec_dirstruct); CMD_DECLARE (cmd_spec_header); @@ -191,6 +194,10 @@ { "reclevel", &opt.reclevel, cmd_number_inf }, { "recursive", NULL, cmd_spec_recursive }, { "referer", &opt.referer, cmd_string }, +#ifdef ENABLE_REGEX + { "regexaccept", &opt.regex_accepts, cmd_regex_vector }, + { "regexreject", &opt.regex_rejects, cmd_regex_vector }, +#endif { "reject", &opt.rejects, cmd_vector }, { "relativeonly", &opt.relative_only, cmd_boolean }, { "removelisting", &opt.remove_listing, cmd_boolean }, @@ -853,6 +860,45 @@ return 1; } +#ifdef ENABLE_REGEX + +static int +cmd_regex_vector (const char *com, const char *val, void *closure) +{ + int err; + regex_t ***pvec = (regex_t ***)closure; + + if (*val) + { + regex_t *r; + r = (regex_t *)xmalloc (sizeof (regex_t)); + /* Compile the regex. */ + err = regcomp (r, val, REG_NOSUB); + if (err) + { + size_t len; + char *errbuf; + len = regerror (err, r, NULL, 0); + errbuf = (char *)xmalloc (len * sizeof (char)); + regerror (err, r, errbuf, len); + /* Regular expression compilation error. 
*/ + fprintf (stderr, _("%s: %s: %s in `%s'.\n"), + exec_name, com, errbuf, val); + xfree (errbuf); + return 0; + } + *pvec = append_regex_vec (*pvec, r); + } + else + { + free_regex_vec (*pvec); + *pvec = NULL; + } + return 1; +} + +#endif /* ENABLE_REGEX */ + static int simple_atof PARAMS ((const char *, const char *, double *)); /* Enginge for cmd_bytes and cmd_bytes_large: converts a string such @@ -1323,6 +1369,10 @@ free_vec (opt.rejects); free_vec (opt.excludes); free_vec (opt.includes); +#ifdef ENABLE_REGEX + free_regex_vec (opt.regex_accepts); + free_regex_vec (opt.regex_rejects); +#endif free_vec (opt.domains); free_vec (opt.follow_tags); free_vec (opt.ignore_tags); Index: src/main.c =================================================================== RCS file: /pack/anoncvs/wget/src/main.c,v retrieving revision 1.110 diff -u -r1.110 main.c --- src/main.c 2003/12/14 13:35:27 1.110 +++ src/main.c 2004/02/08 22:42:10 @@ -226,6 +226,10 @@ { "read-timeout", 0, OPT_VALUE, "readtimeout", -1 }, { "recursive", 'r', OPT_BOOLEAN, "recursive", -1 }, { "referer", 0, OPT_VALUE, "referer", -1 }, +#ifdef ENABLE_REGEX + { "regex-accept", 0, OPT_VALUE, "regexaccept", -1 }, + { "regex-reject", 0, OPT_VALUE, "regexreject", -1 }, +#endif { "reject", 'R', OPT_VALUE, "reject", -1 }, { "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 }, { "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 }, @@ -605,6 +609,12 @@ -I, --include-directories=LIST list of allowed directories.\n"), N_("\ -X, --exclude-directories=LIST list of excluded directories.\n"), +#ifdef ENABLE_REGEX + N_("\ + --regex-accept=REGEX url matching the regex will be accepted.\n"), + N_("\ + --regex-reject=REGEX url matching the regex will be rejected.\n"), +#endif N_("\ -np, --no-parent don't ascend to the parent directory.\n"), "\n", Index: src/options.h =================================================================== RCS file: /pack/anoncvs/wget/src/options.h,v retrieving revision 1.45 diff -u -r1.45 
options.h --- src/options.h 2003/12/06 03:01:31 1.45 +++ src/options.h 2004/02/08 22:42:10 @@ -30,6 +30,10 @@ /* Needed for FDP. */ #include <stdio.h> +#ifdef ENABLE_REGEX +#include <regex.h> +#endif + struct options { int verbose; /* Are we verbose? */ @@ -68,6 +72,11 @@ char **excludes; /* List of excluded FTP directories. */ char **includes; /* List of FTP directories to follow. */ + +#ifdef ENABLE_REGEX + regex_t **regex_accepts; /* List of url regex to accept. */ + regex_t **regex_rejects; /* List of url regex to reject. */ +#endif char **domains; /* See host.c */ char **exclude_domains; Index: src/recur.c =================================================================== RCS file: /pack/anoncvs/wget/src/recur.c,v retrieving revision 1.58 diff -u -r1.58 recur.c --- src/recur.c 2003/11/02 19:56:37 1.58 +++ src/recur.c 2004/02/08 22:42:10 @@ -445,7 +445,8 @@ 6. check for suffix 7. check for same host (if spanhost is unset), with possible gethostbyname baggage - 8. check for robots.txt + 8. check for regex accepts & rejects + 9. check for robots.txt Addendum: If the URL is FTP, and it is to be loaded, only the domain and suffix settings are "stronger". @@ -541,7 +542,22 @@ goto out; } - /* 8. */ +#ifdef ENABLE_REGEX + + /* 8. If the url does not match the accept regexes, or match + the reject regexes, chuck it out. */ + if (opt.regex_accepts || opt.regex_rejects) + { + if (!regex_accurl (url)) + { + DEBUGP (("%s is rejected by regexes.\n", url)); + goto out; + } + } + +#endif /* ENABLE_REGEX */ + + /* 9. 
*/ if (opt.use_robots && u_scheme_like_http) { struct robot_specs *specs = res_get_specs (u->host, u->port); Index: src/utils.c =================================================================== RCS file: /pack/anoncvs/wget/src/utils.c,v retrieving revision 1.77 diff -u -r1.77 utils.c --- src/utils.c 2004/01/29 12:38:52 1.77 +++ src/utils.c 2004/02/08 22:42:12 @@ -600,6 +600,46 @@ return 1; } +#ifdef ENABLE_REGEX + +/* Return whether URL match one of the regex contained in the RV vector. */ +int +regex_match (const char *url, regex_t **rv) +{ + regex_t **r; + int err; + + if (!url || !rv) + return 0; + for (r = rv; *r; r++) + { + err = regexec (*r, url, 0, NULL, 0); + if (err == 0) + return 1; + } + return 0; +} + +/* Return whether URL is acceptable for download using regex-accept and + regex-reject lists. */ +int +regex_accurl (const char *url) +{ + if (opt.regex_accepts) + { + if (!regex_match (url, opt.regex_accepts)) + return 0; + } + if (opt.regex_rejects) + { + if (regex_match (url, opt.regex_rejects)) + return 0; + } + return 1; +} + +#endif /* ENABLE_REGEX */ + /* Return non-zero if STRING ends with TAIL. For instance: match_tail ("abc", "bc", 0) -> 1 @@ -957,11 +997,57 @@ /* Count v2. */ for (j = 0; v2[j]; j++); /* Reallocate v1. */ - v1 = (char **)xrealloc (v1, (i + j + 1) * sizeof (char **)); + v1 = (char **)xrealloc (v1, (i + j + 1) * sizeof (char *)); memcpy (v1 + i, v2, (j + 1) * sizeof (char *)); xfree (v2); return v1; } + +#ifdef ENABLE_REGEX + +/* Free the pointers in a NULL-terminated vector of regex_t, then + free the pointer itself. */ +void +free_regex_vec (regex_t **vec) +{ + if (vec) + { + regex_t **p = vec; + while (*p) + { + regfree (*p); + xfree (*p++); + } + xfree (vec); + } +} + +/* Append the regex R to vector V. The function reallocate V, not R (thus you + * may use the contents of R but not V after the call). 
*/ +regex_t ** +append_regex_vec (regex_t **v, regex_t *r) +{ + int i; + if (!r) + return v; + if (!v) + { + /* Make a new vector. */ + v = (regex_t **)xmalloc (2 * sizeof (regex_t *)); + v[0] = r; + v[1] = NULL; + return v; + } + /* Count v. */ + for (i = 0; v[i]; i++); + /* Reallocate v. */ + v = (regex_t **)xrealloc (v, (i + 1 + 1) * sizeof (regex_t *)); + v[i] = r; + v[i + 1] = NULL; + return v; +} + +#endif /* ENABLE_REGEX */ /* A set of simple-minded routines to store strings in a linked list. This used to also be used for searching, but now we have hash Index: src/utils.h =================================================================== RCS file: /pack/anoncvs/wget/src/utils.h,v retrieving revision 1.32 diff -u -r1.32 utils.h --- src/utils.h 2003/11/29 18:40:01 1.32 +++ src/utils.h 2004/02/08 22:42:12 @@ -86,6 +86,10 @@ int acceptable PARAMS ((const char *)); int accdir PARAMS ((const char *s, enum accd)); +#ifdef ENABLE_REGEX +int regex_match PARAMS ((const char *url, regex_t **rv)); +int regex_accurl PARAMS ((const char *url)); +#endif /* ENABLE_REGEX */ char *suffix PARAMS ((const char *s)); int match_tail PARAMS ((const char *, const char *, int)); int has_wildcards_p PARAMS ((const char *)); @@ -98,6 +102,10 @@ void free_vec PARAMS ((char **)); char **merge_vecs PARAMS ((char **, char **)); +#ifdef ENABLE_REGEX +void free_regex_vec PARAMS ((regex_t **vec)); +regex_t **append_regex_vec PARAMS ((regex_t **v, regex_t *r)); +#endif /* ENABLE_REGEX */ slist *slist_append PARAMS ((slist *, const char *)); slist *slist_prepend PARAMS ((slist *, const char *)); slist *slist_nreverse PARAMS ((slist *));