Regex matching of url

Nicolas Schodet Wed, 11 Feb 2004 08:32:27 -0800

Hello,

Here are two new options to accept or reject a URL with a regular
expression.


--regex-accept
--regex-reject

I have included #ifdef conditionals in order to make it optional; I
plan to use an autoconf macro to detect whether the libc regex is usable.

Do you find this patch useful?

Nicolas.



ChangeLog:

        * configure.in: Check for regex feature.

doc/ChangeLog:

        * wget.texi (Recursive Accept/Reject Options): Document
        `--regex-accept' and `--regex-reject'.
        (Url-Based Limits): Ditto.
        (Wgetrc Commands): Ditto.

src/ChangeLog:

        * init.c: New options `--regex-accept' and `--regex-reject'.

        * main.c: Ditto.

        * options.h: Ditto.

        * recur.c (download_child_p): Take opt.regex_accepts and
        opt.regex_rejects into account.

        * utils.c (regex_match): New function.
        (regex_accurl): Ditto.
        (free_regex_vec): Ditto.
        (append_regex_vec): Ditto.


Index: configure.in
===================================================================
RCS file: /pack/anoncvs/wget/configure.in,v
retrieving revision 1.73
diff -u -r1.73 configure.in
--- configure.in        2003/11/26 22:46:13     1.73
+++ configure.in        2004/02/08 22:42:03
@@ -74,6 +74,13 @@
 test x"${ENABLE_DEBUG}" = xyes && AC_DEFINE([ENABLE_DEBUG], 1,
    [Define if you want the debug output support compiled in.])
 
+AC_ARG_ENABLE(regex,
+[  --disable-regex         disable support for regular expression url
+                          matching],
+ENABLE_REGEX=$enableval, ENABLE_REGEX=yes)
+test x"${ENABLE_REGEX}" = xyes && AC_DEFINE([ENABLE_REGEX], 1,
+   [Define if you want the regex support compiled in.])
+
 wget_need_md5=no
 
 case "${USE_OPIE}${USE_DIGEST}" in
Index: doc/wget.texi
===================================================================
RCS file: /pack/anoncvs/wget/doc/wget.texi,v
retrieving revision 1.97
diff -u -r1.97 wget.texi
--- doc/wget.texi       2004/02/08 10:50:13     1.97
+++ doc/wget.texi       2004/02/08 22:42:08
@@ -1575,6 +1575,13 @@
 download (@pxref{Directory-Based Limits} for more details.)  Elements of
 @var{list} may contain wildcards.
 
+@item --regex-accept=@var{regex}
+@itemx --regex-reject=@var{regex}
+Specify a regular expression used to accept or reject URLs.  Each use of these
+options adds a regular expression to the corresponding list.  To be accepted,
+a URL must match at least one expression of the accept list and none of the
+reject list.
+
 @item -np
 @item --no-parent
 Do not ever ascend to the parent directory when retrieving recursively.
@@ -1672,6 +1679,7 @@
 * Spanning Hosts::         (Un)limiting retrieval based on host name.
 * Types of Files::         Getting only certain files.
 * Directory-Based Limits:: Getting only certain directories.
+* Url-Based Limits::       Getting only certain urls.
 * Relative Links::         Follow relative links only.
 * FTP Links::              Following FTP links.
 @end menu
@@ -1873,6 +1881,37 @@
 intelligent fashion.
 @end table
 
+@node Url-Based Limits
+@section Url-Based Limits
+@cindex url-based limits
+
+Some websites require clever rules to decide whether a file must be downloaded
+or not.  For example, all the information may be carried in the request part of
+a URL.  In such cases, directory or file type limits are not powerful enough.
+
+Wget offers two options to deal with this problem.  Each option
+description lists a long name and the equivalent command in @file{.wgetrc}.
+
+@cindex accept urls
+@cindex urls, accept
+@table @samp
+@item --regex-accept @var{regex}
+@itemx regex_accept = @var{regex}
+The argument to @samp{--regex-accept} is a regular expression, like ones used
+by grep. This expression is added to a list of acceptable url patterns. To be
+accepted, an url must match any pattern in the list.
+
+
+
+@cindex reject urls
+@cindex urls, reject
+@item --regex-reject @var{regex}
+@itemx regex_reject = @var{regex}
+The @samp{--regex-reject} option works the same way as @samp{--regex-accept}, only
+its logic is the reverse; Wget will download all urls @emph{except} the
+ones matching any pattern in the list.
+@end table
+
 @node Relative Links
 @section Relative Links
 @cindex relative links
@@ -2416,6 +2455,10 @@
 Set HTTP @samp{Referer:} header just like @samp{--referer}.  (Note it
 was the folks who wrote the @sc{http} spec who got the spelling of
 ``referrer'' wrong.)
+
+@item regex_accept/regex_reject = @var{string}
+Same as @samp{--regex-accept}/@samp{--regex-reject} (@pxref{Url-Based
+Limits}).
 
 @item quiet = on/off
 Quiet mode---the same as @samp{-q}.
Index: src/init.c
===================================================================
RCS file: /pack/anoncvs/wget/src/init.c,v
retrieving revision 1.91
diff -u -r1.91 init.c
--- src/init.c  2003/12/14 13:35:27     1.91
+++ src/init.c  2004/02/08 22:42:09
@@ -85,6 +85,9 @@
 CMD_DECLARE (cmd_directory);
 CMD_DECLARE (cmd_time);
 CMD_DECLARE (cmd_vector);
+#ifdef ENABLE_REGEX
+CMD_DECLARE (cmd_regex_vector);
+#endif
 
 CMD_DECLARE (cmd_spec_dirstruct);
 CMD_DECLARE (cmd_spec_header);
@@ -191,6 +194,10 @@
   { "reclevel",                &opt.reclevel,          cmd_number_inf },
   { "recursive",       NULL,                   cmd_spec_recursive },
   { "referer",         &opt.referer,           cmd_string },
+#ifdef ENABLE_REGEX
+  { "regexaccept",     &opt.regex_accepts,     cmd_regex_vector },
+  { "regexreject",     &opt.regex_rejects,     cmd_regex_vector },
+#endif
   { "reject",          &opt.rejects,           cmd_vector },
   { "relativeonly",    &opt.relative_only,     cmd_boolean },
   { "removelisting",   &opt.remove_listing,    cmd_boolean },
@@ -853,6 +860,45 @@
   return 1;
 }
 
+#ifdef ENABLE_REGEX
+
+static int
+cmd_regex_vector (const char *com, const char *val, void *closure)
+{
+  int err;
+  regex_t ***pvec = (regex_t ***)closure; /* CLOSURE is the address of the regex vector to update.  */
+
+  if (*val)
+    {
+      regex_t *r;
+      r = (regex_t *)xmalloc (sizeof (regex_t));
+      /* Compile VAL; REG_NOSUB because only match/no-match is needed, not submatches.  */
+      err = regcomp (r, val, REG_NOSUB);
+      if (err)
+       {
+         size_t len;
+         char *errbuf;
+         len = regerror (err, r, NULL, 0); /* Zero-size call: ask for the message length.  */
+         errbuf = (char *)xmalloc (len * sizeof (char));
+         regerror (err, r, errbuf, len);
+         /* Report the compilation error.  NOTE(review): R is not freed on this path.  */
+         fprintf (stderr, _("%s: %s: %s in `%s'.\n"),
+                  exec_name, com, errbuf, val);
+         xfree (errbuf);
+         return 0;
+       }
+      *pvec = append_regex_vec (*pvec, r);
+    }
+  else
+    {
+      free_regex_vec (*pvec); /* An empty VAL clears the whole list.  */
+      *pvec = NULL;
+    }
+  return 1;
+}
+
+#endif /* ENABLE_REGEX */
+
 static int simple_atof PARAMS ((const char *, const char *, double *));
 
 /* Enginge for cmd_bytes and cmd_bytes_large: converts a string such
@@ -1323,6 +1369,10 @@
   free_vec (opt.rejects);
   free_vec (opt.excludes);
   free_vec (opt.includes);
+#ifdef ENABLE_REGEX
+  free_regex_vec (opt.regex_accepts);
+  free_regex_vec (opt.regex_rejects);
+#endif
   free_vec (opt.domains);
   free_vec (opt.follow_tags);
   free_vec (opt.ignore_tags);
Index: src/main.c
===================================================================
RCS file: /pack/anoncvs/wget/src/main.c,v
retrieving revision 1.110
diff -u -r1.110 main.c
--- src/main.c  2003/12/14 13:35:27     1.110
+++ src/main.c  2004/02/08 22:42:10
@@ -226,6 +226,10 @@
     { "read-timeout", 0, OPT_VALUE, "readtimeout", -1 },
     { "recursive", 'r', OPT_BOOLEAN, "recursive", -1 },
     { "referer", 0, OPT_VALUE, "referer", -1 },
+#ifdef ENABLE_REGEX
+    { "regex-accept", 0, OPT_VALUE, "regexaccept", -1 },
+    { "regex-reject", 0, OPT_VALUE, "regexreject", -1 },
+#endif
     { "reject", 'R', OPT_VALUE, "reject", -1 },
     { "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 },
     { "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 },
@@ -605,6 +609,12 @@
   -I,  --include-directories=LIST  list of allowed directories.\n"),
     N_("\
   -X,  --exclude-directories=LIST  list of excluded directories.\n"),
+#ifdef ENABLE_REGEX
+    N_("\
+       --regex-accept=REGEX        url matching the regex will be accepted.\n"),
+    N_("\
+       --regex-reject=REGEX        url matching the regex will be rejected.\n"),
+#endif
     N_("\
   -np, --no-parent                 don't ascend to the parent directory.\n"),
     "\n",
Index: src/options.h
===================================================================
RCS file: /pack/anoncvs/wget/src/options.h,v
retrieving revision 1.45
diff -u -r1.45 options.h
--- src/options.h       2003/12/06 03:01:31     1.45
+++ src/options.h       2004/02/08 22:42:10
@@ -30,6 +30,10 @@
 /* Needed for FDP.  */
 #include <stdio.h>
 
+#ifdef ENABLE_REGEX
+#include <regex.h>
+#endif
+
 struct options
 {
   int verbose;                 /* Are we verbose? */
@@ -68,6 +72,11 @@
   char **excludes;             /* List of excluded FTP directories. */
   char **includes;             /* List of FTP directories to
                                   follow. */
+
+#ifdef ENABLE_REGEX
+  regex_t **regex_accepts;     /* List of url regex to accept. */
+  regex_t **regex_rejects;     /* List of url regex to reject. */
+#endif
 
   char **domains;              /* See host.c */
   char **exclude_domains;
Index: src/recur.c
===================================================================
RCS file: /pack/anoncvs/wget/src/recur.c,v
retrieving revision 1.58
diff -u -r1.58 recur.c
--- src/recur.c 2003/11/02 19:56:37     1.58
+++ src/recur.c 2004/02/08 22:42:10
@@ -445,7 +445,8 @@
      6. check for suffix
      7. check for same host (if spanhost is unset), with possible
      gethostbyname baggage
-     8. check for robots.txt
+     8. check for regex accepts & rejects
+     9. check for robots.txt
 
      Addendum: If the URL is FTP, and it is to be loaded, only the
      domain and suffix settings are "stronger".
@@ -541,7 +542,22 @@
        goto out;
       }
 
-  /* 8. */
+#ifdef ENABLE_REGEX
+
+  /* 8. If the url does not match the accept regexes, or match
+     the reject regexes, chuck it out.  */
+  if (opt.regex_accepts || opt.regex_rejects)
+    {
+      if (!regex_accurl (url))
+       {
+         DEBUGP (("%s is rejected by regexes.\n", url));
+         goto out;
+       }
+    }
+
+#endif /* ENABLE_REGEX */
+
+  /* 9. */
   if (opt.use_robots && u_scheme_like_http)
     {
       struct robot_specs *specs = res_get_specs (u->host, u->port);
Index: src/utils.c
===================================================================
RCS file: /pack/anoncvs/wget/src/utils.c,v
retrieving revision 1.77
diff -u -r1.77 utils.c
--- src/utils.c 2004/01/29 12:38:52     1.77
+++ src/utils.c 2004/02/08 22:42:12
@@ -600,6 +600,46 @@
   return 1;
 }
 
+#ifdef ENABLE_REGEX
+
+/* Return 1 if URL matches any regex of the NULL-terminated vector RV, else 0.  */
+int
+regex_match (const char *url, regex_t **rv)
+{
+  regex_t **r;
+  int err;
+
+  if (!url || !rv) /* Nothing to match against: report no match.  */
+    return 0;
+  for (r = rv; *r; r++)
+    {
+      err = regexec (*r, url, 0, NULL, 0); /* No submatch array is requested.  */
+      if (err == 0) /* regexec returns 0 on a match; stop at the first one.  */
+       return 1;
+    }
+  return 0;
+}
+
+/* Return 1 if URL passes the regex filters, else 0: when an accept list is set,
+   URL must match one of its regexes; URL must match none of the reject list.  */
+int
+regex_accurl (const char *url)
+{
+  if (opt.regex_accepts) /* An unset accept list imposes no requirement.  */
+    {
+      if (!regex_match (url, opt.regex_accepts))
+       return 0;
+    }
+  if (opt.regex_rejects) /* The reject list is checked after the accept list.  */
+    {
+      if (regex_match (url, opt.regex_rejects))
+       return 0;
+    }
+  return 1;
+}
+
+#endif /* ENABLE_REGEX */
+
 /* Return non-zero if STRING ends with TAIL.  For instance:
 
    match_tail ("abc", "bc", 0)  -> 1
@@ -957,11 +997,57 @@
   /* Count v2.  */
   for (j = 0; v2[j]; j++);
   /* Reallocate v1.  */
-  v1 = (char **)xrealloc (v1, (i + j + 1) * sizeof (char **));
+  v1 = (char **)xrealloc (v1, (i + j + 1) * sizeof (char *));
   memcpy (v1 + i, v2, (j + 1) * sizeof (char *));
   xfree (v2);
   return v1;
 }
+
+#ifdef ENABLE_REGEX
+
+/* Release every compiled regex in the NULL-terminated vector VEC, then free
+   VEC itself.  A NULL VEC is accepted and nothing is done.  */
+void
+free_regex_vec (regex_t **vec)
+{
+  if (vec)
+    {
+      regex_t **p = vec;
+      while (*p)
+       {
+         regfree (*p); /* Release what regcomp allocated inside the regex_t.  */
+         xfree (*p++); /* Free the regex_t storage itself, then advance.  */
+       }
+      xfree (vec);
+    }
+}
+
+/* Append regex R to the NULL-terminated vector V and return the resulting vector.
+   V may be reallocated, so use only the returned pointer; R is stored, not copied.  */
+regex_t **
+append_regex_vec (regex_t **v, regex_t *r)
+{
+  int i;
+  if (!r) /* Nothing to append: V is returned unchanged.  */
+    return v;
+  if (!v)
+    {
+      /* Make a new vector holding R plus the NULL terminator.  */
+      v = (regex_t **)xmalloc (2 * sizeof (regex_t *));
+      v[0] = r;
+      v[1] = NULL;
+      return v;
+    }
+  /* Count the entries already in V.  */
+  for (i = 0; v[i]; i++);
+  /* Grow V by one slot, keeping room for the NULL terminator.  */
+  v = (regex_t **)xrealloc (v, (i + 1 + 1) * sizeof (regex_t *));
+  v[i] = r;
+  v[i + 1] = NULL;
+  return v;
+}
+
+#endif /* ENABLE_REGEX */
 
 /* A set of simple-minded routines to store strings in a linked list.
    This used to also be used for searching, but now we have hash
Index: src/utils.h
===================================================================
RCS file: /pack/anoncvs/wget/src/utils.h,v
retrieving revision 1.32
diff -u -r1.32 utils.h
--- src/utils.h 2003/11/29 18:40:01     1.32
+++ src/utils.h 2004/02/08 22:42:12
@@ -86,6 +86,10 @@
 
 int acceptable PARAMS ((const char *));
 int accdir PARAMS ((const char *s, enum accd));
+#ifdef ENABLE_REGEX
+int regex_match PARAMS ((const char *url, regex_t **rv));
+int regex_accurl PARAMS ((const char *url));
+#endif /* ENABLE_REGEX */
 char *suffix PARAMS ((const char *s));
 int match_tail PARAMS ((const char *, const char *, int));
 int has_wildcards_p PARAMS ((const char *));
@@ -98,6 +102,10 @@
 
 void free_vec PARAMS ((char **));
 char **merge_vecs PARAMS ((char **, char **));
+#ifdef ENABLE_REGEX
+void free_regex_vec PARAMS ((regex_t **vec));
+regex_t **append_regex_vec PARAMS ((regex_t **v, regex_t *r));
+#endif /* ENABLE_REGEX */
 slist *slist_append PARAMS ((slist *, const char *));
 slist *slist_prepend PARAMS ((slist *, const char *));
 slist *slist_nreverse PARAMS ((slist *));

Regex matching of url

Reply via email to