Hi,

Here is a patch that adds the --acceptregex and --rejectregex options.

With these options it would be possible to do two things:

1. You can match complete URLs, instead of just the directory prefix or the file name suffix (which you can do with --accept and --include-directories). 2. You can use regular expressions to do the matching, which is sometimes easier than using a list of wildcard patterns.

Now this isn't a new idea (there are long discussions in the archive, see [1]). But somehow the previous attempts didn't make it, so I thought I'd send my own version. It's a small patch, I've been using it for a while and found it really useful.

I've made two versions of the patch: one uses PCRE, the other uses the gnulib regex library, which is probably easier to integrate.

Regards,

Gijs

[1] https://lists.gnu.org/archive/html/bug-wget/2009-09/msg00035.html
=== modified file 'bootstrap.conf'
--- bootstrap.conf	2012-03-20 19:41:14 +0000
+++ bootstrap.conf	2012-04-04 15:09:08 +0000
@@ -58,6 +58,7 @@
 quote
 quotearg
 recv
+regex
 select
 send
 setsockopt

=== modified file 'src/init.c'
--- src/init.c	2012-03-08 09:00:51 +0000
+++ src/init.c	2012-04-04 17:46:59 +0000
@@ -80,6 +80,7 @@
 CMD_DECLARE (cmd_directory_vector);
 CMD_DECLARE (cmd_number);
 CMD_DECLARE (cmd_number_inf);
+CMD_DECLARE (cmd_regex);
 CMD_DECLARE (cmd_string);
 CMD_DECLARE (cmd_file);
 CMD_DECLARE (cmd_directory);
@@ -116,6 +117,7 @@
 } commands[] = {
   /* KEEP THIS LIST ALPHABETICALLY SORTED */
   { "accept",           &opt.accepts,           cmd_vector },
+  { "acceptregex",      &opt.acceptregex,       cmd_regex },
   { "addhostdir",       &opt.add_hostdir,       cmd_boolean },
   { "adjustextension",  &opt.adjust_extension,  cmd_boolean },
   { "alwaysrest",       &opt.always_rest,       cmd_boolean }, /* deprecated */
@@ -237,6 +239,7 @@
   { "recursive",        NULL,                   cmd_spec_recursive },
   { "referer",          &opt.referer,           cmd_string },
   { "reject",           &opt.rejects,           cmd_vector },
+  { "rejectregex",      &opt.rejectregex,       cmd_regex },
   { "relativeonly",     &opt.relative_only,     cmd_boolean },
   { "remoteencoding",   &opt.encoding_remote,   cmd_string },
   { "removelisting",    &opt.remove_listing,    cmd_boolean },
@@ -943,6 +946,30 @@
   return true;
 }
 
+/* Compile VAL as a POSIX extended regex and store a pointer to it in
+   *PLACE.  On error, report it, leave *PLACE alone and return false.  */
+static bool
+cmd_regex (const char *com, const char *val, void *place)
+{
+  regex_t **regex = (regex_t **) place;
+  regex_t *compiled = xmalloc (sizeof *compiled);
+  int errcode = regcomp (compiled, val, REG_EXTENDED | REG_NOSUB);
+
+  if (errcode != 0)
+    {
+      size_t errbuf_size = regerror (errcode, compiled, NULL, 0);
+      char *errbuf = xmalloc (errbuf_size);
+      regerror (errcode, compiled, errbuf, errbuf_size);
+      fprintf (stderr, _("%s: %s: Invalid regular expression %s, %s\n"),
+               exec_name, com, quote (val), errbuf);
+      xfree (errbuf);
+      xfree (compiled);  /* Never publish a pattern that failed to compile.  */
+      return false;
+    }
+  *regex = compiled;
+  return true;
+}
+
 
 /* Like the above, but handles tilde-expansion when reading a user's
    `.wgetrc'.  In that case, and if VAL begins with `~', the tilde

=== modified file 'src/main.c'
--- src/main.c	2012-03-05 21:23:06 +0000
+++ src/main.c	2012-04-04 15:15:50 +0000
@@ -158,6 +158,7 @@
 static struct cmdline_option option_data[] =
   {
     { "accept", 'A', OPT_VALUE, "accept", -1 },
+    { "acceptregex", 0, OPT_VALUE, "acceptregex", -1 },
     { "adjust-extension", 'E', OPT_BOOLEAN, "adjustextension", -1 },
     { "append-output", 'a', OPT__APPEND_OUTPUT, NULL, required_argument },
     { "ask-password", 0, OPT_BOOLEAN, "askpassword", -1 },
@@ -263,6 +264,7 @@
     { "recursive", 'r', OPT_BOOLEAN, "recursive", -1 },
     { "referer", 0, OPT_VALUE, "referer", -1 },
     { "reject", 'R', OPT_VALUE, "reject", -1 },
+    { "rejectregex", 0, OPT_VALUE, "rejectregex", -1 },
     { "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 },
     { "remote-encoding", 0, OPT_VALUE, "remoteencoding", -1 },
     { "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 },
@@ -723,6 +725,10 @@
     N_("\
   -R,  --reject=LIST               comma-separated list of rejected extensions.\n"),
     N_("\
+       --acceptregex=REGEX         extended regex matching accepted URLs.\n"),
+    N_("\
+       --rejectregex=REGEX         extended regex matching rejected URLs.\n"),
+    N_("\
   -D,  --domains=LIST              comma-separated list of accepted domains.\n"),
     N_("\
        --exclude-domains=LIST      comma-separated list of rejected domains.\n"),

=== modified file 'src/options.h'
--- src/options.h	2012-03-05 21:23:06 +0000
+++ src/options.h	2012-04-04 17:43:42 +0000
@@ -29,6 +29,8 @@
 shall include the source code for the parts of OpenSSL used as well
 as that of the covered work.  */
 
+#include <regex.h>
+
 struct options
 {
   int verbose;			/* Are we verbose?  (First set to -1,
@@ -74,6 +76,9 @@
   bool ignore_case;		/* Whether to ignore case when
 				   matching dirs and files */
 
+  regex_t *acceptregex;		/* Patterns to accept. */
+  regex_t *rejectregex;		/* Patterns to reject. */
+
   char **domains;		/* See host.c */
   char **exclude_domains;
   bool dns_cache;		/* whether we cache DNS lookups. */

=== modified file 'src/recur.c'
--- src/recur.c	2011-03-30 23:37:12 +0000
+++ src/recur.c	2012-04-04 17:48:34 +0000
@@ -586,6 +586,11 @@
           goto out;
         }
     }
+  if (!accept_url (url))
+    {
+      DEBUGP (("%s is excluded/not-included through regex.\n", url));
+      goto out;
+    }
 
   /* 6. Check for acceptance/rejection rules.  We ignore these rules
      for directories (no file name to match) and for non-leaf HTMLs,

=== modified file 'src/utils.c'
--- src/utils.c	2012-03-29 18:13:27 +0000
+++ src/utils.c	2012-04-04 17:47:46 +0000
@@ -917,6 +917,48 @@
   return true;
 }
 
+/* Determine whether the URL S may be followed, according to the regex
+   patterns to accept/reject.  Matcher errors are logged with their text.  */
+bool
+accept_url (const char *s)
+{
+  int rc;
+  bool accept = true;
+
+  if (opt.acceptregex)
+    {
+      rc = regexec (opt.acceptregex, s, 0, NULL, 0);
+      if (rc == REG_NOMATCH)
+        accept = false;
+      else if (rc != 0)
+        {
+          size_t errbuf_size = regerror (rc, opt.acceptregex, NULL, 0);
+          char *errbuf = xmalloc (errbuf_size);
+          regerror (rc, opt.acceptregex, errbuf, errbuf_size);
+          logprintf (LOG_VERBOSE, _("Error while matching %s: %s\n"),
+                     quote (s), errbuf);
+          xfree (errbuf);
+          accept = false;  /* Not a confirmed match, so do not follow.  */
+        }
+    }
+  if (accept && opt.rejectregex)
+    {
+      rc = regexec (opt.rejectregex, s, 0, NULL, 0);
+      if (rc == 0)
+        accept = false;
+      else if (rc != REG_NOMATCH)  /* Matcher error: log it, keep the URL.  */
+        {
+          size_t errbuf_size = regerror (rc, opt.rejectregex, NULL, 0);
+          char *errbuf = xmalloc (errbuf_size);
+          regerror (rc, opt.rejectregex, errbuf, errbuf_size);
+          logprintf (LOG_VERBOSE, _("Error while matching %s: %s\n"),
+                     quote (s), errbuf);
+          xfree (errbuf);
+        }
+    }
+  return accept;
+}
+
 /* Check if D2 is a subdirectory of D1.  E.g. if D1 is `/something', subdir_p()
    will return true if and only if D2 begins with `/something/' or is exactly
    '/something'.  */

=== modified file 'src/utils.h'
--- src/utils.h	2011-01-01 12:19:37 +0000
+++ src/utils.h	2012-04-04 15:13:48 +0000
@@ -90,6 +90,7 @@
 
 int fnmatch_nocase (const char *, const char *, int);
 bool acceptable (const char *);
+bool accept_url (const char *);
 bool accdir (const char *s);
 char *suffix (const char *s);
 bool match_tail (const char *, const char *, bool);

=== modified file 'configure.ac'
--- configure.ac	2012-03-25 11:47:53 +0000
+++ configure.ac	2012-03-30 12:15:48 +0000
@@ -532,6 +532,18 @@
                   ])
 )
 
+dnl
+dnl Check for PCRE
+dnl
+
+AC_CHECK_HEADER(pcre.h,
+                AC_CHECK_LIB(pcre, pcre_compile,
+                  [LIBS="${LIBS} -lpcre"
+                   AC_DEFINE([HAVE_LIBPCRE], 1,
+                             [Define if libpcre is available.])
+                  ])
+)
+
  
 dnl Needed by src/Makefile.am
 AM_CONDITIONAL([IRI_IS_ENABLED], [test "X$iri" != "Xno"])

=== modified file 'src/init.c'
--- src/init.c	2012-03-08 09:00:51 +0000
+++ src/init.c	2012-03-30 13:06:42 +0000
@@ -80,6 +80,9 @@
 CMD_DECLARE (cmd_directory_vector);
 CMD_DECLARE (cmd_number);
 CMD_DECLARE (cmd_number_inf);
+#ifdef HAVE_LIBPCRE
+CMD_DECLARE (cmd_regex);
+#endif
 CMD_DECLARE (cmd_string);
 CMD_DECLARE (cmd_file);
 CMD_DECLARE (cmd_directory);
@@ -116,6 +119,9 @@
 } commands[] = {
   /* KEEP THIS LIST ALPHABETICALLY SORTED */
   { "accept",           &opt.accepts,           cmd_vector },
+#ifdef HAVE_LIBPCRE
+  { "acceptregex",      &opt.acceptregex,       cmd_regex },
+#endif
   { "addhostdir",       &opt.add_hostdir,       cmd_boolean },
   { "adjustextension",  &opt.adjust_extension,  cmd_boolean },
   { "alwaysrest",       &opt.always_rest,       cmd_boolean }, /* deprecated */
@@ -237,6 +243,9 @@
   { "recursive",        NULL,                   cmd_spec_recursive },
   { "referer",          &opt.referer,           cmd_string },
   { "reject",           &opt.rejects,           cmd_vector },
+#ifdef HAVE_LIBPCRE
+  { "rejectregex",      &opt.rejectregex,       cmd_regex },
+#endif
   { "relativeonly",     &opt.relative_only,     cmd_boolean },
   { "remoteencoding",   &opt.encoding_remote,   cmd_string },
   { "removelisting",    &opt.remove_listing,    cmd_boolean },
@@ -943,6 +952,30 @@
   return true;
 }
 
+#ifdef HAVE_LIBPCRE
+/* Compile VAL as a PCRE pattern and store a pointer to the compiled
+   pattern in *PLACE.  Report the error and return false on failure.  */
+static bool
+cmd_regex (const char *com, const char *val, void *place)
+{
+  pcre **pregex = (pcre **)place;
+  const char *error;
+  int erroffset;
+
+  *pregex = pcre_compile (val, 0, &error, &erroffset, NULL);
+
+  /* Test the compiled pattern, not the address of the option slot.  */
+  if (!*pregex)
+    {
+      fprintf (stderr, _("%s: %s: Invalid regular expression %s, %s\n"),
+               exec_name, com, quote (val), error);
+      return false;
+    }
+
+  return true;
+}
+#endif
+
 
 /* Like the above, but handles tilde-expansion when reading a user's
    `.wgetrc'.  In that case, and if VAL begins with `~', the tilde

=== modified file 'src/main.c'
--- src/main.c	2012-03-05 21:23:06 +0000
+++ src/main.c	2012-03-30 12:20:40 +0000
@@ -158,6 +158,9 @@
 static struct cmdline_option option_data[] =
   {
     { "accept", 'A', OPT_VALUE, "accept", -1 },
+#ifdef HAVE_LIBPCRE
+    { "acceptregex", 0, OPT_VALUE, "acceptregex", -1 },
+#endif
     { "adjust-extension", 'E', OPT_BOOLEAN, "adjustextension", -1 },
     { "append-output", 'a', OPT__APPEND_OUTPUT, NULL, required_argument },
     { "ask-password", 0, OPT_BOOLEAN, "askpassword", -1 },
@@ -263,6 +266,9 @@
     { "recursive", 'r', OPT_BOOLEAN, "recursive", -1 },
     { "referer", 0, OPT_VALUE, "referer", -1 },
     { "reject", 'R', OPT_VALUE, "reject", -1 },
+#ifdef HAVE_LIBPCRE
+    { "rejectregex", 0, OPT_VALUE, "rejectregex", -1 },
+#endif
     { "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 },
     { "remote-encoding", 0, OPT_VALUE, "remoteencoding", -1 },
     { "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 },
@@ -722,6 +728,12 @@
   -A,  --accept=LIST               comma-separated list of accepted extensions.\n"),
     N_("\
   -R,  --reject=LIST               comma-separated list of rejected extensions.\n"),
+#ifdef HAVE_LIBPCRE
+    N_("\
+       --acceptregex=REGEX         PCRE-compatible regex matching accepted URLs.\n"),
+    N_("\
+       --rejectregex=REGEX         PCRE-compatible regex matching rejected URLs.\n"),
+#endif
     N_("\
   -D,  --domains=LIST              comma-separated list of accepted domains.\n"),
     N_("\

=== modified file 'src/options.h'
--- src/options.h	2012-03-05 21:23:06 +0000
+++ src/options.h	2012-03-30 12:23:09 +0000
@@ -29,6 +29,10 @@
 shall include the source code for the parts of OpenSSL used as well
 as that of the covered work.  */
 
+#ifdef HAVE_LIBPCRE
+#include <pcre.h>
+#endif
+
 struct options
 {
   int verbose;			/* Are we verbose?  (First set to -1,
@@ -74,6 +78,11 @@
   bool ignore_case;		/* Whether to ignore case when
 				   matching dirs and files */
 
+#ifdef HAVE_LIBPCRE
+  pcre *acceptregex;		/* Patterns to accept. */
+  pcre *rejectregex;		/* Patterns to reject. */
+#endif
+
   char **domains;		/* See host.c */
   char **exclude_domains;
   bool dns_cache;		/* whether we cache DNS lookups. */

=== modified file 'src/recur.c'
--- src/recur.c	2011-03-30 23:37:12 +0000
+++ src/recur.c	2012-04-04 17:49:20 +0000
@@ -586,6 +586,14 @@
           goto out;
         }
     }
+#ifdef HAVE_LIBPCRE
+  /* accept_url is only declared and defined when PCRE is available.  */
+  if (!accept_url (url))
+    {
+      DEBUGP (("%s is excluded/not-included through regex.\n", url));
+      goto out;
+    }
+#endif
 
   /* 6. Check for acceptance/rejection rules.  We ignore these rules
      for directories (no file name to match) and for non-leaf HTMLs,

=== modified file 'src/utils.c'
--- src/utils.c	2012-03-29 18:13:27 +0000
+++ src/utils.c	2012-03-30 13:26:44 +0000
@@ -917,6 +917,44 @@
   return true;
 }
 
+#ifdef HAVE_LIBPCRE
+#define OVECCOUNT 30
+/* Determine whether the URL S may be followed, according to the PCRE
+   patterns to accept/reject.  A plain non-match is not logged as an error.  */
+bool
+accept_url (const char *s)
+{
+  int l = strlen (s);
+  int rc;
+  int ovector[OVECCOUNT];
+  bool accept = true;
+
+  if (opt.acceptregex)
+    {
+      rc = pcre_exec (opt.acceptregex, 0, s, l, 0, 0, ovector, OVECCOUNT);
+      if (rc == PCRE_ERROR_NOMATCH)
+        accept = false;
+      else if (rc < 0)
+        {
+          logprintf (LOG_VERBOSE, _("Error while matching %s: %d\n"),
+                     quote (s), rc);
+          accept = false;
+        }
+    }
+  if (accept && opt.rejectregex)
+    {
+      rc = pcre_exec (opt.rejectregex, 0, s, l, 0, 0, ovector, OVECCOUNT);
+      if (rc >= 0)
+        accept = false;
+      else if (rc != PCRE_ERROR_NOMATCH)
+        logprintf (LOG_VERBOSE, _("Error while matching %s: %d\n"),
+                   quote (s), rc);
+    }
+  return accept;
+}
+#undef OVECCOUNT
+#endif
+
 /* Check if D2 is a subdirectory of D1.  E.g. if D1 is `/something', subdir_p()
    will return true if and only if D2 begins with `/something/' or is exactly
    '/something'.  */

=== modified file 'src/utils.h'
--- src/utils.h	2011-01-01 12:19:37 +0000
+++ src/utils.h	2012-03-30 13:30:25 +0000
@@ -90,6 +90,9 @@
 
 int fnmatch_nocase (const char *, const char *, int);
 bool acceptable (const char *);
+#ifdef HAVE_LIBPCRE
+bool accept_url (const char *);
+#endif
 bool accdir (const char *s);
 char *suffix (const char *s);
 bool match_tail (const char *, const char *, bool);

Reply via email to