Hi,

Here is a new version of the regular expressions patch. It supports both POSIX regexes (always available, via gnulib) and PCRE regexes (only when libpcre is found at build time).

The patch adds these options:

 --accept-regex="..."
 --reject-regex="..."

 --regex-type=posix   for POSIX extended regexes (the default)
 --regex-type=pcre    for PCRE regexes (if PCRE is available)

In reference to the --match-query-string patch: since the regexes look at the complete URL, you can also use them to match the query string.

Regards,

Gijs
=== modified file 'ChangeLog'
--- ChangeLog	2012-03-25 11:47:53 +0000
+++ ChangeLog	2012-04-10 22:28:11 +0000
@@ -1,3 +1,8 @@
+2012-04-11  Gijs van Tulder  <gvtul...@gmail.com>
+
+	* bootstrap.conf (gnulib_modules): Include module `regex'.
+	* configure.ac: Check for PCRE library.
+
 2012-03-25 Ray Satiro <raysat...@yahoo.com>
 
 	* configure.ac: Fix build under mingw when OpenSSL is used.

=== modified file 'bootstrap.conf'
--- bootstrap.conf	2012-03-20 19:41:14 +0000
+++ bootstrap.conf	2012-04-04 15:09:08 +0000
@@ -58,6 +58,7 @@
 quote
 quotearg
 recv
+regex
 select
 send
 setsockopt

=== modified file 'configure.ac'
--- configure.ac	2012-03-25 11:47:53 +0000
+++ configure.ac	2012-04-10 21:59:48 +0000
@@ -532,6 +532,18 @@
                   ])
 )
 
+dnl
+dnl Check for PCRE
+dnl
+
+AC_CHECK_HEADER(pcre.h,
+                AC_CHECK_LIB(pcre, pcre_compile,
+                  [LIBS="${LIBS} -lpcre"
+                   AC_DEFINE([HAVE_LIBPCRE], 1,
+                             [Define if libpcre is available.])
+                  ])
+)
+
  
 dnl Needed by src/Makefile.am
 AM_CONDITIONAL([IRI_IS_ENABLED], [test "X$iri" != "Xno"])

=== modified file 'src/ChangeLog'
--- src/ChangeLog	2012-04-01 14:30:59 +0000
+++ src/ChangeLog	2012-04-10 22:30:28 +0000
@@ -1,3 +1,12 @@
+2012-04-11  Gijs van Tulder  <gvtul...@gmail.com>
+
+	* init.c: Add --accept-regex, --reject-regex and --regex-type.
+	* main.c: Likewise.
+	* options.h: Likewise.
+	* recur.c: Likewise.
+	* utils.c: Add regex-related functions.
+	* utils.h: Add regex-related functions.
+
 2012-04-01  Giuseppe Scrivano  <gscriv...@gnu.org>
 
 	* gnutls.c (wgnutls_read_timeout): Ensure timer is freed.

=== modified file 'src/init.c'
--- src/init.c	2012-03-08 09:00:51 +0000
+++ src/init.c	2012-04-10 22:10:10 +0000
@@ -46,6 +46,10 @@
 # endif
 #endif
 
+#include <regex.h>
+#ifdef HAVE_LIBPCRE
+# include <pcre.h>
+#endif
 
 #ifdef HAVE_PWD_H
 # include <pwd.h>
@@ -94,6 +98,7 @@
 CMD_DECLARE (cmd_spec_prefer_family);
 CMD_DECLARE (cmd_spec_progress);
 CMD_DECLARE (cmd_spec_recursive);
+CMD_DECLARE (cmd_spec_regex_type);
 CMD_DECLARE (cmd_spec_restrict_file_names);
 #ifdef HAVE_SSL
 CMD_DECLARE (cmd_spec_secure_protocol);
@@ -116,6 +121,7 @@
 } commands[] = {
   /* KEEP THIS LIST ALPHABETICALLY SORTED */
   { "accept",           &opt.accepts,           cmd_vector },
+  { "acceptregex",      &opt.acceptregex_s,     cmd_string },
   { "addhostdir",       &opt.add_hostdir,       cmd_boolean },
   { "adjustextension",  &opt.adjust_extension,  cmd_boolean },
   { "alwaysrest",       &opt.always_rest,       cmd_boolean }, /* deprecated */
@@ -236,7 +242,9 @@
   { "reclevel",         &opt.reclevel,          cmd_number_inf },
   { "recursive",        NULL,                   cmd_spec_recursive },
   { "referer",          &opt.referer,           cmd_string },
+  { "regextype",        &opt.regex_type,        cmd_spec_regex_type },
   { "reject",           &opt.rejects,           cmd_vector },
+  { "rejectregex",      &opt.rejectregex_s,     cmd_string },
   { "relativeonly",     &opt.relative_only,     cmd_boolean },
   { "remoteencoding",   &opt.encoding_remote,   cmd_string },
   { "removelisting",    &opt.remove_listing,    cmd_boolean },
@@ -361,6 +369,8 @@
   opt.restrict_files_nonascii = false;
   opt.restrict_files_case = restrict_no_case_restriction;
 
+  opt.regex_type = regex_type_posix;
+
   opt.max_redirect = 20;
 
   opt.waitretry = 10;
@@ -1368,6 +1378,25 @@
   return true;
 }
 
+/* Handler for the `regextype' command: decode VAL as a regex flavor
+   name and store it in opt.regex_type; complain if VAL is unknown.  */
+static bool
+cmd_spec_regex_type (const char *com, const char *val, void *place_ignored)
+{
+  static const struct decode_item flavors[] = {
+    { "posix", regex_type_posix },
+#ifdef HAVE_LIBPCRE
+    { "pcre",  regex_type_pcre },
+#endif
+  };
+  int chosen = regex_type_posix;
+  int recognized = decode_string (val, flavors, countof (flavors), &chosen);
+  opt.regex_type = chosen;
+  if (!recognized)
+    fprintf (stderr, _("%s: %s: Invalid value %s.\n"), exec_name, com, quote (val));
+  return recognized;
+}
+
 static bool
 cmd_spec_restrict_file_names (const char *com, const char *val, void *place_ignored)
 {

=== modified file 'src/main.c'
--- src/main.c	2012-03-05 21:23:06 +0000
+++ src/main.c	2012-04-10 22:25:56 +0000
@@ -158,6 +158,7 @@
 static struct cmdline_option option_data[] =
   {
     { "accept", 'A', OPT_VALUE, "accept", -1 },
+    { "accept-regex", 0, OPT_VALUE, "acceptregex", -1 },
     { "adjust-extension", 'E', OPT_BOOLEAN, "adjustextension", -1 },
     { "append-output", 'a', OPT__APPEND_OUTPUT, NULL, required_argument },
     { "ask-password", 0, OPT_BOOLEAN, "askpassword", -1 },
@@ -262,7 +263,9 @@
     { "read-timeout", 0, OPT_VALUE, "readtimeout", -1 },
     { "recursive", 'r', OPT_BOOLEAN, "recursive", -1 },
     { "referer", 0, OPT_VALUE, "referer", -1 },
+    { "regex-type", 0, OPT_VALUE, "regextype", -1 },
     { "reject", 'R', OPT_VALUE, "reject", -1 },
+    { "reject-regex", 0, OPT_VALUE, "rejectregex", -1 },
     { "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 },
     { "remote-encoding", 0, OPT_VALUE, "remoteencoding", -1 },
     { "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 },
@@ -723,6 +726,17 @@
     N_("\
   -R,  --reject=LIST               comma-separated list of rejected extensions.\n"),
     N_("\
+       --accept-regex=REGEX        regex matching accepted URLs.\n"),
+    N_("\
+       --reject-regex=REGEX        regex matching rejected URLs.\n"),
+#ifdef HAVE_LIBPCRE
+    N_("\
+       --regex-type=TYPE           regex type (posix|pcre).\n"),
+#else
+    N_("\
+       --regex-type=TYPE           regex type (posix).\n"),
+#endif
+    N_("\
   -D,  --domains=LIST              comma-separated list of accepted domains.\n"),
     N_("\
        --exclude-domains=LIST      comma-separated list of rejected domains.\n"),
@@ -1323,6 +1337,35 @@
       exit (1);
     }
 
+  /* Compile the regular expressions.  */
+  switch (opt.regex_type)
+    {
+#ifdef HAVE_LIBPCRE
+      case regex_type_pcre:
+        opt.regex_compile_fun = compile_pcre_regex;
+        opt.regex_match_fun = match_pcre_regex;
+        break;
+#endif
+
+      case regex_type_posix:
+      default:
+        opt.regex_compile_fun = compile_posix_regex;
+        opt.regex_match_fun = match_posix_regex;
+        break;
+    }
+  if (opt.acceptregex_s)
+    {
+      opt.acceptregex = opt.regex_compile_fun (opt.acceptregex_s);
+      if (!opt.acceptregex)
+        exit (1);
+    }
+  if (opt.rejectregex_s)
+    {
+      opt.rejectregex = opt.regex_compile_fun (opt.rejectregex_s);
+      if (!opt.rejectregex)
+        exit (1);
+    }
+
 #ifdef ENABLE_IRI
   if (opt.enable_iri)
     {

=== modified file 'src/options.h'
--- src/options.h	2012-03-05 21:23:06 +0000
+++ src/options.h	2012-04-10 22:20:26 +0000
@@ -74,6 +74,19 @@
   bool ignore_case;		/* Whether to ignore case when
 				   matching dirs and files */
 
+  char *acceptregex_s;		/* Patterns to accept (a regex string). */
+  char *rejectregex_s;		/* Patterns to reject (a regex string). */
+  void *acceptregex;		/* Patterns to accept (a regex struct). */
+  void *rejectregex;		/* Patterns to reject (a regex struct). */
+  enum {
+#ifdef HAVE_LIBPCRE
+    regex_type_pcre,
+#endif
+    regex_type_posix
+  } regex_type;   /* The regex library. */
+  void *(*regex_compile_fun)(const char *);  /* Function to compile a regex. */
+  bool (*regex_match_fun)(const void *, const char *);  /* Function to match a string to a regex. */
+
   char **domains;		/* See host.c */
   char **exclude_domains;
   bool dns_cache;		/* whether we cache DNS lookups. */

=== modified file 'src/recur.c'
--- src/recur.c	2011-03-30 23:37:12 +0000
+++ src/recur.c	2012-04-04 17:48:34 +0000
@@ -586,6 +586,11 @@
           goto out;
         }
     }
+  if (!accept_url (url))
+    {
+      DEBUGP (("%s is excluded/not-included through regex.\n", url));
+      goto out;
+    }
 
   /* 6. Check for acceptance/rejection rules.  We ignore these rules
      for directories (no file name to match) and for non-leaf HTMLs,

=== modified file 'src/utils.c'
--- src/utils.c	2012-03-29 18:13:27 +0000
+++ src/utils.c	2012-04-10 22:22:10 +0000
@@ -73,6 +73,11 @@
 #include <signal.h>
 #include <setjmp.h>
 
+#include <regex.h>
+#ifdef HAVE_LIBPCRE
+# include <pcre.h>
+#endif
+
 #ifndef HAVE_SIGSETJMP
 /* If sigsetjmp is a macro, configure won't pick it up. */
 # ifdef sigsetjmp
@@ -917,6 +922,19 @@
   return true;
 }
 
+/* Return true if the URL S passes the regex filters: it must match
+   opt.acceptregex (when set) and must not match opt.rejectregex
+   (when set).  With neither pattern set, every URL passes.  */
+bool
+accept_url (const char *s)
+{
+  bool accepted = !opt.acceptregex || opt.regex_match_fun (opt.acceptregex, s);
+  bool rejected = accepted && opt.rejectregex
+                  && opt.regex_match_fun (opt.rejectregex, s);
+
+  return accepted && !rejected;
+}
+
 /* Check if D2 is a subdirectory of D1.  E.g. if D1 is `/something', subdir_p()
    will return true if and only if D2 begins with `/something/' or is exactly
    '/something'.  */
@@ -2309,6 +2327,92 @@
   return q - (char *) dest;
 }
 
+#ifdef HAVE_LIBPCRE
+/* Compile STR as a PCRE pattern.  Returns the compiled pattern, or
+   NULL after printing a diagnostic to stderr if STR is invalid.  */
+void *
+compile_pcre_regex (const char *str)
+{
+  const char *errbuf;
+  int erroffset;
+  pcre *regex = pcre_compile (str, 0, &errbuf, &erroffset, NULL);
+  if (!regex)
+    {
+      fprintf (stderr, _("Invalid regular expression %s, %s\n"),
+               quote (str), errbuf);
+      return NULL;
+    }
+  return regex;
+}
+#endif
+
+/* Compile STR as a POSIX extended regex.  Returns the compiled regex,
+   or NULL after printing a diagnostic to stderr if STR is invalid.  */
+void *
+compile_posix_regex (const char *str)
+{
+  regex_t *regex = xmalloc (sizeof *regex);
+  int errcode = regcomp (regex, str, REG_EXTENDED | REG_NOSUB);
+
+  if (errcode != 0)
+    {
+      size_t errbuf_size = regerror (errcode, regex, NULL, 0);
+      char *errbuf = xmalloc (errbuf_size);
+      regerror (errcode, regex, errbuf, errbuf_size);
+      fprintf (stderr, _("Invalid regular expression %s, %s\n"),
+               quote (str), errbuf);
+      xfree (errbuf);
+      xfree (regex);   /* Don't leak the unused regex_t on failure.  */
+      return NULL;
+    }
+  return regex;
+}
+
+#ifdef HAVE_LIBPCRE
+#define OVECCOUNT 30
+/* Run the compiled PCRE pattern REGEX against STR.  Returns true on a
+   match; a failure other than "no match" is logged and returns false.  */
+bool
+match_pcre_regex (const void *regex, const char *str)
+{
+  int captures[OVECCOUNT];
+  int len = strlen (str);
+  int status = pcre_exec ((pcre *) regex, 0, str, len, 0, 0,
+                          captures, OVECCOUNT);
+
+  if (status >= 0)
+    return true;
+  /* PCRE_ERROR_NOMATCH is the ordinary negative result; anything else
+     is a real failure worth reporting.  */
+  if (status != PCRE_ERROR_NOMATCH)
+    logprintf (LOG_VERBOSE, _("Error while matching %s: %d\n"),
+               quote (str), status);
+  return false;
+}
+#undef OVECCOUNT
+#endif
+
+/* Match STR against the compiled POSIX regex REGEX.  Returns true on a
+   match; a regexec failure is reported and treated as no match.  */
+bool
+match_posix_regex (const void *regex, const char *str)
+{
+  const regex_t *re = regex;
+  int rc = regexec (re, str, 0, NULL, 0);
+  if (rc != 0 && rc != REG_NOMATCH)
+    {
+      /* Describe the failure using RE itself, not opt.acceptregex:
+         this function also serves opt.rejectregex.  */
+      size_t errbuf_size = regerror (rc, re, NULL, 0);
+      char *errbuf = xmalloc (errbuf_size);
+      regerror (rc, re, errbuf, errbuf_size);
+      logprintf (LOG_VERBOSE, _("Error while matching %s: %s\n"),
+                 quote (str), errbuf);
+      xfree (errbuf);
+    }
+  return rc == 0;
+}
+
 #undef IS_ASCII
 #undef NEXT_CHAR
 

=== modified file 'src/utils.h'
--- src/utils.h	2011-01-01 12:19:37 +0000
+++ src/utils.h	2012-04-10 22:10:39 +0000
@@ -90,6 +90,7 @@
 
 int fnmatch_nocase (const char *, const char *, int);
 bool acceptable (const char *);
+bool accept_url (const char *);
 bool accdir (const char *s);
 char *suffix (const char *s);
 bool match_tail (const char *, const char *, bool);
@@ -141,6 +142,14 @@
 int base64_encode (const void *, int, char *);
 int base64_decode (const char *, void *);
 
+#ifdef HAVE_LIBPCRE
+void *compile_pcre_regex (const char *);
+bool match_pcre_regex (const void *, const char *);
+#endif
+
+void *compile_posix_regex (const char *);
+bool match_posix_regex (const void *, const char *);
+
 void stable_sort (void *, size_t, size_t, int (*) (const void *, const void *));
 
 const char *print_decimal (double);

Reply via email to