Hi,
Here is a patch that adds the --acceptregex and --rejectregex options.
With these options it would be possible to do two things:
1. You can match complete urls, instead of just the directory prefix or
the file name suffix (which you can do with --accept and
--include-directories).
2. You can use regular expressions to do the matching, which is
sometimes easier than using a list of wildcard patterns.
Now this isn't a new idea (there are long discussions in the archive,
see [1]). But somehow the previous attempts didn't make it, so I thought
I'd send my own version. It's a small patch, I've been using it for a
while and found it really useful.
I've made two versions of the patch: one uses PCRE, the other uses the
gnulib regex library, which is probably easier to integrate.
Regards,
Gijs
[1] https://lists.gnu.org/archive/html/bug-wget/2009-09/msg00035.html
=== modified file 'bootstrap.conf'
--- bootstrap.conf 2012-03-20 19:41:14 +0000
+++ bootstrap.conf 2012-04-04 15:09:08 +0000
@@ -58,6 +58,7 @@
quote
quotearg
recv
+regex
select
send
setsockopt
=== modified file 'src/init.c'
--- src/init.c 2012-03-08 09:00:51 +0000
+++ src/init.c 2012-04-04 17:46:59 +0000
@@ -80,6 +80,7 @@
CMD_DECLARE (cmd_directory_vector);
CMD_DECLARE (cmd_number);
CMD_DECLARE (cmd_number_inf);
+CMD_DECLARE (cmd_regex);
CMD_DECLARE (cmd_string);
CMD_DECLARE (cmd_file);
CMD_DECLARE (cmd_directory);
@@ -116,6 +117,7 @@
} commands[] = {
/* KEEP THIS LIST ALPHABETICALLY SORTED */
{ "accept", &opt.accepts, cmd_vector },
+ { "acceptregex", &opt.acceptregex, cmd_regex },
{ "addhostdir", &opt.add_hostdir, cmd_boolean },
{ "adjustextension", &opt.adjust_extension, cmd_boolean },
{ "alwaysrest", &opt.always_rest, cmd_boolean }, /* deprecated */
@@ -237,6 +239,7 @@
{ "recursive", NULL, cmd_spec_recursive },
{ "referer", &opt.referer, cmd_string },
{ "reject", &opt.rejects, cmd_vector },
+ { "rejectregex", &opt.rejectregex, cmd_regex },
{ "relativeonly", &opt.relative_only, cmd_boolean },
{ "remoteencoding", &opt.encoding_remote, cmd_string },
{ "removelisting", &opt.remove_listing, cmd_boolean },
@@ -943,6 +946,30 @@
return true;
}
+/* Compile VAL as a POSIX extended regular expression and store a
+   pointer to the newly allocated regex_t in *PLACE, which must point
+   to a `regex_t *'.  COM is the command name, used in diagnostics.
+
+   Returns true on success.  On failure a message is printed to
+   stderr, *PLACE is left unchanged and false is returned.  */
+static bool
+cmd_regex (const char *com, const char *val, void *place)
+{
+  regex_t **regex = (regex_t **) place;
+  regex_t *compiled = malloc (sizeof *compiled);
+  int errcode;
+
+  if (!compiled)
+    {
+      fprintf (stderr, _("%s: %s: Not enough memory.\n"), exec_name, com);
+      return false;
+    }
+
+  errcode = regcomp (compiled, val, REG_EXTENDED | REG_NOSUB);
+  if (errcode != 0)
+    {
+      /* regerror truncates and NUL-terminates the message to fit.  */
+      char errbuf[256];
+      regerror (errcode, compiled, errbuf, sizeof errbuf);
+      fprintf (stderr, _("%s: %s: Invalid regular expression %s, %s\n"),
+               exec_name, com, quote (val), errbuf);
+      /* The contents of COMPILED are unusable after a failed regcomp,
+         so release the storage instead of publishing it via *PLACE.  */
+      free (compiled);
+      return false;
+    }
+
+  /* A later occurrence of the option replaces an earlier pattern.
+     NOTE(review): assumes *PLACE is NULL until first set -- confirm
+     that the options structure is zero-initialized.  */
+  if (*regex)
+    {
+      regfree (*regex);
+      free (*regex);
+    }
+  *regex = compiled;
+  return true;
+}
+
/* Like the above, but handles tilde-expansion when reading a user's
`.wgetrc'. In that case, and if VAL begins with `~', the tilde
=== modified file 'src/main.c'
--- src/main.c 2012-03-05 21:23:06 +0000
+++ src/main.c 2012-04-04 15:15:50 +0000
@@ -158,6 +158,7 @@
static struct cmdline_option option_data[] =
{
{ "accept", 'A', OPT_VALUE, "accept", -1 },
+ { "acceptregex", 0, OPT_VALUE, "acceptregex", -1 },
{ "adjust-extension", 'E', OPT_BOOLEAN, "adjustextension", -1 },
{ "append-output", 'a', OPT__APPEND_OUTPUT, NULL, required_argument },
{ "ask-password", 0, OPT_BOOLEAN, "askpassword", -1 },
@@ -263,6 +264,7 @@
{ "recursive", 'r', OPT_BOOLEAN, "recursive", -1 },
{ "referer", 0, OPT_VALUE, "referer", -1 },
{ "reject", 'R', OPT_VALUE, "reject", -1 },
+ { "rejectregex", 0, OPT_VALUE, "rejectregex", -1 },
{ "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 },
{ "remote-encoding", 0, OPT_VALUE, "remoteencoding", -1 },
{ "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 },
@@ -723,6 +725,10 @@
N_("\
-R, --reject=LIST comma-separated list of rejected extensions.\n"),
N_("\
+ --acceptregex=REGEX extended regex matching accepted URLs.\n"),
+ N_("\
+ --rejectregex=REGEX extended regex matching rejected URLs.\n"),
+ N_("\
-D, --domains=LIST comma-separated list of accepted domains.\n"),
N_("\
--exclude-domains=LIST comma-separated list of rejected domains.\n"),
=== modified file 'src/options.h'
--- src/options.h 2012-03-05 21:23:06 +0000
+++ src/options.h 2012-04-04 17:43:42 +0000
@@ -29,6 +29,8 @@
shall include the source code for the parts of OpenSSL used as well
as that of the covered work. */
+#include <regex.h>
+
struct options
{
int verbose; /* Are we verbose? (First set to -1,
@@ -74,6 +76,9 @@
bool ignore_case; /* Whether to ignore case when
matching dirs and files */
+ regex_t *acceptregex; /* Patterns to accept. */
+ regex_t *rejectregex; /* Patterns to reject. */
+
char **domains; /* See host.c */
char **exclude_domains;
bool dns_cache; /* whether we cache DNS lookups. */
=== modified file 'src/recur.c'
--- src/recur.c 2011-03-30 23:37:12 +0000
+++ src/recur.c 2012-04-04 17:48:34 +0000
@@ -586,6 +586,11 @@
goto out;
}
}
+ if (!accept_url (url))
+ {
+ DEBUGP (("%s is excluded/not-included through regex.\n", url));
+ goto out;
+ }
/* 6. Check for acceptance/rejection rules. We ignore these rules
for directories (no file name to match) and for non-leaf HTMLs,
=== modified file 'src/utils.c'
--- src/utils.c 2012-03-29 18:13:27 +0000
+++ src/utils.c 2012-04-04 17:47:46 +0000
@@ -917,6 +917,48 @@
return true;
}
+/* Log a regexec failure other than REG_NOMATCH.  REGEX is the pattern
+   that was being matched, RC the code regexec returned and S the
+   subject string.  */
+static void
+log_regex_error (const regex_t *regex, int rc, const char *s)
+{
+  /* regerror truncates and NUL-terminates the message to fit.  */
+  char errbuf[256];
+  regerror (rc, regex, errbuf, sizeof errbuf);
+  logprintf (LOG_VERBOSE, _("Error while matching %s: %s\n"),
+             quote (s), errbuf);
+}
+
+/* Determine whether the URL S is acceptable to be followed, according
+   to the regex patterns to accept/reject.
+
+   If opt.acceptregex is set, S must match it; a matching error counts
+   as "no match" and is logged.  If opt.rejectregex is set, S must not
+   match it; a matching error is logged but does not reject S.
+   Returns true when S passes both checks.  */
+bool
+accept_url (const char *s)
+{
+  if (opt.acceptregex)
+    {
+      int rc = regexec (opt.acceptregex, s, 0, NULL, 0);
+      if (rc != 0)
+        {
+          if (rc != REG_NOMATCH)
+            log_regex_error (opt.acceptregex, rc, s);
+          return false;
+        }
+    }
+
+  if (opt.rejectregex)
+    {
+      int rc = regexec (opt.rejectregex, s, 0, NULL, 0);
+      if (rc == 0)
+        return false;
+      if (rc != REG_NOMATCH)
+        log_regex_error (opt.rejectregex, rc, s);
+    }
+
+  return true;
+}
+
/* Check if D2 is a subdirectory of D1. E.g. if D1 is `/something', subdir_p()
will return true if and only if D2 begins with `/something/' or is exactly
'/something'. */
=== modified file 'src/utils.h'
--- src/utils.h 2011-01-01 12:19:37 +0000
+++ src/utils.h 2012-04-04 15:13:48 +0000
@@ -90,6 +90,7 @@
int fnmatch_nocase (const char *, const char *, int);
bool acceptable (const char *);
+bool accept_url (const char *);
bool accdir (const char *s);
char *suffix (const char *s);
bool match_tail (const char *, const char *, bool);
=== modified file 'configure.ac'
--- configure.ac 2012-03-25 11:47:53 +0000
+++ configure.ac 2012-03-30 12:15:48 +0000
@@ -532,6 +532,18 @@
])
)
+dnl
+dnl Check for PCRE: when both pcre.h and libpcre (pcre_compile) are
+dnl found, link against -lpcre and define HAVE_LIBPCRE.
+dnl
+
+AC_CHECK_HEADER([pcre.h],
+  [AC_CHECK_LIB([pcre], [pcre_compile],
+    [LIBS="${LIBS} -lpcre"
+     AC_DEFINE([HAVE_LIBPCRE], [1],
+               [Define if libpcre is available.])
+    ])
+  ])
+
dnl Needed by src/Makefile.am
AM_CONDITIONAL([IRI_IS_ENABLED], [test "X$iri" != "Xno"])
=== modified file 'src/init.c'
--- src/init.c 2012-03-08 09:00:51 +0000
+++ src/init.c 2012-03-30 13:06:42 +0000
@@ -80,6 +80,9 @@
CMD_DECLARE (cmd_directory_vector);
CMD_DECLARE (cmd_number);
CMD_DECLARE (cmd_number_inf);
+#ifdef HAVE_LIBPCRE
+CMD_DECLARE (cmd_regex);
+#endif
CMD_DECLARE (cmd_string);
CMD_DECLARE (cmd_file);
CMD_DECLARE (cmd_directory);
@@ -116,6 +119,9 @@
} commands[] = {
/* KEEP THIS LIST ALPHABETICALLY SORTED */
{ "accept", &opt.accepts, cmd_vector },
+#ifdef HAVE_LIBPCRE
+ { "acceptregex", &opt.acceptregex, cmd_regex },
+#endif
{ "addhostdir", &opt.add_hostdir, cmd_boolean },
{ "adjustextension", &opt.adjust_extension, cmd_boolean },
{ "alwaysrest", &opt.always_rest, cmd_boolean }, /* deprecated */
@@ -237,6 +243,9 @@
{ "recursive", NULL, cmd_spec_recursive },
{ "referer", &opt.referer, cmd_string },
{ "reject", &opt.rejects, cmd_vector },
+#ifdef HAVE_LIBPCRE
+ { "rejectregex", &opt.rejectregex, cmd_regex },
+#endif
{ "relativeonly", &opt.relative_only, cmd_boolean },
{ "remoteencoding", &opt.encoding_remote, cmd_string },
{ "removelisting", &opt.remove_listing, cmd_boolean },
@@ -943,6 +952,30 @@
return true;
}
+#ifdef HAVE_LIBPCRE
+/* Compile VAL as a PCRE regular expression and store a pointer to the
+   compiled pattern in *PLACE, which must point to a `pcre *'.  COM is
+   the command name, used in diagnostics.
+
+   Returns true on success.  On failure a message is printed to
+   stderr, *PLACE is left unchanged and false is returned.  */
+static bool
+cmd_regex (const char *com, const char *val, void *place)
+{
+  pcre **pregex = (pcre **) place;
+  const char *error;
+  int erroffset;
+  pcre *compiled = pcre_compile (val, 0, &error, &erroffset, NULL);
+
+  /* pcre_compile signals failure by returning NULL, so the compiled
+     pattern itself must be tested -- the address of the slot it is
+     stored in is never NULL.  */
+  if (!compiled)
+    {
+      fprintf (stderr, _("%s: %s: Invalid regular expression %s, %s\n"),
+               exec_name, com, quote (val), error);
+      return false;
+    }
+
+  /* A later occurrence of the option replaces an earlier pattern.
+     NOTE(review): assumes *PLACE is NULL until first set -- confirm
+     that the options structure is zero-initialized.  */
+  if (*pregex)
+    pcre_free (*pregex);
+  *pregex = compiled;
+  return true;
+}
+#endif
+
/* Like the above, but handles tilde-expansion when reading a user's
`.wgetrc'. In that case, and if VAL begins with `~', the tilde
=== modified file 'src/main.c'
--- src/main.c 2012-03-05 21:23:06 +0000
+++ src/main.c 2012-03-30 12:20:40 +0000
@@ -158,6 +158,9 @@
static struct cmdline_option option_data[] =
{
{ "accept", 'A', OPT_VALUE, "accept", -1 },
+#ifdef HAVE_LIBPCRE
+ { "acceptregex", 0, OPT_VALUE, "acceptregex", -1 },
+#endif
{ "adjust-extension", 'E', OPT_BOOLEAN, "adjustextension", -1 },
{ "append-output", 'a', OPT__APPEND_OUTPUT, NULL, required_argument },
{ "ask-password", 0, OPT_BOOLEAN, "askpassword", -1 },
@@ -263,6 +266,9 @@
{ "recursive", 'r', OPT_BOOLEAN, "recursive", -1 },
{ "referer", 0, OPT_VALUE, "referer", -1 },
{ "reject", 'R', OPT_VALUE, "reject", -1 },
+#ifdef HAVE_LIBPCRE
+ { "rejectregex", 0, OPT_VALUE, "rejectregex", -1 },
+#endif
{ "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 },
{ "remote-encoding", 0, OPT_VALUE, "remoteencoding", -1 },
{ "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 },
@@ -722,6 +728,12 @@
-A, --accept=LIST comma-separated list of accepted extensions.\n"),
N_("\
-R, --reject=LIST comma-separated list of rejected extensions.\n"),
+#ifdef HAVE_LIBPCRE
+ N_("\
+ --acceptregex=REGEX PCRE-compatible regex matching accepted URLs.\n"),
+ N_("\
+ --rejectregex=REGEX PCRE-compatible regex matching rejected URLs.\n"),
+#endif
N_("\
-D, --domains=LIST comma-separated list of accepted domains.\n"),
N_("\
=== modified file 'src/options.h'
--- src/options.h 2012-03-05 21:23:06 +0000
+++ src/options.h 2012-03-30 12:23:09 +0000
@@ -29,6 +29,10 @@
shall include the source code for the parts of OpenSSL used as well
as that of the covered work. */
+#ifdef HAVE_LIBPCRE
+#include <pcre.h>
+#endif
+
struct options
{
int verbose; /* Are we verbose? (First set to -1,
@@ -74,6 +78,11 @@
bool ignore_case; /* Whether to ignore case when
matching dirs and files */
+#ifdef HAVE_LIBPCRE
+ pcre *acceptregex; /* Patterns to accept. */
+ pcre *rejectregex; /* Patterns to reject. */
+#endif
+
char **domains; /* See host.c */
char **exclude_domains;
bool dns_cache; /* whether we cache DNS lookups. */
=== modified file 'src/recur.c'
--- src/recur.c 2011-03-30 23:37:12 +0000
+++ src/recur.c 2012-04-04 17:49:20 +0000
@@ -586,6 +586,11 @@
goto out;
}
}
+#ifdef HAVE_LIBPCRE
+  /* accept_url is declared and defined only when building with PCRE,
+     so the check must be compiled out otherwise.  */
+  if (!accept_url (url))
+    {
+      DEBUGP (("%s is excluded/not-included through regex.\n", url));
+      goto out;
+    }
+#endif
/* 6. Check for acceptance/rejection rules. We ignore these rules
for directories (no file name to match) and for non-leaf HTMLs,
=== modified file 'src/utils.c'
--- src/utils.c 2012-03-29 18:13:27 +0000
+++ src/utils.c 2012-03-30 13:26:44 +0000
@@ -917,6 +917,44 @@
return true;
}
+#ifdef HAVE_LIBPCRE
+#define OVECCOUNT 30
+/* Determine whether the URL S is acceptable to be followed, according
+   to the regex patterns to accept/reject.
+
+   If opt.acceptregex is set, S must match it; a matching error counts
+   as "no match" and is logged.  If opt.rejectregex is set, S must not
+   match it; a matching error is logged but does not reject S.
+   Returns true when S passes both checks.  */
+bool
+accept_url (const char *s)
+{
+  int l = (int) strlen (s);
+  int rc;
+  int ovector[OVECCOUNT];
+
+  if (opt.acceptregex)
+    {
+      rc = pcre_exec (opt.acceptregex, 0, s, l, 0, 0, ovector, OVECCOUNT);
+      if (rc < 0)
+        {
+          /* PCRE_ERROR_NOMATCH is the ordinary "did not match" result;
+             any other negative code is a real error worth logging.  */
+          if (rc != PCRE_ERROR_NOMATCH)
+            logprintf (LOG_VERBOSE, _("Error while matching %s: %d\n"),
+                       quote (s), rc);
+          return false;
+        }
+    }
+
+  if (opt.rejectregex)
+    {
+      rc = pcre_exec (opt.rejectregex, 0, s, l, 0, 0, ovector, OVECCOUNT);
+      if (rc >= 0)
+        return false;
+      /* Not matching the reject pattern is the normal case for an
+         accepted URL and must not be reported as an error.  */
+      if (rc != PCRE_ERROR_NOMATCH)
+        logprintf (LOG_VERBOSE, _("Error while matching %s: %d\n"),
+                   quote (s), rc);
+    }
+
+  return true;
+}
+#undef OVECCOUNT
+#endif
+
/* Check if D2 is a subdirectory of D1. E.g. if D1 is `/something', subdir_p()
will return true if and only if D2 begins with `/something/' or is exactly
'/something'. */
=== modified file 'src/utils.h'
--- src/utils.h 2011-01-01 12:19:37 +0000
+++ src/utils.h 2012-03-30 13:30:25 +0000
@@ -90,6 +90,9 @@
int fnmatch_nocase (const char *, const char *, int);
bool acceptable (const char *);
+#ifdef HAVE_LIBPCRE
+bool accept_url (const char *);
+#endif
bool accdir (const char *s);
char *suffix (const char *s);
bool match_tail (const char *, const char *, bool);