Hi,
I included the Perl regular expressions (PCRE) library from
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/
to get better filename matching capabilities for "--accept" and
"--reject" options.
For now, it is a compile-time option.
You must pass "--with-pcre" to the configure script and have
pcre >= 3.0 installed (RedHat and SuSE RPMs are available - use
http://ftpsearch.ntnu.no to find them).
I just uploaded pcre-3.1-47 to RedHat, it should be available on every
http://contrib.redhat.com mirror in the next few days.
If you include this feature, any Perl-ish regex can be used with the
"--accept" and "--reject" options; the old comma-separated lists are not
supported any more.
Examples:
-A 'HTML,HTM,JPG,JPEG' is now: -A 'HTM.?$|JP.?G$'
Or when mirroring Apache servers:
-R '^\?.=[AD]'
This may seem quite complicated to those who have never used Perl, but
it is powerful as hell.
BTW: Matching is done case-independently.
Attached is a patch against my last "gold" version.
The most recent package for now is
http://www.simprovement.com/linux/files/wget-1.5.3gold2.tar.gz
Or try
http://www.simprovement.com/linux/frech.cgi
My "gold" version includes some patches from this list (e.g. preliminary
HTTPS support) and my own extensions.
Please send me any patches/bugfixes you make!
Bye,
--
____ Soenke Jan Peters
|_ _| Rostock, Germany PGP on request,
._|| <[EMAIL PROTECTED]> Subject: getpgpkey
http://www.simprovement.com No solicitations!
diff -urN --exclude=configure wget-1.5.3gold/ChangeLog wget-1.5.3gold2/ChangeLog
--- wget-1.5.3gold/ChangeLog Wed May 24 22:29:18 2000
+++ wget-1.5.3gold2/ChangeLog Fri Jun 30 00:39:45 2000
@@ -1,3 +1,15 @@
+2000-06-29 Soenke J. Peters <[EMAIL PROTECTED]>
+ * PCRE library option at compile time
+ for --accept and --reject options
+
+ * HTTPS support via OpenSSL
+
+ * Alternative print_percentage() function
+
+ * --filter-script option
+
+ * Referrer faking via --referer=fake
+
2000-05-24 Dan Harkless <[EMAIL PROTECTED]>
* TODO: Timestamps sometimes not copied over on files retrieved by FTP.
diff -urN --exclude=configure wget-1.5.3gold/Makefile.in wget-1.5.3gold2/Makefile.in
--- wget-1.5.3gold/Makefile.in Mon Jun 26 07:10:44 2000
+++ wget-1.5.3gold2/Makefile.in Fri Jun 30 00:23:19 2000
@@ -47,7 +47,8 @@
CPPFLAGS = @CPPFLAGS@ -I$(SSL_INCDIR)
DEFS = @DEFS@ -DSYSTEM_WGETRC=\"$(sysconfdir)/wgetrc\" -DLOCALEDIR=\"$(localedir)\"
SSL_LIBS = @SSL_LIBS@
-LIBS = @LIBS@ $(SSL_LIBS)
+PCRE_LIBS = @PCRE_LIBS@
+LIBS = @LIBS@ $(SSL_LIBS) $(PCRE_LIBS)
SSL_LIBDIR = @SSL_LIBDIR@
LDFLAGS = @LDFLAGS@
diff -urN --exclude=configure wget-1.5.3gold/NEWS wget-1.5.3gold2/NEWS
--- wget-1.5.3gold/NEWS Mon Jun 26 10:59:15 2000
+++ wget-1.5.3gold2/NEWS Fri Jun 30 00:38:00 2000
@@ -5,6 +5,11 @@
Please send GNU Wget bug reports to <[EMAIL PROTECTED]>.
+* Changes in Wget 1.5.3gold2
+
+** PCRE library option at compile time for --accept and --reject options
+
+
* Changes in Wget 1.5.3gold
** HTTPS support via OpenSSL
diff -urN --exclude=configure wget-1.5.3gold/configure.in wget-1.5.3gold2/configure.in
--- wget-1.5.3gold/configure.in Mon Jun 26 05:28:27 2000
+++ wget-1.5.3gold2/configure.in Fri Jun 30 00:54:03 2000
@@ -61,6 +61,20 @@
[ --with-ssl-libdir=DIR where to find SSLeay library files (optional)],
ssl_libdir="$withval", ssl_libdir="/usr/lib")
+AC_ARG_WITH(pcre,
+[ --with-pcre compile in PCRE support],
+[if test "$withval" != "no" ; then
+ AC_CHECK_HEADERS(pcre.h)
+ if test "$ac_cv_header_pcre_h" = "yes" ; then
+ AC_CHECK_LIB(pcre,pcre_compile)
+ if test "$ac_cv_lib_pcre_pcre_compile" = "yes" ; then
+ AC_DEFINE(USE_PCRE)
+ PCRE_LIBS="-lpcre"
+ AC_SUBST(PCRE_LIBS)
+ fi
+ fi
+fi])
+
AC_ARG_ENABLE(opie,
[ --disable-opie disable support for opie or s/key FTP login],
USE_OPIE=$enableval, USE_OPIE=yes)
@@ -228,6 +242,7 @@
dnl
dnl Create output
dnl
-AC_OUTPUT([Makefile src/Makefile doc/Makefile util/Makefile po/Makefile.in],
+AC_OUTPUT([Makefile src/Makefile doc/Makefile util/Makefile po/Makefile.in wget.spec],
[WGET_PROCESS_PO
test -z "$CONFIG_HEADERS" || echo timestamp > stamp-h])
+
diff -urN --exclude=configure wget-1.5.3gold/src/config.h.in wget-1.5.3gold2/src/config.h.in
--- wget-1.5.3gold/src/config.h.in Mon Jun 26 05:28:27 2000
+++ wget-1.5.3gold2/src/config.h.in Fri Jun 30 00:26:17 2000
@@ -84,6 +84,9 @@
/* Define if you want SSLeay support. */
#undef HAVE_SSLEAY
+/* Define if you want to use PCRE for --accept and --reject options */
+#undef USE_PCRE
+
/* Define if you have sys/time.h header. */
#undef HAVE_SYS_TIME_H
diff -urN --exclude=configure wget-1.5.3gold/src/init.c wget-1.5.3gold2/src/init.c
--- wget-1.5.3gold/src/init.c Mon Jun 26 10:29:45 2000
+++ wget-1.5.3gold2/src/init.c Thu Jun 29 22:18:17 2000
@@ -93,7 +93,11 @@
void *closure;
int (*action) PARAMS ((const char *, const char *, void *));
} commands[] = {
+#ifndef USE_PCRE
{ "accept", &opt.accepts, cmd_vector },
+#else /* USE_PCRE */
+ { "accept", &opt.accepts, cmd_string },
+#endif /* USE_PCRE */
{ "addhostdir", &opt.add_hostdir, cmd_boolean },
{ "alwaysrest", &opt.always_rest, cmd_boolean }, /* deprecated */
{ "background", &opt.background, cmd_boolean },
@@ -152,7 +156,11 @@
{ "reclevel", &opt.reclevel, cmd_number_inf },
{ "recursive", NULL, cmd_spec_recursive },
{ "referer", &opt.referer, cmd_string },
+#ifndef USE_PCRE
{ "reject", &opt.rejects, cmd_vector },
+#else /* USE_PCRE */
+ { "reject", &opt.rejects, cmd_string },
+#endif /* USE_PCRE */
{ "relativeonly", &opt.relative_only, cmd_boolean },
{ "removelisting", &opt.remove_listing, cmd_boolean },
{ "retrsymlinks", &opt.retr_symlinks, cmd_boolean },
@@ -950,8 +958,13 @@
free (opt.dir_prefix);
FREE_MAYBE (opt.input_filename);
FREE_MAYBE (opt.output_document);
+#ifndef USE_PCRE
free_vec (opt.accepts);
free_vec (opt.rejects);
+#else /* USE_PCRE */
+ FREE_MAYBE (opt.accepts);
+ FREE_MAYBE (opt.rejects);
+#endif /* USE_PCRE */
free_vec (opt.excludes);
free_vec (opt.includes);
free_vec (opt.domains);
diff -urN --exclude=configure wget-1.5.3gold/src/main.c wget-1.5.3gold2/src/main.c
--- wget-1.5.3gold/src/main.c Mon Jun 26 10:42:53 2000
+++ wget-1.5.3gold2/src/main.c Fri Jun 30 00:32:34 2000
@@ -62,6 +62,14 @@
struct options opt;
+#ifdef USE_PCRE
+#include <pcre.h>
+ pcre *pcre_accept_pattern;
+ pcre_extra *pcre_accept_hints;
+ pcre *pcre_reject_pattern;
+ pcre_extra *pcre_reject_hints;
+#endif /* USE_PCRE */
+
/* From log.c. */
void log_init PARAMS ((const char *, int));
void log_close PARAMS ((void));
@@ -211,6 +219,11 @@
int i, c, nurl, status, append_to_log;
int wr = 0;
+#ifdef USE_PCRE
+ int pcre_errptr = 0;
+ const char *pcre_error = NULL;
+#endif /* USE_PCRE */
+
static struct option long_options[] =
{
/* Options without arguments: */
@@ -616,6 +629,27 @@
}
if (opt.verbose == -1)
opt.verbose = !opt.quiet;
+
+#ifdef USE_PCRE
+ if (opt.accepts) {
+ pcre_accept_pattern = pcre_compile(opt.accepts, PCRE_CASELESS,
+ &pcre_error, &pcre_errptr, NULL);
+ pcre_accept_hints = pcre_study(pcre_accept_pattern, 0, &pcre_error);
+ if (pcre_error != NULL) {
+ printf (_("pcre: error while studing regex %s: %s\n"),opt.accepts, pcre_error);
+ exit (0);
+ }
+ }
+ if (opt.rejects) {
+ pcre_reject_pattern = pcre_compile(opt.rejects, PCRE_CASELESS,
+ &pcre_error, &pcre_errptr, NULL);
+ pcre_reject_hints = pcre_study(pcre_reject_pattern, 0, &pcre_error);
+ if (pcre_error != NULL) {
+ printf (_("pcre: error while studing regex %s: %s\n"),opt.rejects, pcre_error);
+ exit (0);
+ }
+ }
+#endif /* USE_PCRE */
/* Retain compatibility with previous scripts.
if wait has been set, but waitretry has not, give it the wait value.
diff -urN --exclude=configure wget-1.5.3gold/src/options.h wget-1.5.3gold2/src/options.h
--- wget-1.5.3gold/src/options.h Mon Jun 26 10:35:45 2000
+++ wget-1.5.3gold2/src/options.h Thu Jun 29 22:19:22 2000
@@ -56,8 +56,13 @@
int spider; /* Is Wget in spider mode? */
+#ifndef USE_PCRE
char **accepts; /* List of patterns to accept. */
char **rejects; /* List of patterns to reject. */
+#else /* USE_PCRE */
+ char *accepts; /* List of patterns to accept. */
+ char *rejects; /* List of patterns to reject. */
+#endif /* USE_PCRE */
char *filter_script; /* Is there a script filtering files to
download? */
char **excludes; /* List of excluded FTP directories. */
diff -urN --exclude=configure wget-1.5.3gold/src/utils.c wget-1.5.3gold2/src/utils.c
--- wget-1.5.3gold/src/utils.c Mon Jun 26 08:11:44 2000
+++ wget-1.5.3gold2/src/utils.c Fri Jun 30 00:35:58 2000
@@ -46,6 +46,14 @@
# include <libc.h> /* for access() */
#endif
+#ifdef USE_PCRE
+ #include <pcre.h>
+ extern pcre *pcre_accept_pattern;
+ extern pcre_extra *pcre_accept_hints;
+ extern pcre *pcre_reject_pattern;
+ extern pcre_extra *pcre_reject_hints;
+#endif /* USE_PCRE */
+
#include "wget.h"
#include "utils.h"
#include "fnmatch.h"
@@ -546,12 +554,20 @@
int
acceptable (const char *s)
{
+#ifdef USE_PCRE
+ int pcre_accept_match = 1, pcre_reject_match = 0, pcre_length = 0;
+ int pcre_offsets[99];
+ char pcre_buffer[65535];
+#endif /* USE_PCRE */
+
int l = strlen (s);
while (l && s[l] != '/')
--l;
if (s[l] == '/')
s += (l + 1);
+
+#ifndef USE_PCRE
if (opt.accepts)
{
if (opt.rejects)
@@ -563,6 +579,28 @@
else if (opt.rejects)
return !in_acclist ((const char *const *)opt.rejects, s, 1);
return 1;
+#else /* USE_PCRE */
+ strcpy(pcre_buffer, s);
+ pcre_length = (int)strlen(pcre_buffer);
+ if (pcre_length > 0 && pcre_buffer[pcre_length-1] == '\n') {
+ pcre_buffer[--pcre_length] = 0;
+ }
+ if (opt.accepts) {
+ pcre_accept_match = pcre_exec(pcre_accept_pattern,
+ pcre_accept_hints, pcre_buffer, pcre_length,
+ 0, 0, pcre_offsets, 99) >= 0;
+ }
+ if (opt.rejects) {
+ pcre_reject_match = pcre_exec(pcre_reject_pattern,
+ pcre_reject_hints, pcre_buffer, pcre_length,
+ 0, 0, pcre_offsets, 99) >= 0;
+ }
+ if (pcre_accept_match && ! pcre_reject_match) {
+ return 1;
+ } else {
+ return 0;
+ }
+#endif /* USE_PCRE */
}
/* Compare S1 and S2 frontally; S2 must begin with S1. E.g. if S1 is
diff -urN --exclude=configure wget-1.5.3gold/src/version.c wget-1.5.3gold2/src/version.c
--- wget-1.5.3gold/src/version.c Mon Jun 26 10:54:53 2000
+++ wget-1.5.3gold2/src/version.c Fri Jun 30 00:40:48 2000
@@ -1 +1 @@
-char *version_string = "1.5.3gold";
+char *version_string = "1.5.3gold2";
diff -urN --exclude=configure wget-1.5.3gold/wget.spec wget-1.5.3gold2/wget.spec
--- wget-1.5.3gold/wget.spec Mon Jun 26 11:17:59 2000
+++ wget-1.5.3gold2/wget.spec Fri Jun 30 01:04:30 2000
@@ -1,7 +1,6 @@
Summary: A utility for retrieving files using the HTTP, HTTPS or FTP protocols.
Name: wget
-%define version 1.5.3gold
-Version: %{version}
+Version: 1.5.3gold2
Release: 1
Copyright: GPL
Group: Applications/Internet
@@ -31,6 +30,7 @@
%setup -q
%build
+# use "--with-pcre" to enable PCRE!
./configure --prefix=/usr --sysconfdir=/etc \
--with-OpenSSL --with-ssl-incdir=/usr/include/openssl
make
diff -urN --exclude=configure wget-1.5.3gold/wget.spec.in wget-1.5.3gold2/wget.spec.in
--- wget-1.5.3gold/wget.spec.in Thu Jan 1 01:00:00 1970
+++ wget-1.5.3gold2/wget.spec.in Fri Jun 30 01:04:49 2000
@@ -0,0 +1,95 @@
+Summary: A utility for retrieving files using the HTTP, HTTPS or FTP protocols.
+Name: wget
+Version: @VERSION@
+Release: 1
+Copyright: GPL
+Group: Applications/Internet
+Packager: Soenke J. Peters <[EMAIL PROTECTED]>
+Source: wget-%{version}.tar.gz
+Prereq: /sbin/install-info
+Requires: openssl
+BuildRoot: /var/tmp/%{name}-root
+
+%description
+GNU Wget is a file retrieval utility which can use either the HTTP, HTTPS or
+FTP protocols. Wget features include the ability to work in the
+background while you're logged out, recursive retrieval of
+directories, file name wildcard matching, remote file timestamp
+storage and comparison, use of Rest with FTP servers and Range with
+HTTP servers to retrieve files over slow or unstable connections,
+support for Proxy servers, and configurability.
+
+Install wget if you need to retrieve large numbers of files with HTTP, HTTPS or
+FTP, or if you need a utility for mirroring web sites or FTP directories.
+
+This version has been HTTPS-enabled by Jaroslaw Piotr SOBIESZEK
+<[EMAIL PROTECTED]> and some patches from the wget mailing list have
+been added.
+
+%prep
+%setup -q
+
+%build
+# use "--with-pcre" to enable PCRE!
+./configure --prefix=/usr --sysconfdir=/etc \
+ --with-OpenSSL --with-ssl-incdir=/usr/include/openssl
+make
+
+%install
+rm -rf $RPM_BUILD_ROOT
+make install prefix=$RPM_BUILD_ROOT/usr sysconfdir=$RPM_BUILD_ROOT/etc
+gzip $RPM_BUILD_ROOT/usr/info/*
+strip $RPM_BUILD_ROOT/usr/bin/* || :
+
+%post
+/sbin/install-info /usr/info/wget.info.gz /usr/info/dir
+
+%preun
+if [ "$1" = 0 ]; then
+ /sbin/install-info --delete /usr/info/wget.info.gz /usr/info/dir
+fi
+
+%clean
+rm -rf $RPM_BUILD_ROOT
+
+%files
+%defattr(-,root,root)
+%doc AUTHORS MAILING-LIST NEWS README INSTALL doc/ChangeLog
+%config /etc/wgetrc
+/usr/man/man1/wget.1
+/usr/bin/wget
+/usr/info/*
+/usr/share/locale/*/LC_MESSAGES/*
+
+%changelog
+* Mon Jun 26 2000 Soenke J. Peters <[EMAIL PROTECTED]>
+- included some stuff from CVS tree
+- HTTPS support
+
+* Thu Aug 26 1999 Jeff Johnson <[EMAIL PROTECTED]>
+- don't permit chmod 777 on symlinks (#4725).
+
+* Sun Mar 21 1999 Cristian Gafton <[EMAIL PROTECTED]>
+- auto rebuild in the new build environment (release 4)
+
+* Fri Dec 18 1998 Bill Nottingham <[EMAIL PROTECTED]>
+- build for 6.0 tree
+- add Provides
+
+* Sat Oct 10 1998 Cristian Gafton <[EMAIL PROTECTED]>
+- strip binaries
+- version 1.5.3
+
+* Sat Jun 27 1998 Jeff Johnson <[EMAIL PROTECTED]>
+- updated to 1.5.2
+
+* Thu Apr 30 1998 Cristian Gafton <[EMAIL PROTECTED]>
+- modified group to Applications/Networking
+
+* Wed Apr 22 1998 Cristian Gafton <[EMAIL PROTECTED]>
+- upgraded to 1.5.0
+- they removed the man page from the distribution (Duh!) and I added it back
+ from 1.4.5. Hey, removing the man page is DUMB!
+
+* Fri Nov 14 1997 Cristian Gafton <[EMAIL PROTECTED]>
+- first build against glibc