Hi, This is a backported version of the URL rewrite patch for the 3.1.6 code--backported because I took the 3.2.x version and migrated it back, adding as little as necessary to implement it. As far as I can tell, it works OK, but I'm not going to commit it until I get a few brave testers. -Geoff
Index: htdig/Retriever.cc =================================================================== RCS file: /cvsroot/htdig/htdig/htdig/Retriever.cc,v retrieving revision 1.36.2.24 diff -c -3 -p -r1.36.2.24 Retriever.cc *** htdig/Retriever.cc 2001/06/29 16:57:34 1.36.2.24 --- htdig/Retriever.cc 2001/09/24 03:20:37 *************** Retriever::got_href(URL &url, char *desc *** 1203,1208 **** --- 1203,1209 ---- } url.normalize(); + url.rewrite(); // If it is a backlink from the current document, // just update that field. Writing to the database Index: htdig/htdig.cc =================================================================== RCS file: /cvsroot/htdig/htdig/htdig/htdig.cc,v retrieving revision 1.3.2.8 diff -c -3 -p -r1.3.2.8 htdig.cc *** htdig/htdig.cc 2001/07/25 22:49:34 1.3.2.8 --- htdig/htdig.cc 2001/09/24 03:20:37 *************** *** 19,24 **** --- 19,25 ---- #include "htdig.h" #include "defaults.h" #include "HtURLCodec.h" + #include "HtURLRewriter.h" #include "HtWordType.h" // If we have this, we probably want it. *************** main(int ac, char **av) *** 161,166 **** --- 162,176 ---- if (url_part_errors.length() != 0) reportError(form("Invalid url_part_aliases or common_url_parts: %s", url_part_errors.get())); + + // + // Check url_rewrite_rules for errors. + String url_rewrite_rules = HtURLRewriter::instance()->ErrMsg(); + + if (url_rewrite_rules.length() != 0) + reportError(form("Invalid url_rewrite_rules: %s", + url_rewrite_rules.get())); + // // If indicated, change the database file names to have the .work Index: htlib/HtRegex.cc =================================================================== RCS file: HtRegex.cc diff -N HtRegex.cc *** /dev/null Thu May 24 22:33:05 2001 --- HtRegex.cc Sun Sep 23 20:20:37 2001 *************** *** 0 **** --- 1,105 ---- + // + // HtRegex.cc + // + // HtRegex: A simple C++ wrapper class for the system regex routines. 
+ //
+ // Part of the ht://Dig package <http://www.htdig.org/>
+ // Copyright (c) 1999-2001 The ht://Dig Group
+ // For copyright details, see the file COPYING in your distribution
+ // or the GNU General Public License version 2 or later
+ // <http://www.gnu.org/copyleft/gpl.html>
+ //
+ // $Id: HtRegex.cc,v 1.9.2.8 2001/05/16 16:36:45 ghutchis Exp $
+ //
+
+ #ifdef HAVE_CONFIG_H
+ #include "htconfig.h"
+ #endif /* HAVE_CONFIG_H */
+
+ #include "HtRegex.h"
+ #include <locale.h>
+
+
+ HtRegex::HtRegex() : compiled(0) { }
+
+ HtRegex::HtRegex(const char *str, int case_sensitive) : compiled(0)
+ {
+     set(str, case_sensitive);
+ }
+
+ HtRegex::~HtRegex()
+ {
+     if (compiled != 0) regfree(&re);
+     compiled = 0;
+ }
+
+ const String &HtRegex::lastError()
+ {
+     return lastErrorMessage;
+ }
+
+ // Compile str as a POSIX extended regex; returns 1 on success, else 0.
+ int HtRegex::set(const char * str, int case_sensitive)
+ {
+     if (compiled != 0) regfree(&re);    // drop any previously compiled pattern
+     compiled = 0;
+
+     if (str == NULL) return 0;
+     if (*str == '\0') return 0;         // empty pattern: leave the regex unset
+     int err = regcomp(&re, str, case_sensitive ? REG_EXTENDED : (REG_EXTENDED|REG_ICASE));
+     if (err == 0)
+     {
+         compiled = 1;
+     }
+     else
+     {
+         size_t len = regerror(err, &re, 0, 0);  // first call only asks for the size
+         char *buf = new char[len];
+         regerror(err, &re, buf, len);
+         lastErrorMessage = buf;
+         delete [] buf;                          // was scalar delete on new[] storage
+     }
+     return compiled;
+ }
+
+ // Build a single alternation ("a|b|c") from list and compile it with set().
+ int HtRegex::setEscaped(StringList &list, int case_sensitive)
+ {
+     String *str;
+     String transformedLimits;
+     list.Start_Get();
+     while ((str = (String *) list.Get_Next()))
+     {
+         if (str->indexOf('[') == 0 && str->lastIndexOf(']') == str->length()-1)
+         {
+             transformedLimits << str->sub(1,str->length()-2).get();  // "[...]" item: used as regex text, not escaped
+         }
+         else // Backquote any regex special characters
+         {
+             for (int pos = 0; pos < str->length(); pos++)
+             {
+                 if (strchr("^.[$()|*+?{\\", str->Nth(pos)))
+                     transformedLimits << '\\';
+                 transformedLimits << str->Nth(pos);
+             }
+         }
+         transformedLimits << "|";
+     }
+     transformedLimits.chop(1);          // remove the trailing "|"
+
+     return set(transformedLimits, case_sensitive);
+ }
+
+ // 1 if str matches, 0 if not; nullpattern with no compiled regex, nullstr for a NULL/empty str.
+ int HtRegex::match(const char * str, int nullpattern, int nullstr)
+ {
+     int rval;
+
+     if (compiled == 0) return(nullpattern);
+     if (str == NULL) return(nullstr);
+     if (strlen(str) <= 0) return(nullstr);
+     rval = regexec(&re, str, (size_t) 0, NULL, 0);
+     if (rval == 0) return(1);
+     else return(0);
+ }
+
Index: htlib/HtRegex.h
===================================================================
RCS file: HtRegex.h
diff -N HtRegex.h
*** /dev/null Thu May 24 22:33:05 2001
--- HtRegex.h Sun Sep 23 20:20:37 2001
***************
*** 0 ****
--- 1,65 ----
+ //
+ // HtRegex.h
+ //
+ // HtRegex: A simple C++ wrapper class for the system regex routines.
+ //
+ // Part of the ht://Dig package <http://www.htdig.org/>
+ // Copyright (c) 1999, 2000 The ht://Dig Group
+ // For copyright details, see the file COPYING in your distribution
+ // or the GNU General Public License version 2 or later
+ // <http://www.gnu.org/copyleft/gpl.html>
+ //
+ // $Id: HtRegex.h,v 1.5.2.4 2000/08/21 02:29:15 ghutchis Exp $
+ //
+ //
+
+ #ifndef _HtRegex_h_
+ #define _HtRegex_h_
+
+ #include "Object.h"
+ #include "StringList.h"
+
+ // This is an attempt to get around compatibility problems
+ // with the included regex
+ #ifdef HAVE_BROKEN_REGEX
+ #include <regex.h>
+ #else
+ #include "regex.h"
+ #endif
+
+ #include <sys/types.h>
+ #include <fstream.h>
+
+ class HtRegex : public Object
+ {
+ public:
+     //
+     // Construction/Destruction
+     //
+     HtRegex();
+     HtRegex(const char *str, int case_sensitive = 0);
+     virtual ~HtRegex();
+
+     // Methods for setting the pattern.  Each returns 1 when a regex was
+     // compiled and 0 otherwise (NULL/empty pattern or a compile error;
+     // the error text is then available from lastError()).
+     int set(const String& str, int case_sensitive = 0) { return set(str.get(), case_sensitive); }
+     int set(const char *str, int case_sensitive = 0);
+     int setEscaped(StringList &list, int case_sensitive = 0);
+
+     virtual const String &lastError(); // returns the last error message
+
+     // Methods for checking a match.  Return 1 on a match and 0 on no
+     // match; nullmatch when no pattern is compiled, nullstr when str is
+     // NULL or empty.
+     int match(const String& str, int nullmatch, int nullstr) { return match(str.get(), nullmatch, nullstr); }
+     int match(const char *str, int nullmatch, int nullstr);
+
+ protected:
+     int compiled;               // nonzero while re holds a compiled pattern
+     regex_t re;                 // NOTE(review): regfree()d in the dtor; copying an HtRegex would free it twice unless Object forbids copies - confirm
+
+     String lastErrorMessage;    // regerror() text from the last failed set()
+ };
+
+ #endif
Index: htlib/HtRegexReplace.cc
===================================================================
RCS file: HtRegexReplace.cc
diff -N HtRegexReplace.cc
*** /dev/null Thu May 24 22:33:05 2001
--- HtRegexReplace.cc Sun Sep 23 20:20:38 2001
***************
*** 0 ****
--- 1,141 ----
+ //
+ // HtRegexReplace.cc
+ //
+ // HtRegexReplace: A subclass of HtRegex that can perform replacements
+ //
+ // Part of the ht://Dig package <http://www.htdig.org/>
+ // Copyright (c) 2000 The ht://Dig Group
+ // For
copyright details, see the file COPYING in your distribution
+ // or the GNU Public License version 2 or later
+ // <http://www.gnu.org/copyleft/gpl.html>
+ //
+ // $Id: HtRegexReplace.cc,v 1.1.2.2 2001/05/16 16:36:45 ghutchis Exp $
+ //
+
+ #include "HtRegexReplace.h"
+ #include <locale.h>
+
+ // Default: start with empty replacement state so the destructor's empty() is safe.
+ HtRegexReplace::HtRegexReplace()
+     : repBuf(0), segSize(0), segUsed(0), segMark(0), repLen(0)
+ { memset(&regs, 0, sizeof(regs)); }
+
+ HtRegexReplace::HtRegexReplace(const char *from, const char *to, int case_sensitive)
+     : HtRegex(from, case_sensitive)
+ {
+     memset(&regs, 0, sizeof(regs));
+     repBuf = 0;                 // pointers must be null before setReplace() calls empty()
+     segSize =
+     segUsed = 0;
+     segMark = 0;
+     repLen = 0;
+
+     setReplace(to);
+ }
+
+ HtRegexReplace::~HtRegexReplace()
+ {
+     empty();
+ }
+
+ int HtRegexReplace::replace(String &str, int nullpattern, int nullstr)
+ {
+     const int regCount = sizeof(regs) / sizeof(regs[0]);
+     if (compiled == 0 || repBuf == 0) return nullpattern;
+     if (str.length() == 0) return nullstr;
+
+     if (regexec(&re, str.get(), regCount, regs, 0) == 0)
+     {
+         // First work out how long the result string will be.  We think this will be
+         // more efficient than letting the buffer grow in stages, but who knows?
+         // segMark holds: literal-end, register, literal-end, register, ..., literal-end.
+         size_t resLen = repLen;
+         int i, reg, repPos;
+         const char *src = str.get();
+
+         for (i = 1; i < (int) segUsed; i += 2)
+         {
+             reg = segMark[i];
+             if (reg < regCount && regs[reg].rm_so != -1)
+                 resLen += regs[reg].rm_eo - regs[reg].rm_so;
+         }
+         // Preallocate the result, then interleave literal text and captured groups.
+         String result(resLen);
+         for (i = 0, repPos = 0;; )
+         {
+             // Copy the literal part of the replace string up to the next mark.
+             result.append(repBuf + repPos, segMark[i] - repPos);
+             repPos = segMark[i];                // move forward
+             if (++i == (int) segUsed) break;    // was that the last segment?
+             reg = segMark[i++];                 // get the register number
+             if (reg < regCount && regs[reg].rm_so != -1)
+                 result.append((char *) src + regs[reg].rm_so, regs[reg].rm_eo - regs[reg].rm_so);
+         }
+         str = result;
+         // Note: the whole input is replaced by the expansion, not only the matched span.
+
+         return 1;
+     }
+
+     return 0;
+ }
+
+ // Private: place a mark in the mark buffer, growing it if necessary.
+ void HtRegexReplace::putMark(int n)
+ {
+     // assert(segUsed <= segSize);
+     if (segUsed == segSize)
+     {
+         size_t newSize = segSize * 2 + 5;       // grow in chunks
+         int *newMark = new int[newSize];        // do we assume that new can't fail?
+         if (segSize > 0) memcpy(newMark, segMark, segSize * sizeof(int));  // segMark is null on first growth
+         delete [] segMark;                      // was scalar delete on new[] storage
+         segMark = newMark;
+         segSize = newSize;
+     }
+     segMark[segUsed++] = n;
+ }
+
+ void HtRegexReplace::empty()
+ {
+     // Destroy any existing replace pattern (both buffers come from new[])
+     delete [] repBuf; repBuf = 0;
+     segSize = segUsed = 0;
+     delete [] segMark; segMark = 0;
+     repLen = 0;
+ }
+
+ void HtRegexReplace::setReplace(const char *to)
+ {
+     empty();
+
+     repBuf = new char[strlen(to)];  // replace buffer can never contain more text than to string
+     int bufPos = 0;                 // our position within the output buffer
+
+     while (*to)
+     {
+         if (*to == '\\')
+         {
+             if (*++to == '\0') break;           // trailing backslash: ignore it
+             if (*to >= '0' && *to <= '9')
+             {
+                 putMark(bufPos);                // where the literal text ends
+                 putMark(*to - '0');             // which register to insert there
+             }
+             else
+             {
+                 // We could handle some C style escapes here, but instead we just pass
+                 // the character after the backslash through.  This means that \\, \" and
+                 // \' will do the right thing; C style escapes are unlikely to be needed.
+                 repBuf[bufPos++] = *to;
+             }
+             to++;
+         }
+         else
+         {
+             repBuf[bufPos++] = *to++;
+         }
+     }
+     putMark(bufPos);                // final mark: end of the literal text
+     repLen = (size_t) bufPos;
+ }
Index: htlib/HtRegexReplace.h
===================================================================
RCS file: HtRegexReplace.h
diff -N HtRegexReplace.h
*** /dev/null Thu May 24 22:33:05 2001
--- HtRegexReplace.h Sun Sep 23 20:20:38 2001
***************
*** 0 ****
--- 1,58 ----
+ //
+ // HtRegexReplace.h
+ //
+ // HtRegexReplace: A subclass of HtRegex that can perform replacements
+ //
+ // Part of the ht://Dig package <http://www.htdig.org/>
+ // Copyright (c) 2000 The ht://Dig Group
+ // For copyright details, see the file COPYING in your distribution
+ // or the GNU Public License version 2 or later
+ // <http://www.gnu.org/copyleft/gpl.html>
+ //
+ // $Id: HtRegexReplace.h,v 1.1.2.1 2000/08/21 02:33:13 ghutchis Exp $
+ //
+
+ #ifndef _HtRegexReplace_h_
+ #define _HtRegexReplace_h_
+
+ #ifdef HAVE_CONFIG_H
+ #include "htconfig.h"
+ #endif /* HAVE_CONFIG_H */
+
+ #include "HtRegex.h"
+
+ class HtRegexReplace : public HtRegex
+ {
+ public:
+     //
+     // Construction/Destruction
+     //
+     HtRegexReplace();
+     HtRegexReplace(const char *from, const char *to, int case_sensitive = 0);
+     virtual ~HtRegexReplace();
+
+     //
+     // Methods for setting the replacement pattern (\0-\9 insert match registers)
+     //
+     void setReplace(const String& str) { setReplace(str.get()); }
+     void setReplace(const char *str);
+
+     //
+     // Methods for replacing: 1 if str was rewritten, 0 if the pattern did not match
+     //
+     int replace(String &str, int nullpattern = 0, int nullstr = 0);
+
+ protected:
+     char *repBuf;               // Replace text.
+     size_t segSize, segUsed;    // allocated / used entries of segMark
+     int *segMark;               // alternating literal-end offsets and register numbers
+     size_t repLen;              // length of the literal text held in repBuf
+
+     regmatch_t regs[10];        // regexec() results: whole match plus up to 9 groups
+
+     // Various private methods
+     void putMark(int n);
+     void empty();
+ };
+
+ #endif
Index: htlib/HtRegexReplaceList.cc
===================================================================
RCS file: HtRegexReplaceList.cc
diff -N HtRegexReplaceList.cc
*** /dev/null Thu May 24 22:33:05 2001
--- HtRegexReplaceList.cc Sun Sep 23 20:20:38 2001
***************
*** 0 ****
--- 1,72 ----
+ //
+ // HtRegexReplaceList.cc
+ //
+ // HtRegexReplaceList: Perform RegexReplace on a list of from/to pairs.
+ // Patterns are applied in order; pattern matching
+ // doesn't stop when a match occurs.
+ //
+ // Part of the ht://Dig package <http://www.htdig.org/>
+ // Copyright (c) 2000-2001 The ht://Dig Group
+ // For copyright details, see the file COPYING in your distribution
+ // or the GNU Public License version 2 or later
+ // <http://www.gnu.org/copyleft/gpl.html>
+ //
+ // $Id: HtRegexReplaceList.cc,v 1.1.2.3 2001/07/06 23:43:12 ghutchis Exp $
+ //
+ //
+
+ #include "HtRegexReplaceList.h"
+ #include <iostream.h>
+
+ HtRegexReplaceList::HtRegexReplaceList(StringList &list, int case_sensitive )
+ {
+     if (list.Count() & 1)
+     {
+         lastErrorMessage = "HtRegexReplaceList needs an even number of strings";
+         return;
+     }
+
+     int i;
+
+     // Compile each from/to pair in order; stop at the first pattern that fails.
+     for (i = 0; i < list.Count(); i += 2)
+     {
+         String from = list[i];
+         String to = list[i+1];
+         HtRegexReplace *replacer = new HtRegexReplace(from.get(), to.get(), case_sensitive);
+         replacers.Add(replacer);    // Stash it even if there's an error so it will get destroyed later
+         const String &err = replacer->lastError();
+         if (err.length() != 0)
+         {
+             lastErrorMessage = err;
+             return;
+         }
+     }
+ }
+
+ HtRegexReplaceList::~HtRegexReplaceList()
+ {
+     // replacers gets chucked away
+ }
+
+ int HtRegexReplaceList::Replace(String &str, int nullpattern , int nullstr )
+ {
+     int repCount = replacers.Count();
+     int doneCount = 0;              // number of rules that rewrote str
+
+     for (int rep = 0; rep < repCount; rep++)
+     {
+         HtRegexReplace *replacer = (HtRegexReplace *) replacers[rep];
+         if (replacer->replace(str, nullpattern, nullstr) > 0)
+             doneCount++;
+     }
+
+     return doneCount;
+ }
+
+ const String &HtRegexReplaceList::lastError()
+ {
+     return lastErrorMessage;
+ }
+
+ // End of HtRegexReplaceList.cc
Index: htlib/HtRegexReplaceList.h
===================================================================
RCS file: HtRegexReplaceList.h
diff -N HtRegexReplaceList.h
*** /dev/null Thu May 24 22:33:05 2001
--- HtRegexReplaceList.h Sun Sep 23 20:20:38 2001
***************
*** 0 ****
--- 1,39 ----
+ //
+ // HtRegexReplaceList.h
+ //
+ // HtRegexReplaceList: Perform RegexReplace on a list of from/to pairs.
+ // Patterns are applied in order; pattern matching
+ // doesn't stop when a match occurs.
+ //
+ // Part of the ht://Dig package <http://www.htdig.org/>
+ // Copyright (c) 2000 The ht://Dig Group
+ // For copyright details, see the file COPYING in your distribution
+ // or the GNU Public License version 2 or later
+ // <http://www.gnu.org/copyleft/gpl.html>
+ //
+ // $Id: HtRegexReplaceList.h,v 1.1.2.1 2000/08/21 02:33:13 ghutchis Exp $
+ //
+
+ #ifndef __HtRegexReplaceList_h
+ #define __HtRegexReplaceList_h
+
+ #include "HtRegexReplace.h"
+ #include "List.h"
+ #include "StringList.h"
+
+ class HtRegexReplaceList : public Object
+ {
+ public:
+     // Construct a HtRegexReplaceList. |list| should contain an even
+     // number of strings that constitute from/to pairs.
+     HtRegexReplaceList(StringList &list, int case_sensitive = 0);
+     virtual ~HtRegexReplaceList();
+     int Replace(String &str, int nullpattern = 0, int nullstr = 0);  // returns how many rules rewrote str
+     virtual const String &lastError();
+
+ private:
+     List replacers;             // HtRegexReplace objects, in rule order
+     String lastErrorMessage;    // set by the constructor on the first bad rule
+ };
+
+ #endif /* __HtRegexReplaceList_h */
Index: htlib/HtURLRewriter.cc
===================================================================
RCS file: HtURLRewriter.cc
diff -N HtURLRewriter.cc
*** /dev/null Thu May 24 22:33:05 2001
--- HtURLRewriter.cc Sun Sep 23 20:20:38 2001
***************
*** 0 ****
--- 1,49 ----
+ //
+ // Methods for HtURLRewriter
+ //
+ // $Id: HtURLRewriter.cc,v 1.0 2000/08/16 14:43:00 aarmstrong Exp $
+ //
+ //
+
+ #include "HtURLRewriter.h"
+ #include "defaults.h" // For "config"
+
+ // Constructor: parses the appropriate parameters using the
+ // encapsulated RegexReplaceList class.
+ // Only used in privacy.
+ HtURLRewriter::HtURLRewriter()
+ {
+     StringList list(config["url_rewrite_rules"], " \t");
+
+     myRegexReplace = new HtRegexReplaceList(list);
+ }
+
+
+ HtURLRewriter::~HtURLRewriter()
+ {
+     delete myRegexReplace;
+ }
+
+ // Supposedly used as HtURLRewriter::instance()->ErrMsg()
+ // to check if RegexReplaceList liked what was fed.
+ const String& HtURLRewriter::ErrMsg()
+ {
+     return myRegexReplace->lastError();
+ }
+
+
+ // Canonical singleton interface.
+ // Creates the single instance on first use; it is never deleted here.
+ HtURLRewriter *HtURLRewriter::instance()
+ {
+     static HtURLRewriter *_instance = 0;
+
+     if (_instance == 0)
+     {
+         _instance = new HtURLRewriter();
+     }
+
+     return _instance;
+ }
+
+ // End of HtURLRewriter.cc
Index: htlib/HtURLRewriter.h
===================================================================
RCS file: HtURLRewriter.h
diff -N HtURLRewriter.h
*** /dev/null Thu May 24 22:33:05 2001
--- HtURLRewriter.h Sun Sep 23 20:20:38 2001
***************
*** 0 ****
--- 1,48 ----
+ //
+ // HtURLRewriter
+ //
+ // $Id: HtURLRewriter.h,v 1.0 2000/08/16 14:43:00 aarmstrong Exp $
+ //
+ #ifndef __HtURLRewriter_h
+ #define __HtURLRewriter_h
+
+ #include "HtRegexReplaceList.h"
+
+ // Container for a RegexReplaceList (not subclassed from it due to
+ // portability-problems using initializers).
+ // Not for subclassing.
+ class HtURLRewriter
+ {
+ public:
+     static HtURLRewriter *instance();
+     virtual ~HtURLRewriter();
+
+     // Applies every url_rewrite_rules pair to source in place and returns
+     // a copy of the rewritten text (not the replacement count).
+     inline String Replace(String &source) { myRegexReplace->Replace(source); return source; }
+
+     // If an error was discovered during the parsing of
+     // config directives, this member gives a
+     // nonempty String with an error message.
+     const String& ErrMsg();
+
+     // egcs-1.1 (and some earlier versions) always erroneously
+     // warns (even without warning flags) about classic singleton
+     // constructs ("only defines private constructors and has no
+     // friends"). Rather than adding autoconf tests to shut these
+     // versions up with -Wno-ctor-dtor-privacy, we fake normal
+     // conformism for it here (the minimal effort).
+     friend void my_friend_Harvey__a_faked_friend_function();
+
+ private:
+     // Hide default-constructor, copy-constructor and assignment
+     // operator, making this a singleton.
+     HtURLRewriter();
+     HtURLRewriter(const HtURLRewriter &);
+     void operator= (const HtURLRewriter &);
+
+     HtRegexReplaceList *myRegexReplace;    // owned; deleted in the destructor
+     String myErrMsg;                       // NOTE(review): not used by the code in this patch
+ };
+
+ #endif /* __HtURLRewriter_h */
Index: htlib/Makefile.in
===================================================================
RCS file: /cvsroot/htdig/htdig/htlib/Makefile.in,v
retrieving revision 1.13.2.2
diff -c -3 -p -r1.13.2.2 Makefile.in
*** htlib/Makefile.in 1999/03/29 15:53:48 1.13.2.2
--- htlib/Makefile.in 2001/09/24 03:20:38
*************** OBJS= Configuration.o Connection.o Datab
*** 16,22 ****
      URL.o URLTrans.o cgi.o \
      good_strtok.o io.o strcasecmp.o \
      strptime.o mytimegm.o HtCodec.o HtWordCodec.o \
!     HtURLCodec.o regex.o HtWordType.o
  
  TARGET= libht.a
  
--- 16,24 ----
      URL.o URLTrans.o cgi.o \
      good_strtok.o io.o strcasecmp.o \
      strptime.o mytimegm.o HtCodec.o HtWordCodec.o \
!     HtURLCodec.o regex.o HtWordType.o \
!     HtRegex.o HtRegexReplace.o HtRegexReplaceList.o \
!     HtURLRewriter.o
  
  TARGET= libht.a
  
Index: htlib/URL.cc
===================================================================
RCS file: /cvsroot/htdig/htdig/htlib/Attic/URL.cc,v
retrieving revision 1.18.2.8
diff -c -3 -p -r1.18.2.8 URL.cc
*** htlib/URL.cc 2001/08/31 21:07:33 1.18.2.8
--- htlib/URL.cc 2001/09/24 03:20:38
*************** static char RCSid[] = "$Id: URL.cc,v 1.1
*** 13,18 ****
--- 13,19 ----
  #include "Configuration.h"
  #include "StringMatch.h"
  #include "StringList.h"
+ #include "HtURLRewriter.h"
  #include <string.h>
  #include <stdlib.h>
  #include <stdio.h>
*************** URL::URL(char *ref, URL &parent)
*** 222,227 ****
--- 223,237 ----
      _url << _path;
  }
  
+ // Applies the url_rewrite_rules to this URL; when they change the text,
+ // the rewritten text is handed to parse().
+ void URL::rewrite()
+ {
+     String _out = _url;     // Replace() edits its argument, so rewrite a copy
+     HtURLRewriter::instance()->Replace(_out);
+     if (strcmp(_out.get(), _url.get()) != 0)
+         parse(_out.get());
+ }
  
  //*****************************************************************************
  // void URL::parse(char *u)
Index: htlib/URL.h
===================================================================
RCS file:
/cvsroot/htdig/htdig/htlib/Attic/URL.h,v retrieving revision 1.4.2.1 diff -c -3 -p -r1.4.2.1 URL.h *** htlib/URL.h 2000/02/16 21:14:59 1.4.2.1 --- htlib/URL.h 2001/09/24 03:20:38 *************** public: *** 58,63 **** --- 58,64 ---- char *get() {return _url;} void dump(); void normalize(); + void rewrite(); char *signature(); private: Index: htlib/htString.h =================================================================== RCS file: /cvsroot/htdig/htdig/htlib/htString.h,v retrieving revision 1.5.2.1 diff -c -3 -p -r1.5.2.1 htString.h *** htlib/htString.h 2001/06/07 20:23:59 1.5.2.1 --- htlib/htString.h 2001/09/24 03:20:38 *************** public: *** 79,84 **** --- 79,85 ---- // Access to specific characters // char &operator [] (int n); + char Nth(int n) {return (*this)[n];} char last(); //