Hi!

I have written a patch that allows hostname translations and does a second
check to limit the search after the URL is normalized.

After having applied the patch, the following checks/translations are
made:

*) First, a new URL is checked against the "limit_urls_to:" configuration
directive as in the original version. (I use this directive e.g. to limit
the URLs to my domain so that unnecessary hostname lookups, etc. are
avoided.)

*) Then the URL is normalized. Among other tasks, the canonical name of the
host is looked up. (Nothing has changed from the original version.)

*) After that my "server_aliases" configuration directive is used to
translate the hostname-portion (it is the canonical name now) of the URL.

*) Finally my "limit_normalized:" directive does additional filtering
of the hostnames.


I have two examples for the use of these new features:

1) Suppose your web-server has the canonical hostname "foo" and the
aliases "www" and "bar". They are not virtual hosts, so you can use any
alias to reach the same pages. However, it would be nice to index the pages
only once, and it looks better if the URLs contain "www" (the alias name)
instead of "foo". You can achieve this by adding the following
configuration directives:

allow_virtual_hosts:  false
limit_urls_to:        .mydomain
server_aliases:       foo.mydomain:80=www.mydomain:80
limit_normalized:     http://www.mydomain
start_url:            http://www.mydomain

2) Another use for these features is when multiple servers are accessing
the same web-pages, for example, if they are sharing a network-filesystem.
In my domain, there are 11 web-servers that are accessing the same
web-space: (Any canonical name or alias can be used.)

Main Server: speth08.wu-wien.ac.at (Aliases: www, proxy)
Additional Servers:
asterix.wu-wien.ac.at (Aliases: as, speth13)
botanix.wu-wien.ac.at (Aliases: bo, speth14)
falbala.wu-wien.ac.at (Aliases: fa, speth07)
and so on...

My config-file looks like this:

allow_virtual_hosts:    false
limit_urls_to:          .wu-wien.ac.at/
server_aliases: speth08.wu-wien.ac.at:80=www.wu-wien.ac.at:80 \
                asterix.wu-wien.ac.at:80=www.wu-wien.ac.at:80 \
                botanix.wu-wien.ac.at:80=www.wu-wien.ac.at:80 \
                falbala.wu-wien.ac.at:80=www.wu-wien.ac.at:80
limit_normalized:       http://www.wu-wien.ac.at/
start_url:              http://www.wu-wien.ac.at/


- Leo -

---------- snipp! ----------
diff -aur htdig-3.1.0b1/htdig/Retriever.cc htdig-3.1.0b1-new/htdig/Retriever.cc
--- htdig-3.1.0b1/htdig/Retriever.cc    Tue Sep  8 05:29:55 1998
+++ htdig-3.1.0b1-new/htdig/Retriever.cc        Tue Oct  6 18:59:58 1998
@@ -815,7 +815,7 @@
        }
 
        url.normalize();
-       if (IsValidURL(url.get()))
+       if (limitsn.FindFirst(url.get()) >= 0)
        {
            //
            // First add it to the document database
@@ -925,7 +925,7 @@
        }
 
        url.normalize();
-       if (IsValidURL(url.get()))
+       if (limitsn.FindFirst(url.get()) >= 0)
        {
            //
            // First add it to the document database
diff -aur htdig-3.1.0b1/htdig/htdig.h htdig-3.1.0b1-new/htdig/htdig.h
--- htdig-3.1.0b1/htdig/htdig.h Tue Sep  8 05:29:55 1998
+++ htdig-3.1.0b1-new/htdig/htdig.h     Tue Oct  6 18:59:58 1998
@@ -28,6 +28,7 @@
 extern int             debug;
 extern DocumentDB      docs;
 extern StringMatch     limits;
+extern StringMatch     limitsn;
 extern StringMatch     excludes;
 extern FILE            *urls_seen;
 extern FILE            *images_seen;
diff -aur htdig-3.1.0b1/htdig/main.cc htdig-3.1.0b1-new/htdig/main.cc
--- htdig-3.1.0b1/htdig/main.cc Tue Sep  8 05:29:55 1998
+++ htdig-3.1.0b1-new/htdig/main.cc     Tue Oct  6 18:59:58 1998
@@ -10,6 +10,7 @@
 int                    report_statistics = 0;
 DocumentDB             docs;
 StringMatch            limits;
+StringMatch            limitsn;
 StringMatch            excludes;
 FILE                   *urls_seen = NULL;
 FILE                   *images_seen = NULL;
@@ -151,6 +152,19 @@
     }
     limits.IgnoreCase();
     limits.Pattern(pattern);
+
+    l = config["limit_normalized"];
+    p = strtok(l, " \t");
+    pattern = 0;
+    while (p)
+    {
+       if (pattern.length())
+           pattern << '|';
+       pattern << p;
+       p = strtok(0, " \t");
+    }
+    limitsn.IgnoreCase();
+    limitsn.Pattern(pattern);
 
     //
     // Patterns to exclude from urls...
diff -aur htdig-3.1.0b1/htlib/URL.cc htdig-3.1.0b1-new/htlib/URL.cc
--- htdig-3.1.0b1/htlib/URL.cc  Tue Sep  8 05:29:55 1998
+++ htdig-3.1.0b1-new/htlib/URL.cc      Tue Oct  6 19:00:02 1998
@@ -490,6 +490,7 @@
            _host = realname->get();
        else
            machines.Add(key, new String(_host));
+       ServerAlias();
     }
     
     //
@@ -525,3 +526,53 @@
     return _signature;
 }
 
+
+//
+// Replace the host and port of this URL by the alias configured for
+// them in the "server_aliases" directive, if there is one.  Entries have
+// the form "host:port=newhost:newport" and are separated by blanks or
+// tabs; the table is parsed from the configuration on the first call only.
+// Note that _signature is overwritten with the "host:port" lookup key.
+//
+void URL::ServerAlias()
+{
+  static Dictionary *serveraliases= 0;
+
+  if (! serveraliases)
+    {
+      String l= config["server_aliases"];
+      serveraliases = new Dictionary();
+      char *p = strtok(l, " \t");
+      while (p)
+       {
+         char *salias = strchr(p, '=');
+         //
+         // Entries without '=' are ignored, but the next token still has
+         // to be fetched or this loop would spin on the same token forever.
+         //
+         if (salias)
+           {
+             *salias++= '\0';
+             serveraliases->Add(p, new String(salias));
+           }
+         p = strtok(0, " \t");
+       }
+    }
+
+  String *al= 0;
+  int newport;
+  int delim;
+  _signature = _host;
+  _signature << ':' << _port;
+  if ((al= (String *) serveraliases->Find(_signature)))
+    {
+      delim= al->indexOf(':');
+      if (delim < 0)
+        _host= *al;             // no ":port" part: replace the host only
+      else
+        _host= al->sub(0,delim);
+      // Only take the new port if a number could actually be parsed.
+      if (delim >= 0 && sscanf(al->sub(delim+1), "%d", &newport) == 1)
+        _port= newport;
+    }
+}
diff -aur htdig-3.1.0b1/htlib/URL.h htdig-3.1.0b1-new/htlib/URL.h
--- htdig-3.1.0b1/htlib/URL.h   Tue Sep  8 05:29:55 1998
+++ htdig-3.1.0b1-new/htlib/URL.h       Tue Oct  6 19:00:02 1998
@@ -61,6 +61,7 @@
 
     void               removeIndex(String &);
     void                normalizePath();
+    void               ServerAlias();
 };
 
 
---------- snipp! ----------

-----------------------------------------------------------------------
Alexander (Leo) Bergolth                          [EMAIL PROTECTED]
WU-Wien - Zentrum fuer Informatikdienste       http://leo.wu-wien.ac.at
Info Center
In a world without walls and fences, who needs windows and gates?

----------------------------------------------------------------------
To unsubscribe from the htdig mailing list, send a message to
[EMAIL PROTECTED] containing the single word "unsubscribe" in
the body of the message.

Reply via email to