Here's another quick patch against 3.2.x.  It adds support for
a content-type alias attribute for htdig, which works around
servers that send the wrong content-type in their responses.
While getting the server correctly configured is probably 
a better solution, this works if the server is maintained 
by someone else.

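Usage is something like: 
content_type_aliases: text/plain=text/html
which makes htdig parse documents that the server labels as text/plain
as if they were text/html.  Several from=to pairs can be listed, separated
by whitespace, and the attribute can be set for specific servers in the
server block.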

The patch is also included as an attachment, in case the copy-and-pasted
version below gets mangled in transit.

Is the config file entry in the right format?  I've not seen a definitive
description of what should be where, so it's a bit of a guess.


=======================================
diff -rup htdig/htcommon/defaults.cc htdig-patch3/htcommon/defaults.cc
--- htdig/htcommon/defaults.cc  Thu Aug 30 03:43:38 2001
+++ htdig-patch3/htcommon/defaults.cc   Thu Oct 18 14:49:27 2001
@@ -271,6 +271,13 @@ http://www.htdig.org/";, " \
        compile time. \
        </p> \
 " }, \
+{ "content_type_aliases", "",  \
+    "string list", "htdig", "server", "SLI-special", "Indexing:Where", \
+    "content_type_aliases: text/plain=text/html", " \
+    This attribute tells htdig to use a different parser to that 
indicated by the content-type \
+    returned by the server.  This is occasionally useful for 
mis-configured servers who server \
+    up dynamic content but don't set the content-type correctly. \
+" }, \
 { "create_image_list", "false",  \
        "boolean", "htdig", "", "all", "Extra Output", "create_image_list: 
yes", " \
        If set to true, a file with all the image URLs that \
@@ -2545,6 +2552,8 @@ form during indexing and translated for 
        "string", "all", "", "3.2.0b1", "Extra Output", 
"wordlist_monitor_output: myfile", " \
         Print monitoring output on file instead of the default stderr. \
 " }, 
+
+
 {0, 0, 0, 0, 0, 0, 0, 0, 0}
 };
 
Only in htdig-patch3/htcommon: defaults.cc~
diff -rup htdig/htdig/Document.cc htdig-patch3/htdig/Document.cc
--- htdig/htdig/Document.cc     Thu May 17 04:36:44 2001
+++ htdig-patch3/htdig/Document.cc      Thu Oct 18 14:27:37 2001
@@ -629,7 +629,7 @@ Document::RetrieveLocal(HtDateTime date,
 //   parsers are external programs that will be used.
 //
 Parsable *
-Document::getParsable()
+Document::getParsable( const String& serverName )
 {
     static HTML                        *html = 0;
     static Plaintext           *plaintext = 0;
@@ -637,6 +637,8 @@ Document::getParsable()
 
     Parsable   *parsable = 0;
 
+    ContentTypeAlias( serverName );
+
     if (ExternalParser::canParse(contentType))
     {
        if (externalParser)
@@ -701,4 +703,51 @@ int Document::ShouldWeRetry(Transport::D
       return 1;
 
    return 0;
+}
+
+
+
+void
+Document::ContentTypeAlias( const String& serverName )
+{
+    HtConfiguration* config= HtConfiguration::config();
+    Dictionary content_type_aliases;
+
+    String l;
+    if ( serverName.length() > 0 )
+        l = config->Find("server", serverName, "content_type_aliases");
+    else
+        l = config->Find( "content_type_aliases");
+
+    if ( l.length() == 0 )
+        return;
+
+    String from, *to;
+    char *p = strtok(l, " \t");
+    char *ct_alias= NULL;
+    while (p)
+    {
+        ct_alias = strchr(p, '=');
+        if (! ct_alias )
+        {
+            p = strtok(0, " \t");
+            continue;
+        }
+        *ct_alias++= '\0';
+        from = p;
+        to= new String( ct_alias );
+        content_type_aliases.Add(from.get(), to);
+        // fprintf (stderr, "Alias: %s->%s\n", from.get(), to->get());
+        p = strtok(0, " \t");
+    }
+
+
+    String* new_ct = 0;
+    if ( (new_ct = (String*) content_type_aliases.Find( contentType )) )
+    {
+        if ( debug > 1 )
+            cout << "Translating content type '" << contentType << "' to 
'" << *new_ct << "'\n";
+        contentType = *new_ct;
+    }
+
 }
diff -rup htdig/htdig/Retriever.cc htdig-patch3/htdig/Retriever.cc
--- htdig/htdig/Retriever.cc    Tue Oct 16 15:27:16 2001
+++ htdig-patch3/htdig/Retriever.cc     Thu Oct 18 14:28:29 2001
@@ -802,7 +802,7 @@ Retriever::RetrievedDocument(Document &d
     // routines.
     // This will generate the Parsable object as a specific parser
     //
-    Parsable   *parsable = doc.getParsable();
+    Parsable   *parsable = doc.getParsable( base->host() );
     if (parsable)
       parsable->parse(*this, *base);
     else

=======================================




Jamie Anstice
Search Engineer
S.L.I. Systems
[EMAIL PROTECTED]
ph:  64 961 3262
mobile: 64 21 264 9347
diff -rup htdig/htcommon/defaults.cc htdig-patch3/htcommon/defaults.cc
--- htdig/htcommon/defaults.cc  Thu Aug 30 03:43:38 2001
+++ htdig-patch3/htcommon/defaults.cc   Thu Oct 18 14:49:27 2001
@@ -271,6 +271,13 @@ http://www.htdig.org/";, " \
        compile time. \
        </p> \
 " }, \
+{ "content_type_aliases", "",  \
+    "string list", "htdig", "server", "SLI-special", "Indexing:Where", \
+    "content_type_aliases: text/plain=text/html", " \
+    This attribute tells htdig to use a different parser to that indicated by the 
+content-type \
+    returned by the server.  This is occasionally useful for mis-configured servers 
+who server \
+    up dynamic content but don't set the content-type correctly. \
+" }, \
 { "create_image_list", "false",  \
        "boolean", "htdig", "", "all", "Extra Output", "create_image_list: yes", " \
        If set to true, a file with all the image URLs that \
@@ -2545,6 +2552,8 @@ form during indexing and translated for 
        "string", "all", "", "3.2.0b1", "Extra Output", "wordlist_monitor_output: 
myfile", " \
         Print monitoring output on file instead of the default stderr. \
 " }, 
+
+
 {0, 0, 0, 0, 0, 0, 0, 0, 0}
 };
 
Only in htdig-patch3/htcommon: defaults.cc~
diff -rup htdig/htdig/Document.cc htdig-patch3/htdig/Document.cc
--- htdig/htdig/Document.cc     Thu May 17 04:36:44 2001
+++ htdig-patch3/htdig/Document.cc      Thu Oct 18 14:27:37 2001
@@ -629,7 +629,7 @@ Document::RetrieveLocal(HtDateTime date,
 //   parsers are external programs that will be used.
 //
 Parsable *
-Document::getParsable()
+Document::getParsable( const String& serverName )
 {
     static HTML                        *html = 0;
     static Plaintext           *plaintext = 0;
@@ -637,6 +637,8 @@ Document::getParsable()
     
     Parsable   *parsable = 0;
 
+    ContentTypeAlias( serverName );
+
     if (ExternalParser::canParse(contentType))
     {
        if (externalParser)
@@ -701,4 +703,51 @@ int Document::ShouldWeRetry(Transport::D
       return 1;
       
    return 0;
+}
+
+
+
+void
+Document::ContentTypeAlias( const String& serverName )
+{
+    HtConfiguration* config= HtConfiguration::config();
+    Dictionary content_type_aliases;
+
+    String l;
+    if ( serverName.length() > 0 )
+        l = config->Find("server", serverName, "content_type_aliases");
+    else
+        l = config->Find( "content_type_aliases");
+
+    if ( l.length() == 0 )
+        return;
+
+    String from, *to;
+    char *p = strtok(l, " \t");
+    char *ct_alias= NULL;
+    while (p)
+    {
+        ct_alias = strchr(p, '=');
+        if (! ct_alias )
+        {
+            p = strtok(0, " \t");
+            continue;
+        }
+        *ct_alias++= '\0';
+        from = p;
+        to= new String( ct_alias );
+        content_type_aliases.Add(from.get(), to);
+        // fprintf (stderr, "Alias: %s->%s\n", from.get(), to->get());
+        p = strtok(0, " \t");
+    }
+
+
+    String* new_ct = 0;
+    if ( (new_ct = (String*) content_type_aliases.Find( contentType )) )
+    {
+        if ( debug > 1 )
+            cout << "Translating content type '" << contentType << "' to '" << 
+*new_ct << "'\n";
+        contentType = *new_ct;
+    }
+
 }
diff -rup htdig/htdig/Retriever.cc htdig-patch3/htdig/Retriever.cc
--- htdig/htdig/Retriever.cc    Tue Oct 16 15:27:16 2001
+++ htdig-patch3/htdig/Retriever.cc     Thu Oct 18 14:28:29 2001
@@ -802,7 +802,7 @@ Retriever::RetrievedDocument(Document &d
     // routines.
     // This will generate the Parsable object as a specific parser
     //
-    Parsable   *parsable = doc.getParsable();
+    Parsable   *parsable = doc.getParsable( base->host() );
     if (parsable)
       parsable->parse(*this, *base);
     else

Reply via email to