According to Lennart Almkvist:
> Some more testing gave the following results:
> 
> The German flower name "Stiefmütterchen" and the Icelandic
> "þrenningarfjóla" are treated differently in meta content
> than in the body or title part of an HTML document.
> 
> When in the body or in the title, the "ü", "þ" and "ó"
> are decoded to a single-byte character in the .wordlist and .words.db files.
> 
> In meta content, however, these words are decoded to "stiefmuuml;t"
> and "thorn;rennin" in the .wordlist and .words.db files. That is, the "&" is
> removed and the rest is kept as letters ("&" is in valid_punctuation but
> the ";" is not, by default).
> 
> Shouldn't they be decoded the same way as they are in the title or body?

Here's a patch for 3.1.2 that should do what you want.  Please give it a
try and let us know if it fixes this bug.

--- htdig-3.1.2.bak/htdig/HTML.h        Wed Apr 21 21:47:57 1999
+++ htdig-3.1.2/htdig/HTML.h    Fri Jul 30 12:23:25 1999
@@ -72,6 +72,7 @@ private:
     // Helper functions
     //
     void               do_tag(Retriever &, String &);
+    char               *transSGML(char *);
 };
 
 #endif
--- htdig-3.1.2.bak/htdig/HTML.cc       Wed Apr 21 21:47:57 1999
+++ htdig-3.1.2/htdig/HTML.cc   Fri Jul 30 12:24:14 1999
@@ -744,7 +744,7 @@ HTML::do_tag(Retriever &retriever, Strin
            }
            if (conf["htdig-email-subject"])
            {
-               retriever.got_meta_subject(conf["htdig-email-subject"]);
+               retriever.got_meta_subject(transSGML(conf["htdig-email-subject"]));
            }
            if (conf["htdig-keywords"] || conf["keywords"])
            {
@@ -757,7 +757,7 @@ HTML::do_tag(Retriever &retriever, Strin
                char    *keywords = conf["htdig-keywords"];
                if (!keywords)
                    keywords = conf["keywords"];
-               char    *w = strtok(keywords, " ,\t\r\n");
+               char    *w = strtok(transSGML(keywords), " ,\t\r\n");
                while (w)
                {
                    if (strlen(w) >= minimumWordLength)
@@ -811,7 +811,7 @@ HTML::do_tag(Retriever &retriever, Strin
                    //
                    // We need to do two things. First grab the description
                    //
-                   meta_dsc = conf["content"];
+                   meta_dsc = transSGML(conf["content"]);
                   if (meta_dsc.length() > max_meta_description_length)
                     meta_dsc = meta_dsc.sub(0, max_meta_description_length).get();
                   if (debug > 1)
@@ -824,7 +824,7 @@ HTML::do_tag(Retriever &retriever, Strin
                   // (slot 11 is the new slot for this)
                   //
 
-                  char        *w = strtok(conf["content"], " \t\r\n");
+                  char        *w = strtok(transSGML(conf["content"]), " \t\r\n");
                    while (w)
                     {
                        if (strlen(w) >= minimumWordLength)
@@ -836,7 +836,7 @@ HTML::do_tag(Retriever &retriever, Strin
 
                if (keywordsMatch.CompareWord(cache))
                {
-                   char        *w = strtok(conf["content"], " ,\t\r\n");
+                   char        *w = strtok(transSGML(conf["content"]), " ,\t\r\n");
                    while (w)
                    {
                        if (strlen(w) >= minimumWordLength)
@@ -855,7 +855,7 @@ HTML::do_tag(Retriever &retriever, Strin
                }
                else if (mystrcasecmp(cache, "htdig-email-subject") == 0)
                {
-                   retriever.got_meta_subject(conf["content"]);
+                   retriever.got_meta_subject(transSGML(conf["content"]));
                }
                else if (mystrcasecmp(cache, "htdig-noindex") == 0)
                  {
@@ -1095,4 +1095,26 @@ HTML::do_tag(Retriever &retriever, Strin
        default:
            return;                                             // Nothing...
     }
+}
+
+
+//*****************************************************************************
+// char * HTML::transSGML(char *str)
+//   Copies str into a static buffer, replacing each '&' with the result of SGMLEntities::translateAndUpdate(); str must be non-null (it is dereferenced unchecked).
+char *
+HTML::transSGML(char *str)
+{
+    static String      convert;        // static: one buffer shared by every call, so each call overwrites the last result
+    unsigned char      *text = (unsigned char *)str;
+
+    convert = 0;                       // presumably resets the buffer to empty before rebuilding it -- confirm against String
+    while (*text)
+    {
+       if (*text == '&')
+           convert << SGMLEntities::translateAndUpdate(text);  // NOTE(review): presumably moves text along the entity; verify it cooperates with the text++ below
+       else
+           convert << *text;           // every other byte is appended unchanged
+       text++;
+    }
+    return convert.get();              // NOTE(review): likely a pointer into the static buffer (callers strtok() it in place) -- copy it if it must outlive the next call
 }

-- 
Gilles R. Detillieux              E-mail: <[EMAIL PROTECTED]>
Spinal Cord Research Centre       WWW:    http://www.scrc.umanitoba.ca/~grdetil
Dept. Physiology, U. of Manitoba  Phone:  (204)789-3766
Winnipeg, MB  R3E 3J7  (Canada)   Fax:    (204)789-3930

------------------------------------
To unsubscribe from the htdig mailing list, send a message to
[EMAIL PROTECTED] containing the single word unsubscribe in
the SUBJECT of the message.

Reply via email to