According to Lennart Almkvist:
> Some more testing gave the following results:
>
> The German flower words "Stiefmütterchen" and the Icelandic
> "þrenningarfjóla" are treated differently in meta content
> than in the body or title part of an HTML document.
>
> When in the body or in the title, the "ü", "þ" and "ó"
> are each decoded to a one-byte character in the .wordlist and .words.db files.
>
> In meta content, however, these words are decoded to "stiefmuuml;t"
> and "thorn;rennin" in the .wordlist and .words.db files. That is, the "&" is
> removed and the rest is kept as letters ("&" is in valid_punctuation but
> the ";" is not, by default).
>
> Should they not be decoded the same way as in the title or body?
Here's a patch for 3.1.2 that should do what you want. Please give it a
try and let us know if it fixes this bug.
--- htdig-3.1.2.bak/htdig/HTML.h Wed Apr 21 21:47:57 1999
+++ htdig-3.1.2/htdig/HTML.h Fri Jul 30 12:23:25 1999
@@ -72,6 +72,7 @@ private:
// Helper functions
//
void do_tag(Retriever &, String &);
+ char *transSGML(char *);
};
#endif
--- htdig-3.1.2.bak/htdig/HTML.cc Wed Apr 21 21:47:57 1999
+++ htdig-3.1.2/htdig/HTML.cc Fri Jul 30 12:24:14 1999
@@ -744,7 +744,7 @@ HTML::do_tag(Retriever &retriever, Strin
}
if (conf["htdig-email-subject"])
{
- retriever.got_meta_subject(conf["htdig-email-subject"]);
+ retriever.got_meta_subject(transSGML(conf["htdig-email-subject"]));
}
if (conf["htdig-keywords"] || conf["keywords"])
{
@@ -757,7 +757,7 @@ HTML::do_tag(Retriever &retriever, Strin
char *keywords = conf["htdig-keywords"];
if (!keywords)
keywords = conf["keywords"];
- char *w = strtok(keywords, " ,\t\r\n");
+ char *w = strtok(transSGML(keywords), " ,\t\r\n");
while (w)
{
if (strlen(w) >= minimumWordLength)
@@ -811,7 +811,7 @@ HTML::do_tag(Retriever &retriever, Strin
//
// We need to do two things. First grab the description
//
- meta_dsc = conf["content"];
+ meta_dsc = transSGML(conf["content"]);
if (meta_dsc.length() > max_meta_description_length)
meta_dsc = meta_dsc.sub(0, max_meta_description_length).get();
if (debug > 1)
@@ -824,7 +824,7 @@ HTML::do_tag(Retriever &retriever, Strin
// (slot 11 is the new slot for this)
//
- char *w = strtok(conf["content"], " \t\r\n");
+ char *w = strtok(transSGML(conf["content"]), " \t\r\n");
while (w)
{
if (strlen(w) >= minimumWordLength)
@@ -836,7 +836,7 @@ HTML::do_tag(Retriever &retriever, Strin
if (keywordsMatch.CompareWord(cache))
{
- char *w = strtok(conf["content"], " ,\t\r\n");
+ char *w = strtok(transSGML(conf["content"]), " ,\t\r\n");
while (w)
{
if (strlen(w) >= minimumWordLength)
@@ -855,7 +855,7 @@ HTML::do_tag(Retriever &retriever, Strin
}
else if (mystrcasecmp(cache, "htdig-email-subject") == 0)
{
- retriever.got_meta_subject(conf["content"]);
+ retriever.got_meta_subject(transSGML(conf["content"]));
}
else if (mystrcasecmp(cache, "htdig-noindex") == 0)
{
@@ -1095,4 +1095,26 @@ HTML::do_tag(Retriever &retriever, Strin
default:
return; // Nothing...
}
+}
+
+
+//*****************************************************************************
+// char * HTML::transSGML(char *str)
+//   Copy str into a static String, handing each '&' to SGMLEntities::translateAndUpdate(); returns that String's buffer.
+char *
+HTML::transSGML(char *str)
+{
+ static String convert; // static: the returned pointer refers to this object's storage, shared by all calls
+ unsigned char *text = (unsigned char *)str;
+
+ convert = 0; // presumably empties the result left by the previous call -- confirm against String::operator=
+ while (*text) // NOTE(review): str is dereferenced without a null check -- confirm callers never pass null
+ {
+ if (*text == '&')
+ convert << SGMLEntities::translateAndUpdate(text); // append whatever the entity translator returns for this '&'
+ else
+ convert << *text; // any other byte is copied through unchanged
+ text++; // NOTE(review): if translateAndUpdate() already advances text past the entity, this skips one character -- confirm
+ }
+ return convert.get(); // callers in this patch pass the result to strtok(), which modifies the buffer in place
}
--
Gilles R. Detillieux E-mail: <[EMAIL PROTECTED]>
Spinal Cord Research Centre WWW: http://www.scrc.umanitoba.ca/~grdetil
Dept. Physiology, U. of Manitoba Phone: (204)789-3766
Winnipeg, MB R3E 3J7 (Canada) Fax: (204)789-3930
------------------------------------
To unsubscribe from the htdig mailing list, send a message to
[EMAIL PROTECTED] containing the single word unsubscribe in
the SUBJECT of the message.