Hi all. Geoff & I discussed a problem with htmerge a week and a half ago,
but I never did post a patch for it. Here it is: the patch below prevents
description text words from clobbering the anchor number of merged text
words after an anchor.
--- htmerge/words.cc.anchorbug Tue Feb 16 23:03:56 1999
+++ htmerge/words.cc Thu Mar 18 18:11:52 1999
@@ -208,7 +208,8 @@ mergeWords(char *wordtmp, char *wordfile
last_wr.weight += wr.weight;
if (wr.location < last_wr.location)
last_wr.location = wr.location;
- if (wr.anchor < last_wr.anchor)
+ if (wr.anchor > 0 && wr.anchor < last_wr.anchor
+ || last_wr.anchor == 0)
last_wr.anchor = wr.anchor;
continue;
}
While I'm at it, here's another patch I've made to docs.cc, so that when
you run htmerge with -vv you can find out why it is deleting some documents
from the database.
--- htmerge/docs.cc.quiet Tue Feb 16 23:03:56 1999
+++ htmerge/docs.cc Fri Mar 19 08:36:47 1999
@@ -63,16 +63,34 @@ convertDocs(char *doc_db, char *doc_inde
// For some reason, this document doesn't have an excerpt
// (probably because of a noindex directive, or disallowed
// by robots.txt or server_max_docs). Remove it
+ if (verbose > 1)
+ {
+ cout << "htmerge: " << url->get() << " (id " << id
+ << ") discarded: empty or disallowed, no DocHead" << endl;
+ cout.flush();
+ }
db.Delete(url->get());
}
else if ((ref->DocState()) == Reference_noindex)
{
// This document has been marked with a noindex tag. Remove it
+ if (verbose > 1)
+ {
+ cout << "htmerge: " << url->get() << " (id " << id
+ << ") discarded: noindex meta tag" << endl;
+ cout.flush();
+ }
db.Delete(url->get());
}
else if (remove_unused && discard_list.Exists(id))
{
// This document is not valid anymore. Remove it
+ if (verbose > 1)
+ {
+ cout << "htmerge: " << url->get() << " (id " << id
+ << ") discarded: gone or modified" << endl;
+ cout.flush();
+ }
db.Delete(url->get());
}
else
--
Gilles R. Detillieux E-mail: <[EMAIL PROTECTED]>
Spinal Cord Research Centre WWW: http://www.scrc.umanitoba.ca/~grdetil
Dept. Physiology, U. of Manitoba Phone: (204)789-3766
Winnipeg, MB R3E 3J7 (Canada) Fax: (204)789-3930
------------------------------------
To unsubscribe from the htdig mailing list, send a message to
[EMAIL PROTECTED] containing the single word "unsubscribe" in
the SUBJECT of the message.