This patch combines and supersedes the words.cc.0, words.cc.1 and db.cc.0
patches for the htmerge program in ht://Dig 3.1.5, from the ccsf.org
htdig patch archive, and adds further fixes. It corrects PR#872, as
reported and fixed by Tudor Hulubei, by quoting the temporary directory
name and word_list name in case they contain spaces. It corrects PR#952,
as reported by Tomas Frydrych, by correctly handling words that begin with
"+", "-", and "!", when these are in extra_word_characters, and it fixes
a small error in the original patch for this problem. It corrects the
problems with bad wordlists generated by htmerge -m causing it to lose
entries in the words.db, as reported by Olivier Korn and Curtis Ireland.
Finally, it also corrects the problem with the sort program using a
non-ASCII collating sequence for locales other than "C", as reported
by Charles Népote and Olivier Korn. This too was causing htmerge
(with or without -m) to lose entries in the words.db. It will now
detect when words are out of sequence and will append to pre-existing
words.db records rather than replacing them. It's still recommended
that you set the LC_COLLATE environment variable to "C" for htmerge,
when indexing documents with accents or umlauts, because htmerge will
be much more efficient if the words are properly collated. (In a simple
test case, htmerge took 3-4 times as long, and produced a words.db that
was about 40% larger, due to wasted space from frequent record updates,
when LC_COLLATE was not set to C.) The patch fixes rundig to do this,
but if you run htmerge manually or from another script, you'll need to
do this yourself. The textutils-2.0 package for Linux introduced the
new, locale-aware sort program, so if you have this package, or a more
recent one, you'll need to use this patch, or export LC_COLLATE=C in
your environment variables before running htmerge, or preferably both.
You can apply this patch in the main htdig-3.1.5 source directory using
"patch -p0 < this-message-file".
--- htmerge/db.cc.fieldbug Thu Feb 24 20:29:11 2000
+++ htmerge/db.cc Mon Feb 12 15:43:53 2001
@@ -229,16 +229,16 @@ mergeDB()
// Record the word in the new file
fprintf(wordlist, "%s", word.get());
+ fprintf(wordlist, "\ti:%d\tl:%d\tw:%d",
+ wr.id,
+ wr.location,
+ wr.weight);
#ifndef NO_WORD_COUNT
if (wr.count != 1)
{
fprintf(wordlist, "\tc:%d", wr.count);
}
#endif
- fprintf(wordlist, "\tl:%d\ti:%d\tw:%d",
- wr.location,
- wr.id,
- wr.weight);
if (wr.anchor != 0)
{
fprintf(wordlist, "\ta:%d", wr.anchor);
@@ -313,16 +313,16 @@ mergeDB()
// Record the word in the new file
fprintf(wordlist, "%s", word.get());
+ fprintf(wordlist, "\ti:%d\tl:%d\tw:%d",
+ wr.id + docIDOffset,
+ wr.location,
+ wr.weight);
#ifndef NO_WORD_COUNT
if (wr.count != 1)
{
fprintf(wordlist, "\tc:%d", wr.count);
}
#endif
- fprintf(wordlist, "\tl:%d\ti:%d\tw:%d",
- wr.location,
- wr.id + docIDOffset,
- wr.weight);
if (wr.anchor != 0)
{
fprintf(wordlist, "\ta:%d", wr.anchor);
--- htmerge/words.cc.wordbugs Thu Feb 24 20:29:11 2000
+++ htmerge/words.cc Fri Feb 23 09:46:42 2001
@@ -29,6 +29,7 @@ mergeWords(char *wordtmp, char *wordfile
int word_count = 0;
WordRecord wr, last_wr;
String last_word;
+ String high_word;
//
// Check for file access errors
@@ -59,9 +60,9 @@ mergeWords(char *wordtmp, char *wordfile
String tmpdir = getenv("TMPDIR");
if (tmpdir.length())
{
- command << " -T " << tmpdir;
+ command << " -T \"" << tmpdir << "\"";
}
- command << ' ' << wordtmp;
+ command << " \"" << wordtmp << "\"";
FILE *sorted = popen(command, "r");
if (!sorted)
{
@@ -74,37 +75,40 @@ mergeWords(char *wordtmp, char *wordfile
//
while (fgets(buffer, sizeof(buffer), sorted))
{
- if (*buffer == '+')
+ //
+ // Split the line up into the word, count, location, and
+ // document id.
+ //
+ word = good_strtok(buffer, '\t');
+ pair = good_strtok(NULL, '\t');
+ if (!word.length() || !pair || !*pair)
{
+ if (*buffer == '+')
+ {
//
// This tells us that the document hasn't changed and we
// are to reuse the old words
//
- }
- else if (*buffer == '-')
- {
+ }
+ else if (*buffer == '-')
+ {
if (removeBadUrls)
{
discard_list.Add(strtok(buffer + 1, "\n"), 0);
if (verbose)
cout << "htmerge: Removing doc #" << buffer + 1 << endl;
}
- }
- else if (*buffer == '!')
- {
+ }
+ else if (*buffer == '!')
+ {
discard_list.Add(strtok(buffer + 1, "\n"), 0);
if (verbose)
cout << "htmerge: doc #" << buffer + 1 <<
" has been superceeded." << endl;
+ }
}
else
{
- //
- // Split the line up into the word, count, location, and
- // document id.
- //
- word = good_strtok(buffer, '\t');
- pair = good_strtok(NULL, '\t');
wr.Clear(); // Reset count to 1, anchor to 0, and all that
sid = "-";
while (pair && *pair)
@@ -214,6 +218,7 @@ mergeWords(char *wordtmp, char *wordfile
out = 0;
out.append((char *) &last_wr, sizeof(last_wr));
currentWord = last_word;
+ high_word = last_word;
}
else if (strcmp(last_word, currentWord) == 0)
{
@@ -232,6 +237,13 @@ mergeWords(char *wordtmp, char *wordfile
currentWord = last_word;
out = 0;
+ if (strcmp(last_word, high_word) > 0)
+ high_word = last_word;
+ else
+ {
+ // words in non-ASCII collating order, get earlier record
+ dbf->Get(currentWord, out);
+ }
out.append((char *) &last_wr, sizeof(last_wr));
word_count++;
if (verbose && word_count == 1)
@@ -290,6 +302,7 @@ mergeWords(char *wordtmp, char *wordfile
out = 0;
out.append((char *) &last_wr, sizeof(last_wr));
currentWord = last_word;
+ high_word = last_word;
}
else if (strcmp(last_word, currentWord) == 0)
{
@@ -308,6 +321,13 @@ mergeWords(char *wordtmp, char *wordfile
currentWord = last_word;
out = 0;
+ if (strcmp(last_word, high_word) > 0)
+ high_word = last_word;
+ else
+ {
+ // words in non-ASCII collating order, get earlier record
+ dbf->Get(currentWord, out);
+ }
out.append((char *) &last_wr, sizeof(last_wr));
word_count++;
if (verbose && word_count == 1)
--- installdir/rundig.sortbug Thu Feb 24 20:29:12 2000
+++ installdir/rundig Fri Feb 23 11:43:19 2001
@@ -34,7 +34,7 @@ TMPDIR=$DBDIR
export TMPDIR
$BINDIR/htdig -i $opts $stats $alt
-$BINDIR/htmerge $opts $stats $alt
+LC_COLLATE=C $BINDIR/htmerge $opts $stats $alt
case "$alt" in
-a)
( cd $DBDIR && test -f db.docdb.work &&
--
Gilles R. Detillieux E-mail: <[EMAIL PROTECTED]>
Spinal Cord Research Centre WWW: http://www.scrc.umanitoba.ca/~grdetil
Dept. Physiology, U. of Manitoba Phone: (204)789-3766
Winnipeg, MB R3E 3J7 (Canada) Fax: (204)789-3930
_______________________________________________
htdig-general mailing list <[EMAIL PROTECTED]>
To unsubscribe, send a message to <[EMAIL PROTECTED]> with a
subject of unsubscribe
FAQ: http://htdig.sourceforge.net/FAQ.html