[htdig] PATCH: htmerge bug fixes for 3.1.5

Gilles Detillieux Fri, 23 Feb 2001 10:00:26 -0800
This patch combines and supersedes the words.cc.0, words.cc.1 and db.cc.0
patches for the htmerge program in ht://Dig 3.1.5, from the ccsf.org
htdig patch archive, and adds further fixes.  It corrects PR#872, as
reported and fixed by Tudor Hulubei, by quoting the temporary directory
name and word_list name in case they contain spaces.  It corrects PR#952,
as reported by Tomas Frydrych, by correctly handling words that begin with
"+", "-", and "!", when these are in extra_word_characters, and it fixes
a small error in the original patch for this problem.  It also corrects
the bad word lists generated by htmerge -m, which were causing it to lose
entries in the words.db, as reported by Olivier Korn and Curtis Ireland.

Finally, it also corrects the problem with the sort program using a
non-ASCII collating sequence for locales other than "C", as reported
by Charles Népote and Olivier Korn.  This too was causing htmerge
(with or without -m) to lose entries in the words.db.  It will now
detect when words are out of sequence and will append to pre-existing
words.db records rather than replacing them.  It's still recommended
that you set the LC_COLLATE environment variable to "C" for htmerge,
when indexing documents with accents or umlauts, because htmerge will
be much more efficient if the words are properly collated.  (In a simple
test case, htmerge took 3-4 times as long, and produced a words.db that
was about 40% larger, due to wasted space from frequent record updates,
when LC_COLLATE was not set to C.)  The patch changes rundig to set
LC_COLLATE=C when it runs htmerge, but if you run htmerge manually or
from another script, you'll need to set it yourself.  The textutils-2.0
package for Linux introduced the
new, locale-aware sort program, so if you have this package, or a more
recent one, you'll need to use this patch, or export LC_COLLATE=C in
your environment variables before running htmerge, or preferably both.

You can apply this patch in the main htdig-3.1.5 source directory using
"patch -p0 < this-message-file".


--- htmerge/db.cc.fieldbug      Thu Feb 24 20:29:11 2000
+++ htmerge/db.cc       Mon Feb 12 15:43:53 2001
@@ -229,16 +229,16 @@ mergeDB()
 
        // Record the word in the new file
        fprintf(wordlist, "%s", word.get());
+       fprintf(wordlist, "\ti:%d\tl:%d\tw:%d",
+               wr.id,
+               wr.location,
+               wr.weight);
 #ifndef NO_WORD_COUNT
        if (wr.count != 1)
          {
            fprintf(wordlist, "\tc:%d", wr.count);
          }
 #endif
-       fprintf(wordlist, "\tl:%d\ti:%d\tw:%d",
-               wr.location,
-               wr.id,
-               wr.weight);
        if (wr.anchor != 0)
          {
            fprintf(wordlist, "\ta:%d", wr.anchor);
@@ -313,16 +313,16 @@ mergeDB()
 
        // Record the word in the new file
        fprintf(wordlist, "%s", word.get());
+       fprintf(wordlist, "\ti:%d\tl:%d\tw:%d",
+               wr.id + docIDOffset,
+               wr.location,
+               wr.weight);
 #ifndef NO_WORD_COUNT
        if (wr.count != 1)
          {
            fprintf(wordlist, "\tc:%d", wr.count);
          }
 #endif
-       fprintf(wordlist, "\tl:%d\ti:%d\tw:%d",
-               wr.location,
-               wr.id + docIDOffset,
-               wr.weight);
        if (wr.anchor != 0)
          {
            fprintf(wordlist, "\ta:%d", wr.anchor);
--- htmerge/words.cc.wordbugs   Thu Feb 24 20:29:11 2000
+++ htmerge/words.cc    Fri Feb 23 09:46:42 2001
@@ -29,6 +29,7 @@ mergeWords(char *wordtmp, char *wordfile
     int                word_count = 0;
     WordRecord wr, last_wr;
     String      last_word;
+    String      high_word;
 
     //
     // Check for file access errors
@@ -59,9 +60,9 @@ mergeWords(char *wordtmp, char *wordfile
     String     tmpdir = getenv("TMPDIR");
     if (tmpdir.length())
     {
-       command << " -T " << tmpdir;
+       command << " -T \"" << tmpdir << "\"";
     }
-    command << ' ' << wordtmp;
+    command << " \"" << wordtmp << "\"";
     FILE       *sorted = popen(command, "r");
     if (!sorted)
     {
@@ -74,37 +75,40 @@ mergeWords(char *wordtmp, char *wordfile
     //
     while (fgets(buffer, sizeof(buffer), sorted))
     {
-       if (*buffer == '+')
+       //
+       // Split the line up into the word, count, location, and
+       // document id.
+       //
+       word = good_strtok(buffer, '\t');
+       pair = good_strtok(NULL, '\t');
+       if (!word.length() || !pair || !*pair)
        {
+         if (*buffer == '+')
+         {
            //
            // This tells us that the document hasn't changed and we
            // are to reuse the old words
            //
-       }
-       else if (*buffer == '-')
-       {
+         }
+         else if (*buffer == '-')
+         {
            if (removeBadUrls)
            {
                discard_list.Add(strtok(buffer + 1, "\n"), 0);
                if (verbose)
                    cout << "htmerge: Removing doc #" << buffer + 1 << endl;
            }
-       }
-       else if (*buffer == '!')
-       {
+         }
+         else if (*buffer == '!')
+         {
            discard_list.Add(strtok(buffer + 1, "\n"), 0);
            if (verbose)
                cout << "htmerge: doc #" << buffer + 1 <<
                    " has been superceeded." << endl;
+         }
        }
        else
        {
-           //
-           // Split the line up into the word, count, location, and
-           // document id.
-           //
-           word = good_strtok(buffer, '\t');
-           pair = good_strtok(NULL, '\t');
            wr.Clear();   // Reset count to 1, anchor to 0, and all that
            sid = "-";
            while (pair && *pair)
@@ -214,6 +218,7 @@ mergeWords(char *wordtmp, char *wordfile
                out = 0;
                out.append((char *) &last_wr, sizeof(last_wr));
                currentWord = last_word;
+               high_word = last_word;
            }
            else if (strcmp(last_word, currentWord) == 0)
            {
@@ -232,6 +237,13 @@ mergeWords(char *wordtmp, char *wordfile
                currentWord = last_word;
 
                out = 0;
+               if (strcmp(last_word, high_word) > 0)
+                   high_word = last_word;
+               else
+               {
+                   // words in non-ASCII collating order, get earlier record
+                   dbf->Get(currentWord, out);
+               }
                out.append((char *) &last_wr, sizeof(last_wr));
                word_count++;
                if (verbose && word_count == 1)
@@ -290,6 +302,7 @@ mergeWords(char *wordtmp, char *wordfile
        out = 0;
        out.append((char *) &last_wr, sizeof(last_wr));
        currentWord = last_word;
+       high_word = last_word;
       }
     else if (strcmp(last_word, currentWord) == 0)
       {
@@ -308,6 +321,13 @@ mergeWords(char *wordtmp, char *wordfile
        currentWord = last_word;
        
        out = 0;
+       if (strcmp(last_word, high_word) > 0)
+           high_word = last_word;
+       else
+       {
+           // words in non-ASCII collating order, get earlier record
+           dbf->Get(currentWord, out);
+       }
        out.append((char *) &last_wr, sizeof(last_wr));
        word_count++;
        if (verbose && word_count == 1)
--- installdir/rundig.sortbug   Thu Feb 24 20:29:12 2000
+++ installdir/rundig   Fri Feb 23 11:43:19 2001
@@ -34,7 +34,7 @@ TMPDIR=$DBDIR
 export TMPDIR
 
 $BINDIR/htdig -i $opts $stats $alt
-$BINDIR/htmerge $opts $stats $alt
+LC_COLLATE=C $BINDIR/htmerge $opts $stats $alt
 case "$alt" in
 -a)
   ( cd $DBDIR && test -f db.docdb.work &&


-- 
Gilles R. Detillieux              E-mail: <[EMAIL PROTECTED]>
Spinal Cord Research Centre       WWW:    http://www.scrc.umanitoba.ca/~grdetil
Dept. Physiology, U. of Manitoba  Phone:  (204)789-3766
Winnipeg, MB  R3E 3J7  (Canada)   Fax:    (204)789-3930

_______________________________________________
htdig-general mailing list <[EMAIL PROTECTED]>
To unsubscribe, send a message to <[EMAIL PROTECTED]> with a 
subject of unsubscribe
FAQ: http://htdig.sourceforge.net/FAQ.html
[htdig] PATCH: htmerge bug fixes for 3.1.5

Reply via email to