Hi,

   I just finished my "accents" fuzzy class.  I don't think I'll go
further at this point. It only handles French because the translation table
is hardcoded, but it would be pretty easy to make it file-based, I suppose.

Now that I have done it, I must agree with you: it is better this way than
to go the "case" way, if only because it is easier to migrate from one
release to the other :-)! It is also easy to activate or deactivate
on a "per search" basis with the new build_select_list parameter.

So thanks again for your support, both you and Geoff.

My patches for release 3.1.5 are included at the end of the post.  I have
not updated the documentation for now.


Patches for the "accents" fuzzy algorithm.

htcommon directory:

diff defaults.cc.orig defaults.cc
29a30
>     {"accents_db",                      "${database_base}.accents.db"},
=========================

htfuzzy directory:

diff Fuzzy.cc.orig Fuzzy.cc
15a16
> #include "Accents.h"
173a175,176
>     else if (mystrcasecmp(name, "accents") == 0)
>       return new Accents();
=========================
diff htfuzzy.cc.orig htfuzzy.cc
45a46
> #include "Accents.h"
110a112,115
>       else if (mystrcasecmp(av[i], "accents") == 0)
>       {
>           wordAlgorithms.Add(new Accents);
>       }
239a245
>     cout << "\taccents\n";
=========================
diff Makefile.in.orig Makefile.in
13c13
<               Substring.o Prefix.o
---
>               Substring.o Prefix.o Accents.o
17c17
<               Substring.o Prefix.o
---
>               Substring.o Prefix.o Accents.o
==========================

cat Accents.h
//
// Accents.h
//
// $Id: $
//
//
#ifndef _Accents_h_
#define _Accents_h_

#include "Fuzzy.h"

// Fuzzy algorithm class derived from Fuzzy.  It declares a constructor,
// a virtual destructor and three virtual operations; all of them are
// defined in Accents.cc.  The class adds no data members of its own.
class Accents : public Fuzzy
{
public:
    // Construction/Destruction
    Accents();
    virtual ~Accents();

    // Store the collected key/word-list pairs in the database selected
    // through the configuration.
    virtual int writeDB(Configuration &config);

    // Compute the lookup key for word and place it in key.
    virtual void generateKey(char *word, String &key);

    // Remember word under the key generateKey() produces for it.
    virtual void addWord(char *word);
};

#endif
===================

cat Accents.cc
//
// Accents.cc
//
// Implementation of Accents
//
//
//
#if RELEASE
static char RCSid[] = "$Id: $";
#endif

#include "Configuration.h"
#include "htconfig.h"
#include "Accents.h"
#include "Dictionary.h"
#include <ctype.h>
#include <fstream.h>

extern int debug;

/*---------------------------------------------------------------.
| Added by Robert Marchand to allow proper handling of ISO-LATIN |
| text (taken from Pierre Rosa's code).                          |
`---------------------------------------------------------------*/

/*----------------------------------------------------------------------.
| ISO Latin-1 folding table: lower-cased and with accents removed.      |
|                                                                       |
| Entry i is the replacement for byte value i (0..255), so the table    |
| must be indexed with an unsigned byte value.  ASCII upper-case        |
| letters map to lower case, accented letters in 0xC0..0xFD map to      |
| their unaccented lower-case base letter, and every other byte maps    |
| to itself.                                                            |
|                                                                       |
| Bytes >= 0x80 that map to themselves are written as hex-escaped char  |
| literals: an int constant such as 128 does not fit a signed char and  |
| is rejected as a narrowing conversion by C++11 and later.             |
|                                                                       |
| Entry 0xA9 used to hold 168 (a typo that folded the copyright sign    |
| onto the diaeresis); it now maps to itself like its neighbours.       |
|                                                                       |
| NOTE(review): 0xD7 (multiplication sign) and 0xF7 (division sign)     |
| fold to 'o', and 0xFF (y with diaeresis) is left unfolded, exactly    |
| as in the original table -- confirm whether that is intended.         |
`----------------------------------------------------------------------*/

static char MinusculeISOLAT1[256] = {
     0,   1,   2,   3,   4,   5,   6,   7,
     8,   9,  10,  11,  12,  13,  14,  15,
    16,  17,  18,  19,  20,  21,  22,  23,
    24,  25,  26,  27,  28,  29,  30,  31,
    32,  33,  34,  35,  36,  37,  38,  39,
    40,  41,  42,  43,  44,  45,  46,  47,
    48,  49,  50,  51,  52,  53,  54,  55,
    56,  57,  58,  59,  60,  61,  62,  63,
    64, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
   'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
   'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
   'x', 'y', 'z',  91,  92,  93,  94,  95,
    96, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
   'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
   'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
   'x', 'y', 'z', 123, 124, 125, 126, 127,
   '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',
   '\x88', '\x89', '\x8A', '\x8B', '\x8C', '\x8D', '\x8E', '\x8F',
   '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',
   '\x98', '\x99', '\x9A', '\x9B', '\x9C', '\x9D', '\x9E', '\x9F',
   '\xA0', '\xA1', '\xA2', '\xA3', '\xA4', '\xA5', '\xA6', '\xA7',
   '\xA8', '\xA9', '\xAA', '\xAB', '\xAC', '\xAD', '\xAE', '\xAF',
   '\xB0', '\xB1', '\xB2', '\xB3', '\xB4', '\xB5', '\xB6', '\xB7',
   '\xB8', '\xB9', '\xBA', '\xBB', '\xBC', '\xBD', '\xBE', '\xBF',
   'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c',
   'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
   '\xD0', 'n', 'o', 'o', 'o', 'o', 'o', 'o',
   'o', 'u', 'u', 'u', 'u', 'y', '\xDE', '\xDF',
   'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c',
   'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
   '\xF0', 'n', 'o', 'o', 'o', 'o', 'o', 'o',
   'o', 'u', 'u', 'u', 'u', 'y', '\xFE', '\xFF'};
  

//*****************************************************************************
// Accents::Accents()
//   Sets the algorithm name to "accents".  writeDB() appends "_db" to this
//   name to form the configuration attribute it looks up.
//
Accents::Accents()
{
    name = "accents";
}


//*****************************************************************************
// Accents::~Accents()
//   Nothing to release here: the class declares no data members of its own.
//
Accents::~Accents()
{
}

//*****************************************************************************
// int Accents::writeDB(Configuration &config)
//   Stores the collected (key, word list) pairs in the database whose file
//   name is the value of the "<name>_db" configuration attribute.  A pair is
//   skipped when its key equals its word list (compared case-insensitively).
//
//   Returns NOTOK if the database cannot be opened for read/write, OK
//   otherwise.
//
//   dict is only allocated by addWord(); when no word was ever added it is
//   still null, and that case is handled like an empty dictionary instead
//   of dereferencing the null pointer.
//
int
Accents::writeDB(Configuration &config)
{
    // Configuration attribute holding the database file name: "<name>_db".
    String      var = name;
    var << "_db";
    String      filename = config[var];

    index = Database::getDatabaseInstance();
    if (index->OpenReadWrite(filename, 0664) == NOTOK)
        return NOTOK;

    int         count = 0;

    if (dict)
    {
        String  *s;
        char    *fuzzyKey;

        dict->Start_Get();
        while ((fuzzyKey = dict->Get_Next()))
        {
            s = (String *) dict->Find(fuzzyKey);

            // Only add a meaningful list: skip entries whose word list is
            // just the key itself.
            if (mystrcasecmp(fuzzyKey, s->get()) == 0)
                continue;

            index->Put(fuzzyKey, *s);

            if (debug > 1)
            {
                cout << "htfuzzy: '" << fuzzyKey << "' ==> '" << s->get() << "'\n";
            }
            count++;
            // Progress report every 100 stored keys, at debug level 1 only.
            if ((count % 100) == 0 && debug == 1)
            {
                cout << "htfuzzy: keys: " << count << '\n';
                cout.flush();
            }
        }
    }
    if (debug == 1)
    {
        cout << "htfuzzy:Total keys: " << count << "\n";
    }
    return OK;
}


//*****************************************************************************
// void Accents::generateKey(char *word, String &key)
//   Builds the key for word by appending, for each byte of word, the
//   corresponding entry of the MinusculeISOLAT1 table.
//
//   word: NUL-terminated string; a null or empty word leaves key untouched.
//   key:  receives the generated key.
//
void
Accents::generateKey(char *word, String &key)
{

    if (!word || !*word)
      return;

    // Reset key exactly as the original did.
    // NOTE(review): which String assignment overload a char argument selects
    // is not visible here -- confirm what value key holds after this line.
    key = '0';
    while (*word) {
      // The table has 256 entries indexed by byte value.  Plain char may be
      // signed, in which case an accented byte (>= 0x80) would be a negative,
      // out-of-bounds index; convert to unsigned char before indexing.
      key << MinusculeISOLAT1[ static_cast<unsigned char>(*word++) ];
    }
}


//*****************************************************************************
// void Accents::addWord(char *word)
//   Records word under the key generateKey() produces for it.  Words that
//   share a key are kept in one space-separated String.  No duplicate check
//   is made: adding the same word twice lists it twice.
//
//   A null or empty word is ignored: generateKey() produces no key for it,
//   so it would otherwise be stored under an empty key.
//
void
Accents::addWord(char *word)
{
    // The dictionary is created on first use.
    if (!dict)
    {
        dict = new Dictionary;
    }

    if (!word || !*word)
        return;

    String      key;
    generateKey(word, key);

    String      *s = (String *) dict->Find(key);
    if (s)
    {
        // Key already known: append this word to its list.
        (*s) << ' ' << word;
    }
    else
    {
        // First word seen for this key: start a new list.
        dict->Add(key, new String(word));
    }
}

==========================


-------
Robert Marchand                 tél: 343-6111 poste 5210
DiTER-SDI                       e-mail: [EMAIL PROTECTED]
Université de Montréal          Montréal, Canada
------------------------------------
To unsubscribe from the htdig mailing list, send a message to
[EMAIL PROTECTED]
You will receive a message to confirm this.

Reply via email to