I've made a quick little modification that lets me translate accented
characters down to their low-ASCII equivalents.  Basically, it just adds a
second word to the database with all low-ASCII characters.  This way I
can still let my users search for accents if they want, but they aren't
forced to enter them.

The translation is also hard-coded right now :( I was thinking of adding a
config file parameter such as "translation_table: 123-162=a 183-184=d" etc.
if this feature is useful, so that others could adapt it without actually
changing the code =)

The only problem with this is that htsearch can't highlight the word
searched for, because the accents are no longer there.  I just use
"no_excerpt_show_top: true" in my config file to prevent a bunch of
"Search word not found in top of document." messages.  Not a big deal for
me, because the important thing is that it actually finds relevant documents.

I'm by no means a very good C++ coder, so any suggestions would be great.
Is there a better file for me to modify instead of htdig/Retriever?

Thanks for your time,

Alex Chan


*** Retriever.cc        Wed Jun 30 16:34:02 1999
--- ../../htdig-3.1.2/htdig/Retriever.cc        Wed Apr 21 22:47:57 1999
*************** Retriever::Retriever(RetrieverLog flags)
*** 83,121 ****
              fclose(urls_parsed);
        }
          unlink(filelog);
      }
- 
-       
-     // Create the lookup table      
-       if  (config.Boolean("translateaccents", 0)) {
-           cout << "setting up transtable: "<<endl;
-           for (  unsigned char i = 0; i < 255 ; i++)
-           {
-                   transtable[i] = i;
-           }
-           // Specific translation range
-           // htdig is case insensitive but upper case 
-           // is included just in case
-           TableEntry(224, 230, 'a');
-           TableEntry(192, 198, 'A');
-           TableEntry(231, 231, 'c');
-           TableEntry(199, 199, 'C');
-           TableEntry(232, 235, 'e');
-           TableEntry(200, 203, 'E');
-           TableEntry(236, 239, 'i');
-           TableEntry(204, 207, 'I');
-           TableEntry(236, 239, 'i');
-           TableEntry(241, 241, 'n');
-           TableEntry(209, 209, 'N');
-           TableEntry(242, 246, 'o');
-           TableEntry(210, 214, 'O');
-           TableEntry(249, 252, 'u');
-           TableEntry(217, 220, 'U');
-           TableEntry(253, 255, 'y');
-           TableEntry(221, 221, 'Y');
-       }
-               
  }
  
  
  //*****************************************************************************
--- 83,90 ----
*************** Retriever::~Retriever()
*** 125,161 ****
  {
      delete doc;
  }
  
- //*******************************************************
- // void Retriever::TableEntry(int start, int finish, unsigned char letter)
- //   Maps every character code in the inclusive range [start, finish]
- //   to `letter' in the translation table.  Codes that fall outside
- //   the table are skipped rather than written past its end.
- //
- void
- Retriever::TableEntry(int start, int finish, unsigned char letter)
- {
-     // transtable holds unsigned chars, so sizeof gives its entry count
-     const int tableSize = (int) sizeof(transtable);
-     if (start < 0)
-         start = 0;
-     if (finish >= tableSize)
-         finish = tableSize - 1;
-     for (int i = start; i <= finish; i++)
-         transtable[i] = letter;
- }
- 
- //*******************************************************
- // int Retriever::Translate(char *w)
- //   Rewrites, in place, every byte of `w' above 127 with its entry in
- //   the translation table.  Returns true (non-zero) if any such byte
- //   was found, false (0) otherwise (including when `w' is null).
- //
- int
- Retriever::Translate(char* w)
- {
-     if (!w)
-         return false;
-     // View the bytes as unsigned so values above 127 index the table
-     // directly; char* does not convert to unsigned char* implicitly.
-     unsigned char* word = reinterpret_cast<unsigned char*>(w);
-     int HighAsciiFound = false;
-     for (; *word; word++)
-     {
-         if (*word > 127)
-         {
-             // *word can be as large as 255, so the table needs 256 entries
-             *word = transtable[*word];
-             HighAsciiFound = true;
-         }
-     }
-     return HighAsciiFound;
- }
- 
  
  //*****************************************************************************
  // void Retriever::setUsernamePassword(char *credentials)
  //
--- 94,101 ----
*************** Retriever::GetRef(char *u)
*** 928,936 ****
  //
  void
  Retriever::got_word(char *word, int location, int heading)
  {
-     static bool translateaccents = config.Boolean("translateaccents", 0);
      if (debug > 3)
        cout << "word: " << word << '@' << location << endl;
      if (heading > 11 || heading < 0) // Current limits for headings
        heading = 0;  // Assume it's just normal text
--- 868,875 ----
*************** Retriever::got_word(char *word, int loca
*** 938,956 ****
      {
        String w = word;
        HtStripPunctuation(w);
        if (w.length() >= minimumWordLength)
-       { 
        words.Word(w, location, current_anchor_number, factor[heading]);
- 
-       if (translateaccents){
-               if (Translate(w)) {
-                       // Add the word in again with accents translated down 
-                       words.Word(w, location, current_anchor_number, 
factor[heading]);
-               } 
- 
-       }
-       }
      }
  }
  
  
--- 877,885 ----
*** Retriever.h Wed Jun 30 14:57:23 1999
--- ../../htdig-3.1.2/htdig/Retriever.h Wed Apr 21 22:47:57 1999
*************** private:
*** 123,136 ****
      void              RetrievedDocument(Document &, char *url, DocumentRef *ref);
      void              parse_url(URLRef &urlRef);
      void              got_redirect(char *, DocumentRef *);
      void              recordNotFound(char *url, char *referer, int reason);
- 
- 
-  
-     // Accent translation: Translate() rewrites a word's characters
-     // above 127 in place through transtable; TableEntry() fills a range
-     // of transtable.  The table has one entry per byte value (0-255).
-     int               Translate( char* word);
-     void              TableEntry(int start, int finish, unsigned char letter);
-     unsigned char     transtable[256];
  };
  
  #endif
  
--- 123,130 ----

Reply via email to