I've made a quick little modification to allow me to translate accented characters down to their low-ASCII equivalents. Basically, it just adds a second word to the database with all low-ASCII characters. This way I can still let my users search for accents if they want, but they aren't forced to enter them. The translation is also hard-coded right now :( I was thinking of adding a config file parameter such as "translation_table: 123-162=a 183-184=d" etc. if this feature is useful, so that others could adapt it without actually changing the code =) The only problem with this is that htsearch can't highlight the word searched for, because the accents are no longer there. I just use "no_excerpt_show_top: true" in my config file to prevent a bunch of "Search word not found in top of document." messages. Not a big deal for me, because the important thing is that it actually finds relevant documents. I'm by no means a very good C++ coder, so any suggestions would be great. Is there a better file for me to be modifying instead of htdig/Retriever.cc? Thanks for your time, Alex Chan
*** Retriever.cc Wed Jun 30 16:34:02 1999 --- ../../htdig-3.1.2/htdig/Retriever.cc Wed Apr 21 22:47:57 1999 *************** Retriever::Retriever(RetrieverLog flags) *** 83,121 **** fclose(urls_parsed); } unlink(filelog); } - - - // Create the lookup table - if (config.Boolean("translateaccents", 0)) { - cout << "setting up transtable: "<<endl; - for ( unsigned char i = 0; i < 255 ; i++) - { - transtable[i] = i; - } - // Specific translation range - // htdig is case insensitive but upper case - // is included just in case - TableEntry(224, 230, 'a'); - TableEntry(192, 198, 'A'); - TableEntry(231, 231, 'c'); - TableEntry(199, 199, 'C'); - TableEntry(232, 235, 'e'); - TableEntry(200, 203, 'E'); - TableEntry(236, 239, 'i'); - TableEntry(204, 207, 'I'); - TableEntry(236, 239, 'i'); - TableEntry(241, 241, 'n'); - TableEntry(209, 209, 'N'); - TableEntry(242, 246, 'o'); - TableEntry(210, 214, 'O'); - TableEntry(249, 252, 'u'); - TableEntry(217, 220, 'U'); - TableEntry(253, 255, 'y'); - TableEntry(221, 221, 'Y'); - } - } //***************************************************************************** --- 83,90 ---- *************** Retriever::~Retriever() *** 125,161 **** { delete doc; } - //******************************************************* - // Retriever::TableEntry(int start, int finish, unsigned char letter) - // Enters an alternate value into the tranlsation table - void - Retriever::TableEntry(int start, int finish, unsigned char letter) - { - for (int i=start; i <= finish; i++) - { - transtable[i] = letter; - } - } - - int - Retriever::Translate(char* w) - { - unsigned char* word = w; // Change the sign in order to ease table lookup - int HighAsciiFound = false; - while (*word) - { - if ( *word > 127 ) - { - *word = transtable[*word]; - HighAsciiFound = true; - } - word++; - } - return HighAsciiFound; - } - //***************************************************************************** // void Retriever::setUsernamePassword(char *credentials) // --- 94,101 ---- 
*************** Retriever::GetRef(char *u) *** 928,936 **** // void Retriever::got_word(char *word, int location, int heading) { - static bool translateaccents = config.Boolean("translateaccents", 0); if (debug > 3) cout << "word: " << word << '@' << location << endl; if (heading > 11 || heading < 0) // Current limits for headings heading = 0; // Assume it's just normal text --- 868,875 ---- *************** Retriever::got_word(char *word, int loca *** 938,956 **** { String w = word; HtStripPunctuation(w); if (w.length() >= minimumWordLength) - { words.Word(w, location, current_anchor_number, factor[heading]); - - if (translateaccents){ - if (Translate(w)) { - // Add the word in again with accents translated down - words.Word(w, location, current_anchor_number, factor[heading]); - } - - } - } } } --- 877,885 ----
*** Retriever.h Wed Jun 30 14:57:23 1999 --- ../../htdig-3.1.2/htdig/Retriever.h Wed Apr 21 22:47:57 1999 *************** private: *** 123,136 **** void RetrievedDocument(Document &, char *url, DocumentRef *ref); void parse_url(URLRef &urlRef); void got_redirect(char *, DocumentRef *); void recordNotFound(char *url, char *referer, int reason); - - - - int Translate( char* word); - void TableEntry(int start, int finish, unsigned char letter); - unsigned char transtable[255]; }; #endif --- 123,130 ----