Hi, I recently had the following problem: Due to the use of a CMS some of our pages are now UTF-8 encoded. Since we are a German university, our pages may contain German umlauts ;-) I use ht://Dig to index all servers on the campus. The problem is/was that we cannot find words with umlauts on those UTF-8 pages.
First workaround: add accept-charset="ISO-8859-1" to the ht://Dig search form. Now we can find words with umlauts on old (non-UTF-8) pages but not on the new (UTF-8) pages. Attached you'll find a patch that does a simple UTF-8 to 8-bit ASCII (ISO-8859-1) conversion. All non-convertible characters are mapped to a question mark (?). ReadBody may not be the best place to add this code (and it should be added to ReadChunkedBody as well), but it was the easiest way to achieve my goal. One may give me a hint for a better place :-) Comments welcome .... Andreas -- ! Andreas Jobs Network Operating Center ! ! Ruhr-Universitaet Bochum ! ! The only way to clean a compromised system is to flatten and rebuild. !
diff -ur htdig-3.2.0b6.orig/htnet/HtHTTP.cc htdig-3.2.0b6/htnet/HtHTTP.cc --- htdig-3.2.0b6.orig/htnet/HtHTTP.cc 2004-05-28 15:15:23.000000000 +0200 +++ htdig-3.2.0b6/htnet/HtHTTP.cc 2005-04-27 23:26:16.000000000 +0200 @@ -643,6 +643,8 @@ String line = 0; int inHeader = 1; + _needUTF8Convert = 0; + if (_response._modification_time) { delete _response._modification_time; @@ -731,7 +733,15 @@ token = strtok(token, "\n\t"); if (token && *token) + { _response._content_type = token; + if ((_response._content_type.indexOf("text/html") != -1) && (_response._content_type.indexOf("UTF-8") != -1)) + { + if ( debug > 4 ) + cout << "needUTF8Convert flagged" << endl; + _needUTF8Convert = 1; + } + } } else if( ! mystrncasecmp((char*)line, "content-length:", 15)) @@ -970,6 +980,31 @@ } + if ( _needUTF8Convert ) + { + if ( debug > 4 ) + cout << "Converting UTF-8 characters" << endl; + + char *srcPtr, *dstPtr; + srcPtr = dstPtr = _response._contents.get(); + while ( *srcPtr ) + { + if ( ( *srcPtr & 0x80 ) == 0 ) + *dstPtr++ = *srcPtr++; + else if ( ( *srcPtr & 0xE0 ) == 0xC0 ) { + *dstPtr++ = (((*srcPtr & 0x03) << 6) | (*(srcPtr+1) & 0x3F) ) & 0xFF; + srcPtr += 2; + } else if ( ( *srcPtr & 0xF0 ) == 0xE0 ) { + *dstPtr++ = '?'; + srcPtr += 3; + } else { + *dstPtr++ = '?'; + srcPtr += 4; + } + } + *dstPtr = 0; + } + // Set document length _response._document_length = _response._contents.length(); diff -ur htdig-3.2.0b6.orig/htnet/HtHTTP.h htdig-3.2.0b6/htnet/HtHTTP.h --- htdig-3.2.0b6.orig/htnet/HtHTTP.h 2004-05-28 15:15:23.000000000 +0200 +++ htdig-3.2.0b6/htnet/HtHTTP.h 2005-04-27 23:25:43.000000000 +0200 @@ -316,6 +316,7 @@ int _bytes_read; // Bytes read URL _url; // URL to retrieve URL _referer; // Referring URL + int _needUTF8Convert; // Flag for simple UTF-8 convert String _accept_language; // accept-language directive
pgpV2Xg5W2VfY.pgp
Description: PGP signature