[htdig-dev] Simple UTF-8 support patch

Andreas Jobs Thu, 28 Apr 2005 17:42:47 -0700

Hi,

I recently had the following problem: Due to the use of a CMS some of our pages
are now UTF-8 encoded. Since we are a German university, our pages may contain
German umlauts ;-) I use ht://Dig to index all servers on the campus. The
problem is/was that we cannot find words with umlauts on those UTF-8 pages.


First workaround: add accept-charset="ISO-8859-1" to the ht://Dig search form.
Now we can find words with umlauts on the old (non-UTF-8) pages but not on the
new (UTF-8) pages.

Attached you'll find a patch that does a simple UTF-8 to 8-bit ASCII (i.e.
ISO-8859-1) conversion. All non-convertible characters are mapped to a
question mark (?).

ReadBody may not be the best place to add this code (and it should be added to
ReadChunkedBody as well), but it was the easiest way to achieve my goal.
Perhaps someone can give me a hint for a better place :-)

Comments welcome ....

Andreas

-- 
! Andreas Jobs                                 Network Operating Center !
!                                              Ruhr-Universitaet Bochum !
! The only way to clean a compromised system is to flatten and rebuild. !

diff -ur htdig-3.2.0b6.orig/htnet/HtHTTP.cc htdig-3.2.0b6/htnet/HtHTTP.cc
--- htdig-3.2.0b6.orig/htnet/HtHTTP.cc  2004-05-28 15:15:23.000000000 +0200
+++ htdig-3.2.0b6/htnet/HtHTTP.cc       2005-04-27 23:26:16.000000000 +0200
@@ -643,6 +643,8 @@
     String     line = 0;
     int                inHeader = 1;
 
+    _needUTF8Convert = 0;
+
     if (_response._modification_time)
     {
        delete _response._modification_time;
@@ -731,7 +733,15 @@
             token = strtok(token, "\n\t");
 
             if (token && *token)
+            {
                _response._content_type = token;
+               if ((_response._content_type.indexOf("text/html") != -1) && (_response._content_type.indexOf("UTF-8") != -1))
+               {
+                  if ( debug > 4 )
+                     cout << "needUTF8Convert flagged" << endl;
+                  _needUTF8Convert = 1;
+               }
+            }
 
          }
          else if( ! mystrncasecmp((char*)line, "content-length:", 15))
@@ -970,6 +980,31 @@
 
     }
 
+    if ( _needUTF8Convert )
+    {
+        if ( debug > 4 )
+            cout << "Converting UTF-8 characters" << endl;
+
+        char *srcPtr, *dstPtr;
+        srcPtr = dstPtr = _response._contents.get();
+        while ( *srcPtr )
+        {
+            if ( ( *srcPtr & 0x80 ) == 0 )
+                *dstPtr++ = *srcPtr++;
+            else if ( ( *srcPtr & 0xE0 ) == 0xC0 ) {
+                *dstPtr++ = (((*srcPtr & 0x03) << 6) | (*(srcPtr+1) & 0x3F)  ) & 0xFF;
+                srcPtr += 2;
+            } else if ( ( *srcPtr & 0xF0 ) == 0xE0 ) {
+                *dstPtr++ = '?';
+                srcPtr += 3;
+            } else {
+                *dstPtr++ = '?';
+                srcPtr += 4;
+            }
+        }
+        *dstPtr = 0;
+    }
+
     // Set document length
     _response._document_length = _response._contents.length();
 
diff -ur htdig-3.2.0b6.orig/htnet/HtHTTP.h htdig-3.2.0b6/htnet/HtHTTP.h
--- htdig-3.2.0b6.orig/htnet/HtHTTP.h   2004-05-28 15:15:23.000000000 +0200
+++ htdig-3.2.0b6/htnet/HtHTTP.h        2005-04-27 23:25:43.000000000 +0200
@@ -316,6 +316,7 @@
    int         _bytes_read;        // Bytes read
    URL         _url;               // URL to retrieve
    URL         _referer;           // Referring URL
+   int         _needUTF8Convert;   // Flag for simple UTF-8 convert
 
    String      _accept_language;    // accept-language directive

pgpV2Xg5W2VfY.pgp
Description: PGP signature

[htdig-dev] Simple UTF-8 support patch

Reply via email to