Author: bhofmann
Date: Thu Aug 19 09:06:42 2010
New Revision: 987091

URL: http://svn.apache.org/viewvc?rev=987091&view=rev
Log:
Fix: BasicRemoteContentFetcher does not handle non UTF-8 encoded content 
correctly

This patch fixes the bug originally reported by Justin Wyllie.

http://codereview.appspot.com/1952044/

Modified:
    shindig/trunk/php/src/common/sample/BasicRemoteContentFetcher.php

Modified: shindig/trunk/php/src/common/sample/BasicRemoteContentFetcher.php
URL: 
http://svn.apache.org/viewvc/shindig/trunk/php/src/common/sample/BasicRemoteContentFetcher.php?rev=987091&r1=987090&r2=987091&view=diff
==============================================================================
--- shindig/trunk/php/src/common/sample/BasicRemoteContentFetcher.php (original)
+++ shindig/trunk/php/src/common/sample/BasicRemoteContentFetcher.php Thu Aug 
19 09:06:42 2010
@@ -91,23 +91,34 @@ class BasicRemoteContentFetcher extends 
     // Attempt to magically convert all text'ish responses to UTF8, especially 
the xml and json parsers get upset if invalid UTF8 is encountered
     $textTypes = array('text', 'html', 'json', 'xml', 'atom');
     $isTextType = false;
+    $isXml = false;
     foreach ($textTypes as $textType) {
-       if (strpos($contentType, $textType) !== false) {
-               $isTextType = true;
-               break;
-       }
+      if (strpos($contentType, $textType) !== false) {
+        if ($textType === 'xml') {
+          $isXml = true;
+        }
+        $isTextType = true;
+       break;
+      }
     }
     if ($isTextType && function_exists('mb_convert_encoding')) {
-      if (0 == preg_match("/charset\s*=\s*([^\"' >]*)/ix",$content, $charset)) 
{
+      // try to retrieve content type out of
+      if (0 == preg_match("/charset\s*=\s*([^\"' >]*)/ix",$content, $charset) 
&& //http header or html meta tags
+          0 == preg_match("/encoding\s*=\s*[\'\"]([^\"' >]*)/ix",$content, 
$charset)) { //xml declaration
         $charset = 'UTF-8';
       } else {
-                       $charset = trim($charset[1]);
-                       if (($pos = strpos($charset, "\n")) !== false) {
-                         $charset = trim(substr($charset, 0, $pos));
-                       }
+               $charset = trim($charset[1]);
+               if (($pos = strpos($charset, "\n")) !== false) {
+                 $charset = trim(substr($charset, 0, $pos));
                }
-               // the xml and json parsers get very upset if there are invalid 
UTF8 sequences in the string, by recoding it any bad chars will be filtered out
+         }
+         // the xml and json parsers get very upset if there are invalid UTF8 
sequences in the string, by recoding it any bad chars will be filtered out
       $content = mb_convert_encoding($content, 'UTF-8', $charset);
+      // if original charset is not utf-8 we now try to rewrite any xml 
declarations
+      if ($isXml === true && strtoupper($charset) !== 'UTF-8') {
+        $pattern =  'encoding=\s*([\'"])' . $charset . '\s*\1';
+        $content = mb_ereg_replace($pattern, 'encoding="UTF-8"', $content, 
"i");
+      }
        }
     // on redirects and such we get multiple headers back from curl it seems, 
we really only want the last one
     while (substr($content, 0, strlen('HTTP')) == 'HTTP' && strpos($content, 
"\r\n\r\n") !== false) {
@@ -131,7 +142,7 @@ class BasicRemoteContentFetcher extends 
     if (curl_errno($request->handle)) {
       $httpCode = '500';
       $body = 'Curl error: ' . curl_error($request->handle);
-    }   
+    }
 
     $request->setHttpCode($httpCode);
     $request->setHttpCodeMsg($this->resolveHttpCode($httpCode));


Reply via email to