Author: bhofmann Date: Thu Aug 19 09:06:42 2010 New Revision: 987091 URL: http://svn.apache.org/viewvc?rev=987091&view=rev Log: Fix: BasicRemoteContentFetcher does not handle non-UTF-8-encoded content correctly
This patch fixes original bug report from Justin Wyllie http://codereview.appspot.com/1952044/ Modified: shindig/trunk/php/src/common/sample/BasicRemoteContentFetcher.php Modified: shindig/trunk/php/src/common/sample/BasicRemoteContentFetcher.php URL: http://svn.apache.org/viewvc/shindig/trunk/php/src/common/sample/BasicRemoteContentFetcher.php?rev=987091&r1=987090&r2=987091&view=diff ============================================================================== --- shindig/trunk/php/src/common/sample/BasicRemoteContentFetcher.php (original) +++ shindig/trunk/php/src/common/sample/BasicRemoteContentFetcher.php Thu Aug 19 09:06:42 2010 @@ -91,23 +91,34 @@ class BasicRemoteContentFetcher extends // Attempt to magically convert all text'ish responses to UTF8, especially the xml and json parsers get upset if invalid UTF8 is encountered $textTypes = array('text', 'html', 'json', 'xml', 'atom'); $isTextType = false; + $isXml = false; foreach ($textTypes as $textType) { - if (strpos($contentType, $textType) !== false) { - $isTextType = true; - break; - } + if (strpos($contentType, $textType) !== false) { + if ($textType === 'xml') { + $isXml = true; + } + $isTextType = true; + break; + } } if ($isTextType && function_exists('mb_convert_encoding')) { - if (0 == preg_match("/charset\s*=\s*([^\"' >]*)/ix",$content, $charset)) { + // try to retrieve content type out of + if (0 == preg_match("/charset\s*=\s*([^\"' >]*)/ix",$content, $charset) && //http header or html meta tags + 0 == preg_match("/encoding\s*=\s*[\'\"]([^\"' >]*)/ix",$content, $charset)) { //xml declaration $charset = 'UTF-8'; } else { - $charset = trim($charset[1]); - if (($pos = strpos($charset, "\n")) !== false) { - $charset = trim(substr($charset, 0, $pos)); - } + $charset = trim($charset[1]); + if (($pos = strpos($charset, "\n")) !== false) { + $charset = trim(substr($charset, 0, $pos)); } - // the xml and json parsers get very upset if there are invalid UTF8 sequences in the string, by recoding it any bad 
chars will be filtered out + } + // the xml and json parsers get very upset if there are invalid UTF8 sequences in the string, by recoding it any bad chars will be filtered out $content = mb_convert_encoding($content, 'UTF-8', $charset); + // if original charset is not utf-8 we now try to rewrite any xml declarations + if ($isXml === true && strtoupper($charset) !== 'UTF-8') { + $pattern = 'encoding=\s*([\'"])' . $charset . '\s*\1'; + $content = mb_ereg_replace($pattern, 'encoding="UTF-8"', $content, "i"); + } } // on redirects and such we get multiple headers back from curl it seems, we really only want the last one while (substr($content, 0, strlen('HTTP')) == 'HTTP' && strpos($content, "\r\n\r\n") !== false) { @@ -131,7 +142,7 @@ class BasicRemoteContentFetcher extends if (curl_errno($request->handle)) { $httpCode = '500'; $body = 'Curl error: ' . curl_error($request->handle); - } + } $request->setHttpCode($httpCode); $request->setHttpCodeMsg($this->resolveHttpCode($httpCode));
