Author: Alexandru Stanoi Date: 2007-04-25 17:01:30 +0200 (Wed, 25 Apr 2007) New Revision: 4921
Log: - Implemented feature request #9785: Allow developers to specify their own character conversion function to UTF-8. Also fixed issue #8369 as developers can ignore the notices thrown by iconv in their own conversion function. Modified: trunk/Mail/ChangeLog trunk/Mail/src/internal/charset_convert.php trunk/Mail/tests/parser/parser_test.php Modified: trunk/Mail/ChangeLog =================================================================== --- trunk/Mail/ChangeLog 2007-04-25 14:54:30 UTC (rev 4920) +++ trunk/Mail/ChangeLog 2007-04-25 15:01:30 UTC (rev 4921) @@ -32,6 +32,9 @@ Documentation enhancement - Fixed issue #10656: Parsing of incomplete multipart/related mails does not trigger a notice anymore. +- Implemented feature request #9785: Allow developers to specify their own + character conversion function to UTF-8. Also fixed issue #8369 as developers + can ignore the notices thrown by iconv in their own conversion function. 1.2.1 - [RELEASEDATE] @@ -58,8 +61,6 @@ ezcMailFileParser. - Fixed issue #10396: Method convertToUTF8 assumes 'latin1' charset instead of 'unknown-8bit' and 'x-user-defined'. -- Fixed issue #8369: Ignoring notices thrown by iconv() in the convertToUTF8() - method. 1.2 - Monday 18 December 2006 Modified: trunk/Mail/src/internal/charset_convert.php =================================================================== --- trunk/Mail/src/internal/charset_convert.php 2007-04-25 14:54:30 UTC (rev 4920) +++ trunk/Mail/src/internal/charset_convert.php 2007-04-25 15:01:30 UTC (rev 4921) @@ -1,26 +1,115 @@ <?php /** - * File containing the ezcMailCharsetConverter + * File containing the ezcMailCharsetConverter class. * * @package Mail - * @version //autogentag// - * @copyright Copyright (C) 2005, 2006 eZ systems as. All rights reserved. + * @version //autogen// + * @copyright Copyright (C) 2005-2007 eZ systems as. All rights reserved. 
* @license http://ez.no/licenses/new_bsd New BSD License - * @access private */ /** - * Small internal class for common character set conversion methods inside Mail. + * Class containing common character set conversion methods. * + * By calling the static function ezcMailCharsetConverter::setConvertMethod() + * before doing mail parsing, another callback function can be used for + * character conversion to UTF-8 in place of the normal iconv() conversion. + * + * The callback function must have this signature: + * <code> + * public static function function_name( $text, $originalCharset ); + * </code> + * + * where: + * - $text = string to convert to UTF-8 + * - $originalCharset = in what charset is $text + * + * Example: + * <code> + * // specify another function for character set conversion + * ezcMailCharsetConverter::setConvertMethod( array( 'myConverter', 'convertToUTF8IconvIgnore' ) ); + * + * // ...code for mail parsing... + * </code> + * + * where myConverter is (along with some other examples of charset conversion + * functions which can be used): + * <code> + * class myConverter + * { + * public static function convertToUTF8IconvIgnore( $text, $originalCharset ) + * { + * if ( $originalCharset === 'unknown-8bit' || $originalCharset === 'x-user-defined' ) + * { + * $originalCharset = "latin1"; + * } + * return iconv( $originalCharset, 'utf-8//IGNORE', $text ); + * } + * + * public static function convertToUTF8IconvTranslit( $text, $originalCharset ) + * { + * if ( $originalCharset === 'unknown-8bit' || $originalCharset === 'x-user-defined' ) + * { + * $originalCharset = "latin1"; + * } + * return iconv( $originalCharset, 'utf-8//TRANSLIT', $text ); + * } + * + * public static function convertToUTF8Mbstring( $text, $originalCharset ) + * { + * return mb_convert_encoding( $text, "UTF-8", $originalCharset ); + * } + * } + * </code> + * + * Developers can choose to use the error suppression operator ('@') in front of + * the iconv() calls in the above examples, 
in order to ignore the notices thrown + * when processing broken text (issue #8369). + * * @package Mail - * @version //autogentag// - * @access private + * @version //autogen// */ class ezcMailCharsetConverter { /** - * Converts the $text with the charset $originalCharset to UTF-8 + * Callback function to use for character set conversion to UTF8. * + * @var callback + */ + private static $method = array( __CLASS__, 'convertToUTF8Iconv' ); + + /** + * Sets the callback function used for character set conversion to UTF-8. + * + * Call this method before doing mail parsing if you need a special way + * of converting the character set to UTF-8. + * + * @param callback $method + */ + public static function setConvertMethod( $method ) + { + self::$method = $method; + } + + /** + * Converts the $text with the charset $originalCharset to UTF-8. + * + * It calls the function specified by using the static method + * setConvertMethod(). By default it calls convertToUTF8Iconv() defined + * in this class. + * + * @param string $text + * @param string $originalCharset + * @return string + */ + public static function convertToUTF8( $text, $originalCharset ) + { + return call_user_func( self::$method, $text, $originalCharset ); + } + + /** + * Converts the $text with the charset $originalCharset to UTF-8. + * * In case $originalCharset is 'unknown-8bit' or 'x-user-defined' then * it is assumed to be 'latin1' (ISO-8859-1). 
* @@ -28,13 +117,13 @@ * @param string $originalCharset * @return string */ - public static function convertToUTF8( $text, $originalCharset ) + public static function convertToUTF8Iconv( $text, $originalCharset ) { if ( $originalCharset === 'unknown-8bit' || $originalCharset === 'x-user-defined' ) { $originalCharset = "latin1"; } - return @iconv( $originalCharset, 'utf-8', $text ); + return iconv( $originalCharset, 'utf-8', $text ); } } ?> Modified: trunk/Mail/tests/parser/parser_test.php =================================================================== --- trunk/Mail/tests/parser/parser_test.php 2007-04-25 14:54:30 UTC (rev 4920) +++ trunk/Mail/tests/parser/parser_test.php 2007-04-25 15:01:30 UTC (rev 4921) @@ -63,6 +63,44 @@ } +class myConverter +{ + public static function convertToUTF8Iconv( $text, $originalCharset ) + { + if ( $originalCharset === 'unknown-8bit' || $originalCharset === 'x-user-defined' ) + { + $originalCharset = "latin1"; + } + // '@' is to avoid notices on broken input - see issue #8369 + return @iconv( $originalCharset, 'utf-8', $text ); + } + + public static function convertToUTF8IconvIgnore( $text, $originalCharset ) + { + if ( $originalCharset === 'unknown-8bit' || $originalCharset === 'x-user-defined' ) + { + $originalCharset = "latin1"; + } + // '@' is to avoid notices on broken input - see issue #8369 + return @iconv( $originalCharset, 'utf-8//IGNORE', $text ); + } + + public static function convertToUTF8IconvTranslit( $text, $originalCharset ) + { + if ( $originalCharset === 'unknown-8bit' || $originalCharset === 'x-user-defined' ) + { + $originalCharset = "latin1"; + } + // '@' is to avoid notices on broken input - see issue #8369 + return @iconv( $originalCharset, 'utf-8//TRANSLIT', $text ); + } + + public static function convertToUTF8Mbstring( $text, $originalCharset ) + { + return mb_convert_encoding( $text, "UTF-8", $originalCharset ); + } +} + /** * @package Mail * @subpackage Tests @@ -1313,20 +1351,106 @@ public function 
testIconvCharsetConverterIconv1() { + ezcMailCharsetConverter::setConvertMethod( array( 'myConverter', 'convertToUTF8Iconv' ) ); $parser = new ezcMailParser(); $set = new SingleFileSet( 'various/test-broken-iconv-1' ); $mail = $parser->parseMail( $set ); $mail = $mail[0]; $this->assertEquals( 63, strlen( $mail->body->text ) ); + ezcMailCharsetConverter::setConvertMethod( array( 'ezcMailCharsetConverter', 'convertToUTF8Iconv' ) ); } public function testIconvCharsetConverterIconv2() { + ezcMailCharsetConverter::setConvertMethod( array( 'myConverter', 'convertToUTF8Iconv' ) ); $parser = new ezcMailParser(); $set = new SingleFileSet( 'various/test-broken-iconv-2' ); $mail = $parser->parseMail( $set ); $mail = $mail[0]; $this->assertEquals( 38, strlen( $mail->body->text ) ); + ezcMailCharsetConverter::setConvertMethod( array( 'ezcMailCharsetConverter', 'convertToUTF8Iconv' ) ); } + + public function testIconvCharsetConverterIconvIgnore1() + { + ezcMailCharsetConverter::setConvertMethod( array( 'myConverter', 'convertToUTF8IconvIgnore' ) ); + $parser = new ezcMailParser(); + $set = new SingleFileSet( 'various/test-broken-iconv-1' ); + $mail = $parser->parseMail( $set ); + $mail = $mail[0]; + $this->assertEquals( 450, strlen( $mail->body->text ) ); + ezcMailCharsetConverter::setConvertMethod( array( 'ezcMailCharsetConverter', 'convertToUTF8Iconv' ) ); + } + + public function testIconvCharsetConverterIconvIgnore2() + { + ezcMailCharsetConverter::setConvertMethod( array( 'myConverter', 'convertToUTF8IconvIgnore' ) ); + $parser = new ezcMailParser(); + $set = new SingleFileSet( 'various/test-broken-iconv-2' ); + $mail = $parser->parseMail( $set ); + $mail = $mail[0]; + $this->assertEquals( 97, strlen( $mail->body->text ) ); + ezcMailCharsetConverter::setConvertMethod( array( 'ezcMailCharsetConverter', 'convertToUTF8Iconv' ) ); + } + + + public function testIconvCharsetConverterIconvTranslit1() + { + ezcMailCharsetConverter::setConvertMethod( array( 'myConverter', 
'convertToUTF8IconvTranslit' ) ); + $parser = new ezcMailParser(); + $set = new SingleFileSet( 'various/test-broken-iconv-1' ); + $mail = $parser->parseMail( $set ); + $mail = $mail[0]; + $this->assertEquals( 63, strlen( $mail->body->text ) ); + ezcMailCharsetConverter::setConvertMethod( array( 'ezcMailCharsetConverter', 'convertToUTF8Iconv' ) ); + } + + public function testIconvCharsetConverterIconvTranslit2() + { + ezcMailCharsetConverter::setConvertMethod( array( 'myConverter', 'convertToUTF8IconvTranslit' ) ); + $parser = new ezcMailParser(); + $set = new SingleFileSet( 'various/test-broken-iconv-2' ); + $mail = $parser->parseMail( $set ); + $mail = $mail[0]; + $this->assertEquals( 38, strlen( $mail->body->text ) ); + ezcMailCharsetConverter::setConvertMethod( array( 'ezcMailCharsetConverter', 'convertToUTF8Iconv' ) ); + } + + public function testMbstringCharsetConverter1() + { + if ( ezcBaseFeatures::hasExtensionSupport( 'mbstring' ) ) + { + ezcMailCharsetConverter::setConvertMethod( array( 'myConverter', 'convertToUTF8Mbstring' ) ); + $parser = new ezcMailParser(); + $set = new SingleFileSet( 'various/test-broken-iconv-1' ); + $mail = $parser->parseMail( $set ); + $mail = $mail[0]; + $this->assertEquals( 468, strlen( $mail->body->text ) ); + ezcMailCharsetConverter::setConvertMethod( array( 'ezcMailCharsetConverter', 'convertToUTF8Iconv' ) ); + } + else + { + $this->markTestSkipped( "This test doesn't work without the mbstring extension. PHP must be compiled with --enable-mbstring." 
); + } + + } + + public function testMbstringCharsetConverter2() + { + if ( ezcBaseFeatures::hasExtensionSupport( 'mbstring' ) ) + { + ezcMailCharsetConverter::setConvertMethod( array( 'myConverter', 'convertToUTF8Mbstring' ) ); + $parser = new ezcMailParser(); + $set = new SingleFileSet( 'various/test-broken-iconv-2' ); + $mail = $parser->parseMail( $set ); + $mail = $mail[0]; + $this->assertEquals( 99, strlen( $mail->body->text ) ); + ezcMailCharsetConverter::setConvertMethod( array( 'ezcMailCharsetConverter', 'convertToUTF8Iconv' ) ); + } + else + { + $this->markTestSkipped( "This test doesn't work without the mbstring extension. PHP must be compiled with --enable-mbstring." ); + } + } } ?> -- svn-components mailing list svn-components@lists.ez.no http://lists.ez.no/mailman/listinfo/svn-components