Author: Alexandru Stanoi
Date: 2007-04-25 17:01:30 +0200 (Wed, 25 Apr 2007)
New Revision: 4921

Log:
- Implemented feature request #9785: Allow developers to specify their own 
  character conversion function to UTF-8. Also fixed issue #8369 as developers
  can ignore the notices thrown by iconv in their own conversion function.

Modified:
   trunk/Mail/ChangeLog
   trunk/Mail/src/internal/charset_convert.php
   trunk/Mail/tests/parser/parser_test.php

Modified: trunk/Mail/ChangeLog
===================================================================
--- trunk/Mail/ChangeLog        2007-04-25 14:54:30 UTC (rev 4920)
+++ trunk/Mail/ChangeLog        2007-04-25 15:01:30 UTC (rev 4921)
@@ -32,6 +32,9 @@
   Documentation enhancement
 - Fixed issue #10656: Parsing of incomplete multipart/related mails does not
   trigger a notice anymore.    
+- Implemented feature request #9785: Allow developers to specify their own 
+  character conversion function to UTF-8. Also fixed issue #8369 as developers
+  can ignore the notices thrown by iconv in their own conversion function.
 
 
 1.2.1 - [RELEASEDATE]
@@ -58,8 +61,6 @@
   ezcMailFileParser.
 - Fixed issue #10396: Method convertToUTF8 assumes 'latin1' charset instead of
   'unknown-8bit' and 'x-user-defined'.
-- Fixed issue #8369: Ignoring notices thrown by iconv() in the convertToUTF8()
-  method.
 
 
 1.2 - Monday 18 December 2006

Modified: trunk/Mail/src/internal/charset_convert.php
===================================================================
--- trunk/Mail/src/internal/charset_convert.php 2007-04-25 14:54:30 UTC (rev 4920)
+++ trunk/Mail/src/internal/charset_convert.php 2007-04-25 15:01:30 UTC (rev 4921)
@@ -1,26 +1,115 @@
 <?php
 /**
- * File containing the ezcMailCharsetConverter
+ * File containing the ezcMailCharsetConverter class.
  *
  * @package Mail
- * @version //autogentag//
- * @copyright Copyright (C) 2005, 2006 eZ systems as. All rights reserved.
+ * @version //autogen//
+ * @copyright Copyright (C) 2005-2007 eZ systems as. All rights reserved.
  * @license http://ez.no/licenses/new_bsd New BSD License
- * @access private
  */
 
 /**
- * Small internal class for common character set conversion methods inside Mail.
+ * Class containing common character set conversion methods.
  *
+ * By calling the static function ezcMailCharsetConverter::setConvertMethod()
+ * before doing mail parsing, another callback function can be used for
+ * character conversion to UTF-8 in place of the normal iconv() conversion.
+ *
+ * The callback function must have this signature:
+ * <code>
+ * public static function function_name( $text, $originalCharset );
+ * </code>
+ *
+ * where:
+ *  - $text = string to convert to UTF-8
+ *  - $originalCharset = in what charset is $text
+ *
+ * Example:
+ * <code>
+ * // specify another function for character set conversion
+ * ezcMailCharsetConverter::setConvertMethod( array( 'myConverter', 'convertToUTF8IconvIgnore' ) );
+ *
+ * // ...code for mail parsing...
+ * </code>
+ *
+ * where myConverter is (along with some other examples of charset conversion
+ * functions which can be used):
+ * <code>
+ * class myConverter
+ * {
+ *     public static function convertToUTF8IconvIgnore( $text, $originalCharset )
+ *     {
+ *         if ( $originalCharset === 'unknown-8bit' || $originalCharset === 'x-user-defined' )
+ *         {
+ *             $originalCharset = "latin1";
+ *         }
+ *         return iconv( $originalCharset, 'utf-8//IGNORE', $text );
+ *     }
+ *
+ *     public static function convertToUTF8IconvTranslit( $text, $originalCharset )
+ *     {
+ *         if ( $originalCharset === 'unknown-8bit' || $originalCharset === 'x-user-defined' )
+ *         {
+ *             $originalCharset = "latin1";
+ *         }
+ *         return iconv( $originalCharset, 'utf-8//TRANSLIT', $text );
+ *     }
+ *
+ *     public static function convertToUTF8Mbstring( $text, $originalCharset )
+ *     {
+ *         return mb_convert_encoding( $text, "UTF-8", $originalCharset );
+ *     }
+ * }
+ * </code>
+ *
+ * Developers can choose to use the error suppression operator ('@') in front of
+ * the iconv() calls in the above examples, in order to ignore the notices thrown
+ * when processing broken text (issue #8369).
+ *
  * @package Mail
- * @version //autogentag//
- * @access private
+ * @version //autogen//
  */
 class ezcMailCharsetConverter
 {
     /**
-     * Converts the $text with the charset $originalCharset to UTF-8
+     * Callback function to use for character set conversion to UTF-8.
      *
+     * @var callback
+     */
+    private static $method = array( __CLASS__, 'convertToUTF8Iconv' );
+
+    /**
+     * Sets the callback function used for character set conversion to UTF-8.
+     *
+     * Call this method before doing mail parsing if you need a special way
+     * of converting the character set to UTF-8.
+     *
+     * @param callback $method
+     */
+    public static function setConvertMethod( $method )
+    {
+        self::$method = $method;
+    }
+
+    /**
+     * Converts the $text with the charset $originalCharset to UTF-8.
+     *
+     * It calls the function specified by using the static method
+     * setConvertMethod(). By default it calls convertToUTF8Iconv() defined
+     * in this class.
+     *
+     * @param string $text
+     * @param string $originalCharset
+     * @return string
+     */
+    public static function convertToUTF8( $text, $originalCharset )
+    {
+        return call_user_func( self::$method, $text, $originalCharset );
+    }
+
+    /**
+     * Converts the $text with the charset $originalCharset to UTF-8.
+     *
      * In case $originalCharset is 'unknown-8bit' or 'x-user-defined' then
      * it is assumed to be 'latin1' (ISO-8859-1).
      *
@@ -28,13 +117,13 @@
      * @param string $originalCharset
      * @return string
      */
-    public static function convertToUTF8( $text, $originalCharset )
+    public static function convertToUTF8Iconv( $text, $originalCharset )
     {
         if ( $originalCharset === 'unknown-8bit' || $originalCharset === 'x-user-defined' )
         {
             $originalCharset = "latin1";
         }
-        return @iconv( $originalCharset, 'utf-8', $text );
+        return iconv( $originalCharset, 'utf-8', $text );
     }
 }
 ?>

Modified: trunk/Mail/tests/parser/parser_test.php
===================================================================
--- trunk/Mail/tests/parser/parser_test.php     2007-04-25 14:54:30 UTC (rev 4920)
+++ trunk/Mail/tests/parser/parser_test.php     2007-04-25 15:01:30 UTC (rev 4921)
@@ -63,6 +63,44 @@
 
 }
 
+class myConverter
+{
+    public static function convertToUTF8Iconv( $text, $originalCharset )
+    {
+        if ( $originalCharset === 'unknown-8bit' || $originalCharset === 'x-user-defined' )
+        {
+            $originalCharset = "latin1";
+        }
+        // '@' is to avoid notices on broken input - see issue #8369
+        return @iconv( $originalCharset, 'utf-8', $text );
+    }
+
+    public static function convertToUTF8IconvIgnore( $text, $originalCharset )
+    {
+        if ( $originalCharset === 'unknown-8bit' || $originalCharset === 'x-user-defined' )
+        {
+            $originalCharset = "latin1";
+        }
+        // '@' is to avoid notices on broken input - see issue #8369
+        return @iconv( $originalCharset, 'utf-8//IGNORE', $text );
+    }
+
+    public static function convertToUTF8IconvTranslit( $text, $originalCharset )
+    {
+        if ( $originalCharset === 'unknown-8bit' || $originalCharset === 'x-user-defined' )
+        {
+            $originalCharset = "latin1";
+        }
+        // '@' is to avoid notices on broken input - see issue #8369
+        return @iconv( $originalCharset, 'utf-8//TRANSLIT', $text );
+    }
+
+    public static function convertToUTF8Mbstring( $text, $originalCharset )
+    {
+        return mb_convert_encoding( $text, "UTF-8", $originalCharset );
+    }
+}
+
 /**
  * @package Mail
  * @subpackage Tests
@@ -1313,20 +1351,106 @@
 
     public function testIconvCharsetConverterIconv1()
     {
+        ezcMailCharsetConverter::setConvertMethod( array( 'myConverter', 'convertToUTF8Iconv' ) );
         $parser = new ezcMailParser();
         $set = new SingleFileSet( 'various/test-broken-iconv-1' );
         $mail = $parser->parseMail( $set );
         $mail = $mail[0];
         $this->assertEquals( 63, strlen( $mail->body->text ) );
+        ezcMailCharsetConverter::setConvertMethod( array( 'ezcMailCharsetConverter', 'convertToUTF8Iconv' ) );
     }
 
     public function testIconvCharsetConverterIconv2()
     {
+        ezcMailCharsetConverter::setConvertMethod( array( 'myConverter', 'convertToUTF8Iconv' ) );
         $parser = new ezcMailParser();
         $set = new SingleFileSet( 'various/test-broken-iconv-2' );
         $mail = $parser->parseMail( $set );
         $mail = $mail[0];
         $this->assertEquals( 38, strlen( $mail->body->text ) );
+        ezcMailCharsetConverter::setConvertMethod( array( 'ezcMailCharsetConverter', 'convertToUTF8Iconv' ) );
     }
+
+    public function testIconvCharsetConverterIconvIgnore1()
+    {
+        ezcMailCharsetConverter::setConvertMethod( array( 'myConverter', 'convertToUTF8IconvIgnore' ) );
+        $parser = new ezcMailParser();
+        $set = new SingleFileSet( 'various/test-broken-iconv-1' );
+        $mail = $parser->parseMail( $set );
+        $mail = $mail[0];
+        $this->assertEquals( 450, strlen( $mail->body->text ) );
+        ezcMailCharsetConverter::setConvertMethod( array( 'ezcMailCharsetConverter', 'convertToUTF8Iconv' ) );
+    }
+
+    public function testIconvCharsetConverterIconvIgnore2()
+    {
+        ezcMailCharsetConverter::setConvertMethod( array( 'myConverter', 'convertToUTF8IconvIgnore' ) );
+        $parser = new ezcMailParser();
+        $set = new SingleFileSet( 'various/test-broken-iconv-2' );
+        $mail = $parser->parseMail( $set );
+        $mail = $mail[0];
+        $this->assertEquals( 97, strlen( $mail->body->text ) );
+        ezcMailCharsetConverter::setConvertMethod( array( 'ezcMailCharsetConverter', 'convertToUTF8Iconv' ) );
+    }
+    
+
+    public function testIconvCharsetConverterIconvTranslit1()
+    {
+        ezcMailCharsetConverter::setConvertMethod( array( 'myConverter', 'convertToUTF8IconvTranslit' ) );
+        $parser = new ezcMailParser();
+        $set = new SingleFileSet( 'various/test-broken-iconv-1' );
+        $mail = $parser->parseMail( $set );
+        $mail = $mail[0];
+        $this->assertEquals( 63, strlen( $mail->body->text ) );
+        ezcMailCharsetConverter::setConvertMethod( array( 'ezcMailCharsetConverter', 'convertToUTF8Iconv' ) );
+    }
+
+    public function testIconvCharsetConverterIconvTranslit2()
+    {
+        ezcMailCharsetConverter::setConvertMethod( array( 'myConverter', 'convertToUTF8IconvTranslit' ) );
+        $parser = new ezcMailParser();
+        $set = new SingleFileSet( 'various/test-broken-iconv-2' );
+        $mail = $parser->parseMail( $set );
+        $mail = $mail[0];
+        $this->assertEquals( 38, strlen( $mail->body->text ) );
+        ezcMailCharsetConverter::setConvertMethod( array( 'ezcMailCharsetConverter', 'convertToUTF8Iconv' ) );
+    }
+
+    public function testMbstringCharsetConverter1()
+    {
+        if ( ezcBaseFeatures::hasExtensionSupport( 'mbstring' ) )
+        {
+            ezcMailCharsetConverter::setConvertMethod( array( 'myConverter', 'convertToUTF8Mbstring' ) );
+            $parser = new ezcMailParser();
+            $set = new SingleFileSet( 'various/test-broken-iconv-1' );
+            $mail = $parser->parseMail( $set );
+            $mail = $mail[0];
+            $this->assertEquals( 468, strlen( $mail->body->text ) );
+            ezcMailCharsetConverter::setConvertMethod( array( 'ezcMailCharsetConverter', 'convertToUTF8Iconv' ) );
+        }
+        else
+        {
+            $this->markTestSkipped( "This test doesn't work without the mbstring extension. PHP must be compiled with --enable-mbstring." );
+        }
+
+    }
+
+    public function testMbstringCharsetConverter2()
+    {
+        if ( ezcBaseFeatures::hasExtensionSupport( 'mbstring' ) )
+        {
+            ezcMailCharsetConverter::setConvertMethod( array( 'myConverter', 'convertToUTF8Mbstring' ) );
+            $parser = new ezcMailParser();
+            $set = new SingleFileSet( 'various/test-broken-iconv-2' );
+            $mail = $parser->parseMail( $set );
+            $mail = $mail[0];
+            $this->assertEquals( 99, strlen( $mail->body->text ) );
+            ezcMailCharsetConverter::setConvertMethod( array( 'ezcMailCharsetConverter', 'convertToUTF8Iconv' ) );
+        }
+        else
+        {
+            $this->markTestSkipped( "This test doesn't work without the mbstring extension. PHP must be compiled with --enable-mbstring." );
+        }
+    }
 }
 ?>

-- 
svn-components mailing list
svn-components@lists.ez.no
http://lists.ez.no/mailman/listinfo/svn-components

Reply via email to