moriyoshi                                Mon, 07 Dec 2009 15:41:43 +0000

Revision: http://svn.php.net/viewvc?view=revision&revision=291821

Log:
- Take surrogate pairs into account: treat 3-byte UTF-8 sequences that decode to the surrogate range (U+D800 to U+DFFF) as invalid.

Changed paths:
    U   php/php-src/branches/PHP_5_2/ext/standard/html.c
    U   php/php-src/branches/PHP_5_2/ext/standard/tests/strings/bug49785.phpt
    U   php/php-src/branches/PHP_5_3/ext/standard/html.c
    U   php/php-src/branches/PHP_5_3/ext/standard/tests/strings/bug49785.phpt
    U   php/php-src/trunk/ext/standard/html.c
    U   php/php-src/trunk/ext/standard/tests/strings/bug49785.phpt

Modified: php/php-src/branches/PHP_5_2/ext/standard/html.c
===================================================================
--- php/php-src/branches/PHP_5_2/ext/standard/html.c    2009-12-07 15:34:13 UTC (rev 291820)
+++ php/php-src/branches/PHP_5_2/ext/standard/html.c    2009-12-07 15:41:43 UTC (rev 291821)
@@ -566,6 +566,8 @@
                                        this_char = ((c & 0x0f) << 12) | ((str[pos + 1] & 0x3f) << 6) | (str[pos + 2] & 0x3f);
                                        if (this_char < 0x800) {
                                                MB_FAILURE(pos);
+                                       } else if (this_char >= 0xd800 && this_char <= 0xdfff) {
+                                               MB_FAILURE(pos);
                                        }
                                        MB_WRITE((unsigned char)c);
                                        MB_WRITE((unsigned char)str[pos + 1]);

Modified: php/php-src/branches/PHP_5_2/ext/standard/tests/strings/bug49785.phpt
===================================================================
--- php/php-src/branches/PHP_5_2/ext/standard/tests/strings/bug49785.phpt       2009-12-07 15:34:13 UTC (rev 291820)
+++ php/php-src/branches/PHP_5_2/ext/standard/tests/strings/bug49785.phpt       2009-12-07 15:41:43 UTC (rev 291821)
@@ -36,10 +36,14 @@
 var_dump(_bin2hex(htmlentities("\xf8\x88\x80\x80\x80", ENT_QUOTES, "UTF-8")));

 echo "--\n";
-// UTF-8: alternative (invalid) UTF-8 sequence
+// UTF-8: alternative (invalid) UTF-8 sequence / surrogate pairs
 var_dump(_bin2hex(htmlspecialchars("\xc0\xa6", ENT_QUOTES, 'UTF-8')));
 var_dump(_bin2hex(htmlspecialchars("\xe0\x80\xa6", ENT_QUOTES, 'UTF-8')));
 var_dump(_bin2hex(htmlspecialchars("\xf0\x80\x80\xa6", ENT_QUOTES, 'UTF-8')));
+var_dump(_bin2hex(htmlspecialchars("\xec\xbf\xbf", ENT_QUOTES, 'UTF-8')));
+var_dump(_bin2hex(htmlspecialchars("\xed\xa0\x80", ENT_QUOTES, 'UTF-8')));
+var_dump(_bin2hex(htmlspecialchars("\xed\xbf\xbf", ENT_QUOTES, 'UTF-8')));
+var_dump(_bin2hex(htmlspecialchars("\xee\x80\x80", ENT_QUOTES, 'UTF-8')));

 // Shift_JIS: non-lead byte >= 0x80
 var_dump(_bin2hex(htmlspecialchars("\x80", ENT_QUOTES, 'Shift_JIS')));
@@ -158,6 +162,10 @@
 string(0) ""
 string(0) ""
 string(0) ""
+string(6) "ecbfbf"
+string(0) ""
+string(0) ""
+string(6) "ee8080"
 string(2) "80"
 string(2) "a0"
 string(2) "a1"

Modified: php/php-src/branches/PHP_5_3/ext/standard/html.c
===================================================================
--- php/php-src/branches/PHP_5_3/ext/standard/html.c    2009-12-07 15:34:13 UTC (rev 291820)
+++ php/php-src/branches/PHP_5_3/ext/standard/html.c    2009-12-07 15:41:43 UTC (rev 291821)
@@ -565,6 +565,8 @@
                                        this_char = ((c & 0x0f) << 12) | ((str[pos + 1] & 0x3f) << 6) | (str[pos + 2] & 0x3f);
                                        if (this_char < 0x800) {
                                                MB_FAILURE(pos);
+                                       } else if (this_char >= 0xd800 && this_char <= 0xdfff) {
+                                               MB_FAILURE(pos);
                                        }
                                        MB_WRITE((unsigned char)c);
                                        MB_WRITE((unsigned char)str[pos + 1]);

Modified: php/php-src/branches/PHP_5_3/ext/standard/tests/strings/bug49785.phpt
===================================================================
--- php/php-src/branches/PHP_5_3/ext/standard/tests/strings/bug49785.phpt       2009-12-07 15:34:13 UTC (rev 291820)
+++ php/php-src/branches/PHP_5_3/ext/standard/tests/strings/bug49785.phpt       2009-12-07 15:41:43 UTC (rev 291821)
@@ -42,10 +42,14 @@
 var_dump(_bin2hex(htmlentities("\xf0\x80\x80\x80\xf0\x90\x80\x80", ENT_QUOTES | ENT_IGNORE, "UTF-8")));

 echo "--\n";
-// UTF-8: alternative (invalid) UTF-8 sequence
+// UTF-8: alternative (invalid) UTF-8 sequence / surrogate pairs
 var_dump(_bin2hex(htmlspecialchars("\xc0\xa6", ENT_QUOTES, 'UTF-8')));
 var_dump(_bin2hex(htmlspecialchars("\xe0\x80\xa6", ENT_QUOTES, 'UTF-8')));
 var_dump(_bin2hex(htmlspecialchars("\xf0\x80\x80\xa6", ENT_QUOTES, 'UTF-8')));
+var_dump(_bin2hex(htmlspecialchars("\xec\xbf\xbf", ENT_QUOTES, 'UTF-8')));
+var_dump(_bin2hex(htmlspecialchars("\xed\xa0\x80", ENT_QUOTES, 'UTF-8')));
+var_dump(_bin2hex(htmlspecialchars("\xed\xbf\xbf", ENT_QUOTES, 'UTF-8')));
+var_dump(_bin2hex(htmlspecialchars("\xee\x80\x80", ENT_QUOTES, 'UTF-8')));

 // Shift_JIS: non-lead byte >= 0x80
 var_dump(_bin2hex(htmlspecialchars("\x80", ENT_QUOTES, 'Shift_JIS')));
@@ -168,6 +172,10 @@
 string(0) ""
 string(0) ""
 string(0) ""
+string(6) "ecbfbf"
+string(0) ""
+string(0) ""
+string(6) "ee8080"
 string(2) "80"
 string(2) "a0"
 string(2) "a1"

Modified: php/php-src/trunk/ext/standard/html.c
===================================================================
--- php/php-src/trunk/ext/standard/html.c       2009-12-07 15:34:13 UTC (rev 291820)
+++ php/php-src/trunk/ext/standard/html.c       2009-12-07 15:41:43 UTC (rev 291821)
@@ -571,6 +571,8 @@
                                        this_char = ((c & 0x0f) << 12) | ((str[pos + 1] & 0x3f) << 6) | (str[pos + 2] & 0x3f);
                                        if (this_char < 0x800) {
                                                MB_FAILURE(pos);
+                                       } else if (this_char >= 0xd800 && this_char <= 0xdfff) {
+                                               MB_FAILURE(pos);
                                        }
                                        MB_WRITE((unsigned char)c);
                                        MB_WRITE((unsigned char)str[pos + 1]);

Modified: php/php-src/trunk/ext/standard/tests/strings/bug49785.phpt
===================================================================
--- php/php-src/trunk/ext/standard/tests/strings/bug49785.phpt  2009-12-07 15:34:13 UTC (rev 291820)
+++ php/php-src/trunk/ext/standard/tests/strings/bug49785.phpt  2009-12-07 15:41:43 UTC (rev 291821)
@@ -42,10 +42,14 @@
 var_dump(_bin2hex(htmlentities("\xf0\x80\x80\x80\xf0\x90\x80\x80", ENT_QUOTES | ENT_IGNORE, "UTF-8")));

 echo "--\n";
-// UTF-8: alternative (invalid) UTF-8 sequence
+// UTF-8: alternative (invalid) UTF-8 sequence / surrogate pairs
 var_dump(_bin2hex(htmlspecialchars("\xc0\xa6", ENT_QUOTES, 'UTF-8')));
 var_dump(_bin2hex(htmlspecialchars("\xe0\x80\xa6", ENT_QUOTES, 'UTF-8')));
 var_dump(_bin2hex(htmlspecialchars("\xf0\x80\x80\xa6", ENT_QUOTES, 'UTF-8')));
+var_dump(_bin2hex(htmlspecialchars("\xec\xbf\xbf", ENT_QUOTES, 'UTF-8')));
+var_dump(_bin2hex(htmlspecialchars("\xed\xa0\x80", ENT_QUOTES, 'UTF-8')));
+var_dump(_bin2hex(htmlspecialchars("\xed\xbf\xbf", ENT_QUOTES, 'UTF-8')));
+var_dump(_bin2hex(htmlspecialchars("\xee\x80\x80", ENT_QUOTES, 'UTF-8')));

 // Shift_JIS: non-lead byte >= 0x80
 var_dump(_bin2hex(htmlspecialchars("\x80", ENT_QUOTES, 'Shift_JIS')));
@@ -168,6 +172,10 @@
 string(0) ""
 string(0) ""
 string(0) ""
+string(6) "ecbfbf"
+string(0) ""
+string(0) ""
+string(6) "ee8080"
 string(2) "80"
 string(2) "a0"
 string(2) "a1"

-- 
PHP CVS Mailing List (http://www.php.net/)
To unsubscribe, visit: http://www.php.net/unsub.php

Reply via email to