cataphract Thu, 14 Oct 2010 19:14:06 +0000
Revision: http://svn.php.net/viewvc?view=revision&revision=304404
Log:
- Fixed get_next_char(), used by htmlentities/htmlspecialchars, accepting certain ill-formed UTF-8 sequences.

Changed paths:
    U php/php-src/branches/PHP_5_3/NEWS
    U php/php-src/branches/PHP_5_3/ext/standard/html.c
    U php/php-src/branches/PHP_5_3/ext/standard/tests/strings/htmlentities-utf-2.phpt
    A php/php-src/branches/PHP_5_3/ext/standard/tests/strings/htmlentities-utf-3.phpt
    U php/php-src/branches/PHP_5_3/ext/standard/tests/strings/htmlentities-utf.phpt
    U php/php-src/trunk/ext/standard/html.c
    U php/php-src/trunk/ext/standard/tests/strings/htmlentities-utf-2.phpt
    A php/php-src/trunk/ext/standard/tests/strings/htmlentities-utf-3.phpt
    U php/php-src/trunk/ext/standard/tests/strings/htmlentities-utf.phpt
Modified: php/php-src/branches/PHP_5_3/NEWS =================================================================== --- php/php-src/branches/PHP_5_3/NEWS 2010-10-14 15:13:37 UTC (rev 304403) +++ php/php-src/branches/PHP_5_3/NEWS 2010-10-14 19:14:06 UTC (rev 304404) @@ -33,6 +33,8 @@ and when there was data in the buffer before the emulation started. Also made more consistent its behavior -- should return failure every time less data than was requested was skipped. (Gustavo) +- Fixed htmlentities/htmlspecialchars accepting certain ill-formed UTF-8 + sequences. (Gustavo) - Fixed bug #53021 (In html_entity_decode, failure to convert numeric entities with ENT_NOQUOTES and ISO-8859-1). Fixed and extended the fix of ENT_NOQUOTES Modified: php/php-src/branches/PHP_5_3/ext/standard/html.c =================================================================== --- php/php-src/branches/PHP_5_3/ext/standard/html.c 2010-10-14 15:13:37 UTC (rev 304403) +++ php/php-src/branches/PHP_5_3/ext/standard/html.c 2010-10-14 19:14:06 UTC (rev 304404) @@ -540,7 +540,7 @@ MB_WRITE(c); this_char = c; pos++; - } else if (c < 0xc0) { + } else if (c < 0xc2) { MB_FAILURE(pos); } else if (c < 0xe0) { CHECK_LEN(pos, 2); @@ -572,7 +572,7 @@ MB_WRITE((unsigned char)str[pos + 1]); MB_WRITE((unsigned char)str[pos + 2]); pos += 3; - } else if (c < 0xf8) { + } else if (c < 0xf5) { CHECK_LEN(pos, 4); if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) { MB_FAILURE(pos); @@ -584,7 +584,7 @@ MB_FAILURE(pos); } this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f); - if (this_char < 0x10000) { + if (this_char < 0x10000 || this_char > 0x10FFFF) { MB_FAILURE(pos); } MB_WRITE((unsigned char)c); Modified: php/php-src/branches/PHP_5_3/ext/standard/tests/strings/htmlentities-utf-2.phpt =================================================================== --- php/php-src/branches/PHP_5_3/ext/standard/tests/strings/htmlentities-utf-2.phpt 2010-10-14 15:13:37 UTC 
(rev 304403) +++ php/php-src/branches/PHP_5_3/ext/standard/tests/strings/htmlentities-utf-2.phpt 2010-10-14 19:14:06 UTC (rev 304404) @@ -50,12 +50,12 @@ %unicode|string%(16) "266561637574653b" %unicode|string%(2) "79" %unicode|string%(2) "79" -%unicode|string%(8) "f7bfbfbf" -%unicode|string%(8) "f7bfbfbf" %unicode|string%(0) "" %unicode|string%(0) "" %unicode|string%(0) "" %unicode|string%(0) "" +%unicode|string%(0) "" +%unicode|string%(0) "" %unicode|string%(4) "4142" %unicode|string%(4) "4142" %unicode|string%(4) "4242" Added: php/php-src/branches/PHP_5_3/ext/standard/tests/strings/htmlentities-utf-3.phpt =================================================================== --- php/php-src/branches/PHP_5_3/ext/standard/tests/strings/htmlentities-utf-3.phpt (rev 0) +++ php/php-src/branches/PHP_5_3/ext/standard/tests/strings/htmlentities-utf-3.phpt 2010-10-14 19:14:06 UTC (rev 304404) @@ -0,0 +1,83 @@ +--TEST-- +Test get_next_char(), used by htmlentities()/htmlspecialchars(): validity of UTF-8 sequences +--FILE-- +<?php + +/* conformance to Unicode 5.2, section 3.9, D92 */ + +$val_ranges = array( + array(array(0x00, 0x7F)), + array(array(0xC2, 0xDF), array(0x80, 0xBF)), + array(array(0xE0, 0xE0), array(0xA0, 0xBF), array(0x80, 0xBF)), + array(array(0xE1, 0xEC), array(0x80, 0xBF), array(0x80, 0xBF)), + array(array(0xED, 0xED), array(0x80, 0x9F), array(0x80, 0xBF)), + array(array(0xEE, 0xEF), array(0x80, 0xBF), array(0x80, 0xBF)), + array(array(0xF0, 0xF0), array(0x90, 0xBF), array(0x80, 0xBF), array(0x80, 0xBF)), + array(array(0xF1, 0xF3), array(0x80, 0xBF), array(0x80, 0xBF), array(0x80, 0xBF)), + array(array(0xF4, 0xF4), array(0x80, 0x8F), array(0x80, 0xBF), array(0x80, 0xBF)), +); + +function is_valid($seq) { + global $val_ranges; + $b = ord($seq[0]); + foreach ($val_ranges as $l) { + if ($b >= $l[0][0] && $b <= $l[0][1]) { + if (count($l) != strlen($seq)) { + return false; + } + for ($n = 1; $n < strlen($seq); $n++) { + if (ord($seq[$n]) < $l[$n][0] || 
ord($seq[$n]) > $l[$n][1]) { + return false; + } + } + return true; + } + } + return false; +} + +function concordance($s) { + $vhe = strlen(htmlspecialchars($s, ENT_QUOTES, "UTF-8")) > 0; + $v = is_valid($s); + return ($vhe === $v); +} + +for ($b1 = 0xC0; $b1 < 0xE0; $b1++) { + for ($b2 = 0x80; $b2 < 0xBF; $b2++) { + $s = chr($b1).chr($b2); + if (!concordance($s)) + echo "Discordance for ".bin2hex($s),"\n"; + } +} + + +for ($b1 = 0xE0; $b1 < 0xEF; $b1++) { + for ($b2 = 0x80; $b2 < 0xBF; $b2++) { + $s = chr($b1).chr($b2)."\x80"; + if (!concordance($s)) + echo "Discordance for ".bin2hex($s),"\n"; + $s = chr($b1).chr($b2)."\xBF"; + if (!concordance($s)) + echo "Discordance for ".bin2hex($s),"\n"; + } +} + +for ($b1 = 0xF0; $b1 < 0xFF; $b1++) { + for ($b2 = 0x80; $b2 < 0xBF; $b2++) { + $s = chr($b1).chr($b2)."\x80\x80"; + if (!concordance($s)) + echo "Discordance for ".bin2hex($s),"\n"; + $s = chr($b1).chr($b2)."\xBF\x80"; + if (!concordance($s)) + echo "Discordance for ".bin2hex($s),"\n"; + $s = chr($b1).chr($b2)."\x80\xBF"; + if (!concordance($s)) + echo "Discordance for ".bin2hex($s),"\n"; + $s = chr($b1).chr($b2)."\xBF\xBF"; + if (!concordance($s)) + echo "Discordance for ".bin2hex($s),"\n"; + } +} +echo "Done.\n"; +--EXPECT-- +Done. 
Modified: php/php-src/branches/PHP_5_3/ext/standard/tests/strings/htmlentities-utf.phpt =================================================================== --- php/php-src/branches/PHP_5_3/ext/standard/tests/strings/htmlentities-utf.phpt 2010-10-14 15:13:37 UTC (rev 304403) +++ php/php-src/branches/PHP_5_3/ext/standard/tests/strings/htmlentities-utf.phpt 2010-10-14 19:14:06 UTC (rev 304404) @@ -50,8 +50,6 @@ %unicode|string%(16) "266561637574653b" %unicode|string%(0) "" %unicode|string%(0) "" -%unicode|string%(8) "f7bfbfbf" -%unicode|string%(8) "f7bfbfbf" %unicode|string%(0) "" %unicode|string%(0) "" %unicode|string%(0) "" @@ -68,3 +66,5 @@ %unicode|string%(0) "" %unicode|string%(0) "" %unicode|string%(0) "" +%unicode|string%(0) "" +%unicode|string%(0) "" Modified: php/php-src/trunk/ext/standard/html.c =================================================================== --- php/php-src/trunk/ext/standard/html.c 2010-10-14 15:13:37 UTC (rev 304403) +++ php/php-src/trunk/ext/standard/html.c 2010-10-14 19:14:06 UTC (rev 304404) @@ -129,7 +129,7 @@ MB_WRITE(c); this_char = c; pos++; - } else if (c < 0xc0) { + } else if (c < 0xc2) { MB_FAILURE(pos); } else if (c < 0xe0) { CHECK_LEN(pos, 2); @@ -161,7 +161,7 @@ MB_WRITE((unsigned char)str[pos + 1]); MB_WRITE((unsigned char)str[pos + 2]); pos += 3; - } else if (c < 0xf8) { + } else if (c < 0xf5) { CHECK_LEN(pos, 4); if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) { MB_FAILURE(pos); @@ -173,7 +173,7 @@ MB_FAILURE(pos); } this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f); - if (this_char < 0x10000) { + if (this_char < 0x10000 || this_char > 0x10FFFF) { MB_FAILURE(pos); } MB_WRITE((unsigned char)c); Modified: php/php-src/trunk/ext/standard/tests/strings/htmlentities-utf-2.phpt =================================================================== --- php/php-src/trunk/ext/standard/tests/strings/htmlentities-utf-2.phpt 2010-10-14 15:13:37 UTC (rev 304403) +++ 
php/php-src/trunk/ext/standard/tests/strings/htmlentities-utf-2.phpt 2010-10-14 19:14:06 UTC (rev 304404) @@ -50,12 +50,12 @@ %unicode|string%(16) "266561637574653b" %unicode|string%(2) "79" %unicode|string%(2) "79" -%unicode|string%(8) "f7bfbfbf" -%unicode|string%(8) "f7bfbfbf" %unicode|string%(0) "" %unicode|string%(0) "" %unicode|string%(0) "" %unicode|string%(0) "" +%unicode|string%(0) "" +%unicode|string%(0) "" %unicode|string%(4) "4142" %unicode|string%(4) "4142" %unicode|string%(4) "4242" Added: php/php-src/trunk/ext/standard/tests/strings/htmlentities-utf-3.phpt =================================================================== --- php/php-src/trunk/ext/standard/tests/strings/htmlentities-utf-3.phpt (rev 0) +++ php/php-src/trunk/ext/standard/tests/strings/htmlentities-utf-3.phpt 2010-10-14 19:14:06 UTC (rev 304404) @@ -0,0 +1,83 @@ +--TEST-- +Test get_next_char(), used by htmlentities()/htmlspecialchars(): validity of UTF-8 sequences +--FILE-- +<?php + +/* conformance to Unicode 5.2, section 3.9, D92 */ + +$val_ranges = array( + array(array(0x00, 0x7F)), + array(array(0xC2, 0xDF), array(0x80, 0xBF)), + array(array(0xE0, 0xE0), array(0xA0, 0xBF), array(0x80, 0xBF)), + array(array(0xE1, 0xEC), array(0x80, 0xBF), array(0x80, 0xBF)), + array(array(0xED, 0xED), array(0x80, 0x9F), array(0x80, 0xBF)), + array(array(0xEE, 0xEF), array(0x80, 0xBF), array(0x80, 0xBF)), + array(array(0xF0, 0xF0), array(0x90, 0xBF), array(0x80, 0xBF), array(0x80, 0xBF)), + array(array(0xF1, 0xF3), array(0x80, 0xBF), array(0x80, 0xBF), array(0x80, 0xBF)), + array(array(0xF4, 0xF4), array(0x80, 0x8F), array(0x80, 0xBF), array(0x80, 0xBF)), +); + +function is_valid($seq) { + global $val_ranges; + $b = ord($seq[0]); + foreach ($val_ranges as $l) { + if ($b >= $l[0][0] && $b <= $l[0][1]) { + if (count($l) != strlen($seq)) { + return false; + } + for ($n = 1; $n < strlen($seq); $n++) { + if (ord($seq[$n]) < $l[$n][0] || ord($seq[$n]) > $l[$n][1]) { + return false; + } + } + return true; + } 
+ } + return false; +} + +function concordance($s) { + $vhe = strlen(htmlspecialchars($s, ENT_QUOTES, "UTF-8")) > 0; + $v = is_valid($s); + return ($vhe === $v); +} + +for ($b1 = 0xC0; $b1 < 0xE0; $b1++) { + for ($b2 = 0x80; $b2 < 0xBF; $b2++) { + $s = chr($b1).chr($b2); + if (!concordance($s)) + echo "Discordance for ".bin2hex($s),"\n"; + } +} + + +for ($b1 = 0xE0; $b1 < 0xEF; $b1++) { + for ($b2 = 0x80; $b2 < 0xBF; $b2++) { + $s = chr($b1).chr($b2)."\x80"; + if (!concordance($s)) + echo "Discordance for ".bin2hex($s),"\n"; + $s = chr($b1).chr($b2)."\xBF"; + if (!concordance($s)) + echo "Discordance for ".bin2hex($s),"\n"; + } +} + +for ($b1 = 0xF0; $b1 < 0xFF; $b1++) { + for ($b2 = 0x80; $b2 < 0xBF; $b2++) { + $s = chr($b1).chr($b2)."\x80\x80"; + if (!concordance($s)) + echo "Discordance for ".bin2hex($s),"\n"; + $s = chr($b1).chr($b2)."\xBF\x80"; + if (!concordance($s)) + echo "Discordance for ".bin2hex($s),"\n"; + $s = chr($b1).chr($b2)."\x80\xBF"; + if (!concordance($s)) + echo "Discordance for ".bin2hex($s),"\n"; + $s = chr($b1).chr($b2)."\xBF\xBF"; + if (!concordance($s)) + echo "Discordance for ".bin2hex($s),"\n"; + } +} +echo "Done.\n"; +--EXPECT-- +Done. Modified: php/php-src/trunk/ext/standard/tests/strings/htmlentities-utf.phpt =================================================================== --- php/php-src/trunk/ext/standard/tests/strings/htmlentities-utf.phpt 2010-10-14 15:13:37 UTC (rev 304403) +++ php/php-src/trunk/ext/standard/tests/strings/htmlentities-utf.phpt 2010-10-14 19:14:06 UTC (rev 304404) @@ -50,8 +50,6 @@ %unicode|string%(16) "266561637574653b" %unicode|string%(0) "" %unicode|string%(0) "" -%unicode|string%(8) "f7bfbfbf" -%unicode|string%(8) "f7bfbfbf" %unicode|string%(0) "" %unicode|string%(0) "" %unicode|string%(0) "" @@ -68,3 +66,5 @@ %unicode|string%(0) "" %unicode|string%(0) "" %unicode|string%(0) "" +%unicode|string%(0) "" +%unicode|string%(0) ""
--
PHP CVS Mailing List (http://www.php.net/)
To unsubscribe, visit: http://www.php.net/unsub.php