cataphract                               Thu, 14 Oct 2010 19:14:06 +0000

Revision: http://svn.php.net/viewvc?view=revision&revision=304404

Log:
- Fixed get_next_char(), used by htmlentities/htmlspecialchars, accepting
  certain ill-formed UTF-8 sequences.

Changed paths:
    U   php/php-src/branches/PHP_5_3/NEWS
    U   php/php-src/branches/PHP_5_3/ext/standard/html.c
    U   php/php-src/branches/PHP_5_3/ext/standard/tests/strings/htmlentities-utf-2.phpt
    A   php/php-src/branches/PHP_5_3/ext/standard/tests/strings/htmlentities-utf-3.phpt
    U   php/php-src/branches/PHP_5_3/ext/standard/tests/strings/htmlentities-utf.phpt
    U   php/php-src/trunk/ext/standard/html.c
    U   php/php-src/trunk/ext/standard/tests/strings/htmlentities-utf-2.phpt
    A   php/php-src/trunk/ext/standard/tests/strings/htmlentities-utf-3.phpt
    U   php/php-src/trunk/ext/standard/tests/strings/htmlentities-utf.phpt

Modified: php/php-src/branches/PHP_5_3/NEWS
===================================================================
--- php/php-src/branches/PHP_5_3/NEWS	2010-10-14 15:13:37 UTC (rev 304403)
+++ php/php-src/branches/PHP_5_3/NEWS	2010-10-14 19:14:06 UTC (rev 304404)
@@ -33,6 +33,8 @@
   and when there was data in the buffer before the emulation started. Also made
   more consistent its behavior -- should return failure every time less data
   than was requested was skipped. (Gustavo)
+- Fixed htmlentities/htmlspecialchars accepting certain ill-formed UTF-8
+  sequences. (Gustavo)

 - Fixed bug #53021 (In html_entity_decode, failure to convert numeric entities
   with ENT_NOQUOTES and ISO-8859-1). Fixed and extended the fix of ENT_NOQUOTES

Modified: php/php-src/branches/PHP_5_3/ext/standard/html.c
===================================================================
--- php/php-src/branches/PHP_5_3/ext/standard/html.c	2010-10-14 15:13:37 UTC (rev 304403)
+++ php/php-src/branches/PHP_5_3/ext/standard/html.c	2010-10-14 19:14:06 UTC (rev 304404)
@@ -540,7 +540,7 @@
 					MB_WRITE(c);
 					this_char = c;
 					pos++;
-				} else if (c < 0xc0) {
+				} else if (c < 0xc2) {
 					MB_FAILURE(pos);
 				} else if (c < 0xe0) {
 					CHECK_LEN(pos, 2);
@@ -572,7 +572,7 @@
 					MB_WRITE((unsigned char)str[pos + 1]);
 					MB_WRITE((unsigned char)str[pos + 2]);
 					pos += 3;
-				} else if (c < 0xf8) {
+				} else if (c < 0xf5) {
 					CHECK_LEN(pos, 4);
 					if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
 						MB_FAILURE(pos);
@@ -584,7 +584,7 @@
 						MB_FAILURE(pos);
 					}
 					this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f);
-					if (this_char < 0x10000) {
+					if (this_char < 0x10000 || this_char > 0x10FFFF) {
 						MB_FAILURE(pos);
 					}
 					MB_WRITE((unsigned char)c);

Modified: php/php-src/branches/PHP_5_3/ext/standard/tests/strings/htmlentities-utf-2.phpt
===================================================================
--- php/php-src/branches/PHP_5_3/ext/standard/tests/strings/htmlentities-utf-2.phpt	2010-10-14 15:13:37 UTC (rev 304403)
+++ php/php-src/branches/PHP_5_3/ext/standard/tests/strings/htmlentities-utf-2.phpt	2010-10-14 19:14:06 UTC (rev 304404)
@@ -50,12 +50,12 @@
 %unicode|string%(16) "266561637574653b"
 %unicode|string%(2) "79"
 %unicode|string%(2) "79"
-%unicode|string%(8) "f7bfbfbf"
-%unicode|string%(8) "f7bfbfbf"
 %unicode|string%(0) ""
 %unicode|string%(0) ""
 %unicode|string%(0) ""
 %unicode|string%(0) ""
+%unicode|string%(0) ""
+%unicode|string%(0) ""
 %unicode|string%(4) "4142"
 %unicode|string%(4) "4142"
 %unicode|string%(4) "4242"

Added: php/php-src/branches/PHP_5_3/ext/standard/tests/strings/htmlentities-utf-3.phpt
===================================================================
--- php/php-src/branches/PHP_5_3/ext/standard/tests/strings/htmlentities-utf-3.phpt	                        (rev 0)
+++ php/php-src/branches/PHP_5_3/ext/standard/tests/strings/htmlentities-utf-3.phpt	2010-10-14 19:14:06 UTC (rev 304404)
@@ -0,0 +1,83 @@
+--TEST--
+Test get_next_char(), used by htmlentities()/htmlspecialchars(): validity of UTF-8 sequences
+--FILE--
+<?php
+
+/* conformance to Unicode 5.2, section 3.9, D92 */
+
+$val_ranges = array(
+	array(array(0x00, 0x7F)),
+	array(array(0xC2, 0xDF), array(0x80, 0xBF)),
+	array(array(0xE0, 0xE0), array(0xA0, 0xBF), array(0x80, 0xBF)),
+	array(array(0xE1, 0xEC), array(0x80, 0xBF), array(0x80, 0xBF)),
+	array(array(0xED, 0xED), array(0x80, 0x9F), array(0x80, 0xBF)),
+	array(array(0xEE, 0xEF), array(0x80, 0xBF), array(0x80, 0xBF)),
+	array(array(0xF0, 0xF0), array(0x90, 0xBF), array(0x80, 0xBF), array(0x80, 0xBF)),
+	array(array(0xF1, 0xF3), array(0x80, 0xBF), array(0x80, 0xBF), array(0x80, 0xBF)),
+	array(array(0xF4, 0xF4), array(0x80, 0x8F), array(0x80, 0xBF), array(0x80, 0xBF)),
+);
+
+function is_valid($seq) {
+	global $val_ranges;
+	$b = ord($seq[0]);
+	foreach ($val_ranges as $l) {
+		if ($b >= $l[0][0] && $b <= $l[0][1]) {
+			if (count($l) != strlen($seq)) {
+				return false;
+			}
+			for ($n = 1; $n < strlen($seq); $n++) {
+				if (ord($seq[$n]) < $l[$n][0] || ord($seq[$n]) > $l[$n][1]) {
+					return false;
+				}
+			}
+			return true;
+		}
+	}
+	return false;
+}
+
+function concordance($s) {
+	$vhe = strlen(htmlspecialchars($s, ENT_QUOTES, "UTF-8")) > 0;
+	$v = is_valid($s);
+	return ($vhe === $v);
+}
+
+for ($b1 = 0xC0; $b1 < 0xE0; $b1++) {
+	for ($b2 = 0x80; $b2 < 0xBF; $b2++) {
+		$s = chr($b1).chr($b2);
+		if (!concordance($s))
+			echo "Discordance for ".bin2hex($s),"\n";
+	}
+}
+
+
+for ($b1 = 0xE0; $b1 < 0xEF; $b1++) {
+	for ($b2 = 0x80; $b2 < 0xBF; $b2++) {
+		$s = chr($b1).chr($b2)."\x80";
+		if (!concordance($s))
+			echo "Discordance for ".bin2hex($s),"\n";
+		$s = chr($b1).chr($b2)."\xBF";
+		if (!concordance($s))
+			echo "Discordance for ".bin2hex($s),"\n";
+	}
+}
+
+for ($b1 = 0xF0; $b1 < 0xFF; $b1++) {
+	for ($b2 = 0x80; $b2 < 0xBF; $b2++) {
+		$s = chr($b1).chr($b2)."\x80\x80";
+		if (!concordance($s))
+			echo "Discordance for ".bin2hex($s),"\n";
+		$s = chr($b1).chr($b2)."\xBF\x80";
+		if (!concordance($s))
+			echo "Discordance for ".bin2hex($s),"\n";
+		$s = chr($b1).chr($b2)."\x80\xBF";
+		if (!concordance($s))
+			echo "Discordance for ".bin2hex($s),"\n";
+		$s = chr($b1).chr($b2)."\xBF\xBF";
+		if (!concordance($s))
+			echo "Discordance for ".bin2hex($s),"\n";
+	}
+}
+echo "Done.\n";
+--EXPECT--
+Done.

Modified: php/php-src/branches/PHP_5_3/ext/standard/tests/strings/htmlentities-utf.phpt
===================================================================
--- php/php-src/branches/PHP_5_3/ext/standard/tests/strings/htmlentities-utf.phpt	2010-10-14 15:13:37 UTC (rev 304403)
+++ php/php-src/branches/PHP_5_3/ext/standard/tests/strings/htmlentities-utf.phpt	2010-10-14 19:14:06 UTC (rev 304404)
@@ -50,8 +50,6 @@
 %unicode|string%(16) "266561637574653b"
 %unicode|string%(0) ""
 %unicode|string%(0) ""
-%unicode|string%(8) "f7bfbfbf"
-%unicode|string%(8) "f7bfbfbf"
 %unicode|string%(0) ""
 %unicode|string%(0) ""
 %unicode|string%(0) ""
@@ -68,3 +66,5 @@
 %unicode|string%(0) ""
 %unicode|string%(0) ""
 %unicode|string%(0) ""
+%unicode|string%(0) ""
+%unicode|string%(0) ""

Modified: php/php-src/trunk/ext/standard/html.c
===================================================================
--- php/php-src/trunk/ext/standard/html.c	2010-10-14 15:13:37 UTC (rev 304403)
+++ php/php-src/trunk/ext/standard/html.c	2010-10-14 19:14:06 UTC (rev 304404)
@@ -129,7 +129,7 @@
 					MB_WRITE(c);
 					this_char = c;
 					pos++;
-				} else if (c < 0xc0) {
+				} else if (c < 0xc2) {
 					MB_FAILURE(pos);
 				} else if (c < 0xe0) {
 					CHECK_LEN(pos, 2);
@@ -161,7 +161,7 @@
 					MB_WRITE((unsigned char)str[pos + 1]);
 					MB_WRITE((unsigned char)str[pos + 2]);
 					pos += 3;
-				} else if (c < 0xf8) {
+				} else if (c < 0xf5) {
 					CHECK_LEN(pos, 4);
 					if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
 						MB_FAILURE(pos);
@@ -173,7 +173,7 @@
 						MB_FAILURE(pos);
 					}
 					this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f);
-					if (this_char < 0x10000) {
+					if (this_char < 0x10000 || this_char > 0x10FFFF) {
 						MB_FAILURE(pos);
 					}
 					MB_WRITE((unsigned char)c);

Modified: php/php-src/trunk/ext/standard/tests/strings/htmlentities-utf-2.phpt
===================================================================
--- php/php-src/trunk/ext/standard/tests/strings/htmlentities-utf-2.phpt	2010-10-14 15:13:37 UTC (rev 304403)
+++ php/php-src/trunk/ext/standard/tests/strings/htmlentities-utf-2.phpt	2010-10-14 19:14:06 UTC (rev 304404)
@@ -50,12 +50,12 @@
 %unicode|string%(16) "266561637574653b"
 %unicode|string%(2) "79"
 %unicode|string%(2) "79"
-%unicode|string%(8) "f7bfbfbf"
-%unicode|string%(8) "f7bfbfbf"
 %unicode|string%(0) ""
 %unicode|string%(0) ""
 %unicode|string%(0) ""
 %unicode|string%(0) ""
+%unicode|string%(0) ""
+%unicode|string%(0) ""
 %unicode|string%(4) "4142"
 %unicode|string%(4) "4142"
 %unicode|string%(4) "4242"

Added: php/php-src/trunk/ext/standard/tests/strings/htmlentities-utf-3.phpt
===================================================================
--- php/php-src/trunk/ext/standard/tests/strings/htmlentities-utf-3.phpt	                        (rev 0)
+++ php/php-src/trunk/ext/standard/tests/strings/htmlentities-utf-3.phpt	2010-10-14 19:14:06 UTC (rev 304404)
@@ -0,0 +1,83 @@
+--TEST--
+Test get_next_char(), used by htmlentities()/htmlspecialchars(): validity of UTF-8 sequences
+--FILE--
+<?php
+
+/* conformance to Unicode 5.2, section 3.9, D92 */
+
+$val_ranges = array(
+	array(array(0x00, 0x7F)),
+	array(array(0xC2, 0xDF), array(0x80, 0xBF)),
+	array(array(0xE0, 0xE0), array(0xA0, 0xBF), array(0x80, 0xBF)),
+	array(array(0xE1, 0xEC), array(0x80, 0xBF), array(0x80, 0xBF)),
+	array(array(0xED, 0xED), array(0x80, 0x9F), array(0x80, 0xBF)),
+	array(array(0xEE, 0xEF), array(0x80, 0xBF), array(0x80, 0xBF)),
+	array(array(0xF0, 0xF0), array(0x90, 0xBF), array(0x80, 0xBF), array(0x80, 0xBF)),
+	array(array(0xF1, 0xF3), array(0x80, 0xBF), array(0x80, 0xBF), array(0x80, 0xBF)),
+	array(array(0xF4, 0xF4), array(0x80, 0x8F), array(0x80, 0xBF), array(0x80, 0xBF)),
+);
+
+function is_valid($seq) {
+	global $val_ranges;
+	$b = ord($seq[0]);
+	foreach ($val_ranges as $l) {
+		if ($b >= $l[0][0] && $b <= $l[0][1]) {
+			if (count($l) != strlen($seq)) {
+				return false;
+			}
+			for ($n = 1; $n < strlen($seq); $n++) {
+				if (ord($seq[$n]) < $l[$n][0] || ord($seq[$n]) > $l[$n][1]) {
+					return false;
+				}
+			}
+			return true;
+		}
+	}
+	return false;
+}
+
+function concordance($s) {
+	$vhe = strlen(htmlspecialchars($s, ENT_QUOTES, "UTF-8")) > 0;
+	$v = is_valid($s);
+	return ($vhe === $v);
+}
+
+for ($b1 = 0xC0; $b1 < 0xE0; $b1++) {
+	for ($b2 = 0x80; $b2 < 0xBF; $b2++) {
+		$s = chr($b1).chr($b2);
+		if (!concordance($s))
+			echo "Discordance for ".bin2hex($s),"\n";
+	}
+}
+
+
+for ($b1 = 0xE0; $b1 < 0xEF; $b1++) {
+	for ($b2 = 0x80; $b2 < 0xBF; $b2++) {
+		$s = chr($b1).chr($b2)."\x80";
+		if (!concordance($s))
+			echo "Discordance for ".bin2hex($s),"\n";
+		$s = chr($b1).chr($b2)."\xBF";
+		if (!concordance($s))
+			echo "Discordance for ".bin2hex($s),"\n";
+	}
+}
+
+for ($b1 = 0xF0; $b1 < 0xFF; $b1++) {
+	for ($b2 = 0x80; $b2 < 0xBF; $b2++) {
+		$s = chr($b1).chr($b2)."\x80\x80";
+		if (!concordance($s))
+			echo "Discordance for ".bin2hex($s),"\n";
+		$s = chr($b1).chr($b2)."\xBF\x80";
+		if (!concordance($s))
+			echo "Discordance for ".bin2hex($s),"\n";
+		$s = chr($b1).chr($b2)."\x80\xBF";
+		if (!concordance($s))
+			echo "Discordance for ".bin2hex($s),"\n";
+		$s = chr($b1).chr($b2)."\xBF\xBF";
+		if (!concordance($s))
+			echo "Discordance for ".bin2hex($s),"\n";
+	}
+}
+echo "Done.\n";
+--EXPECT--
+Done.

Modified: php/php-src/trunk/ext/standard/tests/strings/htmlentities-utf.phpt
===================================================================
--- php/php-src/trunk/ext/standard/tests/strings/htmlentities-utf.phpt	2010-10-14 15:13:37 UTC (rev 304403)
+++ php/php-src/trunk/ext/standard/tests/strings/htmlentities-utf.phpt	2010-10-14 19:14:06 UTC (rev 304404)
@@ -50,8 +50,6 @@
 %unicode|string%(16) "266561637574653b"
 %unicode|string%(0) ""
 %unicode|string%(0) ""
-%unicode|string%(8) "f7bfbfbf"
-%unicode|string%(8) "f7bfbfbf"
 %unicode|string%(0) ""
 %unicode|string%(0) ""
 %unicode|string%(0) ""
@@ -68,3 +66,5 @@
 %unicode|string%(0) ""
 %unicode|string%(0) ""
 %unicode|string%(0) ""
+%unicode|string%(0) ""
+%unicode|string%(0) ""
-- 
PHP CVS Mailing List (http://www.php.net/)
To unsubscribe, visit: http://www.php.net/unsub.php

Reply via email to