andrei Tue Sep 19 20:01:11 2006 UTC
Modified files:
/php-src/ext/pcre php_pcre.c
Log:
Upgrade preg_quote() to support Unicode.
http://cvs.php.net/viewvc.cgi/php-src/ext/pcre/php_pcre.c?r1=1.201&r2=1.202&diff_format=u
Index: php-src/ext/pcre/php_pcre.c
diff -u php-src/ext/pcre/php_pcre.c:1.201 php-src/ext/pcre/php_pcre.c:1.202
--- php-src/ext/pcre/php_pcre.c:1.201 Tue Sep 19 17:37:48 2006
+++ php-src/ext/pcre/php_pcre.c Tue Sep 19 20:01:10 2006
@@ -16,7 +16,7 @@
+----------------------------------------------------------------------+
*/
-/* $Id: php_pcre.c,v 1.201 2006/09/19 17:37:48 andrei Exp $ */
+/* $Id: php_pcre.c,v 1.202 2006/09/19 20:01:10 andrei Exp $ */
/* UTODO
* - PCRE_NO_UTF8_CHECK option for Unicode strings
@@ -1684,13 +1684,13 @@
char *out_str, /* Output string with quoted characters */
*p, /* Iterator for input string */
*q, /* Iterator for output string */
- delim_char=0, /* Delimiter character to be quoted */
c; /* Current character */
+ UChar32 delim_char=0; /* Delimiter character to be quoted */
zend_bool quote_delim = 0; /* Whether to quote additional delim char */
/* Get the arguments and check for errors */
- if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|s", &in_str,
&in_str_len,
- &delim, &delim_len)
== FAILURE) {
+ if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s&|s&", &in_str,
&in_str_len, UG(utf8_conv),
+ &delim, &delim_len,
UG(utf8_conv)) == FAILURE) {
return;
}
@@ -1698,17 +1698,24 @@
/* Nothing to do if we got an empty string */
if (in_str == in_str_end) {
- RETURN_EMPTY_STRING();
+ RETURN_EMPTY_TEXT();
}
if (delim && *delim) {
- delim_char = delim[0];
+ if (UG(unicode)) {
+ U8_GET(delim, 0, 0, delim_len, delim_char);
+ } else {
+ delim_char = (UChar32)delim[0];
+ }
quote_delim = 1;
}
/* Allocate enough memory so that even if each character
- is quoted, we won't run out of room */
- out_str = safe_emalloc(4, in_str_len, 1);
+ is quoted, we won't run out of room. In Unicode mode, the longest UTF-8
+ sequence is 4 bytes, so the multiplier is (4+1). In non-Unicode mode, we
+ have to assume that any character can be '\0', which needs 4 chars to
+ be escaped. */
+ out_str = safe_emalloc(UG(unicode)?5:4, in_str_len, 1);
/* Go through the string and quote necessary characters */
for(p = in_str, q = out_str; p != in_str_end; p++) {
@@ -1745,16 +1752,28 @@
break;
default:
- if (quote_delim && c == delim_char)
- *q++ = '\\';
- *q++ = c;
+ if ((UChar32)(unsigned char)c > 0x7f) { /* non-ASCII char */
+ int tmp = 0;
+ UChar32 cp = 0;
+ U8_NEXT(p, tmp, in_str_end-p, cp);
+ if (quote_delim && cp == delim_char) {
+ *q++ = '\\';
+ }
+ memcpy(q, p, tmp);
+ q += tmp;
+ p += tmp-1; /* going to be incremented by the loop */
+ } else {
+ if (quote_delim && c == delim_char)
+ *q++ = '\\';
+ *q++ = c;
+ }
break;
}
}
*q = '\0';
/* Reallocate string and return it */
- RETVAL_STRINGL(erealloc(out_str, q - out_str + 1), q - out_str, 0);
+ RETVAL_UTF8_STRINGL(erealloc(out_str, q - out_str + 1), q - out_str, 0);
}
/* }}} */
--
PHP CVS Mailing List (http://www.php.net/)
To unsubscribe, visit: http://www.php.net/unsub.php