andrei Mon Sep 18 20:16:36 2006 UTC Modified files: /php-src/ext/pcre php_pcre.c Log: Upgrade preg_split() to support Unicode strings. http://cvs.php.net/viewvc.cgi/php-src/ext/pcre/php_pcre.c?r1=1.198&r2=1.199&diff_format=u Index: php-src/ext/pcre/php_pcre.c diff -u php-src/ext/pcre/php_pcre.c:1.198 php-src/ext/pcre/php_pcre.c:1.199 --- php-src/ext/pcre/php_pcre.c:1.198 Mon Sep 18 17:59:10 2006 +++ php-src/ext/pcre/php_pcre.c Mon Sep 18 20:16:36 2006 @@ -16,15 +16,12 @@ +----------------------------------------------------------------------+ */ -/* $Id: php_pcre.c,v 1.198 2006/09/18 17:59:10 andrei Exp $ */ +/* $Id: php_pcre.c,v 1.199 2006/09/18 20:16:36 andrei Exp $ */ /* UTODO * - PCRE_NO_UTF8_CHECK option for Unicode strings - * - * php_pcre_match_impl(): - * - need to return matched substrings in the type matching the arguments - * - subpattern names - need to convert using UTF(utf8_conv) or just - * UG(runtime_encoding_conv) ? + * - add_offset_pair() should convert offset to refer to codepoints or bytes, + * depending on whether subject string is IS_UNICODE or IS_STRING * * php_pcre_split_impl(): * - Avoid the /./ bump for Unicode strings with U8_FWD_1() @@ -776,7 +773,7 @@ to achieve this, unless we're already at the end of the string. */ if (g_notempty != 0 && start_offset < subject_len) { offsets[0] = start_offset; - if (UG(unicode)) { + if (UG(unicode) || pce->compile_options & PCRE_UTF8) { offsets[1] = start_offset; U8_FWD_1(subject, offsets[1], subject_len); } else { @@ -1213,7 +1210,7 @@ to achieve this, unless we're already at the end of the string. */ if (g_notempty != 0 && start_offset < subject_len) { offsets[0] = start_offset; - if (UG(unicode)) { + if (UG(unicode) || pce->compile_options & PCRE_UTF8) { offsets[1] = start_offset; U8_FWD_1(subject, offsets[1], subject_len); } else { @@ -1459,37 +1456,57 @@ Split string into an array using a perl-style regular expression as a delimiter */ PHP_FUNCTION(preg_split) { - char *regex; /* Regular expression */ - char *subject; /* String to match against */ + zstr regex; /* Regular expression */ + zstr subject; /* String to match against */ int regex_len; int subject_len; long limit_val = -1;/* Integer value of limit */ long flags = 0; /* Match control flags */ pcre_cache_entry *pce; /* Compiled regular expression */ + zend_uchar str_type; + char *regex_utf8 = NULL, *subject_utf8 = NULL; + int regex_utf8_len, subject_utf8_len; + UErrorCode status = U_ZERO_ERROR; /* Get function parameters and do error checking */ - if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|ll", ®ex, ®ex_len, - &subject, &subject_len, &limit_val, &flags) == FAILURE) { + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "TT|ll", ®ex, ®ex_len, &str_type, + &subject, &subject_len, &str_type, &limit_val, &flags) == FAILURE) { RETURN_FALSE; } + if (str_type == IS_UNICODE) { + zend_unicode_to_string_ex(UG(utf8_conv), ®ex_utf8, ®ex_utf8_len, regex.u, regex_len, &status); + zend_unicode_to_string_ex(UG(utf8_conv), &subject_utf8, &subject_utf8_len, subject.u, subject_len, &status); + regex.s = regex_utf8; + regex_len = regex_utf8_len; + subject.s = subject_utf8; + subject_len = subject_utf8_len; + } + /* Compile regex or get it from cache. */ - if ((pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC)) == NULL) { + if ((pce = pcre_get_compiled_regex_cache(regex.s, regex_len TSRMLS_CC)) == NULL) { RETURN_FALSE; + if (str_type == IS_UNICODE) { + efree(regex_utf8); + efree(subject_utf8); + } } - php_pcre_split_impl(pce, subject, subject_len, return_value, limit_val, flags TSRMLS_CC); + php_pcre_split_impl(pce, subject.s, subject_len, return_value, limit_val, flags TSRMLS_CC); + + if (str_type == IS_UNICODE) { + efree(regex_utf8); + efree(subject_utf8); + } } /* }}} */ -/* {{{ php_pcre_split +/* {{{ php_pcre_split_impl */ PHPAPI void php_pcre_split_impl(pcre_cache_entry *pce, char *subject, int subject_len, zval *return_value, long limit_val, long flags TSRMLS_DC) { pcre_extra *extra = NULL; /* Holds results of studying */ - pcre *re_bump = NULL; /* Regex instance for empty matches */ - pcre_extra *extra_bump = NULL; /* Almost dummy */ pcre_extra extra_data; /* Used locally for exec options */ int *offsets; /* Array of subpattern offsets */ int size_offsets; /* Size of the offsets array */ @@ -1538,6 +1555,10 @@ last_match = subject; match = NULL; PCRE_G(error_code) = PHP_PCRE_NO_ERROR; + + if (UG(unicode)) { + exoptions |= PCRE_NO_UTF8_CHECK; + } /* Get next piece if no limit or limit not yet reached and something matched*/ while ((limit_val == -1 || limit_val > 1)) { @@ -1559,11 +1580,11 @@ if (offset_capture) { /* Add (match, offset) pair to the return value */ - add_offset_pair(return_value, last_match, &subject[offsets[0]]-last_match, next_offset, NULL, 0 TSRMLS_CC); + add_offset_pair(return_value, last_match, &subject[offsets[0]]-last_match, next_offset, NULL, UG(unicode) TSRMLS_CC); } else { /* Add the piece to the return value */ - add_next_index_stringl(return_value, last_match, - &subject[offsets[0]]-last_match, 1); + add_next_index_utf8_stringl(return_value, last_match, + &subject[offsets[0]]-last_match, 1); } /* One less left to do */ @@ -1581,11 +1602,11 @@ /* If we have matched a delimiter */ if (!no_empty || match_len > 0) { if (offset_capture) { - add_offset_pair(return_value, &subject[offsets[i<<1]], match_len, offsets[i<<1], NULL, 0 TSRMLS_CC); + add_offset_pair(return_value, &subject[offsets[i<<1]], match_len, + offsets[i<<1], NULL, UG(unicode) TSRMLS_CC); } else { - add_next_index_stringl(return_value, - &subject[offsets[i<<1]], - match_len, 1); + add_next_index_utf8_stringl(return_value, &subject[offsets[i<<1]], + match_len, 1); } } } @@ -1596,24 +1617,11 @@ the start offset, and continue. Fudge the offset values to achieve this, unless we're already at the end of the string. */ if (g_notempty != 0 && start_offset < subject_len) { - if (pce->compile_options & PCRE_UTF8) { - if (re_bump == NULL) { - int dummy; - - if ((re_bump = pcre_get_compiled_regex("/./u", &extra_bump, &dummy TSRMLS_CC)) == NULL) { - RETURN_FALSE; - } - } - count = pcre_exec(re_bump, extra_bump, subject, - subject_len, start_offset, - exoptions, offsets, size_offsets); - if (count < 1) { - php_error_docref(NULL TSRMLS_CC,E_NOTICE, "Unknown error"); - offsets[0] = start_offset; - offsets[1] = start_offset + 1; - } + offsets[0] = start_offset; + if (UG(unicode) || pce->compile_options & PCRE_UTF8) { + offsets[1] = start_offset; + U8_FWD_1(subject, offsets[1], subject_len); } else { - offsets[0] = start_offset; offsets[1] = start_offset + 1; } } else @@ -1638,13 +1646,13 @@ { if (offset_capture) { /* Add the last (match, offset) pair to the return value */ - add_offset_pair(return_value, &subject[start_offset], subject_len - start_offset, start_offset, NULL, 0 TSRMLS_CC); + add_offset_pair(return_value, &subject[start_offset], + subject_len - start_offset, start_offset, NULL, UG(unicode) TSRMLS_CC); } else { /* Add the last piece to the return value */ - add_next_index_stringl(return_value, last_match, subject + subject_len - last_match, 1); + add_next_index_utf8_stringl(return_value, last_match, subject + subject_len - last_match, 1); } } - /* Clean up */ efree(offsets);
-- PHP CVS Mailing List (http://www.php.net/) To unsubscribe, visit: http://www.php.net/unsub.php