andrei Thu Oct 5 22:40:38 2006 UTC Modified files: /php-src unicode-progress.txt /php-src/ext/standard string.c Log: Unicode support for str_replace() and str_ireplace(). # This was not trivial.
http://cvs.php.net/viewvc.cgi/php-src/unicode-progress.txt?r1=1.54&r2=1.55&diff_format=u Index: php-src/unicode-progress.txt diff -u php-src/unicode-progress.txt:1.54 php-src/unicode-progress.txt:1.55 --- php-src/unicode-progress.txt:1.54 Wed Oct 4 17:25:15 2006 +++ php-src/unicode-progress.txt Thu Oct 5 22:40:38 2006 @@ -26,29 +26,6 @@ sscanf() Params API. Rest - no idea yet. - str_replace() - stri_replace() - These are the problematic ones. There are a few approaches: - - 1. Case-fold both need and haystack and then do simple search. - - 2. Look at the implementation behind functions like - u_strcasecmp() and try to adapt it to a string search. The - implementation case-folds both strings incrementally. For - a search, one would want to case-fold the pattern beforehand, - but not the text in which you are searching. - - 3. Take the first character in the pattern and get the set of - all characters that have the same case folding (see the - UnicodeSet/USet API). Then search in the string for the - occurrence of any one of the set items (which include - strings!). Then do a case-insensitive comparison, allowing - a match that does not end with the end of the text. - - The problematic cases are of course those ß->ss and similar. - - All other approaches bite. - strnatcmp(), strnatcasecmp() Params API. 
The rest depends on porting of strnatcmp.c @@ -145,6 +122,8 @@ similar_text() str_pad() str_repeat() + str_replace() + stri_replace() str_rot13() str_shuffle() str_split() http://cvs.php.net/viewvc.cgi/php-src/ext/standard/string.c?r1=1.604&r2=1.605&diff_format=u Index: php-src/ext/standard/string.c diff -u php-src/ext/standard/string.c:1.604 php-src/ext/standard/string.c:1.605 --- php-src/ext/standard/string.c:1.604 Wed Oct 4 17:38:09 2006 +++ php-src/ext/standard/string.c Thu Oct 5 22:40:38 2006 @@ -18,7 +18,7 @@ +----------------------------------------------------------------------+ */ -/* $Id: string.c,v 1.604 2006/10/04 17:38:09 andrei Exp $ */ +/* $Id: string.c,v 1.605 2006/10/05 22:40:38 andrei Exp $ */ /* Synced with php 3.0 revision 1.193 1999-06-16 [ssb] */ @@ -4877,6 +4877,9 @@ target += to_len; p++; s = p; + if (replace_count) { + *replace_count += 1; + } } if (s < e) { memcpy(target, s, (e - s)); @@ -4912,6 +4915,53 @@ } /* }}} */ +/* {{{ php_u_char_to_str_ex */ +PHPAPI int php_u_char_to_str_ex(UChar *str, uint len, UChar from, UChar *to, int to_len, zval *result, int *replace_count) +{ + int char_count = 0; + int replaced = 0; + UChar *target; + UChar *p, *e, *s; + + p = str; + e = p + len; + while ((p = u_memchr(p, from, (e - p)))) { + char_count++; + p++; + } + + if (char_count == 0) { + ZVAL_UNICODEL(result, str, len, 1); + return 0; + } + + Z_USTRLEN_P(result) = len + (char_count * (to_len - 1)); + Z_USTRVAL_P(result) = target = eumalloc(Z_USTRLEN_P(result) + 1); + Z_TYPE_P(result) = IS_UNICODE; + + p = str; + e = p + len; + s = str; + while ((p = u_memchr(p, from, (e - p)))) { + u_memcpy(target, s, (p - s)); + target += p - s; + u_memcpy(target, to, to_len); + target += to_len; + p++; + s = p; + if (replace_count) { + *replace_count += 1; + } + } + if (s < e) { + u_memcpy(target, s, (e - s)); + target += e - s; + } + *target = 0; + return replaced; +} +/* }}} */ + /* {{{ php_str_to_str_ex */ PHPAPI char *php_str_to_str_ex(char *haystack, int 
length, @@ -5067,31 +5117,234 @@ } /* }}} */ -/* {{{ php_str_to_str - */ +/* {{{ php_str_to_str */ PHPAPI char *php_str_to_str(char *haystack, int length, char *needle, int needle_len, char *str, int str_len, int *_new_length) { return php_str_to_str_ex(haystack, length, needle, needle_len, str, str_len, _new_length, 1, NULL); } -/* }}} - */ +/* }}} */ + +/* {{{ php_u_str_to_str_ex */ +PHPAPI UChar *php_u_str_to_str_ex(UChar *haystack, int length, + UChar *needle, int needle_len, UChar *repl, int repl_len, int *_new_length, int *replace_count) +{ + UChar *new_str; + + if (needle_len < length) { + UChar *end; + UChar *e, *s, *p, *r; + + if (needle_len == repl_len) { + new_str = eustrndup(haystack, length); + *_new_length = length; + + end = new_str + length; + for (p = new_str; (r = zend_u_memnstr(p, needle, needle_len, end)); p = r + needle_len) { + u_memcpy(r, repl, repl_len); + if (replace_count) { + (*replace_count)++; + } + } + return new_str; + } else { + + if (repl_len < needle_len) { + new_str = eumalloc(length + 1); + } else { + int count = 0; + UChar *o, *n, *endp; + + o = haystack; + n = needle; + endp = o + length; + + while ((o = zend_u_memnstr(o, n, needle_len, endp))) { + o += needle_len; + count++; + } + if (count == 0) { + /* Needle doesn't occur, shortcircuit the actual replacement. 
*/ + new_str = eustrndup(haystack, length); + if (_new_length) { + *_new_length = length; + } + return new_str; + } else { + new_str = safe_emalloc(count, UBYTES(repl_len - needle_len), UBYTES(length + 1)); + } + } + + e = s = new_str; + + end = haystack + length; + for (p = haystack; (r = zend_u_memnstr(p, needle, needle_len, end)); p = r + needle_len) { + u_memcpy(e, p, r - p); + e += r - p; + u_memcpy(e, repl, repl_len); + e += repl_len; + if (replace_count) { + (*replace_count)++; + } + } + + if (p < end) { + u_memcpy(e, p, end - p); + e += end - p; + } + + *e = 0; + *_new_length = e - s; + + new_str = eurealloc(new_str, *_new_length + 1); + return new_str; + } + } else if (needle_len > length) { +nothing_todo: + *_new_length = length; + new_str = eustrndup(haystack, length); + return new_str; + } else { + if (u_strncmp(haystack, needle, length)) { + goto nothing_todo; + } else { + *_new_length = repl_len; + new_str = eustrndup(repl, repl_len); + if (replace_count) { + (*replace_count)++; + } + return new_str; + } + } +} +/* }}} */ + +/* {{{ php_u_str_to_str_case_ex */ +PHPAPI UChar *php_u_str_to_str_case_ex(UChar *str, int str_len, + UChar *pat, int pat_len, UChar *repl, int repl_len, int *result_len, int *replace_count TSRMLS_DC) +{ + UChar *str_fold, *pat_fold, *result; + int str_fold_len, pat_fold_len, alloc_len; + UChar *p, *found, *end; + int offset, found_len; + UErrorCode status = U_ZERO_ERROR; + + zend_case_fold_string(&str_fold, &str_fold_len, str, str_len, U_FOLD_CASE_DEFAULT, &status); + if (str_fold_len == str_len) { + alloc_len = str_len; + result = eumalloc(alloc_len + 1); + *result_len = 0; + + zend_case_fold_string(&pat_fold, &pat_fold_len, pat, pat_len, U_FOLD_CASE_DEFAULT, &status); + + end = str_fold + str_fold_len; + for (p = str_fold; + (found = u_strFindFirst(p, end - p, pat_fold, pat_fold_len)); + p = found + pat_fold_len) { /* we can increment by pattern length since the match in + the subject string is guaranteed to be of the same 
length */ + + if (*result_len + (found - p + repl_len) > alloc_len) { + alloc_len += (alloc_len >> 1); /* grow by 1.5x factor */ + result = eurealloc(result, alloc_len + 1); + } + u_memcpy(result + *result_len, str + (p - str_fold), found - p); + *result_len += found - p; + u_memcpy(result + *result_len, repl, repl_len); + *result_len += repl_len; + if (replace_count) { + (*replace_count)++; + } + } + + /* add remaining chunk, if any */ + if (p < end) { + int chunk_len = end - p; + if (*result_len + chunk_len > alloc_len) { + alloc_len += *result_len + chunk_len - alloc_len; + result = eurealloc(result, alloc_len + 1); + } + u_memcpy(result + *result_len, str + (p - str_fold), chunk_len); + *result_len += chunk_len; + } + + result[*result_len] = 0; + if (*result_len < alloc_len) { + result = eurealloc(result, *result_len + 1); + } + + efree(pat_fold); + } else { + usearch_setText(UG(root_search), str, str_len, &status); + usearch_setPattern(UG(root_search), pat, pat_len, &status); + usearch_setOffset(UG(root_search), 0, &status); + + alloc_len = str_len; + result = eumalloc(alloc_len + 1); + *result_len = 0; + + p = str; + end = str + str_len; + for (offset = usearch_first(UG(root_search), &status); + offset != USEARCH_DONE; + offset = usearch_next(UG(root_search), &status)) { + + found = str + offset; + /* matched length is not simply pattern length */ + found_len = usearch_getMatchedLength(UG(root_search)); + + if (*result_len + (found - p + repl_len) > alloc_len) { + alloc_len += (alloc_len >> 1); /* grow by 1.5x factor */ + result = eurealloc(result, alloc_len + 1); + } + u_memcpy(result + *result_len, p, found - p); + *result_len += found - p; + u_memcpy(result + *result_len, repl, repl_len); + *result_len += repl_len; + + p = found + found_len; + + if (replace_count) { + (*replace_count)++; + } + } + + /* add remaining chunk, if any */ + if (p < end) { + int chunk_len = end - p; + if (*result_len + chunk_len > alloc_len) { + alloc_len += *result_len + 
chunk_len - alloc_len; + result = eurealloc(result, alloc_len + 1); + } + u_memcpy(result + *result_len, p, chunk_len); + *result_len += chunk_len; + } + + result[*result_len] = 0; + if (*result_len < alloc_len) { + result = eurealloc(result, *result_len + 1); + } + } + efree(str_fold); + + return result; +} +/* }}} */ /* {{{ php_str_replace_in_subject */ -static void php_str_replace_in_subject(zval *search, zval *replace, zval **subject, zval *result, int case_sensitivity, int *replace_count) +static void php_str_replace_in_subject(zval *search, zval *replace, zval **subject, zval *result, int case_sensitivity, int *replace_count TSRMLS_DC) { zval **search_entry, **replace_entry = NULL, temp_result; - char *replace_value = NULL; + zstr replace_value = NULL_ZSTR; int replace_len = 0; /* Make sure we're dealing with strings. */ - convert_to_string_ex(subject); - Z_TYPE_P(result) = IS_STRING; - if (Z_STRLEN_PP(subject) == 0) { - ZVAL_STRINGL(result, "", 0, 1); + convert_to_text_ex(subject); + Z_TYPE_P(result) = ZEND_STR_TYPE; + if (Z_UNILEN_PP(subject) == 0) { + ZVAL_EMPTY_TEXT(result); return; } @@ -5108,16 +5361,16 @@ zend_hash_internal_pointer_reset(Z_ARRVAL_P(replace)); } else { /* Set replacement value to the passed one */ - replace_value = Z_STRVAL_P(replace); - replace_len = Z_STRLEN_P(replace); + replace_value = Z_UNIVAL_P(replace); + replace_len = Z_UNILEN_P(replace); } /* For each entry in the search array, get the entry */ while (zend_hash_get_current_data(Z_ARRVAL_P(search), (void **) &search_entry) == SUCCESS) { /* Make sure we're dealing with strings. 
*/ SEPARATE_ZVAL(search_entry); - convert_to_string(*search_entry); - if (Z_STRLEN_PP(search_entry) == 0) { + convert_to_text(*search_entry); + if (Z_UNILEN_PP(search_entry) == 0) { zend_hash_move_forward(Z_ARRVAL_P(search)); if (Z_TYPE_P(replace) == IS_ARRAY) { zend_hash_move_forward(Z_ARRVAL_P(replace)); @@ -5130,59 +5383,110 @@ /* Get current entry */ if (zend_hash_get_current_data(Z_ARRVAL_P(replace), (void **)&replace_entry) == SUCCESS) { /* Make sure we're dealing with strings. */ - convert_to_string_ex(replace_entry); + SEPARATE_ZVAL(replace_entry); + convert_to_text(*replace_entry); /* Set replacement value to the one we got from array */ - replace_value = Z_STRVAL_PP(replace_entry); - replace_len = Z_STRLEN_PP(replace_entry); + replace_value = Z_UNIVAL_PP(replace_entry); + replace_len = Z_UNILEN_PP(replace_entry); zend_hash_move_forward(Z_ARRVAL_P(replace)); } else { /* We've run out of replacement strings, so use an empty one. */ - replace_value = ""; + replace_value = EMPTY_ZSTR; replace_len = 0; } } - if (Z_STRLEN_PP(search_entry) == 1) { - php_char_to_str_ex(Z_STRVAL_P(result), - Z_STRLEN_P(result), - Z_STRVAL_PP(search_entry)[0], - replace_value, - replace_len, - &temp_result, - case_sensitivity, - replace_count); - } else if (Z_STRLEN_PP(search_entry) > 1) { - Z_STRVAL(temp_result) = php_str_to_str_ex(Z_STRVAL_P(result), Z_STRLEN_P(result), - Z_STRVAL_PP(search_entry), Z_STRLEN_PP(search_entry), - replace_value, replace_len, &Z_STRLEN(temp_result), case_sensitivity, replace_count); - } - - efree(Z_STRVAL_P(result)); - Z_STRVAL_P(result) = Z_STRVAL(temp_result); - Z_STRLEN_P(result) = Z_STRLEN(temp_result); + if (Z_UNILEN_PP(search_entry) == 1) { + if (UG(unicode)) { + if (case_sensitivity) { + php_u_char_to_str_ex(Z_USTRVAL_P(result), Z_USTRLEN_P(result), + Z_USTRVAL_PP(search_entry)[0], + replace_value.u, replace_len, + &temp_result, replace_count); + } else { + Z_USTRVAL(temp_result) = php_u_str_to_str_case_ex(Z_USTRVAL_P(result), 
Z_USTRLEN_P(result), + Z_USTRVAL_PP(search_entry), Z_USTRLEN_PP(search_entry), + replace_value.u, replace_len, + &Z_USTRLEN(temp_result), replace_count TSRMLS_CC); + } + } else { + php_char_to_str_ex(Z_STRVAL_P(result), Z_STRLEN_P(result), + Z_STRVAL_PP(search_entry)[0], + replace_value.s, replace_len, + &temp_result, case_sensitivity, replace_count); + } + } else if (Z_UNILEN_PP(search_entry) > 1) { + if (UG(unicode)) { + if (case_sensitivity) { + Z_USTRVAL(temp_result) = php_u_str_to_str_ex(Z_USTRVAL_P(result), Z_USTRLEN_P(result), + Z_USTRVAL_PP(search_entry), Z_USTRLEN_PP(search_entry), + replace_value.u, replace_len, + &Z_USTRLEN(temp_result), replace_count); + } else { + Z_USTRVAL(temp_result) = php_u_str_to_str_case_ex(Z_USTRVAL_P(result), Z_USTRLEN_P(result), + Z_USTRVAL_PP(search_entry), Z_USTRLEN_PP(search_entry), + replace_value.u, replace_len, + &Z_USTRLEN(temp_result), replace_count TSRMLS_CC); + } + } else { + Z_STRVAL(temp_result) = php_str_to_str_ex(Z_STRVAL_P(result), Z_STRLEN_P(result), + Z_STRVAL_PP(search_entry), Z_STRLEN_PP(search_entry), + replace_value.s, replace_len, + &Z_STRLEN(temp_result), case_sensitivity, replace_count); + } + } + + efree(Z_UNIVAL_P(result).v); + Z_UNIVAL_P(result) = Z_UNIVAL(temp_result); + Z_UNILEN_P(result) = Z_UNILEN(temp_result); - if (Z_STRLEN_P(result) == 0) { + if (Z_UNILEN_P(result) == 0) { return; } zend_hash_move_forward(Z_ARRVAL_P(search)); } } else { - if (Z_STRLEN_P(search) == 1) { - php_char_to_str_ex(Z_STRVAL_PP(subject), - Z_STRLEN_PP(subject), - Z_STRVAL_P(search)[0], - Z_STRVAL_P(replace), - Z_STRLEN_P(replace), - result, - case_sensitivity, - replace_count); + if (Z_UNILEN_P(search) == 1) { + if (UG(unicode)) { + if (case_sensitivity) { + php_u_char_to_str_ex(Z_USTRVAL_PP(subject), Z_USTRLEN_PP(subject), + Z_USTRVAL_P(search)[0], + Z_USTRVAL_P(replace), Z_USTRLEN_P(replace), + result, replace_count); + } else { + Z_USTRVAL_P(result) = php_u_str_to_str_case_ex(Z_USTRVAL_PP(subject), 
Z_USTRLEN_PP(subject), + Z_USTRVAL_P(search), Z_USTRLEN_P(search), + Z_USTRVAL_P(replace), Z_USTRLEN_P(replace), + &Z_USTRLEN_P(result), replace_count TSRMLS_CC); + } + } else { + php_char_to_str_ex(Z_STRVAL_PP(subject), Z_STRLEN_PP(subject), + Z_STRVAL_P(search)[0], + Z_STRVAL_P(replace), Z_STRLEN_P(replace), + result, case_sensitivity, replace_count); + } } else if (Z_STRLEN_P(search) > 1) { - Z_STRVAL_P(result) = php_str_to_str_ex(Z_STRVAL_PP(subject), Z_STRLEN_PP(subject), - Z_STRVAL_P(search), Z_STRLEN_P(search), - Z_STRVAL_P(replace), Z_STRLEN_P(replace), &Z_STRLEN_P(result), case_sensitivity, replace_count); + if (UG(unicode)) { + if (case_sensitivity) { + Z_USTRVAL_P(result) = php_u_str_to_str_ex(Z_USTRVAL_PP(subject), Z_USTRLEN_PP(subject), + Z_USTRVAL_P(search), Z_USTRLEN_P(search), + Z_USTRVAL_P(replace), Z_USTRLEN_P(replace), + &Z_USTRLEN_P(result), replace_count); + } else { + Z_USTRVAL_P(result) = php_u_str_to_str_case_ex(Z_USTRVAL_PP(subject), Z_USTRLEN_PP(subject), + Z_USTRVAL_P(search), Z_USTRLEN_P(search), + Z_USTRVAL_P(replace), Z_USTRLEN_P(replace), + &Z_USTRLEN_P(result), replace_count TSRMLS_CC); + } + } else { + Z_STRVAL_P(result) = php_str_to_str_ex(Z_STRVAL_PP(subject), Z_STRLEN_PP(subject), + Z_STRVAL_P(search), Z_STRLEN_P(search), + Z_STRVAL_P(replace), Z_STRLEN_P(replace), + &Z_STRLEN_P(result), case_sensitivity, replace_count); + } } else { *result = **subject; zval_copy_ctor(result); @@ -5196,7 +5500,7 @@ */ static void php_str_replace_common(INTERNAL_FUNCTION_PARAMETERS, int case_sensitivity) { - zval **subject, **search, **replace, **subject_entry, **zcount; + zval *subject, *search, *replace, **subject_entry, *zcount; zval *result; zstr string_key; uint string_key_len; @@ -5204,45 +5508,41 @@ int count = 0; int argc = ZEND_NUM_ARGS(); - if (argc < 3 || argc > 4 || - zend_get_parameters_ex(argc, &search, &replace, &subject, &zcount) == FAILURE) { - WRONG_PARAM_COUNT; + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "z/z/z/|z", 
&search, + &replace, &subject, &zcount) == FAILURE) { + return; } - SEPARATE_ZVAL(search); - SEPARATE_ZVAL(replace); - SEPARATE_ZVAL(subject); - /* Make sure we're dealing with strings and do the replacement. */ - if (Z_TYPE_PP(search) != IS_ARRAY) { - convert_to_string_ex(search); - convert_to_string_ex(replace); - } else if (Z_TYPE_PP(replace) != IS_ARRAY) { - convert_to_string_ex(replace); + if (Z_TYPE_P(search) != IS_ARRAY) { + convert_to_text(search); + convert_to_text(replace); + } else if (Z_TYPE_P(replace) != IS_ARRAY) { + convert_to_text(replace); } /* if subject is an array */ - if (Z_TYPE_PP(subject) == IS_ARRAY) { + if (Z_TYPE_P(subject) == IS_ARRAY) { array_init(return_value); - zend_hash_internal_pointer_reset(Z_ARRVAL_PP(subject)); + zend_hash_internal_pointer_reset(Z_ARRVAL_P(subject)); /* For each subject entry, convert it to string, then perform replacement and add the result to the return_value array. */ - while (zend_hash_get_current_data(Z_ARRVAL_PP(subject), (void **)&subject_entry) == SUCCESS) { + while (zend_hash_get_current_data(Z_ARRVAL_P(subject), (void **)&subject_entry) == SUCCESS) { zend_uchar utype; if (Z_TYPE_PP(subject_entry) != IS_ARRAY && Z_TYPE_PP(subject_entry) != IS_OBJECT) { MAKE_STD_ZVAL(result); SEPARATE_ZVAL(subject_entry); - php_str_replace_in_subject(*search, *replace, subject_entry, result, case_sensitivity, (argc > 3) ? &count : NULL); + php_str_replace_in_subject(search, replace, subject_entry, result, case_sensitivity, (argc > 3) ? 
&count : NULL TSRMLS_CC); } else { ALLOC_ZVAL(result); ZVAL_ADDREF(*subject_entry); COPY_PZVAL_TO_ZVAL(*result, *subject_entry); } /* Add to return array */ - switch ((utype = zend_hash_get_current_key_ex(Z_ARRVAL_PP(subject), &string_key, - &string_key_len, &num_key, 0, NULL))) { + switch ((utype = zend_hash_get_current_key_ex(Z_ARRVAL_P(subject), &string_key, + &string_key_len, &num_key, 0, NULL))) { case HASH_KEY_IS_STRING: case HASH_KEY_IS_UNICODE: add_u_assoc_zval_ex(return_value, utype, string_key, string_key_len, result); @@ -5253,19 +5553,19 @@ break; } - zend_hash_move_forward(Z_ARRVAL_PP(subject)); + zend_hash_move_forward(Z_ARRVAL_P(subject)); } } else { /* if subject is not an array */ - php_str_replace_in_subject(*search, *replace, subject, return_value, case_sensitivity, (argc > 3) ? &count : NULL); + php_str_replace_in_subject(search, replace, &subject, return_value, case_sensitivity, (argc > 3) ? &count : NULL TSRMLS_CC); } if (argc > 3) { - zval_dtor(*zcount); - ZVAL_LONG(*zcount, count); + zval_dtor(zcount); + ZVAL_LONG(zcount, count); } } /* }}} */ -/* {{{ proto mixed str_replace(mixed search, mixed replace, mixed subject [, int &replace_count]) +/* {{{ proto mixed str_replace(mixed search, mixed replace, mixed subject [, int &replace_count]) U Replaces all occurrences of search in haystack with replace */ PHP_FUNCTION(str_replace) { @@ -5273,7 +5573,7 @@ } /* }}} */ -/* {{{ proto mixed str_ireplace(mixed search, mixed replace, mixed subject [, int &replace_count]) +/* {{{ proto mixed str_ireplace(mixed search, mixed replace, mixed subject [, int &replace_count]) U Replaces all occurrences of search in haystack with replace / case-insensitive */ PHP_FUNCTION(str_ireplace) {
-- PHP CVS Mailing List (http://www.php.net/) To unsubscribe, visit: http://www.php.net/unsub.php