andrei Fri Aug 18 18:01:58 2006 UTC Modified files: /php-src/ext/standard string.c /php-src unicode-progress.txt Log: Unicode support for str_word_count() (using the same semantics as for binary strings). # These semantics seem quite broken, by the way, as it counts words # ending or starting with dashes (-). Since this shouldn't really be # used to count words in Unicode world anyway, supporting the "broken" # behavior is okay if it helps people with migration. http://cvs.php.net/viewvc.cgi/php-src/ext/standard/string.c?r1=1.581&r2=1.582&diff_format=u Index: php-src/ext/standard/string.c diff -u php-src/ext/standard/string.c:1.581 php-src/ext/standard/string.c:1.582 --- php-src/ext/standard/string.c:1.581 Wed Aug 16 18:07:22 2006 +++ php-src/ext/standard/string.c Fri Aug 18 18:01:58 2006 @@ -18,7 +18,7 @@ +----------------------------------------------------------------------+ */ -/* $Id: string.c,v 1.581 2006/08/16 18:07:22 andrei Exp $ */ +/* $Id: string.c,v 1.582 2006/08/18 18:01:58 andrei Exp $ */ /* Synced with php 3.0 revision 1.193 1999-06-16 [ssb] */ @@ -617,11 +617,11 @@ } /* }}} */ -/* {{{ php_expand_u_trim_range() +/* {{{ php_expand_uchar_range() * Expands possible ranges of the form 'a..b' in input charlist, * where a < b in code-point order */ -static int php_expand_u_trim_range(UChar **range, int *range_len TSRMLS_DC) +static int php_expand_uchar_range(UChar **range, int *range_len TSRMLS_DC) { UChar32 *codepts, *tmp, *input, *end, c; int32_t len, tmp_len, idx; @@ -705,13 +705,13 @@ */ static UChar *php_u_trim(UChar *c, int len, UChar *what, int what_len, zval *return_value, int mode TSRMLS_DC) { - int32_t i, j, k; - UChar ch = 0, wh = 0; + int32_t i, k; + UChar ch = 0; int32_t start = 0, end = len; if ( what ) { what = eustrndup(what, what_len); - php_expand_u_trim_range(&what, &what_len TSRMLS_CC); + php_expand_uchar_range(&what, &what_len TSRMLS_CC); } if ( mode & 1 ) { @@ -6870,27 +6870,79 @@ } /* }}} */ -/* {{{ proto mixed str_word_count(string str, [int format [, string charlist]]) - Counts the number of words inside a string. If format of 1 is specified, - then the function will return an array containing all the words - found inside the string. If format of 2 is specified, then the function - will return an associated array where the position of the word is the key - and the word itself is the value. - - For the purpose of this function, 'word' is defined as a locale dependent - string containing alphabetic characters, which also may contain, but not start - with "'" and "-" characters. -*/ -PHP_FUNCTION(str_word_count) +/* {{{ php_u_str_word_count */ +static int php_u_str_word_count(UChar *str, int str_len, long type, UChar *char_list, int char_list_len, zval *return_value TSRMLS_DC) { - char *buf, *str, *char_list = NULL, *p, *e, *s, ch[256]; - int str_len, char_list_len, word_count = 0; - long type = 0; + UChar *s, *buf; + UChar32 ch; + int ws, we, tmp, idx, last_idx; + int word_count = 0; - if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|ls", &str, &str_len, &type, &char_list, &char_list_len) == FAILURE) { - WRONG_PARAM_COUNT; + if (char_list) { + char_list = eustrndup(char_list, char_list_len); + php_expand_uchar_range(&char_list, &char_list_len TSRMLS_CC); + } + + ws = idx = 0; + /* first character cannot be ' or -, unless explicitly allowed by the user */ + if ((str[ws] == (UChar)0x27 /*'\''*/ && (!char_list || !u_memchr(char_list, 0x27 /*'\''*/, char_list_len))) || + (str[ws] == (UChar)0x2d /*'-'*/ && (!char_list || !u_memchr(char_list, 0x2d /*'-'*/, char_list_len)))) { + ws++; + idx++; + } + /* last character cannot be -, unless explicitly allowed by the user */ + if (str[str_len-1] == (UChar)0x2d /*'-'*/ && + (!char_list || !u_memchr(char_list, 0x2d /*'-'*/, char_list_len))) { + str_len--; + } + + last_idx = idx; + tmp = we = ws; + while (we < str_len) { + s = str + ws; + while (we < str_len) { + U16_NEXT(str, tmp, str_len, ch); + idx++; + if (!(u_isalpha(ch) || (char_list && u_memchr32(char_list, ch, char_list_len)) || + ch == (UChar32) 0x27 /*'\''*/ || ch == (UChar32) 0x2d /*'-'*/)) { + break; + } + we = tmp; + } + if (we > ws) { + switch (type) + { + case 1: + buf = eustrndup(s, (we-ws)); + add_next_index_unicodel(return_value, buf, (we-ws), 0); + break; + case 2: + buf = eustrndup(s, (we-ws)); + add_index_unicodel(return_value, last_idx, buf, we-ws, 0); + break; + default: + word_count++; + break; + } + } + ws = we = tmp; + last_idx = idx; + } + + if (char_list) { + efree(char_list); } + return word_count; +} +/* }}} */ + +/* {{{ php_str_word_count */ +static int php_str_word_count(char *str, int str_len, long type, char *char_list, int char_list_len, zval *return_value) +{ + char ch[256], *p, *e, *s, *buf; + int word_count = 0; + if (char_list) { php_charmask((unsigned char*)char_list, char_list_len, ch TSRMLS_CC); } @@ -6898,10 +6950,6 @@ p = str; e = str + str_len; - if (type == 1 || type == 2) { - array_init(return_value); - } - /* first character cannot be ' or -, unless explicitly allowed by the user */ if ((*p == '\'' && (!char_list || !ch['\''])) || (*p == '-' && (!char_list || !ch['-']))) { p++; @@ -6935,6 +6983,43 @@ p++; } + return word_count; +} +/* }}} */ + +/* {{{ proto mixed str_word_count(string str, [int format [, string charlist]]) U + Counts the number of words inside a string. If format of 1 is specified, + then the function will return an array containing all the words + found inside the string. If format of 2 is specified, then the function + will return an associated array where the position of the word is the key + and the word itself is the value. + + For the purpose of this function, 'word' is defined as a locale dependent + string containing alphabetic characters, which also may contain, but not start + with "'" and "-" characters. +*/ +PHP_FUNCTION(str_word_count) +{ + zstr str, char_list = NULL_ZSTR; + int str_len, char_list_len, word_count = 0; + zend_uchar str_type; + long type = 0; + + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "T|lT", &str, &str_len, + &str_type, &type, &char_list, &char_list_len, &str_type) == FAILURE) { + return; + } + + if (type == 1 || type == 2) { + array_init(return_value); + } + + if (str_type == IS_UNICODE) { + word_count = php_u_str_word_count(str.u, str_len, type, char_list.u, char_list_len, return_value TSRMLS_CC); + } else { + word_count = php_str_word_count(str.s, str_len, type, char_list.s, char_list_len, return_value); + } + if (!type) { RETURN_LONG(word_count); } http://cvs.php.net/viewvc.cgi/php-src/unicode-progress.txt?r1=1.45&r2=1.46&diff_format=u Index: php-src/unicode-progress.txt diff -u php-src/unicode-progress.txt:1.45 php-src/unicode-progress.txt:1.46 --- php-src/unicode-progress.txt:1.45 Tue Aug 15 20:38:12 2006 +++ php-src/unicode-progress.txt Fri Aug 18 18:01:58 2006 @@ -42,9 +42,6 @@ sscanf() Params API. Rest - no idea yet. - str_word_count() - Params API, IS_UNICODE support, using u_isalpha(), etc. - stristr() stripos() strripos() @@ -173,6 +170,7 @@ str_rot13() str_shuffle() str_split() + str_word_count() strcoll() strcspn() strip_tags()
-- PHP CVS Mailing List (http://www.php.net/) To unsubscribe, visit: http://www.php.net/unsub.php