andrei Fri Aug 18 18:01:58 2006 UTC
Modified files:
/php-src/ext/standard string.c
/php-src unicode-progress.txt
Log:
Unicode support for str_word_count() (using the same semantics as for
binary strings).
# These semantics seem quite broken, by the way, as the function counts
# words ending or starting with dashes (-). Since this shouldn't really be
# used to count words in the Unicode world anyway, supporting the "broken"
# behavior is okay if it helps people with migration.
http://cvs.php.net/viewvc.cgi/php-src/ext/standard/string.c?r1=1.581&r2=1.582&diff_format=u
Index: php-src/ext/standard/string.c
diff -u php-src/ext/standard/string.c:1.581 php-src/ext/standard/string.c:1.582
--- php-src/ext/standard/string.c:1.581 Wed Aug 16 18:07:22 2006
+++ php-src/ext/standard/string.c Fri Aug 18 18:01:58 2006
@@ -18,7 +18,7 @@
+----------------------------------------------------------------------+
*/
-/* $Id: string.c,v 1.581 2006/08/16 18:07:22 andrei Exp $ */
+/* $Id: string.c,v 1.582 2006/08/18 18:01:58 andrei Exp $ */
/* Synced with php 3.0 revision 1.193 1999-06-16 [ssb] */
@@ -617,11 +617,11 @@
}
/* }}} */
-/* {{{ php_expand_u_trim_range()
+/* {{{ php_expand_uchar_range()
* Expands possible ranges of the form 'a..b' in input charlist,
* where a < b in code-point order
*/
-static int php_expand_u_trim_range(UChar **range, int *range_len TSRMLS_DC)
+static int php_expand_uchar_range(UChar **range, int *range_len TSRMLS_DC)
{
UChar32 *codepts, *tmp, *input, *end, c;
int32_t len, tmp_len, idx;
@@ -705,13 +705,13 @@
*/
static UChar *php_u_trim(UChar *c, int len, UChar *what, int what_len, zval
*return_value, int mode TSRMLS_DC)
{
- int32_t i, j, k;
- UChar ch = 0, wh = 0;
+ int32_t i, k;
+ UChar ch = 0;
int32_t start = 0, end = len;
if ( what ) {
what = eustrndup(what, what_len);
- php_expand_u_trim_range(&what, &what_len TSRMLS_CC);
+ php_expand_uchar_range(&what, &what_len TSRMLS_CC);
}
if ( mode & 1 ) {
@@ -6870,27 +6870,79 @@
}
/* }}} */
-/* {{{ proto mixed str_word_count(string str, [int format [, string charlist]])
- Counts the number of words inside a string. If format of 1 is specified,
- then the function will return an array containing all the words
- found inside the string. If format of 2 is specified, then the function
- will return an associated array where the position of the word is the
key
- and the word itself is the value.
-
- For the purpose of this function, 'word' is defined as a locale
dependent
- string containing alphabetic characters, which also may contain, but
not start
- with "'" and "-" characters.
-*/
-PHP_FUNCTION(str_word_count)
+/* {{{ php_u_str_word_count */
+static int php_u_str_word_count(UChar *str, int str_len, long type, UChar
*char_list, int char_list_len, zval *return_value TSRMLS_DC)
{
- char *buf, *str, *char_list = NULL, *p, *e, *s, ch[256];
- int str_len, char_list_len, word_count = 0;
- long type = 0;
+ UChar *s, *buf;
+ UChar32 ch;
+ int ws, we, tmp, idx, last_idx;
+ int word_count = 0;
- if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|ls", &str,
&str_len, &type, &char_list, &char_list_len) == FAILURE) {
- WRONG_PARAM_COUNT;
+ if (char_list) {
+ char_list = eustrndup(char_list, char_list_len);
+ php_expand_uchar_range(&char_list, &char_list_len TSRMLS_CC);
+ }
+
+ ws = idx = 0;
+ /* first character cannot be ' or -, unless explicitly allowed by the
user */
+ if ((str[ws] == (UChar)0x27 /*'\''*/ && (!char_list ||
!u_memchr(char_list, 0x27 /*'\''*/, char_list_len))) ||
+ (str[ws] == (UChar)0x2d /*'-'*/ && (!char_list ||
!u_memchr(char_list, 0x2d /*'-'*/, char_list_len)))) {
+ ws++;
+ idx++;
+ }
+ /* last character cannot be -, unless explicitly allowed by the user */
+ if (str[str_len-1] == (UChar)0x2d /*'-'*/ &&
+ (!char_list || !u_memchr(char_list, 0x2d /*'-'*/,
char_list_len))) {
+ str_len--;
+ }
+
+ last_idx = idx;
+ tmp = we = ws;
+ while (we < str_len) {
+ s = str + ws;
+ while (we < str_len) {
+ U16_NEXT(str, tmp, str_len, ch);
+ idx++;
+ if (!(u_isalpha(ch) || (char_list &&
u_memchr32(char_list, ch, char_list_len)) ||
+ ch == (UChar32) 0x27 /*'\''*/ || ch ==
(UChar32) 0x2d /*'-'*/)) {
+ break;
+ }
+ we = tmp;
+ }
+ if (we > ws) {
+ switch (type)
+ {
+ case 1:
+ buf = eustrndup(s, (we-ws));
+ add_next_index_unicodel(return_value,
buf, (we-ws), 0);
+ break;
+ case 2:
+ buf = eustrndup(s, (we-ws));
+ add_index_unicodel(return_value,
last_idx, buf, we-ws, 0);
+ break;
+ default:
+ word_count++;
+ break;
+ }
+ }
+ ws = we = tmp;
+ last_idx = idx;
+ }
+
+ if (char_list) {
+ efree(char_list);
}
+ return word_count;
+}
+/* }}} */
+
+/* {{{ php_str_word_count */
+static int php_str_word_count(char *str, int str_len, long type, char
*char_list, int char_list_len, zval *return_value)
+{
+ char ch[256], *p, *e, *s, *buf;
+ int word_count = 0;
+
if (char_list) {
php_charmask((unsigned char*)char_list, char_list_len, ch
TSRMLS_CC);
}
@@ -6898,10 +6950,6 @@
p = str;
e = str + str_len;
- if (type == 1 || type == 2) {
- array_init(return_value);
- }
-
/* first character cannot be ' or -, unless explicitly allowed by the
user */
if ((*p == '\'' && (!char_list || !ch['\''])) || (*p == '-' &&
(!char_list || !ch['-']))) {
p++;
@@ -6935,6 +6983,43 @@
p++;
}
+ return word_count;
+}
+/* }}} */
+
+/* {{{ proto mixed str_word_count(string str, [int format [, string
charlist]]) U
+ Counts the number of words inside a string. If format of 1 is specified,
+ then the function will return an array containing all the words
+ found inside the string. If format of 2 is specified, then the function
+ will return an associated array where the position of the word is the
key
+ and the word itself is the value.
+
+ For the purpose of this function, 'word' is defined as a locale
dependent
+ string containing alphabetic characters, which also may contain, but
not start
+ with "'" and "-" characters.
+*/
+PHP_FUNCTION(str_word_count)
+{
+ zstr str, char_list = NULL_ZSTR;
+ int str_len, char_list_len, word_count = 0;
+ zend_uchar str_type;
+ long type = 0;
+
+ if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "T|lT", &str,
&str_len,
+ &str_type, &type,
&char_list, &char_list_len, &str_type) == FAILURE) {
+ return;
+ }
+
+ if (type == 1 || type == 2) {
+ array_init(return_value);
+ }
+
+ if (str_type == IS_UNICODE) {
+ word_count = php_u_str_word_count(str.u, str_len, type,
char_list.u, char_list_len, return_value TSRMLS_CC);
+ } else {
+ word_count = php_str_word_count(str.s, str_len, type,
char_list.s, char_list_len, return_value);
+ }
+
if (!type) {
RETURN_LONG(word_count);
}
http://cvs.php.net/viewvc.cgi/php-src/unicode-progress.txt?r1=1.45&r2=1.46&diff_format=u
Index: php-src/unicode-progress.txt
diff -u php-src/unicode-progress.txt:1.45 php-src/unicode-progress.txt:1.46
--- php-src/unicode-progress.txt:1.45 Tue Aug 15 20:38:12 2006
+++ php-src/unicode-progress.txt Fri Aug 18 18:01:58 2006
@@ -42,9 +42,6 @@
sscanf()
Params API. Rest - no idea yet.
- str_word_count()
- Params API, IS_UNICODE support, using u_isalpha(), etc.
-
stristr()
stripos()
strripos()
@@ -173,6 +170,7 @@
str_rot13()
str_shuffle()
str_split()
+ str_word_count()
strcoll()
strcspn()
strip_tags()
--
PHP CVS Mailing List (http://www.php.net/)
To unsubscribe, visit: http://www.php.net/unsub.php