andrei          Fri Aug 18 18:01:58 2006 UTC

  Modified files:              
    /php-src/ext/standard       string.c 
    /php-src    unicode-progress.txt 
  Log:
  Unicode support for str_word_count() (using the same semantics as for
  binary strings).
  
  # These semantics seem quite broken, by the way, as it counts words
  # ending or starting with dashes (-). Since this shouldn't really be
  # used to count words in Unicode world anyway, supporting the "broken"
  # behavior is okay if it helps people with migration.
  
  
http://cvs.php.net/viewvc.cgi/php-src/ext/standard/string.c?r1=1.581&r2=1.582&diff_format=u
Index: php-src/ext/standard/string.c
diff -u php-src/ext/standard/string.c:1.581 php-src/ext/standard/string.c:1.582
--- php-src/ext/standard/string.c:1.581 Wed Aug 16 18:07:22 2006
+++ php-src/ext/standard/string.c       Fri Aug 18 18:01:58 2006
@@ -18,7 +18,7 @@
    +----------------------------------------------------------------------+
  */
 
-/* $Id: string.c,v 1.581 2006/08/16 18:07:22 andrei Exp $ */
+/* $Id: string.c,v 1.582 2006/08/18 18:01:58 andrei Exp $ */
 
 /* Synced with php 3.0 revision 1.193 1999-06-16 [ssb] */
 
@@ -617,11 +617,11 @@
 }
 /* }}} */
 
-/* {{{ php_expand_u_trim_range()
+/* {{{ php_expand_uchar_range()
  * Expands possible ranges of the form 'a..b' in input charlist,
  * where a < b in code-point order
  */
-static int php_expand_u_trim_range(UChar **range, int *range_len TSRMLS_DC)
+static int php_expand_uchar_range(UChar **range, int *range_len TSRMLS_DC)
 {
        UChar32 *codepts, *tmp, *input, *end, c;
        int32_t len, tmp_len, idx;
@@ -705,13 +705,13 @@
  */
 static UChar *php_u_trim(UChar *c, int len, UChar *what, int what_len, zval *return_value, int mode TSRMLS_DC)
 {
-       int32_t i, j, k;
-       UChar   ch = 0, wh = 0;
+       int32_t i, k;
+       UChar   ch = 0;
        int32_t start = 0, end = len;
 
        if ( what ) {
                what = eustrndup(what, what_len);
-               php_expand_u_trim_range(&what, &what_len TSRMLS_CC);
+               php_expand_uchar_range(&what, &what_len TSRMLS_CC);
        }
 
        if ( mode & 1 ) {
@@ -6870,27 +6870,79 @@
 }
 /* }}} */
 
-/* {{{ proto mixed str_word_count(string str, [int format [, string charlist]])
-       Counts the number of words inside a string. If format of 1 is specified,
-       then the function will return an array containing all the words
-       found inside the string. If format of 2 is specified, then the function
-       will return an associated array where the position of the word is the key
-       and the word itself is the value.
-
-       For the purpose of this function, 'word' is defined as a locale dependent
-       string containing alphabetic characters, which also may contain, but not start
-       with "'" and "-" characters.
-*/
-PHP_FUNCTION(str_word_count)
+/* {{{ php_u_str_word_count: UTF-16 word scan. type 0 returns the word count; type 1 appends each word to return_value; type 2 stores each word under the code point offset where it starts (types 1 and 2 return 0 and expect return_value to be an initialized array). */
+static int php_u_str_word_count(UChar *str, int str_len, long type, UChar *char_list, int char_list_len, zval *return_value TSRMLS_DC)
 {
-       char *buf, *str, *char_list = NULL, *p, *e, *s, ch[256];
-       int str_len, char_list_len, word_count = 0;
-       long type = 0;
+       UChar *s, *buf; /* s: start of the current word; buf: copy of the word handed to return_value */
+       UChar32 ch; /* code point most recently decoded by U16_NEXT */
+       int ws, we, tmp, idx, last_idx; /* ws/we: word start/end and tmp: scan position, all in UChar units; idx: code points consumed so far; last_idx: idx when the current scan began */
+       int word_count = 0;
 
-       if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|ls", &str, &str_len, &type, &char_list, &char_list_len) == FAILURE) {
-               WRONG_PARAM_COUNT;
+       if (char_list) {
+               char_list = eustrndup(char_list, char_list_len); /* work on a private copy (passed by address below); released before returning */
+               php_expand_uchar_range(&char_list, &char_list_len TSRMLS_CC); /* expand 'a..b' ranges */
+       }
+
+       ws = idx = 0;
+       /* first character cannot be ' or -, unless explicitly allowed by the user; the str_len check keeps an empty string from being probed */
+       if (str_len > 0 && ((str[ws] == (UChar)0x27 /*'\''*/ && (!char_list || !u_memchr(char_list, 0x27 /*'\''*/, char_list_len))) ||
+               (str[ws] == (UChar)0x2d /*'-'*/  && (!char_list || !u_memchr(char_list, 0x2d /*'-'*/, char_list_len))))) {
+               ws++;
+               idx++;
+       }
+       /* last character cannot be -, unless explicitly allowed by the user; str_len > 0 avoids reading str[-1] for an empty string */
+       if (str_len > 0 && str[str_len-1] == (UChar)0x2d /*'-'*/ &&
+               (!char_list || !u_memchr(char_list, 0x2d /*'-'*/, char_list_len))) {
+               str_len--;
+       }
+
+       last_idx = idx;
+       tmp = we = ws;
+       while (we < str_len) {
+               s = str + ws;
+               while (we < str_len) {
+                       U16_NEXT(str, tmp, str_len, ch); /* decodes one code point and advances tmp past it */
+                       idx++;
+                       if (!(u_isalpha(ch) || (char_list && u_memchr32(char_list, ch, char_list_len)) ||
+                                 ch == (UChar32) 0x27 /*'\''*/ || ch == (UChar32) 0x2d /*'-'*/)) {
+                               break; /* ch is not a word character: the word (if any) ends at we */
+                       }
+                       we = tmp; /* ch belongs to the word: extend its end */
+               }
+               if (we > ws) {
+                       switch (type)
+                       {
+                               case 1: /* plain list of words */
+                                       buf = eustrndup(s, (we-ws));
+                                       add_next_index_unicodel(return_value, buf, (we-ws), 0);
+                                       break;
+                               case 2: /* words keyed by the code point offset at which they start */
+                                       buf = eustrndup(s, (we-ws));
+                                       add_index_unicodel(return_value, last_idx, buf, we-ws, 0);
+                                       break;
+                               default: /* count only */
+                                       word_count++;
+                                       break;
+                       }
+               }
+               ws = we = tmp; /* resume after the last code point consumed */
+               last_idx = idx;
+       }
+
+       if (char_list) {
+               efree(char_list);
         }
 
+       return word_count;
+}
+/* }}} */
+
+/* {{{ php_str_word_count */
+static int php_str_word_count(char *str, int str_len, long type, char *char_list, int char_list_len, zval *return_value)
+{
+       char ch[256], *p, *e, *s, *buf;
+       int word_count = 0;
+
        if (char_list) {
                 php_charmask((unsigned char*)char_list, char_list_len, ch TSRMLS_CC);
        }
@@ -6898,10 +6950,6 @@
        p = str;
        e = str + str_len;
 
-       if (type == 1 || type == 2) {
-               array_init(return_value);
-       }
-
         /* first character cannot be ' or -, unless explicitly allowed by the user */
         if ((*p == '\'' && (!char_list || !ch['\''])) || (*p == '-' && (!char_list || !ch['-']))) {
                p++;
@@ -6935,6 +6983,43 @@
                p++;
        }
 
+       return word_count;
+}
+/* }}} */
+
+/* {{{ proto mixed str_word_count(string str, [int format [, string charlist]]) U
+       Counts the number of words inside a string. If format of 1 is specified,
+       then the function will return an array containing all the words
+       found inside the string. If format of 2 is specified, then the function
+       will return an associated array where the position of the word is the key
+       and the word itself is the value.
+
+       For the purpose of this function, 'word' is defined as a locale dependent
+       string containing alphabetic characters, which also may contain, but not start
+       with "'" and "-" characters.
+*/
+PHP_FUNCTION(str_word_count)
+{
+       zstr str, char_list = NULL_ZSTR;
+       int str_len, char_list_len, word_count = 0;
+       zend_uchar str_type;
+       long type = 0;
+
+       if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "T|lT", &str, &str_len,
+                                                         &str_type, &type, &char_list, &char_list_len, &str_type) == FAILURE) {
+               return;
+       }
+
+       if (type == 1 || type == 2) {
+               array_init(return_value);
+       }
+
+       if (str_type == IS_UNICODE) {
+               word_count = php_u_str_word_count(str.u, str_len, type, char_list.u, char_list_len, return_value TSRMLS_CC);
+       } else {
+               word_count = php_str_word_count(str.s, str_len, type, char_list.s, char_list_len, return_value);
+       }
+
        if (!type) {
                RETURN_LONG(word_count);
        }
http://cvs.php.net/viewvc.cgi/php-src/unicode-progress.txt?r1=1.45&r2=1.46&diff_format=u
Index: php-src/unicode-progress.txt
diff -u php-src/unicode-progress.txt:1.45 php-src/unicode-progress.txt:1.46
--- php-src/unicode-progress.txt:1.45   Tue Aug 15 20:38:12 2006
+++ php-src/unicode-progress.txt        Fri Aug 18 18:01:58 2006
@@ -42,9 +42,6 @@
     sscanf()
         Params API. Rest - no idea yet.
 
-    str_word_count()
-        Params API, IS_UNICODE support, using u_isalpha(), etc.
-    
     stristr()
     stripos()
     strripos()
@@ -173,6 +170,7 @@
     str_rot13()
     str_shuffle()
     str_split()
+    str_word_count()
     strcoll()
     strcspn()
     strip_tags()

-- 
PHP CVS Mailing List (http://www.php.net/)
To unsubscribe, visit: http://www.php.net/unsub.php

Reply via email to