andrei          Mon Aug 28 20:36:50 2006 UTC

  Modified files:              
    /php-src/ext/pcre   php_pcre.c php_pcre.h 
    /php-src/ext/spl    spl_iterators.c 
  Log:
  Add Unicode support for preg_match[_all]
  
  
http://cvs.php.net/viewvc.cgi/php-src/ext/pcre/php_pcre.c?r1=1.191&r2=1.192&diff_format=u
Index: php-src/ext/pcre/php_pcre.c
diff -u php-src/ext/pcre/php_pcre.c:1.191 php-src/ext/pcre/php_pcre.c:1.192
--- php-src/ext/pcre/php_pcre.c:1.191   Thu Jul 20 22:40:44 2006
+++ php-src/ext/pcre/php_pcre.c Mon Aug 28 20:36:50 2006
@@ -16,7 +16,18 @@
    +----------------------------------------------------------------------+
  */
 
-/* $Id: php_pcre.c,v 1.191 2006/07/20 22:40:44 rrichards Exp $ */
+/* $Id: php_pcre.c,v 1.192 2006/08/28 20:36:50 andrei Exp $ */
+
+/* UTODO
+ *  - PCRE_NO_UTF8_CHECK option for Unicode strings
+ *
+ *  php_pcre_match_all():
+ *   - start_offset needs to count codepoints, probably via U8_FWD_1()
+ *   - need to return matched substrings in the type matching the arguments
+ *
+ *  php_pcre_split_impl():
+ *   - Avoid the /./ bump for Unicode strings with U8_FWD_1()
+ */
 
 #ifdef HAVE_CONFIG_H
 #include "config.h"
@@ -174,7 +185,7 @@
 
 /* {{{ pcre_get_compiled_regex_cache
  */
-PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(char *regex, int regex_len TSRMLS_DC)
+PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(char *regex, int regex_len, zend_bool from_unicode TSRMLS_DC)
 {
        pcre                            *re = NULL;
        pcre_extra                      *extra;
@@ -198,7 +209,6 @@
 
        /* Try to lookup the cached regex entry, and if successful, just pass
           back the compiled pattern, otherwise go on and compile it. */
-       regex_len = strlen(regex);
        if (zend_hash_find(&PCRE_G(pcre_cache), regex, regex_len+1, (void 
**)&pce) == SUCCESS) {
                /*
                 * We use a quick pcre_info() check to see whether cache is 
corrupted, and if it
@@ -208,13 +218,15 @@
                        zend_hash_clean(&PCRE_G(pcre_cache));
                } else {
 #if HAVE_SETLOCALE
-                       if (!strcmp(pce->locale, locale)) {
-#endif
+                       if (!strcmp(pce->locale, locale) && from_unicode == pce->from_unicode) {
+                               return pce;
+                       }
+#else
+                       if (from_unicode == pce->from_unicode) {
                                return pce;
-#if HAVE_SETLOCALE
                        }
-               }
 #endif
+               }
        }
        
        p = regex;
@@ -315,6 +327,12 @@
                }
        }
 
+       /* force UTF-8 mode for strings known to have been converted from Unicode
+          (UTF-16) */
+       if (from_unicode) {
+               coptions |= PCRE_UTF8;
+       }
+
 #if HAVE_SETLOCALE
        if (strcmp(locale, "C"))
                tables = pcre_maketables();
@@ -367,6 +385,7 @@
        new_entry.extra = extra;
        new_entry.preg_options = poptions;
        new_entry.compile_options = coptions;
+       new_entry.from_unicode = from_unicode;
 #if HAVE_SETLOCALE
        new_entry.locale = pestrdup(locale, 1);
        new_entry.tables = tables;
@@ -382,7 +401,7 @@
  */
 PHPAPI pcre* pcre_get_compiled_regex(char *regex, pcre_extra **extra, int 
*preg_options TSRMLS_DC)
 {
-       pcre_cache_entry * pce = pcre_get_compiled_regex_cache(regex, 
strlen(regex) TSRMLS_CC);
+       pcre_cache_entry * pce = pcre_get_compiled_regex_cache(regex, 
strlen(regex), 0 TSRMLS_CC);
 
        if (extra) {
                *extra = pce ? pce->extra : NULL;
@@ -399,7 +418,7 @@
  */
 PHPAPI pcre* pcre_get_compiled_regex_ex(char *regex, pcre_extra **extra, int 
*preg_options, int *compile_options TSRMLS_DC)
 {
-       pcre_cache_entry * pce = pcre_get_compiled_regex_cache(regex, 
strlen(regex) TSRMLS_CC);
+       pcre_cache_entry * pce = pcre_get_compiled_regex_cache(regex, 
strlen(regex), 0 TSRMLS_CC);
        
        if (extra) {
                *extra = pce ? pce->extra : NULL;
@@ -416,7 +435,7 @@
 /* }}} */
 
 /* {{{ add_offset_pair */
-static inline void add_offset_pair(zval *result, char *str, int len, int 
offset, char *name)
+static inline void add_offset_pair(zval *result, char *str, int len, int 
offset, char *name, zend_bool make_unicode TSRMLS_DC)
 {
        zval *match_pair;
 
@@ -425,45 +444,83 @@
        INIT_PZVAL(match_pair);
 
        /* Add (match, offset) to the return value */
-       add_next_index_stringl(match_pair, str, len, 1);
+       if (make_unicode) {
+               add_next_index_utf8_stringl(match_pair, str, len, 1);
+       } else {
+               add_next_index_stringl(match_pair, str, len, 1);
+       }
        add_next_index_long(match_pair, offset);
        
        if (name) {
                zval_add_ref(&match_pair);
-               zend_hash_update(Z_ARRVAL_P(result), name, strlen(name)+1, 
&match_pair, sizeof(zval *), NULL);
+               if (make_unicode) {
+                       UErrorCode status = U_ZERO_ERROR;
+                       UChar *u = NULL;
+                       int u_len;
+                       zend_string_to_unicode_ex(UG(utf8_conv), &u, &u_len, 
name, strlen(name), &status);
+                       zend_u_hash_update(Z_ARRVAL_P(result), IS_UNICODE, 
ZSTR(u), u_len+1, &match_pair, sizeof(zval *), NULL);
+                       efree(u);
+               } else {
+                       zend_hash_update(Z_ARRVAL_P(result), name, 
strlen(name)+1, &match_pair, sizeof(zval *), NULL);
+               }
        }
        zend_hash_next_index_insert(Z_ARRVAL_P(result), &match_pair, 
sizeof(zval *), NULL);
 }
 /* }}} */
 
-static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, int global) /* {{{ 
*/
+/* {{{ php_do_pcre_match */
+static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, int global) 
 {
        /* parameters */
-       char                     *regex;                        /* Regular 
expression */
-       char                     *subject;                      /* String to 
match against */
+       zstr                      regex;                        /* Regular 
expression */
+       zstr                      subject;                      /* String to 
match against */
        int                               regex_len;
        int                               subject_len;
        pcre_cache_entry *pce;                          /* Compiled regular 
expression */
        zval                     *subpats = NULL;       /* Array for 
subpatterns */
        long                      flags = 0;            /* Match control flags 
*/
        long                      start_offset = 0;     /* Where the new search 
starts */
+       zend_uchar                str_type;
+       char                     *regex_utf8 = NULL, *subject_utf8 = NULL;
+       int                       regex_utf8_len, subject_utf8_len;
+       UErrorCode                status = U_ZERO_ERROR;
 
-       if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, ((global) ? 
"ssz|ll" : "ss|zll"), &regex, &regex_len,
-                                                         &subject, 
&subject_len, &subpats, &flags, &start_offset) == FAILURE) {
+       if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, ((global) ? 
"TTz|ll" : "TT|zll"), &regex, &regex_len, &str_type,
+                                                         &subject, 
&subject_len, &str_type, &subpats, &flags, &start_offset) == FAILURE) {
                RETURN_FALSE;
        }
+
+       if (str_type == IS_UNICODE) {
+               zend_unicode_to_string_ex(UG(utf8_conv), &regex_utf8, 
&regex_utf8_len, regex.u, regex_len, &status);
+               zend_unicode_to_string_ex(UG(utf8_conv), &subject_utf8, 
&subject_utf8_len, subject.u, subject_len, &status);
+               regex.s = regex_utf8;
+               regex_len = regex_utf8_len;
+               subject.s = subject_utf8;
+               subject_len = subject_utf8_len;
+       }
        
        /* Compile regex or get it from cache. */
-       if ((pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC)) 
== NULL) {
+       if ((pce = pcre_get_compiled_regex_cache(regex.s, regex_len, (str_type 
== IS_UNICODE) TSRMLS_CC)) == NULL) {
+               if (str_type == IS_UNICODE) {
+                       efree(regex_utf8);
+                       efree(subject_utf8);
+               }
                RETURN_FALSE;
        }
 
-       php_pcre_match_impl(pce, subject, subject_len, return_value, subpats, 
-               global, ZEND_NUM_ARGS() >= 4, flags, start_offset TSRMLS_CC);
+       php_pcre_match_impl(pce, subject.s, subject_len, return_value, subpats, 
+               global, ZEND_NUM_ARGS() >= 4, flags, start_offset, (str_type == 
IS_UNICODE) TSRMLS_CC);
+
+       if (str_type == IS_UNICODE) {
+               efree(regex_utf8);
+               efree(subject_utf8);
+       }
 }
+/* }}} */
 
+/* {{{ php_pcre_match_impl */
 PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, char *subject, int 
subject_len, zval *return_value,
-       zval *subpats, int global, int use_flags, long flags, long start_offset 
TSRMLS_DC)
+       zval *subpats, int global, int use_flags, long flags, long 
start_offset, zend_bool is_utf8 TSRMLS_DC)
 {
        zval                    *result_set,            /* Holds a set of 
subpatterns after
                                                                                
   a global match */
@@ -512,11 +569,23 @@
                offset_capture = 0;
        }
 
-       /* Negative offset counts from the end of the string. */
-       if (start_offset < 0) {
-               start_offset = subject_len + start_offset;
+       if (is_utf8) {
+               int k = 0;
+               /* Calculate byte offset from codepoint offset */
                if (start_offset < 0) {
-                       start_offset = 0;
+                       k = subject_len;
+                       U8_BACK_N(subject, 0, k, -start_offset);
+               } else {
+                       U8_FWD_N(subject, k, subject_len, start_offset);
+               }
+               start_offset = k;
+       } else {
+               /* Negative offset counts from the end of the string. */
+               if (start_offset < 0) {
+                       start_offset = subject_len + start_offset;
+                       if (start_offset < 0) {
+                               start_offset = 0;
+                       }
                }
        }
 
@@ -630,10 +699,15 @@
                                                for (i = 0; i < count; i++) {
                                                        if (offset_capture) {
                                                                
add_offset_pair(match_sets[i], (char *)stringlist[i],
-                                                                               
                offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1], NULL);
+                                                                               
                offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1], NULL, is_utf8 
TSRMLS_CC);
                                                        } else {
-                                                               
add_next_index_stringl(match_sets[i], (char *)stringlist[i],
-                                                                               
                           offsets[(i<<1)+1] - offsets[i<<1], 1);
+                                                               if (is_utf8) {
+                                                                       
add_next_index_utf8_stringl(match_sets[i], (char *)stringlist[i],
+                                                                               
                                                offsets[(i<<1)+1] - 
offsets[i<<1], 1);
+                                                               } else {
+                                                                       
add_next_index_stringl(match_sets[i], (char *)stringlist[i],
+                                                                               
                                   offsets[(i<<1)+1] - offsets[i<<1], 1);
+                                                               }
                                                        }
                                                }
                                                /*
@@ -642,8 +716,14 @@
                                                 * arrays with empty strings.
                                                 */
                                                if (count < num_subpats) {
-                                                       for (; i < num_subpats; 
i++) {
-                                                               
add_next_index_string(match_sets[i], "", 1);
+                                                       if (is_utf8) {
+                                                               for (; i < 
num_subpats; i++) {
+                                                                       
add_next_index_unicode(match_sets[i], EMPTY_STR, 1);
+                                                               }
+                                                       } else {
+                                                               for (; i < 
num_subpats; i++) {
+                                                                       
add_next_index_string(match_sets[i], "", 1);
+                                                               }
                                                        }
                                                }
                                        } else {
@@ -656,14 +736,25 @@
                                                for (i = 0; i < count; i++) {
                                                        if (offset_capture) {
                                                                
add_offset_pair(result_set, (char *)stringlist[i],
-                                                                               
                offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1], 
subpat_names[i]);
+                                                                               
                offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1],
+                                                                               
                subpat_names[i], is_utf8 TSRMLS_CC);
                                                        } else {
                                                                if 
(subpat_names[i]) {
-                                                                       
add_assoc_stringl(result_set, subpat_names[i], (char *)stringlist[i],
+                                                                       if 
(is_utf8) {
+                                                                               
add_assoc_utf8_stringl(result_set, subpat_names[i], (char *)stringlist[i],
+                                                                               
                                           offsets[(i<<1)+1] - offsets[i<<1], 
1);
+                                                                       } else {
+                                                                               
add_assoc_stringl(result_set, subpat_names[i], (char *)stringlist[i],
+                                                                               
                                  offsets[(i<<1)+1] - offsets[i<<1], 1);
+                                                                       }
+                                                               }
+                                                               if (is_utf8) {
+                                                                       
add_next_index_utf8_stringl(result_set, (char *)stringlist[i],
+                                                                               
                                                offsets[(i<<1)+1] - 
offsets[i<<1], 1);
+                                                               } else {
+                                                                       
add_next_index_stringl(result_set, (char *)stringlist[i],
                                                                                
                                   offsets[(i<<1)+1] - offsets[i<<1], 1);
                                                                }
-                                                               
add_next_index_stringl(result_set, (char *)stringlist[i],
-                                                                               
                           offsets[(i<<1)+1] - offsets[i<<1], 1);
                                                        }
                                                }
                                                /* And add it to the output 
array */
@@ -675,14 +766,24 @@
                                                if (offset_capture) {
                                                        
add_offset_pair(subpats, (char *)stringlist[i],
                                                                                
        offsets[(i<<1)+1] - offsets[i<<1],
-                                                                               
        offsets[i<<1], subpat_names[i]);
+                                                                               
        offsets[i<<1], subpat_names[i], is_utf8 TSRMLS_CC);
                                                } else {
                                                        if (subpat_names[i]) {
-                                                               
add_assoc_stringl(subpats, subpat_names[i], (char *)stringlist[i],
-                                                                               
                  offsets[(i<<1)+1] - offsets[i<<1], 1);
+                                                               if (is_utf8) {
+                                                                       
add_assoc_utf8_stringl(subpats, subpat_names[i], (char *)stringlist[i],
+                                                                               
                                   offsets[(i<<1)+1] - offsets[i<<1], 1);
+                                                               } else {
+                                                                       
add_assoc_stringl(subpats, subpat_names[i], (char *)stringlist[i],
+                                                                               
                          offsets[(i<<1)+1] - offsets[i<<1], 1);
+                                                               }
+                                                       }
+                                                       if (is_utf8) {
+                                                               
add_next_index_utf8_stringl(subpats, (char *)stringlist[i],
+                                                                               
                           offsets[(i<<1)+1] - offsets[i<<1], 1);
+                                                       } else {
+                                                               
add_next_index_stringl(subpats, (char *)stringlist[i],
+                                                                               
                           offsets[(i<<1)+1] - offsets[i<<1], 1);
                                                        }
-                                                       
add_next_index_stringl(subpats, (char *)stringlist[i],
-                                                                               
                   offsets[(i<<1)+1] - offsets[i<<1], 1);
                                                }
                                        }
                                }
@@ -696,7 +797,12 @@
                           to achieve this, unless we're already at the end of 
the string. */
                        if (g_notempty != 0 && start_offset < subject_len) {
                                offsets[0] = start_offset;
-                               offsets[1] = start_offset + 1;
+                               if (is_utf8) {
+                                       offsets[1] = start_offset;
+                                       U8_FWD_1(subject, offsets[1], 
subject_len);
+                               } else {
+                                       offsets[1] = start_offset + 1;
+                               }
                        } else
                                break;
                } else {
@@ -921,7 +1027,7 @@
        pcre_cache_entry        *pce;                       /* Compiled regular 
expression */
 
        /* Compile regex or get it from cache. */
-       if ((pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC)) 
== NULL) {
+       if ((pce = pcre_get_compiled_regex_cache(regex, regex_len, 0 
TSRMLS_CC)) == NULL) {
                return NULL;
        }
 
@@ -1368,7 +1474,7 @@
        }
        
        /* Compile regex or get it from cache. */
-       if ((pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC)) 
== NULL) {
+       if ((pce = pcre_get_compiled_regex_cache(regex, regex_len, 0 
TSRMLS_CC)) == NULL) {
                RETURN_FALSE;
        }
 
@@ -1452,7 +1558,7 @@
 
                                if (offset_capture) {
                                        /* Add (match, offset) pair to the 
return value */
-                                       add_offset_pair(return_value, 
last_match, &subject[offsets[0]]-last_match, next_offset, NULL);
+                                       add_offset_pair(return_value, 
last_match, &subject[offsets[0]]-last_match, next_offset, NULL, 0 TSRMLS_CC);
                                } else {
                                        /* Add the piece to the return value */
                                        add_next_index_stringl(return_value, 
last_match,
@@ -1474,7 +1580,7 @@
                                        /* If we have matched a delimiter */
                                        if (!no_empty || match_len > 0) {
                                                if (offset_capture) {
-                                                       
add_offset_pair(return_value, &subject[offsets[i<<1]], match_len, 
offsets[i<<1], NULL);
+                                                       
add_offset_pair(return_value, &subject[offsets[i<<1]], match_len, 
offsets[i<<1], NULL, 0 TSRMLS_CC);
                                                } else {
                                                        
add_next_index_stringl(return_value,
                                                                                
                   &subject[offsets[i<<1]],
@@ -1531,7 +1637,7 @@
        {
                if (offset_capture) {
                        /* Add the last (match, offset) pair to the return 
value */
-                       add_offset_pair(return_value, &subject[start_offset], 
subject_len - start_offset, start_offset, NULL);
+                       add_offset_pair(return_value, &subject[start_offset], 
subject_len - start_offset, start_offset, NULL, 0 TSRMLS_CC);
                } else {
                        /* Add the last piece to the return value */
                        add_next_index_stringl(return_value, last_match, 
subject + subject_len - last_match, 1);
@@ -1647,7 +1753,7 @@
        }
        
        /* Compile regex or get it from cache. */
-       if ((pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC)) 
== NULL) {
+       if ((pce = pcre_get_compiled_regex_cache(regex, regex_len, 0 
TSRMLS_CC)) == NULL) {
                RETURN_FALSE;
        }
        
http://cvs.php.net/viewvc.cgi/php-src/ext/pcre/php_pcre.h?r1=1.49&r2=1.50&diff_format=u
Index: php-src/ext/pcre/php_pcre.h
diff -u php-src/ext/pcre/php_pcre.h:1.49 php-src/ext/pcre/php_pcre.h:1.50
--- php-src/ext/pcre/php_pcre.h:1.49    Thu Jul 20 21:19:05 2006
+++ php-src/ext/pcre/php_pcre.h Mon Aug 28 20:36:50 2006
@@ -16,7 +16,7 @@
    +----------------------------------------------------------------------+
  */
  
-/* $Id: php_pcre.h,v 1.49 2006/07/20 21:19:05 helly Exp $ */
+/* $Id: php_pcre.h,v 1.50 2006/08/28 20:36:50 andrei Exp $ */
 
 #ifndef PHP_PCRE_H
 #define PHP_PCRE_H
@@ -58,12 +58,13 @@
 #endif
        int compile_options;
        int refcount;
+       zend_bool from_unicode;
 } pcre_cache_entry;
 
-PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(char *regex, int 
regex_len TSRMLS_DC);
+PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(char *regex, int 
regex_len, zend_bool regex_is_utf8 TSRMLS_DC);
 
 PHPAPI void  php_pcre_match_impl(  pcre_cache_entry *pce, char *subject, int 
subject_len, zval *return_value,
-       zval *subpats, int global, int use_flags, long flags, long start_offset 
TSRMLS_DC);
+       zval *subpats, int global, int use_flags, long flags, long 
start_offset, zend_bool is_utf8  TSRMLS_DC);
 
 PHPAPI char *php_pcre_replace_impl(pcre_cache_entry *pce, char *subject, int 
subject_len, zval *return_value, 
        int is_callable_replace, int *result_len, int limit, int *replace_count 
TSRMLS_DC);
http://cvs.php.net/viewvc.cgi/php-src/ext/spl/spl_iterators.c?r1=1.147&r2=1.148&diff_format=u
Index: php-src/ext/spl/spl_iterators.c
diff -u php-src/ext/spl/spl_iterators.c:1.147 php-src/ext/spl/spl_iterators.c:1.148
--- php-src/ext/spl/spl_iterators.c:1.147       Fri Jul 21 21:09:49 2006
+++ php-src/ext/spl/spl_iterators.c     Mon Aug 28 20:36:50 2006
@@ -16,7 +16,7 @@
    +----------------------------------------------------------------------+
  */
 
-/* $Id: spl_iterators.c,v 1.147 2006/07/21 21:09:49 helly Exp $ */
+/* $Id: spl_iterators.c,v 1.148 2006/08/28 20:36:50 andrei Exp $ */
 
 #ifdef HAVE_CONFIG_H
 # include "config.h"
@@ -1444,7 +1444,7 @@
                zval_ptr_dtor(&intern->current.data);
                ALLOC_INIT_ZVAL(intern->current.data);
                php_pcre_match_impl(intern->u.regex.pce, subject, subject_len, 
&zcount, 
-                       intern->current.data, intern->u.regex.mode == 
REGIT_MODE_ALL_MATCHES, intern->u.regex.use_flags, intern->u.regex.preg_flags, 
0 TSRMLS_CC);
+                       intern->current.data, intern->u.regex.mode == 
REGIT_MODE_ALL_MATCHES, intern->u.regex.use_flags, intern->u.regex.preg_flags, 
0, 0 TSRMLS_CC);
                count = 
zend_hash_num_elements(Z_ARRVAL_P(intern->current.data));
                RETVAL_BOOL(count > 0);
                break;

-- 
PHP CVS Mailing List (http://www.php.net/)
To unsubscribe, visit: http://www.php.net/unsub.php

Reply via email to