rolland Fri Sep 30 02:20:49 2005 EDT Modified files: /php-src/ext/standard levenshtein.c Log: - Unicode impl of levenshtein() http://cvs.php.net/diff.php/php-src/ext/standard/levenshtein.c?r1=1.34&r2=1.35&ty=u Index: php-src/ext/standard/levenshtein.c diff -u php-src/ext/standard/levenshtein.c:1.34 php-src/ext/standard/levenshtein.c:1.35 --- php-src/ext/standard/levenshtein.c:1.34 Wed Aug 3 10:08:08 2005 +++ php-src/ext/standard/levenshtein.c Fri Sep 30 02:20:47 2005 @@ -15,7 +15,7 @@ | Author: Hartmut Holzgraefe <[EMAIL PROTECTED]> | +----------------------------------------------------------------------+ */ -/* $Id: levenshtein.c,v 1.34 2005/08/03 14:08:08 sniper Exp $ */ +/* $Id: levenshtein.c,v 1.35 2005/09/30 06:20:47 rolland Exp $ */ #include "php.h" #include <stdlib.h> @@ -27,39 +27,58 @@ /* {{{ reference_levdist * reference implementation, only optimized for memory usage, not speed */ -static int reference_levdist(const char *s1, int l1, - const char *s2, int l2, - int cost_ins, int cost_rep, int cost_del ) +static int reference_levdist(void *s1, int32_t l1, void *s2, int32_t l2, zend_uchar str_type, int cost_ins, int cost_rep, int cost_del ) { int *p1, *p2, *tmp; - int i1, i2, c0, c1, c2; - - if(l1==0) return l2*cost_ins; - if(l2==0) return l1*cost_del; + int32_t i1, i2, j1, j2, cp1, cp2; + int32_t c0, c1, c2; + UChar32 ch1, ch2; + + if (str_type == IS_UNICODE) { + cp1 = u_countChar32((UChar *)s1, l1); + cp2 = u_countChar32((UChar *)s2, l2); + + if (cp1 == 0) return cp2*cost_ins; + if (cp2 == 0) return cp1*cost_del; + if ((cp1>LEVENSHTEIN_MAX_LENTH)||(cp2>LEVENSHTEIN_MAX_LENTH)) { + return -1; + } - if((l1>LEVENSHTEIN_MAX_LENTH)||(l2>LEVENSHTEIN_MAX_LENTH)) - return -1; + p1 = safe_emalloc((cp2+1), sizeof(int), 0); + p2 = safe_emalloc((cp2+1), sizeof(int), 0); + } else { + if (l1 == 0) return l2*cost_ins; + if (l2 == 0) return l1*cost_del; + if ((l1>LEVENSHTEIN_MAX_LENTH)||(l2>LEVENSHTEIN_MAX_LENTH)) { + return -1; + } - p1 = safe_emalloc((l2+1), sizeof(int), 0); - p2 = safe_emalloc((l2+1), sizeof(int), 0); + p1 = safe_emalloc((l2+1), sizeof(int), 0); + p2 = safe_emalloc((l2+1), sizeof(int), 0); + } - for(i2=0;i2<=l2;i2++) + for (i2 = 0 ; i2 <= l2 ; i2++) p1[i2] = i2*cost_ins; - for(i1=0;i1<l1;i1++) - { - p2[0]=p1[0]+cost_del; - for(i2=0;i2<l2;i2++) - { - c0=p1[i2]+((s1[i1]==s2[i2])?0:cost_rep); - c1=p1[i2+1]+cost_del; if(c1<c0) c0=c1; - c2=p2[i2]+cost_ins; if(c2<c0) c0=c2; - p2[i2+1]=c0; - } - tmp=p1; p1=p2; p2=tmp; + for (i1 = 0, j1 = 0 ; i1 < l1 ; i1++) { + p2[0] = p1[0] + cost_del; + if (str_type == IS_UNICODE) { + U16_NEXT((UChar *)s1, j1, l1, ch1); } - - c0=p1[l2]; + for (i2 = 0, j2 = 0 ; i2 < l2 ; i2++) { + if (str_type == IS_UNICODE) { + U16_NEXT((UChar *)s2, j2, l2, ch2); + c0 = p1[i2] + ((ch1==ch2) ? 0 : cost_rep); + } else { + c0 = p1[i2] + ((*((char *)s1+i1)==*((char *)s2+i2)) ? 0 : cost_rep); + } + c1 = p1[i2+1] + cost_del; if (c1 < c0) c0 = c1; + c2 = p2[i2] + cost_ins; if (c2 < c0) c0 = c2; + p2[i2+1] = c0; + } + tmp=p1; p1=p2; p2=tmp; + } + c0 = p1[l2]; efree(p1); efree(p2); @@ -70,7 +89,7 @@ /* {{{ custom_levdist */ -static int custom_levdist(char *str1, char *str2, char *callback_name TSRMLS_DC) +static int custom_levdist(void *str1, void *str2, char *callback_name TSRMLS_DC) { php_error_docref(NULL TSRMLS_CC, E_WARNING, "The general Levenshtein support is not there yet"); /* not there yet */ @@ -83,56 +102,51 @@ Calculate Levenshtein distance between two strings */ PHP_FUNCTION(levenshtein) { - zval **str1, **str2, **cost_ins, **cost_rep, **cost_del, **callback_name; - int distance=-1; + int argc = ZEND_NUM_ARGS(); + void *str1, *str2; + int32_t str1_len, str2_len; + zend_uchar str1_type, str2_type; + int cost_ins, cost_rep, cost_del; + char *callback_name; + int distance = -1; - switch(ZEND_NUM_ARGS()) { + switch (argc) { case 2: /* just two string: use maximum performance version */ - if (zend_get_parameters_ex(2, &str1, &str2) == FAILURE) { - WRONG_PARAM_COUNT; + if (zend_parse_parameters(2 TSRMLS_CC, "TT", + &str1, &str1_len, &str1_type, + &str2, &str2_len, &str2_type) == FAILURE) { + return; } - convert_to_string_ex(str1); - convert_to_string_ex(str2); - - distance = reference_levdist(Z_STRVAL_PP(str1), Z_STRLEN_PP(str1), - Z_STRVAL_PP(str2), Z_STRLEN_PP(str2), 1, 1, 1); + distance = reference_levdist(str1, str1_len, str2, str2_len, str1_type, 1, 1, 1); break; case 5: /* more general version: calc cost by ins/rep/del weights */ - if (zend_get_parameters_ex(5, &str1, &str2, &cost_ins, &cost_rep, &cost_del) == FAILURE) { - WRONG_PARAM_COUNT; + if (zend_parse_parameters(5 TSRMLS_CC, "TTlll", + &str1, &str1_len, &str1_type, + &str2, &str2_len, &str2_type, + &cost_ins, &cost_rep, &cost_del) == FAILURE) { + return; } - convert_to_string_ex(str1); - convert_to_string_ex(str2); - convert_to_long_ex(cost_ins); - convert_to_long_ex(cost_rep); - convert_to_long_ex(cost_del); - - distance = reference_levdist(Z_STRVAL_PP(str1), Z_STRLEN_PP(str1), - Z_STRVAL_PP(str2), Z_STRLEN_PP(str2), - Z_LVAL_PP(cost_ins), Z_LVAL_PP(cost_rep), - Z_LVAL_PP(cost_del)); - + + distance = reference_levdist(str1, str1_len, str2, str2_len, str1_type, cost_ins, cost_rep, cost_del); break; case 3: /* most general version: calc cost by user-supplied function */ - if (zend_get_parameters_ex(3, &str1, &str2, &callback_name) == FAILURE) { - WRONG_PARAM_COUNT; + if (zend_parse_parameters(3 TSRMLS_CC, "TTs", + &str1, &str1_len, &str1_type, + &str2, &str2_len, &str2_type, + &callback_name) == FAILURE) { + return; } - convert_to_string_ex(str1); - convert_to_string_ex(str2); - convert_to_string_ex(callback_name); - - distance = custom_levdist(Z_STRVAL_PP(str1), Z_STRVAL_PP(str2), - Z_STRVAL_PP(callback_name) TSRMLS_CC); + distance = custom_levdist(str1, str2, callback_name TSRMLS_CC); break; default: WRONG_PARAM_COUNT; } - if(distance<0) { + if (distance < 0) { php_error_docref(NULL TSRMLS_CC, E_WARNING, "Argument string(s) too long"); }
-- PHP CVS Mailing List (http://www.php.net/) To unsubscribe, visit: http://www.php.net/unsub.php