Commit:    ccf15cf2dc92d11f92ee30c97e2d86b07f81e030
Author:    Gustavo Lopes <glo...@nebm.ist.utl.pt>         Mon, 7 Jan 2013 
03:13:11 +0100
Committer: Gustavo Lopes <gust...@icemobile.com>      Mon, 14 Jan 2013 12:22:41 
+0100
Parents:   1a96fe0b3260b4b63627cf69d71a5b350ad3163f
Branches:  PHP-5.4 PHP-5.5 master

Link:       
http://git.php.net/?p=php-src.git;a=commitdiff;h=ccf15cf2dc92d11f92ee30c97e2d86b07f81e030

Log:
Optimize strtr w/ 2nd arg array

Fixes bug #63893: poor efficiency of strtr() using array with keys of
very different length.

The implementation is basically all new, which carries some risk with
it.

The algorithm is described in "A Fast Algorithm For Multi-Pattern
Searching" (1994) by Sun Wu and Udi Manber.

Bugs:
https://bugs.php.net/63893

Changed paths:
  M  ext/standard/string.c

diff --git a/ext/standard/string.c b/ext/standard/string.c
index 29115fe..dc92e8e 100644
--- a/ext/standard/string.c
+++ b/ext/standard/string.c
@@ -22,7 +22,9 @@
 
 /* Synced with php 3.0 revision 1.193 1999-06-16 [ssb] */
 
+#define _GNU_SOURCE 1
 #include <stdio.h>
+#include <stdint.h>
 #include "php.h"
 #include "php_rand.h"
 #include "php_string.h"
@@ -57,6 +59,7 @@
 #include "php_globals.h"
 #include "basic_functions.h"
 #include "php_smart_str.h"
+#include <Zend/zend_exceptions.h>
 #ifdef ZTS
 #include "TSRM.h"
 #endif
@@ -2772,112 +2775,288 @@ PHPAPI char *php_strtr(char *str, int len, char 
*str_from, char *str_to, int trl
 }
 /* }}} */
 
-/* {{{ php_strtr_array
- */
-static void php_strtr_array(zval *return_value, char *str, int slen, HashTable 
*hash)
+/* {{{ Definitions for php_strtr_array */
+typedef size_t STRLEN; /* STRLEN should be unsigned */
+typedef uint16_t HASH;
+typedef struct {
+       HASH                    table_mask;
+       STRLEN                  entries[1];
+} SHIFT_TAB;
+typedef struct {
+       HASH                    table_mask;
+       int                             entries[1];
+} HASH_TAB;
+typedef struct {
+       const char      *s;
+       STRLEN          l;
+} STR;
+typedef struct _match_node MATCH_NODE;
+struct _match_node {
+       STRLEN          pos;
+       MATCH_NODE      *next;
+};
+typedef struct _pat_and_repl {
+       STR                     pat;
+       STR                     repl;
+} PATNREPL;
+
+#define S(a) ((a)->s)
+#define L(a) ((a)->l)
+
+#define SHIFT_TAB_BITS 13
+#define HASH_TAB_BITS  10 /* should be less than sizeof(HASH) * 8 */
+#define SHIFT_TAB_SIZE (1U << SHIFT_TAB_BITS)
+#define HASH_TAB_SIZE  (1U << HASH_TAB_BITS)
+
+typedef struct {
+       int                             B;                      /* size of 
suffixes */
+       int                             Bp;                     /* size of 
prefixes */
+       STRLEN                  m;                      /* minimum pattern 
length */
+       int                             patnum;         /* number of patterns */
+       SHIFT_TAB               *shift;         /* table mapping hash to 
allowed shift */
+       HASH_TAB                *hash;          /* table mapping hash to int 
(pair of pointers) */
+       HASH                    *prefix;        /* array of hashes of prefixes 
by pattern suffix hash order */
+       PATNREPL                *patterns;      /* array of prefixes by pattern 
suffix hash order */
+} PPRES;
+/* }}} */
+
+/* {{{ php_strtr_hash */
+static inline HASH php_strtr_hash(const char *str, int len)
 {
-       zval **entry;
-       char  *string_key;
-       uint   string_key_len;
-       zval **trans;
-       zval   ctmp;
-       ulong num_key;
-       int minlen = 128*1024;
-       int maxlen = 0, pos, len, found;
-       char *key;
-       HashPosition hpos;
-       smart_str result = {0};
-       HashTable tmp_hash;
-
-       zend_hash_init(&tmp_hash, zend_hash_num_elements(hash), NULL, NULL, 0);
-       zend_hash_internal_pointer_reset_ex(hash, &hpos);
-       while (zend_hash_get_current_data_ex(hash, (void **)&entry, &hpos) == 
SUCCESS) {
-               switch (zend_hash_get_current_key_ex(hash, &string_key, 
&string_key_len, &num_key, 0, &hpos)) {
-                       case HASH_KEY_IS_STRING:
-                               len = string_key_len-1;
-                               if (len < 1) {
-                                       zend_hash_destroy(&tmp_hash);
-                                       RETURN_FALSE;
-                               }
-                               zend_hash_add(&tmp_hash, string_key, 
string_key_len, entry, sizeof(zval*), NULL);
-                               if (len > maxlen) {
-                                       maxlen = len;
-                               }
-                               if (len < minlen) {
-                                       minlen = len;
-                               }
-                               break;
+       HASH    res = 0;
+       int             i;
+       for (i = 0; i < len; i++) {
+               res = (res << 5) + res + (unsigned char)str[i];
+       }
 
-                       case HASH_KEY_IS_LONG:
-                               Z_TYPE(ctmp) = IS_LONG;
-                               Z_LVAL(ctmp) = num_key;
+       return res;
+}
+/* }}} */
+/* {{{ php_strtr_populate_shift */
+static inline void php_strtr_populate_shift(PATNREPL *patterns, int patnum, 
int B, STRLEN m, SHIFT_TAB *shift)
+{
+       int             i;
+       STRLEN  j,
+                       max_shift;
+
+       max_shift = m - B + 1;
+       for (i = 0; i < SHIFT_TAB_SIZE; i++) {
+               shift->entries[i] = max_shift;
+       }
+       for (i = 0; i < patnum; i++) {
+               for (j = 0; j < m - B + 1; j++) {
+                       HASH h = php_strtr_hash(&S(&patterns[i].pat)[j], B) & 
shift->table_mask;
+                       assert((long long) m - (long long) j - B >= 0); 
+                       shift->entries[h] = MIN(shift->entries[h], m - j - B);
+               }
+       }
+}
+/* }}} */
+/* {{{ php_strtr_compare_hash_suffix */
+static int php_strtr_compare_hash_suffix(const void *a, const void *b, void 
*ctx_g)
+{
+       const PPRES             *res = ctx_g;
+       const PATNREPL  *pnr_a = a,
+                                       *pnr_b = b;
+       HASH                    hash_a = php_strtr_hash(&S(&pnr_a->pat)[res->m 
- res->B], res->B)
+                                                               & 
res->hash->table_mask,
+                                       hash_b = 
php_strtr_hash(&S(&pnr_b->pat)[res->m - res->B], res->B)
+                                                               & 
res->hash->table_mask;
+       /* TODO: don't recalculate the hashes all the time */
+       return hash_a - hash_b;
+}
+/* }}} */
 
-                               convert_to_string(&ctmp);
-                               len = Z_STRLEN(ctmp);
-                               zend_hash_add(&tmp_hash, Z_STRVAL(ctmp), len+1, 
entry, sizeof(zval*), NULL);
-                               zval_dtor(&ctmp);
+/* {{{ PPRES *php_strtr_array_prepare(STR *text, PATNREPL *patterns, int 
patnum, int B, int Bp) */
+static PPRES *php_strtr_array_prepare(STR *text, PATNREPL *patterns, int 
patnum, int B, int Bp)
+{
+       int             i;
+       PPRES   *res = emalloc(sizeof *res);
 
-                               if (len > maxlen) {
-                                       maxlen = len;
-                               }
-                               if (len < minlen) {
-                                       minlen = len;
-                               }
-                               break;
+       res->m = (STRLEN)-1;
+       for (i = 0; i < patnum; i++) {
+               if (L(&patterns[i].pat) < res->m) {
+                       res->m = L(&patterns[i].pat);
                }
-               zend_hash_move_forward_ex(hash, &hpos);
        }
+       assert(res->m > 0);
+       res->B  = B             = MIN(B, res->m);
+       res->Bp = Bp    = MIN(Bp, res->m);
 
-       key = emalloc(maxlen+1);
-       pos = 0;
+       res->shift = safe_emalloc(SHIFT_TAB_SIZE, sizeof(*res->shift->entries), 
sizeof(*res->shift));
+       res->shift->table_mask = SHIFT_TAB_SIZE - 1;
+       php_strtr_populate_shift(patterns, patnum, B, res->m, res->shift);
 
-       while (pos < slen) {
-               if ((pos + maxlen) > slen) {
-                       maxlen = slen - pos;
-               }
+       res->hash = safe_emalloc(HASH_TAB_SIZE, sizeof(*res->hash->entries), 
sizeof(*res->shift));
+       res->hash->table_mask = HASH_TAB_SIZE - 1;      
+
+       res->patterns = safe_emalloc(patnum, sizeof(*res->patterns), 0);
+       memcpy(res->patterns, patterns, sizeof(*patterns) * patnum);
+       qsort_r(res->patterns, patnum, sizeof(*res->patterns), 
php_strtr_compare_hash_suffix, res);
+
+       res->prefix = safe_emalloc(patnum, sizeof(*res->prefix), 0);
+       for (i = 0; i < patnum; i++) {
+               res->prefix[i] = php_strtr_hash(S(&res->patterns[i].pat), Bp);
+       }
 
-               found = 0;
-               memcpy(key, str+pos, maxlen);
+       /* Initialize the rest of ->hash */
+       for (i = 0; i < HASH_TAB_SIZE; i++) {
+               res->hash->entries[i] = -1;
+       }
+       {
+               HASH last_h = -1; /* assumes not all bits are used in res->hash 
*/
+               /* res->patterns is already ordered by hash.
+                * Make res->hash->entries[h] de index of the first pattern in
+                * res->patterns that has hash h */
+               for (i = 0; i < patnum; i++) {
+                       HASH h = 
php_strtr_hash(&S(&res->patterns[i].pat)[res->m - res->B], res->B)
+                                               & res->hash->table_mask;
+                       if (h != last_h) {
+                               res->hash->entries[h] = i;
+                               last_h = h;
+                       }
+               }
+       }
+       res->hash->entries[HASH_TAB_SIZE] = patnum;
+       for (i = HASH_TAB_SIZE - 1; i >= 0; i--) {
+               if (res->hash->entries[i] == -1) {
+                       res->hash->entries[i] = res->hash->entries[i + 1];
+               }
+       }
 
-               for (len = maxlen; len >= minlen; len--) {
-                       key[len] = 0;
+       res->patnum     = patnum;
 
-                       if (zend_hash_find(&tmp_hash, key, len+1, 
(void**)&trans) == SUCCESS) {
-                               char *tval;
-                               int tlen;
-                               zval tmp;
+       return res;
+}
+/* }}} */
+/* {{{ php_strtr_array_destroy_ppres(PPRES *d) */
+static void php_strtr_array_destroy_ppres(PPRES *d)
+{
+       efree(d->shift);
+       efree(d->hash);
+       efree(d->prefix);
+       efree(d->patterns);
+       efree(d);
+}
+/* }}} */
 
-                               if (Z_TYPE_PP(trans) != IS_STRING) {
-                                       tmp = **trans;
-                                       zval_copy_ctor(&tmp);
-                                       convert_to_string(&tmp);
-                                       tval = Z_STRVAL(tmp);
-                                       tlen = Z_STRLEN(tmp);
-                               } else {
-                                       tval = Z_STRVAL_PP(trans);
-                                       tlen = Z_STRLEN_PP(trans);
-                               }
+/* {{{ php_strtr_array_do_repl(STR *text, PPRES *d, zval *return_value) */
+static void php_strtr_array_do_repl(STR *text, PPRES *d, zval *return_value)
+{
+       STRLEN          pos = 0,
+                               lastpos = L(text) - d->m;
+       smart_str       result = {0};
 
-                               smart_str_appendl(&result, tval, tlen);
-                               pos += len;
-                               found = 1;
+       while (pos <= lastpos) {
+               HASH    h               = php_strtr_hash(&S(text)[pos + d->m - 
d->B], d->B) & d->shift->table_mask;
+               STRLEN  shift   = d->shift->entries[h];
 
-                               if (Z_TYPE_PP(trans) != IS_STRING) {
-                                       zval_dtor(&tmp);
-                               }
-                               break;
+               if (shift > 0) {
+                       smart_str_appendl(&result, &S(text)[pos], shift);
+                       pos += shift;
+               } else {
+                       HASH    h2                              = h & 
d->hash->table_mask,
+                                       prefix_h                = 
php_strtr_hash(&S(text)[pos], d->Bp);
+
+                       int             offset_start    = d->hash->entries[h2],
+                                       offset_end              = 
d->hash->entries[h2 + 1], /* exclusive */
+                                       i                               = 0;
+
+                       for (i = offset_start; i < offset_end; i++) {
+                               PATNREPL *pnr;
+                               if (d->prefix[i] != prefix_h)
+                                       continue;
+
+                               pnr = &d->patterns[i];
+                               if (L(&pnr->pat) > L(text) - pos ||
+                                               memcmp(S(&pnr->pat), 
&S(text)[pos], L(&pnr->pat)) != 0)
+                                       continue;
+                               
+                               smart_str_appendl(&result, S(&pnr->repl), 
(int)L(&pnr->repl));
+                               pos += L(&pnr->pat);
+                               goto end_outer_loop;
                        }
+
+                       smart_str_appendc(&result, S(text)[pos]);
+                       pos++;
+end_outer_loop: ;
                }
+       }
+
+       if (pos < L(text)) {
+               smart_str_appendl(&result, &S(text)[pos], (int)(L(text) - pos));
+       }
+
+       if (result.c != NULL) {
+               smart_str_0(&result);
+               RETVAL_STRINGL(result.c, result.len, 0);
+       } else {
+               RETURN_EMPTY_STRING();
+       }
+}
+/* }}} */
+
+/* {{{ php_strtr_array */
+static void php_strtr_array(zval *return_value, char *str, int slen, HashTable 
*pats)
+{      
+       PPRES                   *data;
+       STR                             text;
+       PATNREPL                *patterns;
+       HashPosition    hpos;
+       zval                    **entry;
+       int                             num_pats = zend_hash_num_elements(pats),
+                                       i;
+
+       S(&text) = str;
+       L(&text) = slen;
+       patterns = safe_emalloc(num_pats, sizeof(*patterns), 0);
+
+       for (i = 0, zend_hash_internal_pointer_reset_ex(pats, &hpos);
+                       zend_hash_get_current_data_ex(pats, (void **)&entry, 
&hpos) == SUCCESS;
+                       i++, zend_hash_move_forward_ex(pats, &hpos)) {
+               char    *string_key;
+               uint    string_key_len;
+               ulong   num_key;
+               int             free_str = 0,
+                               free_repl = 0;
+               zval    *tzv;
+
+               switch (zend_hash_get_current_key_ex(pats, &string_key, 
&string_key_len, &num_key, 0, &hpos)) {
+               case HASH_KEY_IS_LONG:
+                       string_key_len = 1 + zend_spprintf(&string_key, 0, 
"%ld", (long)num_key);
+                       free_str = 1;
+                       /* break missing intentionally */
+
+               case HASH_KEY_IS_STRING:
+                       string_key_len--; /* exclude final '\0' */
+                       if (string_key_len == 0) { /* empty string given as 
pattern */
+                               efree(patterns);
+                               RETURN_FALSE;   
+                       }
+                       if (string_key_len > slen) { /* this pattern can never 
match */
+                               continue;
+                       }
+
+                       if (Z_TYPE_PP(entry) != IS_STRING) {
+                               tzv = *entry;
+                               zval_addref_p(tzv);
+                               SEPARATE_ZVAL(&tzv);
+                               convert_to_string(tzv);
+                               entry = &tzv;
+                               free_repl = 1;
+                       }
 
-               if (! found) {
-                       smart_str_appendc(&result, str[pos++]);
+                       S(&patterns[i].pat) = string_key;
+                       L(&patterns[i].pat) = string_key_len;
+                       S(&patterns[i].repl) = Z_STRVAL_PP(entry);
+                       L(&patterns[i].repl) = Z_STRLEN_PP(entry);
                }
        }
 
-       efree(key);
-       zend_hash_destroy(&tmp_hash);
-       smart_str_0(&result);
-       RETVAL_STRINGL(result.c, result.len, 0);
+       data = php_strtr_array_prepare(&text, patterns, i, 2, 2);
+       efree(patterns);
+       php_strtr_array_do_repl(&text, data, return_value);
+       php_strtr_array_destroy_ppres(data);
 }
 /* }}} */
-- 
PHP CVS Mailing List (http://www.php.net/)
To unsubscribe, visit: http://www.php.net/unsub.php

Reply via email to