Ok for me, +1. --
Nicos - CHAILLAN Nicolas [EMAIL PROTECTED] www.WorldAKT.com - Hébergement de sites Internet "Ilia A." <[EMAIL PROTECTED]> a écrit dans le message de news: [EMAIL PROTECTED] > As clearly demonstrated by bug report #16545, the current implementation of > parse_url() function is extremely slow. I wrote a custom state machine parser > to do the parsing in the place of the current regular expression. > The result is code that takes a fraction of the time to run and in addition to > running faster, resolves some bugs in the existing implementation of the > parse_url() function. > I've tested the code using a parse_url() test created by Derick, which can be > found at php4/ext/standard/tests/strings/url_t.phpt. The new code passes the > test admirably and is in excess of 30 times faster than the existing > implementation. > > Please try the attached patch and let me know if there are any objections to > replacing the current parse_url implementation with the one I propose. > > Ilia ---------------------------------------------------------------------------- ---- > Index: url.c > =================================================================== > RCS file: /repository/php4/ext/standard/url.c,v > retrieving revision 1.52 > diff -u -3 -p -r1.52 url.c > --- url.c 10 Sep 2002 08:06:25 -0000 1.52 > +++ url.c 6 Oct 2002 00:51:11 -0000 > @@ -85,107 +85,143 @@ PHPAPI char *php_replace_controlchars(ch > */ > PHPAPI php_url *php_url_parse(char *str) > { > - regex_t re; > - regmatch_t subs[11]; > - int err; > int length = strlen(str); > - char *result; > + char port_buf[5]; > php_url *ret = ecalloc(1, sizeof(php_url)); > + char *s, *e, *p, *pp, *ue; > + > + s = str; > + ue = s + length; > > - /* from Appendix B of draft-fielding-url-syntax-09, > - http://www.ics.uci.edu/~fielding/url/url.txt */ > - err = regcomp(&re, "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?", REG_EXTENDED); > - if (err) { > - /*php_error(E_WARNING, "Unable to compile regex: %d\n", err);*/ > - 
efree(ret); > - return NULL; > - } > - err = regexec(&re, str, 10, subs, 0); > - if (err) { > - /*php_error(E_WARNING, "Error with regex\n");*/ > - efree(ret); > - regfree(&re); > - return NULL; > - } > - /* no processing necessary on the scheme */ > - if (subs[2].rm_so != -1 && subs[2].rm_so <= length) { > - ret->scheme = estrndup(str + subs[2].rm_so, subs[2].rm_eo - subs[2].rm_so); > + /* parse scheme */ > + if ((e = strchr(s, ':')) && *(e+1) == '/' && *(e+2) == '/' && (e-s)) { > + ret->scheme = estrndup(s, (e-s)); > php_replace_controlchars(ret->scheme); > - } > - > - /* the path to the resource */ > - if (subs[5].rm_so != -1 && subs[5].rm_so <= length) { > - ret->path = estrndup(str + subs[5].rm_so, subs[5].rm_eo - subs[5].rm_so); > + s = e + 3; > + } else if (e) { /* no scheme, look for port */ > + p = e + 1; > + pp = p; > + > + while (pp-p < 6 && isdigit(*pp)) { > + pp++; > + } > + > + if (pp-p < 6 && (*pp == '/' || *pp == '\0')) { > + memcpy(port_buf, p, (pp-p)); > + port_buf[pp-p] = '\0'; > + ret->port = atoi(port_buf); > + } else { > + goto just_path; > + } > + } else { > + just_path: > + ret->path = estrndup(str, length); > php_replace_controlchars(ret->path); > + return ret; > } > - > - /* the query part */ > - if (subs[7].rm_so != -1 && subs[7].rm_so <= length) { > - ret->query = estrndup(str + subs[7].rm_so, subs[7].rm_eo - subs[7].rm_so); > - php_replace_controlchars(ret->query); > - } > - > - /* the fragment */ > - if (subs[9].rm_so != -1 && subs[9].rm_so <= length) { > - ret->fragment = estrndup(str + subs[9].rm_so, subs[9].rm_eo - subs[9].rm_so); > - php_replace_controlchars(ret->fragment); > + > + if (!(e = strchr(s, '/'))) { > + e = ue; > } > > - /* extract the username, pass, and port from the hostname */ > - if (subs[4].rm_so != -1 && subs[4].rm_so <= length) { > - > - int cerr; > - /* extract username:pass@host:port from regex results */ > - result = estrndup(str + subs[4].rm_so, subs[4].rm_eo - subs[4].rm_so); > - length = strlen(result); > - 
> - regfree(&re); /* free the old regex */ > + /* check for login and password */ > + if ((p = memchr(s, '@', (e-s)))) { > + if ((pp = memchr(s, ':', (p-s)))) { > + if ((pp-s) > 0) { > + ret->user = estrndup(s, (pp-s)); > + php_replace_controlchars(ret->user); > + } > > - if (length) { > - if ((cerr=regcomp(&re, "^(([^@:]+)(:([^@:]+))?@)?((\\[([^]]+)\\])|([^:@]+))(:([^:@]+))?", REG_EXTENDED)) > - || (err=regexec(&re, result, 11, subs, 0))) { > + if (p-pp > 1) { > + ret->pass = estrndup(++pp, (p-pp-1)); > + php_replace_controlchars(ret->pass); > + } > + } > + > + s = p + 1; > + } > + > + /* check for port */ > + if ((p = memchr(s, ':', (e-s)))) { > + if (!ret->port) { > + p++; > + if ( e-p > 5 || e-p < 1 ) { /* port cannot be longer then 5 characters */ > STR_FREE(ret->scheme); > - STR_FREE(ret->path); > - STR_FREE(ret->query); > - STR_FREE(ret->fragment); > + STR_FREE(ret->user); > + STR_FREE(ret->pass); > efree(ret); > - efree(result); > - /*php_error(E_WARNING, "Unable to compile regex: %d\n", err);*/ > - if (!cerr) regfree(&re); > return NULL; > } > - /* now deal with all of the results */ > - if (subs[2].rm_so != -1 && subs[2].rm_so < length) { > - ret->user = estrndup(result + subs[2].rm_so, subs[2].rm_eo - subs[2].rm_so); > - php_replace_controlchars(ret->user); > - } > - if (subs[4].rm_so != -1 && subs[4].rm_so < length) { > - ret->pass = estrndup(result + subs[4].rm_so, subs[4].rm_eo - subs[4].rm_so); > - php_replace_controlchars(ret->pass); > - } > - if (subs[7].rm_so != -1 && subs[7].rm_so < length) { > - ret->host = estrndup(result + subs[7].rm_so, subs[7].rm_eo - subs[7].rm_so); > - php_replace_controlchars(ret->host); > - } else if (subs[8].rm_so != -1 && subs[8].rm_so < length) { > - ret->host = estrndup(result + subs[8].rm_so, subs[8].rm_eo - subs[8].rm_so); > - php_replace_controlchars(ret->host); > - } > - if (subs[10].rm_so != -1 && subs[10].rm_so < length) { > - ret->port = (unsigned short) strtol(result + subs[10].rm_so, NULL, 10); > - } > - } > 
- efree(result); > - } > - else if (ret->scheme && !strcmp(ret->scheme, "http")) { > + > + memcpy(port_buf, p, (e-p)); > + port_buf[e-p] = '\0'; > + ret->port = atoi(port_buf); > + p--; > + } > + } else { > + p = e; > + } > + > + /* check if we have a valid host, if we don't reject the string as url */ > + if ((p-s) < 1) { > STR_FREE(ret->scheme); > - STR_FREE(ret->path); > - STR_FREE(ret->query); > - STR_FREE(ret->fragment); > + STR_FREE(ret->user); > + STR_FREE(ret->pass); > efree(ret); > - regfree(&re); > return NULL; > } > - regfree(&re); > + > + ret->host = estrndup(s, (p-s)); > + php_replace_controlchars(ret->host); > + > + if (e == ue) { > + return ret; > + } > + > + s = e; > + > + if ((p = strchr(s, '?'))) { > + pp = strchr(s, '#'); > + > + if (pp && pp < p) { > + p = pp; > + pp = strchr(pp+2, '#'); > + } > + > + if (p - s) { > + ret->path = estrndup(s, (p-s)); > + php_replace_controlchars(ret->path); > + } > + > + if (pp) { > + if (pp - ++p) { > + ret->query = estrndup(p, (pp-p)); > + php_replace_controlchars(ret->query); > + } > + p = pp; > + goto label_parse; > + } else if (++p - ue) { > + ret->query = estrndup(p, (ue-p)); > + php_replace_controlchars(ret->query); > + } > + } else if ((p = strchr(s, '#'))) { > + if (p - s) { > + ret->path = estrndup(s, (p-s)); > + php_replace_controlchars(ret->path); > + } > + > + label_parse: > + p++; > + > + if (ue - p) { > + ret->fragment = estrndup(p, (ue-p)); > + php_replace_controlchars(ret->fragment); > + } > + } else { > + ret->path = estrndup(s, (ue-s)); > + php_replace_controlchars(ret->path); > + } > + > return ret; > } > /* }}} */ > -- PHP Development Mailing List <http://www.php.net/> To unsubscribe, visit: http://www.php.net/unsub.php