As clearly demonstrated by bug report #16545, the current implementation of the 
parse_url() function is extremely slow. I wrote a custom state machine parser 
to do the parsing in place of the current regular expression.
The result is code that takes a fraction of the time to run and, in addition to 
running faster, resolves some bugs in the existing implementation of the 
parse_url() function.
I've tested the code using a parse_url() test created by Derick, which can be 
found at php4/ext/standard/tests/strings/url_t.phpt. The new code passes the 
test admirably and is in excess of 30 times faster than the existing 
implementation.

Please try the attached patch and let me know if there are any objections to 
replacing the current parse_url implementation with the one I propose.

Ilia
Index: url.c
===================================================================
RCS file: /repository/php4/ext/standard/url.c,v
retrieving revision 1.52
diff -u -3 -p -r1.52 url.c
--- url.c       10 Sep 2002 08:06:25 -0000      1.52
+++ url.c       6 Oct 2002 00:51:11 -0000
@@ -85,107 +85,143 @@ PHPAPI char *php_replace_controlchars(ch
  */
 PHPAPI php_url *php_url_parse(char *str)
 {
-       regex_t re;
-       regmatch_t subs[11];
-       int err;
        int length = strlen(str);
-       char *result;
+       char port_buf[5];
        php_url *ret = ecalloc(1, sizeof(php_url));
+       char *s, *e, *p, *pp, *ue;
+               
+       s = str;
+       ue = s + length;
 
-       /* from Appendix B of draft-fielding-url-syntax-09,
-          http://www.ics.uci.edu/~fielding/url/url.txt */
-       err = regcomp(&re, 
"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?", REG_EXTENDED);
-       if (err) {
-               /*php_error(E_WARNING, "Unable to compile regex: %d\n", err);*/
-               efree(ret);
-               return NULL;
-       }
-       err = regexec(&re, str, 10, subs, 0);
-       if (err) {
-               /*php_error(E_WARNING, "Error with regex\n");*/
-               efree(ret);
-               regfree(&re);
-               return NULL;
-       }
-       /* no processing necessary on the scheme */
-       if (subs[2].rm_so != -1 && subs[2].rm_so <= length) {
-               ret->scheme = estrndup(str + subs[2].rm_so, subs[2].rm_eo - 
subs[2].rm_so);
+       /* parse scheme */
+       if ((e = strchr(s, ':')) && *(e+1) == '/' && *(e+2) == '/' && (e-s)) {
+               ret->scheme = estrndup(s, (e-s));
                php_replace_controlchars(ret->scheme);
-       }
-
-       /* the path to the resource */
-       if (subs[5].rm_so != -1 && subs[5].rm_so <= length) {
-               ret->path = estrndup(str + subs[5].rm_so, subs[5].rm_eo - 
subs[5].rm_so);
+               s = e + 3;
+       } else if (e) { /* no scheme, look for port */
+               p = e + 1;
+               pp = p;
+               
+               while (pp-p < 6 && isdigit(*pp)) {
+                       pp++;
+               }
+               
+               if (pp-p < 6 && (*pp == '/' || *pp == '\0')) {
+                       memcpy(port_buf, p, (pp-p));
+                       port_buf[pp-p] = '\0';
+                       ret->port = atoi(port_buf);
+               } else {
+                       goto just_path;
+               }
+       } else {
+               just_path:
+               ret->path = estrndup(str, length);
                php_replace_controlchars(ret->path);
+               return ret;
        }
-
-       /* the query part */
-       if (subs[7].rm_so != -1 && subs[7].rm_so <= length) {
-               ret->query = estrndup(str + subs[7].rm_so, subs[7].rm_eo - 
subs[7].rm_so);
-               php_replace_controlchars(ret->query);
-       }
-
-       /* the fragment */
-       if (subs[9].rm_so != -1 && subs[9].rm_so <= length) {
-               ret->fragment = estrndup(str + subs[9].rm_so, subs[9].rm_eo - 
subs[9].rm_so);
-               php_replace_controlchars(ret->fragment);
+       
+       if (!(e = strchr(s, '/'))) {
+               e = ue;
        }
 
-       /* extract the username, pass, and port from the hostname */
-       if (subs[4].rm_so != -1 && subs[4].rm_so <= length) {
-
-               int cerr;
-               /* extract username:pass@host:port from regex results */
-               result = estrndup(str + subs[4].rm_so, subs[4].rm_eo - subs[4].rm_so);
-               length = strlen(result);
-
-               regfree(&re);                   /* free the old regex */
+       /* check for login and password */
+       if ((p = memchr(s, '@', (e-s)))) {
+               if ((pp = memchr(s, ':', (p-s)))) {
+                       if ((pp-s) > 0) {
+                               ret->user = estrndup(s, (pp-s));
+                               php_replace_controlchars(ret->user);
+                       }       
                
-               if (length) {
-                       if ((cerr=regcomp(&re, 
"^(([^@:]+)(:([^@:]+))?@)?((\\[([^]]+)\\])|([^:@]+))(:([^:@]+))?", REG_EXTENDED))
-                               || (err=regexec(&re, result, 11, subs, 0))) {
+                       if (p-pp > 1) { 
+                               ret->pass = estrndup(++pp, (p-pp-1));
+                               php_replace_controlchars(ret->pass);
+                       }       
+               }
+               
+               s = p + 1;
+       }
+       
+       /* check for port */
+       if ((p = memchr(s, ':', (e-s)))) {
+               if (!ret->port) {
+                       p++;
+                       if ( e-p > 5 || e-p < 1 ) { /* port cannot be longer then 5 
+characters */
                                STR_FREE(ret->scheme);
-                               STR_FREE(ret->path);
-                               STR_FREE(ret->query);
-                               STR_FREE(ret->fragment);
+                               STR_FREE(ret->user);
+                               STR_FREE(ret->pass);
                                efree(ret);
-                               efree(result);
-                               /*php_error(E_WARNING, "Unable to compile regex: 
%d\n", err);*/
-                               if (!cerr) regfree(&re); 
                                return NULL;
                        }
-                       /* now deal with all of the results */
-                       if (subs[2].rm_so != -1 && subs[2].rm_so < length) {
-                               ret->user = estrndup(result + subs[2].rm_so, 
subs[2].rm_eo - subs[2].rm_so);
-                               php_replace_controlchars(ret->user);
-                       }
-                       if (subs[4].rm_so != -1 && subs[4].rm_so < length) {
-                               ret->pass = estrndup(result + subs[4].rm_so, 
subs[4].rm_eo - subs[4].rm_so);
-                               php_replace_controlchars(ret->pass);
-                       }
-                       if (subs[7].rm_so != -1 && subs[7].rm_so < length) {
-                               ret->host = estrndup(result + subs[7].rm_so, 
subs[7].rm_eo - subs[7].rm_so);
-                               php_replace_controlchars(ret->host);
-                       } else if (subs[8].rm_so != -1 && subs[8].rm_so < length) {
-                               ret->host = estrndup(result + subs[8].rm_so, 
subs[8].rm_eo - subs[8].rm_so);
-                               php_replace_controlchars(ret->host);
-                       }
-                       if (subs[10].rm_so != -1 && subs[10].rm_so < length) {
-                               ret->port = (unsigned short) strtol(result + 
subs[10].rm_so, NULL, 10);
-                       }
-               }
-               efree(result);
-       }
-       else if (ret->scheme && !strcmp(ret->scheme, "http")) {
+               
+                       memcpy(port_buf, p, (e-p));
+                       port_buf[e-p] = '\0';
+                       ret->port = atoi(port_buf);
+                       p--;
+               }       
+       } else {
+               p = e;
+       }
+       
+       /* check if we have a valid host, if we don't reject the string as url */
+       if ((p-s) < 1) {
                STR_FREE(ret->scheme);
-               STR_FREE(ret->path);
-               STR_FREE(ret->query);
-               STR_FREE(ret->fragment);
+               STR_FREE(ret->user);
+               STR_FREE(ret->pass);
                efree(ret);
-               regfree(&re);
                return NULL;
        }
-       regfree(&re);
+       
+       ret->host = estrndup(s, (p-s));
+       php_replace_controlchars(ret->host);
+       
+       if (e == ue) {
+               return ret;
+       }
+       
+       s = e;
+       
+       if ((p = strchr(s, '?'))) {
+               pp = strchr(s, '#');
+               
+               if (pp && pp < p) {
+                       p = pp;
+                       pp = strchr(pp+2, '#');
+               }
+       
+               if (p - s) {
+                       ret->path = estrndup(s, (p-s));
+                       php_replace_controlchars(ret->path);
+               }       
+       
+               if (pp) {
+                       if (pp - ++p) { 
+                               ret->query = estrndup(p, (pp-p));
+                               php_replace_controlchars(ret->query);
+                       }
+                       p = pp;
+                       goto label_parse;
+               } else if (++p - ue) {
+                       ret->query = estrndup(p, (ue-p));
+                       php_replace_controlchars(ret->query);
+               }
+       } else if ((p = strchr(s, '#'))) {
+               if (p - s) {
+                       ret->path = estrndup(s, (p-s));
+                       php_replace_controlchars(ret->path);
+               }       
+               
+               label_parse:
+               p++;
+               
+               if (ue - p) {
+                       ret->fragment = estrndup(p, (ue-p));
+                       php_replace_controlchars(ret->fragment);
+               }       
+       } else {
+               ret->path = estrndup(s, (ue-s));
+               php_replace_controlchars(ret->path);
+       }
+
        return ret;
 }
 /* }}} */

-- 
PHP Development Mailing List <http://www.php.net/>
To unsubscribe, visit: http://www.php.net/unsub.php

Reply via email to