The attached patch makes url_normalize take care of whitespace in a
fairly useful way, consistent with other browsers:
- Leading and trailing whitespace is trimmed
- Internal whitespace is urlescaped
For example,
" http://www.google.co.uk/search?q=hello world "
becomes
"http://www.google.co.uk/search?q=hello%20world"
Trailing whitespace that is already escaped, e.g. "...hello world%20", is left alone.
The upshot is that if you sloppily copy-paste a URL from IRC or whatnot
into the address bar, NetSurf no longer silently ignores you if you
caught some adjacent whitespace.
Apparently there are some badly-written websites out there in the wild
(who knew?) which don't escape their mid-URL whitespace properly,
either, so this should improve the chances of NetSurf working with them.
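In case anyone wants to poke at the policy outside of NetSurf, here's a
rough standalone sketch of the same trim-and-escape idea (trim_and_escape
and everything around it is made up for illustration; the real work lives
in url_normalize, per the patch below):

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Illustrative only: trim leading/trailing whitespace, then
 * percent-encode any whitespace left in the middle. */
static char *trim_and_escape(const char *url)
{
	size_t len, i, j;
	char *out;

	/* skip leading whitespace */
	while (isspace((unsigned char) *url))
		url++;

	/* ignore trailing whitespace */
	len = strlen(url);
	while (len > 0 && isspace((unsigned char) url[len - 1]))
		len--;

	/* worst case, every character becomes "%xx" */
	if ((out = malloc(len * 3 + 1)) == NULL)
		return NULL;

	for (i = 0, j = 0; i < len; i++) {
		unsigned char c = (unsigned char) url[i];
		if (isspace(c))
			j += sprintf(out + j, "%%%02x", (unsigned) c);
		else
			out[j++] = (char) c;
	}
	out[j] = '\0';
	return out;
}

int main(void)
{
	char *s = trim_and_escape(
			" http://www.google.co.uk/search?q=hello world ");
	if (s != NULL) {
		printf("%s\n", s);	/* ...?q=hello%20world */
		free(s);
	}
	return 0;
}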
LionsPhil
Index: utils/url.c
===================================================================
--- utils/url.c (revision 4195)
+++ utils/url.c (working copy)
@@ -129,39 +129,66 @@
int m;
int i;
size_t len;
+ size_t bufsize;
bool http = false;
regmatch_t match[10];
*result = NULL;
- if ((m = regexec(&url_re, url, 10, match, 0))) {
+ /* skip past any leading whitespace (likely if URL was copy-pasted) */
+ while (isspace((unsigned char) *url))
+ url++;
+
+ /* allocate sufficiently large buffer for new URL */
+ len = strlen(url);
+ bufsize = len + 7 + 1 + 1; /* 'http://' + '/' + '\0' */
+ /* work out how much extra to leave for internal whitespace */
+ for (i = 0; i < len; i++) {
+ if (isspace((unsigned char) url[i])) bufsize += 2; /* ' ' -> '%20' */
+ }
+ if ((*result = malloc(bufsize)) == NULL) {
+ LOG(("malloc failed"));
+ return URL_FUNC_NOMEM;
+ }
+ strcpy(*result, url);
+
+ /* truncate trailing whitespace (significant whitespace should be URI-encoded) */
+ for (i = (int) len - 1; (i > 0) && isspace((unsigned char) (*result)[i]); i--) {
+ (*result)[i] = '\0';
+ len--;
+ }
+
+ /* encode any remaining (internal) whitespace */
+ for (i = 0; i < len; i++) {
+ if (isspace((unsigned char) (*result)[i])) {
+ /* snprintf is all too keen to write damn nulls */
+ char esc[4];
+ unsigned char space = (*result)[i];
+ memmove((*result) + i + 2, (*result) + i, 1 + len - i);
+ len += 2;
+ snprintf(esc, sizeof(esc), "%%%02hhx", space);
+ memcpy((*result) + i, esc, 3);
+ }
+ }
+
+ /* finally verify that it's actually a URL we're working on
+ * (RFC regex too fussy to tolerate above WSP problems) */
+ if ((m = regexec(&url_re, *result, 10, match, 0))) {
LOG(("url '%s' failed to match regex", url));
return URL_FUNC_FAILED;
}
- len = strlen(url);
-
if (match[URL_RE_SCHEME].rm_so == -1) {
/* scheme missing: add http:// and reparse */
/* LOG(("scheme missing: using http"));*/
- if ((*result = malloc(len + 13)) == NULL) {
- LOG(("malloc failed"));
- return URL_FUNC_NOMEM;
- }
- strcpy(*result, "http://");
- strcpy(*result + sizeof("http://")-1, url);
+ memmove(*result + 7, *result, len + 1);
+ memcpy(*result, "http://", 7); /* do NOT copy null */
+ len += 7;
if ((m = regexec(&url_re, *result, 10, match, 0))) {
LOG(("url '%s' failed to match regex", (*result)));
free(*result);
return URL_FUNC_FAILED;
}
- len += sizeof("http://")-1;
- } else {
- if ((*result = malloc(len + 6)) == NULL) {
- LOG(("malloc failed"));
- return URL_FUNC_NOMEM;
- }
- strcpy(*result, url);
}
/*for (unsigned int i = 0; i != 10; i++) {
@@ -200,7 +227,7 @@
if (match[URL_RE_AUTHORITY].rm_so != -1) {
for (i = match[URL_RE_AUTHORITY].rm_so;
i != match[URL_RE_AUTHORITY].rm_eo; i++) {
- if ((*result)[i] == ':') {
+ if ((*result)[i] == ':' && (i + 3) <= len) {
if (http && (*result)[i + 1] == '8' &&
(*result)[i + 2] == '0' &&
i + 3 ==
@@ -228,7 +255,7 @@
}
/* unescape non-"reserved" escaped characters */
- for (i = 0; (unsigned)i != len; i++) {
+ for (i = 0; (size_t)i + 2 < len; i++) {
if ((*result)[i] != '%')
continue;
c = tolower((*result)[i + 1]);
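From the caller's side nothing changes; usage is still along these lines
(a sketch, assuming the existing url_normalize(const char *url,
char **result) signature and the URL_FUNC_OK result code from utils/url.h):

	char *clean;
	if (url_normalize("  http://example.com/a b  ", &clean)
			== URL_FUNC_OK) {
		/* clean should come back as "http://example.com/a%20b";
		 * the caller owns and frees it */
		free(clean);
	}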