The attached patch makes url_normalize take care of whitespace in a
fairly useful way, consistent with other browsers:

 - Leading and trailing whitespace is trimmed
 - Internal whitespace is urlescaped

For example,
 "  http://www.google.co.uk/search?q=hello world  "
becomes
 "http://www.google.co.uk/search?q=hello%20world"

Explicit trailing whitespace, e.g. "...hello world%20", is left alone.

The upshot is that if you sloppily copy-paste a URL from IRC or whatnot
into the address bar, NetSurf no longer silently ignores you if you
caught some adjacent whitespace.

Apparently there are some badly-written websites out there in the wild
(who knew?) which don't escape their mid-URL whitespace properly,
either, so this should improve the chances of NetSurf working with them.

LionsPhil

Index: utils/url.c
===================================================================
--- utils/url.c	(revision 4195)
+++ utils/url.c	(working copy)
@@ -129,39 +129,66 @@
 	int m;
 	int i;
 	size_t len;
+	size_t bufsize;
 	bool http = false;
 	regmatch_t match[10];
 
 	*result = NULL;
 
-	if ((m = regexec(&url_re, url, 10, match, 0))) {
+	/* skip past any leading whitespace (likely if URL was copy-pasted) */
+	while (isspace(*url))
+		url++;
+
+	/* allocate sufficiently large buffer for new URL */
+	len = strlen(url);
+	bufsize = len + 7 + 1 + 1; /* 'http://' + '/' + '\0' */
+	/* work out how much extra to leave for internal whitespace */
+	for(i = 0; i < len; i++) {
+		if(isspace(url[i])) bufsize += 2; /* ' ' -> '%20' */
+	}
+	if ((*result = malloc(bufsize)) == NULL) {
+		LOG(("malloc failed"));
+		return URL_FUNC_NOMEM;
+	}
+	strcpy(*result, url);
+
+	/* truncate trailing whitespace (significant should be uriencoded) */
+	for (i = len - 1; (i > 0) && isspace((*result)[i]); i--) {
+		(*result)[i] = '\0';
+		len--;
+	}
+
+	/* encode any remaining (internal) whitespace */
+	for (i = 0; i < len; i++) {
+		if(isspace((*result)[i])) {
+			/* snprintf is all too keen to write damn nulls */
+			char esc[4];
+			char space = (*result)[i];
+			memmove((*result) + i + 2, (*result) + i, 1 + len - i);
+			len += 2;
+			snprintf(esc, 4, "%%%02hhx", space);
+			strncpy((*result) + i, esc, 3);
+		}
+	}
+
+	/* finally verify that it's actually an URL we're working on
+	 * (RFC regex too fussy to tolerate above WSP problems) */
+	if ((m = regexec(&url_re, *result, 10, match, 0))) {
 		LOG(("url '%s' failed to match regex", url));
 		return URL_FUNC_FAILED;
 	}
 
-	len = strlen(url);
-
 	if (match[URL_RE_SCHEME].rm_so == -1) {
 		/* scheme missing: add http:// and reparse */
 /*		LOG(("scheme missing: using http"));*/
-		if ((*result = malloc(len + 13)) == NULL) {
-			LOG(("malloc failed"));
-			return URL_FUNC_NOMEM;
-		}
-		strcpy(*result, "http://");
-		strcpy(*result + sizeof("http://")-1, url);
+		memmove(*result + 7, *result, len + 1);
+		strncpy(*result, "http://", 7); /* do NOT copy null */
+		len += 7;
 		if ((m = regexec(&url_re, *result, 10, match, 0))) {
 			LOG(("url '%s' failed to match regex", (*result)));
 			free(*result);
 			return URL_FUNC_FAILED;
 		}
-		len += sizeof("http://")-1;
-	} else {
-		if ((*result = malloc(len + 6)) == NULL) {
-			LOG(("malloc failed"));
-			return URL_FUNC_NOMEM;
-		}
-		strcpy(*result, url);
 	}
 
 	/*for (unsigned int i = 0; i != 10; i++) {
@@ -200,7 +227,7 @@
 	if (match[URL_RE_AUTHORITY].rm_so != -1) {
 		for (i = match[URL_RE_AUTHORITY].rm_so;
 				i != match[URL_RE_AUTHORITY].rm_eo; i++) {
-			if ((*result)[i] == ':') {
+			if ((*result)[i] == ':' && (i + 3) < len) {
 				if (http && (*result)[i + 1] == '8' &&
 						(*result)[i + 2] == '0' &&
 						i + 3 ==
@@ -228,7 +255,7 @@
 	}
 
 	/* unescape non-"reserved" escaped characters */
-	for (i = 0; (unsigned)i != len; i++) {
+	for (i = 0; (size_t)i + 2 < len; i++) {
 		if ((*result)[i] != '%')
 			continue;
 		c = tolower((*result)[i + 1]);

Reply via email to