On 04/21/2015 04:19 PM, Darshit Shah wrote:
Regarding the patch itself, I wanted to ask if it would not be cleaner to dig
into the code and replace every call to url_unescape with the new prototype? In
my opinion that would help in maintaining readability and more importantly
maintainability of the code.
I thought of it too, and I agree with you. The reason I haven't done it is
because I'm not really sure whether all the functions that call url_unescape
need the reserved characters escaped or not. I believe there'll be no problems,
but I didn't want to just blindly replace all the calls to url_unescape without
even having a quick look, which is exactly what I didn't have time to do so
far. What do you guys think?
I'll have a closer look as soon as I can (and provided no one does it before)
and roll another patch with the replacements. Unless of course someone already
knows the answer.
Regarding the patches, I resend them with the changes made according to your
feedback.
Changes made so far:
- Merged the prototype patch into 1.
- Shortened commit messages.
- New test added to Makefile.am (in patch 2).
--
Regards,
- AJ
>From 0123e79976aa2ec35f9610fbf3094457b689d091 Mon Sep 17 00:00:00 2001
From: Ander Juaristi <[email protected]>
Date: Mon, 13 Apr 2015 16:28:36 +0200
Subject: [PATCH 1/2] Fixed incorrect handling of reserved chars.
* src/url.c (url_unescape_1): New static function.
(url_unescape): Calls url_unescape_1 with mask zero. Preserves
same behavior as before. Only code changes.
(url_unescape_except_reserved): New function.
* src/url.h: Added prototype for url_unescape_except_reserved().
When the locale is US-ASCII, URIs that contain special characters
in them are converted to IRIs according to RFC 3987, section 3.2
"Converting URIs to IRIs".
---
src/iri.c | 2 +-
src/url.c | 40 +++++++++++++++++++++++++++++-----------
src/url.h | 1 +
3 files changed, 31 insertions(+), 12 deletions(-)
diff --git a/src/iri.c b/src/iri.c
index 5278dee..10ae994 100644
--- a/src/iri.c
+++ b/src/iri.c
@@ -136,7 +136,7 @@ do_conversion (const char *tocode, const char *fromcode, char const *in_org, siz
/* iconv() has to work on an unescaped string */
in_save = in = xstrndup (in_org, inlen);
- url_unescape(in);
+ url_unescape_except_reserved (in);
inlen = strlen(in);
len = outlen = inlen * 2;
diff --git a/src/url.c b/src/url.c
index e78dbc6..73c8dd0 100644
--- a/src/url.c
+++ b/src/url.c
@@ -161,17 +161,8 @@ static const unsigned char urlchr_table[256] =
#undef U
#undef RU
-/* URL-unescape the string S.
-
- This is done by transforming the sequences "%HH" to the character
- represented by the hexadecimal digits HH. If % is not followed by
- two hexadecimal digits, it is inserted literally.
-
- The transformation is done in place. If you need the original
- string intact, make a copy before calling this function. */
-
-void
-url_unescape (char *s)
+static void
+url_unescape_1 (char *s, unsigned char mask)
{
char *t = s; /* t - tortoise */
char *h = s; /* h - hare */
@@ -190,6 +181,8 @@ url_unescape (char *s)
if (!h[1] || !h[2] || !(c_isxdigit (h[1]) && c_isxdigit (h[2])))
goto copychar;
c = X2DIGITS_TO_NUM (h[1], h[2]);
+ if (urlchr_test(c, mask))
+ goto copychar;
/* Don't unescape %00 because there is no way to insert it
into a C string without effectively truncating it. */
if (c == '\0')
@@ -201,6 +194,31 @@ url_unescape (char *s)
*t = '\0';
}
+/* URL-unescape the string S.
+
+ This is done by transforming the sequences "%HH" to the character
+ represented by the hexadecimal digits HH. If % is not followed by
+ two hexadecimal digits, it is inserted literally.
+
+ The transformation is done in place. If you need the original
+ string intact, make a copy before calling this function. */
+void
+url_unescape (char *s)
+{
+ url_unescape_1 (s, 0);
+}
+
+/* URL-unescape the string S.
+
+ This functions behaves identically as url_unescape(), but does not
+ convert characters from "reserved". In other words, it only converts
+ "unsafe" characters. */
+void
+url_unescape_except_reserved (char *s)
+{
+ url_unescape_1 (s, urlchr_reserved);
+}
+
/* The core of url_escape_* functions. Escapes the characters that
match the provided mask in urlchr_table.
diff --git a/src/url.h b/src/url.h
index b1c46c1..a543f3d 100644
--- a/src/url.h
+++ b/src/url.h
@@ -106,6 +106,7 @@ struct url
char *url_escape (const char *);
char *url_escape_unsafe_and_reserved (const char *);
void url_unescape (char *);
+void url_unescape_except_reserved (char *);
struct url *url_parse (const char *, int *, struct iri *iri, bool percent_encode);
char *url_error (const char *, int);
--
1.9.1
>From 2e295c70bd89f2d40380404ae7a2745a7b0c6075 Mon Sep 17 00:00:00 2001
From: Ander Juaristi <[email protected]>
Date: Mon, 20 Apr 2015 23:16:18 +0200
Subject: [PATCH 2/2] Make sure Wget does not unescape reserved chars.
* testenv/Test-reserved-chars.py: New file.
When following redirections, Wget should not unescape the reserved
characters that might appear in target URLs.
---
testenv/Makefile.am | 3 ++-
testenv/Test-reserved-chars.py | 59 ++++++++++++++++++++++++++++++++++++++++++
2 files changed, 61 insertions(+), 1 deletion(-)
create mode 100755 testenv/Test-reserved-chars.py
diff --git a/testenv/Makefile.am b/testenv/Makefile.am
index a4e0352..9acf0f3 100644
--- a/testenv/Makefile.am
+++ b/testenv/Makefile.am
@@ -53,7 +53,8 @@ if HAVE_PYTHON3
Test-Post.py \
Test-504.py \
Test--spider-r.py \
- Test-redirect-crash.py
+ Test-redirect-crash.py \
+ Test-reserved-chars.py
# added test cases expected to fail here and under TESTS
XFAIL_TESTS =
diff --git a/testenv/Test-reserved-chars.py b/testenv/Test-reserved-chars.py
new file mode 100755
index 0000000..5ef366a
--- /dev/null
+++ b/testenv/Test-reserved-chars.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+from sys import exit
+from os import environ # to set LC_ALL
+from test.http_test import HTTPTest
+from misc.wget_file import WgetFile
+
+"""
+This test ensures that Wget keeps reserved characters in URLs in non-UTF-8 charsets.
+"""
+# This bug only happened with ASCII charset,
+# so we need to set LC_ALL="C" in order to reproduce it.
+environ["LC_ALL"] = "C"
+
+TEST_NAME = "URLs with reserved characters"
+######### File Definitions #########
+RequestList = [
+ [
+ "HEAD /base.html",
+ "GET /base.html",
+ "GET /robots.txt",
+ "HEAD /a%2Bb.html",
+ "GET /a%2Bb.html"
+ ]
+]
+A_File_Name = "base.html"
+B_File_Name = "a%2Bb.html"
+A_File = WgetFile (A_File_Name, "<a href=\"a%2Bb.html\">")
+B_File = WgetFile (B_File_Name, "this is file B")
+
+WGET_OPTIONS = " --spider -r"
+WGET_URLS = [[A_File_Name]]
+
+Files = [[A_File, B_File]]
+
+ExpectedReturnCode = 0
+ExpectedDownloadedFiles = []
+
+######### Pre and Post Test Hooks #########
+pre_test = {
+ "ServerFiles" : Files
+}
+test_options = {
+ "WgetCommands" : WGET_OPTIONS,
+ "Urls" : WGET_URLS
+}
+post_test = {
+ "ExpectedFiles" : ExpectedDownloadedFiles,
+ "ExpectedRetcode" : ExpectedReturnCode,
+ "FilesCrawled" : RequestList
+}
+
+err = HTTPTest (
+ name=TEST_NAME,
+ pre_hook=pre_test,
+ test_params=test_options,
+ post_hook=post_test
+).begin ()
+
+exit (err)
--
1.9.1