On 04/21/2015 04:19 PM, Darshit Shah wrote:
Regarding the patch itself, I wanted to ask if it would not be cleaner to dig 
into the code and replace every call to url_unescape with the new prototype? In 
my opinion that would help in maintaining readability and more importantly 
maintainability of the code.

I thought of it too, and I agree with you. The reason I haven't done it is 
because I'm not really sure whether all the functions that call url_unescape 
need the reserved characters escaped or not. I believe there'll be no problems, 
but I didn't want to just blindly replace all the calls to url_unescape without 
even having a quick look, which is exactly what I didn't have time to do so 
far. What do you guys think?
I'll have a closer look as soon as I can (and provided no one does it before) 
and roll another patch with the replacements. Unless of course someone already 
knows the answer.

Regarding the patches, I resend them with the changes made according to your 
feedback.
Changes made so far:
    - Merged the prototype patch into 1.
    - Shortened commit messages.
    - New test added to Makefile.am (in patch 2).

--
Regards,
- AJ
>From 0123e79976aa2ec35f9610fbf3094457b689d091 Mon Sep 17 00:00:00 2001
From: Ander Juaristi <[email protected]>
Date: Mon, 13 Apr 2015 16:28:36 +0200
Subject: [PATCH 1/2] Fixed incorrect handling of reserved chars.

* src/url.c (url_unescape_1): New static function.
(url_unescape): Calls url_unescape_1 with mask zero. Preserves
same behavior as before. Only code changes.
(url_unescape_except_reserved): New function.

* src/url.h: Added prototype for url_unescape_except_reserved().

When the locale is US-ASCII, URIs that contain special characters
in them are converted to IRIs according to RFC 3987, section 3.2
"Converting URIs to IRIs".
---
 src/iri.c |  2 +-
 src/url.c | 40 +++++++++++++++++++++++++++++-----------
 src/url.h |  1 +
 3 files changed, 31 insertions(+), 12 deletions(-)

diff --git a/src/iri.c b/src/iri.c
index 5278dee..10ae994 100644
--- a/src/iri.c
+++ b/src/iri.c
@@ -136,7 +136,7 @@ do_conversion (const char *tocode, const char *fromcode, char const *in_org, siz
 
   /* iconv() has to work on an unescaped string */
   in_save = in = xstrndup (in_org, inlen);
-  url_unescape(in);
+  url_unescape_except_reserved (in);
   inlen = strlen(in);
 
   len = outlen = inlen * 2;
diff --git a/src/url.c b/src/url.c
index e78dbc6..73c8dd0 100644
--- a/src/url.c
+++ b/src/url.c
@@ -161,17 +161,8 @@ static const unsigned char urlchr_table[256] =
 #undef U
 #undef RU
 
-/* URL-unescape the string S.
-
-   This is done by transforming the sequences "%HH" to the character
-   represented by the hexadecimal digits HH.  If % is not followed by
-   two hexadecimal digits, it is inserted literally.
-
-   The transformation is done in place.  If you need the original
-   string intact, make a copy before calling this function.  */
-
-void
-url_unescape (char *s)
+static void
+url_unescape_1 (char *s, unsigned char mask)
 {
   char *t = s;                  /* t - tortoise */
   char *h = s;                  /* h - hare     */
@@ -190,6 +181,8 @@ url_unescape (char *s)
           if (!h[1] || !h[2] || !(c_isxdigit (h[1]) && c_isxdigit (h[2])))
             goto copychar;
           c = X2DIGITS_TO_NUM (h[1], h[2]);
+          if (urlchr_test(c, mask))
+            goto copychar;
           /* Don't unescape %00 because there is no way to insert it
              into a C string without effectively truncating it. */
           if (c == '\0')
@@ -201,6 +194,31 @@ url_unescape (char *s)
   *t = '\0';
 }
 
+/* URL-unescape the string S.
+
+   This is done by transforming the sequences "%HH" to the character
+   represented by the hexadecimal digits HH.  If % is not followed by
+   two hexadecimal digits, it is inserted literally.
+
+   The transformation is done in place.  If you need the original
+   string intact, make a copy before calling this function.  */
+void
+url_unescape (char *s)
+{
+  url_unescape_1 (s, 0);
+}
+
+/* URL-unescape the string S.
+
+   This functions behaves identically as url_unescape(), but does not
+   convert characters from "reserved". In other words, it only converts
+   "unsafe" characters.  */
+void
+url_unescape_except_reserved (char *s)
+{
+  url_unescape_1 (s, urlchr_reserved);
+}
+
 /* The core of url_escape_* functions.  Escapes the characters that
    match the provided mask in urlchr_table.
 
diff --git a/src/url.h b/src/url.h
index b1c46c1..a543f3d 100644
--- a/src/url.h
+++ b/src/url.h
@@ -106,6 +106,7 @@ struct url
 char *url_escape (const char *);
 char *url_escape_unsafe_and_reserved (const char *);
 void url_unescape (char *);
+void url_unescape_except_reserved (char *);
 
 struct url *url_parse (const char *, int *, struct iri *iri, bool percent_encode);
 char *url_error (const char *, int);
-- 
1.9.1

>From 2e295c70bd89f2d40380404ae7a2745a7b0c6075 Mon Sep 17 00:00:00 2001
From: Ander Juaristi <[email protected]>
Date: Mon, 20 Apr 2015 23:16:18 +0200
Subject: [PATCH 2/2] Make sure Wget does not unescape reserved chars.

* testenv/Test-reserved-chars.py: New file.

When following redirections, Wget should not unescape the reserved
characters that might appear in target URLs.
---
 testenv/Makefile.am            |  3 ++-
 testenv/Test-reserved-chars.py | 59 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 61 insertions(+), 1 deletion(-)
 create mode 100755 testenv/Test-reserved-chars.py

diff --git a/testenv/Makefile.am b/testenv/Makefile.am
index a4e0352..9acf0f3 100644
--- a/testenv/Makefile.am
+++ b/testenv/Makefile.am
@@ -53,7 +53,8 @@ if HAVE_PYTHON3
     Test-Post.py                                    \
     Test-504.py                                     \
     Test--spider-r.py                               \
-    Test-redirect-crash.py
+    Test-redirect-crash.py                          \
+    Test-reserved-chars.py
 
   # added test cases expected to fail here and under TESTS
   XFAIL_TESTS =
diff --git a/testenv/Test-reserved-chars.py b/testenv/Test-reserved-chars.py
new file mode 100755
index 0000000..5ef366a
--- /dev/null
+++ b/testenv/Test-reserved-chars.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+from sys import exit
+from os import environ # to set LC_ALL
+from test.http_test import HTTPTest
+from misc.wget_file import WgetFile
+
+"""
+This test ensures that Wget keeps reserved characters in URLs in non-UTF-8 charsets.
+"""
+# This bug only happened with ASCII charset,
+# so we need to set LC_ALL="C" in order to reproduce it.
+environ["LC_ALL"] = "C"
+
+TEST_NAME = "URLs with reserved characters"
+######### File Definitions #########
+RequestList = [
+    [
+        "HEAD /base.html",
+        "GET /base.html",
+        "GET /robots.txt",
+        "HEAD /a%2Bb.html",
+        "GET /a%2Bb.html"
+    ]
+]
+A_File_Name = "base.html"
+B_File_Name = "a%2Bb.html"
+A_File = WgetFile (A_File_Name, "<a href=\"a%2Bb.html\">")
+B_File = WgetFile (B_File_Name, "this is file B")
+
+WGET_OPTIONS = " --spider -r"
+WGET_URLS = [[A_File_Name]]
+
+Files = [[A_File, B_File]]
+
+ExpectedReturnCode = 0
+ExpectedDownloadedFiles = []
+
+######### Pre and Post Test Hooks #########
+pre_test = {
+    "ServerFiles"   : Files
+}
+test_options = {
+    "WgetCommands"      : WGET_OPTIONS,
+    "Urls"              : WGET_URLS
+}
+post_test = {
+    "ExpectedFiles"     : ExpectedDownloadedFiles,
+    "ExpectedRetcode"   : ExpectedReturnCode,
+    "FilesCrawled"      : RequestList
+}
+
+err = HTTPTest (
+                name=TEST_NAME,
+                pre_hook=pre_test,
+                test_params=test_options,
+                post_hook=post_test
+).begin ()
+
+exit (err)
-- 
1.9.1

Reply via email to