Hello,
I have created a small patch for wget 1.15. It is related to the
experimental --content-disposition option. The patch is attached.

This patch ensures that if the filename parameter of the Content-Disposition
header is url-encoded as described in RFC 2231, then wget decodes it (the
character set is ignored).

Besides the --content-disposition option, you may also need to use the
--restrict-file-names=nocontrol option to prevent wget from escaping
some characters of the filename.

I have created a testing URL:
http://www.vladpride.cz/res/content-disposition-with-unicode.php

The source code of the PHP script at that URL is as follows:

<?php
// Test endpoint: sends a plain-text body together with a Content-Disposition
// header whose filename uses the RFC 2231 extended form (filename*=charset''value).
$str = 'aácč dďsš zžrř iínň';
header('Content-Type: text/plain; charset=UTF-8');
// Keep the whole header on one line: header() rejects values that contain a
// newline. rawurlencode() percent-encodes the non-ASCII bytes and the spaces.
header("Content-Disposition: attachment; filename*=UTF-8''" . rawurlencode($str . '.txt'));
echo $str;
?>

You can use the following command to test it:

wget --content-disposition --restrict-file-names=nocontrol "http://www.vladpride.cz/res/content-disposition-with-unicode.php";

Using the above command, my patched version of wget correctly saves the
file as:

aácč dďsš zžrř iínň.txt

Using the same command with unpatched wget, the file is incorrectly saved
as:

a%C3%A1c%C4%8D%20d%C4%8Fs%C5%A1%20z%C5%BEr%C5%99%20i%C3%ADn%C5%88.txt

It would be great if my patch could be incorporated into the next release
of wget.

I have created the patch file with the following command:

diff -rupN wget-1.15/src/ wget-1.15-custom/src/ > content-disposition.patch

You can apply the patch with the following command, run from inside the
extracted wget-1.15 source directory (with the patch file saved in its parent
directory):

patch -p1 < ../content-disposition.patch

Then the output will be like this:

patching file src/http.c
patching file src/http.h
patching file src/url.c
patching file src/url.h

Note that on my system (Ubuntu 12.04), I had to install the libgnutls-dev
package to be able to compile wget.

Cheers,
Vlad
diff -rupN wget-1.15/src/http.c wget-1.15-custom/src/http.c
--- wget-1.15/src/http.c	2014-01-07 15:58:49.000000000 +0100
+++ wget-1.15-custom/src/http.c	2014-02-12 08:27:28.713919909 +0100
@@ -1066,7 +1066,20 @@ bool
 extract_param (const char **source, param_token *name, param_token *value,
                char separator)
 {
+  return extract_param_new (source, name, value, separator, (bool *) 0);
+}
+
+/* Like extract_param, but with the addition of parameter is_url_encoded that is set to true if the value is url-encoded (see RFC 2231 for details). */
+
+bool
+extract_param_new (const char **source, param_token *name, param_token *value,
+               char separator, bool *is_url_encoded)
+{
   const char *p = *source;
+  if (is_url_encoded)
+    {
+      *is_url_encoded = false;
+    }
 
   while (c_isspace (*p)) ++p;
   if (!*p)
@@ -1125,6 +1138,9 @@ extract_param (const char **source, para
   int param_type = modify_param_name(name);
   if (NOT_RFC2231 != param_type)
     {
+      if (RFC2231_ENCODING == param_type && is_url_encoded) {
+        *is_url_encoded = true;
+      }
       modify_param_value(value, param_type);
     }
   return true;
@@ -1137,13 +1153,17 @@ extract_param (const char **source, para
 /* Appends the string represented by VALUE to FILENAME */
 
 static void
-append_value_to_filename (char **filename, param_token const * const value)
+append_value_to_filename (char **filename, param_token const * const value, bool is_url_encoded)
 {
   int original_length = strlen(*filename);
   int new_length = strlen(*filename) + (value->e - value->b);
   *filename = xrealloc (*filename, new_length+1);
   memcpy (*filename + original_length, value->b, (value->e - value->b)); 
   (*filename)[new_length] = '\0';
+  if (is_url_encoded)
+    {
+      url_unescape(*filename + original_length);
+    }
 }
 
 #undef MAX
@@ -1176,7 +1196,8 @@ parse_content_disposition (const char *h
 {
   param_token name, value;
   *filename = NULL;
-  while (extract_param (&hdr, &name, &value, ';'))
+  bool is_url_encoded;
+  for (is_url_encoded = false; extract_param_new (&hdr, &name, &value, ';', &is_url_encoded); is_url_encoded = false)
     {
       int isFilename = BOUNDED_EQUAL_NO_CASE ( name.b, name.e, "filename" );
       if ( isFilename && value.b != NULL)
@@ -1192,9 +1213,15 @@ parse_content_disposition (const char *h
             continue;
 
           if (*filename)
-            append_value_to_filename (filename, &value);
+            append_value_to_filename (filename, &value, is_url_encoded);
           else
-            *filename = strdupdelim (value.b, value.e);
+            {
+              *filename = strdupdelim (value.b, value.e);
+              if (is_url_encoded)
+                {
+                  url_unescape (*filename);
+                }
+            }
         }
     }
 
diff -rupN wget-1.15/src/http.h wget-1.15-custom/src/http.h
--- wget-1.15/src/http.h	2014-01-04 13:49:47.000000000 +0100
+++ wget-1.15-custom/src/http.h	2014-02-12 08:27:28.725919927 +0100
@@ -44,6 +44,7 @@ typedef struct {
   const char *b, *e;
 } param_token;
 bool extract_param (const char **, param_token *, param_token *, char);
+bool extract_param_new (const char **, param_token *, param_token *, char, bool *);
 
 
 #endif /* HTTP_H */
diff -rupN wget-1.15/src/url.c wget-1.15-custom/src/url.c
--- wget-1.15/src/url.c	2014-01-04 13:49:47.000000000 +0100
+++ wget-1.15-custom/src/url.c	2014-02-12 08:27:28.725919927 +0100
@@ -169,7 +169,7 @@ static const unsigned char urlchr_table[
    The transformation is done in place.  If you need the original
    string intact, make a copy before calling this function.  */
 
-static void
+void
 url_unescape (char *s)
 {
   char *t = s;                  /* t - tortoise */
diff -rupN wget-1.15/src/url.h wget-1.15-custom/src/url.h
--- wget-1.15/src/url.h	2013-10-21 16:50:12.000000000 +0200
+++ wget-1.15-custom/src/url.h	2014-02-12 08:27:28.725919927 +0100
@@ -100,6 +100,7 @@ struct url
 /* Function declarations */
 
 char *url_escape (const char *);
+void url_unescape (char *);
 char *url_escape_unsafe_and_reserved (const char *);
 
 struct url *url_parse (const char *, int *, struct iri *iri, bool percent_encode);

Reply via email to