Hello,
I have created a small patch for wget 1.15. It is related to the
experimental --content-disposition option. The patch is attached.
This patch ensures that if the filename parameter of Content-Disposition
header is url-encoded as described in RFC 2231, then wget decodes it (the
character set is ignored).
Besides using the --content-disposition option, there may be also needed to
use the --restrict-file-names=nocontrol option to avoid wget from escaping
some characters of the filename.
I have created a testing URL:
http://www.vladpride.cz/res/content-disposition-with-unicode.php
The source code of the PHP script at that URL is as follows:
<?php
$str = 'aácč dďsš zžrř iínň';
header('Content-Type: text/plain; charset=UTF-8');
header("Content-Disposition: attachment;
filename*=UTF-8''".rawurlencode($str.'.txt'));
echo $str;
?>
You can use the following command to test it:
wget --content-disposition --restrict-file-names=nocontrol "
http://www.vladpride.cz/res/content-disposition-with-unicode.php"
Using the above command, my patched version of wget correctly saves the
file as:
aácč dďsš zžrř iínň.txt
Using the same command with unpatched wget, the file is incorrectly saved
as:
a%C3%A1c%C4%8D%20d%C4%8Fs%C5%A1%20z%C5%BEr%C5%99%20i%C3%ADn%C5%88.txt
It would be great if my patch could be incorporated into the next release
of wget.
I have created the patch file with the following command:
diff -rupN wget-1.15/src/ wget-1.15-custom/src/ > content-disposition.patch
You can apply the patch with the following command, while in the directory
where the source code tarball was extracted to:
patch -p1 < ../content-disposition.patch
Then the output will be like this:
patching file src/http.c
patching file src/http.h
patching file src/url.c
patching file src/url.h
Note that in my system, Ubuntu 12.04, I had to install package
libgnutls-dev to be able to compile wget.
Cheers,
Vlad
diff -rupN wget-1.15/src/http.c wget-1.15-custom/src/http.c
--- wget-1.15/src/http.c 2014-01-07 15:58:49.000000000 +0100
+++ wget-1.15-custom/src/http.c 2014-02-12 08:27:28.713919909 +0100
@@ -1066,7 +1066,20 @@ bool
extract_param (const char **source, param_token *name, param_token *value,
char separator)
{
+ return extract_param_new (source, name, value, separator, (bool *) 0);
+}
+
+/* Like extract_param, but with the addition of parameter is_url_encoded that is set to true if the value is url-encoded (see RFC 2231 for details). */
+
+bool
+extract_param_new (const char **source, param_token *name, param_token *value,
+ char separator, bool *is_url_encoded)
+{
const char *p = *source;
+ if (is_url_encoded)
+ {
+ *is_url_encoded = false;
+ }
while (c_isspace (*p)) ++p;
if (!*p)
@@ -1125,6 +1138,9 @@ extract_param (const char **source, para
int param_type = modify_param_name(name);
if (NOT_RFC2231 != param_type)
{
+ if (RFC2231_ENCODING == param_type && is_url_encoded) {
+ *is_url_encoded = true;
+ }
modify_param_value(value, param_type);
}
return true;
@@ -1137,13 +1153,17 @@ extract_param (const char **source, para
/* Appends the string represented by VALUE to FILENAME */
static void
-append_value_to_filename (char **filename, param_token const * const value)
+append_value_to_filename (char **filename, param_token const * const value, bool is_url_encoded)
{
int original_length = strlen(*filename);
int new_length = strlen(*filename) + (value->e - value->b);
*filename = xrealloc (*filename, new_length+1);
memcpy (*filename + original_length, value->b, (value->e - value->b));
(*filename)[new_length] = '\0';
+ if (is_url_encoded)
+ {
+ url_unescape(*filename + original_length);
+ }
}
#undef MAX
@@ -1176,7 +1196,8 @@ parse_content_disposition (const char *h
{
param_token name, value;
*filename = NULL;
- while (extract_param (&hdr, &name, &value, ';'))
+ bool is_url_encoded;
+ for (is_url_encoded = false; extract_param_new (&hdr, &name, &value, ';', &is_url_encoded); is_url_encoded = false)
{
int isFilename = BOUNDED_EQUAL_NO_CASE ( name.b, name.e, "filename" );
if ( isFilename && value.b != NULL)
@@ -1192,9 +1213,15 @@ parse_content_disposition (const char *h
continue;
if (*filename)
- append_value_to_filename (filename, &value);
+ append_value_to_filename (filename, &value, is_url_encoded);
else
- *filename = strdupdelim (value.b, value.e);
+ {
+ *filename = strdupdelim (value.b, value.e);
+ if (is_url_encoded)
+ {
+ url_unescape (*filename);
+ }
+ }
}
}
diff -rupN wget-1.15/src/http.h wget-1.15-custom/src/http.h
--- wget-1.15/src/http.h 2014-01-04 13:49:47.000000000 +0100
+++ wget-1.15-custom/src/http.h 2014-02-12 08:27:28.725919927 +0100
@@ -44,6 +44,7 @@ typedef struct {
const char *b, *e;
} param_token;
bool extract_param (const char **, param_token *, param_token *, char);
+bool extract_param_new (const char **, param_token *, param_token *, char, bool *);
#endif /* HTTP_H */
diff -rupN wget-1.15/src/url.c wget-1.15-custom/src/url.c
--- wget-1.15/src/url.c 2014-01-04 13:49:47.000000000 +0100
+++ wget-1.15-custom/src/url.c 2014-02-12 08:27:28.725919927 +0100
@@ -169,7 +169,7 @@ static const unsigned char urlchr_table[
The transformation is done in place. If you need the original
string intact, make a copy before calling this function. */
-static void
+void
url_unescape (char *s)
{
char *t = s; /* t - tortoise */
diff -rupN wget-1.15/src/url.h wget-1.15-custom/src/url.h
--- wget-1.15/src/url.h 2013-10-21 16:50:12.000000000 +0200
+++ wget-1.15-custom/src/url.h 2014-02-12 08:27:28.725919927 +0100
@@ -100,6 +100,7 @@ struct url
/* Function declarations */
char *url_escape (const char *);
+void url_unescape (char *);
char *url_escape_unsafe_and_reserved (const char *);
struct url *url_parse (const char *, int *, struct iri *iri, bool percent_encode);