Hi! I've been using the first version of wget for a long time, and first of all I want to thank all of the maintainers and contributors of this project!
I was looking at the code recently and found that it doesn't support the "<meta charset=...>" tag yet. I don't see any issues in the bug tracker related to this, so I created a patch. I hope it helps. I have also attached two HTML files for verification. One of them specifies a Japanese path in UTF-8, the other in Shift-JIS. Serve these files on localhost:8080, and let wget follow the link (e.g. `wget -d --recursive --level=2 http://localhost:8080/charset_test_shift_jis.html`). Verify that in both cases, wget tries to download http://localhost:8080/%E6%97%A5%E6%9C%AC%E8%AA%9E.html. Thanks! Sho Amano --- src/html-url.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/html-url.c b/src/html-url.c index b80cf269..5324d244 100644 --- a/src/html-url.c +++ b/src/html-url.c @@ -182,6 +182,7 @@ static const char *additional_attributes[] = { "http-equiv", /* used by tag_handle_meta */ "name", /* used by tag_handle_meta */ "content", /* used by tag_handle_meta */ + "charset", /* used by tag_handle_meta */ "action", /* used by tag_handle_form */ "style", /* used by check_style_attr */ "srcset", /* used by tag_handle_img */ @@ -191,7 +192,7 @@ static struct hash_table *interesting_tags; static struct hash_table *interesting_attributes; /* Will contains the (last) charset found in 'http-equiv=content-type' - meta tags */ + or 'charset' meta tags */ static char *meta_charset; static void @@ -574,6 +575,7 @@ tag_handle_meta (int tagid _GL_UNUSED, struct taginfo *tag, struct map_context * { char *name = find_attr (tag, "name", NULL); char *http_equiv = find_attr (tag, "http-equiv", NULL); + char *charset = find_attr (tag, "charset", NULL); if (http_equiv && 0 == c_strcasecmp (http_equiv, "refresh")) { @@ -673,6 +675,20 @@ tag_handle_meta (int tagid _GL_UNUSED, struct taginfo *tag, struct map_context * } } } + else if (charset) + { + /* Handle stuff like: + <meta charset="CHARSET"> + If charset is acquired from http-equiv then it is overwritten. 
*/ + + /* Do a minimum check on the charset value */ + if (check_encoding_name (charset)) + { + char *mcharset = xstrdup (charset); + xfree (meta_charset); + meta_charset = mcharset; + } + } } /* Handle the IMG tag. This requires special handling for the srcset attr, -- 2.17.1
[1]Path in Japanese References 1. http://localhost:8080/日本語.html
[1]Path in Japanese References 1. http://localhost:8080/譌・譛ャ隱�.html
