On Fri, Aug 07, 2015 at 05:13:19PM +0200, Tim Ruehsen wrote: > The solution would be something like > > if locale is UTF-8 > do not escape valid UTF-8 sequences > else > keep wget's current behavior
> If you provide patch for this we will appreciate that. OK - a first version of such a patch. This splits the restrict_control into two halves. The low control is as before. The high control is permitted by default on a Unix system with something that looks like an UTF-8 locale. For Windows the behavior is unchanged. Andries Test: fetch http://he.wikipedia.org/wiki/הרפש_.ש diff -ru wget-1.16.3/src/init.c wget-1.16.3a/src/init.c --- wget-1.16.3/src/init.c 2015-01-31 00:25:57.000000000 +0100 +++ wget-1.16.3a/src/init.c 2015-08-09 21:44:54.260215105 +0200 @@ -333,6 +333,27 @@ return -1; } + +/* Used to determine whether bytes 128-159 are OK in a filename */ +static int +have_utf8_locale() { +#if defined(WINDOWS) || defined(MSDOS) || defined(__CYGWIN__) + /* insert some test for Windows */ +#else + char *p; + + p = getenv("LC_ALL"); + if (p == NULL) + p = getenv("LC_CTYPE"); + if (p == NULL) + p = getenv("LANG"); + if (strstr(p, "UTF-8") != NULL || strstr(p, "UTF8") != NULL || + strstr(p, "utf-8") != NULL || strstr(p, "utf8") != NULL) + return true; +#endif + return false; +} + /* Reset the variables to default values. */ void defaults (void) @@ -401,6 +422,7 @@ opt.restrict_files_os = restrict_unix; #endif opt.restrict_files_ctrl = true; + opt.restrict_files_highctrl = (have_utf8_locale() ? 
false : true); opt.restrict_files_nonascii = false; opt.restrict_files_case = restrict_no_case_restriction; @@ -1466,6 +1488,7 @@ { int restrict_os = opt.restrict_files_os; int restrict_ctrl = opt.restrict_files_ctrl; + int restrict_highctrl = opt.restrict_files_highctrl; int restrict_case = opt.restrict_files_case; int restrict_nonascii = opt.restrict_files_nonascii; @@ -1488,7 +1511,7 @@ else if (VAL_IS ("uppercase")) restrict_case = restrict_uppercase; else if (VAL_IS ("nocontrol")) - restrict_ctrl = false; + restrict_ctrl = restrict_highctrl = false; else if (VAL_IS ("ascii")) restrict_nonascii = true; else @@ -1509,6 +1532,7 @@ opt.restrict_files_os = restrict_os; opt.restrict_files_ctrl = restrict_ctrl; + opt.restrict_files_highctrl = restrict_highctrl; opt.restrict_files_case = restrict_case; opt.restrict_files_nonascii = restrict_nonascii; diff -ru wget-1.16.3/src/options.h wget-1.16.3a/src/options.h --- wget-1.16.3/src/options.h 2015-01-31 00:25:57.000000000 +0100 +++ wget-1.16.3a/src/options.h 2015-08-09 21:22:35.984186065 +0200 @@ -244,6 +244,7 @@ bool restrict_files_ctrl; /* non-zero if control chars in URLs are restricted from appearing in generated file names. */ + bool restrict_files_highctrl; /* idem for bytes 128-159 */ bool restrict_files_nonascii; /* non-zero if bytes with values greater than 127 are restricted. */ enum { diff -ru wget-1.16.3/src/url.c wget-1.16.3a/src/url.c --- wget-1.16.3/src/url.c 2015-02-23 16:10:22.000000000 +0100 +++ wget-1.16.3a/src/url.c 2015-08-09 21:14:34.876175626 +0200 @@ -1329,7 +1329,8 @@ enum { filechr_not_unix = 1, /* unusable on Unix, / and \0 */ filechr_not_windows = 2, /* unusable on Windows, one of \|/<>?:*" */ - filechr_control = 4 /* a control character, e.g. 0-31 */ + filechr_control = 4, /* a control character, e.g. 
0-31 */ + filechr_highcontrol = 8 /* a high control character, in 128-159 */ }; #define FILE_CHAR_TEST(c, mask) \ @@ -1340,6 +1341,7 @@ #define U filechr_not_unix #define W filechr_not_windows #define C filechr_control +#define Z filechr_highcontrol #define UW U|W #define UWC U|W|C @@ -1370,8 +1372,8 @@ 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */ 0, 0, 0, 0, W, 0, 0, C, /* x y z { | } ~ DEL */ - C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, /* 128-143 */ - C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, /* 144-159 */ + Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, /* 128-143 */ + Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, /* 144-159 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -1383,6 +1385,7 @@ #undef U #undef W #undef C +#undef Z #undef UW #undef UWC @@ -1417,8 +1420,11 @@ mask = filechr_not_unix; else mask = filechr_not_windows; + if (opt.restrict_files_ctrl) mask |= filechr_control; + if (opt.restrict_files_highctrl) + mask |= filechr_highcontrol; /* Copy [b, e) to PATHEL and URL-unescape it. */ if (escaped)
