Hi Andries, thanks for your work.
Just a few questions. 1. Why don't you use 'opt.locale' to check whether the locale's encoding is UTF-8? 2. I don't understand how you distinguish between illegal and legal UTF-8 sequences. I guess only legal sequences should be left unescaped. Or, to make it easy: if the string is valid UTF-8, do not escape it. If it is not valid UTF-8, escape it. You could: add unistr/u8-check to bootstrap.conf (and run ./bootstrap thereafter), add #include "unistr.h", and use 'if (u8_check (s, strlen(s)) == 0)' to test for validity. Regards, Tim On Sunday 09 August 2015 22:08:34 Andries E. Brouwer wrote: > On Fri, Aug 07, 2015 at 05:13:19PM +0200, Tim Ruehsen wrote: > > The solution would something like > > > > if locale is UTF-8 > > > > do not escape valid UTF-8 sequences > > > > else > > > > keep wget's current behavior > > > > If you provide patch for this we will appreciate that. > > OK - a first version of such a patch. > This splits the restrict_control into two halves. > The low control is as before. > The high control is permitted by default on a Unix system > with something that looks like an UTF-8 locale. > For Windows the behavior is unchanged. 
> > Andries > > Test: fetch http://he.wikipedia.org/wiki/הרפש_.ש > > > diff -ru wget-1.16.3/src/init.c wget-1.16.3a/src/init.c > --- wget-1.16.3/src/init.c 2015-01-31 00:25:57.000000000 +0100 > +++ wget-1.16.3a/src/init.c 2015-08-09 21:44:54.260215105 +0200 > @@ -333,6 +333,27 @@ > return -1; > } > > + > +/* Used to determine whether bytes 128-159 are OK in a filename */ > +static int > +have_utf8_locale() { > +#if defined(WINDOWS) || defined(MSDOS) || defined(__CYGWIN__) > + /* insert some test for Windows */ > +#else > + char *p; > + > + p = getenv("LC_ALL"); > + if (p == NULL) > + p = getenv("LC_CTYPE"); > + if (p == NULL) > + p = getenv("LANG"); > + if (strstr(p, "UTF-8") != NULL || strstr(p, "UTF8") != NULL || > + strstr(p, "utf-8") != NULL || strstr(p, "utf8") != NULL) > + return true; > +#endif > + return false; > +} > + > /* Reset the variables to default values. */ > void > defaults (void) > @@ -401,6 +422,7 @@ > opt.restrict_files_os = restrict_unix; > #endif > opt.restrict_files_ctrl = true; > + opt.restrict_files_highctrl = (have_utf8_locale() ? 
false : true); > opt.restrict_files_nonascii = false; > opt.restrict_files_case = restrict_no_case_restriction; > > @@ -1466,6 +1488,7 @@ > { > int restrict_os = opt.restrict_files_os; > int restrict_ctrl = opt.restrict_files_ctrl; > + int restrict_highctrl = opt.restrict_files_highctrl; > int restrict_case = opt.restrict_files_case; > int restrict_nonascii = opt.restrict_files_nonascii; > > @@ -1488,7 +1511,7 @@ > else if (VAL_IS ("uppercase")) > restrict_case = restrict_uppercase; > else if (VAL_IS ("nocontrol")) > - restrict_ctrl = false; > + restrict_ctrl = restrict_highctrl = false; > else if (VAL_IS ("ascii")) > restrict_nonascii = true; > else > @@ -1509,6 +1532,7 @@ > > opt.restrict_files_os = restrict_os; > opt.restrict_files_ctrl = restrict_ctrl; > + opt.restrict_files_highctrl = restrict_highctrl; > opt.restrict_files_case = restrict_case; > opt.restrict_files_nonascii = restrict_nonascii; > > diff -ru wget-1.16.3/src/options.h wget-1.16.3a/src/options.h > --- wget-1.16.3/src/options.h 2015-01-31 00:25:57.000000000 +0100 > +++ wget-1.16.3a/src/options.h 2015-08-09 21:22:35.984186065 +0200 > @@ -244,6 +244,7 @@ > bool restrict_files_ctrl; /* non-zero if control chars in URLs > are restricted from appearing in > generated file names. */ > + bool restrict_files_highctrl; /* idem for bytes 128-159 */ > bool restrict_files_nonascii; /* non-zero if bytes with values greater > than 127 are restricted. */ > enum { > diff -ru wget-1.16.3/src/url.c wget-1.16.3a/src/url.c > --- wget-1.16.3/src/url.c 2015-02-23 16:10:22.000000000 +0100 > +++ wget-1.16.3a/src/url.c 2015-08-09 21:14:34.876175626 +0200 > @@ -1329,7 +1329,8 @@ > enum { > filechr_not_unix = 1, /* unusable on Unix, / and \0 */ > filechr_not_windows = 2, /* unusable on Windows, one of \|/<>?:*" */ > - filechr_control = 4 /* a control character, e.g. 0-31 */ + > filechr_control = 4, /* a control character, e.g. 
0-31 */ + > filechr_highcontrol = 8 /* a high control character, in 128-159 */ }; > > #define FILE_CHAR_TEST(c, mask) \ > @@ -1340,6 +1341,7 @@ > #define U filechr_not_unix > #define W filechr_not_windows > #define C filechr_control > +#define Z filechr_highcontrol > > #define UW U|W > #define UWC U|W|C > @@ -1370,8 +1372,8 @@ > 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */ > 0, 0, 0, 0, W, 0, 0, C, /* x y z { | } ~ DEL */ > > - C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, /* 128-143 */ > - C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, /* 144-159 */ > + Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, /* 128-143 */ > + Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, /* 144-159 */ > 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > > @@ -1383,6 +1385,7 @@ > #undef U > #undef W > #undef C > +#undef Z > #undef UW > #undef UWC > > @@ -1417,8 +1420,11 @@ > mask = filechr_not_unix; > else > mask = filechr_not_windows; > + > if (opt.restrict_files_ctrl) > mask |= filechr_control; > + if (opt.restrict_files_highctrl) > + mask |= filechr_highcontrol; > > /* Copy [b, e) to PATHEL and URL-unescape it. */ > if (escaped)
signature.asc
Description: This is a digitally signed message part.
