> On 19 Feb 2023, at 21:17, Maxim Dounin <mdou...@mdounin.ru> wrote: > > Hello! > > On Fri, Feb 17, 2023 at 06:38:54PM +0400, Sergey Kandaurov wrote: > >>> On 13 Jan 2023, at 01:35, Maxim Dounin <mdou...@mdounin.ru> wrote: >>> >>> # HG changeset patch >>> # User Maxim Dounin <mdou...@mdounin.ru> >>> # Date 1673548890 -10800 >>> # Thu Jan 12 21:41:30 2023 +0300 >>> # Node ID 60d845f9505fe1b97c1e04b680523b790e29fdb1 >>> # Parent 07b0bee87f32be91a33210bc06973e07c4c1dac9 >>> Win32: non-ASCII names support in autoindex (ticket #458). >>> >>> Notably, ngx_open_dir() now supports opening directories with non-ASCII >>> characters, and directory entries returned by ngx_read_dir() are properly >>> converted to UTF-8. >>> >>> diff -r 07b0bee87f32 -r 60d845f9505f src/os/win32/ngx_files.c >>> --- a/src/os/win32/ngx_files.c Wed Dec 21 14:53:27 2022 +0300 >>> +++ b/src/os/win32/ngx_files.c Thu Jan 12 21:41:30 2023 +0300 >>> @@ -13,7 +13,11 @@ >>> >>> static ngx_int_t ngx_win32_check_filename(u_char *name, u_short *u, >>> size_t len); >>> -static u_short *ngx_utf8_to_utf16(u_short *utf16, u_char *utf8, size_t >>> *len); >>> +static u_short *ngx_utf8_to_utf16(u_short *utf16, u_char *utf8, size_t >>> *len, >>> + size_t reserved); >>> +static u_char *ngx_utf16_to_utf8(u_char *utf8, u_short *utf16, size_t *len, >>> + size_t *allocated); >>> +uint32_t ngx_utf16_decode(u_short **u, size_t n); >>> >>> >>> /* FILE_FLAG_BACKUP_SEMANTICS allows to obtain a handle to a directory */ >>> @@ -28,7 +32,7 @@ ngx_open_file(u_char *name, u_long mode, >>> u_short utf16[NGX_UTF16_BUFLEN]; >>> >>> len = NGX_UTF16_BUFLEN; >>> - u = ngx_utf8_to_utf16(utf16, name, &len); >>> + u = ngx_utf8_to_utf16(utf16, name, &len, 0); >>> >>> if (u == NULL) { >>> return INVALID_HANDLE_VALUE; >>> @@ -269,7 +273,7 @@ ngx_file_info(u_char *file, ngx_file_inf >>> >>> len = NGX_UTF16_BUFLEN; >>> >>> - u = ngx_utf8_to_utf16(utf16, file, &len); >>> + u = ngx_utf8_to_utf16(utf16, file, &len, 0); >>> >>> if (u == NULL) { >>> return NGX_FILE_ERROR; >>> @@ -427,49 +431,51 @@ ngx_realpath(u_char *path, u_char *resol >>> ngx_int_t >>> ngx_open_dir(ngx_str_t *name, ngx_dir_t *dir) >>> { >>> - u_char *pattern, *p; >>> + size_t len; >>> + u_short *u, *p; >>> ngx_err_t err; >>> + u_short utf16[NGX_UTF16_BUFLEN]; >>> >>> - pattern = malloc(name->len + 3); >>> - if (pattern == NULL) { >>> + len = NGX_UTF16_BUFLEN - 2; >>> + u = ngx_utf8_to_utf16(utf16, name->data, &len, 2); >>> + >>> + if (u == NULL) { >>> return NGX_ERROR; >>> } >>> >>> - p = ngx_cpymem(pattern, name->data, name->len); >>> + if (ngx_win32_check_filename(name->data, u, len) != NGX_OK) { >>> + goto failed; >>> + } >>> + >>> + p = &u[len - 1]; >>> >>> *p++ = '/'; >>> *p++ = '*'; >>> *p = '\0'; >>> >>> - dir->dir = FindFirstFile((const char *) pattern, &dir->finddata); >>> + dir->dir = FindFirstFileW(u, &dir->finddata); >>> >>> if (dir->dir == INVALID_HANDLE_VALUE) { >>> - err = ngx_errno; >>> - ngx_free(pattern); >>> - ngx_set_errno(err); >>> - return NGX_ERROR; >>> + goto failed; >>> } >>> >>> - ngx_free(pattern); >>> + if (u != utf16) { >>> + ngx_free(u); >>> + } >>> >>> dir->valid_info = 1; >>> dir->ready = 1; >>> + dir->name = NULL; >>> + dir->allocated = 0; >>> >>> return NGX_OK; >>> -} >>> >>> +failed: >>> >>> -ngx_int_t >>> -ngx_read_dir(ngx_dir_t *dir) >>> -{ >>> - if (dir->ready) { >>> - dir->ready = 0; >>> - return NGX_OK; >>> - } >>> - >>> - if (FindNextFile(dir->dir, &dir->finddata) != 0) { >>> - dir->type = 1; >>> - return NGX_OK; >>> + if (u != utf16) { >>> + err = ngx_errno; >>> + ngx_free(u); >>> + ngx_set_errno(err); >>> } >>> >>> return NGX_ERROR; >>> @@ -477,8 +483,56 @@ ngx_read_dir(ngx_dir_t *dir) >>> >>> >>> ngx_int_t >>> +ngx_read_dir(ngx_dir_t *dir) >>> +{ >>> + u_char *name; >>> + size_t len, allocated; >>> + >>> + if (dir->ready) { >>> + dir->ready = 0; >>> + goto convert; >>> + } >>> + >>> + if (FindNextFileW(dir->dir, &dir->finddata) != 0) { >>> + dir->type = 1; >>> + goto convert; >>> + } >>> + >>> + return NGX_ERROR; >>> + >>> +convert: >>> + >>> + name = dir->name; >>> + len = dir->allocated; >>> + >>> + name = ngx_utf16_to_utf8(name, dir->finddata.cFileName, &len, >>> &allocated); >>> + if (name == NULL) { >> >> Note that you just return on NULL, but see below. >> >>> + return NGX_ERROR; >>> + } >>> + >>> + if (name != dir->name) { >>> + >>> + if (dir->name) { >>> + ngx_free(dir->name); >>> + } >>> + >>> + dir->name = name; >>> + dir->allocated = allocated; >>> + } >>> + >>> + dir->namelen = len - 1; >>> + >>> + return NGX_OK; >>> +} >>> + >>> + >>> +ngx_int_t >>> ngx_close_dir(ngx_dir_t *dir) >>> { >>> + if (dir->name) { >>> + ngx_free(dir->name); >>> + } >>> + >>> if (FindClose(dir->dir) == 0) { >>> return NGX_ERROR; >>> } >>> @@ -816,7 +870,7 @@ failed: >>> >>> >>> static u_short * >>> -ngx_utf8_to_utf16(u_short *utf16, u_char *utf8, size_t *len) >>> +ngx_utf8_to_utf16(u_short *utf16, u_char *utf8, size_t *len, size_t >>> reserved) >>> { >>> u_char *p; >>> u_short *u, *last; >>> @@ -865,7 +919,7 @@ ngx_utf8_to_utf16(u_short *utf16, u_char >>> >>> /* the given buffer is not enough, allocate a new one */ >>> >>> - u = malloc(((p - utf8) + ngx_strlen(p) + 1) * sizeof(u_short)); >>> + u = malloc(((p - utf8) + ngx_strlen(p) + 1 + reserved) * >>> sizeof(u_short)); >>> if (u == NULL) { >>> return NULL; >>> } >>> @@ -910,3 +964,170 @@ ngx_utf8_to_utf16(u_short *utf16, u_char >>> >>> /* unreachable */ >>> } >>> + >>> + >>> +static u_char * >>> +ngx_utf16_to_utf8(u_char *utf8, u_short *utf16, size_t *len, size_t >>> *allocated) >>> +{ >>> + u_char *p, *last; >>> + u_short *u, *j; >>> + uint32_t n; >>> + >>> + u = utf16; >>> + p = utf8; >>> + last = utf8 + *len; >>> + >>> + while (p < last) { >>> + >>> + if (*u < 0x80) { >>> + *p++ = (u_char) *u; >>> + >>> + if (*u == 0) { >>> + *len = p - utf8; >>> + return utf8; >>> + } >>> + >>> + u++; >>> + >>> + continue; >>> + } >>> + >>> + if (p >= last - 4) { >>> + *len = p - utf8; >>> + break; >>> + } >>> + >>> + n = ngx_utf16_decode(&u, 2); >>> + >>> + if (n > 0x10ffff) { >>> + ngx_free(utf8); >>> + ngx_set_errno(NGX_EILSEQ); >> >> This can result in double-free on a subsequent ngx_close_dir() call. > > Thanks for catching, that's slipped in from the second loop. > Removed ngx_free() here.
I believe it should be fine now. > >> >>> + return NULL; >>> + } >>> + >>> + if (n >= 0x10000) { >>> + *p++ = (u_char) (0xf0 + (n >> 18)); >>> + *p++ = (u_char) (0x80 + ((n >> 12) & 0x3f)); >>> + *p++ = (u_char) (0x80 + ((n >> 6) & 0x3f)); >>> + *p++ = (u_char) (0x80 + (n & 0x3f)); >>> + continue; >>> + >> >> stray ws-only line (one more below) > > Fixed both, thanks. > >>> + } >>> + >>> + if (n >= 0x0800) { >>> + *p++ = (u_char) (0xe0 + (n >> 12)); >>> + *p++ = (u_char) (0x80 + ((n >> 6) & 0x3f)); >>> + *p++ = (u_char) (0x80 + (n & 0x3f)); >>> + continue; >>> + } >>> + >>> + *p++ = (u_char) (0xc0 + (n >> 6)); >>> + *p++ = (u_char) (0x80 + (n & 0x3f)); >>> + } >>> + >>> + /* the given buffer is not enough, allocate a new one */ >>> + >>> + for (j = u; *j; j++) { /* void */ } >>> + >>> + p = malloc((j - utf16) * 4 + 1); >> >> [Note that pointer arithmetics is in u_short (2 bytes) units.] >> It seems you are allocating memory assuming that utf16 -> utf8 >> conversion can result in memory expansion up to 4 bytes per unit. >> At first glance that's true, because UTF-8 symbol may take 4 bytes. >> IIUC, 4-byte UTF-8 can result only from UTF-16 surrogate pairs, >> this gives 2x growth (2-paired UTF-16 to 4-bytes UTF-8). >> Otherwise, 1/2/3-byte encoded UTF-8 can result from 2-byte UTF-16, >> which gives from 1x to 3x growth per unit. >> So, "malloc((j - utf16) * 3 + 1)" should be enough there. > > That's correct, though this depends on not-directly-visible > encoding details. I would rather prefer to keep 4 here, since it > can be easily verified to be enough from the loop itself. > > I don't think that a minor optimization of memory usage by going > from 4x to 3x of the particular name is important here. Further, > extra allocated bytes are likely to be used as a preallocation for > other names. > Ok, it makes sense then. >>> + if (p == NULL) { >>> + return NULL; >>> + } >> >> Old memory is still there, allocated and referenced by dir->name, >> see above. It seems ok, it should be freed by ngx_close_dir() call. >> If we happen to change callers to not call ngx_close_dir() on error, >> this could turn into memory leaks, though. > > The ngx_close_dir() is certainly required (including for other > reasons), so this looks perfectly fine. > >>> + >>> + if (allocated) { >>> + *allocated = (j - utf16) * 4 + 1; >>> + } >>> + >>> + ngx_memcpy(p, utf8, *len); >>> + >>> + utf8 = p; >>> + p += *len; >>> + >>> + for ( ;; ) { >>> + >>> + if (*u < 0x80) { >>> + *p++ = (u_char) *u; >>> + >>> + if (*u == 0) { >>> + *len = p - utf8; >>> + return utf8; >>> + } >>> + >>> + u++; >>> + >>> + continue; >>> + } >>> + >>> + n = ngx_utf16_decode(&u, 2); >>> + >>> + if (n > 0x10ffff) { >>> + ngx_free(utf8); >>> + ngx_set_errno(NGX_EILSEQ); >>> + return NULL; >>> + } >>> + >>> + if (n >= 0x10000) { >>> + *p++ = (u_char) (0xf0 + (n >> 18)); >>> + *p++ = (u_char) (0x80 + ((n >> 12) & 0x3f)); >>> + *p++ = (u_char) (0x80 + ((n >> 6) & 0x3f)); >>> + *p++ = (u_char) (0x80 + (n & 0x3f)); >>> + continue; >>> + >>> + } >>> + >>> + if (n >= 0x0800) { >>> + *p++ = (u_char) (0xe0 + (n >> 12)); >>> + *p++ = (u_char) (0x80 + ((n >> 6) & 0x3f)); >>> + *p++ = (u_char) (0x80 + (n & 0x3f)); >>> + continue; >>> + } >>> + >>> + *p++ = (u_char) (0xc0 + (n >> 6)); >>> + *p++ = (u_char) (0x80 + (n & 0x3f)); >>> + } >>> + >>> + /* unreachable */ >>> +} >>> + >>> + >>> +/* >>> + * ngx_utf16_decode() decodes one or two UTF-16 code units >>> + * the return values: >>> + * 0x80 - 0x10ffff valid character >>> + * 0x110000 - 0xfffffffd invalid sequence >>> + * 0xfffffffe incomplete sequence >>> + * 0xffffffff error >>> + */ >>> + >>> +uint32_t >>> +ngx_utf16_decode(u_short **u, size_t n) >>> +{ >>> + uint32_t k, m; >>> + >>> + k = **u; >>> + >>> + if (k < 0xd800 || k > 0xdfff) { >>> + (*u)++; >>> + return k; >>> + } >>> + >>> + if (k > 0xdbff) { >>> + (*u)++; >>> + return 0xffffffff; >>> + } >>> + >>> + if (n < 2) { >>> + return 0xfffffffe; >>> + } >>> + >>> + (*u)++; >>> + >>> + m = *(*u)++; >>> + >>> + if (m < 0xdc00 || m > 0xdfff) { >>> + return 0xffffffff; >>> + >>> + } >>> + >>> + return 0x10000 + ((k - 0xd800) << 10) + (m - 0xdc00); >>> +} >>> diff -r 07b0bee87f32 -r 60d845f9505f src/os/win32/ngx_files.h >>> --- a/src/os/win32/ngx_files.h Wed Dec 21 14:53:27 2022 +0300 >>> +++ b/src/os/win32/ngx_files.h Thu Jan 12 21:41:30 2023 +0300 >>> @@ -30,7 +30,11 @@ typedef struct { >>> >>> typedef struct { >>> HANDLE dir; >>> - WIN32_FIND_DATA finddata; >>> + WIN32_FIND_DATAW finddata; >>> + >>> + u_char *name; >>> + size_t namelen; >>> + size_t allocated; >>> >>> unsigned valid_info:1; >>> unsigned type:1; >>> @@ -205,8 +209,8 @@ ngx_int_t ngx_close_dir(ngx_dir_t *dir); >>> #define ngx_dir_access(a) (a) >>> >>> >>> -#define ngx_de_name(dir) ((u_char *) (dir)->finddata.cFileName) >>> -#define ngx_de_namelen(dir) ngx_strlen((dir)->finddata.cFileName) >>> +#define ngx_de_name(dir) (dir)->name >>> +#define ngx_de_namelen(dir) (dir)->namelen >>> >>> ngx_int_t ngx_de_info(u_char *name, ngx_dir_t *dir); >>> #define ngx_de_info_n "dummy()" >> >> Otherwise, looks good. > > Fixes: > > diff -r d7521ae9e138 src/os/win32/ngx_files.c > --- a/src/os/win32/ngx_files.c Sat Feb 18 12:08:25 2023 +0300 > +++ b/src/os/win32/ngx_files.c Sat Feb 18 12:57:02 2023 +0300 > @@ -506,6 +506,7 @@ convert: > len = dir->allocated; > > name = ngx_utf16_to_utf8(name, dir->finddata.cFileName, &len, &allocated); > + > if (name == NULL) { > return NGX_ERROR; > } > @@ -1000,7 +1000,6 @@ ngx_utf16_to_utf8(u_char *utf8, u_short > n = ngx_utf16_decode(&u, 2); > > if (n > 0x10ffff) { > - ngx_free(utf8); > ngx_set_errno(NGX_EILSEQ); > return NULL; > } > @@ -1011,7 +1010,6 @@ ngx_utf16_to_utf8(u_char *utf8, u_short > *p++ = (u_char) (0x80 + ((n >> 6) & 0x3f)); > *p++ = (u_char) (0x80 + (n & 0x3f)); > continue; > - > } > > if (n >= 0x0800) { > @@ -1072,7 +1070,6 @@ ngx_utf16_to_utf8(u_char *utf8, u_short > *p++ = (u_char) (0x80 + ((n >> 6) & 0x3f)); > *p++ = (u_char) (0x80 + (n & 0x3f)); > continue; > - > } > > if (n >= 0x0800) { > Looks good. -- Sergey Kandaurov _______________________________________________ nginx-devel mailing list nginx-devel@nginx.org https://mailman.nginx.org/mailman/listinfo/nginx-devel