On Tue, Jun 11, 2019 at 5:38 PM William A Rowe Jr <wr...@rowe-clan.net> wrote:
> On Tue, Jun 11, 2019 at 1:44 PM Branko Čibej <br...@apache.org> wrote: > >> We either reserve about 2x buffers for file name transliteration in heap >> per thread, or we use the thread stack. As long as we trust that our utf-8 >> to ucs-2 logic is rock solid and the allocations and limits are correctly >> coded, this continues to be a safe approach. >> >> >> Apropos of that, for 2.0 we're about to or have already ditched support >> for versions of Windows that do not have native UTF-8/UTF-16 conversions >> (ah, yes ... Windows has finally moved from UCS-2 to UTF-16). Wouldn't this >> be the right time to switch to using Windows' functions instead of staying >> with our own? Especially since, with the transition to UTF-16, we have to >> deal correctly with surrogate pairs, something our current code (IIRC) >> doesn't do. >> > > A bit of a misnomer, the code is full of references to ucs-2 w/surrogate > pair > support, the combo of these is utf-16. The comments can be refreshed to > today's utf-16 nomenclature. > How strongly do we feel about the naming? I have the following patch to commit if we want to observe current convention in utf-naming and deprecate many (but not all) ucs references.
Index: file_io/win32/open.c =================================================================== --- file_io/win32/open.c (revision 1856754) +++ file_io/win32/open.c (working copy) @@ -86,7 +86,7 @@ } } - if ((rv = apr_conv_utf8_to_ucs2(srcstr, &srcremains, t, &retlen))) { + if ((rv = apr_conv_utf8_to_utf16(srcstr, &srcremains, t, &retlen))) { return (rv == APR_INCOMPLETE) ? APR_EINVAL : rv; } if (srcremains) { @@ -127,7 +127,7 @@ } } - if ((rv = apr_conv_ucs2_to_utf8(srcstr, &srcremains, t, &retlen))) { + if ((rv = apr_conv_utf16_to_utf8(srcstr, &srcremains, t, &retlen))) { return rv; } if (srcremains) { @@ -168,7 +168,7 @@ wfile = apr_palloc(pool, (r + n) * sizeof(apr_wchar_t)); wcscpy(wfile, wpre); d = n; - if (apr_conv_utf8_to_ucs2(file, &n, wfile + r, &d)) { + if (apr_conv_utf8_to_utf16(file, &n, wfile + r, &d)) { return NULL; } for (ch = wfile + r; *ch; ++ch) { Index: include/arch/win32/apr_arch_utf8.h =================================================================== --- include/arch/win32/apr_arch_utf8.h (revision 1856754) +++ include/arch/win32/apr_arch_utf8.h (working copy) @@ -27,30 +27,32 @@ /** * An APR internal function for fast utf-8 octet-encoded Unicode conversion - * to the ucs-2 wide Unicode format. This function is used for filename and - * other resource conversions for platforms providing native Unicode support. + * to the utf-16 Unicode with surrogate pairs format. This function is used + * for filename and other resource conversions for platforms providing native + * Unicode support. * * @tip Only the errors APR_EINVAL and APR_INCOMPLETE may occur, the former * when the character code is invalid (in or out of context) and the later * when more characters were expected, but insufficient characters remain. */ -APR_DECLARE(apr_status_t) apr_conv_utf8_to_ucs2(const char *in, - apr_size_t *inbytes, - apr_wchar_t *out, - apr_size_t *outwords); +APR_DECLARE(apr_status_t) apr_conv_utf8_to_utf16(const char *in, + apr_size_t *inbytes, + apr_wchar_t *out, + apr_size_t *outwords); /** - * An APR internal function for fast ucs-2 wide Unicode format conversion to - * the utf-8 octet-encoded Unicode. This function is used for filename and - * other resource conversions for platforms providing native Unicode support. + * An APR internal function for fast utf-16 Unicode including surrogate pairs + * format conversion to the utf-8 octet-encoded Unicode. This function is used + * for filename and other resource conversions for platforms providing native + * Unicode support. * * @tip Only the errors APR_EINVAL and APR_INCOMPLETE may occur, the former - * when the character code is invalid (in or out of context) and the later - * when more words were expected, but insufficient words remain. + * when the surrogate word is invalid (in or out of context) and the later + * when another surrogate word is expected, but insufficient words remain. */ -APR_DECLARE(apr_status_t) apr_conv_ucs2_to_utf8(const apr_wchar_t *in, - apr_size_t *inwords, - char *out, - apr_size_t *outbytes); +APR_DECLARE(apr_status_t) apr_conv_utf16_to_utf8(const apr_wchar_t *in, + apr_size_t *inwords, + char *out, + apr_size_t *outbytes); #endif /* def UTF8_H */ Index: misc/win32/env.c =================================================================== --- misc/win32/env.c (revision 1856754) +++ misc/win32/env.c (working copy) @@ -33,7 +33,7 @@ apr_status_t status; inchars = strlen(envvar) + 1; - status = apr_conv_utf8_to_ucs2(envvar, &inchars, buffer, &bufflen); + status = apr_conv_utf8_to_utf16(envvar, &inchars, buffer, &bufflen); if (status == APR_INCOMPLETE) status = APR_ENAMETOOLONG; @@ -82,7 +82,7 @@ inchars = wcslen(wvalue) + 1; outchars = 3 * inchars; /* Enough for any UTF-8 representation */ val = apr_palloc(pool, outchars); - status = apr_conv_ucs2_to_utf8(wvalue, &inchars, val, &outchars); + status = apr_conv_utf16_to_utf8(wvalue, &inchars, val, &outchars); if (status) return status; } @@ -139,7 +139,7 @@ outchars = inchars = strlen(value) + 1; wvalue = apr_palloc(pool, outchars * sizeof(*wvalue)); - status = apr_conv_utf8_to_ucs2(value, &inchars, wvalue, &outchars); + status = apr_conv_utf8_to_utf16(value, &inchars, wvalue, &outchars); if (status) return status; Index: misc/win32/internal.c =================================================================== --- misc/win32/internal.c (revision 1856754) +++ misc/win32/internal.c (working copy) @@ -61,8 +61,8 @@ /* This is a safe max allocation, we will realloc after * processing and return the excess to the free store. - * 3 ucs bytes hold any single wchar_t value (16 bits) - * 4 ucs bytes will hold a wchar_t pair value (20 bits) + * 3 utf-8 bytes hold any single wchar_t value (16 bits) + * 4 utf-8 bytes will hold a 2 word utf-16 surrogate pair value (20 bits) */ elesize = elesize * 3 + 1; ele = elements = apr_malloc_dbg(elesize * sizeof(char), @@ -73,7 +73,7 @@ apr_size_t newlen = elesize; newarr[arg] = ele; - (void)apr_conv_ucs2_to_utf8(arr[arg], &len, + (void)apr_conv_utf16_to_utf8(arr[arg], &len, newarr[arg], &elesize); newlen -= elesize; Index: misc/win32/start.c =================================================================== --- misc/win32/start.c (revision 1856754) +++ misc/win32/start.c (working copy) @@ -58,14 +58,14 @@ /* This is a safe max allocation, we will alloc each * string exactly after processing and return this * temporary buffer to the free store. - * 3 ucs bytes hold any single wchar_t value (16 bits) - * 4 ucs bytes will hold a wchar_t pair value (20 bits) + * 3 utf-8 bytes hold any single utf-16 value (16 bits) + * 4 utf-8 bytes will hold a 2 word utf-16 surrogate pair value (20 bits) */ newlen = totlen = wsize * 3 + 1; pstrs = strs = apr_malloc_dbg(newlen * sizeof(char), __FILE__, __LINE__); - (void)apr_conv_ucs2_to_utf8(arrsz, &wsize, strs, &newlen); + (void)apr_conv_utf16_to_utf8(arrsz, &wsize, strs, &newlen); assert(newlen && !wsize); Index: misc/win32/utf8.c =================================================================== --- misc/win32/utf8.c (revision 1856754) +++ misc/win32/utf8.c (working copy) @@ -23,28 +23,28 @@ * with particular attention to canonical translation forms (see section 10 * "Security Considerations" of the RFC for more info). * - * Since several architectures including Windows support unicode, with UCS2 + * Since several architectures including Windows support unicode, with utf-16 * used as the actual storage conventions by that archicture, these functions - * exist to transform or validate UCS2 strings into APR's 'char' type + * exist to transform or validate utf-16 strings into APR's 'char' type * convention. It is left up to the operating system to determine the * validitity of the string, e.g. normative forms, in the context of * its native language support. Other file systems which support filename * characters of 0x80-0xff but have no explicit requirement for Unicode * will find this function useful only for validating the character sequences - * and rejecting poorly encoded UTF8 sequences. + * and rejecting poorly encoded utf-8 sequences. * - * Len UCS-4 range (hex) UTF-8 octet sequence (binary) - * 1:2 00000000-0000007F 0xxxxxxx - * 2:2 00000080-000007FF 110XXXXx 10xxxxxx - * 3:2 00000800-0000FFFF 1110XXXX 10Xxxxxx 10xxxxxx - * 4:4 00010000-001FFFFF 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx - * 00200000-03FFFFFF 111110XX 10XXXxxx 10xxxxxx 10xxxxxx 10xxxxxx - * 04000000-7FFFFFFF 1111110X 10XXXXxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + * len utf-4 range (hex) utf-8 octet sequence (binary) + * 1:2 00000000-0000007F 0xxxxxxx + * 2:2 00000080-000007FF 110XXXXx 10xxxxxx + * 3:2 00000800-0000FFFF 1110XXXX 10Xxxxxx 10xxxxxx + * 4:4 00010000-001FFFFF 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx + * 00200000-03FFFFFF 111110XX 10XXXxxx 10xxxxxx 10xxxxxx 10xxxxxx + * 04000000-7FFFFFFF 1111110X 10XXXXxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx * - * One of the X bits must be 1 to avoid overlong representation of ucs2 values. + * One of the X bits must be 1 to avoid overlong representation in utf-8. * - * For conversion into ucs2, the 4th form is limited in range to 0010 FFFF, - * and the final two forms are used only by full ucs4, per RFC 3629; + * For conversion into utf-16, the 4th form is limited in range to 0010 FFFF, + * and the final two forms are used only by full ucs-4, per RFC 3629; * * "Pairs of UCS-2 values between D800 and DFFF (surrogate pairs in * Unicode parlance), being actually UCS-4 characters transformed @@ -54,24 +54,29 @@ * * From RFC2781 UTF-16: the compressed ISO 10646 encoding bitmask * - * U' = U - 0x10000 - * U' = 00000000 0000yyyy yyyyyyxx xxxxxxxx - * W1 = 110110yy yyyyyyyy - * W2 = 110111xx xxxxxxxx - * Max U' = 0000 00001111 11111111 11111111 - * Max U = 0000 00010000 11111111 11111111 + * U' = U - 0x10000 + * U' = 00000000 0000yyyy yyyyyyxx xxxxxxxx + * W1 = 110110yy yyyyyyyy + * W2 = 110111xx xxxxxxxx + * Max U' = 0000 00001111 11111111 11111111 + * Max U = 0000 00010000 11111111 11111111 * - * Len is the table above is a mapping of bytes used for utf8:ucs2 values, + * Also note ISO/IEC 10646:2014 Clause 9.4: "Because surrogate code points + * are not UCS scalar values, UTF-32 code units in the range + * 0000 D800-0000 DFFF are ill-formed" for future reference in adding any + * utf-32 accessor functions. + * + * Len is the table above is a mapping of bytes used for utf-8:utf-16 values, * which results in these conclusions of maximum allocations; * - * apr_conv_utf8_to_ucs2 out bytes:sizeof(in) * 1 <= Req <= sizeof(in) * 2 - * apr_conv_ucs2_to_utf8 out words:sizeof(in) / 2 <= Req <= sizeof(in) * 3 / 2 + * apr_conv_utf8_to_utf16 out bytes:sizeof(in) * 1 <= Req <= sizeof(in) * 2 + * apr_conv_utf16_to_utf8 out words:sizeof(in) / 2 <= Req <= sizeof(in) * 3 / 2 */ -APR_DECLARE(apr_status_t) apr_conv_utf8_to_ucs2(const char *in, - apr_size_t *inbytes, - apr_wchar_t *out, - apr_size_t *outwords) +APR_DECLARE(apr_status_t) apr_conv_utf8_to_utf16(const char *in, + apr_size_t *inbytes, + apr_wchar_t *out, + apr_size_t *outwords) { apr_int64_t newch, mask; apr_size_t expect, eating; @@ -105,7 +110,7 @@ expect = 1; while ((ch & mask) == mask) { mask |= mask >> 1; - if (++expect > 3) /* (truly 5 for ucs-4) */ + if (++expect > 3) /* (or 5 for a ucs-4 code point) */ return APR_EINVAL; } newch = ch & ~mask; @@ -125,8 +130,7 @@ if (!newch && !((unsigned char)*in & 0077 & (mask << 1))) return APR_EINVAL; if (expect == 2) { - /* Reject values D800-DFFF when not utf16 encoded - * (may not be an appropriate restriction for ucs-4) + /* Reject values D800-DFFF when not utf-16 encoded */ if (newch == 0015 && ((unsigned char)*in & 0040)) return APR_EINVAL; @@ -156,7 +160,7 @@ *inbytes -= eating; /* newch is now a true ucs-4 character * - * now we need to fold to ucs-2 + * now we need to fold to utf-16 */ if (newch < 0x10000) { @@ -179,10 +183,10 @@ return APR_SUCCESS; } -APR_DECLARE(apr_status_t) apr_conv_ucs2_to_utf8(const apr_wchar_t *in, - apr_size_t *inwords, - char *out, - apr_size_t *outbytes) +APR_DECLARE(apr_status_t) apr_conv_utf16_to_utf8(const apr_wchar_t *in, + apr_size_t *inwords, + char *out, + apr_size_t *outbytes) { apr_int64_t newch, require; apr_size_t need; @@ -201,20 +205,20 @@ else { if ((ch & 0xFC00) == 0xDC00) { - /* Invalid Leading ucs-2 Multiword Continuation Character + /* Invalid Leading utf-16 Multiword Continuation Character */ return APR_EINVAL; } if ((ch & 0xFC00) == 0xD800) { - /* Leading ucs-2 Multiword Character + /* Leading utf-16 Multiword Character */ if (*inwords < 2) { - /* Missing ucs-2 Multiword Continuation Character + /* Missing utf-16 Multiword Continuation Character */ return APR_INCOMPLETE; } if (((unsigned short)(*in) & 0xFC00) != 0xDC00) { - /* Invalid ucs-2 Multiword Continuation Character + /* Invalid utf-16 Multiword Continuation Character */ return APR_EINVAL; } @@ -222,7 +226,7 @@ newch += 0x10000; } else { - /* ucs-2 Single Word Character + /* utf-16 Single Word Character */ newch = ch; } Index: test/internal/testucs.c =================================================================== --- test/internal/testucs.c (revision 1856754) +++ test/internal/testucs.c (working copy) @@ -113,7 +113,7 @@ do { apr_size_t nl = s.nl, wl = sizeof(s.w) / 2; - rc = apr_conv_utf8_to_ucs2(s.n, &nl, s.w, &wl); + rc = apr_conv_utf8_to_utf16(s.n, &nl, s.w, &wl); s.wl = (sizeof(s.w) / 2) - wl; if (!nl && rc == APR_SUCCESS) { if (!success) { @@ -165,7 +165,7 @@ do { apr_size_t nl = sizeof(s.n), wl = s.wl; - rc = apr_conv_ucs2_to_utf8(s.w, &wl, s.n, &nl); + rc = apr_conv_utf16_to_utf8(s.w, &wl, s.n, &nl); s.nl = sizeof(s.n) - nl; if (!wl && rc == APR_SUCCESS) { if (!success) { @@ -197,7 +197,7 @@ do { apr_size_t wl = s.wl, nl = sizeof(s.n); - rc = apr_conv_ucs2_to_utf8(s.w, &wl, s.n, &nl); + rc = apr_conv_utf16_to_utf8(s.w, &wl, s.n, &nl); s.nl = sizeof(s.n) - s.nl; if (rc == APR_INCOMPLETE) { test_wrange(&s); @@ -227,7 +227,7 @@ do { inlen = ntest.nl; ntest.wl = sizeof(ntest.w) / 2; - nrc = apr_conv_utf8_to_ucs2(ntest.n, &inlen, ntest.w, &ntest.wl); + nrc = apr_conv_utf8_to_utf16(ntest.n, &inlen, ntest.w, &ntest.wl); if (nrc == APR_SUCCESS) { ntest.wl = (sizeof(ntest.w) / 2) - ntest.wl; break; @@ -251,7 +251,7 @@ do { inlen = wtest.wl; wtest.nl = sizeof(wtest.n); - wrc = apr_conv_ucs2_to_utf8(wtest.w, &inlen, wtest.n, &wtest.nl); + wrc = apr_conv_utf16_to_utf8(wtest.w, &inlen, wtest.n, &wtest.nl); if (wrc == APR_SUCCESS) { wtest.nl = sizeof(wtest.n) - wtest.nl; break; @@ -267,7 +267,7 @@ do { inlen = 1; wtest.nl = sizeof(wtest.n); - if (apr_conv_ucs2_to_utf8(wtest.w, &inlen, wtest.n, &wtest.nl) + if (apr_conv_utf16_to_utf8(wtest.w, &inlen, wtest.n, &wtest.nl) == APR_INCOMPLETE) break; if (!(++wtest.w[0])) { @@ -308,7 +308,7 @@ do { inlen = 1; wtest.nl = sizeof(wtest.n); - if (apr_conv_ucs2_to_utf8(wtest.w, &inlen, wtest.n, &wtest.nl) + if (apr_conv_utf16_to_utf8(wtest.w, &inlen, wtest.n, &wtest.nl) == APR_INCOMPLETE) break; if (!(++wtest.w[0])) { @@ -319,7 +319,7 @@ } } while (wtest.wl || ntest.nl); - printf ("\n\nutf8 and ucs2 sequences of %lu transformations matched OK.\n", + printf ("\n\nutf8 and utf16 sequences of %lu transformations matched OK.\n", matches); }