On Tue, Jun 11, 2019 at 5:38 PM William A Rowe Jr <wr...@rowe-clan.net>
wrote:

> On Tue, Jun 11, 2019 at 1:44 PM Branko Čibej <br...@apache.org> wrote:
>
>>  We either reserve about 2x buffers for file name transliteration in heap
>> per thread, or we use the thread stack. As long as we trust that our utf-8
>> to ucs-2 logic is rock solid and the allocations and limits are correctly
>> coded, this continues to be a safe approach.
>>
>>
>> Apropos of that, for 2.0 we're about to or have already ditched support
>> for versions of Windows that do not have native UTF-8/UTF-16 conversions
>> (ah, yes ... Windows has finally moved from UCS-2 to UTF-16). Wouldn't this
>> be the right time to switch to using Windows' functions instead of staying
>> with our own? Especially since, with the transition to UTF-16, we have to
>> deal correctly with surrogate pairs, something our current code (IIRC)
>> doesn't do.
>>
>
> A bit of a misnomer, the code is full of references to ucs-2 w/surrogate
> pair
> support, the combo of these is utf-16. The comments can be refreshed to
> today's utf-16 nomenclature.
>

How strongly do we feel about the naming? I have the following patch to
commit if we want to observe current convention in utf-naming and deprecate
many (but not all) ucs references.
Index: file_io/win32/open.c
===================================================================
--- file_io/win32/open.c	(revision 1856754)
+++ file_io/win32/open.c	(working copy)
@@ -86,7 +86,7 @@
         }
     }
 
-    if ((rv = apr_conv_utf8_to_ucs2(srcstr, &srcremains, t, &retlen))) {
+    if ((rv = apr_conv_utf8_to_utf16(srcstr, &srcremains, t, &retlen))) {
         return (rv == APR_INCOMPLETE) ? APR_EINVAL : rv;
     }
     if (srcremains) {
@@ -127,7 +127,7 @@
         }
     }
         
-    if ((rv = apr_conv_ucs2_to_utf8(srcstr, &srcremains, t, &retlen))) {
+    if ((rv = apr_conv_utf16_to_utf8(srcstr, &srcremains, t, &retlen))) {
         return rv;
     }
     if (srcremains) {
@@ -168,7 +168,7 @@
         wfile = apr_palloc(pool, (r + n) * sizeof(apr_wchar_t));
         wcscpy(wfile, wpre);
         d = n;
-        if (apr_conv_utf8_to_ucs2(file, &n, wfile + r, &d)) {
+        if (apr_conv_utf8_to_utf16(file, &n, wfile + r, &d)) {
             return NULL;
         }
         for (ch = wfile + r; *ch; ++ch) {
Index: include/arch/win32/apr_arch_utf8.h
===================================================================
--- include/arch/win32/apr_arch_utf8.h	(revision 1856754)
+++ include/arch/win32/apr_arch_utf8.h	(working copy)
@@ -27,30 +27,32 @@
 
 /**
  * An APR internal function for fast utf-8 octet-encoded Unicode conversion
- * to the ucs-2 wide Unicode format.  This function is used for filename and 
- * other resource conversions for platforms providing native Unicode support.
+ * to the utf-16 Unicode with surrogate pairs format.  This function is used
+ * for filename and other resource conversions for platforms providing native
+ * Unicode support.
  *
  * @tip Only the errors APR_EINVAL and APR_INCOMPLETE may occur, the former
  * when the character code is invalid (in or out of context) and the later
  * when more characters were expected, but insufficient characters remain.
  */
-APR_DECLARE(apr_status_t) apr_conv_utf8_to_ucs2(const char *in, 
-                                                apr_size_t *inbytes,
-                                                apr_wchar_t *out, 
-                                                apr_size_t *outwords);
+APR_DECLARE(apr_status_t) apr_conv_utf8_to_utf16(const char *in, 
+                                                 apr_size_t *inbytes,
+                                                 apr_wchar_t *out, 
+                                                 apr_size_t *outwords);
 
 /**
- * An APR internal function for fast ucs-2 wide Unicode format conversion to 
- * the utf-8 octet-encoded Unicode.  This function is used for filename and 
- * other resource conversions for platforms providing native Unicode support.
+ * An APR internal function for fast utf-16 Unicode including surrogate pairs
+ * format conversion to the utf-8 octet-encoded Unicode.  This function is used
+ * for filename and other resource conversions for platforms providing native
+ * Unicode support.
  *
  * @tip Only the errors APR_EINVAL and APR_INCOMPLETE may occur, the former
- * when the character code is invalid (in or out of context) and the later
- * when more words were expected, but insufficient words remain.
+ * when the surrogate word is invalid (in or out of context) and the later
+ * when another surrogate word is expected, but insufficient words remain.
  */
-APR_DECLARE(apr_status_t) apr_conv_ucs2_to_utf8(const apr_wchar_t *in, 
-                                                apr_size_t *inwords,
-                                                char *out, 
-                                                apr_size_t *outbytes);
+APR_DECLARE(apr_status_t) apr_conv_utf16_to_utf8(const apr_wchar_t *in, 
+                                                 apr_size_t *inwords,
+                                                 char *out, 
+                                                 apr_size_t *outbytes);
 
 #endif /* def UTF8_H */
Index: misc/win32/env.c
===================================================================
--- misc/win32/env.c	(revision 1856754)
+++ misc/win32/env.c	(working copy)
@@ -33,7 +33,7 @@
     apr_status_t status;
 
     inchars = strlen(envvar) + 1;
-    status = apr_conv_utf8_to_ucs2(envvar, &inchars, buffer, &bufflen);
+    status = apr_conv_utf8_to_utf16(envvar, &inchars, buffer, &bufflen);
     if (status == APR_INCOMPLETE)
         status = APR_ENAMETOOLONG;
 
@@ -82,7 +82,7 @@
         inchars = wcslen(wvalue) + 1;
         outchars = 3 * inchars; /* Enough for any UTF-8 representation */
         val = apr_palloc(pool, outchars);
-        status = apr_conv_ucs2_to_utf8(wvalue, &inchars, val, &outchars);
+        status = apr_conv_utf16_to_utf8(wvalue, &inchars, val, &outchars);
         if (status)
             return status;
     }
@@ -139,7 +139,7 @@
 
         outchars = inchars = strlen(value) + 1;
         wvalue = apr_palloc(pool, outchars * sizeof(*wvalue));
-        status = apr_conv_utf8_to_ucs2(value, &inchars, wvalue, &outchars);
+        status = apr_conv_utf8_to_utf16(value, &inchars, wvalue, &outchars);
         if (status)
             return status;
 
Index: misc/win32/internal.c
===================================================================
--- misc/win32/internal.c	(revision 1856754)
+++ misc/win32/internal.c	(working copy)
@@ -61,8 +61,8 @@
 
     /* This is a safe max allocation, we will realloc after
      * processing and return the excess to the free store.
-     * 3 ucs bytes hold any single wchar_t value (16 bits)
-     * 4 ucs bytes will hold a wchar_t pair value (20 bits)
+     * 3 utf-8 bytes hold any single wchar_t value (16 bits)
+     * 4 utf-8 bytes will hold a 2 word utf-16 surrogate pair value (20 bits)
      */
     elesize = elesize * 3 + 1;
     ele = elements = apr_malloc_dbg(elesize * sizeof(char),
@@ -73,7 +73,7 @@
         apr_size_t newlen = elesize;
 
         newarr[arg] = ele;
-        (void)apr_conv_ucs2_to_utf8(arr[arg], &len,
+        (void)apr_conv_utf16_to_utf8(arr[arg], &len,
                                     newarr[arg], &elesize);
 
         newlen -= elesize;
Index: misc/win32/start.c
===================================================================
--- misc/win32/start.c	(revision 1856754)
+++ misc/win32/start.c	(working copy)
@@ -58,14 +58,14 @@
     /* This is a safe max allocation, we will alloc each
      * string exactly after processing and return this
      * temporary buffer to the free store.
-     * 3 ucs bytes hold any single wchar_t value (16 bits)
-     * 4 ucs bytes will hold a wchar_t pair value (20 bits)
+     * 3 utf-8 bytes hold any single utf-16 value (16 bits)
+     * 4 utf-8 bytes will hold a 2 word utf-16 surrogate pair value (20 bits)
      */
     newlen = totlen = wsize * 3 + 1;
     pstrs = strs = apr_malloc_dbg(newlen * sizeof(char),
                                   __FILE__, __LINE__);
 
-    (void)apr_conv_ucs2_to_utf8(arrsz, &wsize, strs, &newlen);
+    (void)apr_conv_utf16_to_utf8(arrsz, &wsize, strs, &newlen);
 
     assert(newlen && !wsize);
 
Index: misc/win32/utf8.c
===================================================================
--- misc/win32/utf8.c	(revision 1856754)
+++ misc/win32/utf8.c	(working copy)
@@ -23,28 +23,28 @@
  * with particular attention to canonical translation forms (see section 10
  * "Security Considerations" of the RFC for more info).
  *
- * Since several architectures including Windows support unicode, with UCS2
+ * Since several architectures including Windows support unicode, with utf-16
  * used as the actual storage conventions by that archicture, these functions
- * exist to transform or validate UCS2 strings into APR's 'char' type
+ * exist to transform or validate utf-16 strings into APR's 'char' type
  * convention.  It is left up to the operating system to determine the
  * validitity of the string, e.g. normative forms, in the context of 
  * its native language support.  Other file systems which support filename 
  * characters of 0x80-0xff but have no explicit requirement for Unicode
  * will find this function useful only for validating the character sequences 
- * and rejecting poorly encoded UTF8 sequences.
+ * and rejecting poorly encoded utf-8 sequences.
  *
- * Len UCS-4 range (hex) UTF-8 octet sequence (binary)
- * 1:2 00000000-0000007F 0xxxxxxx
- * 2:2 00000080-000007FF 110XXXXx 10xxxxxx
- * 3:2 00000800-0000FFFF 1110XXXX 10Xxxxxx 10xxxxxx
- * 4:4 00010000-001FFFFF 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx
- *     00200000-03FFFFFF 111110XX 10XXXxxx 10xxxxxx 10xxxxxx 10xxxxxx
- *     04000000-7FFFFFFF 1111110X 10XXXXxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * len  utf-4 range (hex)  utf-8 octet sequence (binary)
+ * 1:2  00000000-0000007F  0xxxxxxx
+ * 2:2  00000080-000007FF  110XXXXx 10xxxxxx
+ * 3:2  00000800-0000FFFF  1110XXXX 10Xxxxxx 10xxxxxx
+ * 4:4  00010000-001FFFFF  11110XXX 10XXxxxx 10xxxxxx 10xxxxxx
+ *      00200000-03FFFFFF  111110XX 10XXXxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ *      04000000-7FFFFFFF  1111110X 10XXXXxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  *
- * One of the X bits must be 1 to avoid overlong representation of ucs2 values. 
+ * One of the X bits must be 1 to avoid overlong representation in utf-8.
  *
- * For conversion into ucs2, the 4th form is limited in range to 0010 FFFF,
- * and the final two forms are used only by full ucs4, per RFC 3629;
+ * For conversion into utf-16, the 4th form is limited in range to 0010 FFFF,
+ * and the final two forms are used only by full ucs-4, per RFC 3629;
  *
  *   "Pairs of UCS-2 values between D800 and DFFF (surrogate pairs in 
  *   Unicode parlance), being actually UCS-4 characters transformed 
@@ -54,24 +54,29 @@
  *
  * From RFC2781 UTF-16: the compressed ISO 10646 encoding bitmask
  *
- *  U' = U - 0x10000
- *  U' = 00000000 0000yyyy yyyyyyxx xxxxxxxx
- *                    W1 = 110110yy yyyyyyyy
- *                    W2 = 110111xx xxxxxxxx
- *  Max U' = 0000 00001111 11111111 11111111
- *  Max U  = 0000 00010000 11111111 11111111
+ *   U' = U - 0x10000
+ *   U' = 00000000 0000yyyy yyyyyyxx xxxxxxxx
+ *                     W1 = 110110yy yyyyyyyy
+ *                     W2 = 110111xx xxxxxxxx
+ *   Max U' = 0000 00001111 11111111 11111111
+ *   Max U  = 0000 00010000 11111111 11111111
  *
- * Len is the table above is a mapping of bytes used for utf8:ucs2 values,
+ * Also note ISO/IEC 10646:2014 Clause 9.4: "Because surrogate code points
+ * are not UCS scalar values, UTF-32 code units in the range 
+ * 0000 D800-0000 DFFF are ill-formed" for future reference in adding any
+ * utf-32 accessor functions.
+ *
+ * Len is the table above is a mapping of bytes used for utf-8:utf-16 values,
  * which results in these conclusions of maximum allocations;
  *
- *  apr_conv_utf8_to_ucs2 out bytes:sizeof(in) * 1 <= Req <= sizeof(in) * 2
- *  apr_conv_ucs2_to_utf8 out words:sizeof(in) / 2 <= Req <= sizeof(in) * 3 / 2
+ * apr_conv_utf8_to_utf16 out bytes:sizeof(in) * 1 <= Req <= sizeof(in) * 2
+ * apr_conv_utf16_to_utf8 out words:sizeof(in) / 2 <= Req <= sizeof(in) * 3 / 2
  */
 
-APR_DECLARE(apr_status_t) apr_conv_utf8_to_ucs2(const char *in, 
-                                                apr_size_t *inbytes,
-                                                apr_wchar_t *out, 
-                                                apr_size_t *outwords)
+APR_DECLARE(apr_status_t) apr_conv_utf8_to_utf16(const char *in, 
+                                                 apr_size_t *inbytes,
+                                                 apr_wchar_t *out, 
+                                                 apr_size_t *outwords)
 {
     apr_int64_t newch, mask;
     apr_size_t expect, eating;
@@ -105,7 +110,7 @@
                 expect = 1;
                 while ((ch & mask) == mask) {
                     mask |= mask >> 1;
-                    if (++expect > 3) /* (truly 5 for ucs-4) */
+                    if (++expect > 3) /* (or 5 for a ucs-4 code point) */
                         return APR_EINVAL;
                 }
                 newch = ch & ~mask;
@@ -125,8 +130,7 @@
                     if (!newch && !((unsigned char)*in & 0077 & (mask << 1)))
                         return APR_EINVAL;
                     if (expect == 2) {
-                        /* Reject values D800-DFFF when not utf16 encoded
-                         * (may not be an appropriate restriction for ucs-4)
+                        /* Reject values D800-DFFF when not utf-16 encoded
                          */
                         if (newch == 0015 && ((unsigned char)*in & 0040))
                             return APR_EINVAL;
@@ -156,7 +160,7 @@
                 *inbytes -= eating;
                 /* newch is now a true ucs-4 character
                  *
-                 * now we need to fold to ucs-2
+                 * now we need to fold to utf-16
                  */
                 if (newch < 0x10000) 
                 {
@@ -179,10 +183,10 @@
     return APR_SUCCESS;
 }
 
-APR_DECLARE(apr_status_t) apr_conv_ucs2_to_utf8(const apr_wchar_t *in, 
-                                                apr_size_t *inwords,
-                                                char *out, 
-                                                apr_size_t *outbytes)
+APR_DECLARE(apr_status_t) apr_conv_utf16_to_utf8(const apr_wchar_t *in, 
+                                                 apr_size_t *inwords,
+                                                 char *out, 
+                                                 apr_size_t *outbytes)
 {
     apr_int64_t newch, require;
     apr_size_t need;
@@ -201,20 +205,20 @@
         else 
         {
             if ((ch & 0xFC00) == 0xDC00) {
-                /* Invalid Leading ucs-2 Multiword Continuation Character
+                /* Invalid Leading utf-16 Multiword Continuation Character
                  */
                 return APR_EINVAL;
             }
             if ((ch & 0xFC00) == 0xD800) {
-                /* Leading ucs-2 Multiword Character
+                /* Leading utf-16 Multiword Character
                  */
                 if (*inwords < 2) {
-                    /* Missing ucs-2 Multiword Continuation Character
+                    /* Missing utf-16 Multiword Continuation Character
                      */
                     return APR_INCOMPLETE;
                 }
                 if (((unsigned short)(*in) & 0xFC00) != 0xDC00) {
-                    /* Invalid ucs-2 Multiword Continuation Character
+                    /* Invalid utf-16 Multiword Continuation Character
                      */
                     return APR_EINVAL;
                 }
@@ -222,7 +226,7 @@
                 newch += 0x10000;
             }
             else {
-                /* ucs-2 Single Word Character
+                /* utf-16 Single Word Character
                  */
                 newch = ch;
             }
Index: test/internal/testucs.c
===================================================================
--- test/internal/testucs.c	(revision 1856754)
+++ test/internal/testucs.c	(working copy)
@@ -113,7 +113,7 @@
     
     do {
         apr_size_t nl = s.nl, wl = sizeof(s.w) / 2;
-        rc = apr_conv_utf8_to_ucs2(s.n, &nl, s.w, &wl);
+        rc = apr_conv_utf8_to_utf16(s.n, &nl, s.w, &wl);
         s.wl = (sizeof(s.w) / 2) - wl;
         if (!nl && rc == APR_SUCCESS) {
             if (!success) {
@@ -165,7 +165,7 @@
     
     do {
         apr_size_t nl = sizeof(s.n), wl = s.wl;        
-        rc = apr_conv_ucs2_to_utf8(s.w, &wl, s.n, &nl);
+        rc = apr_conv_utf16_to_utf8(s.w, &wl, s.n, &nl);
         s.nl = sizeof(s.n) - nl;
         if (!wl && rc == APR_SUCCESS) {
             if (!success) {
@@ -197,7 +197,7 @@
 
     do {
         apr_size_t wl = s.wl, nl = sizeof(s.n);
-        rc = apr_conv_ucs2_to_utf8(s.w, &wl, s.n, &nl);
+        rc = apr_conv_utf16_to_utf8(s.w, &wl, s.n, &nl);
         s.nl = sizeof(s.n) - s.nl;
         if (rc == APR_INCOMPLETE) {
             test_wrange(&s);
@@ -227,7 +227,7 @@
         do {
             inlen = ntest.nl;
             ntest.wl = sizeof(ntest.w) / 2;
-            nrc = apr_conv_utf8_to_ucs2(ntest.n, &inlen, ntest.w, &ntest.wl);
+            nrc = apr_conv_utf8_to_utf16(ntest.n, &inlen, ntest.w, &ntest.wl);
             if (nrc == APR_SUCCESS) {
                 ntest.wl = (sizeof(ntest.w) / 2) - ntest.wl;
                 break;
@@ -251,7 +251,7 @@
         do {
             inlen = wtest.wl;
             wtest.nl = sizeof(wtest.n);
-            wrc = apr_conv_ucs2_to_utf8(wtest.w, &inlen, wtest.n, &wtest.nl);
+            wrc = apr_conv_utf16_to_utf8(wtest.w, &inlen, wtest.n, &wtest.nl);
             if (wrc == APR_SUCCESS) {
                 wtest.nl = sizeof(wtest.n) - wtest.nl;
                 break;
@@ -267,7 +267,7 @@
                     do {
                         inlen = 1;
                         wtest.nl = sizeof(wtest.n);
-                        if (apr_conv_ucs2_to_utf8(wtest.w, &inlen, wtest.n, &wtest.nl)
+                        if (apr_conv_utf16_to_utf8(wtest.w, &inlen, wtest.n, &wtest.nl)
                                 == APR_INCOMPLETE)
                             break;
                         if (!(++wtest.w[0])) {
@@ -308,7 +308,7 @@
             do {
                 inlen = 1;
                 wtest.nl = sizeof(wtest.n);
-                if (apr_conv_ucs2_to_utf8(wtest.w, &inlen, wtest.n, &wtest.nl)
+                if (apr_conv_utf16_to_utf8(wtest.w, &inlen, wtest.n, &wtest.nl)
                         == APR_INCOMPLETE)
                     break;
                 if (!(++wtest.w[0])) {
@@ -319,7 +319,7 @@
         }
     } while (wtest.wl || ntest.nl);
 
-    printf ("\n\nutf8 and ucs2 sequences of %lu transformations matched OK.\n",
+    printf ("\n\nutf8 and utf16 sequences of %lu transformations matched OK.\n",
             matches);
 }
 

Reply via email to