Author: ianmacarthur
Date: 2011-04-13 08:43:22 -0700 (Wed, 13 Apr 2011)
New Revision: 8585
Log:
More attempts to clean up WIN32 handling of UTF16 surrogate pairs.

In particular, I have added a new function to src/fl_utf.c called 
fl_ucs_to_Utf16(), which
converts a single 32-bit Unicode value into one or two UTF16 cells.

This is needed in the win32 char-by-char text width() logic, and I suspect it 
may also be useful in the OSX code in some places.



Modified:
   branches/branch-1.3/FL/fl_utf8.h
   branches/branch-1.3/src/fl_font_win32.cxx
   branches/branch-1.3/src/fl_utf.c

Modified: branches/branch-1.3/FL/fl_utf8.h
===================================================================
--- branches/branch-1.3/FL/fl_utf8.h    2011-04-13 09:03:19 UTC (rev 8584)
+++ branches/branch-1.3/FL/fl_utf8.h    2011-04-13 15:43:22 UTC (rev 8585)
@@ -28,8 +28,6 @@
  * with the functions provided in OksiD's fltk-1.1.6-utf8 port
  */
 
-/*** NOTE : all functions are LIMITED to 24 bits Unicode values !!! ***/
-
 /**
   \file fl_utf8.h
   \brief header for Unicode and UTF8 chracter handling
@@ -99,16 +97,16 @@
 
 /* OD: returns the byte length of the first UTF-8 char sequence (returns -1 if 
not valid) */
 FL_EXPORT int fl_utf8len(char c);
-  
+
 /* OD: returns the byte length of the first UTF-8 char sequence (returns +1 if 
not valid) */
 FL_EXPORT int fl_utf8len1(char c);
-  
+
 /* OD: returns the number of Unicode chars in the UTF-8 string */
 FL_EXPORT int fl_utf_nb_char(const unsigned char *buf, int len);
 
 /* F2: Convert the next UTF8 char-sequence into a Unicode value (and say how 
many bytes were used) */
 FL_EXPORT unsigned fl_utf8decode(const char* p, const char* end, int* len);
-  
+
 /* F2: Encode a Unicode value into a UTF8 sequence, return the number of bytes 
used */
 FL_EXPORT int fl_utf8encode(unsigned ucs, char* buf);
 
@@ -118,6 +116,9 @@
 /* F2: Move backward to the previous valid UTF8 sequence start */
 FL_EXPORT const char* fl_utf8back(const char* p, const char* start, const 
char* end);
 
+/* XX: Convert a single 32-bit Unicode value into UTF16 */
+FL_EXPORT unsigned fl_ucs_to_Utf16(const unsigned ucs, unsigned short *dst, 
const unsigned dstlen);
+
 /* F2: Convert a UTF8 string into UTF16 */
 FL_EXPORT unsigned fl_utf8toUtf16(const char* src, unsigned srclen, unsigned 
short* dst, unsigned dstlen);
 

Modified: branches/branch-1.3/src/fl_font_win32.cxx
===================================================================
--- branches/branch-1.3/src/fl_font_win32.cxx   2011-04-13 09:03:19 UTC (rev 
8584)
+++ branches/branch-1.3/src/fl_font_win32.cxx   2011-04-13 15:43:22 UTC (rev 
8585)
@@ -185,26 +185,23 @@
   Fl_Font_Descriptor *fl_fontsize = font_descriptor();
   unsigned int r;
   SIZE s;
-  // Special Case Handling of Unicode points over U+FFFF
+  // Special Case Handling of Unicode points over U+FFFF.
   // The logic (below) computes a lookup table for char widths
   // on-the-fly, but the table only covers codepoints up to
   // U+FFFF, which covers the basic multilingual plane, but
   // not any higher plane, or glyphs that require surrogate-pairs
-  // to encode them in WinXX which is UTF16.
+  // to encode them in WinXX, which is UTF16.
   // This code assumes that these glyphs are rarely used and simply
-  // measures them explicitly if they occur - Which may be slow...
+  // measures them explicitly if they occur - This will be slow...
   if(c > 0x0000FFFF) { // UTF16 surrogate pair is needed
     if (!fl_gc) { // We have no valid gc, so nothing to measure - bail out
       return 0.0;
     }
     int cc; // cell count
-    char utf8[8];          // Array for UTF-8 representation of c
-    unsigned short ucs[4]; // Array for UTF16 representation of c
-    // This fl_utf8encode / fl_utf8toUtf16 dance creates a UTF16 string
-    // from a UCS code point.
-    cc = fl_utf8encode(c, utf8);
-    cc = fl_utf8toUtf16(utf8, cc, ucs, 4);
-    GetTextExtentPoint32W(fl_gc, (WCHAR*)ucs, cc, &s);
+    unsigned short u16[4]; // Array for UTF16 representation of c
+    // Creates a UTF16 string from a UCS code point.
+    cc = fl_ucs_to_Utf16(c, u16, 4);
+    GetTextExtentPoint32W(fl_gc, (WCHAR*)u16, cc, &s);
     return (double)s.cx;
   }
   // else - this falls through to the lookup-table for glyph widths

Modified: branches/branch-1.3/src/fl_utf.c
===================================================================
--- branches/branch-1.3/src/fl_utf.c    2011-04-13 09:03:19 UTC (rev 8584)
+++ branches/branch-1.3/src/fl_utf.c    2011-04-13 15:43:22 UTC (rev 8585)
@@ -37,11 +37,11 @@
 
 
 #if 0
-  /** 
+  /**
    \defgroup fl_unichar Unicode Character Functions
    Global Functions Handling Single Unicode Characters
    @{ */
-  
+
   /**
    Converts a Unicode character into a utf-8 sequence.
    \param[in] uc Unicode character
@@ -50,24 +50,24 @@
    \return length of the sequence in bytes
    */
   /* FL_EXPORT int fl_unichar_to_utf8(unsigned int uc, char *text); */
-  
-  /** @} */  
-  
-  /** 
+
+  /** @} */
+
+  /**
    \defgroup fl_utf8 Unicode String Functions
    Global Functions Handling Unicode Text
    @{ */
-  
+
   /**
    Calculate the size of a utf-8 sequence for a Unicode character.
    \param[in] uc Unicode character
    \return length of the sequence in bytes
    */
   /* FL_EXPORT int fl_utf8_size(unsigned int uc); */
-  
-  /** @} */  
+
+  /** @} */
 #endif /* 0 */
-  
+
 /*!Set to 1 to turn bad UTF8 bytes into ISO-8859-1. If this is to zero
    they are instead turned into the Unicode REPLACEMENT CHARACTER, of
    value 0xfffd.
@@ -337,6 +337,73 @@
   }
 }
 
+/*! Convert a single 32-bit Unicode codepoint into an array of 16-bit
+    characters. These are used by some system calls, especially on Windows.
+
+    \p ucs is the value to convert.
+
+    \p dst points at an array to write, and \p dstlen is the number of
+    locations in this array. At most \p dstlen words will be
+    written, and a 0 terminating word will be added if \p dstlen is
+    large enough. Thus this function will never overwrite the buffer
+    and will attempt to return a zero-terminated string if space permits.
+    If \p dstlen is zero then \p dst can be set to NULL and no data
+    is written, but the length is returned.
+
+    The return value is the number of 16-bit words that \e would be written
+    to \p dst if it is large enough, not counting any terminating
+    zero.
+
+    If the return value is greater than \p dstlen it indicates truncation;
+    you should then allocate a new array of size return+1 and call this again.
+
+    Unicode characters in the range 0x10000 to 0x10ffff are converted to
+    "surrogate pairs" which take two words each (in UTF-16 encoding);
+    invalid values (above 0x10ffff, or 0xd800 to 0xdfff) become 0xfffd.
+    Setting \p dstlen to 2 ensures that any valid Unicode value can be
+    converted, and 3 or more allows a zero-terminated sequence to be returned.
+*/
+unsigned fl_ucs_to_Utf16(const unsigned ucs, unsigned short *dst, const 
unsigned dstlen)
+{
+  /* The rule for direct conversion from UCS to UTF16 is:
+   * - if UCS >  0x0010FFFF then UCS is invalid
+   * - if UCS >= 0xD800 && UCS <= 0xDFFF then UCS is invalid
+   * - if UCS <= 0x0000FFFF then U16 = UCS, len = 1
+   * - else
+   * -- U16[0] = (((UCS - 0x00010000) >> 10) & 0x3FF) + 0xD800
+   * -- U16[1] = (UCS & 0x3FF) + 0xDC00
+   * -- len = 2;
+   */
+  unsigned count;        /* Count of converted UTF16 cells */
+  unsigned short u16[4]; /* Scratch buffer used if dst is NULL or dstlen is 0 */
+  unsigned short *out;   /* points to the active buffer */
+  /* Ensure we have a valid buffer to write to */
+  if((!dstlen) || (!dst)) {
+    out = u16;
+  } else {
+    out = dst;
+  }
+  /* Convert from UCS to UTF16 */
+  if((ucs > 0x0010FFFF) || /* UCS is too large */
+  ((ucs > 0xD7FF) && (ucs < 0xE000))) { /* UCS is in the invalid range */
+    out[0] = 0xFFFD; /* REPLACEMENT CHARACTER */
+    count = 1;
+  } else if(ucs < 0x00010000) {
+    out[0] = (unsigned short)ucs;
+    count = 1;
+  } else if(dstlen < 2) { /* dst is too small for the result */
+    out[0] = 0xFFFD; /* REPLACEMENT CHARACTER */
+    count = 2;
+  } else {
+    out[0] = (((ucs - 0x00010000) >> 10) & 0x3FF) + 0xD800;
+    out[1] = (ucs & 0x3FF) + 0xDC00;
+    count = 2;
+  }
+  /* Zero-terminate the output, if there is space */
+  if(count < dstlen) { out[count] = 0; }
+  return count;
+} /* fl_ucs_to_Utf16 */
+
+
 /*! Convert a UTF-8 sequence into an array of 16-bit characters. These
     are used by some system calls, especially on Windows.
 
@@ -363,7 +430,7 @@
 
     Unicode characters in the range 0x10000 to 0x10ffff are converted to
     "surrogate pairs" which take two words each (this is called UTF-16
-    encoding). 
+    encoding).
 */
 unsigned fl_utf8toUtf16(const char* src, unsigned srclen,
                  unsigned short* dst, unsigned dstlen)
@@ -407,21 +474,21 @@
   Converts a UTF-8 string into a wide character string.
 
   This function generates 32-bit wchar_t (e.g. "ucs4" as it were) except
-  on Windows where it is equivalent to fl_utf8toUtf16 and returns 
+  on Windows where it is equivalent to fl_utf8toUtf16 and returns
   UTF-16.
- 
+
   \p src points at the UTF-8, and \p srclen is the number of bytes to
   convert.
- 
+
   \p dst points at an array to write, and \p dstlen is the number of
   locations in this array. At most \p dstlen-1 wchar_t will be
   written there, plus a 0 terminating wchar_t.
- 
+
   The return value is the number of wchar_t that \e would be written
   to \p dst if it were long enough, not counting the terminating
   zero. If the return value is greater or equal to \p dstlen it
   indicates truncation, you can then allocate a new array of size
-  return+1 and call this again. 
+  return+1 and call this again.
 
   Notice that sizeof(wchar_t) is 2 on Windows and is 4 on Linux
   and most other systems. Where wchar_t is 16 bits, Unicode
@@ -429,7 +496,7 @@
   "surrogate pairs" which take two words each (this is called UTF-16
   encoding). If wchar_t is 32 bits this rather nasty problem is
   avoided.
- 
+
   Note that Windows includes Cygwin, i.e. compiled with Cygwin's POSIX
   layer (cygwin1.dll, --enable-cygwin), either native (GDI) or X11.
   */

_______________________________________________
fltk-commit mailing list
[email protected]
http://lists.easysw.com/mailman/listinfo/fltk-commit

Reply via email to