Author: matt
Date: 2010-12-06 10:22:22 -0800 (Mon, 06 Dec 2010)
New Revision: 7965
Log:
Fixed crashes when Fl_Text_* widgets detect illegal UTF-8 sequences. The
widgets now do no further processing of such a sequence and just skip over
the character. The screen representation depends largely on whatever the
underlying OS does with those sequences, but I feel that this is out of the
scope of this library. (STR #2348)

Modified:
   branches/branch-1.3/CHANGES
   branches/branch-1.3/FL/Fl_Text_Buffer.H
   branches/branch-1.3/FL/fl_utf8.h
   branches/branch-1.3/src/Fl_Text_Buffer.cxx
   branches/branch-1.3/src/Fl_Text_Display.cxx
   branches/branch-1.3/src/fl_utf8.cxx

Modified: branches/branch-1.3/CHANGES
===================================================================
--- branches/branch-1.3/CHANGES 2010-12-06 17:42:48 UTC (rev 7964)
+++ branches/branch-1.3/CHANGES 2010-12-06 18:22:22 UTC (rev 7965)
@@ -1,5 +1,7 @@
 CHANGES IN FLTK 1.3.0
 
+       - Fixed crashes when detecting illegal utf 8 sequences
+         in Fl_Text_* widgets (STR #2348)
        - Fixed Fl_Text_Display Tabulator calculations (STR #2450)
        - Fixed file access code to use UTF-8 strings (STR #2440)
        - Fixed ARM Unicode cross compilation issue (STR #2432)

Modified: branches/branch-1.3/FL/Fl_Text_Buffer.H
===================================================================
--- branches/branch-1.3/FL/Fl_Text_Buffer.H     2010-12-06 17:42:48 UTC (rev 
7964)
+++ branches/branch-1.3/FL/Fl_Text_Buffer.H     2010-12-06 18:22:22 UTC (rev 
7965)
@@ -34,7 +34,7 @@
 #define FL_TEXT_BUFFER_H
 
 
-#define ASSERT_UTF8
+#undef ASSERT_UTF8
 
 #ifdef ASSERT_UTF8
 # include <assert.h>
@@ -47,22 +47,11 @@
 
 
 /*
- Suggested UTF-8 terminology for this file:
- 
- ?? "length" is the number of characters in a string
- ?? "size" is the number of bytes
- ?? "index" is the position in a string in number of characters
- ?? "offset" is the position in a string in bytes (and must be kept on a 
charater boundary)
- (there seems to be no standard in Uncode documents, howevere "length" is 
commonly
- referencing the number of bytes. Maybe "bytes" and "glyphs" would be the most
- obvious way to describe sizes?)
- 
  "character size" is the size of a UTF-8 character in bytes
- "character width" is the width of a Unicode character in pixels
- 
- "column" was orginally defined as a character offset from the left margin. It 
was
- identical to the byte offset. In UTF-8, we have neither a byte offset nor
- truly fixed width fonts (*). Column could be a pixel value multiplied with
+ "character width" is the width of a Unicode character in pixels 
+ "column" was orginally defined as a character offset from the left margin. 
+ It was identical to the byte offset. In UTF-8, we have neither a byte offset 
+ nor truly fixed width fonts (*). Column could be a pixel value multiplied with
  an average character width (which is a bearable approximation).
  
  * in Unicode, there are no fixed width fonts! Even if the ASCII characters 
may 

Modified: branches/branch-1.3/FL/fl_utf8.h
===================================================================
--- branches/branch-1.3/FL/fl_utf8.h    2010-12-06 17:42:48 UTC (rev 7964)
+++ branches/branch-1.3/FL/fl_utf8.h    2010-12-06 18:22:22 UTC (rev 7965)
@@ -99,13 +99,16 @@
 
 /* OD: returns the byte length of the first UTF-8 char sequence (returns -1 if 
not valid) */
 FL_EXPORT int fl_utf8len(char c);
-
+  
+/* OD: returns the byte length of the first UTF-8 char sequence (returns +1 if 
not valid) */
+FL_EXPORT int fl_utf8len1(char c);
+  
 /* OD: returns the number of Unicode chars in the UTF-8 string */
 FL_EXPORT int fl_utf_nb_char(const unsigned char *buf, int len);
 
 /* F2: Convert the next UTF8 char-sequence into a Unicode value (and say how 
many bytes were used) */
 FL_EXPORT unsigned fl_utf8decode(const char* p, const char* end, int* len);
-
+  
 /* F2: Encode a Unicode value into a UTF8 sequence, return the number of bytes 
used */
 FL_EXPORT int fl_utf8encode(unsigned ucs, char* buf);
 

Modified: branches/branch-1.3/src/Fl_Text_Buffer.cxx
===================================================================
--- branches/branch-1.3/src/Fl_Text_Buffer.cxx  2010-12-06 17:42:48 UTC (rev 
7964)
+++ branches/branch-1.3/src/Fl_Text_Buffer.cxx  2010-12-06 18:22:22 UTC (rev 
7965)
@@ -1025,7 +1025,7 @@
           *foundPos = startPos;
           return 1;
         }
-        int l = fl_utf8len(c);
+        int l = fl_utf8len1(c);
         if (memcmp(sp, address(bp), l))
           break;
         sp += l; bp += l;
@@ -1077,7 +1077,7 @@
           *foundPos = startPos;
           return 1;
         }
-        int l = fl_utf8len(c);
+        int l = fl_utf8len1(c);
         if (memcmp(sp, address(bp), l))
           break;
         sp += l; bp += l;
@@ -1602,7 +1602,7 @@
 int Fl_Text_Buffer::next_char(int pos) const
 {
   IS_UTF8_ALIGNED2(this, (pos))  
-  int n = fl_utf8len(byte_at(pos));
+  int n = fl_utf8len1(byte_at(pos));
   pos += n;
   if (pos>=mLength)
     return mLength;

Modified: branches/branch-1.3/src/Fl_Text_Display.cxx
===================================================================
--- branches/branch-1.3/src/Fl_Text_Display.cxx 2010-12-06 17:42:48 UTC (rev 
7964)
+++ branches/branch-1.3/src/Fl_Text_Display.cxx 2010-12-06 18:22:22 UTC (rev 
7965)
@@ -753,7 +753,7 @@
   /* determine how many displayed character positions are covered */
   startIndent = mBuffer->count_displayed_characters( lineStart, startPos );
   indent = startIndent;
-  for ( c = text; *c != '\0'; c += fl_utf8len(*c) )
+  for ( c = text; *c != '\0'; c += fl_utf8len1(*c) )
     indent++;
   endIndent = indent;
   
@@ -1735,7 +1735,7 @@
   style = position_style(lineStartPos, lineLen, 0);
   for (i=0; i<lineLen; ) {
     currChar = lineStr[i]; // one byte is enough to handele tabs and other 
cases
-    int len = fl_utf8len(currChar);
+    int len = fl_utf8len1(currChar);
     if (len<=0) len = 1; // OUCH!
     charStyle = position_style(lineStartPos, lineLen, i);
     if (charStyle!=style || currChar=='\t' || prevChar=='\t') {
@@ -1829,7 +1829,7 @@
   // TODO: use binary search which may be quicker.
   int i = 0;
   while (i<len) {
-    int cl = fl_utf8len(s[i]);
+    int cl = fl_utf8len1(s[i]);
     int w = int( string_width(s, i+cl, style) );
     if (w>x) 
       return i;
@@ -3204,7 +3204,7 @@
     return (((xPix/tab)+1)*tab) - xPix;
   }
   
-  int charLen = fl_utf8len(*s), style = 0;
+  int charLen = fl_utf8len1(*s), style = 0;
   if (mStyleBuffer) {
     style = mStyleBuffer->byte_at(pos);
   }
@@ -3284,7 +3284,7 @@
   
   c = buffer()->char_at(lineEndPos);
   return c == '\n' || ((c == '\t' || c == ' ') &&
-                       lineEndPos + fl_utf8len(c) < buffer()->length());
+                       lineEndPos + fl_utf8len1(c) < buffer()->length());
 }
 
 

Modified: branches/branch-1.3/src/fl_utf8.cxx
===================================================================
--- branches/branch-1.3/src/fl_utf8.cxx 2010-12-06 17:42:48 UTC (rev 7964)
+++ branches/branch-1.3/src/fl_utf8.cxx 2010-12-06 18:22:22 UTC (rev 7965)
@@ -112,9 +112,11 @@
 }
 
 /**
-  return the byte length of the UTF-8 sequence with first byte \p c,
-  or -1 if \p c is not valid.
-  */
+ return the byte length of the UTF-8 sequence with first byte \p c,
+ or -1 if \p c is not valid.
+ This function is helpful for finding faulty UTF8 sequences.
+ \see fl_utf8len1
+ */
 int fl_utf8len(char c)
 {
   if (!(c & 0x80)) return 1;
@@ -137,16 +139,35 @@
 } // fl_utf8len
 
 
-#if 0
-int fl_utflen(
-        const unsigned char     *buf,
-        int                     len)
+/**
+ Return the byte length of the UTF-8 sequence with first byte \p c,
+ or 1 if \p c is not valid. 
+ This function can be used to scan faulty UTF8 sequence, albeit ignoring 
invalid
+ codes.
+ \see fl_utf8len
+ */
+int fl_utf8len1(char c)
 {
-       unsigned int ucs;
-       return fl_utf2ucs(buf, len, &ucs);
-}
-#endif
+  if (!(c & 0x80)) return 1;
+  if (c & 0x40) {
+    if (c & 0x20) {
+      if (c & 0x10) {
+        if (c & 0x08) {
+          if (c & 0x04) {
+            return 6;
+          }
+          return 5;
+        }
+        return 4;
+      }
+      return 3;
+    }
+    return 2;
+  }
+  return 1;
+} // fl_utf8len1
 
+
 /**
   returns the number of Unicode chars in the UTF-8 string
   */

_______________________________________________
fltk-commit mailing list
[email protected]
http://lists.easysw.com/mailman/listinfo/fltk-commit

Reply via email to