[PATCH:app/xprop] Print UTF8_STRING type as UTF-8 when locale supports it

Yang Zhao Tue, 20 Oct 2009 16:51:23 -0700

Introduces 'u' format character, which behaves like 's', but leaves
UTF-8 encoding intact.


The property value is checked for UTF-8 validity according to RFC 3629.
If it is invalid, an error string is printed, followed by the string
formatted using 's', i.e.:

  PROP(UTF8_STRING) = <Invalid UTF-8 string: Forbidden value> "\374\233"

Signed-off-by: Yang Zhao <[email protected]>
---
 xprop.c   |  148 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
 xprop.man |    8 +++
 2 files changed, 147 insertions(+), 9 deletions(-)

diff --git a/xprop.c b/xprop.c
index 8261b15..ea65013 100644
--- a/xprop.c
+++ b/xprop.c
@@ -409,6 +409,7 @@ static propertyRec windowPropTable[] = {
     {"RECTANGLE",      XA_RECTANGLE,    "16iicc",    RECTANGLE_DFORMAT },
     {"RGB_COLOR_MAP",  XA_RGB_COLOR_MAP,"32xcccccccxx",RGB_COLOR_MAP_DFORMAT},
     {"STRING",         XA_STRING,       "8s",        0 },
+    {"UTF8_STRING",            0,       "8u",        0 },
     {"WINDOW",         XA_WINDOW,       "32x",       ": window id # $0+\n" },
     {"VISUALID",       XA_VISUALID,     "32x",       ": visual id # $0\n" },
     {"WM_COLORMAP_WINDOWS",    0,       "32x",       ": window id # $0+\n"},
@@ -683,7 +684,7 @@ _put_char (char c)
 }
 
 static void
-_format_char (char c)
+_format_char (char c, int unicode)
 {
     switch (c) {
       case '\\':
@@ -701,17 +702,21 @@ _format_char (char c)
        break;
       default:
        if (!c_isprint(c)) {
-           _put_char('\\');
-           snprintf(_buf_ptr, _buf_len, "%03o", (unsigned char) c);
-           _buf_ptr += 3;
-           _buf_len -= 3;
+           if (unicode && (c & 0x80)) {
+               _put_char(c);
+           } else {
+               _put_char('\\');
+               snprintf(_buf_ptr, _buf_len, "%03o", (unsigned char) c);
+               _buf_ptr += 3;
+               _buf_len -= 3;
+           }
        } else
          _put_char(c);
     }
 }
 
 static const char *
-Format_String (const char *string)
+Format_String (const char *string, int unicode)
 {
     char c;
 
@@ -720,7 +725,7 @@ Format_String (const char *string)
     _put_char('\"');
 
     while ((c = string++[0]))
-       _format_char(c);
+       _format_char(c, unicode);
 
     *_buf_ptr++ = '"';
     *_buf_ptr++ = '\0';
@@ -738,7 +743,7 @@ Format_Len_String (const char *string, int len)
     memcpy(data, string, len);
     data[len] = '\0';
 
-    result = Format_String(data);
+    result = Format_String(data, 0);
     free(data);
 
     return result;
@@ -905,6 +910,129 @@ Format_Len_Text (const char *string, int len, Atom 
encoding)
 }
 
 /*
+ * Validate a string as UTF-8 encoded according to RFC 3629
+ *
+ * Simply, a unicode code point (up to 21-bits long) is encoded as follows:
+ *
+ *    Char. number range  |        UTF-8 octet sequence
+ *       (hexadecimal)    |              (binary)
+ *    --------------------+---------------------------------------------
+ *    0000 0000-0000 007F | 0xxxxxxx
+ *    0000 0080-0000 07FF | 110xxxxx 10xxxxxx
+ *    0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
+ *    0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ *
+ * Validation is done left-to-right, and an error condition, if any, refers to
+ * only the left-most problem in the string.  After every byte the range of
+ * code points still reachable is checked, so a bad sequence is reported at
+ * the first byte that makes it unrecoverable.
+ *
+ * Return values:
+ *   UTF8_VALID: Valid UTF-8 encoded string
+ *   UTF8_OVERLONG: Using more bytes than needed for a code point
+ *   UTF8_SHORT_TAIL: Not enough bytes in a multi-byte sequence, including
+ *                    a sequence cut off by the end of the string
+ *   UTF8_LONG_TAIL: Too many bytes in a multi-byte sequence
+ *   UTF8_FORBIDDEN_VALUE: Forbidden prefix, UTF-16 surrogate (D800-DFFF),
+ *                         or code point outside 0x10FFFF
+ */
+#define UTF8_VALID 0
+#define UTF8_FORBIDDEN_VALUE 1
+#define UTF8_OVERLONG 2
+#define UTF8_SHORT_TAIL 3
+#define UTF8_LONG_TAIL 4
+static int
+is_valid_utf8 (const char *string, int len)
+{
+    unsigned long codepoint = 0, min = 0, lo, hi;
+    int rem = 0, i;
+    unsigned char c;
+
+    for (i = 0; i < len; i++) {
+       c = (unsigned char) string[i];
+
+       if ((c & 0xC0) == 0x80) {
+           /* Continuation byte: must belong to an open sequence */
+           if (rem == 0) return UTF8_LONG_TAIL;
+           rem--;
+           codepoint = (codepoint << 6) | (c & 0x3F);
+       } else {
+           /* Any other byte starts a new character */
+           if (rem > 0) return UTF8_SHORT_TAIL;
+           if (!(c & 0x80)) {
+               rem = 0; min = 0x00;    codepoint = c;
+           } else if ((c & 0xE0) == 0xC0) {
+               rem = 1; min = 0x80;    codepoint = c & 0x1F;
+           } else if ((c & 0xF0) == 0xE0) {
+               rem = 2; min = 0x800;   codepoint = c & 0x0F;
+           } else if ((c & 0xF8) == 0xF0) {
+               rem = 3; min = 0x10000; codepoint = c & 0x07;
+           } else
+               return UTF8_FORBIDDEN_VALUE;
+       }
+
+       /* [lo, hi] bounds every code point the bytes seen so far can still
+        * produce; testing it after each byte reports overlong forms and
+        * forbidden values at the left-most offending byte. */
+       lo = codepoint << (6 * rem);
+       hi = lo | ((1UL << (6 * rem)) - 1);
+       if (hi < min) return UTF8_OVERLONG;
+       if (lo > 0x10FFFF || (lo >= 0xD800 && hi <= 0xDFFF))
+           return UTF8_FORBIDDEN_VALUE;
+    }
+
+    /* The string must not end in the middle of a multi-byte sequence */
+    return (rem > 0) ? UTF8_SHORT_TAIL : UTF8_VALID;
+}
+
+/*
+ * Format a 'u' (UTF-8) value: invalid input gets an error tag plus the 's'
+ * form; valid input keeps its UTF-8 bytes only when is_utf8_locale() holds.
+ */
+static const char *
+Format_Len_Unicode (const char *string, int len)
+{
+    char *data;
+    const char *result, *error;
+    int len2, validity = is_valid_utf8(string, len);
+
+    if (validity != UTF8_VALID) {
+       switch (validity) {
+         case UTF8_FORBIDDEN_VALUE:
+           error = "<Invalid UTF-8 string: Forbidden value> "; break;
+         case UTF8_OVERLONG:
+           error = "<Invalid UTF-8 string: Overlong encoding> "; break;
+         case UTF8_SHORT_TAIL:
+           error = "<Invalid UTF-8 string: Tail too short> "; break;
+         case UTF8_LONG_TAIL:
+           error = "<Invalid UTF-8 string: Tail too long> "; break;
+         default: /* keeps 'error' defined if new codes are ever added */
+           error = "<Invalid UTF-8 string> "; break;
+       }
+       /* Copy aside: result presumably points into _formatting_buffer.
+        * NOTE(review): error + data is not bounded by that buffer's size. */
+       result = Format_Len_String(string, len);
+       len2 = strlen(result);
+       data = (char *) Malloc(len2+1);
+       memcpy(data, result, len2+1);
+       memcpy(_formatting_buffer, error, strlen(error)+1);
+       strcat(_formatting_buffer, data);
+       free(data);
+       return _formatting_buffer;
+    }
+
+    if (!is_utf8_locale())
+       return Format_Len_String(string, len);
+
+    data = (char *) Malloc(len+1);
+    memcpy(data, string, len);
+    data[len] = '\0';
+    result = Format_String(data, 1);
+    free(data);
+    return result;
+}
+
+/*
  *
  * The Format Manager: a group of routines to manage "formats"
  *
@@ -956,6 +1084,8 @@ Format_Thunk (thunk t, char format_char)
     switch (format_char) {
       case 's':
        return Format_Len_String(t.extra_value, (int)t.value);
+      case 'u':
+       return Format_Len_Unicode(t.extra_value, (int)t.value);
       case 't':
        return Format_Len_Text(t.extra_value, (int)t.value, t.extra_encoding);
       case 'x':
@@ -1252,7 +1382,7 @@ Break_Down_Property (const char *pointer, int length, 
Atom type, const char *for
 
     while (length >= size/8) {
        format_char = Get_Format_Char(format, i);
-       if (format_char == 's')
+       if (format_char == 's' || format_char == 'u')
            t.value = Extract_Len_String(&pointer,&length,size,&t.extra_value);
        else if (format_char == 't') {
            t.extra_encoding = type;
diff --git a/xprop.man b/xprop.man
index 498faa9..310812f 100644
--- a/xprop.man
+++ b/xprop.man
@@ -234,6 +234,14 @@ usable with a field size of 8. The string is assumed to be 
in an ICCCM
 compliant encoding and is converted to the current locale encoding before
 being output.
 .TP
+u
+This field and the next ones until either a 0 or the end of the property
+represent a UTF-8 encoded Unicode string. This format character is only
+usable with a field size of 8. If the string is not valid UTF-8, the type
+of encoding violation is printed instead, followed by the string formatted
+using 's'. When in an environment not capable of displaying UTF-8 encoded
+strings, behaviour is identical to 's'.
+.TP
 x
 The field is a hex number (like 'c' but displayed in hex - most useful
 for displaying window ids and the like)
-- 
1.6.4.4

_______________________________________________
xorg-devel mailing list
[email protected]
http://lists.x.org/mailman/listinfo/xorg-devel

[PATCH:app/xprop] Print UTF8_STRING type as UTF-8 when locale supports it

Reply via email to