[RFC PATCH:app/xprop] Print UTF8_STRING type as UTF-8 when locale supports it

Yang Zhao Sat, 17 Oct 2009 20:18:23 -0700

Introduces 'u' format character, which behaves like 's', but leaves
UTF-8 encoding intact.


Property value is checked for UTF-8 validity before being printed.  What
happens when the value isn't a valid UTF-8 string needs improvement.
---

Currently, when an invalid UTF-8 string is detected, an error message is printed
instead of the string value.  I don't think this is ideal.  What would be 
better?

 xprop.c |   89 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
 1 files changed, 80 insertions(+), 9 deletions(-)

diff --git a/xprop.c b/xprop.c
index 8261b15..bb8c71f 100644
--- a/xprop.c
+++ b/xprop.c
@@ -409,6 +409,7 @@ static propertyRec windowPropTable[] = {
     {"RECTANGLE",      XA_RECTANGLE,    "16iicc",    RECTANGLE_DFORMAT },
     {"RGB_COLOR_MAP",  XA_RGB_COLOR_MAP,"32xcccccccxx",RGB_COLOR_MAP_DFORMAT},
     {"STRING",         XA_STRING,       "8s",        0 },
+    {"UTF8_STRING",            0,       "8u",        0 },
     {"WINDOW",         XA_WINDOW,       "32x",       ": window id # $0+\n" },
     {"VISUALID",       XA_VISUALID,     "32x",       ": visual id # $0\n" },
     {"WM_COLORMAP_WINDOWS",    0,       "32x",       ": window id # $0+\n"},
@@ -683,7 +684,7 @@ _put_char (char c)
 }
 
 static void
-_format_char (char c)
+_format_char (char c, int unicode)
 {
     switch (c) {
       case '\\':
@@ -701,17 +702,21 @@ _format_char (char c)
        break;
       default:
        if (!c_isprint(c)) {
-           _put_char('\\');
-           snprintf(_buf_ptr, _buf_len, "%03o", (unsigned char) c);
-           _buf_ptr += 3;
-           _buf_len -= 3;
+           if (unicode && (c & 0x80)) {
+               _put_char(c);
+           } else {
+               _put_char('\\');
+               snprintf(_buf_ptr, _buf_len, "%03o", (unsigned char) c);
+               _buf_ptr += 3;
+               _buf_len -= 3;
+           }
        } else
          _put_char(c);
     }
 }
 
 static const char *
-Format_String (const char *string)
+Format_String (const char *string, int unicode)
 {
     char c;
 
@@ -720,7 +725,7 @@ Format_String (const char *string)
     _put_char('\"');
 
     while ((c = string++[0]))
-       _format_char(c);
+       _format_char(c, unicode);
 
     *_buf_ptr++ = '"';
     *_buf_ptr++ = '\0';
@@ -738,7 +743,7 @@ Format_Len_String (const char *string, int len)
     memcpy(data, string, len);
     data[len] = '\0';
 
-    result = Format_String(data);
+    result = Format_String(data, 0);
     free(data);
 
     return result;
@@ -904,6 +909,70 @@ Format_Len_Text (const char *string, int len, Atom encoding)
        return Format_Len_String(string, len);
 }
 
+/* Return 1 if the len bytes at string are well-formed UTF-8, else 0. */
+static int
+is_valid_utf8 (const char *string, int len)
+{
+    unsigned long codepoint = 0;  /* value decoded so far */
+    unsigned long min = 0;        /* smallest value this length may encode */
+    int rem = 0;                  /* continuation bytes still expected */
+    int i;
+
+    for (i = 0; i < len; i++) {
+       /* go through unsigned char: plain char may be signed */
+       unsigned char c = (unsigned char) string[i];
+       if (rem > 0) {
+           /* inside a sequence only 10xxxxxx bytes are allowed */
+           if ((c & 0xC0) != 0x80) return 0;
+           codepoint = (codepoint << 6) | (c & 0x3F);
+           if (--rem > 0) continue;
+           /* complete: reject overlong forms, surrogates, > U+10FFFF */
+           if (codepoint < min || codepoint > 0x10FFFF) return 0;
+           if (codepoint >= 0xD800 && codepoint <= 0xDFFF) return 0;
+       } else if (c < 0x80) {
+           continue;           /* ASCII */
+       } else if ((c & 0xE0) == 0xC0) {
+           rem = 1;
+           codepoint = c & 0x1F;
+           min = 0x80;
+       } else if ((c & 0xF0) == 0xE0) {
+           rem = 2;
+           codepoint = c & 0x0F;
+           min = 0x800;
+       } else if ((c & 0xF8) == 0xF0) {
+           rem = 3;
+           codepoint = c & 0x07;
+           min = 0x10000;
+       } else
+           return 0;           /* stray continuation byte or 0xF8..0xFF */
+    }
+
+    return rem == 0;  /* a sequence cut off by the end of the data is invalid */
+}
+
+/* Format a UTF8_STRING value, keeping UTF-8 bytes intact in a UTF-8 locale. */
+static const char *
+Format_Len_Unicode (const char *string, int len)
+{
+    const char *formatted;
+    char *copy;
+
+    /* Malformed input is replaced by a fixed message, never printed. */
+    if (!is_valid_utf8(string, len))
+       return "<Not a valid UTF-8 string>";
+    /* Without a UTF-8 locale, use the same formatting as 's'. */
+    if (!is_utf8_locale())
+       return Format_Len_String(string, len);
+
+    /* Format_String reads up to a NUL, so work on a terminated copy. */
+    copy = (char *) Malloc(len+1);
+    memcpy(copy, string, len);
+    copy[len] = '\0';
+    formatted = Format_String(copy, 1);
+    free(copy);
+    return formatted;
+}
+
 /*
  *
  * The Format Manager: a group of routines to manage "formats"
@@ -956,6 +1025,8 @@ Format_Thunk (thunk t, char format_char)
     switch (format_char) {
       case 's':
        return Format_Len_String(t.extra_value, (int)t.value);
+      case 'u':
+       return Format_Len_Unicode(t.extra_value, (int)t.value);
       case 't':
        return Format_Len_Text(t.extra_value, (int)t.value, t.extra_encoding);
       case 'x':
@@ -1252,7 +1323,7 @@ Break_Down_Property (const char *pointer, int length, Atom type, const char *for
 
     while (length >= size/8) {
        format_char = Get_Format_Char(format, i);
-       if (format_char == 's')
+       if (format_char == 's' || format_char == 'u')
            t.value = Extract_Len_String(&pointer,&length,size,&t.extra_value);
        else if (format_char == 't') {
            t.extra_encoding = type;
-- 
1.6.4.4

_______________________________________________
xorg-devel mailing list
xorg-devel@lists.x.org
http://lists.x.org/mailman/listinfo/xorg-devel

[RFC PATCH:app/xprop] Print UTF8_STRING type as UTF-8 when locale supports it

Reply via email to