[ast-developers] [patch] Accessing widechar codepoints without unicode (GB18030-related) ...

Roland Mainz Sun, 04 Aug 2013 19:13:49 -0700

Hi!

----


Attached (as "astksh20130727_printf_w_gb18030_001.diff.txt") is a
prototype patch which handles two issues related to GB18030:

1. Currently ksh93 supports "\u[codepoint]" in printf(1) to write a
Unicode character to stdout. However, the current code assumes that
|wchar_t| in the current locale represents a Unicode code point...
which isn't true for all locales (for example zh_CN.GB18030 on Solaris
uses GB18030 codepoints for |wchar_t| and not Unicode codepoints). In
that case printf "\u[codepoint]" doesn't work.
The patch fixes this by using |iconv()| to convert between the UTF-32
codepoint value and the locale's character set. If the requested
character cannot be represented in the current locale/encoding, printf
"\u[codepoint]" will return an empty string.

2. The other issue is that some users greatly wish to use the
codepoint values of their locale and _not_ the Unicode codepoint
value. Therefore the patch adds printf "\w[codepoint]" so that a
codepoint can be specified using its |wchar_t| value.

Note that the patch is a _prototype_ ... if the general idea is OK
I'll craft a better patch...

* Questions (mainly for David&&Glenn):
- Is the patch OK so far ?
- Does libast have any code to detect whether the locale is a Unicode locale ?
- Is there any reason that printf(1)'s "%b" format does not support
"\u[codepoint]", i.e. is this a bug or a feature ? :-)
- Somehow I can't include <iconv.h> ... it seems the AST <iconv.h>
header is built later than src/lib/libast/string/chresc.c, which causes
the build to fail... any idea why ?

----

Bye,
Roland

-- 
  __ .  . __
 (o.\ \/ /.o) [email protected]
  \__\/\/__/  MPEG specialist, C&&JAVA&&Sun&&Unix programmer
  /O /==\ O\  TEL +49 641 3992797
 (;O/ \/ \O;)

diff -r -u original/src/cmd/ksh93/data/builtins.c 
build_print_w/src/cmd/ksh93/data/builtins.c
--- original/src/cmd/ksh93/data/builtins.c      2013-06-26 20:29:58.000000000 
+0200
+++ build_print_w/src/cmd/ksh93/data/builtins.c 2013-08-05 03:53:00.744434090 
+0200
@@ -1214,8 +1214,16 @@
                "the collating element \aname\a.]"
        "[+-?The escape sequence \b\\x{\b\ahex\a\b}\b expands to the "
                "character corresponding to the hexidecimal value \ahex\a.]"
-       "[+-?The escape sequence \b\\u{\b\ahex\a\b}\b expands to the unicode "
-               "character corresponding to the hexidecimal value \ahex\a.]"
+       "[+-?The escape sequence \b\\u[\b\ahex\a\b]]\b or "
+               "\b\\u{\b\ahex\a\b}\b expands to the unicode character "
+               "corresponding to the unicode code point defined "
+               "by the hexidecimal value \ahex\a. If the character is "
+               "not available in the current locale the escape sequence "
+               "will return an empty string.]"
+       "[+-?The escape sequence \b\\w[\b\ahex\a\b]]\b or "
+               "\b\\w{\b\ahex\a\b}\b expands to the character "
+               "corresponding to the (wchar_t) code point defined "
+               "by the hexidecimal value \ahex\a in the current locale.]"
        "[+-?The format modifier flag \b=\b can be used to center a field to "
                "a specified width.]"
        "[+-?The format modifier flag \bL\b can be used with the \bc\b and "
diff -r -u original/src/lib/libast/string/chresc.c 
build_print_w/src/lib/libast/string/chresc.c
--- original/src/lib/libast/string/chresc.c     2013-07-16 20:02:13.000000000 
+0200
+++ build_print_w/src/lib/libast/string/chresc.c        2013-08-05 
03:04:27.063316193 +0200
@@ -38,6 +38,115 @@
 #include <regex.h>
 #endif
 
+#include <errno.h>
+#include <wchar.h>
+#include <endian.h>
+#include "/usr/include/iconv.h" // WHY ?
+#include <langinfo.h>
+#include <locale.h>
+
+
+static
+ssize_t ucs4_to_wchar(uint32_t *ucs4, size_t ucs4_len, wchar_t *wchar)
+{
+#ifdef HAS_ICONV_WCHAR_T
+       iconv_t         cd;
+       char            *inbuf;
+       char            *outbuf;
+       size_t          inbytesleft,
+                       outbytesleft,
+                       res,
+                       wchar_buf_len;
+       int             saved_errno;
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+       cd = iconv_open("WCHAR_T", "UTF32LE");
+#elif __BYTE_ORDER == __BIG_ENDIAN
+       cd = iconv_open("WCHAR_T", "UTF32BE");
+#else
+#error Unknown __BYTE_ORDER
+#endif
+       if (cd == (iconv_t)-1)
+               return (-1);
+
+       inbytesleft     = ucs4_len * sizeof(uint32_t);
+       wchar_buf_len   = ucs4_len * sizeof(wchar_t);
+       inbuf           = (char *)ucs4;
+       outbytesleft    = wchar_buf_len;
+       outbuf          = (char *)wchar;
+
+       res = iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
+
+       saved_errno = errno;
+       (void)iconv_close(cd);
+       errno = saved_errno;
+
+#else /* HAS_ICONV_WCHAR_T */
+
+       iconv_t         cd;
+       char            *inbuf;
+       char            *outbuf,
+                       *outbuf_start;
+       size_t          inbytesleft,
+                       outbytesleft,
+                       res,
+                       mb_buf_len;
+       int             saved_errno;
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+       cd = iconv_open(nl_langinfo(CODESET), "UTF32LE");
+#elif __BYTE_ORDER == __BIG_ENDIAN
+       cd = iconv_open(nl_langinfo(CODESET), "UTF32BE");
+#else
+#error Unknown __BYTE_ORDER
+#endif
+       if (cd == (iconv_t)-1)
+               return (-1);
+
+       inbytesleft     = ucs4_len * sizeof(uint32_t);
+       mb_buf_len      = ucs4_len * MB_CUR_MAX;
+       inbuf           = (char *)ucs4;
+       outbytesleft    = mb_buf_len;
+       outbuf_start    = (char *)malloc(mb_buf_len+2);
+       if(!outbuf_start)
+       {
+               saved_errno = errno;
+               (void)iconv_close(cd);
+               errno = saved_errno;
+               return(-1);
+       }
+
+       outbuf = outbuf_start;
+
+       res = iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
+
+       if (res >= 0)
+       {
+               ssize_t len;
+               size_t i;
+
+               for(outbuf = outbuf_start, i = 0 ;
+                       i < ucs4_len ;
+                       i++, outbuf += len)
+               {
+                       len = mbtowc(&wchar[i], outbuf, MB_CUR_MAX);
+                       if (len < 0)
+                       {
+                               wchar[i]=L'\0';
+                               break;
+                       }
+               }
+       }
+
+       saved_errno = errno;
+       (void)iconv_close(cd);
+       free(outbuf_start);
+       errno = saved_errno;
+#endif
+       return (res);
+}
+
+
 int
 chrexp(register const char* s, char** p, int* m, register int flags)
 {
@@ -47,9 +156,10 @@
        const char*             b;
        char*                   r;
        int                     n;
-       int                     w;
+       bool                    w;
+       bool                    unicode;
 
-       w = 0;
+       w = unicode = false;
        for (;;)
        {
                b = s;
@@ -153,14 +263,18 @@
                                c = CC_vt;
                                break;
                        case 'u':
+                               unicode = true;
+                       case 'w':
                                q = s + 4;
                                goto wex;
                        case 'U':
+                               unicode = true;
+                       case 'W':
                                q = s + 8;
                        wex:
                                if (!(flags & FMT_EXP_WIDE))
                                        goto noexpand;
-                               w = 1;
+                               w = true;
                                goto hex;
                        case 'x':
                                q = s + 2;
@@ -191,7 +305,7 @@
                                                        break;
                                                e = 0;
                                                s++;
-                                               if (w && *s == 'U' && *(s + 1) 
== '+')
+                                               if (w && ((*s == 'U') || (*s == 
'W')) && *(s + 1) == '+')
                                                        s += 2;
                                                continue;
                                        case '}':
@@ -204,7 +318,7 @@
                                        }
                                        break;
                                }
-                               if (n <= 2 && !(flags & FMT_EXP_CHAR) || n > 2 
&& (w = 1) && !(flags & FMT_EXP_WIDE))
+                               if (n <= 2 && !(flags & FMT_EXP_CHAR) || n > 2 
&& (w = true) && !(flags & FMT_EXP_WIDE))
                                {
                                        c = '\\';
                                        s = b;
@@ -217,7 +331,7 @@
                        break;
                default:
                        if ((s - b) > 1)
-                               w = 1;
+                               w = true;
                        break;
                }
                break;
@@ -226,7 +340,18 @@
        if (p)
                *p = (char*)s;
        if (m)
-               *m = w;
+               *m = w?1:0;
+
+       if (w && unicode && (c > 127))
+       {
+               uint32_t in = c;
+               wchar_t out = -1;
+
+               if (ucs4_to_wchar(&in, 1, &out) < 0)
+                       c = -1;
+               else
+                       c = out;
+       }
        return c;
  noexpand:
        c = '\\';
diff -r -u original/src/lib/libast/string/stresc.c 
build_print_w/src/lib/libast/string/stresc.c
--- original/src/lib/libast/string/stresc.c     2010-05-01 07:46:26.000000000 
+0200
+++ build_print_w/src/lib/libast/string/stresc.c        2013-08-05 
00:22:26.918700000 +0200
@@ -36,7 +36,7 @@
 strexp(register char* s, int flags)
 {
        register char*          t;
-       register unsigned int   c;
+       register int            c;
        char*                   b;
        char*                   e;
        int                     w;
@@ -48,6 +48,12 @@
                {
                        c = chrexp(s - 1, &e, &w, flags);
                        s = e;
+
+                       if (c < 0)
+                       {
+                               /* conversion failed == empty string */
+                               continue;
+                       }
                        if (w)
                        {
                                t += mbwide() ? mbconv(t, c) : wc2utf8(t, c);

_______________________________________________
ast-developers mailing list
[email protected]
http://lists.research.att.com/mailman/listinfo/ast-developers

[ast-developers] [patch] Accessing widechar codepoints without unicode (GB18030-related) ...

Reply via email to