Re: [ast-developers] [patch] Updated $'\w[hex]' patch for GB18030&&co. ... / was: Re: [patch] Accessing widechar codepoints without unicode (GB18030-related) ...

Roland Mainz Sun, 01 Sep 2013 16:43:20 -0700

On Mon, Sep 2, 2013 at 1:09 AM, Roland Mainz <[email protected]> wrote:
> On Mon, Sep 2, 2013 at 1:06 AM, Roland Mainz <[email protected]> wrote:
>> On Mon, Sep 2, 2013 at 12:36 AM, Roland Mainz <[email protected]> 
>> wrote:
>>> On Mon, Aug 5, 2013 at 5:01 AM, Roland Mainz <[email protected]> 
>>> wrote:
>>>> On Mon, Aug 5, 2013 at 4:13 AM, Roland Mainz <[email protected]> 
>>>> wrote:
> [snip]
>> ** More notes:
>> 1. $ ksh -c 'export LC_ALL=en_US.ISO8859-15 ; printf "x\u[20ac]x\n" |
>> iconv -f ISO8859-15 -t UTF-8' # now works... the correct output is
>> "x€x"
>> 2. The reason why this didn't work in the *002* patch was that the
>> original code in ast-ksh.2013-08-29 used |wc2utf8()| on an "extended
>> single-byte locale" like "en_US.ISO8859-15" ... this can **never**
>> work because the locale is not UTF-8 based
>>
>> Glenn/David: What do you think about the patch ?
>
> I forgot one note:
> - The patch _explicitly_ uses |iconv()| even for UTF-8 locales to
> weed out unassigned codepoints to fulfill the Unicode requirement that
> no unassigned codepoints should be accessible.


Last updated patch for tonight:

Attached (as "astksh20130829_printf_w_gb18030_004.diff.txt") is an
updated version of the patch which now automagically uses "\u[hex]" as
output instead of "\w[hex]" for UTF-8 locales, making the output 100%
compatible with previous ksh93 versions except for the described bugs in
those versions.

BTW: Some example usage for $ set -o convunicode # (byte "a4" is the
Euro character in ISO8859-15):
-- snip --
$ ksh -c 'export LC_ALL=en_US.ISO8859-15 ; printf "euro=|%q|\n"
"$(printf "\xa4")" | iconv -f ISO8859-15 -t UTF-8'
euro=|$'€'|
$ ksh -o convunicode -c 'export LC_ALL=en_US.ISO8859-15 ; printf
"euro=|%q|\n" "$(printf "\xa4")" | iconv -f ISO8859-15 -t UTF-8'
euro=|$'\u[20ac]'|
-- snip --

Comments/rants/etc. welcome...

... and David/Glenn: Please don't remove the comments in the code if
you take the patch... there's a reason why I'm quite verbose in the
comments (short: Hideously complex and lots of traps in the code) ...

----

Bye,
Roland

-- 
  __ .  . __
 (o.\ \/ /.o) [email protected]
  \__\/\/__/  MPEG specialist, C&&JAVA&&Sun&&Unix programmer
  /O /==\ O\  TEL +49 641 3992797
 (;O/ \/ \O;)

diff -r -u original/src/cmd/ksh93/bltins/print.c 
build_wchar/src/cmd/ksh93/bltins/print.c
--- src/cmd/ksh93/bltins/print.c        2013-08-22 16:10:07.000000000 +0200
+++ src/cmd/ksh93/bltins/print.c        2013-09-02 00:54:13.624865935 +0200
@@ -421,9 +421,26 @@
                         s = p;
 #if SHOPT_MULTIBYTE
 #if defined(FMT_EXP_WIDE)
+                       if (c < 0)
+                       {
+                               /* conversion failed == empty string */
+                               continue;
+                       }
                        if(w)
                        {
-                               t += mbwide() ? mbconv(t, c) : wc2utf8(t, c);
+                               if(mbwide())
+                               {
+                                       t += mbconv(t, c);
+                               }
+                               else
+                               {
+                                       /*
+                                        * single-byte locale, maybe "C" or
+                                        * extended single-byte locale like
+                                        * "en_US.ISO8859-15"
+                                        */
+                                       *t++ = c;
+                               }
                                continue;
                        }
 #else
diff -r -u original/src/cmd/ksh93/data/builtins.c 
build_wchar/src/cmd/ksh93/data/builtins.c
--- src/cmd/ksh93/data/builtins.c       2013-08-27 21:21:27.000000000 +0200
+++ src/cmd/ksh93/data/builtins.c       2013-09-01 13:36:19.741340598 +0200
@@ -1213,8 +1213,16 @@
                "the collating element \aname\a.]"
        "[+-?The escape sequence \b\\x{\b\ahex\a\b}\b expands to the "
                "character corresponding to the hexidecimal value \ahex\a.]"
-       "[+-?The escape sequence \b\\u{\b\ahex\a\b}\b expands to the unicode "
-               "character corresponding to the hexidecimal value \ahex\a.]"
+       "[+-?The escape sequence \b\\u[\b\ahex\a\b]]\b or "
+               "\b\\u{\b\ahex\a\b}\b expands to the unicode character "
+               "corresponding to the unicode code point defined "
+               "by the hexidecimal value \ahex\a. If the character is "
+               "not available in the current locale the escape sequence "
+               "will return an empty string.]"
+       "[+-?The escape sequence \b\\w[\b\ahex\a\b]]\b or "
+               "\b\\w{\b\ahex\a\b}\b expands to the character "
+               "corresponding to the (wchar_t) code point defined "
+               "by the hexidecimal value \ahex\a in the current locale.]"
        "[+-?The format modifier flag \b=\b can be used to center a field to "
                "a specified width.]"
        "[+-?The format modifier flag \bL\b can be used with the \bc\b and "
Only in build_wchar/src/cmd/ksh93/data: builtins.c.orig
diff -r -u original/src/cmd/ksh93/data/options.c 
build_wchar/src/cmd/ksh93/data/options.c
--- src/cmd/ksh93/data/options.c        2013-08-07 16:02:08.000000000 +0200
+++ src/cmd/ksh93/data/options.c        2013-09-01 13:50:30.124285103 +0200
@@ -48,6 +48,7 @@
        bashopt("cdspell",              SH_CDSPELL)
        bashopt("checkhash",            SH_CHECKHASH)
        bashopt("checkwinsize",         SH_CHECKWINSIZE)
+       "convunicode",                  SH_CONVUNICODE,
        "noclobber",                    SH_NOCLOBBER,
        bashopt("dotglob",              SH_DOTGLOB)
        "emacs",                        SH_EMACS,
diff -r -u original/src/cmd/ksh93/include/defs.h 
build_wchar/src/cmd/ksh93/include/defs.h
--- src/cmd/ksh93/include/defs.h        2013-08-26 21:50:32.000000000 +0200
+++ src/cmd/ksh93/include/defs.h        2013-09-01 13:34:27.544653918 +0200
@@ -325,6 +325,7 @@
 #define SH_BRACEEXPAND         42
 #define SH_POSIX               46
 #define SH_MULTILINE           47
+#define SH_CONVUNICODE         48
 
 #define SH_NOPROFILE           78
 #define SH_NOUSRPROFILE                79
@@ -498,6 +499,7 @@
 #define sh_sigcheck(shp) 
do{if(shp->trapnote&SH_SIGSET)sh_exit((shp),SH_EXITSIG);} while(0)
 
 extern int32_t         sh_mailchk;
+extern bool            sh_fmtstr_force_unicode;
 extern const char      e_dict[];
 
 /* sh_printopts() mode flags -- set --[no]option by default */
diff -r -u original/src/cmd/ksh93/sh/init.c build_wchar/src/cmd/ksh93/sh/init.c
--- src/cmd/ksh93/sh/init.c     2013-08-27 22:48:50.000000000 +0200
+++ src/cmd/ksh93/sh/init.c     2013-09-01 13:35:36.775162791 +0200
@@ -1627,6 +1627,7 @@
        /* initialize jobs table */
        job_clear(shp);
        sh_onoption(shp,SH_MULTILINE);
+       sh_offoption(shp,SH_CONVUNICODE);
        if(argc>0)
        {
                /* check for restricted shell */
diff -r -u original/src/cmd/ksh93/sh/string.c 
build_wchar/src/cmd/ksh93/sh/string.c
--- src/cmd/ksh93/sh/string.c   2013-06-12 22:53:57.000000000 +0200
+++ src/cmd/ksh93/sh/string.c   2013-09-02 01:28:40.290408258 +0200
@@ -30,6 +30,7 @@
 #include       "shtable.h"
 #include       "lexstates.h"
 #include       "national.h"
+#include       <lc.h>
 
 #if _hdr_wctype
 #   include <wctype.h>
@@ -324,6 +325,7 @@
        return(stakptr(offset));
 }
 
+
 /*
  * print <str> quoting chars so that it can be read by the shell
  * puts null terminated result on stack, but doesn't freeze it
@@ -333,6 +335,9 @@
        register const char *cp = string, *op;
        register int c, state, type=quote;
        int offset;
+       bool force_unicode  = sh_isoption(&sh,SH_CONVUNICODE);
+       bool is_utf8_locale = (lcinfo(LC_CTYPE)->lc->flags & 
LC_utf8)?true:false;
+
        if(!cp)
                return((char*)0);
        offset = staktell();
@@ -384,7 +389,7 @@
        }
        else
        {
-               int isbyte=0;
+               bool isbyte = false;
                if(quote=='"')
                        stakputc('"');
                else
@@ -427,19 +432,73 @@
                                        break;
                            default:
 #if SHOPT_MULTIBYTE
-                               isbyte = 0;
                                if(c<0)
                                {
                                        c = *((unsigned char *)op);
                                        cp = op+1;
-                                       isbyte = 1;
+                                       isbyte = true;
+                               }
+                               else
+                               {
+                                       isbyte = false;
+                               }
+
+                               /*
+                                * If we convert the data to Unicode we want
+                                * to produce portable ASCII-only output and
+                                * therefore convert all non-ASCII (e.g.
+                                * |c > 127|) characters to \u[] sequences.
+                                *
+                                * Note that this requires to pass all data
+                                * through |wchar_to_ucs4()| to handle
+                                * "extended" single-byte locales like
+                                * "en_US.ISO8859-15" or "ru_RU.koi8r"
+                                *
+                                * If we do not convert to Unicode we only
+                                * convert the non-printable characters to
+                                * locale-specific \w[] sequences.
+                                *
+                                * Be *VERY* careful with the logic below -
+                                * some single-byte locale implementations
+                                * have wchar_t values > 127 but can return
+                                * bytes, too
+                                */
+#if 0
+                               sfprintf(sfstderr, "## wc=%d, uc=%lx, 
isbyte=%d, mbwide()=%d\n",
+                                       wc, (long)uc, (int)isbyte, 
(int)mbwide());
+#endif
+
+                               if(force_unicode && !isbyte)
+                               {
+                                       wchar_t wc = c;
+                                       uint32_t uc = 0;
+
+                                       if (wchar_to_ucs4(&wc, 1, &uc) < 0)
+                                               continue;
+
+                                       /*
+                                        * We assume that all locales have ASCII
+                                        * as their base character set
+                                        */
+                                       if(!iswprint(c) || (uc > 127))
+                                       {
+                                               sfprintf(staksp,"\\u[%lx]", 
(unsigned long)uc);
+                                               continue;
+                                       }
                                }
-                               if(mbwide() && ((cp-op)>1))
+                               else if(mbwide() && !isbyte)
                                {
-                                       sfprintf(staksp,"\\u[%x]",c);
-                                       continue;
+                                       if(!iswprint(c))
+                                       {
+                                               if(is_utf8_locale)
+                                                       
sfprintf(staksp,"\\u[%x]", c);
+                                               else
+                                                       
sfprintf(staksp,"\\w[%x]", c);
+                                               continue;
+                                       }
                                }
-                               else if(!iswprint(c) || isbyte)
+                               
+                               if(!iswprint(c) || isbyte)
 #else
                                if(!isprint(c))
 #endif
diff -r -u original/src/cmd/ksh93/tests/locale.sh 
build_wchar/src/cmd/ksh93/tests/locale.sh
--- src/cmd/ksh93/tests/locale.sh       2013-03-06 15:18:45.000000000 +0100
+++ src/cmd/ksh93/tests/locale.sh       2013-09-01 20:47:54.167644365 +0200
@@ -205,11 +205,17 @@
 # multibyte identifiers
 
 exp=OK
-got=$(LC_ALL=C.UTF-8 $SHELL -c $'\u[5929]=OK; print ${\u[5929]}' 2>&1)
+got=$(export LC_ALL='en_US.UTF-8' ; $SHELL -c "$(printf '\u[5929]=OK; print 
${\u[5929]}')" 2>&1)
 [[ $got == "$exp" ]] || err_exit "multibyte variable definition/expansion 
failed -- expected '$exp', got '$got'"
-got=$(LC_ALL=C.UTF-8 $SHELL -c $'function \u[5929]\n{\nprint OK;\n}; \u[5929]' 
2>&1)
+got=$(export LC_ALL='en_US.UTF-8' ; $SHELL -c "$(printf 'function 
\u[5929]\n{\nprint OK;\n}; \u[5929]')" 2>&1)
 [[ $got == "$exp" ]] || err_exit "multibyte ksh function definition/execution 
failed -- expected '$exp', got '$got'"
-got=$(LC_ALL=C.UTF-8 $SHELL -c $'\u[5929]()\n{\nprint OK;\n}; \u[5929]' 2>&1)
+got=$(export LC_ALL='en_US.UTF-8' ; $SHELL -c "$(printf '\u[5929]()\n{\nprint 
OK;\n}; \u[5929]')" 2>&1)
+[[ $got == "$exp" ]] || err_exit "multibyte posix function 
definition/execution failed -- expected '$exp', got '$got'"
+got=$(export LC_ALL='en_US.UTF-8' ; $SHELL -c "$(printf '\w[5929]=OK; print 
${\w[5929]}')" 2>&1)
+[[ $got == "$exp" ]] || err_exit "multibyte variable definition/expansion 
failed -- expected '$exp', got '$got'"
+got=$(export LC_ALL='en_US.UTF-8' ; $SHELL -c "$(printf 'function 
\w[5929]\n{\nprint OK;\n}; \w[5929]')" 2>&1)
+[[ $got == "$exp" ]] || err_exit "multibyte ksh function definition/execution 
failed -- expected '$exp', got '$got'"
+got=$(export LC_ALL='en_US.UTF-8' ; $SHELL -c "$(printf '\w[5929]()\n{\nprint 
OK;\n}; \w[5929]')" 2>&1)
 [[ $got == "$exp" ]] || err_exit "multibyte posix function 
definition/execution failed -- expected '$exp', got '$got'"
 
 # this locale is supported by ast on all platforms
@@ -332,8 +338,8 @@
                err_exit "unicode char$p1 ${x#?} $p2 in locale $LC_ALL"
        fi
        unset x
-       x=$(printf "hello\u[20ac]\xee world")
-       [[ $(print -r -- "$x") == $'hello\u[20ac]\xee world' ]] || err_exit '%q 
with unicode and non-unicode not working'
+       x=$(export LC_ALL='en_US.UTF-8' ; printf "hello\u[20ac]\xee world")
+       LC_ALL='en_US.UTF-8' eval $'[[ $(print -r -- "$x") == 
$\'hello\\u[20ac]\\xee world\' ]]' || err_exit '%q with unicode and non-unicode 
not working'
        if      [[ $(whence od) ]]
        then    got='68 65 6c 6c 6f e2 82 ac ee 20 77 6f 72 6c 64 0a'
                [[ $(print -r -- "$x" | od -An -tx1) == "$got" ]] || err_exit 
"incorrect string from printf %q"
diff -r -u original/src/lib/libast/features/map.c 
build_wchar/src/lib/libast/features/map.c
--- src/lib/libast/features/map.c       2013-08-20 07:41:54.000000000 +0200
+++ src/lib/libast/features/map.c       2013-09-01 07:57:52.081818481 +0200
@@ -105,7 +105,7 @@
 #if !_WINIX
        printf("#undef  fgetcwd\n");
        printf("#define fgetcwd         _ast_fgetcwd\n");
-       printf("extern char*            fgetcwd(fd, char*, size_t);\n");
+       printf("extern char*            fgetcwd(int, char*, size_t);\n");
        printf("#undef  getcwd\n");
        printf("#define getcwd          _ast_getcwd\n");
        printf("extern char*            getcwd(char*, size_t);\n");
diff -r -u original/src/lib/libast/include/ast.h 
build_wchar/src/lib/libast/include/ast.h
--- src/lib/libast/include/ast.h        2013-08-23 23:23:00.000000000 +0200
+++ src/lib/libast/include/ast.h        2013-09-01 10:35:20.926054848 +0200
@@ -398,6 +398,8 @@
 extern int             struniq(char**, int);
 extern int             strvcmp(const char*, const char*);
 extern int             wc2utf8(char*, uint32_t);
+extern ssize_t         ucs4_to_wchar(uint32_t *, size_t, wchar_t *);
+extern ssize_t         wchar_to_ucs4(wchar_t *, size_t, uint32_t *);
 
 #undef                 extern
 
diff -r -u original/src/lib/libast/include/ast_std.h 
build_wchar/src/lib/libast/include/ast_std.h
--- src/lib/libast/include/ast_std.h    2013-08-23 16:04:28.000000000 +0200
+++ src/lib/libast/include/ast_std.h    2013-09-01 16:54:57.560921017 +0200
@@ -257,7 +257,18 @@
 
        int             pwd;
 
-       char            pad[936 - sizeof(void*) - sizeof(int)];
+       struct
+       {
+               long            ic; /* should be |iconv_t| */
+               uint32_t        locale_serial;
+       } ucs4_to_wchar;
+       struct
+       {
+               long            ic; /* should be |iconv_t| */
+               uint32_t        locale_serial;
+       } wchar_to_ucs4;
+
+       char            pad[936 - sizeof(void*) - sizeof(int) - 2*(sizeof(long) 
+ sizeof(uint32_t))];
 
 } _Ast_info_t;
 
diff -r -u original/src/lib/libast/string/chresc.c 
build_wchar/src/lib/libast/string/chresc.c
--- src/lib/libast/string/chresc.c      2013-07-16 20:02:13.000000000 +0200
+++ src/lib/libast/string/chresc.c      2013-09-01 20:51:14.406571159 +0200
@@ -38,6 +38,192 @@
 #include <regex.h>
 #endif
 
+#include <errno.h>
+#include <wchar.h>
+#include <endian.h>
+#include "/usr/include/iconv.h" // WHY ?
+#include <langinfo.h>
+#include <locale.h>
+
+
+ssize_t ucs4_to_wchar(uint32_t *ucs4, size_t ucs4_len, wchar_t *wchar)
+{
+       char            *inbuf;
+       char            *outbuf,
+                       *outbuf_start;
+       size_t          inbytesleft,
+                       outbytesleft;
+       ssize_t         res;
+       size_t          mb_buf_len;
+       int             saved_errno;
+
+       if ((_ast_info.ucs4_to_wchar.locale_serial != _ast_info.locale.serial) 
||
+               (_ast_info.ucs4_to_wchar.ic == 0) || 
+               (_ast_info.ucs4_to_wchar.ic == -1))
+       {
+               _ast_info.ucs4_to_wchar.locale_serial = _ast_info.locale.serial;
+
+               if ((_ast_info.ucs4_to_wchar.ic != 0) && 
+                       (_ast_info.ucs4_to_wchar.ic != -1))
+               {
+                       (void)iconv_close((iconv_t)_ast_info.ucs4_to_wchar.ic);
+               }
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+               _ast_info.ucs4_to_wchar.ic = 
(long)iconv_open(nl_langinfo(CODESET), "UTF32LE");
+#elif __BYTE_ORDER == __BIG_ENDIAN
+               _ast_info.ucs4_to_wchar.ic = 
(long)iconv_open(nl_langinfo(CODESET), "UTF32BE");
+#else
+#error Unknown __BYTE_ORDER
+#endif
+       }
+
+       if (_ast_info.ucs4_to_wchar.ic == -1)
+               return (-1);
+
+       inbytesleft     = ucs4_len * sizeof(uint32_t);
+       mb_buf_len      = ucs4_len * MB_CUR_MAX;
+       inbuf           = (char *)ucs4;
+       outbytesleft    = mb_buf_len;
+       outbuf_start    = (char *)malloc(mb_buf_len+2);
+       if(!outbuf_start)
+               return(-1);
+
+       outbuf = outbuf_start;
+
+       res = iconv((iconv_t)_ast_info.ucs4_to_wchar.ic, &inbuf, &inbytesleft, 
&outbuf, &outbytesleft);
+
+       if (res >= 0)
+       {
+               if (mbwide())
+               {
+                       ssize_t len;
+                       size_t i;
+
+                       for(outbuf = outbuf_start, i = 0 ;
+                               i < ucs4_len ;
+                               i++, outbuf += len)
+                       {
+                               len = mbtowc(&wchar[i], outbuf, MB_CUR_MAX);
+                               if (len < 0)
+                               {
+                                       wchar[i]=L'\0';
+                                       break;
+                               }
+                       }
+               }
+               else
+               {
+                       size_t i;
+
+                       for(outbuf = outbuf_start, i = 0 ;
+                               i < ucs4_len ;
+                               i++)
+                               wchar[i]=(unsigned char)(*outbuf++);
+               }
+       }
+
+       saved_errno = errno;
+       free(outbuf_start);
+       errno = saved_errno;
+
+       return (res);
+}
+
+
+ssize_t wchar_to_ucs4(wchar_t *wchar, size_t wchar_len, uint32_t *ucs4)
+{
+       char            *inbuf;
+       char            *inbuf_start;
+       char            *outbuf,
+                       *outbuf_start;
+       size_t          inbytesleft,
+                       outbytesleft;
+       ssize_t         res;
+       size_t          mb_buf_len;
+       int             saved_errno;
+
+       if ((_ast_info.wchar_to_ucs4.locale_serial != _ast_info.locale.serial) 
||
+               (_ast_info.wchar_to_ucs4.ic == 0) || 
+               (_ast_info.wchar_to_ucs4.ic == -1))
+       {
+               _ast_info.wchar_to_ucs4.locale_serial = _ast_info.locale.serial;
+
+               if ((_ast_info.wchar_to_ucs4.ic != 0) && 
+                       (_ast_info.wchar_to_ucs4.ic != -1))
+               {
+                       (void)iconv_close((iconv_t)_ast_info.wchar_to_ucs4.ic);
+               }
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+               _ast_info.wchar_to_ucs4.ic = (long)iconv_open("UTF32LE", 
nl_langinfo(CODESET));
+#elif __BYTE_ORDER == __BIG_ENDIAN
+               _ast_info.wchar_to_ucs4.ic = (long)iconv_open("UTF32BE", 
nl_langinfo(CODESET));
+#else
+#error Unknown __BYTE_ORDER
+#endif
+       }
+
+       if (_ast_info.wchar_to_ucs4.ic == (iconv_t)-1)
+               return (-1);
+       
+       mb_buf_len      = wchar_len * MB_CUR_MAX;
+       inbuf_start     = (char *)malloc(mb_buf_len+2);
+       outbytesleft    = wchar_len * sizeof(uint32_t);
+       outbuf_start    = (char *)ucs4;
+       if(!inbuf_start)
+               return(-1);
+
+       if (mbwide())
+       {
+               mbstate_t       ps = {0};
+               ssize_t         len;
+               size_t          i;
+
+               memset(&ps, '\0', sizeof(ps));
+               (void)mbsinit(&ps);
+
+               for(inbuf = inbuf_start, i = 0 ;
+                       i < wchar_len ;
+                       i++, inbuf += len)
+               {
+                       len = wcrtomb(inbuf, wchar[i], &ps);
+                       if (len < 0)
+                       {
+                               inbuf[i]='\0';
+                               break;
+                       }
+               }
+       }
+       else
+       {
+               size_t  i;
+
+               /*
+                * We need this because Linux's |wcrtomb()| can't
+                * handle single-byte locales like ISO8859-15
+                * correctly
+                */
+               for(inbuf = inbuf_start, i = 0 ;
+                       i < wchar_len ;
+                       i++)
+                       *inbuf++=wchar[i];
+       }
+
+       inbytesleft = inbuf - inbuf_start;
+
+       inbuf = inbuf_start;
+       outbuf = outbuf_start;
+
+       res = iconv(_ast_info.wchar_to_ucs4.ic, &inbuf, &inbytesleft, &outbuf, 
&outbytesleft);
+
+       saved_errno = errno;
+       free(inbuf_start);
+       errno = saved_errno;
+
+       return (res);
+}
+
+
 int
 chrexp(register const char* s, char** p, int* m, register int flags)
 {
@@ -47,9 +233,10 @@
        const char*             b;
        char*                   r;
        int                     n;
-       int                     w;
+       bool                    w;
+       bool                    unicode;
 
-       w = 0;
+       w = unicode = false;
        for (;;)
        {
                b = s;
@@ -153,14 +340,18 @@
                                c = CC_vt;
                                break;
                        case 'u':
+                               unicode = true;
+                       case 'w':
                                q = s + 4;
                                goto wex;
                        case 'U':
+                               unicode = true;
+                       case 'W':
                                q = s + 8;
                        wex:
                                if (!(flags & FMT_EXP_WIDE))
                                        goto noexpand;
-                               w = 1;
+                               w = true;
                                goto hex;
                        case 'x':
                                q = s + 2;
@@ -191,7 +382,7 @@
                                                        break;
                                                e = 0;
                                                s++;
-                                               if (w && *s == 'U' && *(s + 1) 
== '+')
+                                               if (w && ((*s == 'U') || (*s == 
'W')) && *(s + 1) == '+')
                                                        s += 2;
                                                continue;
                                        case '}':
@@ -204,7 +395,7 @@
                                        }
                                        break;
                                }
-                               if (n <= 2 && !(flags & FMT_EXP_CHAR) || n > 2 
&& (w = 1) && !(flags & FMT_EXP_WIDE))
+                               if (n <= 2 && !(flags & FMT_EXP_CHAR) || n > 2 
&& (w = true) && !(flags & FMT_EXP_WIDE))
                                {
                                        c = '\\';
                                        s = b;
@@ -217,7 +408,7 @@
                        break;
                default:
                        if ((s - b) > 1)
-                               w = 1;
+                               w = true;
                        break;
                }
                break;
@@ -226,7 +417,18 @@
        if (p)
                *p = (char*)s;
        if (m)
-               *m = w;
+               *m = w?1:0;
+
+       if (w && unicode && (c > 127))
+       {
+               uint32_t in = c;
+               wchar_t out = -1;
+
+               if (ucs4_to_wchar(&in, 1, &out) < 0)
+                       c = -1;
+               else
+                       c = out;
+       }
        return c;
  noexpand:
        c = '\\';
diff -r -u original/src/lib/libast/string/stresc.c 
build_wchar/src/lib/libast/string/stresc.c
--- src/lib/libast/string/stresc.c      2010-05-01 07:46:26.000000000 +0200
+++ src/lib/libast/string/stresc.c      2013-09-02 00:54:07.332552362 +0200
@@ -36,7 +36,7 @@
 strexp(register char* s, int flags)
 {
        register char*          t;
-       register unsigned int   c;
+       register int            c;
        char*                   b;
        char*                   e;
        int                     w;
@@ -48,11 +48,28 @@
                {
                        c = chrexp(s - 1, &e, &w, flags);
                        s = e;
-                       if (w)
+
+                       if (c < 0)
                        {
-                               t += mbwide() ? mbconv(t, c) : wc2utf8(t, c);
+                               /* conversion failed == empty string */
                                continue;
                        }
+                       if (w)
+                       {
+                               if(mbwide())
+                               {
+                                       t += mbconv(t, c);
+                               }
+                               else
+                               {
+                                       /*
+                                        * single-byte locale, maybe "C" or
+                                        * extended single-byte locale like
+                                        * "en_US.ISO8859-15"
+                                        */
+                                       *t++ = c;
+                               }
+                       }
                }
                *t++ = c;
        }

_______________________________________________
ast-developers mailing list
[email protected]
http://lists.research.att.com/mailman/listinfo/ast-developers

Re: [ast-developers] [patch] Updated $'\w[hex]' patch for GB18030&&co. ... / was: Re: [patch] Accessing widechar codepoints without unicode (GB18030-related) ...

Reply via email to