Glenn, please find my comments inline:
Sunday, October 13, 2013  Glenn Fowler wrote:
> wchar.sh patch I think can be simplified by picking another mb char that we
> know is classified the same in all locales -- i picked euro but that was 
> probably
> too new

Just replying to wrap things up before I leave.
Which character would you have in mind? It must be a multibyte
character in a Unicode/UTF-8 locale and must also be representable in
two of the ISO8859-derived single-byte locales.
As far as I know, only the Euro character meets these requirements.

Also, Roland's patch has already been written and tested; it works, and
it covers far more detail than the original test case did.

I have attached the original patch for reconsideration.

Olga
-- 
      ,   _                                    _   ,
     { \/`o;====-    Olga Kryzhanovska   -====;o`\/ }
.----'-/`-/     olga.kryzhanov...@gmail.com   \-`\-'----.
 `'-..-| /       http://twitter.com/fleyta     \ |-..-'`
      /\/\     Solaris/BSD//C/C++ programmer   /\/\
      `--`                                      `--`
diff -r -u build_i386_64bit_debug/src/cmd/ksh93/sh/string.c 
build_i386_64bit_debug_wcharfix/src/cmd/ksh93/sh/string.c
--- src/cmd/ksh93/sh/string.c   2013-09-12 18:00:21.000000000 +0200
+++ src/cmd/ksh93/sh/string.c   2013-09-16 17:49:39.214780585 +0200
@@ -339,7 +339,9 @@
        if(!cp)
                return((char*)0);
        offset = staktell();
+#if SHOPT_MULTIBYTE
        mbinit();
+#endif
        state = ((c= mbchar(cp))==0);
 #if SHOPT_MULTIBYTE
        lc_unicode = quote=='u' ? 1 : quote=='U' ? 0 : !!(ast.locale.set & 
AST_LC_unicode);
@@ -349,7 +351,8 @@
        quote = '\'';
        if(isaletter(c) && (!lc_unicode || c<=0x7f))
        {
-               while((c=mbchar(cp)), isaname(c) && (!lc_unicode || c<=0x7f));
+               while((c=mbchar(cp)), isaname(c) && (!lc_unicode || c<=0x7f))
+                       ;
                if(c==0)
                        return((char*)string);
                if(c=='=')
diff -r -u build_i386_64bit_debug/src/cmd/ksh93/tests/wchar.sh 
build_i386_64bit_debug_wcharfix/src/cmd/ksh93/tests/wchar.sh
--- src/cmd/ksh93/tests/wchar.sh        2013-09-12 18:12:23.000000000 +0200
+++ src/cmd/ksh93/tests/wchar.sh        2013-09-16 17:29:11.934026898 +0200
@@ -32,31 +32,45 @@
 Command=${0##*/}
 integer Errors=0
 
-locales="en_US.UTF-8 en_US.ISO-8859-15 zh_CN.GB18030"
-supported="C.UTF-8"
+typeset -a locales=(
+       'en_US.UTF-8'
+       'en_US.ISO8859-15'
+       'zh_CN.GB18030'
+)
+typeset -a supported=( 'C.UTF-8' )
 
-for lc_all in $locales
+for lc_all in "${locales[@]}"
 do     if      {
                        PATH=/bin:/usr/bin:$PATH locale -a | grep -w 
${lc_all%.*} &&
                        LC_ALL=$lc_all PATH=/bin:/usr/bin:$PATH iconv -f 
${lc_all#*.} -t UTF-8 </dev/null
                } >/dev/null 2>&1
-       then    supported+=" $lc_all"
+       then    supported+=( "$lc_all" )
        else    : "LC_ALL=$lc_all not supported" :
        fi
 done
 
-exp0=$'0000000 24 27 e2 82 ac 27 0a'
 exp2=$'\'\\u[20ac]\''
 exp1='$'$exp2
 
-for lc_all in $supported
-do
+for lc_all in "${supported[@]}" ; do
+
+# We need both cases here since locales may or may not handle the
+# Euro symbol as alphabetical symbol
+# $ ~/bin/ksh -c 'LC_ALL=en_US.UTF-8 ; [[ $(printf "\u[20ac]") == 
~(E)[[:alpha:]] ]] && print isalpha || print isnotalpha'   
+# isnotalpha
+# $ ~/bin/ksh -c 'LC_ALL=zh_CN.GB18030 ; [[ $(printf "\u[20ac]") == 
~(E)[[:alpha:]] ]] && print isalpha || print isnotalpha'   
+# isalpha
+if $SHELL -c 'LC_ALL='${lc_all}' ; [[ $(printf "\u[20ac]") == ~(E)[[:alpha:]] 
]]' ; then
+       exp0=$'~(E)(0000000\ (e2\ 82\ ac)\ (0a))'                       # <euro>
+else
+       exp0=$'~(E)(0000000\ (24)\ (27)\ (e2\ 82\ ac)\ (27)\ (0a))'     # 
$'<euro>'
+fi
 
 got=$(LC_OPTIONS=nounicode $SHELL -c 'export LC_ALL='${lc_all}'; printf "%q\n" 
"$(printf "\u[20ac]")"' | iconv -f ${lc_all#*.} -t UTF-8 | od -tx1 | head -1)
-[[ $got == "$exp0" ]] || err_exit "${lc_all} nounicode FAILED -- locale 
probably not supported -- expected '$exp0', got '$got'"
+[[ $got == $exp0 ]] || err_exit "${lc_all} nounicode FAILED -- locale probably 
not supported -- expected '$exp0', got '$got'"
 
 got=$(LC_OPTIONS=unicode $SHELL -c 'export LC_ALL='${lc_all}'; printf 
"%(nounicode)q\n" "$(printf "\u[20ac]")"' | iconv -f ${lc_all#*.} -t UTF-8 | od 
-tx1 | head -1)
-[[ $got == "$exp0" ]] || err_exit "${lc_all} (nounicode) FAILED -- locale 
probably not supported -- expected '$exp0', got '$got'"
+[[ $got == $exp0 ]] || err_exit "${lc_all} (nounicode) FAILED -- locale 
probably not supported -- expected '$exp0', got '$got'"
 
 got=$(LC_OPTIONS=unicode $SHELL -c 'export LC_ALL='${lc_all}'; printf "%q\n" 
"$(printf "\u[20ac]")"')
 [[ $got == "$exp1" || $got == "$exp2" ]] || err_exit "${lc_all} unicode FAILED 
-- expected $exp1, got $got"
diff -r -u build_i386_64bit_debug/src/lib/libast/string/utf32stowcs.c 
build_i386_64bit_debug_wcharfix/src/lib/libast/string/utf32stowcs.c
--- src/lib/libast/string/utf32stowcs.c 2013-09-11 19:35:00.000000000 +0200
+++ src/lib/libast/string/utf32stowcs.c 2013-09-14 13:38:25.560529198 +0200
@@ -41,6 +41,8 @@
        if (ast.locale.set & AST_LC_utf8)
        {
                char    tmp[UTF8_LEN_MAX+1];
+               
+               mbinit();
 
                for (i = 0; i < n; i++)
                {
@@ -60,6 +62,10 @@
                        ast.mb_uc2wc = 0;
                if (ast.mb_uc2wc == 0)
                        return -1;
+               
+               /* Reset shift state */
+               (void)iconv(ast.mb_uc2wc, NULL, NULL, NULL, NULL);
+               
                if (n == 1)
                {
                        char    tmp_in[UTF8_LEN_MAX+1];
@@ -84,8 +90,13 @@
                                        return -1;
 #endif
                        }
-                       else if (mb2wc(wchar[0], tmp_out, outbuf - tmp_out) <= 
0)
-                               return -1;
+                       else
+                       {
+                               mbinit();
+
+                               if (mb2wc(wchar[0], tmp_out, outbuf - tmp_out) 
<= 0)
+                                       return -1;
+                       }
                        i = 1;
                }
                else
@@ -112,6 +123,8 @@
                        if (mbwide())
                        {
                                ssize_t len;
+                               
+                               mbinit();
 
                                for (outbuf = outbuf_start; i < n && outbuf < 
inbuf; i++, outbuf += len)
                                        if ((len = mb2wc(wchar[i], outbuf, 
inbuf - outbuf)) < 0)
diff -r -u build_i386_64bit_debug/src/lib/libast/string/wcstoutf32s.c 
build_i386_64bit_debug_wcharfix/src/lib/libast/string/wcstoutf32s.c
--- src/lib/libast/string/wcstoutf32s.c 2013-09-11 15:11:34.000000000 +0200
+++ src/lib/libast/string/wcstoutf32s.c 2013-09-14 13:37:59.191221921 +0200
@@ -34,12 +34,14 @@
 ssize_t
 wcstoutf32s(uint32_t* utf32, wchar_t* wchar, size_t n)
 {
-       size_t          i;
-       ssize_t         res;
+       size_t  i;
+       ssize_t res;
 
        if (ast.locale.set & AST_LC_utf8)
        {
                char    tmp[UTF8_LEN_MAX+1];
+               
+               mbinit();
 
                for (i = 0; i < n; i++)
                {
@@ -63,6 +65,10 @@
                        ast.mb_wc2uc = 0;
                if (ast.mb_wc2uc == 0)
                        return -1;
+                       
+               /* Reset shift state */
+               (void)iconv(ast.mb_wc2uc, NULL, NULL, NULL, NULL);
+
                inbytesleft     = n * mbmax();
                outbytesleft    = n * sizeof(uint32_t);
                inbuf_start     = oldof(0, char, (inbytesleft + 2) + 
outbytesleft, 0);
_______________________________________________
ast-developers mailing list
ast-developers@lists.research.att.com
http://lists.research.att.com/mailman/listinfo/ast-developers

Reply via email to