Glenn, please find my comments inline: On Sunday, October 13, 2013, Glenn Fowler wrote: > wchar.sh patch I think can be simplified by picking another mb char that we > know is classified the same in all locales -- i picked euro but that was > probably > too new
Just replying to wrap things up before I leave. Which character would you have in mind? It must be a multibyte character in a Unicode/UTF-8 locale and must also be represented in two of the ISO 8859-derived single-byte locales. As far as I know, only the Euro character meets these requirements. Also, Roland's patch has been written and tested, works, and covers far more details than the original test case did. I have attached the original patch for reconsideration. Olga -- , _ _ , { \/`o;====- Olga Kryzhanovska -====;o`\/ } .----'-/`-/ olga.kryzhanov...@gmail.com \-`\-'----. `'-..-| / http://twitter.com/fleyta \ |-..-'` /\/\ Solaris/BSD//C/C++ programmer /\/\ `--` `--`
diff -r -u build_i386_64bit_debug/src/cmd/ksh93/sh/string.c build_i386_64bit_debug_wcharfix/src/cmd/ksh93/sh/string.c --- src/cmd/ksh93/sh/string.c 2013-09-12 18:00:21.000000000 +0200 +++ src/cmd/ksh93/sh/string.c 2013-09-16 17:49:39.214780585 +0200 @@ -339,7 +339,9 @@ if(!cp) return((char*)0); offset = staktell(); +#if SHOPT_MULTIBYTE mbinit(); +#endif state = ((c= mbchar(cp))==0); #if SHOPT_MULTIBYTE lc_unicode = quote=='u' ? 1 : quote=='U' ? 0 : !!(ast.locale.set & AST_LC_unicode); @@ -349,7 +351,8 @@ quote = '\''; if(isaletter(c) && (!lc_unicode || c<=0x7f)) { - while((c=mbchar(cp)), isaname(c) && (!lc_unicode || c<=0x7f)); + while((c=mbchar(cp)), isaname(c) && (!lc_unicode || c<=0x7f)) + ; if(c==0) return((char*)string); if(c=='=') diff -r -u build_i386_64bit_debug/src/cmd/ksh93/tests/wchar.sh build_i386_64bit_debug_wcharfix/src/cmd/ksh93/tests/wchar.sh --- src/cmd/ksh93/tests/wchar.sh 2013-09-12 18:12:23.000000000 +0200 +++ src/cmd/ksh93/tests/wchar.sh 2013-09-16 17:29:11.934026898 +0200 @@ -32,31 +32,45 @@ Command=${0##*/} integer Errors=0 -locales="en_US.UTF-8 en_US.ISO-8859-15 zh_CN.GB18030" -supported="C.UTF-8" +typeset -a locales=( + 'en_US.UTF-8' + 'en_US.ISO8859-15' + 'zh_CN.GB18030' +) +typeset -a supported=( 'C.UTF-8' ) -for lc_all in $locales +for lc_all in "${locales[@]}" do if { PATH=/bin:/usr/bin:$PATH locale -a | grep -w ${lc_all%.*} && LC_ALL=$lc_all PATH=/bin:/usr/bin:$PATH iconv -f ${lc_all#*.} -t UTF-8 </dev/null } >/dev/null 2>&1 - then supported+=" $lc_all" + then supported+=( "$lc_all" ) else : "LC_ALL=$lc_all not supported" : fi done -exp0=$'0000000 24 27 e2 82 ac 27 0a' exp2=$'\'\\u[20ac]\'' exp1='$'$exp2 -for lc_all in $supported -do +for lc_all in "${supported[@]}" ; do + +# We need both cases here since locales may or may not handle the +# Euro symbol as alphabetical symbol +# $ ~/bin/ksh -c 'LC_ALL=en_US.UTF-8 ; [[ $(printf "\u[20ac]") == ~(E)[[:alpha:]] ]] && print isalpha || print isnotalpha' +# isnotalpha +# $ ~/bin/ksh -c 
'LC_ALL=zh_CN.GB18030 ; [[ $(printf "\u[20ac]") == ~(E)[[:alpha:]] ]] && print isalpha || print isnotalpha' +# isalpha +if $SHELL -c 'LC_ALL='${lc_all}' ; [[ $(printf "\u[20ac]") == ~(E)[[:alpha:]] ]]' ; then + exp0=$'~(E)(0000000\ (e2\ 82\ ac)\ (0a))' # <euro> +else + exp0=$'~(E)(0000000\ (24)\ (27)\ (e2\ 82\ ac)\ (27)\ (0a))' # $'<euro>' +fi got=$(LC_OPTIONS=nounicode $SHELL -c 'export LC_ALL='${lc_all}'; printf "%q\n" "$(printf "\u[20ac]")"' | iconv -f ${lc_all#*.} -t UTF-8 | od -tx1 | head -1) -[[ $got == "$exp0" ]] || err_exit "${lc_all} nounicode FAILED -- locale probably not supported -- expected '$exp0', got '$got'" +[[ $got == $exp0 ]] || err_exit "${lc_all} nounicode FAILED -- locale probably not supported -- expected '$exp0', got '$got'" got=$(LC_OPTIONS=unicode $SHELL -c 'export LC_ALL='${lc_all}'; printf "%(nounicode)q\n" "$(printf "\u[20ac]")"' | iconv -f ${lc_all#*.} -t UTF-8 | od -tx1 | head -1) -[[ $got == "$exp0" ]] || err_exit "${lc_all} (nounicode) FAILED -- locale probably not supported -- expected '$exp0', got '$got'" +[[ $got == $exp0 ]] || err_exit "${lc_all} (nounicode) FAILED -- locale probably not supported -- expected '$exp0', got '$got'" got=$(LC_OPTIONS=unicode $SHELL -c 'export LC_ALL='${lc_all}'; printf "%q\n" "$(printf "\u[20ac]")"') [[ $got == "$exp1" || $got == "$exp2" ]] || err_exit "${lc_all} unicode FAILED -- expected $exp1, got $got" diff -r -u build_i386_64bit_debug/src/lib/libast/string/utf32stowcs.c build_i386_64bit_debug_wcharfix/src/lib/libast/string/utf32stowcs.c --- src/lib/libast/string/utf32stowcs.c 2013-09-11 19:35:00.000000000 +0200 +++ src/lib/libast/string/utf32stowcs.c 2013-09-14 13:38:25.560529198 +0200 @@ -41,6 +41,8 @@ if (ast.locale.set & AST_LC_utf8) { char tmp[UTF8_LEN_MAX+1]; + + mbinit(); for (i = 0; i < n; i++) { @@ -60,6 +62,10 @@ ast.mb_uc2wc = 0; if (ast.mb_uc2wc == 0) return -1; + + /* Reset shift state */ + (void)iconv(ast.mb_uc2wc, NULL, NULL, NULL, NULL); + if (n == 1) { char 
tmp_in[UTF8_LEN_MAX+1]; @@ -84,8 +90,13 @@ return -1; #endif } - else if (mb2wc(wchar[0], tmp_out, outbuf - tmp_out) <= 0) - return -1; + else + { + mbinit(); + + if (mb2wc(wchar[0], tmp_out, outbuf - tmp_out) <= 0) + return -1; + } i = 1; } else @@ -112,6 +123,8 @@ if (mbwide()) { ssize_t len; + + mbinit(); for (outbuf = outbuf_start; i < n && outbuf < inbuf; i++, outbuf += len) if ((len = mb2wc(wchar[i], outbuf, inbuf - outbuf)) < 0) diff -r -u build_i386_64bit_debug/src/lib/libast/string/wcstoutf32s.c build_i386_64bit_debug_wcharfix/src/lib/libast/string/wcstoutf32s.c --- src/lib/libast/string/wcstoutf32s.c 2013-09-11 15:11:34.000000000 +0200 +++ src/lib/libast/string/wcstoutf32s.c 2013-09-14 13:37:59.191221921 +0200 @@ -34,12 +34,14 @@ ssize_t wcstoutf32s(uint32_t* utf32, wchar_t* wchar, size_t n) { - size_t i; - ssize_t res; + size_t i; + ssize_t res; if (ast.locale.set & AST_LC_utf8) { char tmp[UTF8_LEN_MAX+1]; + + mbinit(); for (i = 0; i < n; i++) { @@ -63,6 +65,10 @@ ast.mb_wc2uc = 0; if (ast.mb_wc2uc == 0) return -1; + + /* Reset shift state */ + (void)iconv(ast.mb_wc2uc, NULL, NULL, NULL, NULL); + inbytesleft = n * mbmax(); outbytesleft = n * sizeof(uint32_t); inbuf_start = oldof(0, char, (inbytesleft + 2) + outbytesleft, 0);
_______________________________________________ ast-developers mailing list ast-developers@lists.research.att.com http://lists.research.att.com/mailman/listinfo/ast-developers