On Mon, Sep 2, 2013 at 1:42 AM, Roland Mainz <[email protected]> wrote:
> On Mon, Sep 2, 2013 at 1:09 AM, Roland Mainz <[email protected]> wrote:
>> On Mon, Sep 2, 2013 at 1:06 AM, Roland Mainz <[email protected]>
>> wrote:
>>> On Mon, Sep 2, 2013 at 12:36 AM, Roland Mainz <[email protected]>
>>> wrote:
>>>> On Mon, Aug 5, 2013 at 5:01 AM, Roland Mainz <[email protected]>
>>>> wrote:
>>>>> On Mon, Aug 5, 2013 at 4:13 AM, Roland Mainz <[email protected]>
>>>>> wrote:
>> [snip]
>>> ** More notes:
>>> 1. $ ksh -c 'export LC_ALL=en_US.ISO8859-15 ; printf "x\u[20ac]x\n" |
>>> iconv -f ISO8859-15 -t UTF-8' # now works... the correct output is
>>> "x€x"
>>> 2. The reason why this didn't work in the *002* patch was that the
>>> original code in ast-ksh.2013-08-29 used |wc2utf8()| on an "extended
>>> single-byte locale" like "en_US.ISO8859-15" ... this can **never**
>>> work because the locale is not UTF-8 based
>>>
>>> Glenn/David: What do you think about the patch ?
>>
>> I forgot one note:
>> - The patch _explicitly_ uses |iconv()| even for UTF-8 locales to
>> weed out unassigned codepoints to fulfill the Unicode requirement that
>> no unassigned codepoints should be accessible.
>
> Last updated patch for tonight:
>
> Attached (as "astksh20130829_printf_w_gb18030_004.diff.txt") is an
> updated version of the patch which now automagically uses "\u[hex]" as
> output instead of "\w[hex]" for UTF-8 locales, making the output 100%
> compatible with previous ksh93 versions except for the described bugs in
> those versions.
>
> BTW: Some example usage for $ set -o convunicode # (byte "a4" is the
> Euro character in ISO8859-15):
> -- snip --
> $ ksh -c 'export LC_ALL=en_US.ISO8859-15 ; printf "euro=|%q|\n"
> "$(printf "\xa4")" | iconv -f ISO8859-15 -t UTF-8'
> euro=|$'€'|
> $ ksh -o convunicode -c 'export LC_ALL=en_US.ISO8859-15 ; printf
> "euro=|%q|\n" "$(printf "\xa4")" | iconv -f ISO8859-15 -t UTF-8'
> euro=|$'\u[20ac]'|
> -- snip --
>
> Comments/rants/etc. welcome...
>
> ... and David/Glenn: Please don't remove the comments in the code if
> you take the patch... there's a reason why I'm quite verbose in the
> comments (short: Hideously complex and lots of traps in the code) ...
Attached (as "astksh20130829_printf_w_gb18030_005.diff.txt") is a
fixed patch... the previous one missed a |continue;| statement which
caused failures in the "locale.sh" test module (found by Wang Shouhua)
...
----
Bye,
Roland
--
__ . . __
(o.\ \/ /.o) [email protected]
\__\/\/__/ MPEG specialist, C&&JAVA&&Sun&&Unix programmer
/O /==\ O\ TEL +49 641 3992797
(;O/ \/ \O;)
diff -r -u original/src/cmd/ksh93/bltins/print.c
build_wchar/src/cmd/ksh93/bltins/print.c
--- src/cmd/ksh93/bltins/print.c 2013-08-22 16:10:07.000000000 +0200
+++ src/cmd/ksh93/bltins/print.c 2013-09-02 00:54:13.624865935 +0200
@@ -421,9 +421,26 @@
s = p;
#if SHOPT_MULTIBYTE
#if defined(FMT_EXP_WIDE)
+ if (c < 0)
+ {
+ /* conversion failed == empty string */
+ continue;
+ }
if(w)
{
- t += mbwide() ? mbconv(t, c) : wc2utf8(t, c);
+ if(mbwide())
+ {
+ t += mbconv(t, c);
+ }
+ else
+ {
+ /*
+ * single-byte locale, maybe "C" or
+ * extended single-byte locale like
+ * "en_US.ISO8859-15"
+ */
+ *t++ = c;
+ }
continue;
}
#else
diff -r -u original/src/cmd/ksh93/data/builtins.c
build_wchar/src/cmd/ksh93/data/builtins.c
--- src/cmd/ksh93/data/builtins.c 2013-08-27 21:21:27.000000000 +0200
+++ src/cmd/ksh93/data/builtins.c 2013-09-01 13:36:19.741340598 +0200
@@ -1213,8 +1213,16 @@
"the collating element \aname\a.]"
"[+-?The escape sequence \b\\x{\b\ahex\a\b}\b expands to the "
"character corresponding to the hexidecimal value \ahex\a.]"
- "[+-?The escape sequence \b\\u{\b\ahex\a\b}\b expands to the unicode "
- "character corresponding to the hexidecimal value \ahex\a.]"
+ "[+-?The escape sequence \b\\u[\b\ahex\a\b]]\b or "
+ "\b\\u{\b\ahex\a\b}\b expands to the unicode character "
+ "corresponding to the unicode code point defined "
+ "by the hexidecimal value \ahex\a. If the character is "
+ "not available in the current locale the escape sequence "
+ "will return an empty string.]"
+ "[+-?The escape sequence \b\\w[\b\ahex\a\b]]\b or "
+ "\b\\w{\b\ahex\a\b}\b expands to the character "
+ "corresponding to the (wchar_t) code point defined "
+ "by the hexidecimal value \ahex\a in the current locale.]"
"[+-?The format modifier flag \b=\b can be used to center a field to "
"a specified width.]"
"[+-?The format modifier flag \bL\b can be used with the \bc\b and "
Only in build_wchar/src/cmd/ksh93/data: builtins.c.orig
diff -r -u original/src/cmd/ksh93/data/options.c
build_wchar/src/cmd/ksh93/data/options.c
--- src/cmd/ksh93/data/options.c 2013-08-07 16:02:08.000000000 +0200
+++ src/cmd/ksh93/data/options.c 2013-09-01 13:50:30.124285103 +0200
@@ -48,6 +48,7 @@
bashopt("cdspell", SH_CDSPELL)
bashopt("checkhash", SH_CHECKHASH)
bashopt("checkwinsize", SH_CHECKWINSIZE)
+ "convunicode", SH_CONVUNICODE,
"noclobber", SH_NOCLOBBER,
bashopt("dotglob", SH_DOTGLOB)
"emacs", SH_EMACS,
diff -r -u original/src/cmd/ksh93/include/defs.h
build_wchar/src/cmd/ksh93/include/defs.h
--- src/cmd/ksh93/include/defs.h 2013-08-26 21:50:32.000000000 +0200
+++ src/cmd/ksh93/include/defs.h 2013-09-01 13:34:27.544653918 +0200
@@ -325,6 +325,7 @@
#define SH_BRACEEXPAND 42
#define SH_POSIX 46
#define SH_MULTILINE 47
+#define SH_CONVUNICODE 48
#define SH_NOPROFILE 78
#define SH_NOUSRPROFILE 79
@@ -498,6 +499,7 @@
#define sh_sigcheck(shp)
do{if(shp->trapnote&SH_SIGSET)sh_exit((shp),SH_EXITSIG);} while(0)
extern int32_t sh_mailchk;
+extern bool sh_fmtstr_force_unicode;
extern const char e_dict[];
/* sh_printopts() mode flags -- set --[no]option by default */
diff -r -u original/src/cmd/ksh93/sh/init.c build_wchar/src/cmd/ksh93/sh/init.c
--- src/cmd/ksh93/sh/init.c 2013-08-27 22:48:50.000000000 +0200
+++ src/cmd/ksh93/sh/init.c 2013-09-01 13:35:36.775162791 +0200
@@ -1627,6 +1627,7 @@
/* initialize jobs table */
job_clear(shp);
sh_onoption(shp,SH_MULTILINE);
+ sh_offoption(shp,SH_CONVUNICODE);
if(argc>0)
{
/* check for restricted shell */
diff -r -u original/src/cmd/ksh93/sh/string.c
build_wchar/src/cmd/ksh93/sh/string.c
--- src/cmd/ksh93/sh/string.c 2013-06-12 22:53:57.000000000 +0200
+++ src/cmd/ksh93/sh/string.c 2013-09-02 03:44:48.961453853 +0200
@@ -30,6 +30,7 @@
#include "shtable.h"
#include "lexstates.h"
#include "national.h"
+#include <lc.h>
#if _hdr_wctype
# include <wctype.h>
@@ -324,6 +325,7 @@
return(stakptr(offset));
}
+
/*
* print <str> quoting chars so that it can be read by the shell
* puts null terminated result on stack, but doesn't freeze it
@@ -333,6 +335,9 @@
register const char *cp = string, *op;
register int c, state, type=quote;
int offset;
+ bool force_unicode = sh_isoption(&sh,SH_CONVUNICODE);
+ bool is_utf8_locale = (lcinfo(LC_CTYPE)->lc->flags &
LC_utf8)?true:false;
+
if(!cp)
return((char*)0);
offset = staktell();
@@ -384,7 +389,7 @@
}
else
{
- int isbyte=0;
+ bool isbyte = false;
if(quote=='"')
stakputc('"');
else
@@ -427,19 +432,73 @@
break;
default:
#if SHOPT_MULTIBYTE
- isbyte = 0;
if(c<0)
{
c = *((unsigned char *)op);
cp = op+1;
- isbyte = 1;
+ isbyte = true;
+ }
+ else
+ {
+ isbyte = false;
+ }
+
+ /*
+ * If we convert the data to Unicode we want
+ * to produce portable ASCII-only output and
+ * therefore convert all non-ASCII (e.g.
+ * |c > 127|) characters to \u[] sequences.
+ *
+ * Note that this requires to pass all data
+ * through |wchar_to_ucs4()| to handle
+ * "extended" single-byte locales like
+ * "en_US.ISO8859-15" or "ru_RU.koi8r"
+ *
+ * If we do not convert to Unicode we only
+ * convert the non-printable characters to
+ * locale-specific \w[] sequences.
+ *
+ * Be *VERY* careful with the logic below -
+ * some single-byte locale implementations
+ * have wchar_t values > 127 but can return
+ * bytes, too
+ */
+#if 0
+ sfprintf(sfstderr, "## wc=%d, uc=%lx,
isbyte=%d, mbwide()=%d\n",
+ wc, (long)uc, (int)isbyte,
(int)mbwide());
+#endif
+
+ if(force_unicode && !isbyte)
+ {
+ wchar_t wc = c;
+ uint32_t uc = 0;
+
+ if (wchar_to_ucs4(&wc, 1, &uc) < 0)
+ continue;
+
+ /*
+ * We assume that all locales have ASCII
+ * as their base character set
+ */
+ if(!iswprint(c) || (uc > 127))
+ {
+ sfprintf(staksp,"\\u[%lx]",
(unsigned long)uc);
+ continue;
+ }
}
- if(mbwide() && ((cp-op)>1))
+ else if(mbwide() && !isbyte)
{
- sfprintf(staksp,"\\u[%x]",c);
- continue;
+ if(!iswprint(c))
+ {
+ if(is_utf8_locale)
+
sfprintf(staksp,"\\u[%x]", c);
+ else
+
sfprintf(staksp,"\\w[%x]", c);
+ continue;
+ }
}
- else if(!iswprint(c) || isbyte)
+
+ if(!iswprint(c) || isbyte)
#else
if(!isprint(c))
#endif
diff -r -u original/src/cmd/ksh93/tests/locale.sh
build_wchar/src/cmd/ksh93/tests/locale.sh
--- src/cmd/ksh93/tests/locale.sh 2013-03-06 15:18:45.000000000 +0100
+++ src/cmd/ksh93/tests/locale.sh 2013-09-02 03:07:32.325164923 +0200
@@ -205,11 +205,17 @@
# multibyte identifiers
exp=OK
-got=$(LC_ALL=C.UTF-8 $SHELL -c $'\u[5929]=OK; print ${\u[5929]}' 2>&1)
+got=$(export LC_ALL='en_US.UTF-8' ; $SHELL -c "$(printf '\u[5929]=OK; print
${\u[5929]}')" 2>&1)
[[ $got == "$exp" ]] || err_exit "multibyte variable definition/expansion
failed -- expected '$exp', got '$got'"
-got=$(LC_ALL=C.UTF-8 $SHELL -c $'function \u[5929]\n{\nprint OK;\n}; \u[5929]'
2>&1)
+got=$(export LC_ALL='en_US.UTF-8' ; $SHELL -c "$(printf 'function
\u[5929]\n{\nprint OK;\n}; \u[5929]')" 2>&1)
[[ $got == "$exp" ]] || err_exit "multibyte ksh function definition/execution
failed -- expected '$exp', got '$got'"
-got=$(LC_ALL=C.UTF-8 $SHELL -c $'\u[5929]()\n{\nprint OK;\n}; \u[5929]' 2>&1)
+got=$(export LC_ALL='en_US.UTF-8' ; $SHELL -c "$(printf '\u[5929]()\n{\nprint
OK;\n}; \u[5929]')" 2>&1)
+[[ $got == "$exp" ]] || err_exit "multibyte posix function
definition/execution failed -- expected '$exp', got '$got'"
+got=$(export LC_ALL='en_US.UTF-8' ; $SHELL -c "$(printf '\w[5929]=OK; print
${\w[5929]}')" 2>&1)
+[[ $got == "$exp" ]] || err_exit "multibyte variable definition/expansion
failed -- expected '$exp', got '$got'"
+got=$(export LC_ALL='en_US.UTF-8' ; $SHELL -c "$(printf 'function
\w[5929]\n{\nprint OK;\n}; \w[5929]')" 2>&1)
+[[ $got == "$exp" ]] || err_exit "multibyte ksh function definition/execution
failed -- expected '$exp', got '$got'"
+got=$(export LC_ALL='en_US.UTF-8' ; $SHELL -c "$(printf '\w[5929]()\n{\nprint
OK;\n}; \w[5929]')" 2>&1)
[[ $got == "$exp" ]] || err_exit "multibyte posix function
definition/execution failed -- expected '$exp', got '$got'"
# this locale is supported by ast on all platforms
@@ -332,8 +338,8 @@
err_exit "unicode char$p1 ${x#?} $p2 in locale $LC_ALL"
fi
unset x
- x=$(printf "hello\u[20ac]\xee world")
- [[ $(print -r -- "$x") == $'hello\u[20ac]\xee world' ]] || err_exit '%q
with unicode and non-unicode not working'
+ x=$(export LC_ALL='en_US.UTF-8' ; printf "hello\u[20ac]\xee world")
+ LC_ALL='en_US.UTF-8' eval $'[[ $(print -r -- "$x") ==
$\'hello\\u[20ac]\\xee world\' ]]' || err_exit '%q with unicode and non-unicode
not working'
if [[ $(whence od) ]]
then got='68 65 6c 6c 6f e2 82 ac ee 20 77 6f 72 6c 64 0a'
[[ $(print -r -- "$x" | od -An -tx1) == "$got" ]] || err_exit
"incorrect string from printf %q"
diff -r -u original/src/lib/libast/features/map.c
build_wchar/src/lib/libast/features/map.c
--- src/lib/libast/features/map.c 2013-08-20 07:41:54.000000000 +0200
+++ src/lib/libast/features/map.c 2013-09-01 07:57:52.081818481 +0200
@@ -105,7 +105,7 @@
#if !_WINIX
printf("#undef fgetcwd\n");
printf("#define fgetcwd _ast_fgetcwd\n");
- printf("extern char* fgetcwd(fd, char*, size_t);\n");
+ printf("extern char* fgetcwd(int, char*, size_t);\n");
printf("#undef getcwd\n");
printf("#define getcwd _ast_getcwd\n");
printf("extern char* getcwd(char*, size_t);\n");
diff -r -u original/src/lib/libast/include/ast.h
build_wchar/src/lib/libast/include/ast.h
--- src/lib/libast/include/ast.h 2013-08-23 23:23:00.000000000 +0200
+++ src/lib/libast/include/ast.h 2013-09-01 10:35:20.926054848 +0200
@@ -398,6 +398,8 @@
extern int struniq(char**, int);
extern int strvcmp(const char*, const char*);
extern int wc2utf8(char*, uint32_t);
+extern ssize_t ucs4_to_wchar(uint32_t *, size_t, wchar_t *);
+extern ssize_t wchar_to_ucs4(wchar_t *, size_t, uint32_t *);
#undef extern
diff -r -u original/src/lib/libast/include/ast_std.h
build_wchar/src/lib/libast/include/ast_std.h
--- src/lib/libast/include/ast_std.h 2013-08-23 16:04:28.000000000 +0200
+++ src/lib/libast/include/ast_std.h 2013-09-01 16:54:57.560921017 +0200
@@ -257,7 +257,18 @@
int pwd;
- char pad[936 - sizeof(void*) - sizeof(int)];
+ struct
+ {
+ long ic; /* should be |iconv_t| */
+ uint32_t locale_serial;
+ } ucs4_to_wchar;
+ struct
+ {
+ long ic; /* should be |iconv_t| */
+ uint32_t locale_serial;
+ } wchar_to_ucs4;
+
+ char pad[936 - sizeof(void*) - sizeof(int) - 2*(sizeof(long)
+ sizeof(uint32_t))];
} _Ast_info_t;
diff -r -u original/src/lib/libast/string/chresc.c
build_wchar/src/lib/libast/string/chresc.c
--- src/lib/libast/string/chresc.c 2013-07-16 20:02:13.000000000 +0200
+++ src/lib/libast/string/chresc.c 2013-09-01 20:51:14.406571159 +0200
@@ -38,6 +38,192 @@
#include <regex.h>
#endif
+#include <errno.h>
+#include <wchar.h>
+#include <endian.h>
+#include "/usr/include/iconv.h" // WHY ?
+#include <langinfo.h>
+#include <locale.h>
+
+
+ssize_t ucs4_to_wchar(uint32_t *ucs4, size_t ucs4_len, wchar_t *wchar)
+{
+ char *inbuf;
+ char *outbuf,
+ *outbuf_start;
+ size_t inbytesleft,
+ outbytesleft;
+ ssize_t res;
+ size_t mb_buf_len;
+ int saved_errno;
+
+ if ((_ast_info.ucs4_to_wchar.locale_serial != _ast_info.locale.serial)
||
+ (_ast_info.ucs4_to_wchar.ic == 0) ||
+ (_ast_info.ucs4_to_wchar.ic == -1))
+ {
+ _ast_info.ucs4_to_wchar.locale_serial = _ast_info.locale.serial;
+
+ if ((_ast_info.ucs4_to_wchar.ic != 0) &&
+ (_ast_info.ucs4_to_wchar.ic != -1))
+ {
+ (void)iconv_close((iconv_t)_ast_info.ucs4_to_wchar.ic);
+ }
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+ _ast_info.ucs4_to_wchar.ic =
(long)iconv_open(nl_langinfo(CODESET), "UTF32LE");
+#elif __BYTE_ORDER == __BIG_ENDIAN
+ _ast_info.ucs4_to_wchar.ic =
(long)iconv_open(nl_langinfo(CODESET), "UTF32BE");
+#else
+#error Unknown __BYTE_ORDER
+#endif
+ }
+
+ if (_ast_info.ucs4_to_wchar.ic == -1)
+ return (-1);
+
+ inbytesleft = ucs4_len * sizeof(uint32_t);
+ mb_buf_len = ucs4_len * MB_CUR_MAX;
+ inbuf = (char *)ucs4;
+ outbytesleft = mb_buf_len;
+ outbuf_start = (char *)malloc(mb_buf_len+2);
+ if(!outbuf_start)
+ return(-1);
+
+ outbuf = outbuf_start;
+
+ res = iconv((iconv_t)_ast_info.ucs4_to_wchar.ic, &inbuf, &inbytesleft,
&outbuf, &outbytesleft);
+
+ if (res >= 0)
+ {
+ if (mbwide())
+ {
+ ssize_t len;
+ size_t i;
+
+ for(outbuf = outbuf_start, i = 0 ;
+ i < ucs4_len ;
+ i++, outbuf += len)
+ {
+ len = mbtowc(&wchar[i], outbuf, MB_CUR_MAX);
+ if (len < 0)
+ {
+ wchar[i]=L'\0';
+ break;
+ }
+ }
+ }
+ else
+ {
+ size_t i;
+
+ for(outbuf = outbuf_start, i = 0 ;
+ i < ucs4_len ;
+ i++)
+ wchar[i]=(unsigned char)(*outbuf++);
+ }
+ }
+
+ saved_errno = errno;
+ free(outbuf_start);
+ errno = saved_errno;
+
+ return (res);
+}
+
+
+ssize_t wchar_to_ucs4(wchar_t *wchar, size_t wchar_len, uint32_t *ucs4)
+{
+ char *inbuf;
+ char *inbuf_start;
+ char *outbuf,
+ *outbuf_start;
+ size_t inbytesleft,
+ outbytesleft;
+ ssize_t res;
+ size_t mb_buf_len;
+ int saved_errno;
+
+ if ((_ast_info.wchar_to_ucs4.locale_serial != _ast_info.locale.serial)
||
+ (_ast_info.wchar_to_ucs4.ic == 0) ||
+ (_ast_info.wchar_to_ucs4.ic == -1))
+ {
+ _ast_info.wchar_to_ucs4.locale_serial = _ast_info.locale.serial;
+
+ if ((_ast_info.wchar_to_ucs4.ic != 0) &&
+ (_ast_info.wchar_to_ucs4.ic != -1))
+ {
+ (void)iconv_close((iconv_t)_ast_info.wchar_to_ucs4.ic);
+ }
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+ _ast_info.wchar_to_ucs4.ic = (long)iconv_open("UTF32LE",
nl_langinfo(CODESET));
+#elif __BYTE_ORDER == __BIG_ENDIAN
+ _ast_info.wchar_to_ucs4.ic = (long)iconv_open("UTF32BE",
nl_langinfo(CODESET));
+#else
+#error Unknown __BYTE_ORDER
+#endif
+ }
+
+ if (_ast_info.wchar_to_ucs4.ic == (iconv_t)-1)
+ return (-1);
+
+ mb_buf_len = wchar_len * MB_CUR_MAX;
+ inbuf_start = (char *)malloc(mb_buf_len+2);
+ outbytesleft = wchar_len * sizeof(uint32_t);
+ outbuf_start = (char *)ucs4;
+ if(!inbuf_start)
+ return(-1);
+
+ if (mbwide())
+ {
+ mbstate_t ps = {0};
+ ssize_t len;
+ size_t i;
+
+ memset(&ps, '\0', sizeof(ps));
+ (void)mbsinit(&ps);
+
+ for(inbuf = inbuf_start, i = 0 ;
+ i < wchar_len ;
+ i++, inbuf += len)
+ {
+ len = wcrtomb(inbuf, wchar[i], &ps);
+ if (len < 0)
+ {
+ inbuf[i]='\0';
+ break;
+ }
+ }
+ }
+ else
+ {
+ size_t i;
+
+ /*
+ * We need this because Linux's |wcrtomb()| can't
+ * handle single-byte locales like ISO8859-15
+ * correctly
+ */
+ for(inbuf = inbuf_start, i = 0 ;
+ i < wchar_len ;
+ i++)
+ *inbuf++=wchar[i];
+ }
+
+ inbytesleft = inbuf - inbuf_start;
+
+ inbuf = inbuf_start;
+ outbuf = outbuf_start;
+
+ res = iconv(_ast_info.wchar_to_ucs4.ic, &inbuf, &inbytesleft, &outbuf,
&outbytesleft);
+
+ saved_errno = errno;
+ free(inbuf_start);
+ errno = saved_errno;
+
+ return (res);
+}
+
+
int
chrexp(register const char* s, char** p, int* m, register int flags)
{
@@ -47,9 +233,10 @@
const char* b;
char* r;
int n;
- int w;
+ bool w;
+ bool unicode;
- w = 0;
+ w = unicode = false;
for (;;)
{
b = s;
@@ -153,14 +340,18 @@
c = CC_vt;
break;
case 'u':
+ unicode = true;
+ case 'w':
q = s + 4;
goto wex;
case 'U':
+ unicode = true;
+ case 'W':
q = s + 8;
wex:
if (!(flags & FMT_EXP_WIDE))
goto noexpand;
- w = 1;
+ w = true;
goto hex;
case 'x':
q = s + 2;
@@ -191,7 +382,7 @@
break;
e = 0;
s++;
- if (w && *s == 'U' && *(s + 1)
== '+')
+ if (w && ((*s == 'U') || (*s ==
'W')) && *(s + 1) == '+')
s += 2;
continue;
case '}':
@@ -204,7 +395,7 @@
}
break;
}
- if (n <= 2 && !(flags & FMT_EXP_CHAR) || n > 2
&& (w = 1) && !(flags & FMT_EXP_WIDE))
+ if (n <= 2 && !(flags & FMT_EXP_CHAR) || n > 2
&& (w = true) && !(flags & FMT_EXP_WIDE))
{
c = '\\';
s = b;
@@ -217,7 +408,7 @@
break;
default:
if ((s - b) > 1)
- w = 1;
+ w = true;
break;
}
break;
@@ -226,7 +417,18 @@
if (p)
*p = (char*)s;
if (m)
- *m = w;
+ *m = w?1:0;
+
+ if (w && unicode && (c > 127))
+ {
+ uint32_t in = c;
+ wchar_t out = -1;
+
+ if (ucs4_to_wchar(&in, 1, &out) < 0)
+ c = -1;
+ else
+ c = out;
+ }
return c;
noexpand:
c = '\\';
diff -r -u original/src/lib/libast/string/stresc.c
build_wchar/src/lib/libast/string/stresc.c
--- src/lib/libast/string/stresc.c 2010-05-01 07:46:26.000000000 +0200
+++ src/lib/libast/string/stresc.c 2013-09-02 03:01:27.949212696 +0200
@@ -36,7 +36,7 @@
strexp(register char* s, int flags)
{
register char* t;
- register unsigned int c;
+ register int c;
char* b;
char* e;
int w;
@@ -48,9 +48,27 @@
{
c = chrexp(s - 1, &e, &w, flags);
s = e;
+
+ if (c < 0)
+ {
+ /* conversion failed == empty string */
+ continue;
+ }
if (w)
{
- t += mbwide() ? mbconv(t, c) : wc2utf8(t, c);
+ if(mbwide())
+ {
+ t += mbconv(t, c);
+ }
+ else
+ {
+ /*
+ * single-byte locale, maybe "C" or
+ * extended single-byte locale like
+ * "en_US.ISO8859-15"
+ */
+ *t++ = c;
+ }
continue;
}
}
_______________________________________________
ast-developers mailing list
[email protected]
http://lists.research.att.com/mailman/listinfo/ast-developers