$ locale charmap
UTF-8
$ bash -c '"$@"' sh printf '%d\n' $'"\xff' $'"\xff' $'"\xff'
32767
0
0
That's because we store the return value of mblen() (which may be -1)
into a size_t (unsigned) variable.

See the patch below, which aligns the behaviour with that of other
shells, which use the byte value when the initial sequence of bytes
can't be converted to a character. So:

printf '%d\n' $'"\uff' $'"\xff'

outputs

255
255

The call to mblen() has been removed. It's wrong to use it here, as it
would return -1 on a string like "ábc\x80" in UTF-8, so we would end up
getting the value of the first byte instead of the codepoint of the
first character.

diff --git a/builtins/printf.def b/builtins/printf.def
index 3d374ff..67e5b59 100644
--- a/builtins/printf.def
+++ b/builtins/printf.def
@@ -1245,18 +1245,16 @@ asciicode ()
   register intmax_t ch;
 #if defined (HANDLE_MULTIBYTE)
   wchar_t wc;
-  size_t mblength, slen;
+  int mblength;
+  size_t slen;
 #endif
   DECLARE_MBSTATE;
 
 #if defined (HANDLE_MULTIBYTE)
   slen = strlen (garglist->word->word+1);
-  mblength = MBLEN (garglist->word->word+1, slen);
-  if (mblength > 1)
-    {
-      mblength = mbtowc (&wc, garglist->word->word+1, slen);
-      ch = wc;		/* XXX */
-    }
+  mblength = mbtowc (&wc, garglist->word->word+1, slen);
+  if (mblength > 0)
+    ch = wc;
   else
 #endif
     ch = (unsigned char)garglist->word->word[1];
diff --git a/support/bashbug.sh b/support/bashbug.sh
index 29ce134..01db35d 100644
-- 
Stephane