https://sourceware.org/git/gitweb.cgi?p=newlib-cygwin.git;h=8a87d3501f63239ab24104a41fdb4a122dba2179
commit 8a87d3501f63239ab24104a41fdb4a122dba2179 Author: Corinna Vinschen <cori...@vinschen.de> AuthorDate: Wed Jul 23 22:42:01 2025 +0200 Commit: Corinna Vinschen <cori...@vinschen.de> CommitDate: Thu Jul 24 12:11:12 2025 +0200 Cygwin: _sys_mbstowcs: fix handling invalid 4-byte UTF-8 sequences When a 4 byte utf-8 sequence has an invalid 4th byte, it's actually an invalid 3 byte sequence. In this case we already generated the high surrogate and only realize the problem when byte 4 doesn't match. At this point _sys_mbstowcs transposes the invalid 4th byte into the private use area. This is wrong. The invalid byte sequence here is the 3 byte sequence already converted to a high surrogate, not the trailing 4th byte. Fix this by backtracking to the start of the broken sequence and overwrite the already written high surrogate with a sequence of the original three bytes transposed to the private use area. Reset the mbstate and restart normal conversion at the non-matching 4th byte, which might start a new multibyte sequence. The resulting wide-char string can be converted back to multibyte and back again to wide-char, and the result will be identical, even if the multibyte sequence differs from the original sequence. Fixes: e44b9069cd227 ("* strfuncs.cc (sys_cp_mbstowcs): Treat src as unsigned char *. 
Convert failure of f_mbtowc into a single malformed utf-16 value.") Signed-off-by: Corinna Vinschen <cori...@vinschen.de> (cherry picked from commit 1463b41d403e861e4033387cdc71006e1664203a) Diff: --- winsup/cygwin/strfuncs.cc | 51 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 47 insertions(+), 4 deletions(-) diff --git a/winsup/cygwin/strfuncs.cc b/winsup/cygwin/strfuncs.cc index cb7911c6b83d..caaf6b786295 100644 --- a/winsup/cygwin/strfuncs.cc +++ b/winsup/cygwin/strfuncs.cc @@ -1071,6 +1071,7 @@ _sys_mbstowcs (mbtowc_p f_mbtowc, wchar_t *dst, size_t dlen, const char *src, { wchar_t *ptr = dst; unsigned const char *pmbs = (unsigned const char *) src; + unsigned const char *got_high_surrogate = NULL; size_t count = 0; size_t len = dlen; int bytes; @@ -1142,16 +1143,58 @@ _sys_mbstowcs (mbtowc_p f_mbtowc, wchar_t *dst, size_t dlen, const char *src, Invalid bytes in a multibyte sequence are converted to the private use area which is already used to store ASCII - chars invalid in Windows filenames. This technque allows + chars invalid in Windows filenames. This technique allows to store them in a symmetric way. */ - bytes = 1; - if (dst) - *ptr = L'\xf000' | *pmbs; + + /* Special case high surrogate: if we already converted the first + 3 bytes of a sequence to a high surrogate, and only then encounter + a non-matching fourth byte, the sequence is simply cut short. In + that case not the currently handled 4th byte is the invalid + sequence, but the 3 bytes converted to the high surrogate. So we + have to backtrack to the high surrogate and convert it to a + sequence of bytes in the private use area. Next, reset the + mbstate and retry to convert starting at the current byte.
*/ + if (got_high_surrogate) + { + if (dst) + { + --ptr; + *ptr++ = L'\xf000' | *got_high_surrogate++; + /* we know len > 0 at this point */ + *ptr++ = L'\xf000' | *got_high_surrogate++; + } + --len; + if (len > 0) + { + if (dst) + *ptr++ = L'\xf000' | *got_high_surrogate++; + --len; + } + count += 2; /* Actually 3, but we already counted one when + generating the high surrogate. */ + memset (&ps, 0, sizeof ps); + continue; + } + /* Never convert ASCII NUL */ + if (*pmbs) + { + bytes = 1; + if (dst) + *ptr = L'\xf000' | *pmbs; + } memset (&ps, 0, sizeof ps); } + got_high_surrogate = NULL; if (bytes > 0) { + /* Check if we got the high surrogate from a UTF-8 4 byte sequence. + This is used above to handle an invalid 4 byte sequence cut short + at byte 3. */ + /* FIXME: do we need an equivalent check for gb18030? */ + if (bytes == 3 && ps.__count == 4 && f_mbtowc == __utf8_mbtowc) + got_high_surrogate = pmbs; + pmbs += bytes; nms -= bytes; ++count;