Android < 5.0 had only dummy locales. Starting with Android 5.0 (according to the Android libc's git history), Android has real locales. However, two problems remain:
1) The default locale (i.e. the locale in use when setlocale was not called) is the "C.UTF-8" locale, not the "C" locale. Test case: ================================================================================ #include <stdio.h> #include <stdlib.h> #include <locale.h> int main () { printf ("Locale=|%s| LC_CTYPE=|%s| MB_CUR_MAX=%d\n", setlocale (LC_ALL, NULL), setlocale (LC_CTYPE, NULL), (int) MB_CUR_MAX); } ================================================================================ prints Locale=|C.UTF-8| LC_CTYPE=|C.UTF-8| MB_CUR_MAX=4 rather than the expected Locale=|C| LC_CTYPE=|C| MB_CUR_MAX=1 POSIX <https://pubs.opengroup.org/onlinepubs/9699919799.2018edition/basedefs/V1_chap07.html#tag_07_02> says that the default locale should be the "C"/"POSIX" locale. 2) A setlocale call that is meant to set the "C" or "POSIX" locale actually sets a locale with UTF-8 encoding. Test case 1: ================================================================================ #include <stdio.h> #include <stdlib.h> #include <string.h> #include <locale.h> #include <wchar.h> int main () { mbstate_t state; if (setlocale (LC_ALL, "") == NULL) return 1; memset (&state, '\0', sizeof (state)); printf ("Locale=|%s| LC_CTYPE=|%s| MB_CUR_MAX=%d mbrtowc(0xC0)=%d\n", setlocale (LC_ALL, NULL), setlocale (LC_CTYPE, NULL), (int) MB_CUR_MAX, (int) mbrtowc (NULL, "\xC0", 1, &state)); } ================================================================================ $ LC_ALL=C ./a.out and $ LC_ALL=POSIX ./a.out print Locale=|C.UTF-8| LC_CTYPE=|C.UTF-8| MB_CUR_MAX=4 mbrtowc(0xC0)=-2 rather than the expected Locale=|C| LC_CTYPE=|C| MB_CUR_MAX=1 mbrtowc(0xC0)=-1 Test case 2: ================================================================================ #include <stdio.h> #include <stdlib.h> #include <string.h> #include <locale.h> #include <wchar.h> int main () { mbstate_t state; if (setlocale (LC_ALL, "C") == NULL) return 1; memset (&state, '\0', sizeof (state)); printf ("Locale=|%s| 
LC_CTYPE=|%s| MB_CUR_MAX=%d mbrtowc(0xC0)=%d\n", setlocale (LC_ALL, NULL), setlocale (LC_CTYPE, NULL), (int) MB_CUR_MAX, (int) mbrtowc (NULL, "\xC0", 1, &state)); if (setlocale (LC_ALL, "POSIX") == NULL) return 1; memset (&state, '\0', sizeof (state)); printf ("Locale=|%s| LC_CTYPE=|%s| MB_CUR_MAX=%d mbrtowc(0xC0)=%d\n", setlocale (LC_ALL, NULL), setlocale (LC_CTYPE, NULL), (int) MB_CUR_MAX, (int) mbrtowc (NULL, "\xC0", 1, &state)); } ================================================================================ prints Locale=|C| LC_CTYPE=|C| MB_CUR_MAX=4 mbrtowc(0xC0)=-2 Locale=|C| LC_CTYPE=|C| MB_CUR_MAX=4 mbrtowc(0xC0)=-2 rather than the expected Locale=|C| LC_CTYPE=|C| MB_CUR_MAX=1 mbrtowc(0xC0)=-1 Locale=|C| LC_CTYPE=|C| MB_CUR_MAX=1 mbrtowc(0xC0)=-1 One consequence is the following two test failures: FAIL: test-mbrtoc32-5.sh ======================== ../../gltests/test-mbrtoc32.c:105: assertion 'ret == 1' failed Aborted FAIL test-mbrtoc32-5.sh (exit status: 134) FAIL: test-mbrtowc5.sh ====================== ../../gltests/test-mbrtowc.c:105: assertion 'ret == 1' failed Aborted FAIL test-mbrtowc5.sh (exit status: 134) As a workaround, I'm applying these two patches. 2023-01-16 Bruno Haible <br...@clisp.org> mbrtowc, mbrtoc32 tests: Avoid test failure on Android ≥ 5.0. * tests/test-mbrtowc.c (main): On Android 5.0 or newer, when testing the "C" locale, verify that the encoding is UTF-8. * tests/test-mbrtoc32.c (main): Likewise. * doc/posix-functions/setlocale.texi: Mention the Android problems. mbrtowc, mbrtoc32 tests: Refactor. * tests/test-mbrtowc.c (main): Straighten convoluted code. * tests/test-mbrtoc32.c (main): Likewise.
>From 1ca5866371acd6b4bdcb1913d18cc14b7a8528c1 Mon Sep 17 00:00:00 2001 From: Bruno Haible <br...@clisp.org> Date: Mon, 16 Jan 2023 14:30:06 +0100 Subject: [PATCH 1/2] mbrtowc, mbrtoc32 tests: Refactor. * tests/test-mbrtowc.c (main): Straighten convoluted code. * tests/test-mbrtoc32.c (main): Likewise. --- ChangeLog | 6 +++++ tests/test-mbrtoc32.c | 54 ++++++++++++++++++++++++++++++------------- tests/test-mbrtowc.c | 54 ++++++++++++++++++++++++++++++------------- 3 files changed, 82 insertions(+), 32 deletions(-) diff --git a/ChangeLog b/ChangeLog index 9bc953423f..045e1c6247 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +2023-01-16 Bruno Haible <br...@clisp.org> + + mbrtowc, mbrtoc32 tests: Refactor. + * tests/test-mbrtowc.c (main): Straighten convoluted code. + * tests/test-mbrtoc32.c (main): Likewise. + 2023-01-16 Paul Eggert <egg...@cs.ucla.edu> sigpipe tests: Modernize use of 'head'. diff --git a/tests/test-mbrtoc32.c b/tests/test-mbrtoc32.c index c8f735d520..36b520f7b8 100644 --- a/tests/test-mbrtoc32.c +++ b/tests/test-mbrtoc32.c @@ -72,10 +72,6 @@ main (int argc, char *argv[]) for (c = 0; c < 0x100; c++) switch (c) { - default: - if (! (c && 1 < argc && argv[1][0] == '5')) - break; - FALLTHROUGH; case '\t': case '\v': case '\f': case ' ': case '!': case '"': case '#': case '%': case '&': case '\'': case '(': case ')': case '*': @@ -97,25 +93,23 @@ main (int argc, char *argv[]) case 'p': case 'q': case 'r': case 's': case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z': case '{': case '|': case '}': case '~': - /* c is in the ISO C "basic character set", or argv[1] starts - with '5' so we are testing all nonnull bytes. */ + /* c is in the ISO C "basic character set". */ + ASSERT (c < 0x80); + /* c is an ASCII character. */ buf[0] = c; + wc = (char32_t) 0xBADFACE; ret = mbrtoc32 (&wc, buf, 1, &state); ASSERT (ret == 1); - if (c < 0x80) - /* c is an ASCII character. 
*/ - ASSERT (wc == c); - else - /* argv[1] starts with '5', that is, we are testing the C or POSIX - locale. - On most platforms, the bytes 0x80..0xFF map to U+0080..U+00FF. - But on musl libc, the bytes 0x80..0xFF map to U+DF80..U+DFFF. */ - ASSERT (wc == (btowc (c) == 0xDF00 + c ? btowc (c) : c)); + ASSERT (wc == c); ASSERT (mbsinit (&state)); + ret = mbrtoc32 (NULL, buf, 1, &state); ASSERT (ret == 1); ASSERT (mbsinit (&state)); + + break; + default: break; } } @@ -368,7 +362,35 @@ main (int argc, char *argv[]) return 0; case '5': - /* C locale; tested above. */ + /* C or POSIX locale. */ + { + int c; + char buf[1]; + + memset (&state, '\0', sizeof (mbstate_t)); + for (c = 0; c < 0x100; c++) + if (c != 0) + { + /* We are testing all nonnull bytes. */ + buf[0] = c; + + wc = (char32_t) 0xBADFACE; + ret = mbrtoc32 (&wc, buf, 1, &state); + ASSERT (ret == 1); + if (c < 0x80) + /* c is an ASCII character. */ + ASSERT (wc == c); + else + /* On most platforms, the bytes 0x80..0xFF map to U+0080..U+00FF. + But on musl libc, the bytes 0x80..0xFF map to U+DF80..U+DFFF. */ + ASSERT (wc == (btowc (c) == 0xDF00 + c ? btowc (c) : c)); + ASSERT (mbsinit (&state)); + + ret = mbrtoc32 (NULL, buf, 1, &state); + ASSERT (ret == 1); + ASSERT (mbsinit (&state)); + } + } return 0; } diff --git a/tests/test-mbrtowc.c b/tests/test-mbrtowc.c index 9019ea0e71..b358d8d583 100644 --- a/tests/test-mbrtowc.c +++ b/tests/test-mbrtowc.c @@ -72,10 +72,6 @@ main (int argc, char *argv[]) for (c = 0; c < 0x100; c++) switch (c) { - default: - if (! 
(c && 1 < argc && argv[1][0] == '5')) - break; - FALLTHROUGH; case '\t': case '\v': case '\f': case ' ': case '!': case '"': case '#': case '%': case '&': case '\'': case '(': case ')': case '*': @@ -97,25 +93,23 @@ main (int argc, char *argv[]) case 'p': case 'q': case 'r': case 's': case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z': case '{': case '|': case '}': case '~': - /* c is in the ISO C "basic character set", or argv[1] starts - with '5' so we are testing all nonnull bytes. */ + /* c is in the ISO C "basic character set". */ + ASSERT (c < 0x80); + /* c is an ASCII character. */ buf[0] = c; + wc = (wchar_t) 0xBADFACE; ret = mbrtowc (&wc, buf, 1, &state); ASSERT (ret == 1); - if (c < 0x80) - /* c is an ASCII character. */ - ASSERT (wc == c); - else - /* argv[1] starts with '5', that is, we are testing the C or POSIX - locale. - On most platforms, the bytes 0x80..0xFF map to U+0080..U+00FF. - But on musl libc, the bytes 0x80..0xFF map to U+DF80..U+DFFF. */ - ASSERT (wc == (btowc (c) == 0xDF00 + c ? btowc (c) : c)); + ASSERT (wc == c); ASSERT (mbsinit (&state)); + ret = mbrtowc (NULL, buf, 1, &state); ASSERT (ret == 1); ASSERT (mbsinit (&state)); + + break; + default: break; } } @@ -349,7 +343,35 @@ main (int argc, char *argv[]) return 0; case '5': - /* C locale; tested above. */ + /* C or POSIX locale. */ + { + int c; + char buf[1]; + + memset (&state, '\0', sizeof (mbstate_t)); + for (c = 0; c < 0x100; c++) + if (c != 0) + { + /* We are testing all nonnull bytes. */ + buf[0] = c; + + wc = (wchar_t) 0xBADFACE; + ret = mbrtowc (&wc, buf, 1, &state); + ASSERT (ret == 1); + if (c < 0x80) + /* c is an ASCII character. */ + ASSERT (wc == c); + else + /* On most platforms, the bytes 0x80..0xFF map to U+0080..U+00FF. + But on musl libc, the bytes 0x80..0xFF map to U+DF80..U+DFFF. */ + ASSERT (wc == (btowc (c) == 0xDF00 + c ? 
btowc (c) : c)); + ASSERT (mbsinit (&state)); + + ret = mbrtowc (NULL, buf, 1, &state); + ASSERT (ret == 1); + ASSERT (mbsinit (&state)); + } + } return 0; } -- 2.34.1
From 653bc7d23e08ab61ee2382f8773f0a95d93ab871 Mon Sep 17 00:00:00 2001 From: Bruno Haible <br...@clisp.org> Date: Mon, 16 Jan 2023 14:34:56 +0100 Subject: [PATCH 2/2] =?UTF-8?q?mbrtowc,=20mbrtoc32=20tests:=20Avoid=20test?= =?UTF-8?q?=20failure=20on=20Android=20=E2=89=A5=205.0.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * tests/test-mbrtowc.c (main): On Android 5.0 or newer, when testing the "C" locale, verify that the encoding is UTF-8. * tests/test-mbrtoc32.c (main): Likewise. * doc/posix-functions/setlocale.texi: Mention the Android problems. --- ChangeLog | 6 ++++++ doc/posix-functions/setlocale.texi | 8 +++++++- tests/test-mbrtoc32.c | 10 ++++++++++ tests/test-mbrtowc.c | 10 ++++++++++ 4 files changed, 33 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index 045e1c6247..0051e3237f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,11 @@ 2023-01-16 Bruno Haible <br...@clisp.org> + mbrtowc, mbrtoc32 tests: Avoid test failure on Android ≥ 5.0. + * tests/test-mbrtowc.c (main): On Android 5.0 or newer, when testing + the "C" locale, verify that the encoding is UTF-8. + * tests/test-mbrtoc32.c (main): Likewise. + * doc/posix-functions/setlocale.texi: Mention the Android problems. + mbrtowc, mbrtoc32 tests: Refactor. * tests/test-mbrtowc.c (main): Straighten convoluted code. * tests/test-mbrtoc32.c (main): Likewise. diff --git a/doc/posix-functions/setlocale.texi b/doc/posix-functions/setlocale.texi index 11364d3901..6e232200f8 100644 --- a/doc/posix-functions/setlocale.texi +++ b/doc/posix-functions/setlocale.texi @@ -21,7 +21,7 @@ On Windows platforms (excluding Cygwin), @code{setlocale} understands different locale names, that are not based on ISO 639 language names and ISO 3166 country names. @item -On Android 4.3, which which doesn't have locales, the @code{setlocale} function +On Android < 5.0, which doesn't have locales, the @code{setlocale} function always fails. 
The replacement, however, supports only the locale names @code{"C"} and @code{"POSIX"}. @end itemize @@ -52,4 +52,10 @@ In addition any value is accepted for @code{LC_CTYPE}, and so NULL is never returned to indicate a failure to set locale. To verify category values, each category must be set individually with @code{setlocale(LC_COLLATE,"")} etc. +@item +On Android 5.0 and newer, the default locale (i.e.@: the locale in use when +@code{setlocale} was not called) is the @code{"C.UTF-8"} locale, not the +@code{"C"} locale. Additionally, a @code{setlocale} call that is meant to set +the @code{"C"} or @code{"POSIX"} locale actually sets an equivalent of the +@code{"C.UTF-8"} locale. @end itemize diff --git a/tests/test-mbrtoc32.c b/tests/test-mbrtoc32.c index 36b520f7b8..0d75c3db14 100644 --- a/tests/test-mbrtoc32.c +++ b/tests/test-mbrtoc32.c @@ -26,6 +26,7 @@ SIGNATURE_CHECK (mbrtoc32, size_t, #include <locale.h> #include <stdio.h> +#include <stdlib.h> #include <string.h> #include "macros.h" @@ -124,6 +125,15 @@ main (int argc, char *argv[]) ASSERT (mbsinit (&state)); } +#ifdef __ANDROID__ + /* On Android ≥ 5.0, the default locale is the "C.UTF-8" locale, not the + "C" locale. Furthermore, when you attempt to set the "C" or "POSIX" + locale via setlocale(), what you get is a "C" locale with UTF-8 encoding, + that is, effectively the "C.UTF-8" locale. 
*/ + if (argc > 1 && strcmp (argv[1], "5") == 0 && MB_CUR_MAX > 1) + argv[1] = "2"; +#endif + if (argc > 1) switch (argv[1][0]) { diff --git a/tests/test-mbrtowc.c b/tests/test-mbrtowc.c index b358d8d583..1fdf039c42 100644 --- a/tests/test-mbrtowc.c +++ b/tests/test-mbrtowc.c @@ -26,6 +26,7 @@ SIGNATURE_CHECK (mbrtowc, size_t, (wchar_t *, char const *, size_t, #include <locale.h> #include <stdio.h> +#include <stdlib.h> #include <string.h> #include "macros.h" @@ -124,6 +125,15 @@ main (int argc, char *argv[]) ASSERT (mbsinit (&state)); } +#ifdef __ANDROID__ + /* On Android ≥ 5.0, the default locale is the "C.UTF-8" locale, not the + "C" locale. Furthermore, when you attempt to set the "C" or "POSIX" + locale via setlocale(), what you get is a "C" locale with UTF-8 encoding, + that is, effectively the "C.UTF-8" locale. */ + if (argc > 1 && strcmp (argv[1], "5") == 0 && MB_CUR_MAX > 1) + argv[1] = "2"; +#endif + if (argc > 1) switch (argv[1][0]) { -- 2.34.1