This is an automated email from the ASF dual-hosted git repository. xiaoxiang pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/incubator-nuttx.git
commit 673a2e0136c9d7f3d6137a125769b1dbea9796f0 Author: Jiuzhu Dong <dongjiuz...@xiaomi.com> AuthorDate: Sun May 29 16:27:52 2022 +0800 libc/wchar: support wchar Implemented according to https://en.wikipedia.org/wiki/UTF-8 Signed-off-by: Jiuzhu Dong <dongjiuz...@xiaomi.com> --- include/limits.h | 2 +- include/stdlib.h | 2 +- libs/libc/stdlib/lib_mbtowc.c | 9 +-- libs/libc/stdlib/lib_wctomb.c | 18 +---- libs/libc/wchar/Make.defs | 2 +- libs/libc/wchar/lib_mbrtowc.c | 140 +++++++++++++++++++++++++++++++++++---- libs/libc/wchar/lib_mbsinit.c | 42 ++++++++++++ libs/libc/wchar/lib_mbsnrtowcs.c | 56 +++++++++++++--- libs/libc/wchar/lib_wcrtomb.c | 37 ++++++++--- libs/libc/wchar/lib_wcsnrtombs.c | 74 +++++++++++---------- 10 files changed, 288 insertions(+), 94 deletions(-) diff --git a/include/limits.h b/include/limits.h index df8355b787..5b7a8adf24 100644 --- a/include/limits.h +++ b/include/limits.h @@ -60,7 +60,7 @@ * the correct value. */ -#define MB_LEN_MAX 1 +#define MB_LEN_MAX 4 /* Configurable limits required by POSIX **************************************** * diff --git a/include/stdlib.h b/include/stdlib.h index 255341f997..72724061b1 100644 --- a/include/stdlib.h +++ b/include/stdlib.h @@ -56,7 +56,7 @@ * character specified by the current locale. */ -#define MB_CUR_MAX 1 +#define MB_CUR_MAX 4 /* The environ variable, normally 'char **environ;' is not implemented as a * function call. However, get_environ_ptr() can be used in its place. diff --git a/libs/libc/stdlib/lib_mbtowc.c b/libs/libc/stdlib/lib_mbtowc.c index ff3ca71e01..3ea8de49c9 100644 --- a/libs/libc/stdlib/lib_mbtowc.c +++ b/libs/libc/stdlib/lib_mbtowc.c @@ -40,7 +40,7 @@ ****************************************************************************/ /**************************************************************************** - * Name: mbtowc.c + * Name: mbtowc * * Description: * Minimal multibyte to wide char converter @@ -59,10 +59,5 @@ int mbtowc(FAR wchar_t *pwc, FAR const char *s, size_t n) return -1; } - if (pwc) - { - *pwc = (wchar_t)*s; - } - - return (*s != '\0'); + return mbrtowc(pwc, s, n, NULL); } diff --git a/libs/libc/stdlib/lib_wctomb.c b/libs/libc/stdlib/lib_wctomb.c index 663c372e46..855ed1c904 100644 --- a/libs/libc/stdlib/lib_wctomb.c +++ b/libs/libc/stdlib/lib_wctomb.c @@ -34,10 +34,8 @@ * Included Files ****************************************************************************/ -#include <string.h> #include <stdlib.h> #include <wchar.h> -#include <errno.h> /**************************************************************************** * Public Functions @@ -53,19 +51,5 @@ int wctomb(FAR char *s, wchar_t wc) { - if (s == NULL) - { - return 0; - } - - /* Verify that wchar is a valid single-byte character. */ - - if ((size_t) wc >= 0x100) - { - set_errno(EILSEQ); - return -1; - } - - *s = (char)wc; - return 1; + return wcrtomb(s, wc, NULL); } diff --git a/libs/libc/wchar/Make.defs b/libs/libc/wchar/Make.defs index 6673033513..ece1358b87 100644 --- a/libs/libc/wchar/Make.defs +++ b/libs/libc/wchar/Make.defs @@ -25,7 +25,7 @@ CSRCS += lib_wmemmove.c lib_wmemset.c lib_btowc.c lib_mbrtowc.c lib_wctob.c CSRCS += lib_wcslcpy.c lib_wcsxfrm.c lib_wcrtomb.c lib_wcsftime.c CSRCS += lib_wcscoll.c lib_wcstol.c lib_wcstoll.c lib_wcstoul.c CSRCS += lib_wcstoull.c lib_wcstold.c lib_wcstof.c lib_wcstod.c -CSRCS += lib_swprintf.c lib_mbsnrtowcs.c lib_wcsnrtombs.c +CSRCS += lib_swprintf.c lib_mbsnrtowcs.c lib_wcsnrtombs.c lib_mbsinit.c CSRCS += lib_mbrlen.c lib_mbsrtowcs.c lib_wcsrtombs.c # Add the wchar directory to the build diff --git a/libs/libc/wchar/lib_mbrtowc.c b/libs/libc/wchar/lib_mbrtowc.c index c8eb92303a..2f2f79ade8 100644 --- a/libs/libc/wchar/lib_mbrtowc.c +++ b/libs/libc/wchar/lib_mbrtowc.c @@ -32,12 +32,62 @@ * Included Files ****************************************************************************/ -#include <stdlib.h> -#include <stdio.h> #include <errno.h> -#include <string.h> #include <wchar.h> +/**************************************************************************** + * Pre-processor Definitions + ****************************************************************************/ + +/* Implemented according to https://en.wikipedia.org/wiki/UTF-8 */ + +#define SA 0xc2u +#define SB 0xf4u + +/* Upper 6 state bits are a negative integer offset to bound-check next byte + * equivalent to: (((b) - 0x80) | ((b) + offset)) & ~0x3f + */ + +#define OOB(c, b) (((((b) >> 3) - 0x10) | \ + (((b) >> 3) + ((int32_t)(c) >> 26))) & ~7) + +/* Interval [a,b). Either a must be 80 or b must be c0, lower 3 bits clear. */ + +#define R(a, b) ((uint32_t)((uint32_t)((a) == 0x80 ? 0x40u - (b) : \ + 0u - (a)) << 23)) + +#define C(x) ((x) < 2 ? -1 : (R(0x80, 0xc0) | (x))) +#define D(x) C((x) + 16) +#define E(x) (((x) == 0 ? R(0xa0, 0xc0) : \ + (x) == 0xd ? R(0x80, 0xa0) : R(0x80, 0xc0)) \ + | (R(0x80, 0xc0) >> 6) \ + | (x)) +#define F(x) (((x) >= 5 ? 0 : \ + (x) == 0 ? R(0x90, 0xc0) : \ + (x) == 4 ? R(0x80, 0x90) : R(0x80, 0xc0)) \ + | (R(0x80, 0xc0) >> 6) \ + | (R(0x80, 0xc0) >> 12) \ + | (x)) + +/**************************************************************************** + * Private Data + ****************************************************************************/ + +/* This definition of g_bittab refer to link: + * https://en.wikipedia.org/wiki/UTF-8 [Codepage layout]. + */ + +static const uint32_t g_bittab[] = +{ + C(0x2), C(0x3), C(0x4), C(0x5), C(0x6), C(0x7), + C(0x8), C(0x9), C(0xa), C(0xb), C(0xc), C(0xd), C(0xe), C(0xf), + D(0x0), D(0x1), D(0x2), D(0x3), D(0x4), D(0x5), D(0x6), D(0x7), + D(0x8), D(0x9), D(0xa), D(0xb), D(0xc), D(0xd), D(0xe), D(0xf), + E(0x0), E(0x1), E(0x2), E(0x3), E(0x4), E(0x5), E(0x6), E(0x7), + E(0x8), E(0x9), E(0xa), E(0xb), E(0xc), E(0xd), E(0xe), E(0xf), + F(0x0), F(0x1), F(0x2), F(0x3), F(0x4) +}; + /**************************************************************************** * Public Functions ****************************************************************************/ @@ -53,20 +103,86 @@ size_t mbrtowc(FAR wchar_t *pwc, FAR const char *s, size_t n, FAR mbstate_t *ps) { - FAR const char *e = s; - size_t retval = 0; + FAR const unsigned char *src = (FAR const void *)s; + static mbstate_t state; + size_t num = n; + wchar_t dummy; + uint32_t c; + + if (ps == NULL) + { + ps = &state; + } - if (s == NULL) + c = *(FAR uint32_t *)ps; + if (src == NULL) + { + if (c != 0) + { + goto ilseq; + } + + return 0; + } + else if (pwc == NULL) { - s = e = ""; - n = 1; + pwc = &dummy; } - retval = mbsnrtowcs(pwc, &e, 1, n, ps); - if (retval == 1) + if (n == 0) { - retval = e - s; + return -2; } - return retval; + if (c == 0) + { + if (*src < 0x80) + { + return !!(*pwc = *src); + } + + if (*src - SA > SB - SA) + { + goto ilseq; + } + + c = g_bittab[*src++ - SA]; + n--; + } + + if (n != 0) + { + if (OOB(c, *src) != 0) + { + goto ilseq; + } + +loop: + c = (c << 6) | (*src++ - 0x80); + n--; + if ((c >> 31) == 0) + { + *(FAR uint32_t *)ps = 0; + *pwc = c; + return num - n; + } + + if (n != 0) + { + if (*src - 0x80u >= 0x40) + { + goto ilseq; + } + + goto loop; + } + } + + *(FAR uint32_t *)ps = c; + return -2; + +ilseq: + *(FAR uint32_t *)ps = 0; + set_errno(EILSEQ); + return -1; } diff --git a/libs/libc/wchar/lib_mbsinit.c b/libs/libc/wchar/lib_mbsinit.c new file mode 100644 index 0000000000..39288edb9c --- /dev/null +++ b/libs/libc/wchar/lib_mbsinit.c @@ -0,0 +1,42 @@ +/**************************************************************************** + * libs/libc/wchar/lib_mbsinit.c + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. The + * ASF licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the + * License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + ****************************************************************************/ + +/**************************************************************************** + * Included Files + ****************************************************************************/ + +#include <wchar.h> + +/**************************************************************************** + * Public Functions + ****************************************************************************/ + +/**************************************************************************** + * Name: mbsinit + * + * Description: + * test for initial shift state + * + ****************************************************************************/ + +int mbsinit(FAR const mbstate_t *st) +{ + return st == NULL || !*(FAR uint32_t *)st; +} diff --git a/libs/libc/wchar/lib_mbsnrtowcs.c b/libs/libc/wchar/lib_mbsnrtowcs.c index 3899cade3f..c81356f261 100644 --- a/libs/libc/wchar/lib_mbsnrtowcs.c +++ b/libs/libc/wchar/lib_mbsnrtowcs.c @@ -22,8 +22,6 @@ * Included Files ****************************************************************************/ -#include <sys/types.h> -#include <string.h> #include <wchar.h> /**************************************************************************** @@ -68,23 +66,59 @@ size_t mbsnrtowcs(FAR wchar_t *dst, FAR const char **src, size_t nms, size_t len, FAR mbstate_t *ps) { - size_t i; + FAR const char *s = *src; + FAR wchar_t *ws = dst; + size_t cnt = 0; + size_t l; if (dst == NULL) { - return strnlen(*src, nms); + len = SIZE_MAX; } - for (i = 0; i < nms && i < len; i++) + if (s != NULL) { - dst[i] = (wchar_t)(*src)[i]; - if (dst[i] == L'\0') + while (len > 0 && nms > 0) { - *src = NULL; - return i; + l = mbrtowc(ws, s, nms, ps); + if ((ssize_t)l <= 0) + { + if ((ssize_t)l == -2) + { + /* if the input buffer ends with an incomplete character + * stops at the end of the input buffer. + */ + + s += nms; + } + else if (l == 0) + { + s = NULL; + } + else + { + cnt = l; + } + + break; + } + + s += l; + nms -= l; + if (ws != NULL) + { + ws++; + } + + len--; + cnt++; } } - *src += i; - return i; + if (dst != NULL) + { + *src = s; + } + + return cnt; } diff --git a/libs/libc/wchar/lib_wcrtomb.c b/libs/libc/wchar/lib_wcrtomb.c index 061b80150b..e2a0a575e6 100644 --- a/libs/libc/wchar/lib_wcrtomb.c +++ b/libs/libc/wchar/lib_wcrtomb.c @@ -34,10 +34,7 @@ * Included Files ****************************************************************************/ -#include <string.h> #include <wchar.h> -#include <stdlib.h> -#include <stdio.h> #include <errno.h> /**************************************************************************** @@ -54,17 +51,37 @@ size_t wcrtomb(FAR char *s, wchar_t wc, FAR mbstate_t *ps) { - int retval = 0; - char buf[MB_LEN_MAX]; - if (s == NULL) { - retval = wctomb(buf, wc); + return 0; + } + else if ((unsigned)wc < 0x80) + { + *s = wc; + return 1; + } + else if ((unsigned)wc < 0x800) + { + *s++ = 0xc0 | (wc >> 6); + *s = 0x80 | (wc & 0x3f); + return 2; + } + else if ((unsigned)wc < 0xd800 || (unsigned)wc <= 0xffff) + { + *s++ = 0xe0 | (wc >> 12); + *s++ = 0x80 | ((wc >> 6) & 0x3f); + *s = 0x80 | (wc & 0x3f); + return 3; } - else + else if ((unsigned long)wc < 0x110000) { - retval = wctomb(s, wc); + *s++ = 0xf0 | ((unsigned long)wc >> 18); + *s++ = 0x80 | ((wc >> 12) & 0x3f); + *s++ = 0x80 | ((wc >> 6) & 0x3f); + *s = 0x80 | (wc & 0x3f); + return 4; } - return retval; + set_errno(EILSEQ); + return -1; } diff --git a/libs/libc/wchar/lib_wcsnrtombs.c b/libs/libc/wchar/lib_wcsnrtombs.c index 1b959767c8..a0f8c4aafa 100644 --- a/libs/libc/wchar/lib_wcsnrtombs.c +++ b/libs/libc/wchar/lib_wcsnrtombs.c @@ -22,12 +22,9 @@ * Included Files ****************************************************************************/ -#include <sys/types.h> #include <wchar.h> - -#include <stdlib.h> -#include <stdio.h> -#include <errno.h> +#include <string.h> +#include <limits.h> /**************************************************************************** * Public Functions @@ -70,48 +67,57 @@ size_t wcsnrtombs(FAR char *dst, FAR const wchar_t **src, size_t nwc, size_t len, FAR mbstate_t *ps) { - size_t i; + FAR const wchar_t *ws = *src; + size_t cnt = 0; if (dst == NULL) { - for (i = 0; i < nwc; i++) + len = 0; + } + + while (ws != NULL && nwc != 0) + { + char tmp[MB_LEN_MAX]; + size_t res; + + if (*ws == 0) { - wchar_t wc = (*src)[i]; + ws = NULL; + break; + } - if (wc < 0 || wc > 0xff) - { - set_errno(EILSEQ); - return -1; - } + res = wcrtomb(len < MB_LEN_MAX ? tmp : dst, *ws, ps); + if ((ssize_t)res < 0) + { + cnt = res; + break; + } - if (wc == L'\0') + if (dst != NULL) + { + if (len < MB_LEN_MAX) { - return i; + if (res > len) + { + break; + } + + memcpy(dst, tmp, res); } + + dst += res; + len -= res; } - return i; + ws++; + nwc--; + cnt += res; } - for (i = 0; i < nwc && i < len; i++) + if (dst != NULL) { - wchar_t wc = (*src)[i]; - - if (wc < 0 || wc > 0xff) - { - *src += i; - set_errno(EILSEQ); - return -1; - } - - dst[i] = wc; - if (wc == L'\0') - { - *src = NULL; - return i; - } + *src = ws; } - *src += i; - return i; + return cnt; }