[hackers] [libgrapheme] Encourage strict aliasing for library users (uint8_t * -> char *) || Laslo Hunhold

git Thu, 16 Dec 2021 15:47:54 -0800

commit b99a40eefc2ec1ad8714ed210a3aeedfb3283159
Author:     Laslo Hunhold <[email protected]>
AuthorDate: Fri Dec 17 00:34:27 2021 +0100
Commit:     Laslo Hunhold <[email protected]>
CommitDate: Fri Dec 17 00:34:27 2021 +0100


    Encourage strict aliasing for library users (uint8_t * -> char *)
    
    After a long-winded discussion, Michael Forney, who has a really
    deep understanding of the C specification, rightfully pointed out
    that using uint8_t * might look good on paper, but leads to subtle
    problems due to intricacies of the C99 specification.
    
    While you can alias any pointer to character types (char, unsigned char,
    signed char), uint8_t is not a character type and aliasing to it breaks
    the strict aliasing rule. This is not a problem in practice as gcc
    is the only big compiler enforcing strict aliasing and uint8_t is
    usually defined as unsigned char, inheriting the aliasing property for
    technical reasons, but strictly speaking uint8_t is not a character
    type.
    
    With uint8_t * in the API, library users would've been forced to cast
    any input-string to uint8_t *, breaking the strict aliasing rule. A
    lot of code relies on this or conveniently disables strict aliasing
    through compiler flags, but using char-arrays is the only really
    portable and safe way to work with it.
    The fact that char is usually 8 bits wide and strongly indicates
    that we're dealing with a string is one strong point for using
    char *; another is that C11 introduced UTF-8 string literals of the
    form u8"..." which are of type char[]. In this sense, using char *
    ensures some form of forward-compatibility and fits nicely within
    the spec that's slowly converging towards UTF-8.
    
    Signed-off-by: Laslo Hunhold <[email protected]>

diff --git a/grapheme.h b/grapheme.h
index bd5244b..3294c8e 100644
--- a/grapheme.h
+++ b/grapheme.h
@@ -19,11 +19,11 @@ typedef struct lg_internal_segmentation_state {
 
 #define LG_CODEPOINT_INVALID UINT32_C(0xFFFD)
 
-size_t lg_grapheme_nextbreak(const uint8_t *);
+size_t lg_grapheme_nextbreak(const char *);
 
 bool lg_grapheme_isbreak(uint_least32_t, uint_least32_t, LG_SEGMENTATION_STATE *);
 
-size_t lg_utf8_decode(const uint8_t *, size_t, uint_least32_t *);
-size_t lg_utf8_encode(uint_least32_t, uint8_t *, size_t);
+size_t lg_utf8_decode(const char *, size_t, uint_least32_t *);
+size_t lg_utf8_encode(uint_least32_t, char *, size_t);
 
 #endif /* GRAPHEME_H */
diff --git a/man/lg_grapheme_nextbreak.3 b/man/lg_grapheme_nextbreak.3
index 795e1b4..ff78395 100644
--- a/man/lg_grapheme_nextbreak.3
+++ b/man/lg_grapheme_nextbreak.3
@@ -7,7 +7,7 @@
 .Sh SYNOPSIS
 .In grapheme.h
 .Ft size_t
-.Fn lg_grapheme_nextbreak "const uint8_t *str"
+.Fn lg_grapheme_nextbreak "const char *str"
 .Sh DESCRIPTION
 .Fn lg_grapheme_nextbreak
 computes the offset (in bytes) to the next grapheme
@@ -52,7 +52,7 @@ main(void)
 
        /* print each grapheme cluster with byte-length */
        for (; *s != '\\0';) {
-               len = lg_grapheme_nextbreak((uint8_t *)s);
+               len = lg_grapheme_nextbreak(s);
                printf("%2zu bytes | %.*s\\n", len, (int)len, s, len);
                s += len;
        }
diff --git a/src/grapheme.c b/src/grapheme.c
index 56993af..78d0993 100644
--- a/src/grapheme.c
+++ b/src/grapheme.c
@@ -179,7 +179,7 @@ hasbreak:
 }
 
 size_t
-lg_grapheme_nextbreak(const uint8_t *str)
+lg_grapheme_nextbreak(const char *str)
 {
        uint_least32_t cp0, cp1;
        size_t ret, len = 0;
diff --git a/src/utf8.c b/src/utf8.c
index b21c920..327deea 100644
--- a/src/utf8.c
+++ b/src/utf8.c
@@ -48,7 +48,7 @@ static const struct {
 };
 
 size_t
-lg_utf8_decode(const uint8_t *s, size_t n, uint_least32_t *cp)
+lg_utf8_decode(const char *s, size_t n, uint_least32_t *cp)
 {
        size_t off, i;
 
@@ -60,13 +60,14 @@ lg_utf8_decode(const uint8_t *s, size_t n, uint_least32_t *cp)
 
        /* identify sequence type with the first byte */
        for (off = 0; off < LEN(lut); off++) {
-               if (BETWEEN(s[0], lut[off].lower, lut[off].upper)) {
+               if (BETWEEN(((unsigned char *)s)[0], lut[off].lower,
+                           lut[off].upper)) {
                        /*
                         * first byte is within the bounds; fill
                         * p with the the first bits contained in
                         * the first byte (by subtracting the high bits)
                         */
-                       *cp = s[0] - lut[off].lower;
+                       *cp = ((unsigned char *)s)[0] - lut[off].lower;
                        break;
                }
        }
@@ -74,6 +75,9 @@ lg_utf8_decode(const uint8_t *s, size_t n, uint_least32_t *cp)
                /*
                 * first byte does not match a sequence type;
                 * set cp as invalid and return 1 byte processed
+                *
+                * this also includes the cases where bits higher than
+                * the 8th are set on systems with CHAR_BIT > 8
                 */
                *cp = LG_CODEPOINT_INVALID;
                return 1;
@@ -92,12 +96,16 @@ lg_utf8_decode(const uint8_t *s, size_t n, uint_least32_t *cp)
         * (i.e. between 0x80 (10000000) and 0xBF (10111111))
         */
        for (i = 1; i <= off; i++) {
-               if(!BETWEEN(s[i], 0x80, 0xBF)) {
+               if(!BETWEEN(((unsigned char *)s)[i], 0x80, 0xBF)) {
                        /*
                         * byte does not match format; return
                         * number of bytes processed excluding the
                         * unexpected character as recommended since
                         * Unicode 6 (chapter 3)
+                        *
+                        * this also includes the cases where bits
+                        * higher than the 8th are set on systems
+                        * with CHAR_BIT > 8
                         */
                        *cp = LG_CODEPOINT_INVALID;
                        return 1 + (i - 1);
@@ -106,7 +114,7 @@ lg_utf8_decode(const uint8_t *s, size_t n, uint_least32_t *cp)
                 * shift code point by 6 bits and add the 6 stored bits
                 * in s[i] to it using the bitmask 0x3F (00111111)
                 */
-               *cp = (*cp << 6) | (s[i] & 0x3F);
+               *cp = (*cp << 6) | (((unsigned char *)s)[i] & 0x3F);
        }
 
        if (*cp < lut[off].mincp ||
@@ -125,7 +133,7 @@ lg_utf8_decode(const uint8_t *s, size_t n, uint_least32_t *cp)
 }
 
 size_t
-lg_utf8_encode(uint_least32_t cp, uint8_t *s, size_t n)
+lg_utf8_encode(uint_least32_t cp, char *s, size_t n)
 {
        size_t off, i;
 
@@ -165,7 +173,7 @@ lg_utf8_encode(uint_least32_t cp, uint8_t *s, size_t n)
         * We do not overwrite the mask because we guaranteed earlier
         * that there are no bits higher than the mask allows.
         */
-       s[0] = lut[off].lower | (uint8_t)(cp >> (6 * off));
+       ((unsigned char *)s)[0] = lut[off].lower | (uint8_t)(cp >> (6 * off));
 
        for (i = 1; i <= off; i++) {
                /*
@@ -174,7 +182,8 @@ lg_utf8_encode(uint_least32_t cp, uint8_t *s, size_t n)
                 * extract from the properly-shifted value using the
                 * mask 00111111 (0x3F)
                 */
-               s[i] = 0x80 | ((cp >> (6 * (off - i))) & 0x3F);
+               ((unsigned char *)s)[i] = 0x80 |
+                                         ((cp >> (6 * (off - i))) & 0x3F);
        }
 
        return 1 + off;
diff --git a/test/utf8-decode.c b/test/utf8-decode.c
index 0fd6f77..b4dc7f2 100644
--- a/test/utf8-decode.c
+++ b/test/utf8-decode.c
@@ -8,7 +8,7 @@
 #include "util.h"
 
 static const struct {
-       uint8_t       *arr;     /* UTF-8 byte sequence */
+       char          *arr;     /* UTF-8 byte sequence */
        size_t         len;     /* length of UTF-8 byte sequence */
        size_t         exp_len; /* expected length returned */
        uint_least32_t exp_cp;  /* expected code point returned */
@@ -28,7 +28,7 @@ static const struct {
                 * [ 11111101 ] ->
                 * INVALID
                 */
-               .arr     = (uint8_t[]){ 0xFD },
+               .arr     = (char *)(unsigned char[]){ 0xFD },
                .len     = 1,
                .exp_len = 1,
                .exp_cp  = LG_CODEPOINT_INVALID,
@@ -38,7 +38,7 @@ static const struct {
                 * [ 00000001 ] ->
                 * 0000001
                 */
-               .arr     = (uint8_t[]){ 0x01 },
+               .arr     = (char *)(unsigned char[]){ 0x01 },
                .len     = 1,
                .exp_len = 1,
                .exp_cp  = 0x1,
@@ -48,7 +48,7 @@ static const struct {
                 * [ 11000011 10111111 ] ->
                 * 00011111111
                 */
-               .arr     = (uint8_t[]){ 0xC3, 0xBF },
+               .arr     = (char *)(unsigned char[]){ 0xC3, 0xBF },
                .len     = 2,
                .exp_len = 2,
                .exp_cp  = 0xFF,
@@ -58,7 +58,7 @@ static const struct {
                 * [ 11000011 ] ->
                 * INVALID
                 */
-               .arr     = (uint8_t[]){ 0xC3 },
+               .arr     = (char *)(unsigned char[]){ 0xC3 },
                .len     = 1,
                .exp_len = 2,
                .exp_cp  = LG_CODEPOINT_INVALID,
@@ -68,7 +68,7 @@ static const struct {
                 * [ 11000011 11111111 ] ->
                 * INVALID
                 */
-               .arr     = (uint8_t[]){ 0xC3, 0xFF },
+               .arr     = (char *)(unsigned char[]){ 0xC3, 0xFF },
                .len     = 2,
                .exp_len = 1,
                .exp_cp  = LG_CODEPOINT_INVALID,
@@ -78,7 +78,7 @@ static const struct {
                 * [ 11000001 10111111 ] ->
                 * INVALID
                 */
-               .arr     = (uint8_t[]){ 0xC1, 0xBF },
+               .arr     = (char *)(unsigned char[]){ 0xC1, 0xBF },
                .len     = 2,
                .exp_len = 2,
                .exp_cp  = LG_CODEPOINT_INVALID,
@@ -88,7 +88,7 @@ static const struct {
                 * [ 11100000 10111111 10111111 ] ->
                 * 0000111111111111
                 */
-               .arr     = (uint8_t[]){ 0xE0, 0xBF, 0xBF },
+               .arr     = (char *)(unsigned char[]){ 0xE0, 0xBF, 0xBF },
                .len     = 3,
                .exp_len = 3,
                .exp_cp  = 0xFFF,
@@ -98,7 +98,7 @@ static const struct {
                 * [ 11100000 ] ->
                 * INVALID
                 */
-               .arr     = (uint8_t[]){ 0xE0 },
+               .arr     = (char *)(unsigned char[]){ 0xE0 },
                .len     = 1,
                .exp_len = 3,
                .exp_cp  = LG_CODEPOINT_INVALID,
@@ -108,7 +108,7 @@ static const struct {
                 * [ 11100000 01111111 10111111 ] ->
                 * INVALID
                 */
-               .arr     = (uint8_t[]){ 0xE0, 0x7F, 0xBF },
+               .arr     = (char *)(unsigned char[]){ 0xE0, 0x7F, 0xBF },
                .len     = 3,
                .exp_len = 1,
                .exp_cp  = LG_CODEPOINT_INVALID,
@@ -118,7 +118,7 @@ static const struct {
                 * [ 11100000 10111111 ] ->
                 * INVALID
                 */
-               .arr     = (uint8_t[]){ 0xE0, 0xBF },
+               .arr     = (char *)(unsigned char[]){ 0xE0, 0xBF },
                .len     = 2,
                .exp_len = 3,
                .exp_cp  = LG_CODEPOINT_INVALID,
@@ -128,7 +128,7 @@ static const struct {
                 * [ 11100000 10111111 01111111 ] ->
                 * INVALID
                 */
-               .arr     = (uint8_t[]){ 0xE0, 0xBF, 0x7F },
+               .arr     = (char *)(unsigned char[]){ 0xE0, 0xBF, 0x7F },
                .len     = 3,
                .exp_len = 2,
                .exp_cp  = LG_CODEPOINT_INVALID,
@@ -138,7 +138,7 @@ static const struct {
                 * [ 11100000 10011111 10111111 ] ->
                 * INVALID
                 */
-               .arr     = (uint8_t[]){ 0xE0, 0x9F, 0xBF },
+               .arr     = (char *)(unsigned char[]){ 0xE0, 0x9F, 0xBF },
                .len     = 3,
                .exp_len = 3,
                .exp_cp  = LG_CODEPOINT_INVALID,
@@ -148,7 +148,7 @@ static const struct {
                 * [ 11101101 10100000 10000000 ] ->
                 * INVALID
                 */
-               .arr     = (uint8_t[]){ 0xED, 0xA0, 0x80 },
+               .arr     = (char *)(unsigned char[]){ 0xED, 0xA0, 0x80 },
                .len     = 3,
                .exp_len = 3,
                .exp_cp  = LG_CODEPOINT_INVALID,
@@ -158,7 +158,7 @@ static const struct {
                 * [ 11110011 10111111 10111111 10111111 ] ->
                 * 011111111111111111111
                 */
-               .arr     = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF },
+               .arr     = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF, 0xBF },
                .len     = 4,
                .exp_len = 4,
                .exp_cp  = UINT32_C(0xFFFFF),
@@ -168,7 +168,7 @@ static const struct {
                 * [ 11110011 ] ->
                 * INVALID
                 */
-               .arr     = (uint8_t[]){ 0xF3 },
+               .arr     = (char *)(unsigned char[]){ 0xF3 },
                .len     = 1,
                .exp_len = 4,
                .exp_cp  = LG_CODEPOINT_INVALID,
@@ -178,7 +178,7 @@ static const struct {
                 * [ 11110011 01111111 10111111 10111111 ] ->
                 * INVALID
                 */
-               .arr     = (uint8_t[]){ 0xF3, 0x7F, 0xBF, 0xBF },
+               .arr     = (char *)(unsigned char[]){ 0xF3, 0x7F, 0xBF, 0xBF },
                .len     = 4,
                .exp_len = 1,
                .exp_cp  = LG_CODEPOINT_INVALID,
@@ -188,7 +188,7 @@ static const struct {
                 * [ 11110011 10111111 ] ->
                 * INVALID
                 */
-               .arr     = (uint8_t[]){ 0xF3, 0xBF },
+               .arr     = (char *)(unsigned char[]){ 0xF3, 0xBF },
                .len     = 2,
                .exp_len = 4,
                .exp_cp  = LG_CODEPOINT_INVALID,
@@ -198,7 +198,7 @@ static const struct {
                 * [ 11110011 10111111 01111111 10111111 ] ->
                 * INVALID
                 */
-               .arr     = (uint8_t[]){ 0xF3, 0xBF, 0x7F, 0xBF },
+               .arr     = (char *)(unsigned char[]){ 0xF3, 0xBF, 0x7F, 0xBF },
                .len     = 4,
                .exp_len = 2,
                .exp_cp  = LG_CODEPOINT_INVALID,
@@ -208,7 +208,7 @@ static const struct {
                 * [ 11110011 10111111 10111111 ] ->
                 * INVALID
                 */
-               .arr     = (uint8_t[]){ 0xF3, 0xBF, 0xBF },
+               .arr     = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF },
                .len     = 3,
                .exp_len = 4,
                .exp_cp  = LG_CODEPOINT_INVALID,
@@ -218,7 +218,7 @@ static const struct {
                 * [ 11110011 10111111 10111111 01111111 ] ->
                 * INVALID
                 */
-               .arr     = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0x7F },
+               .arr     = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF, 0x7F },
                .len     = 4,
                .exp_len = 3,
                .exp_cp  = LG_CODEPOINT_INVALID,
@@ -228,7 +228,7 @@ static const struct {
                 * [ 11110000 10000000 10000001 10111111 ] ->
                 * INVALID
                 */
-               .arr     = (uint8_t[]){ 0xF0, 0x80, 0x81, 0xBF },
+               .arr     = (char *)(unsigned char[]){ 0xF0, 0x80, 0x81, 0xBF },
                .len     = 4,
                .exp_len = 4,
                .exp_cp  = LG_CODEPOINT_INVALID,
@@ -238,7 +238,7 @@ static const struct {
                 * [ 11110100 10010000 10000000 10000000 ] ->
                 * INVALID
                 */
-               .arr     = (uint8_t[]){ 0xF4, 0x90, 0x80, 0x80 },
+               .arr     = (char *)(unsigned char[]){ 0xF4, 0x90, 0x80, 0x80 },
                .len     = 4,
                .exp_len = 4,
                .exp_cp  = LG_CODEPOINT_INVALID,
diff --git a/test/utf8-encode.c b/test/utf8-encode.c
index 99f5d48..9ebaccf 100644
--- a/test/utf8-encode.c
+++ b/test/utf8-encode.c
@@ -9,43 +9,43 @@
 
 static const struct {
        uint_least32_t cp;      /* input code point */
-       uint8_t       *exp_arr; /* expected UTF-8 byte sequence */
+       char          *exp_arr; /* expected UTF-8 byte sequence */
        size_t         exp_len; /* expected length of UTF-8 sequence */
 } enc_test[] = {
        {
                /* invalid code point (UTF-16 surrogate half) */
                .cp      = UINT32_C(0xD800),
-               .exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD },
+               .exp_arr = (char *)(unsigned char[]){ 0xEF, 0xBF, 0xBD },
                .exp_len = 3,
        },
        {
                /* invalid code point (UTF-16-unrepresentable) */
                .cp      = UINT32_C(0x110000),
-               .exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD },
+               .exp_arr = (char *)(unsigned char[]){ 0xEF, 0xBF, 0xBD },
                .exp_len = 3,
        },
        {
                /* code point encoded to a 1-byte sequence */
                .cp      = 0x01,
-               .exp_arr = (uint8_t[]){ 0x01 },
+               .exp_arr = (char *)(unsigned char[]){ 0x01 },
                .exp_len = 1,
        },
        {
                /* code point encoded to a 2-byte sequence */
                .cp      = 0xFF,
-               .exp_arr = (uint8_t[]){ 0xC3, 0xBF },
+               .exp_arr = (char *)(unsigned char[]){ 0xC3, 0xBF },
                .exp_len = 2,
        },
        {
                /* code point encoded to a 3-byte sequence */
                .cp      = 0xFFF,
-               .exp_arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF },
+               .exp_arr = (char *)(unsigned char[]){ 0xE0, 0xBF, 0xBF },
                .exp_len = 3,
        },
        {
                /* code point encoded to a 4-byte sequence */
                .cp      = UINT32_C(0xFFFFF),
-               .exp_arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF },
+               .exp_arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF, 0xBF },
                .exp_len = 4,
        },
 };
@@ -59,7 +59,7 @@ main(int argc, char *argv[])
 
        /* UTF-8 encoder test */
        for (i = 0, failed = 0; i < LEN(enc_test); i++) {
-               uint8_t arr[4];
+               char arr[4];
                size_t len;
 
                len = lg_utf8_encode(enc_test[i].cp, arr, LEN(arr));

[hackers] [libgrapheme] Encourage strict aliasing for library users (uint8_t * -> char *) || Laslo Hunhold

Reply via email to