[hackers] [libgrapheme] Add UTF-8-encode-function || Laslo Hunhold

git Sun, 31 May 2020 13:53:02 -0700

commit 21b6f66acc659e8c515d4685a11fa534a289af14
Author:     Laslo Hunhold <[email protected]>
AuthorDate: Sun May 31 22:44:06 2020 +0200
Commit:     Laslo Hunhold <[email protected]>
CommitDate: Sun May 31 22:44:06 2020 +0200


    Add UTF-8-encode-function
    
    For a library that merely detects grapheme clusters, the reason
    for adding an encoding function is not immediately apparent. The
    main motivation is that some decoding scenarios actually change
    the text representation (by identifying invalid code points and
    outputting them as such), so the user should have the chance to
    output a "processed" stream.
    
    A minor benefit with very little overhead is that this encoding
    function is just useful in general.
    
    Signed-off-by: Laslo Hunhold <[email protected]>

diff --git a/src/codepoint.c b/src/codepoint.c
index e20fdf5..976f922 100644
--- a/src/codepoint.c
+++ b/src/codepoint.c
@@ -9,7 +9,8 @@
 static const struct {
        uint8_t  lower; /* lower bound of sequence first byte */
        uint8_t  upper; /* upper bound of sequence first byte */
-       uint32_t mincp; /* smallest non-overlong encoded codepoint */
+       uint32_t mincp; /* smallest non-overlong encoded code point */
+       uint32_t maxcp; /* largest encodable code point */
        /*
         * implicit: table-offset represents the number of following
         * bytes of the form 10xxxxxx (6 bits capacity each)
@@ -20,24 +21,28 @@ static const struct {
                .lower = 0x00, /* 00000000 */
                .upper = 0x7F, /* 01111111 */
                .mincp = (uint32_t)0,
+               .maxcp = ((uint32_t)1 << 7) - 1, /* 7 bits capacity */
        },
        [1] = {
                /* 110xxxxx */
                .lower = 0xC0, /* 11000000 */
                .upper = 0xDF, /* 11011111 */
-               .mincp = (uint32_t)1 << 7, /* [0] has 7 bits capacity */
+               .mincp = (uint32_t)1 << 7,
+               .maxcp = ((uint32_t)1 << 11) - 1, /* 5+6=11 bits capacity */
        },
        [2] = {
                /* 1110xxxx */
                .lower = 0xE0, /* 11100000 */
                .upper = 0xEF, /* 11101111 */
-               .mincp = (uint32_t)1 << 11, /* [1] has 5+6=11 bits capacity */
+               .mincp = (uint32_t)1 << 11,
+               .maxcp = ((uint32_t)1 << 16) - 1, /* 4+6+6=16 bits capacity */
        },
        [3] = {
                /* 11110xxx */
                .lower = 0xF0, /* 11110000 */
                .upper = 0xF7, /* 11110111 */
-               .mincp = (uint32_t)1 << 16, /* [2] has 4+6+6=16 bits capacity */
+               .mincp = (uint32_t)1 << 16,
+               .maxcp = ((uint32_t)1 << 21) - 1, /* 3+6+6+6=21 bits capacity */
        },
 };
 
@@ -117,3 +122,55 @@ grapheme_cp_decode(uint32_t *cp, const uint8_t *s, size_t n)
 
        return 1 + off;
 }
+
+size_t
+grapheme_cp_encode(uint32_t cp, uint8_t *s, size_t n)
+{
+       size_t off, i; /* off: lut index, i.e. number of following bytes */
+
+       if (BETWEEN(cp, UINT32_C(0xD800), UINT32_C(0xDFFF)) ||
+           cp > UINT32_C(0x10FFFF)) {
+               /*
+                * cp is a UTF-16 surrogate half (0xD800..0xDFFF) or
+                * lies beyond 0x10FFFF; RFC 3629 deems both invalid
+                * for UTF-8, so CP_INVALID is encoded in its place.
+                */
+               cp = CP_INVALID;
+       }
+
+       /* pick the first (shortest) sequence type whose maxcp covers cp */
+       for (off = 0; off < LEN(lut); off++) {
+               if (cp <= lut[off].maxcp) {
+                       break;
+               }
+       }
+       if (1 + off > n) {
+               /* buffer too small: write nothing, return the needed length */
+               return 1 + off;
+       }
+
+       /* build sequence by filling cp-bits into each byte */
+
+       /*
+        * lut[off].lower is the bit-format for the first byte. The
+        * bits to fill into it are obtained by shifting cp right by
+        * 6 times the number of following bytes, as each following
+        * byte stores 6 bits.
+        *
+        * No masking is needed here: the sequence type was chosen
+        * such that cp <= lut[off].maxcp, so no bits spill over.
+        */
+       s[0] = lut[off].lower | (cp >> (6 * off));
+
+       for (i = 1; i <= off; i++) {
+               /*
+                * the bit-format for following bytes is 10000000 (0x80)
+                * and each of them stores 6 bits in its 6 low bits,
+                * which we extract from the properly-shifted value
+                * using the mask 00111111 (0x3F)
+                */
+               s[i] = 0x80 | ((cp >> (6 * (off - i))) & 0x3F);
+       }
+
+       return 1 + off; /* number of bytes written to s */
+}
diff --git a/src/codepoint.h b/src/codepoint.h
index 6a100f1..38292ba 100644
--- a/src/codepoint.h
+++ b/src/codepoint.h
@@ -10,5 +10,6 @@ typedef uint32_t Codepoint;
 #define CP_INVALID 0xFFFD
 
 size_t grapheme_cp_decode(uint32_t *, const uint8_t *, size_t);
+size_t grapheme_cp_encode(uint32_t, uint8_t *, size_t);
 
 #endif /* CODEPOINT_H */

[hackers] [libgrapheme] Add UTF-8-encode-function || Laslo Hunhold

Reply via email to