[hackers] [libgrapheme] Refactor API ("lg_" prefix, better naming scheme) || Laslo Hunhold

git Wed, 08 Dec 2021 09:29:54 -0800

commit 1c126d7ee10854b29e606e4eeb491621d021beeb
Author:     Laslo Hunhold <[email protected]>
AuthorDate: Wed Dec 8 18:16:48 2021 +0100
Commit:     Laslo Hunhold <[email protected]>
CommitDate: Wed Dec 8 18:16:48 2021 +0100


    Refactor API ("lg_" prefix, better naming scheme)
    
    The "grapheme_" prefix was sadly a bit confusing, so the API now
    switches to the "lg_" prefix, which also will not get in the way
    too much.
    
    "_nextbreak" and "_isbreak" as a general form make it clearer what
    we actually do.
    
    "utf8_decode" and "utf8_encode" instead of "cp_decode" and
    "cp_encode" greatly improves readability and removes any doubt about
    what these functions do. libgrapheme is usable with any other encoding
    via the "_isbreak"-functions, although you'll have to decode yourself.
    Still, it should be clear by now that UTF-8 should be used everywhere. :)
    
    Signed-off-by: Laslo Hunhold <[email protected]>

diff --git a/Makefile b/Makefile
index a7a6eee..ae3f019 100644
--- a/Makefile
+++ b/Makefile
@@ -9,7 +9,7 @@ DATA =\
        data/GraphemeBreakProperty.txt\
        data/GraphemeBreakTest.txt
 GEN = gen/grapheme gen/grapheme-test
-LIB = src/codepoint src/grapheme src/util
+LIB = src/grapheme src/utf8 src/util
 TEST = test/grapheme test/utf8-decode test/utf8-encode
 
 MAN3 = man/grapheme_bytelen.3
@@ -20,7 +20,7 @@ all: libgrapheme.a libgrapheme.so
 gen/grapheme.o: gen/grapheme.c config.mk gen/util.h
 gen/grapheme-test.o: gen/grapheme-test.c config.mk gen/util.h
 gen/util.o: gen/util.c config.mk gen/util.h
-src/codepoint.o: src/codepoint.c config.mk grapheme.h
+src/utf8.o: src/utf8.c config.mk grapheme.h
 src/grapheme.o: src/grapheme.c config.mk gen/grapheme.h grapheme.h src/util.h
 src/util.o: src/util.c config.mk src/util.h
 test/grapheme.o: test/grapheme.c config.mk gen/grapheme-test.h grapheme.h
diff --git a/grapheme.h b/grapheme.h
index e0c5d02..638f5ba 100644
--- a/grapheme.h
+++ b/grapheme.h
@@ -5,12 +5,12 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#define GRAPHEME_CP_INVALID UINT32_C(0xFFFD)
+#define LG_CODEPOINT_INVALID UINT32_C(0xFFFD)
 
-int grapheme_boundary(uint32_t, uint32_t, int *);
-size_t grapheme_bytelen(const char *);
+size_t lg_utf8_decode(uint32_t *, const uint8_t *, size_t);
+size_t lg_utf8_encode(uint32_t, uint8_t *, size_t);
 
-size_t grapheme_cp_decode(uint32_t *, const uint8_t *, size_t);
-size_t grapheme_cp_encode(uint32_t, uint8_t *, size_t);
+size_t lg_grapheme_nextbreak(const char *);
+int lg_grapheme_isbreak(uint32_t, uint32_t, int *);
 
 #endif /* GRAPHEME_H */
diff --git a/src/grapheme.c b/src/grapheme.c
index 068f91b..5b33435 100644
--- a/src/grapheme.c
+++ b/src/grapheme.c
@@ -11,7 +11,7 @@ enum {
 };
 
 int
-grapheme_boundary(uint32_t a, uint32_t b, int *state)
+lg_grapheme_isbreak(uint32_t a, uint32_t b, int *state)
 {
        struct heisenstate prop[2] = { 0 };
        int s;
@@ -155,7 +155,7 @@ grapheme_boundary(uint32_t a, uint32_t b, int *state)
 }
 
 size_t
-grapheme_bytelen(const char *str)
+lg_grapheme_nextbreak(const char *str)
 {
        uint32_t cp0, cp1;
        size_t ret, len = 0;
@@ -166,7 +166,7 @@ grapheme_bytelen(const char *str)
        }
 
        /*
-        * grapheme_cp_decode, when it encounters an unexpected byte,
+        * lg_utf8_decode, when it encounters an unexpected byte,
         * does not count it to the error and instead assumes that the
         * unexpected byte is the beginning of a new sequence.
         * This way, when the string ends with a null byte, we never
@@ -178,17 +178,17 @@ grapheme_bytelen(const char *str)
         */
 
        /* get first code point */
-       len += grapheme_cp_decode(&cp0, (uint8_t *)str, 5);
-       if (cp0 == GRAPHEME_CP_INVALID) {
+       len += lg_utf8_decode(&cp0, (uint8_t *)str, 5);
+       if (cp0 == LG_CODEPOINT_INVALID) {
                return len;
        }
 
        while (cp0 != 0) {
                /* get next code point */
-               ret = grapheme_cp_decode(&cp1, (uint8_t *)(str + len), 5);
+               ret = lg_utf8_decode(&cp1, (uint8_t *)(str + len), 5);
 
-               if (cp1 == GRAPHEME_CP_INVALID ||
-                   grapheme_boundary(cp0, cp1, &state)) {
+               if (cp1 == LG_CODEPOINT_INVALID ||
+                   lg_grapheme_isbreak(cp0, cp1, &state)) {
                        /* we read an invalid cp or have a breakpoint */
                        break;
                } else {
diff --git a/src/codepoint.c b/src/utf8.c
similarity index 93%
rename from src/codepoint.c
rename to src/utf8.c
index 7e31320..d2c7265 100644
--- a/src/codepoint.c
+++ b/src/utf8.c
@@ -47,13 +47,13 @@ static const struct {
 };
 
 size_t
-grapheme_cp_decode(uint32_t *cp, const uint8_t *s, size_t n)
+lg_utf8_decode(uint32_t *cp, const uint8_t *s, size_t n)
 {
        size_t off, i;
 
        if (n == 0) {
                /* a sequence must be at least 1 byte long */
-               *cp = GRAPHEME_CP_INVALID;
+               *cp = LG_CODEPOINT_INVALID;
                return 1;
        }
 
@@ -74,7 +74,7 @@ grapheme_cp_decode(uint32_t *cp, const uint8_t *s, size_t n)
                 * first byte does not match a sequence type;
                 * set cp as invalid and return 1 byte processed
                 */
-               *cp = GRAPHEME_CP_INVALID;
+               *cp = LG_CODEPOINT_INVALID;
                return 1;
        }
        if (1 + off > n) {
@@ -82,7 +82,7 @@ grapheme_cp_decode(uint32_t *cp, const uint8_t *s, size_t n)
                 * input is not long enough, set cp as invalid and
                 * return number of bytes needed
                 */
-               *cp = GRAPHEME_CP_INVALID;
+               *cp = LG_CODEPOINT_INVALID;
                return 1 + off;
        }
 
@@ -98,7 +98,7 @@ grapheme_cp_decode(uint32_t *cp, const uint8_t *s, size_t n)
                         * unexpected character as recommended since
                         * Unicode 6 (chapter 3)
                         */
-                       *cp = GRAPHEME_CP_INVALID;
+                       *cp = LG_CODEPOINT_INVALID;
                        return 1 + (i - 1);
                }
                /*
@@ -117,14 +117,14 @@ grapheme_cp_decode(uint32_t *cp, const uint8_t *s, size_t n)
                 * not representable in UTF-16 (>0x10FFFF) (RFC-3629
                 * specifies the latter two conditions)
                 */
-               *cp = GRAPHEME_CP_INVALID;
+               *cp = LG_CODEPOINT_INVALID;
        }
 
        return 1 + off;
 }
 
 size_t
-grapheme_cp_encode(uint32_t cp, uint8_t *s, size_t n)
+lg_utf8_encode(uint32_t cp, uint8_t *s, size_t n)
 {
        size_t off, i;
 
@@ -135,7 +135,7 @@ grapheme_cp_encode(uint32_t cp, uint8_t *s, size_t n)
                 * (0xD800..0xDFFF) or not representable in UTF-16
                 * (>0x10FFFF), which RFC-3629 deems invalid for UTF-8.
                 */
-               cp = GRAPHEME_CP_INVALID;
+               cp = LG_CODEPOINT_INVALID;
        }
 
        /* determine necessary sequence type */
diff --git a/test/grapheme.c b/test/grapheme.c
index ff4d1f4..1cb5ad0 100644
--- a/test/grapheme.c
+++ b/test/grapheme.c
@@ -19,9 +19,9 @@ main(void)
        for (i = 0, failed = 0; i < LEN(grapheme_test); i++) {
                for (j = 0, k = 0, state = 0, len = 1; j < grapheme_test[i].cplen; j++) {
                        if ((j + 1) == grapheme_test[i].cplen ||
-                           grapheme_boundary(grapheme_test[i].cp[j],
-                                             grapheme_test[i].cp[j + 1],
-                                             &state)) {
+                           lg_grapheme_isbreak(grapheme_test[i].cp[j],
+                                               grapheme_test[i].cp[j + 1],
+                                               &state)) {
                                /* check if our resulting length matches */
                                if (k == grapheme_test[i].lenlen ||
                                    len != grapheme_test[i].len[k++]) {
diff --git a/test/utf8-decode.c b/test/utf8-decode.c
index 8349f39..033fcb6 100644
--- a/test/utf8-decode.c
+++ b/test/utf8-decode.c
@@ -22,7 +22,7 @@ static const struct {
                .arr     = NULL,
                .len     = 0,
                .exp_len = 1,
-               .exp_cp  = GRAPHEME_CP_INVALID,
+               .exp_cp  = LG_CODEPOINT_INVALID,
        },
        {
                /* invalid lead byte
@@ -32,7 +32,7 @@ static const struct {
                .arr     = (uint8_t[]){ 0xFD },
                .len     = 1,
                .exp_len = 1,
-               .exp_cp  = GRAPHEME_CP_INVALID,
+               .exp_cp  = LG_CODEPOINT_INVALID,
        },
        {
                /* valid 1-byte sequence
@@ -62,7 +62,7 @@ static const struct {
                .arr     = (uint8_t[]){ 0xC3 },
                .len     = 1,
                .exp_len = 2,
-               .exp_cp  = GRAPHEME_CP_INVALID,
+               .exp_cp  = LG_CODEPOINT_INVALID,
        },
        {
                /* invalid 2-byte sequence (second byte malformed)
@@ -72,7 +72,7 @@ static const struct {
                .arr     = (uint8_t[]){ 0xC3, 0xFF },
                .len     = 2,
                .exp_len = 1,
-               .exp_cp  = GRAPHEME_CP_INVALID,
+               .exp_cp  = LG_CODEPOINT_INVALID,
        },
        {
                /* invalid 2-byte sequence (overlong encoded)
@@ -82,7 +82,7 @@ static const struct {
                .arr     = (uint8_t[]){ 0xC1, 0xBF },
                .len     = 2,
                .exp_len = 2,
-               .exp_cp  = GRAPHEME_CP_INVALID,
+               .exp_cp  = LG_CODEPOINT_INVALID,
        },
        {
                /* valid 3-byte sequence
@@ -102,7 +102,7 @@ static const struct {
                .arr     = (uint8_t[]){ 0xE0 },
                .len     = 1,
                .exp_len = 3,
-               .exp_cp  = GRAPHEME_CP_INVALID,
+               .exp_cp  = LG_CODEPOINT_INVALID,
        },
        {
                /* invalid 3-byte sequence (second byte malformed)
@@ -112,7 +112,7 @@ static const struct {
                .arr     = (uint8_t[]){ 0xE0, 0x7F, 0xBF },
                .len     = 3,
                .exp_len = 1,
-               .exp_cp  = GRAPHEME_CP_INVALID,
+               .exp_cp  = LG_CODEPOINT_INVALID,
        },
        {
                /* invalid 3-byte sequence (third byte missing)
@@ -122,7 +122,7 @@ static const struct {
                .arr     = (uint8_t[]){ 0xE0, 0xBF },
                .len     = 2,
                .exp_len = 3,
-               .exp_cp  = GRAPHEME_CP_INVALID,
+               .exp_cp  = LG_CODEPOINT_INVALID,
        },
        {
                /* invalid 3-byte sequence (third byte malformed)
@@ -132,7 +132,7 @@ static const struct {
                .arr     = (uint8_t[]){ 0xE0, 0xBF, 0x7F },
                .len     = 3,
                .exp_len = 2,
-               .exp_cp  = GRAPHEME_CP_INVALID,
+               .exp_cp  = LG_CODEPOINT_INVALID,
        },
        {
                /* invalid 3-byte sequence (overlong encoded)
@@ -142,7 +142,7 @@ static const struct {
                .arr     = (uint8_t[]){ 0xE0, 0x9F, 0xBF },
                .len     = 3,
                .exp_len = 3,
-               .exp_cp  = GRAPHEME_CP_INVALID,
+               .exp_cp  = LG_CODEPOINT_INVALID,
        },
        {
                /* invalid 3-byte sequence (UTF-16 surrogate half)
@@ -152,7 +152,7 @@ static const struct {
                .arr     = (uint8_t[]){ 0xED, 0xA0, 0x80 },
                .len     = 3,
                .exp_len = 3,
-               .exp_cp  = GRAPHEME_CP_INVALID,
+               .exp_cp  = LG_CODEPOINT_INVALID,
        },
        {
                /* valid 4-byte sequence
@@ -172,7 +172,7 @@ static const struct {
                .arr     = (uint8_t[]){ 0xF3 },
                .len     = 1,
                .exp_len = 4,
-               .exp_cp  = GRAPHEME_CP_INVALID,
+               .exp_cp  = LG_CODEPOINT_INVALID,
        },
        {
                /* invalid 4-byte sequence (second byte malformed)
@@ -182,7 +182,7 @@ static const struct {
                .arr     = (uint8_t[]){ 0xF3, 0x7F, 0xBF, 0xBF },
                .len     = 4,
                .exp_len = 1,
-               .exp_cp  = GRAPHEME_CP_INVALID,
+               .exp_cp  = LG_CODEPOINT_INVALID,
        },
        {
                /* invalid 4-byte sequence (third byte missing)
@@ -192,7 +192,7 @@ static const struct {
                .arr     = (uint8_t[]){ 0xF3, 0xBF },
                .len     = 2,
                .exp_len = 4,
-               .exp_cp  = GRAPHEME_CP_INVALID,
+               .exp_cp  = LG_CODEPOINT_INVALID,
        },
        {
                /* invalid 4-byte sequence (third byte malformed)
@@ -202,7 +202,7 @@ static const struct {
                .arr     = (uint8_t[]){ 0xF3, 0xBF, 0x7F, 0xBF },
                .len     = 4,
                .exp_len = 2,
-               .exp_cp  = GRAPHEME_CP_INVALID,
+               .exp_cp  = LG_CODEPOINT_INVALID,
        },
        {
                /* invalid 4-byte sequence (fourth byte missing)
@@ -212,7 +212,7 @@ static const struct {
                .arr     = (uint8_t[]){ 0xF3, 0xBF, 0xBF },
                .len     = 3,
                .exp_len = 4,
-               .exp_cp  = GRAPHEME_CP_INVALID,
+               .exp_cp  = LG_CODEPOINT_INVALID,
        },
        {
                /* invalid 4-byte sequence (fourth byte malformed)
@@ -222,7 +222,7 @@ static const struct {
                .arr     = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0x7F },
                .len     = 4,
                .exp_len = 3,
-               .exp_cp  = GRAPHEME_CP_INVALID,
+               .exp_cp  = LG_CODEPOINT_INVALID,
        },
        {
                /* invalid 4-byte sequence (overlong encoded)
@@ -232,7 +232,7 @@ static const struct {
                .arr     = (uint8_t[]){ 0xF0, 0x80, 0x81, 0xBF },
                .len     = 4,
                .exp_len = 4,
-               .exp_cp  = GRAPHEME_CP_INVALID,
+               .exp_cp  = LG_CODEPOINT_INVALID,
        },
        {
                /* invalid 4-byte sequence (UTF-16-unrepresentable)
@@ -242,7 +242,7 @@ static const struct {
                .arr     = (uint8_t[]){ 0xF4, 0x90, 0x80, 0x80 },
                .len     = 4,
                .exp_len = 4,
-               .exp_cp  = GRAPHEME_CP_INVALID,
+               .exp_cp  = LG_CODEPOINT_INVALID,
        },
 };
 
@@ -256,7 +256,7 @@ main(void)
                size_t len;
                uint32_t cp;
 
-               len = grapheme_cp_decode(&cp, dec_test[i].arr,
+               len = lg_utf8_decode(&cp, dec_test[i].arr,
                                         dec_test[i].len);
 
                if (len != dec_test[i].exp_len ||
diff --git a/test/utf8-encode.c b/test/utf8-encode.c
index 7851d25..0125b2a 100644
--- a/test/utf8-encode.c
+++ b/test/utf8-encode.c
@@ -61,7 +61,7 @@ main(void)
                uint8_t arr[4];
                size_t len;
 
-               len = grapheme_cp_encode(enc_test[i].cp, arr, LEN(arr));
+               len = lg_utf8_encode(enc_test[i].cp, arr, LEN(arr));
 
                if (len != enc_test[i].exp_len ||
                    memcmp(arr, enc_test[i].exp_arr, len)) {

[hackers] [libgrapheme] Refactor API ("lg_" prefix, better naming scheme) || Laslo Hunhold

Reply via email to