commit 65785f699be45dd77bdcbfc1d3aded39151f3205
Author:     Laslo Hunhold <[email protected]>
AuthorDate: Sat Sep 24 11:45:20 2022 +0200
Commit:     Laslo Hunhold <[email protected]>
CommitDate: Sat Sep 24 11:45:20 2022 +0200

    Refactor character-functions with Herodotus
    
    This also unifies the code and drops a lot of complicated state
    handling.
    
    Signed-off-by: Laslo Hunhold <[email protected]>

diff --git a/src/character.c b/src/character.c
index 4a0a05e..4d34b98 100644
--- a/src/character.c
+++ b/src/character.c
@@ -175,61 +175,39 @@ grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, GRAPHEME_STA
        return !notbreak;
 }
 
-size_t
-grapheme_next_character_break(const uint_least32_t *str, size_t len)
+static size_t
+next_character_break(HERODOTUS_READER *r)
 {
        GRAPHEME_STATE state = { 0 };
-       size_t off;
-
-       if (str == NULL || len == 0) {
-               return 0;
-       }
+       uint_least32_t cp0 = 0, cp1 = 0;
 
-       for (off = 1; off < len; off++) {
-               if (grapheme_is_character_break(str[off - 1], str[off], &state)) {
+       for (herodotus_read_codepoint(r, true, &cp0);
+            herodotus_read_codepoint(r, false, &cp1) == HERODOTUS_STATUS_SUCCESS;
+            herodotus_read_codepoint(r, true, &cp0)) {
+               if (grapheme_is_character_break(cp0, cp1, &state)) {
                        break;
                }
        }
 
-       return off;
+       return herodotus_reader_number_read(r);
 }
 
 size_t
-grapheme_next_character_break_utf8(const char *str, size_t len)
+grapheme_next_character_break(const uint_least32_t *str, size_t len)
 {
-       GRAPHEME_STATE state = { 0 };
-       uint_least32_t cp0 = 0, cp1 = 0;
-       size_t off, ret;
-
-       if (str == NULL || len == 0) {
-               return 0;
-       }
+       HERODOTUS_READER r;
 
-       for (off = 0; (len == SIZE_MAX) || off < len; off += ret) {
-               cp0 = cp1;
-               ret = grapheme_decode_utf8(str + off, (len == SIZE_MAX) ?
-                                          SIZE_MAX : len - off, &cp1);
+       herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
 
-               if (len != SIZE_MAX && ret > (len - off)) {
-                       /* string ended abruptly, simply accept cropping */
-                       ret = len - off;
-               }
+       return next_character_break(&r);
+}
 
-               if (len == SIZE_MAX && cp1 == 0) {
-                       /* we hit a NUL-byte and are done */
-                       break;
-               }
+size_t
+grapheme_next_character_break_utf8(const char *str, size_t len)
+{
+       HERODOTUS_READER r;
 
-               if (off == 0) {
-                       /*
-                        * we skip the first round, as we need both
-                        * cp0 and cp1 to be initialized
-                        */
-                       continue;
-               } else if (grapheme_is_character_break(cp0, cp1, &state)) {
-                       break;
-               }
-       }
+       herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
 
-       return off;
+       return next_character_break(&r);
 }
diff --git a/src/util.c b/src/util.c
index 2a2b7d0..9d0eaee 100644
--- a/src/util.c
+++ b/src/util.c
@@ -111,7 +111,11 @@ herodotus_read_codepoint(HERODOTUS_READER *r, bool advance, uint_least32_t *cp)
        }
 
        if (r->type == HERODOTUS_TYPE_CODEPOINT) {
-               *cp = ((const uint_least32_t *)(r->src))[r->off++];
+               *cp = ((const uint_least32_t *)(r->src))[r->off];
+
+               if (advance) {
+                       r->off++;
+               }
        } else { /* r->type == HERODOTUS_TYPE_UTF8 */
                ret = grapheme_decode_utf8((const char *)r->src + r->off,
                                           MIN(r->srclen, r->soft_limit[0]) -

Reply via email to