commit 52b0e29e02068d6a8123042ef901f73e37b2f38f
Author:     Laslo Hunhold <[email protected]>
AuthorDate: Sun Oct 2 21:17:03 2022 +0200
Commit:     Laslo Hunhold <[email protected]>
CommitDate: Sun Oct 2 21:17:03 2022 +0200

    Refactor word-functions with Proper (using Herodotus in the background)
    
    As promised, this leads to a heavy simplification and separation of
    concerns in the code. Additionally, this fixes some known quirks in
    regard to handling NUL-terminated strings.
    
    Signed-off-by: Laslo Hunhold <[email protected]>

diff --git a/src/word.c b/src/word.c
index 91e1c31..4bdf62b 100644
--- a/src/word.c
+++ b/src/word.c
@@ -6,328 +6,237 @@
 #include "../grapheme.h"
 #include "util.h"
 
-static inline enum word_break_property
-get_break_prop(uint_least32_t cp)
+struct word_break_state
+{
+       bool ri_even;
+};
+
+static inline uint_least8_t
+get_word_break_prop(uint_least32_t cp)
 {
        if (likely(cp <= 0x10FFFF)) {
-               return (enum word_break_property)
+               return (uint_least8_t)
                       word_break_minor[word_break_major[cp >> 8] + (cp & 0xff)];
        } else {
                return WORD_BREAK_PROP_OTHER;
        }
 }
 
-static size_t
-next_word_break(const void *str, size_t len, size_t (*get_codepoint)
-                (const void *, size_t, size_t, uint_least32_t *))
+static bool
+is_skippable_word_prop(uint_least8_t prop)
 {
-       struct {
-               enum word_break_property a, b, c, d;
-       } raw, skip;
-       enum word_break_property res;
-       uint_least32_t cp;
-       size_t off, tmp, new_off;
-       bool ri_even = true;
-
-       /* check degenerate cases */
-       if (str == NULL || len == 0) {
-               return 0;
-       }
-
-       /*
-        * Apply word breaking algorithm (UAX #29), see
-        * https://unicode.org/reports/tr29/#Word_Boundary_Rules
-        *
-        * There are 4 slots (a, b, c, d) of "break" properties and
-        * we check if there is a break in the middle between b and c.
-        *
-        * The position of this middle spot is determined by off,
-        * which gives the offset of the first element on the right
-        * hand side of said spot, or, in other words, gives the number
-        * of elements on the left hand side.
-        *
-        * It is further complicated by the fact that the algorithm
-        * expects you to skip certain characters for the second
-        * half of the rules (after WB4). Thus, we do not only have
-        * the "raw" properties as described above, but also the "skip"
-        * properties, where the skip.a and skip.b, for instance,
-        * give the two preceding character properties behind the
-        * currently investigated breakpoint.
-        *
-        */
+       return prop == WORD_BREAK_PROP_EXTEND ||
+              prop == WORD_BREAK_PROP_FORMAT ||
+              prop == WORD_BREAK_PROP_ZWJ;
+}
 
-       /*
-        * Initialize the different properties such that we have
-        * a good state after the state-update in the loop
-        */
-       raw.b = NUM_WORD_BREAK_PROPS;
-       if ((off = get_codepoint(str, len, 0, &cp)) >= len) {
-               /*
-                * A line is at least one codepoint long, so we can
-                * safely return here
-                */
-               return len;
-       }
-       raw.c = get_break_prop(cp);
-       (void)get_codepoint(str, len, off, &cp);
-       raw.d = get_break_prop(cp);
-       skip.a = skip.b = NUM_WORD_BREAK_PROPS;
+static void
+word_skip_shift_callback(uint_least8_t prop, void *s)
+{
+       struct word_break_state *state = (struct word_break_state *)s;
 
-       for (; off < len; off = new_off) {
+       if (prop == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
                /*
-                * Update left side (a and b) of the skip state by
-                * "shifting in" the raw.c property as long as it is
-                * not one of the "ignored" character properties.
-                * While at it, update the RI-counter.
+                * The property we just shifted in is
+                * a regional indicator, increasing the
+                * number of consecutive RIs on the left
+                * side of the breakpoint by one, changing
+                * the oddness.
                 *
                 */
-               if (raw.c != WORD_BREAK_PROP_EXTEND &&
-                   raw.c != WORD_BREAK_PROP_FORMAT &&
-                   raw.c != WORD_BREAK_PROP_ZWJ) {
-                       skip.a = skip.b;
-                       skip.b = raw.c;
-
-                       if (skip.b == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
-                               /*
-                                * The property we just shifted in is
-                                * a regional indicator, increasing the
-                                * number of consecutive RIs on the left
-                                * side of the breakpoint by one, changing
-                                * the oddness.
-                                *
-                                */
-                               ri_even = !ri_even;
-                       } else {
-                               /*
-                                * We saw no regional indicator, so the
-                                * number of consecutive RIs on the left
-                                * side of the breakpoint is zero, which
-                                * is an even number.
-                                *
-                                */
-                               ri_even = true;
-                       }
-               }
-
+               state->ri_even = !(state->ri_even);
+       } else {
                /*
-                * Update right side (b and c) of the skip state by
-                * starting at the breakpoint and detecting the two
-                * following non-ignored character classes
+                * We saw no regional indicator, so the
+                * number of consecutive RIs on the left
+                * side of the breakpoint is zero, which
+                * is an even number.
                 *
                 */
-               skip.c = NUM_WORD_BREAK_PROPS;
-               for (tmp = off; tmp < len; ) {
-                       tmp += get_codepoint(str, len, tmp, &cp);
-                       res = get_break_prop(cp);
-
-                       if (res != WORD_BREAK_PROP_EXTEND &&
-                           res != WORD_BREAK_PROP_FORMAT &&
-                           res != WORD_BREAK_PROP_ZWJ) {
-                               skip.c = res;
-                               break;
-                       }
-               }
-               skip.d = NUM_WORD_BREAK_PROPS;
-               for (; tmp < len; ) {
-                       tmp += get_codepoint(str, len, tmp, &cp);
-                       res = get_break_prop(cp);
-
-                       if (res != WORD_BREAK_PROP_EXTEND &&
-                           res != WORD_BREAK_PROP_FORMAT &&
-                           res != WORD_BREAK_PROP_ZWJ) {
-                               skip.d = res;
-                               break;
-                       }
-               }
+               state->ri_even = true;
+       }
+}
 
-               /*
-                * Update the raw state by simply shifting everything
-                * in and, if we still have data left, determining
-                * the character class of the next codepoint.
-                *
-                */
-               raw.a = raw.b;
-               raw.b = raw.c;
-               raw.c = raw.d;
-               if ((new_off = off + get_codepoint(str, len, off, &cp)) < len) {
-                       get_codepoint(str, len, new_off, &cp);
-                       raw.d = get_break_prop(cp);
-               } else {
-                       raw.d = NUM_WORD_BREAK_PROPS;
-               }
+static size_t
+next_word_break(HERODOTUS_READER *r)
+{
+       struct proper p;
+       struct word_break_state state = { .ri_even = true };
 
+       /*
+        * Apply word breaking algorithm (UAX #29), see
+        * https://unicode.org/reports/tr29/#Word_Boundary_Rules
+        */
+       proper_init(r, &state, NUM_WORD_BREAK_PROPS, get_word_break_prop,
+                   is_skippable_word_prop, word_skip_shift_callback, &p);
+
+       while (!proper_advance(&p)) {
                /* WB3 */
-               if (raw.b == WORD_BREAK_PROP_CR &&
-                   raw.c == WORD_BREAK_PROP_LF) {
+               if (p.raw.prev_prop[0] == WORD_BREAK_PROP_CR &&
+                   p.raw.next_prop[0] == WORD_BREAK_PROP_LF) {
                        continue;
                }
 
                /* WB3a */
-               if (raw.b == WORD_BREAK_PROP_NEWLINE ||
-                   raw.b == WORD_BREAK_PROP_CR      ||
-                   raw.b == WORD_BREAK_PROP_LF) {
+               if (p.raw.prev_prop[0] == WORD_BREAK_PROP_NEWLINE ||
+                   p.raw.prev_prop[0] == WORD_BREAK_PROP_CR      ||
+                   p.raw.prev_prop[0] == WORD_BREAK_PROP_LF) {
                        break;
                }
 
                /* WB3b */
-               if (raw.c == WORD_BREAK_PROP_NEWLINE ||
-                   raw.c == WORD_BREAK_PROP_CR      ||
-                   raw.c == WORD_BREAK_PROP_LF) {
+               if (p.raw.next_prop[0] == WORD_BREAK_PROP_NEWLINE ||
+                   p.raw.next_prop[0] == WORD_BREAK_PROP_CR      ||
+                   p.raw.next_prop[0] == WORD_BREAK_PROP_LF) {
                        break;
                }
 
                /* WB3c */
-               if (raw.b == WORD_BREAK_PROP_ZWJ &&
-                   (raw.c == WORD_BREAK_PROP_EXTENDED_PICTOGRAPHIC ||
-                    raw.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT)) {
+               if (p.raw.prev_prop[0] == WORD_BREAK_PROP_ZWJ &&
+                   (p.raw.next_prop[0] == WORD_BREAK_PROP_EXTENDED_PICTOGRAPHIC ||
+                    p.raw.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT)) {
                        continue;
                }
 
                /* WB3d */
-               if (raw.b == WORD_BREAK_PROP_WSEGSPACE &&
-                   raw.c == WORD_BREAK_PROP_WSEGSPACE) {
+               if (p.raw.prev_prop[0] == WORD_BREAK_PROP_WSEGSPACE &&
+                   p.raw.next_prop[0] == WORD_BREAK_PROP_WSEGSPACE) {
                        continue;
                }
 
                /* WB4 */
-               if (raw.c == WORD_BREAK_PROP_EXTEND ||
-                   raw.c == WORD_BREAK_PROP_FORMAT ||
-                   raw.c == WORD_BREAK_PROP_ZWJ) {
+               if (p.raw.next_prop[0] == WORD_BREAK_PROP_EXTEND ||
+                   p.raw.next_prop[0] == WORD_BREAK_PROP_FORMAT ||
+                   p.raw.next_prop[0] == WORD_BREAK_PROP_ZWJ) {
                        continue;
                }
 
                /* WB5 */
-               if ((skip.b == WORD_BREAK_PROP_ALETTER              ||
-                    skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
-                    skip.b == WORD_BREAK_PROP_HEBREW_LETTER) &&
-                   (skip.c == WORD_BREAK_PROP_ALETTER              ||
-                    skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
-                    skip.c == WORD_BREAK_PROP_HEBREW_LETTER)) {
+               if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER              ||
+                    p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+                    p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
+                   (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER              ||
+                    p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+                    p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER)) {
                        continue;
                }
 
                /* WB6 */
-               if ((skip.b == WORD_BREAK_PROP_ALETTER              ||
-                    skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
-                    skip.b == WORD_BREAK_PROP_HEBREW_LETTER) &&
-                   (skip.c == WORD_BREAK_PROP_MIDLETTER    ||
-                    skip.c == WORD_BREAK_PROP_MIDNUMLET    ||
-                    skip.c == WORD_BREAK_PROP_SINGLE_QUOTE) &&
-                   len > 2 &&
-                   (skip.d == WORD_BREAK_PROP_ALETTER              ||
-                    skip.d == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
-                    skip.d == WORD_BREAK_PROP_HEBREW_LETTER)) {
+               if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER              ||
+                    p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+                    p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
+                   (p.skip.next_prop[0] == WORD_BREAK_PROP_MIDLETTER    ||
+                    p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUMLET    ||
+                    p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
+                   (p.skip.next_prop[1] == WORD_BREAK_PROP_ALETTER              ||
+                    p.skip.next_prop[1] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+                    p.skip.next_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER)) {
                        continue;
                }
 
                /* WB7 */
-               if ((skip.b == WORD_BREAK_PROP_MIDLETTER    ||
-                    skip.b == WORD_BREAK_PROP_MIDNUMLET    ||
-                    skip.b == WORD_BREAK_PROP_SINGLE_QUOTE) &&
-                   (skip.c == WORD_BREAK_PROP_ALETTER              ||
-                    skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
-                    skip.c == WORD_BREAK_PROP_HEBREW_LETTER) &&
-                   len > 2 &&
-                   (skip.a == WORD_BREAK_PROP_ALETTER              ||
-                    skip.a == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
-                    skip.a == WORD_BREAK_PROP_HEBREW_LETTER)) {
+               if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDLETTER    ||
+                    p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUMLET    ||
+                    p.skip.prev_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
+                   (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER              ||
+                    p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+                    p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
+                   (p.skip.prev_prop[1] == WORD_BREAK_PROP_ALETTER              ||
+                    p.skip.prev_prop[1] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+                    p.skip.prev_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER)) {
                        continue;
                }
 
                /* WB7a */
-               if (skip.b == WORD_BREAK_PROP_HEBREW_LETTER &&
-                   skip.c == WORD_BREAK_PROP_SINGLE_QUOTE) {
+               if (p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
+                   p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) {
                        continue;
                }
 
                /* WB7b */
-               if (skip.b == WORD_BREAK_PROP_HEBREW_LETTER &&
-                   skip.c == WORD_BREAK_PROP_DOUBLE_QUOTE &&
-                   len > 2 &&
-                   skip.d == WORD_BREAK_PROP_HEBREW_LETTER) {
+               if (p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
+                   p.skip.next_prop[0] == WORD_BREAK_PROP_DOUBLE_QUOTE &&
+                   p.skip.next_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER) {
                        continue;
                }
 
                /* WB7c */
-               if (skip.b == WORD_BREAK_PROP_DOUBLE_QUOTE &&
-                   skip.c == WORD_BREAK_PROP_HEBREW_LETTER &&
-                   off > 1 &&
-                   skip.a == WORD_BREAK_PROP_HEBREW_LETTER) {
+               if (p.skip.prev_prop[0] == WORD_BREAK_PROP_DOUBLE_QUOTE &&
+                   p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
+                   p.skip.prev_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER) {
                        continue;
                }
 
                /* WB8 */
-               if (skip.b == WORD_BREAK_PROP_NUMERIC &&
-                   skip.c == WORD_BREAK_PROP_NUMERIC) {
+               if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
+                   p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC) {
                        continue;
                }
 
                /* WB9 */
-               if ((skip.b == WORD_BREAK_PROP_ALETTER              ||
-                    skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
-                    skip.b == WORD_BREAK_PROP_HEBREW_LETTER) &&
-                   skip.c == WORD_BREAK_PROP_NUMERIC) {
+               if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER              ||
+                    p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+                    p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
+                   p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC) {
                        continue;
                }
 
                /* WB10 */
-               if (skip.b == WORD_BREAK_PROP_NUMERIC &&
-                   (skip.c == WORD_BREAK_PROP_ALETTER              ||
-                    skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
-                    skip.c == WORD_BREAK_PROP_HEBREW_LETTER)) {
+               if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
+                   (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER              ||
+                    p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+                    p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER)) {
                        continue;
                }
 
                /* WB11 */
-               if ((skip.b == WORD_BREAK_PROP_MIDNUM       ||
-                    skip.b == WORD_BREAK_PROP_MIDNUMLET    ||
-                    skip.b == WORD_BREAK_PROP_SINGLE_QUOTE) &&
-                   skip.c == WORD_BREAK_PROP_NUMERIC &&
-                   off > 1 &&
-                   skip.a == WORD_BREAK_PROP_NUMERIC) {
+               if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUM       ||
+                    p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUMLET    ||
+                    p.skip.prev_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
+                   p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC &&
+                   p.skip.prev_prop[1] == WORD_BREAK_PROP_NUMERIC) {
                        continue;
                }
 
                /* WB12 */
-               if (skip.b == WORD_BREAK_PROP_NUMERIC &&
-                   (skip.c == WORD_BREAK_PROP_MIDNUM       ||
-                    skip.c == WORD_BREAK_PROP_MIDNUMLET    ||
-                    skip.c == WORD_BREAK_PROP_SINGLE_QUOTE) &&
-                   len > 2 &&
-                   skip.d == WORD_BREAK_PROP_NUMERIC) {
+               if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
+                   (p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUM       ||
+                    p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUMLET    ||
+                    p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
+                   p.skip.next_prop[1] == WORD_BREAK_PROP_NUMERIC) {
                        continue;
                }
 
                /* WB13 */
-               if (skip.b == WORD_BREAK_PROP_KATAKANA &&
-                   skip.c == WORD_BREAK_PROP_KATAKANA) {
+               if (p.skip.prev_prop[0] == WORD_BREAK_PROP_KATAKANA &&
+                   p.skip.next_prop[0] == WORD_BREAK_PROP_KATAKANA) {
                        continue;
                }
 
                /* WB13a */
-               if ((skip.b == WORD_BREAK_PROP_ALETTER              ||
-                    skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
-                    skip.b == WORD_BREAK_PROP_HEBREW_LETTER        ||
-                    skip.b == WORD_BREAK_PROP_NUMERIC              ||
-                    skip.b == WORD_BREAK_PROP_KATAKANA             ||
-                    skip.b == WORD_BREAK_PROP_EXTENDNUMLET) &&
-                   skip.c == WORD_BREAK_PROP_EXTENDNUMLET) {
+               if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER              ||
+                    p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+                    p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER        ||
+                    p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC              ||
+                    p.skip.prev_prop[0] == WORD_BREAK_PROP_KATAKANA             ||
+                    p.skip.prev_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET) &&
+                   p.skip.next_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET) {
                        continue;
                }
 
                /* WB13b */
-               if (skip.b == WORD_BREAK_PROP_EXTENDNUMLET &&
-                   (skip.c == WORD_BREAK_PROP_ALETTER              ||
-                    skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
-                    skip.c == WORD_BREAK_PROP_HEBREW_LETTER        ||
-                    skip.c == WORD_BREAK_PROP_NUMERIC              ||
-                    skip.c == WORD_BREAK_PROP_KATAKANA)) {
+               if (p.skip.prev_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET &&
+                   (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER              ||
+                    p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+                    p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER        ||
+                    p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC              ||
+                    p.skip.next_prop[0] == WORD_BREAK_PROP_KATAKANA)) {
                        continue;
                }
 
                /* WB15 and WB16 */
-               if (!ri_even &&
-                   skip.c == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
+               if (!state.ri_even &&
+                   p.skip.next_prop[0] == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
                        continue;
                }
 
@@ -335,17 +244,25 @@ next_word_break(const void *str, size_t len, size_t (*get_codepoint)
                break;
        }
 
-       return off;
+       return herodotus_reader_number_read(&(p.mid_reader));
 }
 
 size_t
 grapheme_next_word_break(const uint_least32_t *str, size_t len)
 {
-       return next_word_break(str, len, get_codepoint);
+       HERODOTUS_READER r;
+
+       herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
+
+       return next_word_break(&r);
 }
 
 size_t
 grapheme_next_word_break_utf8(const char *str, size_t len)
 {
-       return next_word_break(str, len, get_codepoint_utf8);
+       HERODOTUS_READER r;
+
+       herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
+
+       return next_word_break(&r);
 }

Reply via email to