commit a5b1b0c0c7bc1576b5893175b27585fa963f4433
Author:     Laslo Hunhold <[email protected]>
AuthorDate: Sun Oct 2 22:05:11 2022 +0200
Commit:     Laslo Hunhold <[email protected]>
CommitDate: Sun Oct 2 22:05:11 2022 +0200

    Refactor sentence-functions with Proper (using Herodotus in the background)
    
    This refactor was a breeze and it passed all conformance tests on the
    first try. This, just like with the word-functions, leads to a massive
    simplification and separation of concerns in the code. And as with the
    word functions, this fixes some known quirks.
    
    Signed-off-by: Laslo Hunhold <[email protected]>

diff --git a/src/sentence.c b/src/sentence.c
index c302747..88c21de 100644
--- a/src/sentence.c
+++ b/src/sentence.c
@@ -6,11 +6,17 @@
 #include "../grapheme.h"
 #include "util.h"
 
-static inline enum sentence_break_property
-get_break_prop(uint_least32_t cp)
+struct sentence_break_state
+{
+       uint_least8_t aterm_close_sp_level;
+       uint_least8_t saterm_close_sp_parasep_level;
+};
+
+static inline uint_least8_t
+get_sentence_break_prop(uint_least32_t cp)
 {
        if (likely(cp <= 0x10FFFF)) {
-               return (enum sentence_break_property)
+               return (uint_least8_t)
                       sentence_break_minor[sentence_break_major[cp >> 8] +
                       (cp & 0xff)];
        } else {
@@ -18,243 +24,157 @@ get_break_prop(uint_least32_t cp)
        }
 }
 
-static size_t
-next_sentence_break(const void *str, size_t len, size_t (*get_codepoint)
-                    (const void *, size_t, size_t, uint_least32_t *))
+static bool
+is_skippable_sentence_prop(uint_least8_t prop)
 {
-       struct {
-               enum sentence_break_property a, b, c, d;
-       } raw, skip;
-       enum sentence_break_property res;
-       uint_least32_t cp;
-       uint_least8_t aterm_close_sp_level = 0,
-                     saterm_close_sp_parasep_level = 0;
-       size_t off, tmp, new_off;
+       return prop == SENTENCE_BREAK_PROP_EXTEND ||
+              prop == SENTENCE_BREAK_PROP_FORMAT;
+}
 
-       /* check degenerate cases */
-       if (str == NULL || len == 0) {
-               return 0;
-       }
+static void
+sentence_skip_shift_callback(uint_least8_t prop, void *s)
+{
+       struct sentence_break_state *state = (struct sentence_break_state *)s;
 
        /*
-        * Apply sentence breaking algorithm (UAX #29), see
-        * https://unicode.org/reports/tr29/#Sentence_Boundary_Rules
+        * Here comes a bit of magic. The rules
+        * SB8, SB8a, SB9 and SB10 have very complicated
+        * left-hand-side-rules of the form
         *
-        * There are 4 slots (a, b, c, d) of "break" properties and
-        * we check if there is a break in the middle between b and c.
+        *  ATerm Close* Sp*
+        *  SATerm Close*
+        *  SATerm Close* Sp*
+        *  SATerm Close* Sp* ParaSep?
         *
-        * The position of this middle spot is determined by off,
-        * which gives the offset of the first element on the right
-        * hand side of said spot, or, in other words, gives the number
-        * of elements on the left hand side.
+        * but instead of backtracking, we keep the
+        * state as some kind of "power level" in
+        * two state-variables
         *
-        * It is further complicated by the fact that the algorithm
-        * expects you to skip certain characters for the second
-        * half of the rules (after SB5). Thus, we do not only have
-        * the "raw" properties as described above, but also the "skip"
-        * properties, where the skip.a and skip.b, for instance,
-        * give the two preceding character properties behind the
-        * currently investigated breakpoint.
+        *  aterm_close_sp_level
+        *  saterm_close_sp_parasep_level
+        *
+        * that go from 0 to 3/4:
+        *
+        *  0: we are not in the sequence
+        *  1: we have one ATerm/SATerm to the left of
+        *     the middle spot
+        *  2: we have one ATerm/SATerm and one or more
+        *     Close to the left of the middle spot
+        *  3: we have one ATerm/SATerm, zero or more
+        *     Close and one or more Sp to the left of
+        *     the middle spot.
+        *  4: we have one SATerm, zero or more Close,
+        *     zero or more Sp and one ParaSep to the
+        *     left of the middle spot.
         *
         */
-
-       /*
-        * Initialize the different properties such that we have
-        * a good state after the state-update in the loop
-        */
-       raw.b = NUM_SENTENCE_BREAK_PROPS;
-       if ((off = get_codepoint(str, len, 0, &cp)) >= len) {
-               /*
-                * A line is at least one codepoint long, so we can
-                * safely return here
-                */
-               return len;
+       if ((state->aterm_close_sp_level == 0 ||
+            state->aterm_close_sp_level == 1) &&
+           prop == SENTENCE_BREAK_PROP_ATERM) {
+               /* sequence has begun */
+               state->aterm_close_sp_level = 1;
+       } else if ((state->aterm_close_sp_level == 1 ||
+                   state->aterm_close_sp_level == 2) &&
+                  prop == SENTENCE_BREAK_PROP_CLOSE) {
+               /* close-sequence begins or continued */
+               state->aterm_close_sp_level = 2;
+       } else if ((state->aterm_close_sp_level == 1 ||
+                   state->aterm_close_sp_level == 2 ||
+                   state->aterm_close_sp_level == 3) &&
+                  prop == SENTENCE_BREAK_PROP_SP) {
+               /* sp-sequence begins or continued */
+               state->aterm_close_sp_level = 3;
+       } else {
+               /* sequence broke */
+               state->aterm_close_sp_level = 0;
        }
-       raw.c = get_break_prop(cp);
-       (void)get_codepoint(str, len, off, &cp);
-       raw.d = get_break_prop(cp);
-       skip.a = skip.b = NUM_SENTENCE_BREAK_PROPS;
-
-       for (; off < len; off = new_off) {
-               /*
-                * Update left side (a and b) of the skip state by
-                * "shifting in" the raw.c property as long as it is
-                * not one of the "ignored" character properties.
-                * While at it, update the RI-counter.
-                *
-                */
-               if (raw.c != SENTENCE_BREAK_PROP_EXTEND &&
-                   raw.c != SENTENCE_BREAK_PROP_FORMAT) {
-                       skip.a = skip.b;
-                       skip.b = raw.c;
-
-                       /*
-                        * Here comes a bit of magic. The rules
-                        * SB8, SB8a, SB9 and SB10 have very complicated
-                        * left-hand-side-rules of the form
-                        *
-                        *  ATerm Close* Sp*
-                        *  SATerm Close*
-                        *  SATerm Close* Sp*
-                        *  SATerm Close* Sp* ParaSep?
-                        * 
-                        * but instead of backtracking, we keep the
-                        * state as some kind of "power level" in
-                        * two variables
-                        *
-                        *  aterm_close_sp_level
-                        *  saterm_close_sp_parasep_level
-                        * 
-                        * that go from 0 to 3/4:
-                        *
-                        *  0: we are not in the sequence
-                        *  1: we have one ATerm/SATerm to the left of
-                        *     the middle spot
-                        *  2: we have one ATerm/SATerm and one or more
-                        *     Close to the left of the middle spot
-                        *  3: we have one ATerm/SATerm, zero or more
-                        *     Close and one or more Sp to the left of
-                        *     the middle spot.
-                        *  4: we have one SATerm, zero or more Close,
-                        *     zero or more Sp and one ParaSep to the
-                        *     left of the middle spot.
-                        *
-                        */
-                       if ((aterm_close_sp_level == 0 ||
-                            aterm_close_sp_level == 1) &&
-                           skip.b == SENTENCE_BREAK_PROP_ATERM) {
-                               /* sequence has begun */
-                               aterm_close_sp_level = 1;
-                       } else if ((aterm_close_sp_level == 1 ||
-                                   aterm_close_sp_level == 2) &&
-                                  skip.b == SENTENCE_BREAK_PROP_CLOSE) {
-                               /* close-sequence begins or continued */
-                               aterm_close_sp_level = 2;
-                       } else if ((aterm_close_sp_level == 1 ||
-                                   aterm_close_sp_level == 2 ||
-                                   aterm_close_sp_level == 3) &&
-                                  skip.b == SENTENCE_BREAK_PROP_SP) {
-                               /* sp-sequence begins or continued */
-                               aterm_close_sp_level = 3;
-                       } else {
-                               /* sequence broke */
-                               aterm_close_sp_level = 0;
-                       }
 
-                       if ((saterm_close_sp_parasep_level == 0 ||
-                            saterm_close_sp_parasep_level == 1) &&
-                           (skip.b == SENTENCE_BREAK_PROP_STERM ||
-                            skip.b == SENTENCE_BREAK_PROP_ATERM)) {
-                               /* sequence has begun */
-                               saterm_close_sp_parasep_level = 1;
-                       } else if ((saterm_close_sp_parasep_level == 1 ||
-                                   saterm_close_sp_parasep_level == 2) &&
-                                  skip.b == SENTENCE_BREAK_PROP_CLOSE) {
-                               /* close-sequence begins or continued */
-                               saterm_close_sp_parasep_level = 2;
-                       } else if ((saterm_close_sp_parasep_level == 1 ||
-                                   saterm_close_sp_parasep_level == 2 ||
-                                   saterm_close_sp_parasep_level == 3) &&
-                                  skip.b == SENTENCE_BREAK_PROP_SP) {
-                               /* sp-sequence begins or continued */
-                               saterm_close_sp_parasep_level = 3;
-                       } else if ((saterm_close_sp_parasep_level == 1 ||
-                                   saterm_close_sp_parasep_level == 2 ||
-                                   saterm_close_sp_parasep_level == 3) &&
-                                  (skip.b == SENTENCE_BREAK_PROP_SEP ||
-                                   skip.b == SENTENCE_BREAK_PROP_CR  ||
-                                   skip.b == SENTENCE_BREAK_PROP_LF)) {
-                               /* ParaSep at the end of the sequence */
-                               saterm_close_sp_parasep_level = 4;
-                       } else {
-                               /* sequence broke */
-                               saterm_close_sp_parasep_level = 0;
-                       }
-               }
-
-               /*
-                * Update right side (b and c) of the skip state by
-                * starting at the breakpoint and detecting the two
-                * following non-ignored character classes
-                *
-                */
-               skip.c = NUM_SENTENCE_BREAK_PROPS;
-               for (tmp = off; tmp < len; ) {
-                       tmp += get_codepoint(str, len, tmp, &cp);
-                       res = get_break_prop(cp);
-
-                       if (res != SENTENCE_BREAK_PROP_EXTEND &&
-                           res != SENTENCE_BREAK_PROP_FORMAT) {
-                               skip.c = res;
-                               break;
-                       }
-               }
-               skip.d = NUM_SENTENCE_BREAK_PROPS;
-               for (; tmp < len; ) {
-                       tmp += get_codepoint(str, len, tmp, &cp);
-                       res = get_break_prop(cp);
+       if ((state->saterm_close_sp_parasep_level == 0 ||
+            state->saterm_close_sp_parasep_level == 1) &&
+           (prop == SENTENCE_BREAK_PROP_STERM ||
+            prop == SENTENCE_BREAK_PROP_ATERM)) {
+               /* sequence has begun */
+               state->saterm_close_sp_parasep_level = 1;
+       } else if ((state->saterm_close_sp_parasep_level == 1 ||
+                   state->saterm_close_sp_parasep_level == 2) &&
+                  prop == SENTENCE_BREAK_PROP_CLOSE) {
+               /* close-sequence begins or continued */
+               state->saterm_close_sp_parasep_level = 2;
+       } else if ((state->saterm_close_sp_parasep_level == 1 ||
+                   state->saterm_close_sp_parasep_level == 2 ||
+                   state->saterm_close_sp_parasep_level == 3) &&
+                  prop == SENTENCE_BREAK_PROP_SP) {
+               /* sp-sequence begins or continued */
+               state->saterm_close_sp_parasep_level = 3;
+       } else if ((state->saterm_close_sp_parasep_level == 1 ||
+                   state->saterm_close_sp_parasep_level == 2 ||
+                   state->saterm_close_sp_parasep_level == 3) &&
+                  (prop == SENTENCE_BREAK_PROP_SEP ||
+                   prop == SENTENCE_BREAK_PROP_CR  ||
+                   prop == SENTENCE_BREAK_PROP_LF)) {
+               /* ParaSep at the end of the sequence */
+               state->saterm_close_sp_parasep_level = 4;
+       } else {
+               /* sequence broke */
+               state->saterm_close_sp_parasep_level = 0;
+       }
+}
 
-                       if (res != SENTENCE_BREAK_PROP_EXTEND &&
-                           res != SENTENCE_BREAK_PROP_FORMAT) {
-                               skip.d = res;
-                               break;
-                       }
-               }
+static size_t
+next_sentence_break(HERODOTUS_READER *r)
+{
+       HERODOTUS_READER tmp;
+       enum sentence_break_property prop;
+       struct proper p;
+       struct sentence_break_state state = { 0 };
+       uint_least32_t cp;
 
-               /*
-                * Update the raw state by simply shifting everything
-                * in and, if we still have data left, determining
-                * the character class of the next codepoint.
-                *
-                */
-               raw.a = raw.b;
-               raw.b = raw.c;
-               raw.c = raw.d;
-               if ((new_off = off + get_codepoint(str, len, off, &cp)) < len) {
-                       get_codepoint(str, len, new_off, &cp);
-                       raw.d = get_break_prop(cp);
-               } else {
-                       raw.d = NUM_SENTENCE_BREAK_PROPS;
-               }
+       /*
+        * Apply sentence breaking algorithm (UAX #29), see
+        * https://unicode.org/reports/tr29/#Sentence_Boundary_Rules
+        */
+       proper_init(r, &state, NUM_SENTENCE_BREAK_PROPS,
+                   get_sentence_break_prop, is_skippable_sentence_prop,
+                   sentence_skip_shift_callback, &p);
 
+       while (!proper_advance(&p)) {
                /* SB3 */
-               if (raw.b == SENTENCE_BREAK_PROP_CR &&
-                   raw.c == SENTENCE_BREAK_PROP_LF) {
+               if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR &&
+                   p.raw.next_prop[0] == SENTENCE_BREAK_PROP_LF) {
                        continue;
                }
 
                /* SB4 */
-               if (raw.b == SENTENCE_BREAK_PROP_SEP ||
-                   raw.b == SENTENCE_BREAK_PROP_CR  ||
-                   raw.b == SENTENCE_BREAK_PROP_LF) {
+               if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_SEP ||
+                   p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR  ||
+                   p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_LF) {
                        break;
                }
 
                /* SB5 */
-               if (raw.c == SENTENCE_BREAK_PROP_EXTEND ||
-                   raw.c == SENTENCE_BREAK_PROP_FORMAT) {
+               if (p.raw.next_prop[0] == SENTENCE_BREAK_PROP_EXTEND ||
+                   p.raw.next_prop[0] == SENTENCE_BREAK_PROP_FORMAT) {
                        continue;
                }
 
                /* SB6 */
-               if (skip.b == SENTENCE_BREAK_PROP_ATERM &&
-                   skip.c == SENTENCE_BREAK_PROP_NUMERIC) {
+               if (p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM &&
+                   p.skip.next_prop[0] == SENTENCE_BREAK_PROP_NUMERIC) {
                        continue;
                }
 
                /* SB7 */
-               if (off > 1 &&
-                   (skip.a == SENTENCE_BREAK_PROP_UPPER ||
-                    skip.a == SENTENCE_BREAK_PROP_LOWER) &&
-                   skip.b == SENTENCE_BREAK_PROP_ATERM &&
-                   skip.c == SENTENCE_BREAK_PROP_UPPER) {
+               if ((p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_UPPER ||
+                    p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_LOWER) &&
+                   p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM &&
+                   p.skip.next_prop[0] == SENTENCE_BREAK_PROP_UPPER) {
                        continue;
                }
 
                /* SB8 */
-               if (aterm_close_sp_level == 1 ||
-                   aterm_close_sp_level == 2 ||
-                   aterm_close_sp_level == 3) {
+               if (state.aterm_close_sp_level == 1 ||
+                   state.aterm_close_sp_level == 2 ||
+                   state.aterm_close_sp_level == 3) {
                        /*
                         * This is the most complicated rule, requiring
                         * the right-hand-side to satisfy the regular expression
@@ -262,67 +182,75 @@ next_sentence_break(const void *str, size_t len, size_t 
(*get_codepoint)
                         *  ( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* 
Lower
                         *
                         * which we simply check "manually" given LUT-lookups
-                        * are very cheap.
+                        * are very cheap by starting at the mid_reader.
                         *
                         */
-                       for (tmp = off, res = NUM_SENTENCE_BREAK_PROPS; tmp < 
len; ) {
-                               tmp += get_codepoint(str, len, tmp, &cp);
-                               res = get_break_prop(cp);
+                       herodotus_reader_copy(&(p.mid_reader), &tmp);
+
+                       prop = NUM_SENTENCE_BREAK_PROPS;
+                       while (herodotus_read_codepoint(&tmp, true, &cp) ==
+                              HERODOTUS_STATUS_SUCCESS) {
+                               prop = get_sentence_break_prop(cp);
 
-                               if (res == SENTENCE_BREAK_PROP_OLETTER ||
-                                   res == SENTENCE_BREAK_PROP_UPPER   ||
-                                   res == SENTENCE_BREAK_PROP_LOWER   ||
-                                   res == SENTENCE_BREAK_PROP_SEP     ||
-                                   res == SENTENCE_BREAK_PROP_CR      ||
-                                   res == SENTENCE_BREAK_PROP_LF      ||
-                                   res == SENTENCE_BREAK_PROP_STERM   ||
-                                   res == SENTENCE_BREAK_PROP_ATERM) {
+                               /*
+                                * the skippable properties are ignored
+                                * automatically here given they do not
+                                * match the following condition
+                                */
+                               if (prop == SENTENCE_BREAK_PROP_OLETTER ||
+                                   prop == SENTENCE_BREAK_PROP_UPPER   ||
+                                   prop == SENTENCE_BREAK_PROP_LOWER   ||
+                                   prop == SENTENCE_BREAK_PROP_SEP     ||
+                                   prop == SENTENCE_BREAK_PROP_CR      ||
+                                   prop == SENTENCE_BREAK_PROP_LF      ||
+                                   prop == SENTENCE_BREAK_PROP_STERM   ||
+                                   prop == SENTENCE_BREAK_PROP_ATERM) {
                                        break;
                                }
                        }
 
-                       if (res == SENTENCE_BREAK_PROP_LOWER) {
+                       if (prop == SENTENCE_BREAK_PROP_LOWER) {
                                continue;
                        }
                }
 
                /* SB8a */
-               if ((saterm_close_sp_parasep_level == 1 ||
-                    saterm_close_sp_parasep_level == 2 ||
-                    saterm_close_sp_parasep_level == 3) &&
-                   (skip.c == SENTENCE_BREAK_PROP_SCONTINUE ||
-                    skip.c == SENTENCE_BREAK_PROP_STERM     ||
-                     skip.c == SENTENCE_BREAK_PROP_ATERM)) {
+               if ((state.saterm_close_sp_parasep_level == 1 ||
+                    state.saterm_close_sp_parasep_level == 2 ||
+                    state.saterm_close_sp_parasep_level == 3) &&
+                   (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SCONTINUE ||
+                    p.skip.next_prop[0] == SENTENCE_BREAK_PROP_STERM     ||
+                     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_ATERM)) {
                        continue;
                }
 
                /* SB9 */
-               if ((saterm_close_sp_parasep_level == 1 ||
-                    saterm_close_sp_parasep_level == 2) &&
-                   (skip.c == SENTENCE_BREAK_PROP_CLOSE ||
-                    skip.c == SENTENCE_BREAK_PROP_SP    ||
-                    skip.c == SENTENCE_BREAK_PROP_SEP   ||
-                    skip.c == SENTENCE_BREAK_PROP_CR    ||
-                    skip.c == SENTENCE_BREAK_PROP_LF)) {
+               if ((state.saterm_close_sp_parasep_level == 1 ||
+                    state.saterm_close_sp_parasep_level == 2) &&
+                   (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CLOSE ||
+                    p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP    ||
+                    p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP   ||
+                    p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR    ||
+                    p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) {
                        continue;
                }
 
                /* SB10 */
-               if ((saterm_close_sp_parasep_level == 1 ||
-                    saterm_close_sp_parasep_level == 2 ||
-                    saterm_close_sp_parasep_level == 3) &&
-                   (skip.c == SENTENCE_BREAK_PROP_SP  ||
-                    skip.c == SENTENCE_BREAK_PROP_SEP ||
-                    skip.c == SENTENCE_BREAK_PROP_CR  ||
-                    skip.c == SENTENCE_BREAK_PROP_LF)) {
+               if ((state.saterm_close_sp_parasep_level == 1 ||
+                    state.saterm_close_sp_parasep_level == 2 ||
+                    state.saterm_close_sp_parasep_level == 3) &&
+                   (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP  ||
+                    p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP ||
+                    p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR  ||
+                    p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) {
                        continue;
                }
 
                /* SB11 */
-               if (saterm_close_sp_parasep_level == 1 ||
-                   saterm_close_sp_parasep_level == 2 ||
-                   saterm_close_sp_parasep_level == 3 ||
-                   saterm_close_sp_parasep_level == 4) {
+               if (state.saterm_close_sp_parasep_level == 1 ||
+                   state.saterm_close_sp_parasep_level == 2 ||
+                   state.saterm_close_sp_parasep_level == 3 ||
+                   state.saterm_close_sp_parasep_level == 4) {
                        break;
                }
 
@@ -330,17 +258,25 @@ next_sentence_break(const void *str, size_t len, size_t 
(*get_codepoint)
                continue;
        }
 
-       return off;
+       return herodotus_reader_number_read(&(p.mid_reader));
 }
 
 size_t
 grapheme_next_sentence_break(const uint_least32_t *str, size_t len)
 {
-       return next_sentence_break(str, len, get_codepoint);
+       HERODOTUS_READER r;
+
+       herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
+
+       return next_sentence_break(&r);
 }
 
 size_t
 grapheme_next_sentence_break_utf8(const char *str, size_t len)
 {
-       return next_sentence_break(str, len, get_codepoint_utf8);
+       HERODOTUS_READER r;
+
+       herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
+
+       return next_sentence_break(&r);
 }

Reply via email to