commit b899fd685c50cbc61999296ce1e0a03a45e74f52
Author:     Laslo Hunhold <[email protected]>
AuthorDate: Sun Oct 2 21:09:08 2022 +0200
Commit:     Laslo Hunhold <[email protected]>
CommitDate: Sun Oct 2 21:09:08 2022 +0200

    Add "proper"-property-reader
    
    The word- and sentence-segmentation algorithms make use of a complicated
    logic to accomodate "raw" and "skip" properties. The code is barely
    readable and doesn't separate abstractions away nicely. Moreover, there
    is a high probability that certain edge-cases are not handled properly.
    
    To fix this, this commit adds a "proper"-property-reader, which
    basically does the whole dirty details in the background using
    well-commented and transparent code that builds on top of the
    herodotus-reader instead of doing this by hand. This ensures that we
    will (provably) never have buffer overflows unless there is a mistake
    in the implementation itself, which can be verified relatively easily
    given each function has a limited scope.
    
    Signed-off-by: Laslo Hunhold <[email protected]>

diff --git a/src/case.c b/src/case.c
index 04e504a..ac24f71 100644
--- a/src/case.c
+++ b/src/case.c
@@ -147,18 +147,14 @@ to_case(HERODOTUS_READER *r, HERODOTUS_WRITER *w,
 static size_t
 herodotus_next_word_break(const HERODOTUS_READER *r)
 {
-       if (r->src == NULL || r->off > r->srclen) {
-               return 0;
-       }
+       HERODOTUS_READER tmp;
+
+       herodotus_reader_copy(r, &tmp);
 
        if (r->type == HERODOTUS_TYPE_CODEPOINT) {
-               return grapheme_next_word_break(
-                       ((const uint_least32_t *)(r->src)) + r->off,
-                       r->srclen - r->off);
+               return grapheme_next_word_break(tmp.src, tmp.srclen);
        } else { /* r->type == HERODOTUS_TYPE_UTF8 */
-               return grapheme_next_word_break_utf8(
-                       ((const char *)(r->src)) + r->off,
-                       r->srclen - r->off);
+               return grapheme_next_word_break_utf8(tmp.src, tmp.srclen);
        }
 }
 
@@ -168,9 +164,10 @@ to_titlecase(HERODOTUS_READER *r, HERODOTUS_WRITER *w)
        enum case_property prop;
        enum herodotus_status s;
        uint_least32_t cp;
+       size_t nwb;
 
-       for (;;) {
-               herodotus_reader_push_advance_limit(r, 
herodotus_next_word_break(r));
+       for (; (nwb = herodotus_next_word_break(r)) > 0;) {
+               herodotus_reader_push_advance_limit(r, nwb);
                for (; (s = herodotus_read_codepoint(r, false, &cp)) == 
HERODOTUS_STATUS_SUCCESS;) {
                        /* check if we have a cased character */
                        prop = get_case_property(cp);
@@ -354,9 +351,10 @@ is_titlecase(HERODOTUS_READER *r, size_t *output)
        enum herodotus_status s;
        bool ret = true;
        uint_least32_t cp;
+       size_t nwb;
 
-       for (;;) {
-               herodotus_reader_push_advance_limit(r, 
herodotus_next_word_break(r));
+       for (; (nwb = herodotus_next_word_break(r)) > 0;) {
+               herodotus_reader_push_advance_limit(r, nwb);
                for (; (s = herodotus_read_codepoint(r, false, &cp)) == 
HERODOTUS_STATUS_SUCCESS;) {
                        /* check if we have a cased character */
                        prop = get_case_property(cp);
@@ -377,6 +375,7 @@ is_titlecase(HERODOTUS_READER *r, size_t *output)
                         * we did not encounter any cased character
                         * up to the word break
                         */
+                       herodotus_reader_pop_limit(r);
                        continue;
                } else {
                        /*
diff --git a/src/util.c b/src/util.c
index 9d0eaee..1b6a7cb 100644
--- a/src/util.c
+++ b/src/util.c
@@ -30,14 +30,31 @@ herodotus_reader_copy(const HERODOTUS_READER *src, 
HERODOTUS_READER *dest)
 {
        size_t i;
 
+       /*
+        * we copy such that we have a "fresh" start and build
+        * on the fact that src->soft_limit[i] for any i and src->srclen
+        * are always larger or equal to src->off
+        */
        dest->type = src->type;
-       dest->src = src->src;
-       dest->srclen = src->srclen;
-       dest->off = src->off;
+       if (src->type == HERODOTUS_TYPE_CODEPOINT) {
+               dest->src = ((const uint_least32_t *)(src->src)) + src->off;
+       } else { /* src->type == HERODOTUS_TYPE_UTF8 */
+               dest->src = ((const char *)(src->src)) + src->off;
+       }
+       if (src->srclen == SIZE_MAX) {
+               dest->srclen = SIZE_MAX;
+       } else {
+               dest->srclen = src->srclen - src->off;
+       }
+       dest->off = 0;
        dest->terminated_by_null = src->terminated_by_null;
 
        for (i = 0; i < LEN(src->soft_limit); i++) {
-               dest->soft_limit[i] = src->soft_limit[i];
+               if (src->soft_limit[i] == SIZE_MAX) {
+                       dest->soft_limit[i] = src->soft_limit[i];
+               } else {
+                       dest->soft_limit[i] = src->soft_limit[i] - src->off;
+               }
        }
 }
 
@@ -258,6 +275,140 @@ herodotus_write_codepoint(HERODOTUS_WRITER *w, 
uint_least32_t cp)
        }
 }
 
+void
+proper_init(const HERODOTUS_READER *r, void *state, uint_least8_t no_prop,
+            uint_least8_t (*get_break_prop)(uint_least32_t),
+            bool (*is_skippable_prop)(uint_least8_t),
+            void (*skip_shift_callback)(uint_least8_t, void *),
+            struct proper *p)
+{
+       uint_least8_t prop;
+       uint_least32_t cp;
+       size_t i;
+
+       /* set internal variables */
+       p->state = state;
+       p->no_prop = no_prop;
+       p->get_break_prop = get_break_prop;
+       p->is_skippable_prop = is_skippable_prop;
+       p->skip_shift_callback = skip_shift_callback;
+
+       /*
+        * Initialize mid-reader, which is basically just there
+        * to reflect the current position of the viewing-line
+        */
+       herodotus_reader_copy(r, &(p->mid_reader));
+
+       /*
+        * In the initialization, we simply (try to) fill in next_prop.
+        * If we cannot read in more (due to the buffer ending), we
+        * fill in the prop as invalid
+        */
+
+       /*
+        * initialize the previous properties to have no property
+        * (given we are at the start of the buffer)
+        */
+       p->raw.prev_prop[1] = p->raw.prev_prop[0] = p->no_prop;
+       p->skip.prev_prop[1] = p->skip.prev_prop[0] = p->no_prop;
+
+       /*
+        * initialize the next properties
+        */
+
+       /* initialize the raw reader */
+       herodotus_reader_copy(r, &(p->raw_reader));
+
+       /* fill in the two next raw properties (after no-initialization) */
+       p->raw.next_prop[0] = p->raw.next_prop[1] = p->no_prop;
+       for (i = 0; i < 2 && herodotus_read_codepoint(&(p->raw_reader), true, 
&cp) ==
+            HERODOTUS_STATUS_SUCCESS; ) {
+               p->raw.next_prop[i++] = p->get_break_prop(cp);
+       }
+
+       /* initialize the skip reader */
+       herodotus_reader_copy(r, &(p->skip_reader));
+
+       /* fill in the two next skip properties (after no-initialization) */
+       p->skip.next_prop[0] = p->skip.next_prop[1] = p->no_prop;
+       for (i = 0; i < 2 && herodotus_read_codepoint(&(p->skip_reader), true, 
&cp) ==
+            HERODOTUS_STATUS_SUCCESS; ) {
+               prop = p->get_break_prop(cp);
+               if (!p->is_skippable_prop(prop)) {
+                       p->skip.next_prop[i++] = prop;
+               }
+       }
+}
+
+int
+proper_advance(struct proper *p)
+{
+       uint_least8_t prop;
+       uint_least32_t cp;
+
+       /* read in next "raw" property */
+       if (herodotus_read_codepoint(&(p->raw_reader), true, &cp) ==
+           HERODOTUS_STATUS_SUCCESS) {
+               prop = p->get_break_prop(cp);
+       } else {
+               prop = p->no_prop;
+       }
+
+       /*
+        * do a shift-in, unless we find that the property that is to
+        * be moved past the "raw-viewing-line" (this property is stored
+        * in p->raw.next_prop[0]) is a no_prop, indicating that
+        * we are at the end of the buffer.
+        */
+       if (p->raw.next_prop[0] == p->no_prop) {
+               return 1;
+       }
+
+       /* shift in the properties */
+       p->raw.prev_prop[1] = p->raw.prev_prop[0];
+       p->raw.prev_prop[0] = p->raw.next_prop[0];
+       p->raw.next_prop[0] = p->raw.next_prop[1];
+       p->raw.next_prop[1] = prop;
+
+       /* advance the middle reader viewing-line */
+       (void)herodotus_read_codepoint(&(p->mid_reader), true, &cp);
+
+       /* check skippability-property */
+       if (!p->is_skippable_prop(p->raw.prev_prop[0])) {
+               /*
+                * the property that has moved past the "raw-viewing-line"
+                * (this property is now (after the raw-shift) stored in
+                * p->raw.prev_prop[0] and guaranteed not to be a no-prop,
+                * guaranteeing that we won't shift a no-prop past the
+                * "viewing-line" in the skip-properties) is not a skippable
+                * property, thus we need to shift the skip property as well.
+                */
+               p->skip.prev_prop[1] = p->skip.prev_prop[0];
+               p->skip.prev_prop[0] = p->skip.next_prop[0];
+               p->skip.next_prop[0] = p->skip.next_prop[1];
+
+               /*
+                * call the skip-shift-callback on the property that
+                * passed the skip-viewing-line (this property is now
+                * stored in p->skip.prev_prop[0]).
+                */
+               p->skip_shift_callback(p->skip.prev_prop[0], p->state);
+
+               /* determine the next shift property */
+               p->skip.next_prop[1] = p->no_prop;
+               while (herodotus_read_codepoint(&(p->skip_reader), true, &cp) ==
+                      HERODOTUS_STATUS_SUCCESS) {
+                       prop = p->get_break_prop(cp);
+                       if (!p->is_skippable_prop(prop)) {
+                               p->skip.next_prop[1] = prop;
+                               break;
+                       }
+               }
+       }
+
+       return 0;
+}
+
 inline size_t
 get_codepoint(const void *str, size_t len, size_t offset, uint_least32_t *cp)
 {
diff --git a/src/util.h b/src/util.h
index 1b112a2..dd36ce3 100644
--- a/src/util.h
+++ b/src/util.h
@@ -74,6 +74,22 @@ typedef struct herodotus_writer {
        size_t first_unwritable_offset;
 } HERODOTUS_WRITER;
 
+struct proper {
+       /*
+        * prev_prop[1] prev_prop[0] | next_prop[0] next_prop[1]
+        */
+       struct {
+               uint_least8_t prev_prop[2];
+               uint_least8_t next_prop[2];
+       } raw, skip;
+       HERODOTUS_READER mid_reader, raw_reader, skip_reader;
+       void *state;
+       uint_least8_t no_prop;
+       uint_least8_t (*get_break_prop)(uint_least32_t);
+       bool (*is_skippable_prop)(uint_least8_t);
+       void (*skip_shift_callback)(uint_least8_t, void *);
+};
+
 void herodotus_reader_init(HERODOTUS_READER *, enum herodotus_type,
                            const void *, size_t);
 void herodotus_reader_copy(const HERODOTUS_READER *, HERODOTUS_READER *);
@@ -90,6 +106,13 @@ void herodotus_writer_nul_terminate(HERODOTUS_WRITER *);
 size_t herodotus_writer_number_written(const HERODOTUS_WRITER *);
 void herodotus_write_codepoint(HERODOTUS_WRITER *, uint_least32_t);
 
+void proper_init(const HERODOTUS_READER *, void *, uint_least8_t,
+                 uint_least8_t (*get_break_prop)(uint_least32_t),
+                 bool (*is_skippable_prop)(uint_least8_t),
+                 void (*skip_shift_callback)(uint_least8_t, void *),
+                 struct proper *);
+int proper_advance(struct proper *);
+
 size_t get_codepoint(const void *, size_t, size_t, uint_least32_t *);
 size_t get_codepoint_utf8(const void *, size_t, size_t, uint_least32_t *);
 

Reply via email to