commit c8715cbecccbdb61b2f46f7ad18e015ba8703637
Author:     Laslo Hunhold <[email protected]>
AuthorDate: Sun Aug 21 13:47:19 2022 +0200
Commit:     Laslo Hunhold <[email protected]>
CommitDate: Sun Aug 21 13:51:22 2022 +0200

    Properly return offset when input is shorter or equal to one codepoint
    
    I sadly didn't catch this bug with automatic testing, given I didn't
    cover the shorthand-check at the beginning of the breakpoint-detection
    functions in any test-case. Additionally, it would be shadowed when
    simply working with UCS-4-arrays.
    
    On a higher level, this surfaced when checking the title-case of
    one-character-strings, given it would first get the next word-break
    (which would be underreported as 1 instead of something >1) only to
    subtract the real offset of the grapheme-cluster-bytelen later on,
    leading to an underflow and infinite loop.
    
    Thanks to polarisFuton9719 for reporting this bug!
    
    Signed-off-by: Laslo Hunhold <[email protected]>

diff --git a/src/case.c b/src/case.c
index 21ec5af..1be387d 100644
--- a/src/case.c
+++ b/src/case.c
@@ -431,9 +431,13 @@ is_titlecase(const void *src, size_t srclen,
                                }
                        }
 
-                       /* we consumed a character */
+                       /*
+                        * we consumed a character (make sure to never
+                        * underflow next_wb; this should not happen,
+                        * but it's better to be sure)
+                        */
                        srcoff += res;
-                       next_wb -= res;
+                       next_wb -= (res <= next_wb) ? res : next_wb;
                }
 
                /* check if the rest of the codepoints in the word are lowercase */
diff --git a/src/line.c b/src/line.c
index bced2f2..051e152 100644
--- a/src/line.c
+++ b/src/line.c
@@ -51,7 +51,11 @@ next_line_break(const void *str, size_t len, size_t (*get_codepoint)
         */
        cp0_prop = NUM_LINE_BREAK_PROPS;
        if ((off = get_codepoint(str, len, 0, &cp)) >= len) {
-               return 1;
+               /*
+                * A line is at least one codepoint long, so we can
+                * safely return here
+                */
+               return len;
        }
        cp1_prop = get_break_prop(cp);
        last_non_cm_or_zwj_prop = LINE_BREAK_PROP_AL; /* according to LB10 */
diff --git a/src/sentence.c b/src/sentence.c
index a5850ec..d464edc 100644
--- a/src/sentence.c
+++ b/src/sentence.c
@@ -66,7 +66,11 @@ next_sentence_break(const void *str, size_t len, size_t (*get_codepoint)
         */
        raw.b = NUM_SENTENCE_BREAK_PROPS;
        if ((off = get_codepoint(str, len, 0, &cp)) >= len) {
-               return 1;
+               /*
+                * A sentence is at least one codepoint long, so we can
+                * safely return here
+                */
+               return len;
        }
        raw.c = get_break_prop(cp);
        (void)get_codepoint(str, len, off, &cp);
diff --git a/src/word.c b/src/word.c
index e8e9b44..dffb5b5 100644
--- a/src/word.c
+++ b/src/word.c
@@ -64,7 +64,11 @@ next_word_break(const void *str, size_t len, size_t (*get_codepoint)
         */
        raw.b = NUM_WORD_BREAK_PROPS;
        if ((off = get_codepoint(str, len, 0, &cp)) >= len) {
-               return 1;
+               /*
+                * A line is at least one codepoint long, so we can
+                * safely return here
+                */
+               return len;
        }
        raw.c = get_break_prop(cp);
        (void)get_codepoint(str, len, off, &cp);

Reply via email to