commit c8715cbecccbdb61b2f46f7ad18e015ba8703637
parent 7230c626e0e193ec0b8ee45404e84e530c796e78
Author: Laslo Hunhold <dev@frign.de>
Date: Sun, 21 Aug 2022 13:47:19 +0200
Properly return offset when input is shorter than or equal to one codepoint
I sadly didn't catch this bug with automatic testing, given I didn't
cover the shorthand-check at the beginning of the breakpoint-detection
functions in any test-case. Additionally, it would be shadowed when
simply working with UCS-4-arrays.
On a higher level, this surfaced when checking the title-case of
one-character-strings, given it would first get the next word-break
(which would be underreported as 1 instead of something >1) only to
subtract the real offset of the grapheme-cluster-bytelen later on,
leading to an underflow and infinite loop.
Thanks to polarisFuton9719 for reporting this bug!
Signed-off-by: Laslo Hunhold <dev@frign.de>
Diffstat:
4 files changed, 21 insertions(+), 5 deletions(-)
diff --git a/src/case.c b/src/case.c
@@ -431,9 +431,13 @@ is_titlecase(const void *src, size_t srclen,
}
}
- /* we consumed a character */
+ /*
+ * we consumed a character (make sure to never
+ * underflow next_wb; this should not happen,
+ * but it's better to be sure)
+ */
srcoff += res;
- next_wb -= res;
+ next_wb -= (res <= next_wb) ? res : next_wb;
}
/* check if the rest of the codepoints in the word are lowercase */
diff --git a/src/line.c b/src/line.c
@@ -51,7 +51,11 @@ next_line_break(const void *str, size_t len, size_t (*get_codepoint)
*/
cp0_prop = NUM_LINE_BREAK_PROPS;
if ((off = get_codepoint(str, len, 0, &cp)) >= len) {
- return 1;
+ /*
+ * A line is at least one codepoint long, so we can
+ * safely return here
+ */
+ return len;
}
cp1_prop = get_break_prop(cp);
last_non_cm_or_zwj_prop = LINE_BREAK_PROP_AL; /* according to LB10 */
diff --git a/src/sentence.c b/src/sentence.c
@@ -66,7 +66,11 @@ next_sentence_break(const void *str, size_t len, size_t (*get_codepoint)
*/
raw.b = NUM_SENTENCE_BREAK_PROPS;
if ((off = get_codepoint(str, len, 0, &cp)) >= len) {
- return 1;
+ /*
+ * A sentence is at least one codepoint long, so we can
+ * safely return here
+ */
+ return len;
}
raw.c = get_break_prop(cp);
(void)get_codepoint(str, len, off, &cp);
diff --git a/src/word.c b/src/word.c
@@ -64,7 +64,11 @@ next_word_break(const void *str, size_t len, size_t (*get_codepoint)
*/
raw.b = NUM_WORD_BREAK_PROPS;
if ((off = get_codepoint(str, len, 0, &cp)) >= len) {
- return 1;
+ /*
+ * A word is at least one codepoint long, so we can
+ * safely return here
+ */
+ return len;
}
raw.c = get_break_prop(cp);
(void)get_codepoint(str, len, off, &cp);