commit 04bab2a4c09816c37c8e06aa38dfc7f2cab8c680
Author:     Laslo Hunhold <[email protected]>
AuthorDate: Thu May 28 12:57:37 2020 +0200
Commit:     Laslo Hunhold <[email protected]>
CommitDate: Thu May 28 12:57:37 2020 +0200

    Add automatic UTF-8-decoder-tests
    
    The 23 tests should cover all cases and provide safety against any
    possible regressions.
    
    Signed-off-by: Laslo Hunhold <[email protected]>

diff --git a/data/gbt.awk b/data/gbt.awk
index 41c635a..5fd7c0a 100644
--- a/data/gbt.awk
+++ b/data/gbt.awk
@@ -6,7 +6,7 @@ BEGIN {
 
        printf("struct test {\n\tCodepoint *cp;\n\tsize_t cplen;\n");
        printf("\tsize_t *len;\n\tsize_t lenlen;\n\tchar *descr;\n};\n\n");
-       printf("struct test t[] = {\n");
+       printf("static const struct test t[] = {\n");
 }
 
 $0 ~ /^#/ || $0 ~ /^\s*$/ { next }
diff --git a/src/test_body.c b/src/test_body.c
index 25dedd2..536de8f 100644
--- a/src/test_body.c
+++ b/src/test_body.c
@@ -3,15 +3,277 @@
 #include <stdio.h>
 
 #include "boundary.h"
+#include "codepoint.h"
 
 #define LEN(x) (sizeof(x) / sizeof(*x))
 
+/* valid, truncated, malformed, overlong, surrogate and >U+10FFFF inputs,
+ * each paired with the expected return value and the expected codepoint */
+
+static const struct { /* one UTF-8 decoder test case per entry */
+       uint8_t *arr;     /* input bytes (NULL for the empty case) */
+       size_t   len;     /* number of bytes in arr */
+       size_t   exp_len; /* return value expected from the decoder */
+       uint32_t exp_cp;  /* codepoint expected from the decoder */
+} dec_test[] = {
+       {
+               /* empty sequence
+                * [ ] ->
+                * INVALID
+                */
+               .arr     = NULL,
+               .len     = 0,
+               .exp_len = 1,
+               .exp_cp  = CP_INVALID,
+       },
+       {
+               /* invalid lead byte
+                * [ 11111101 ] ->
+                * INVALID
+                */
+               .arr     = (uint8_t[]){ 0xFD },
+               .len     = 1,
+               .exp_len = 1,
+               .exp_cp  = CP_INVALID,
+       },
+       {
+               /* valid 1-byte sequence
+                * [ 00000001 ] ->
+                * 0000001
+                */
+               .arr     = (uint8_t[]){ 0x01 },
+               .len     = 1,
+               .exp_len = 1,
+               .exp_cp  = 0x1,
+       },
+       {
+               /* valid 2-byte sequence
+                * [ 11000011 10111111 ] ->
+                * 00011111111
+                */
+               .arr     = (uint8_t[]){ 0xC3, 0xBF },
+               .len     = 2,
+               .exp_len = 2,
+               .exp_cp  = 0xff,
+       },
+       {
+               /* invalid 2-byte sequence (second byte missing)
+                * [ 11000011 ] ->
+                * INVALID
+                */
+               .arr     = (uint8_t[]){ 0xC3 },
+               .len     = 1,
+               .exp_len = 2,
+               .exp_cp  = CP_INVALID,
+       },
+       {
+               /* invalid 2-byte sequence (second byte malformed)
+                * [ 11000011 11111111 ] ->
+                * INVALID
+                */
+               .arr     = (uint8_t[]){ 0xC3, 0xFF },
+               .len     = 2,
+               .exp_len = 1,
+               .exp_cp  = CP_INVALID,
+       },
+       {
+               /* invalid 2-byte sequence (overlong encoded)
+                * [ 11000001 10111111 ] ->
+                * INVALID
+                */
+               .arr     = (uint8_t[]){ 0xC1, 0xBF },
+               .len     = 2,
+               .exp_len = 2,
+               .exp_cp  = CP_INVALID,
+       },
+       {
+               /* valid 3-byte sequence
+                * [ 11100000 10111111 10111111 ] ->
+                * 0000111111111111
+                */
+               .arr     = (uint8_t[]){ 0xE0, 0xBF, 0xBF },
+               .len     = 3,
+               .exp_len = 3,
+               .exp_cp  = 0xfff,
+       },
+       {
+               /* invalid 3-byte sequence (second byte missing)
+                * [ 11100000 ] ->
+                * INVALID
+                */
+               .arr     = (uint8_t[]){ 0xE0 },
+               .len     = 1,
+               .exp_len = 3,
+               .exp_cp  = CP_INVALID,
+       },
+       {
+               /* invalid 3-byte sequence (second byte malformed)
+                * [ 11100000 01111111 10111111 ] ->
+                * INVALID
+                */
+               .arr     = (uint8_t[]){ 0xE0, 0x7F, 0xBF },
+               .len     = 3,
+               .exp_len = 1,
+               .exp_cp  = CP_INVALID,
+       },
+       {
+               /* invalid 3-byte sequence (third byte missing)
+                * [ 11100000 10111111 ] ->
+                * INVALID
+                */
+               .arr     = (uint8_t[]){ 0xE0, 0xBF },
+               .len     = 2,
+               .exp_len = 3,
+               .exp_cp  = CP_INVALID,
+       },
+       {
+               /* invalid 3-byte sequence (third byte malformed)
+                * [ 11100000 10111111 01111111 ] ->
+                * INVALID
+                */
+               .arr     = (uint8_t[]){ 0xE0, 0xBF, 0x7F },
+               .len     = 3,
+               .exp_len = 2,
+               .exp_cp  = CP_INVALID,
+       },
+       {
+               /* invalid 3-byte sequence (overlong encoded)
+                * [ 11100000 10011111 10111111 ] ->
+                * INVALID
+                */
+               .arr     = (uint8_t[]){ 0xE0, 0x9F, 0xBF },
+               .len     = 3,
+               .exp_len = 3,
+               .exp_cp  = CP_INVALID,
+       },
+       {
+               /* invalid 3-byte sequence (UTF-16 surrogate half)
+                * [ 11101101 10100000 10000000 ] ->
+                * INVALID
+                */
+               .arr     = (uint8_t[]){ 0xED, 0xA0, 0x80 },
+               .len     = 3,
+               .exp_len = 3,
+               .exp_cp  = CP_INVALID,
+       },
+       {
+               /* valid 4-byte sequence
+                * [ 11110011 10111111 10111111 10111111 ] ->
+                * 011111111111111111111
+                */
+               .arr     = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF },
+               .len     = 4,
+               .exp_len = 4,
+               .exp_cp  = 0xfffff,
+       },
+       {
+               /* invalid 4-byte sequence (second byte missing)
+                * [ 11110011 ] ->
+                * INVALID
+                */
+               .arr     = (uint8_t[]){ 0xF3 },
+               .len     = 1,
+               .exp_len = 4,
+               .exp_cp  = CP_INVALID,
+       },
+       {
+               /* invalid 4-byte sequence (second byte malformed)
+                * [ 11110011 01111111 10111111 10111111 ] ->
+                * INVALID
+                */
+               .arr     = (uint8_t[]){ 0xF3, 0x7F, 0xBF, 0xBF },
+               .len     = 4,
+               .exp_len = 1,
+               .exp_cp  = CP_INVALID,
+       },
+       {
+               /* invalid 4-byte sequence (third byte missing)
+                * [ 11110011 10111111 ] ->
+                * INVALID
+                */
+               .arr     = (uint8_t[]){ 0xF3, 0xBF },
+               .len     = 2,
+               .exp_len = 4,
+               .exp_cp  = CP_INVALID,
+       },
+       {
+               /* invalid 4-byte sequence (third byte malformed)
+                * [ 11110011 10111111 01111111 10111111 ] ->
+                * INVALID
+                */
+               .arr     = (uint8_t[]){ 0xF3, 0xBF, 0x7F, 0xBF },
+               .len     = 4,
+               .exp_len = 2,
+               .exp_cp  = CP_INVALID,
+       },
+       {
+               /* invalid 4-byte sequence (fourth byte missing)
+                * [ 11110011 10111111 10111111 ] ->
+                * INVALID
+                */
+               .arr     = (uint8_t[]){ 0xF3, 0xBF, 0xBF },
+               .len     = 3,
+               .exp_len = 4,
+               .exp_cp  = CP_INVALID,
+       },
+       {
+               /* invalid 4-byte sequence (fourth byte malformed)
+                * [ 11110011 10111111 10111111 01111111 ] ->
+                * INVALID
+                */
+               .arr     = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0x7F },
+               .len     = 4,
+               .exp_len = 3,
+               .exp_cp  = CP_INVALID,
+       },
+       {
+               /* invalid 4-byte sequence (overlong encoded)
+                * [ 11110000 10000000 10000001 10111111 ] ->
+                * INVALID
+                */
+               .arr     = (uint8_t[]){ 0xF0, 0x80, 0x81, 0xBF },
+               .len     = 4,
+               .exp_len = 4,
+               .exp_cp  = CP_INVALID,
+       },
+       {
+               /* invalid 4-byte sequence (UTF-16-unrepresentable)
+                * [ 11110100 10010000 10000000 10000000 ] ->
+                * INVALID
+                */
+               .arr     = (uint8_t[]){ 0xF4, 0x90, 0x80, 0x80 },
+               .len     = 4,
+               .exp_len = 4,
+               .exp_cp  = CP_INVALID,
+       },
+};
+
 int main(void)
 {
        int state;
-       size_t i, j, k, len, failed = 0;
+       size_t i, j, k, len, failed;
+
+       /* UTF-8 decoder test */
+       for (i = 0, failed = 0; i < LEN(dec_test); i++) {
+               uint32_t cp; /* codepoint written by the decoder */
+
+               len = grapheme_cp_decode(&cp, dec_test[i].arr,
+                                        dec_test[i].len);
 
-       for (i = 0; i < LEN(t); i++) {
+               if (len != dec_test[i].exp_len ||
+                   cp != dec_test[i].exp_cp) {
+                       fprintf(stderr, "Failed UTF-8-decoder test %zu: "
+                               "Expected (%zx,%u), but got (%zx,%u)\n",
+                               i, dec_test[i].exp_len,
+                               dec_test[i].exp_cp, len, cp);
+                       failed++; /* count it so the summary is accurate */
+               }
+       }
+       printf("UTF-8 decoder test: Passed %zu out of %zu tests.\n",
+              LEN(dec_test) - failed, LEN(dec_test));
+
+       /* grapheme break test */
+       for (i = 0, failed = 0; i < LEN(t); i++) {
                for (j = 0, k = 0, state = 0, len = 1; j < t[i].cplen; j++) {
                        if ((j + 1) == t[i].cplen ||
                            boundary(t[i].cp[j], t[i].cp[j + 1], &state)) {
@@ -28,8 +290,8 @@ int main(void)
                        }
                }
        }
-
-       printf("Passed %zu out of %zu tests.\n", LEN(t) - failed, LEN(t));
+       printf("Grapheme break test: Passed %zu out of %zu tests.\n",
+              LEN(t) - failed, LEN(t));
 
        return (failed > 0) ? 1 : 0;
 }

Reply via email to