[hackers] [libgrapheme] Add UTF-8-encoder tests || Laslo Hunhold

git Sun, 31 May 2020 13:53:02 -0700

commit d2b53cb080b8c75b140bb1a3347b409c118e882d
Author:     Laslo Hunhold <[email protected]>
AuthorDate: Sun May 31 22:49:30 2020 +0200
Commit:     Laslo Hunhold <[email protected]>
CommitDate: Sun May 31 22:50:02 2020 +0200


    Add UTF-8-encoder tests
    
    This should cover all the edge cases and provide a regression test
    for the encoder.
    
    Signed-off-by: Laslo Hunhold <[email protected]>

diff --git a/src/test_body.c b/src/test_body.c
index 536de8f..2ec4546 100644
--- a/src/test_body.c
+++ b/src/test_body.c
@@ -1,14 +1,55 @@
 /* See LICENSE file for copyright and license details. */
 #include <stddef.h>
 #include <stdio.h>
+#include <string.h>
 
 #include "boundary.h"
 #include "codepoint.h"
 
 #define LEN(x) (sizeof(x) / sizeof(*x))
 
-/* all types valid/invalid, overencoded, surrogate, over 10FFFF w/e
- * expected return value and return cp */
+static const struct {
+       uint32_t cp;      /* code point handed to the encoder */
+       uint8_t *exp_arr; /* expected UTF-8 output bytes */
+       size_t   exp_len; /* expected number of output bytes */
+} enc_test[] = {
+       {
+               /* invalid (UTF-16 surrogate half), expects U+FFFD bytes */
+               .cp      = UINT32_C(0xD800),
+               .exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD },
+               .exp_len = 3,
+       },
+       {
+               /* invalid (above U+10FFFF), expects U+FFFD bytes */
+               .cp      = UINT32_C(0x110000),
+               .exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD },
+               .exp_len = 3,
+       },
+       {
+               /* U+0001: code point encoded to a 1-byte sequence */
+               .cp      = 0x01,
+               .exp_arr = (uint8_t[]){ 0x01 },
+               .exp_len = 1,
+       },
+       {
+               /* U+00FF: code point encoded to a 2-byte sequence */
+               .cp      = 0xFF,
+               .exp_arr = (uint8_t[]){ 0xC3, 0xBF },
+               .exp_len = 2,
+       },
+       {
+               /* U+0FFF: code point encoded to a 3-byte sequence */
+               .cp      = 0xFFF,
+               .exp_arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF },
+               .exp_len = 3,
+       },
+       {
+               /* U+FFFFF: code point encoded to a 4-byte sequence */
+               .cp      = UINT32_C(0xFFFFF),
+               .exp_arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF },
+               .exp_len = 4,
+       },
+};
 
 static const struct {
        uint8_t *arr;     /* byte array */
@@ -253,6 +294,38 @@ int main(void)
        int state;
        size_t i, j, k, len, failed;
 
+       /* UTF-8 encoder test */
+       for (i = 0, failed = 0; i < LEN(enc_test); i++) {
+               uint8_t arr[4];
+               size_t len;
+
+               len = grapheme_cp_encode(enc_test[i].cp, arr, LEN(arr));
+
+               if (len != enc_test[i].exp_len ||
+                   memcmp(arr, enc_test[i].exp_arr, len)) {
+                       fprintf(stderr, "Failed UTF-8-encoder test %zu: "
+                               "Expected (", i);
+                       for (j = 0; j < enc_test[i].exp_len; j++) {
+                               fprintf(stderr, "0x%x",
+                                       enc_test[i].exp_arr[j]);
+                               if (j != enc_test[i].exp_len - 1) {
+                                       fprintf(stderr, " ");
+                               }
+                       }
+                       fprintf(stderr, "), but got (");
+                       for (j = 0; j < len; j++) {
+                               fprintf(stderr, "0x%x", arr[j]);
+                               if (j != len - 1) {
+                                       fprintf(stderr, " ");
+                               }
+                       }
+                       fprintf(stderr, ")\n");
+                       failed++;
+               }
+       }
+       printf("UTF-8 encoder test: Passed %zu out of %zu tests.\n",
+              LEN(enc_test) - failed, LEN(enc_test));
+
        /* UTF-8 decoder test */
        for (i = 0, failed = 0; i < LEN(dec_test); i++) {
                size_t len;

[hackers] [libgrapheme] Add UTF-8-encoder tests || Laslo Hunhold

Reply via email to