From 6d6e15edf1e47758ba17e3e03a2646c949aa4c06 Mon Sep 17 00:00:00 2001
From: Ray Gardner <raygard@gmail.com>
Date: Sat, 24 Aug 2024 17:44:24 -0600
Subject: [PATCH 5/7] Fix field splitting and split() bugs

Split utf8 chars correctly in fields and split when FS==""; also force
newline as field sep with RS=="" only if FS is single character (per
gawk manual).
---
 toys/pending/awk.c   | 21 ++++++++++++++++++---

diff --git a/toys/pending/awk.c b/toys/pending/awk.c
index 7080691..24cfb6a 100644
--- a/toys/pending/awk.c
+++ b/toys/pending/awk.c
@@ -2744,14 +2744,27 @@ static int splitter(void (*setter)(struct zmap *, int, char *, size_t), struct z
 {
   regex_t *rx;
   regoff_t offs, end;
+  int multiline_null_rs = !ENSURE_STR(&STACK[RS])->vst->str[0];
   if (!IS_RX(zvfs)) to_str(zvfs);
-  char *fs = IS_STR(zvfs) ? zvfs->vst->str : "";
+  char *s0 = s, *fs = IS_STR(zvfs) ? zvfs->vst->str : "";
+  int one_char_fs = utf8cnt(zvfs->vst->str, zvfs->vst->size) == 1;
   int nf = 0, r = 0, eflag = 0;
   // Empty string or empty fs (regex).
   // Need to include !*s b/c empty string, otherwise
   // split("", a, "x") splits to a 1-element (empty element) array
   if (!*s || (IS_STR(zvfs) && !*fs) || IS_EMPTY_RX(zvfs)) {
-    for ( ; *s; s++) setter(m, ++nf, s, 1);
+    while (*s) {
+      if (*s < 128) setter(m, ++nf, s++, 1);
+      else {        // Handle UTF-8
+        char cbuf[8];
+        unsigned wc;
+        int nc = utf8towc(&wc, s, strlen(s));
+        if (nc < 2) FATAL("bad string for split: \"%s\"\n", s0);
+        s += nc;
+        nc = wctoutf8(cbuf, wc);
+        setter(m, ++nf, cbuf, nc);
+      }
+    }
     return nf;
   }
   if (IS_RX(zvfs)) rx = zvfs->rx;
@@ -2761,7 +2774,9 @@ static int splitter(void (*setter)(struct zmap *, int, char *, size_t), struct z
     // rx_find_FS() returns 0 if found. If nonzero, the field will
     // be the rest of the record (all of it if first time through).
     if ((r = rx_find_FS(rx, s, &offs, &end, eflag))) offs = end = strlen(s);
-    else {
+    else if (setter == set_field && multiline_null_rs && one_char_fs) {
+      // Contra POSIX, if RS=="" then newline is always also a
+      // field separator only if FS is a single char (see gawk manual)
       int k = strcspn(s, "\n");
       if (k < offs) offs = k, end = k + 1;
     }
-- 
2.43.0

