Pádraig Brady wrote:
I've pushed the c_iscntrl patch since it's the simplest
and probably the most appropriate patch for an existing release.

Yes, that makes sense for a quick patch. However, for the next release I think it'd be better to catch encoding errors and multibyte control characters, given the problems noted. I installed the attached further patch to try to do this. This fixes the problem that Bruno noted, along with two others; my earlier patch neglected the possibility that mbrtowc can return 0, and it incorrectly assumed wide control characters always have a single-byte representation.

Either way the original bug appears to be fixed, so I'm boldly closing the bug
report.
From 2cf5d730690dad600f8b6d74d0b5fde522804e43 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Sun, 22 Jul 2018 09:50:20 -0700
Subject: [PATCH] df: avoid multibyte character corruption on macOS

This improves on the earlier fix for the problem reported by
Chih-Hsuan Yen (Bug#32236), by also looking for other control
characters and for encoding errors.
* src/df.c: Include wchar.h and wctype.h instead of c-ctype.h.
(hide_problematic_chars): Process the string as multibyte.
Use iswcntrl, not c_iscntrl.
---
 src/df.c | 43 ++++++++++++++++++++++++++++++++-----------
 1 file changed, 32 insertions(+), 11 deletions(-)

diff --git a/src/df.c b/src/df.c
index c851fcc..d27ba02 100644
--- a/src/df.c
+++ b/src/df.c
@@ -23,7 +23,8 @@
 #include <sys/types.h>
 #include <getopt.h>
 #include <assert.h>
-#include <c-ctype.h>
+#include <wchar.h>
+#include <wctype.h>
 
 #include "system.h"
 #include "canonicalize.h"
@@ -272,21 +273,41 @@ static struct option const long_options[] =
   {NULL, 0, NULL, 0}
 };
 
-/* Replace problematic chars with '?'.
-   Since only control characters are currently considered,
-   this should work in all encodings.  */
+/* Replace problematic chars with '?'.  */
 
-static char*
+static void
 hide_problematic_chars (char *cell)
 {
-  char *p = cell;
-  while (*p)
+  char *srcend = cell + strlen (cell);
+  char *dst = cell;
+  mbstate_t mbstate = { 0, };
+  size_t n;
+
+  for (char *src = cell; src != srcend; src += n)
     {
-      if (c_iscntrl (to_uchar (*p)))
-        *p = '?';
-      p++;
+      wchar_t wc;
+      size_t srcbytes = srcend - src;
+      n = mbrtowc (&wc, src, srcbytes, &mbstate);
+      bool ok = 0 < n && n <= srcbytes;
+
+      if (ok)
+        ok = !iswcntrl (wc);
+      else
+        n = 1;
+
+      if (ok)
+        {
+          memmove (dst, src, n);
+          dst += n;
+        }
+      else
+        {
+          *dst++ = '?';
+          memset (&mbstate, 0, sizeof mbstate);
+        }
     }
-  return cell;
+
+  *dst = '\0';
 }
 
 /* Dynamically allocate a row of pointers in TABLE, which
-- 
2.7.4

Reply via email to