Chih-Hsuan Yen wrote:
How about following the idea from Pádraig Brady
and filter \n only?

Given the later comments it seems better to filter out encoding errors and control characters. Programs that parse the output already cannot trust the strings to be exactly right, since newlines are gonna get replaced no matter what. So there seems little benefit to copying the other garbage faithfully.

Revised proposed patch(es) attached.
>From e4f2a5f97771c4a74e0bdef1c1e4a0d2735cef15 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Sun, 22 Jul 2018 09:50:20 -0700
Subject: [PATCH 1/2] df: avoid multibyte character corruption on macOS

Problem reported by Chih-Hsuan Yen (Bug#32236).
* NEWS: Mention the bug fix.
* src/df.c: Include wchar.h and wctype.h.
(hide_problematic_chars): Respect multibyte encodings
when replacing problematic characters or bytes with '?'.
---
 NEWS     |  4 ++++
 src/df.c | 35 +++++++++++++++++++++++++----------
 2 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/NEWS b/NEWS
index af1a990..aa3b4f9 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,10 @@ GNU coreutils NEWS                                    -*- outline -*-
 
 * Noteworthy changes in release ?.? (????-??-??) [?]
 
+** Bug fixes
+
+  df no longer corrupts displayed multibyte characters on macOS.
+
 
 * Noteworthy changes in release 8.30 (2018-07-01) [stable]
 
diff --git a/src/df.c b/src/df.c
index 1178865..52be414b 100644
--- a/src/df.c
+++ b/src/df.c
@@ -23,6 +23,8 @@
 #include <sys/types.h>
 #include <getopt.h>
 #include <assert.h>
+#include <wchar.h>
+#include <wctype.h>
 
 #include "system.h"
 #include "canonicalize.h"
@@ -271,21 +273,34 @@ static struct option const long_options[] =
   {NULL, 0, NULL, 0}
 };
 
-/* Replace problematic chars with '?'.
-   Since only control characters are currently considered,
-   this should work in all encodings.  */
+/* Replace problematic chars with '?'.  */
 
-static char*
+static void
 hide_problematic_chars (char *cell)
 {
-  char *p = cell;
-  while (*p)
+  char *srcend = cell + strlen (cell);
+  char *dst = cell;
+  mbstate_t mbstate = { 0, };
+  size_t n;
+
+  for (char *src = cell; src != srcend; src += n)
     {
-      if (iscntrl (to_uchar (*p)))
-        *p = '?';
-      p++;
+      wchar_t wc;
+      size_t srcbytes = srcend - src;
+      n = mbrtowc (&wc, src, srcbytes, &mbstate);
+      if (n <= srcbytes && !iswcntrl (wc))
+        {
+          memcpy (dst, src, n);
+          dst += n;
+        }
+      else
+        {
+          *dst++ = '?';
+          memset (&mbstate, 0, sizeof mbstate);
+        }
     }
-  return cell;
+
+  *dst = '\0';
 }
 
 /* Dynamically allocate a row of pointers in TABLE, which
-- 
2.7.4

>From e40ae8fb26fd0c8c0cb7e42598a2b8d9bd8551bb Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Thu, 26 Jul 2018 01:56:28 -0700
Subject: [PATCH 2/2] df: tune slightly

* src/df.c (get_header, get_dev):
Avoid calling mbswidth twice when once will do.
---
 src/df.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/df.c b/src/df.c
index 52be414b..f5bd26d 100644
--- a/src/df.c
+++ b/src/df.c
@@ -588,7 +588,8 @@ get_header (void)
 
       table[nrows - 1][col] = cell;
 
-      columns[col]->width = MAX (columns[col]->width, mbswidth (cell, 0));
+      size_t cell_width = mbswidth (cell, 0);
+      columns[col]->width = MAX (columns[col]->width, cell_width);
     }
 }
 
@@ -1198,7 +1199,8 @@ get_dev (char const *disk, char const *mount_point, char const* file,
         assert (!"empty cell");
 
       hide_problematic_chars (cell);
-      columns[col]->width = MAX (columns[col]->width, mbswidth (cell, 0));
+      size_t cell_width = mbswidth (cell, 0);
+      columns[col]->width = MAX (columns[col]->width, cell_width);
       table[nrows - 1][col] = cell;
     }
   free (dev_name);
-- 
2.7.4

Reply via email to