Hi,

here is a clean patch implementing full UTF-8 support including
validation, sanitization, and columnation for ls(1).  This integrates
work done by tedu@, bentley@, and myself during u2k15 in Berlin.

After several iterations, I now consider it ripe for commit,
but would appreciate at least two OKs given that ls(1) is
not exactly some random niche utility.

Properties of the patch:

 * In the main ls(1) codebase, nothing changes except that
   putname(...) is replaced by mbsprint(..., 1) and
   strlen(...) is replaced by mbsprint(..., 0).
   The places that need to be changed were identified by
   tedu@ and bentley@.
 * Only one UTF-8 aware function is needed, isolated in a
   separate file.  It uses standard POSIX interfaces.
 * I am resetting mbtowc(3) internal state after failure,
   even though that has no effect on OpenBSD, if only as a
   reminder that there be dragons.  It costs almost nothing,
   neither in terms of clutter nor in terms of performance.
 * I have decoupled the patch from my simplified implementations
   of mbtowc(3) and wcwidth(3).  That would be a later step
   and shouldn't muddy the waters in the present context.

Yours,
  Ingo


Index: Makefile
===================================================================
RCS file: /cvs/src/bin/ls/Makefile,v
retrieving revision 1.7
diff -u -p -r1.7 Makefile
--- Makefile    6 Aug 2003 19:09:09 -0000       1.7
+++ Makefile    30 Nov 2015 15:46:43 -0000
@@ -1,7 +1,7 @@
 #      $OpenBSD: Makefile,v 1.7 2003/08/06 19:09:09 tedu Exp $
 
 PROG=  ls
-SRCS=  cmp.c ls.c main.c print.c util.c
+SRCS=  cmp.c ls.c main.c print.c util.c utf8.c
 DPADD= ${LIBUTIL}
 LDADD= -lutil
 
Index: extern.h
===================================================================
RCS file: /cvs/src/bin/ls/extern.h,v
retrieving revision 1.9
diff -u -p -r1.9 extern.h
--- extern.h    2 Jun 2003 23:32:08 -0000       1.9
+++ extern.h    30 Nov 2015 15:46:43 -0000
@@ -45,7 +45,7 @@ int    revstatcmp(const FTSENT *, const FT
 int     sizecmp(const FTSENT *, const FTSENT *);
 int     revsizecmp(const FTSENT *, const FTSENT *);
 
-int     putname(char *);
+int     mbsprint(const char *, int);
 void    printcol(DISPLAY *);
 void    printacol(DISPLAY *);
 void    printlong(DISPLAY *);
Index: ls.1
===================================================================
RCS file: /cvs/src/bin/ls/ls.1,v
retrieving revision 1.72
diff -u -p -r1.72 ls.1
--- ls.1        24 Apr 2015 10:57:36 -0000      1.72
+++ ls.1        30 Nov 2015 15:46:43 -0000
@@ -440,6 +440,12 @@ If this variable contains a string repre
 decimal integer, it is used as the
 column position width for displaying
 multiple-text-column output.
+.It Ev LC_CTYPE
+If set to a string ending in
+.Qq .UTF-8 ,
+.Nm
+respects character display widths when columnating output.
+Otherwise, non-ASCII bytes are replaced by question marks.
 .It Ev TZ
 The time zone to use when displaying dates.
 See
Index: ls.c
===================================================================
RCS file: /cvs/src/bin/ls/ls.c,v
retrieving revision 1.43
diff -u -p -r1.43 ls.c
--- ls.c        9 Oct 2015 01:37:06 -0000       1.43
+++ ls.c        30 Nov 2015 15:46:44 -0000
@@ -48,6 +48,7 @@
 #include <string.h>
 #include <unistd.h>
 #include <limits.h>
+#include <locale.h>
 #include <util.h>
 
 #include "ls.h"
@@ -103,6 +104,10 @@ ls_main(int argc, char *argv[])
        int kflag = 0, width = 0;
        char *p;
 
+#ifndef SMALL
+       setlocale(LC_CTYPE, "");
+#endif
+
        /* Terminal defaults to -Cq, non-terminal defaults to -1. */
        if (isatty(STDOUT_FILENO)) {
                if ((p = getenv("COLUMNS")) != NULL)
@@ -428,6 +433,7 @@ display(FTSENT *p, FTSENT *list)
        ino_t maxinode;
        int bcfile, flen, glen, ulen, maxflags, maxgroup, maxuser;
        int entries, needstats;
+       int width;
        char *user, *group, buf[21];    /* 64 bits == 20 digits */
        char nuser[12], ngroup[12];
        char *flags = NULL;
@@ -474,8 +480,8 @@ display(FTSENT *p, FTSENT *list)
                                continue;
                        }
                }
-               if (cur->fts_namelen > maxlen)
-                       maxlen = cur->fts_namelen;
+               if ((width = mbsprint(cur->fts_name, 0)) > maxlen)
+                       maxlen = width;
                if (needstats) {
                        sp = cur->fts_statp;
                        if (sp->st_blocks > maxblock)
Index: print.c
===================================================================
RCS file: /cvs/src/bin/ls/print.c,v
retrieving revision 1.34
diff -u -p -r1.34 print.c
--- print.c     15 Mar 2015 00:41:27 -0000      1.34
+++ print.c     30 Nov 2015 15:46:44 -0000
@@ -122,7 +122,7 @@ printlong(DISPLAY *dp)
                        printtime(sp->st_ctime);
                else
                        printtime(sp->st_mtime);
-               (void)putname(p->fts_name);
+               (void)mbsprint(p->fts_name, 1);
                if (f_type || (f_typedir && S_ISDIR(sp->st_mode)))
                        (void)printtype(sp->st_mode);
                if (S_ISLNK(sp->st_mode))
@@ -231,7 +231,7 @@ printaname(FTSENT *p, u_long inodefield,
        if (f_size)
                chcnt += printf("%*qd ",
                    (int)sizefield, howmany(sp->st_blocks, blocksize));
-       chcnt += putname(p->fts_name);
+       chcnt += mbsprint(p->fts_name, 1);
        if (f_type || (f_typedir && S_ISDIR(sp->st_mode)))
                chcnt += printtype(sp->st_mode);
        return (chcnt);
@@ -310,7 +310,8 @@ printstream(DISPLAY *dp)
                        continue;
                if (col > 0) {
                        (void)putchar(','), col++;
-                       if (col + 1 + extwidth + p->fts_namelen >= termwidth)
+                       if (col + 1 + extwidth + mbsprint(p->fts_name, 0) >=
+                           termwidth)
                                (void)putchar('\n'), col = 0;
                        else
                                (void)putchar(' '), col++;
@@ -361,7 +362,7 @@ printlink(FTSENT *p)
        }
        path[lnklen] = '\0';
        (void)printf(" -> ");
-       (void)putname(path);
+       (void)mbsprint(path, 1);
 }
 
 static void
Index: utf8.c
===================================================================
RCS file: utf8.c
diff -N utf8.c
--- /dev/null   1 Jan 1970 00:00:00 -0000
+++ utf8.c      30 Nov 2015 15:46:44 -0000
@@ -0,0 +1,51 @@
+/*     $OpenBSD$       */
+
+/*
+ * Copyright (c) 2015 Ingo Schwarze <[email protected]>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef SMALL
+#include <stdio.h>
+#include <stdlib.h>
+#include <wchar.h>
+
+int
+mbsprint(const char *mbs, int print)
+{
+       wchar_t   wc;
+       int       len;  /* length in bytes of one UTF-8 character */
+       int       width;  /* display width of a single Unicode char */
+       int       total_width;  /* display width of the whole string */
+
+       for (total_width = 0; *mbs != '\0'; mbs += len) {
+               if ((len = mbtowc(&wc, mbs, MB_CUR_MAX)) == -1) {
+                       (void)mbtowc(NULL, NULL, MB_CUR_MAX);
+                       if (print)
+                               putchar('?');
+                       total_width++;
+                       len = 1;
+               } else if ((width = wcwidth(wc)) == -1) {
+                       if (print)
+                               putchar('?');
+                       total_width++;
+               } else {
+                       if (print)
+                               fwrite(mbs, 1, len, stdout);
+                       total_width += width;
+               }
+       }
+       return total_width;
+}
+#endif
Index: util.c
===================================================================
RCS file: /cvs/src/bin/ls/util.c,v
retrieving revision 1.16
diff -u -p -r1.16 util.c
--- util.c      21 Nov 2013 15:54:45 -0000      1.16
+++ util.c      30 Nov 2015 15:46:44 -0000
@@ -45,15 +45,20 @@
 #include "ls.h"
 #include "extern.h"
 
+#ifdef SMALL
 int
-putname(char *name)
+mbsprint(const char *name, int print)
 {
        int len;
 
+       if (print == 0)
+               return strlen(name);
+
        for (len = 0; *name; len++, name++)
                putchar((!isprint((unsigned char)*name) && f_nonprint) ? '?' : *name);
        return len;
 }
+#endif
 
 void
 usage(void)

Reply via email to