Hi,
here is a clean patch implementing full UTF-8 support including
validation, sanitization, and columnation for ls(1). This integrates
work done by tedu@, bentley@, and myself during u2k15 in Berlin.
After several iterations, I now consider it ripe for commit,
but would appreciate at least two OKs given that ls(1) is
not exactly some random niche utility.
Properties of the patch:
* In the main ls(1) codebase, nothing changes except that
putname(...) is replaced by mbsprint(..., 1) and
strlen(...) is replaced by mbsprint(..., 0).
The places that need to be changed were identified by
tedu@ and bentley@.
* Only one UTF-8 aware function is needed, isolated in a
separate file. It uses standard POSIX interfaces.
* I am resetting mbtowc(3) internal state after failure,
even though that has no effect on OpenBSD, if only as a
reminder that there be dragons. It costs almost nothing,
neither in terms of clutter nor in terms of performance.
* I have decoupled the patch from my simplified implementations
of mbtowc(3) and wcwidth(3). That would be a later step
and shouldn't muddy the waters in the present context.
Yours,
Ingo
Index: Makefile
===================================================================
RCS file: /cvs/src/bin/ls/Makefile,v
retrieving revision 1.7
diff -u -p -r1.7 Makefile
--- Makefile 6 Aug 2003 19:09:09 -0000 1.7
+++ Makefile 30 Nov 2015 15:46:43 -0000
@@ -1,7 +1,7 @@
# $OpenBSD: Makefile,v 1.7 2003/08/06 19:09:09 tedu Exp $
PROG= ls
-SRCS= cmp.c ls.c main.c print.c util.c
+SRCS= cmp.c ls.c main.c print.c util.c utf8.c
DPADD= ${LIBUTIL}
LDADD= -lutil
Index: extern.h
===================================================================
RCS file: /cvs/src/bin/ls/extern.h,v
retrieving revision 1.9
diff -u -p -r1.9 extern.h
--- extern.h 2 Jun 2003 23:32:08 -0000 1.9
+++ extern.h 30 Nov 2015 15:46:43 -0000
@@ -45,7 +45,7 @@ int revstatcmp(const FTSENT *, const FT
int sizecmp(const FTSENT *, const FTSENT *);
int revsizecmp(const FTSENT *, const FTSENT *);
-int putname(char *);
+int mbsprint(const char *, int);
void printcol(DISPLAY *);
void printacol(DISPLAY *);
void printlong(DISPLAY *);
Index: ls.1
===================================================================
RCS file: /cvs/src/bin/ls/ls.1,v
retrieving revision 1.72
diff -u -p -r1.72 ls.1
--- ls.1 24 Apr 2015 10:57:36 -0000 1.72
+++ ls.1 30 Nov 2015 15:46:43 -0000
@@ -440,6 +440,12 @@ If this variable contains a string repre
decimal integer, it is used as the
column position width for displaying
multiple-text-column output.
+.It Ev LC_CTYPE
+If set to a string ending in
+.Qq .UTF-8 ,
+.Nm
+respects character display widths when columnating output.
+Otherwise, non-ASCII bytes are replaced by question marks.
.It Ev TZ
The time zone to use when displaying dates.
See
Index: ls.c
===================================================================
RCS file: /cvs/src/bin/ls/ls.c,v
retrieving revision 1.43
diff -u -p -r1.43 ls.c
--- ls.c 9 Oct 2015 01:37:06 -0000 1.43
+++ ls.c 30 Nov 2015 15:46:44 -0000
@@ -48,6 +48,7 @@
#include <string.h>
#include <unistd.h>
#include <limits.h>
+#include <locale.h>
#include <util.h>
#include "ls.h"
@@ -103,6 +104,10 @@ ls_main(int argc, char *argv[])
int kflag = 0, width = 0;
char *p;
+#ifndef SMALL
+ setlocale(LC_CTYPE, "");
+#endif
+
/* Terminal defaults to -Cq, non-terminal defaults to -1. */
if (isatty(STDOUT_FILENO)) {
if ((p = getenv("COLUMNS")) != NULL)
@@ -428,6 +433,7 @@ display(FTSENT *p, FTSENT *list)
ino_t maxinode;
int bcfile, flen, glen, ulen, maxflags, maxgroup, maxuser;
int entries, needstats;
+ int width;
char *user, *group, buf[21]; /* 64 bits == 20 digits */
char nuser[12], ngroup[12];
char *flags = NULL;
@@ -474,8 +480,8 @@ display(FTSENT *p, FTSENT *list)
continue;
}
}
- if (cur->fts_namelen > maxlen)
- maxlen = cur->fts_namelen;
+ if ((width = mbsprint(cur->fts_name, 0)) > maxlen)
+ maxlen = width;
if (needstats) {
sp = cur->fts_statp;
if (sp->st_blocks > maxblock)
Index: print.c
===================================================================
RCS file: /cvs/src/bin/ls/print.c,v
retrieving revision 1.34
diff -u -p -r1.34 print.c
--- print.c 15 Mar 2015 00:41:27 -0000 1.34
+++ print.c 30 Nov 2015 15:46:44 -0000
@@ -122,7 +122,7 @@ printlong(DISPLAY *dp)
printtime(sp->st_ctime);
else
printtime(sp->st_mtime);
- (void)putname(p->fts_name);
+ (void)mbsprint(p->fts_name, 1);
if (f_type || (f_typedir && S_ISDIR(sp->st_mode)))
(void)printtype(sp->st_mode);
if (S_ISLNK(sp->st_mode))
@@ -231,7 +231,7 @@ printaname(FTSENT *p, u_long inodefield,
if (f_size)
chcnt += printf("%*qd ",
(int)sizefield, howmany(sp->st_blocks, blocksize));
- chcnt += putname(p->fts_name);
+ chcnt += mbsprint(p->fts_name, 1);
if (f_type || (f_typedir && S_ISDIR(sp->st_mode)))
chcnt += printtype(sp->st_mode);
return (chcnt);
@@ -310,7 +310,8 @@ printstream(DISPLAY *dp)
continue;
if (col > 0) {
(void)putchar(','), col++;
- if (col + 1 + extwidth + p->fts_namelen >= termwidth)
+ if (col + 1 + extwidth + mbsprint(p->fts_name, 0) >=
+ termwidth)
(void)putchar('\n'), col = 0;
else
(void)putchar(' '), col++;
@@ -361,7 +362,7 @@ printlink(FTSENT *p)
}
path[lnklen] = '\0';
(void)printf(" -> ");
- (void)putname(path);
+ (void)mbsprint(path, 1);
}
static void
Index: utf8.c
===================================================================
RCS file: utf8.c
diff -N utf8.c
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ utf8.c 30 Nov 2015 15:46:44 -0000
@@ -0,0 +1,51 @@
+/* $OpenBSD$ */
+
+/*
+ * Copyright (c) 2015 Ingo Schwarze <[email protected]>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef SMALL
+#include <stdio.h>
+#include <stdlib.h>
+#include <wchar.h>
+
+int
+mbsprint(const char *mbs, int print) /* print != 0: also write to stdout */
+{
+ wchar_t wc;
+ int len; /* length in bytes of the current multibyte character */
+ int width; /* display width of a single Unicode char */
+ int total_width; /* display width of the whole string; the return value */
+
+ for (total_width = 0; *mbs != '\0'; mbs += len) {
+ if ((len = mbtowc(&wc, mbs, MB_CUR_MAX)) == -1) {
+ (void)mbtowc(NULL, NULL, MB_CUR_MAX); /* reset state after failure */
+ if (print)
+ putchar('?');
+ total_width++;
+ len = 1; /* step over one byte of the invalid sequence */
+ } else if ((width = wcwidth(wc)) == -1) { /* decoded, but no width */
+ if (print)
+ putchar('?');
+ total_width++;
+ } else {
+ if (print)
+ fwrite(mbs, 1, len, stdout); /* copy the original bytes */
+ total_width += width;
+ }
+ }
+ return total_width;
+}
+#endif
Index: util.c
===================================================================
RCS file: /cvs/src/bin/ls/util.c,v
retrieving revision 1.16
diff -u -p -r1.16 util.c
--- util.c 21 Nov 2013 15:54:45 -0000 1.16
+++ util.c 30 Nov 2015 15:46:44 -0000
@@ -45,15 +45,20 @@
#include "ls.h"
#include "extern.h"
+#ifdef SMALL
int
-putname(char *name)
+mbsprint(const char *name, int print)
{
int len;
+ if (print == 0) /* width query only: report the byte count */
+ return strlen(name);
+
for (len = 0; *name; len++, name++)
putchar((!isprint((unsigned char)*name) && f_nonprint) ? '?' :
*name);
return len;
}
+#endif
void
usage(void)