* Jordi Beltran Creix <[email protected]> [100804 17:26]:
> > ls(1) does not seem to be 100% UTF-8 ready:
> >
> >
> > madro...@madthought:~% /bin/ls testb8-C4-D5-b/-\#
> > testo?=o?=-C4-D5-o?=o?=-#
> > madro...@madthought:~% /bin/ls testb8-C4-D5-b/-\# |cat
> > testb8-C4-D5-b/-#
> >
> ls(1) needs to use wcwidth(3) instead of just assuming a width of 1 for
> alignment, and if I remember correctly, it also mangles the strings using
> isprint(3) or hardcoded values instead of iswprint(3) when printing to
> a terminal, which is probably what you are seeing here.
I made a patch for ls based on NetBSD's code (see below); it works just fine for me.
> ed(1) is broken by the latter and ksh(1) for both reasons.
One can use mksh instead (or, better, backport its UTF-8 handling code).
> wcwidth(3) doesn't seem to have been added yet, though.
wcwidth() is in, but no man page yet.
--- ls.c 2010/08/07 15:15:04 1.1
+++ ls.c 2010/08/07 15:17:32
@@ -41,6 +41,7 @@
#include <err.h>
#include <errno.h>
#include <fts.h>
+#include <locale.h>
#include <grp.h>
#include <pwd.h>
#include <stdio.h>
@@ -102,6 +103,7 @@
int kflag = 0;
char *p;
+ setlocale(LC_CTYPE, "");
/* Terminal defaults to -Cq, non-terminal defaults to -1. */
if (isatty(STDOUT_FILENO)) {
if ((p = getenv("COLUMNS")) != NULL)
--- util.c 2010/08/07 15:00:48 1.1
+++ util.c 2010/08/07 15:13:52
@@ -41,18 +41,75 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <wchar.h>
#include "ls.h"
#include "extern.h"
+#define MB_LEN_MAX 32 /* goes into limits.h */
+
+static int
+printwc(wchar_t wc, mbstate_t * pst)
+{
+ size_t size;
+ char buf[MB_LEN_MAX];
+
+ size = wcrtomb(buf, wc, pst);
+ if (size == (size_t) -1) /* This shouldn't happen, but for
+ * sure */
+ return 0;
+ if (wc == L'\0') {
+ /* The following condition should always be true, but check to be sure */
+ if (size > 0 && buf[size - 1] == '\0')
+ --size;
+ }
+ if (size > 0)
+ fwrite(buf, 1, size, stdout);
+ return wc == L'\0' ? 0 : wcwidth(wc);
+}
+
int
-putname(char *name)
+putname(char *src)
{
- int len;
+ int n = 0;
+ mbstate_t src_state, stdout_state;
+ /* The following +1 is to pass '\0' at the end of src to mbrtowc(). */
+ const char *endptr = src + strlen(src) + 1;
- for (len = 0; *name; len++, name++)
- putchar((!isprint(*name) && f_nonprint) ? '?' : *name);
- return len;
+ /*
+ * We have to reset src_state each time in this function, because
+ * the codeset of the src pathname may not match the current locale.
+ * Note that if we pass NULL instead of src_state to mbrtowc(),
+ * there is no way to reset the state.
+ */
+ memset(&src_state, 0, sizeof(src_state));
+ memset(&stdout_state, 0, sizeof(stdout_state));
+ while (src < endptr) {
+ wchar_t wc;
+ size_t rv, span = endptr - src;
+ rv = mbrtowc(&wc, src, span, &src_state);
+ if (rv == 0) { /* assert(wc == L'\0'); */
+ /* The following may output a shift sequence. */
+ n += printwc(wc, &stdout_state);
+ break;
+ }
+ if (rv == (size_t) -1) { /* probably errno == EILSEQ */
+ n += printwc(L'?', &stdout_state);
+ /* try to skip 1 byte, because there is no better way */
+ src++;
+ memset(&src_state, 0, sizeof(src_state));
+ } else if (rv == (size_t) - 2) {
+ if (span < MB_CUR_MAX) { /* incomplete char */
+ n += printwc(L'?', &stdout_state);
+ break;
+ }
+ src += span; /* a redundant shift sequence? */
+ } else {
+ n += printwc(iswprint(wc) ? wc : L'?', &stdout_state);
+ src += rv;
+ }
+ }
+ return n;
}
void