Re: UTF-8

Alexander Polakov Sat, 07 Aug 2010 07:58:31 -0700

* Jordi Beltran Creix <[email protected]> [100804 17:26]:
> > ls(1) does not seem to be 100% UTF-8 ready:
> >
> >
> > madro...@madthought:~% /bin/ls testb8-C4-D5-b/-\#
> > testo?=o?=-C4-D5-o?=o?=-#
> > madro...@madthought:~% /bin/ls testb8-C4-D5-b/-\# |cat
> > testb8-C4-D5-b/-#
> >
> ls(1) needs to use wcwidth(3) instead of just assuming 1 for alignment
> and if I remember correctly it also mangles the strings using
> isprint(3) or hardcoded values instead of iswprint(3) when printing to
> terminal which is probably what you are seeing here.


I made a patch for ls based on NetBSD's code (see below); it works just fine for me.

> ed(1) is broken by the latter and ksh(1) for both reasons.

One can use mksh instead (or, better, backport its UTF-8 handling code).

> wcwidth(3) doesn't seem to have been added yet, though.

wcwidth() is in, but there is no man page yet.


--- ls.c        2010/08/07 15:15:04     1.1
+++ ls.c        2010/08/07 15:17:32
@@ -41,6 +41,7 @@
 #include <err.h>
 #include <errno.h>
 #include <fts.h>
+#include <locale.h>
 #include <grp.h>
 #include <pwd.h>
 #include <stdio.h>
@@ -102,6 +103,7 @@
        int kflag = 0;
        char *p;

+       setlocale(LC_CTYPE, "");
        /* Terminal defaults to -Cq, non-terminal defaults to -1. */
        if (isatty(STDOUT_FILENO)) {
                if ((p = getenv("COLUMNS")) != NULL)
--- util.c      2010/08/07 15:00:48     1.1
+++ util.c      2010/08/07 15:13:52
@@ -41,18 +41,75 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <wchar.h>

 #include "ls.h"
 #include "extern.h"

+#define MB_LEN_MAX 32 /* goes into limits.h */
+
+static int
+printwc(wchar_t wc, mbstate_t * pst)
+{
+       size_t          size;
+       char            buf[MB_LEN_MAX];
+
+       size = wcrtomb(buf, wc, pst);
+       if (size == (size_t) -1)       /* This shouldn't happen, but for
+                                        * sure */
+               return 0;
+       if (wc == L'\0') {
+               /* The following condition must be always true, but for sure */
+               if (size > 0 && buf[size - 1] == '\0')
+                       --size;
+       }
+       if (size > 0)
+               fwrite(buf, 1, size, stdout);
+       return wc == L'\0' ? 0 : wcwidth(wc);
+}
+
 int
-putname(char *name)
+putname(char *src)
 {
-       int len;
+       int             n = 0;
+       mbstate_t       src_state, stdout_state;
+       /* The following +1 is to pass '\0' at the end of src to mbrtowc(). */
+       const char     *endptr = src + strlen(src) + 1;

-       for (len = 0; *name; len++, name++)
-               putchar((!isprint(*name) && f_nonprint) ? '?' : *name);
-       return len;
+       /*
+       * We have to reset src_state each time in this function, because
+       * the codeset of src pathname may not match with current locale.
+       * Note that if we pass NULL instead of src_state to mbrtowc(),
+       * there is no way to reset the state.
+       */
+       memset(&src_state, 0, sizeof(src_state));
+       memset(&stdout_state, 0, sizeof(stdout_state));
+       while (src < endptr) {
+               wchar_t         wc;
+               size_t          rv, span = endptr - src;
+               rv = mbrtowc(&wc, src, span, &src_state);
+               if (rv == 0) {  /* assert(wc == L'\0'); */
+                       /* The following may output a shift sequence. */
+                       n += printwc(wc, &stdout_state);
+                       break;
+               }
+               if (rv == (size_t) -1) {       /* probably errno == EILSEQ */
+                       n += printwc(L'?', &stdout_state);
+                       /* try to skip 1byte, because there is no better way */
+                       src++;
+                       memset(&src_state, 0, sizeof(src_state));
+               } else if (rv == (size_t) - 2) {
+                       if (span < MB_CUR_MAX) {        /* incomplete char */
+                               n += printwc(L'?', &stdout_state);
+                               break;
+                       }
+                       src += span;    /* a redundant shift sequence? */
+               } else {
+                       n += printwc(iswprint(wc) ? wc : L'?', &stdout_state);
+                       src += rv;
+               }
+       }
+       return n;
 }

 void

Re: UTF-8

Reply via email to