Re: ksh(1): utf8 in emacs editing mode

Ingo Schwarze Mon, 07 Dec 2015 16:21:07 -0800

Hi,

i'd like to propose a simplified version of this patch Frederic Nowak
posted a few weeks ago for commit.  Our experience is probably
not yet sufficient to develop a full-blown solution for all UTF-8
problems in ksh(1), but this is very non-intrusive and makes the
following commands better without breaking anything:


 * Ctrl-b, Ctrl-xd, Esc-[D, Esc-OD (backward-char)
 * Ctrl-f, Ctrl-xc, Esc-[C, Esc-OC (forward-char)
 * Ctrl-h, Ctrl-? (delete-char-backward)
 * Esc-[3~ (delete-char-forward)
 * Ctrl-k (kill-to-eol)

This patch deliberately does not attempt any character counting
or display column counting.  It's purely about not chopping UTF-8
characters into pieces, and it is not comprehensive.  It only
improves some of the most common cases.

Even though i'm not planning to do more work in ksh(1) myself just
yet, i would not like to see effort wasted that other people are
spending.

See my version of the patch at the end.  Before that, i'm commenting
on some parts of Frederic's patch.  Note that i'm cutting away the
parts i agree with.

Opinions?  Concerns?  OKs?
  Ingo


Frederic Nowak wrote on Tue, Nov 10, 2015 at 01:36:02PM +0100:

> I got annoyed with ksh(1) for messing up my command line after
> accidentally entering an umlaut and decided to take a stab at teaching
> it some utf8. The diff is inspired by Ted Unangst's recent patches for
> e.g. rs[0].
> 
> It works for my use cases and seems to handle 2-byte and 3-byte
> sequences quite well; I hope it does so for longer ones, too. Maybe
> it's of use for someone else as well.
> 
> Cheers,
> Frederic
> 
> [0] https://marc.info/?l=openbsd-tech&m=144560099607564
> 
> 
> Index: emacs.c
> ===================================================================
> RCS file: /cvs/src/bin/ksh/emacs.c,v
> retrieving revision 1.60
> diff -u -p -r1.60 emacs.c
> --- emacs.c   19 Oct 2015 14:42:16 -0000      1.60
> +++ emacs.c   10 Nov 2015 12:31:27 -0000
> @@ -49,7 +49,7 @@ struct      x_ftab {
>  #define      is_cfs(c)       (c == ' ' || c == '\t' || c == '"' || c == '\'')
> 
>  /* Separator for motion */
> -#define      is_mfs(c)       (!(isalnum((unsigned char)c) || c == '_' || c 
> == '$'))
> +#define      is_mfs(c)       (!(isu8lead((unsigned char) c) || 
> isu8cont((unsigned char) c) || isalnum((unsigned char)c) || c == '_' || c == 
> '$'))

In the first step, i'd like to focus on the simplest commands,
so i'm not including word handling for now.
If we get the first step committed, please consider rebasing
the rest of your work and resubmitting a clean patch.

>  /* Arguments for do_complete()
>   * 0 = enumerate  M-= complete as much as possible and then list
> @@ -198,6 +198,10 @@ static int       x_comment(int);
>  static int   x_debug_info(int);
>  #endif
> 
> +/* utf8 support */
> +static int isu8cont(unsigned char);
> +static int isu8lead(unsigned char);
> +

Sometimes, it may be needed to detect lead bytes, but in patches
of this style, i'd like to avoid that if possible, because typically,
that keeps the code simpler.  In the case at hand, it's possible
without isu8lead().

[...]
> @@ -468,6 +491,8 @@ x_del_back(int c)
>       }
>       if (x_arg > col)
>               x_arg = col;
> +     while(x_arg <= col && isu8cont(*(xcp - x_arg)))
> +             x_arg++;
>       x_goto(xcp - x_arg);
>       x_delete(x_arg, false);
>       return KSTD;

This seems off by one to me.
If x_arg == col, we must not increment it further, right?
Otherwise, we will access xcp[-1].

Of course, that's a pathological case, because it only happens
when the command line starts with a continuation byte, but still...

> @@ -621,7 +646,7 @@ x_fword(void)
>  static void
>  x_goto(char *cp)
>  {
> -     if (cp < xbp || cp >= (xbp + x_displen)) {
> +     if (cp < xbp || cp >= xlp) {
>               /* we are heading off screen */
>               xcp = cp;
>               x_adjust();

I don't understand why you propose to change this,
nor how it's related to UTF-8.

[...]
> @@ -669,7 +696,8 @@ x_zots(char *str)
>       int     adj = x_adj_done;
> 
>       x_lastcp();
> -     while (*str && str < xlp && adj == x_adj_done)
> +     while (*str && (isu8cont(*str) || str < xlp)
> +            && adj == x_adj_done)
>               x_zotc(*str++);
>  }

Isn't that change redundant?
Given that x_size() returns 0 for continuation bytes,
x_lastcp() can never leave xlp on a continuation byte,
or can it?

> @@ -697,6 +725,8 @@ x_mv_back(int c)
>       }
>       if (x_arg > col)
>               x_arg = col;
> +     while(x_arg <= col && isu8cont(*(xcp - x_arg)))
> +             x_arg++;
>       x_goto(xcp - x_arg);
>       return KSTD;
>  }

That looks like off by one to me, too.

> @@ -710,6 +740,7 @@ x_mv_forw(int c)
>               x_e_putc(BEL);
>               return KSTD;
>       }
> +     x_arg += isu8lead(*xcp);
>       if (x_arg > nleft)
>               x_arg = nleft;
>       x_goto(xcp + x_arg);

I suggest to look at the next byte and use isu8cont(),
see the patch below.

> @@ -1025,7 +1056,7 @@ x_redraw(int limit)
>               if (xep > xlp)
>                       i = 0;                  /* we fill the line */
>               else
> -                     i = limit - (xlp - xbp);
> +                     i = limit - x_col;
> 
>               for (j = 0; j < i && x_col < (xx_cols - 2); j++)
>                       x_e_putc(' ');

Again, i don't understand how this change is related to UTF-8.

> @@ -1821,11 +1852,18 @@ do_complete(int flags,        /* XCF_{COMMAND,F
>  static void
>  x_adjust(void)
>  {
> +     int i;
>       x_adj_done++;                   /* flag the fact that we were called. */
>       /*
>        * we had a problem if the prompt length > xx_cols / 2
>        */
> -     if ((xbp = xcp - (x_displen / 2)) < xbuf)
> +     xbp = xcp;
> +     for(i = 0; i < (x_displen/2);) {
> +             xbp--;
> +             if(!isu8cont(*xbp))
> +                     i++;
> +     }
> +     if (xbp < xbuf)
>               xbp = xbuf;
>       xlp_valid = false;
>       x_redraw(xx_cols);

In the first step, i'd like to not touch line adjustment,
but just cursor movement and character deletion.
Please resubmit if we get the first step in.

> @@ -1863,6 +1901,12 @@ x_e_getc(void)
>  static void
>  x_e_putc(int c)
>  {
> +     static int u8wait = 0;
> +     if(isu8lead(c)) {
> +             u8wait = isu8lead(c);
> +     } else if(isu8cont(c)) {
> +             u8wait--;
> +     }
>       if (c == '\r' || c == '\n')
>               x_col = 0;
>       if (x_col < xx_cols) {
> @@ -1874,10 +1918,12 @@ x_e_putc(int c)
>               case '\n':
>                       break;
>               case '\b':
> -                     x_col--;
> +                     if(!isu8cont(c))
> +                             x_col--;
>                       break;

That change seems redundant.
We already know c == '\b', so we already know isu8cont(c)
is false.

>               default:
> -                     x_col++;
> +                     if(!u8wait)
> +                             x_col++;
>                       break;
>               }
>       }

Right now, x_col counts bytes, not characters, and even less
display positions.  I don't think we should change that in a
hurry.  To get basic cursor movement and character deletion
to work, i don't think it is needed.

What exactly is broken for you without this change?

So, here is my version of the patch, proposed for commit:


Index: emacs.c
===================================================================
RCS file: /cvs/src/bin/ksh/emacs.c,v
retrieving revision 1.60
diff -u -p -r1.60 emacs.c
--- emacs.c     19 Oct 2015 14:42:16 -0000      1.60
+++ emacs.c     7 Dec 2015 23:52:27 -0000
@@ -82,7 +82,7 @@ static char    *xend;         /* end input buff
 static char    *xcp;           /* current position */
 static char    *xep;           /* current end */
 static char    *xbp;           /* start of visible portion of input buffer */
-static char    *xlp;           /* last char visible on screen */
+static char    *xlp;           /* last byte visible on screen */
 static int     x_adj_ok;
 /*
  * we use x_adj_done so that functions can tell
@@ -138,6 +138,7 @@ static int  x_comment(int);
 static int     x_fold_case(int);
 static char    *x_lastcp(void);
 static void    do_complete(int, Comp_type);
+static int     isu8cont(unsigned char);
 
 /* proto's for keybindings */
 static int     x_abort(int);
@@ -263,6 +264,12 @@ static const struct x_ftab x_ftab[] = {
 };
 
 int
+isu8cont(unsigned char c)
+{
+       return (c & (0x80 | 0x40)) == 0x80;
+}
+
+int
 x_emacs(char *buf, size_t len)
 {
        struct kb_entry         *k, *kmatch = NULL;
@@ -468,6 +475,8 @@ x_del_back(int c)
        }
        if (x_arg > col)
                x_arg = col;
+       while (x_arg < col && isu8cont(xcp[-x_arg]))
+               x_arg++;
        x_goto(xcp - x_arg);
        x_delete(x_arg, false);
        return KSTD;
@@ -484,11 +493,13 @@ x_del_char(int c)
        }
        if (x_arg > nleft)
                x_arg = nleft;
+       while (x_arg < nleft && isu8cont(xcp[x_arg]))
+               x_arg++;
        x_delete(x_arg, false);
        return KSTD;
 }
 
-/* Delete nc chars to the right of the cursor (including cursor position) */
+/* Delete nc bytes to the right of the cursor (including cursor position) */
 static void
 x_delete(int nc, int push)
 {
@@ -660,6 +671,8 @@ x_size(int c)
                return 4;       /* Kludge, tabs are always four spaces. */
        if (iscntrl(c))         /* control char */
                return 2;
+       if (isu8cont(c))
+               return 0;
        return 1;
 }
 
@@ -697,6 +710,8 @@ x_mv_back(int c)
        }
        if (x_arg > col)
                x_arg = col;
+       while (x_arg < col && isu8cont(xcp[-x_arg]))
+               x_arg++;
        x_goto(xcp - x_arg);
        return KSTD;
 }
@@ -712,6 +727,8 @@ x_mv_forw(int c)
        }
        if (x_arg > nleft)
                x_arg = nleft;
+       while (x_arg < nleft && isu8cont(xcp[x_arg + 1]))
+               x_arg++;
        x_goto(xcp + x_arg);
        return KSTD;
 }
@@ -1119,6 +1136,8 @@ x_kill(int c)
                x_arg = lastcol;
        else if (x_arg > lastcol)
                x_arg = lastcol;
+       while (x_arg < lastcol && isu8cont(xbuf[x_arg]))
+               x_arg++;
        ndel = x_arg - col;
        if (ndel < 0) {
                x_goto(xbuf + x_arg);
@@ -2108,13 +2127,13 @@ x_fold_case(int c)
 }
 
 /* NAME:
- *      x_lastcp - last visible char
+ *      x_lastcp - last visible byte
  *
  * SYNOPSIS:
  *      x_lastcp()
  *
  * DESCRIPTION:
- *      This function returns a pointer to that  char in the
+ *      This function returns a pointer to that byte in the
  *      edit buffer that will be the last displayed on the
  *      screen.  The sequence:
  *

Re: ksh(1): utf8 in emacs editing mode

Reply via email to