Hi, i'd like to propose a simplified version of this patch Frederic Nowak posted a few weeks ago for commit. Our experience is probably not yet sufficient to develop a full-blown solution for all UTF-8 problems in ksh(1), but this is very non-intrusive and makes the following commands better without breaking anything:
* Ctrl-b, Ctrl-xd, Esc-[D, Esc-OD (backward-char) * Ctrl-f, Ctrl-xc, Esc-[C, Esc-OC (forward-char) * Ctrl-h, Ctrl-? (delete-char-backward) * Esc-[3~ (delete-char-forward) * Ctrl-k (kill-to-eol) This patch deliberately does not attempt any character counting or display column counting. It's purely about not chopping UTF-8 characters into pieces, and it is not comprehensive. It only improves some of the most common cases. Even though i'm not planning to do more work in ksh(1) myself just yet, i would not like to see effort wasted that other people are spending. See my version of the patch at the end. Before that, i'm commenting on some parts of Frederic's patch. Note that i'm cutting away the parts i agree with. Opinions? Concerns? OKs? Ingo Frederic Nowak wrote on Tue, Nov 10, 2015 at 01:36:02PM +0100: > I got annoyed with ksh(1) for messing up my command line after > accidentally entering an umlaut and decided to take a stab at teaching > it some utf8. The diff is inspired by Ted Unangst's recent patches for > e.g. rs[0]. > > It works for my use cases and seems to handle 2-byte and 3-byte > sequences quite well; I hope it does so for longer ones, too. Maybe > it's of use for someone else as well. > > Cheers, > Frederic > > [0] https://marc.info/?l=openbsd-tech&m=144560099607564 > > > Index: emacs.c > =================================================================== > RCS file: /cvs/src/bin/ksh/emacs.c,v > retrieving revision 1.60 > diff -u -p -r1.60 emacs.c > --- emacs.c 19 Oct 2015 14:42:16 -0000 1.60 > +++ emacs.c 10 Nov 2015 12:31:27 -0000 > @@ -49,7 +49,7 @@ struct x_ftab { > #define is_cfs(c) (c == ' ' || c == '\t' || c == '"' || c == '\'') > > /* Separator for motion */ > -#define is_mfs(c) (!(isalnum((unsigned char)c) || c == '_' || c > == '$')) > +#define is_mfs(c) (!(isu8lead((unsigned char) c) || > isu8cont((unsigned char) c) || isalnum((unsigned char)c) || c == '_' || c == > '$')) In the first step, i'd like to focus on the simplest commands, so i'm not including word handling for now. If we get the first step committed, please consider rebasing the rest of your work and resubmitting a clean patch. > /* Arguments for do_complete() > * 0 = enumerate M-= complete as much as possible and then list > @@ -198,6 +198,10 @@ static int x_comment(int); > static int x_debug_info(int); > #endif > > +/* utf8 support */ > +static int isu8cont(unsigned char); > +static int isu8lead(unsigned char); > + Sometimes, it may be needed to detect lead bytes, but in patches of this style, i'd like to avoid that if possible, because typically, that keeps the code simpler. In the case at hand, it's possible without isu8lead(). [...] > @@ -468,6 +491,8 @@ x_del_back(int c) > } > if (x_arg > col) > x_arg = col; > + while(x_arg <= col && isu8cont(*(xcp - x_arg))) > + x_arg++; > x_goto(xcp - x_arg); > x_delete(x_arg, false); > return KSTD; This seems off by one to me. If x_arg == col, we must not increment it further, right? Otherwise, we will access xcp[-1]. Of course, that's a pathological case, because it only happens when the command line starts with a continuation byte, but still... > @@ -621,7 +646,7 @@ x_fword(void) > static void > x_goto(char *cp) > { > - if (cp < xbp || cp >= (xbp + x_displen)) { > + if (cp < xbp || cp >= xlp) { > /* we are heading off screen */ > xcp = cp; > x_adjust(); I don't understand why you propose to change this, nor how it's related to UTF-8. [...] > @@ -669,7 +696,8 @@ x_zots(char *str) > int adj = x_adj_done; > > x_lastcp(); > - while (*str && str < xlp && adj == x_adj_done) > + while (*str && (isu8cont(*str) || str < xlp) > + && adj == x_adj_done) > x_zotc(*str++); > } Isn't that change redundant? Given that x_size() returns 0 for continuation bytes, x_lastcp() can never leave xlp on a continuation byte, or can it? > @@ -697,6 +725,8 @@ x_mv_back(int c) > } > if (x_arg > col) > x_arg = col; > + while(x_arg <= col && isu8cont(*(xcp - x_arg))) > + x_arg++; > x_goto(xcp - x_arg); > return KSTD; > } That looks like off by one to me, too. > @@ -710,6 +740,7 @@ x_mv_forw(int c) > x_e_putc(BEL); > return KSTD; > } > + x_arg += isu8lead(*xcp); > if (x_arg > nleft) > x_arg = nleft; > x_goto(xcp + x_arg); I suggest to look at the next byte and use isu8cont(), see the patch below. > @@ -1025,7 +1056,7 @@ x_redraw(int limit) > if (xep > xlp) > i = 0; /* we fill the line */ > else > - i = limit - (xlp - xbp); > + i = limit - x_col; > > for (j = 0; j < i && x_col < (xx_cols - 2); j++) > x_e_putc(' '); Again, i don't understand how this change is related to UTF-8. > @@ -1821,11 +1852,18 @@ do_complete(int flags, /* XCF_{COMMAND,F > static void > x_adjust(void) > { > + int i; > x_adj_done++; /* flag the fact that we were called. */ > /* > * we had a problem if the prompt length > xx_cols / 2 > */ > - if ((xbp = xcp - (x_displen / 2)) < xbuf) > + xbp = xcp; > + for(i = 0; i < (x_displen/2);) { > + xbp--; > + if(!isu8cont(*xbp)) > + i++; > + } > + if (xbp < xbuf) > xbp = xbuf; > xlp_valid = false; > x_redraw(xx_cols); In the first step, i'd like to not touch line adjustment, but just cursor movement and character deletion. Please resubmit if we get the first step in. > @@ -1863,6 +1901,12 @@ x_e_getc(void) > static void > x_e_putc(int c) > { > + static int u8wait = 0; > + if(isu8lead(c)) { > + u8wait = isu8lead(c); > + } else if(isu8cont(c)) { > + u8wait--; > + } > if (c == '\r' || c == '\n') > x_col = 0; > if (x_col < xx_cols) { > @@ -1874,10 +1918,12 @@ x_e_putc(int c) > case '\n': > break; > case '\b': > - x_col--; > + if(!isu8cont(c)) > + x_col--; > break; That change seems redundant. We already know c == '\b', so we already know isu8cont(c) is false. > default: > - x_col++; > + if(!u8wait) > + x_col++; > break; > } > } Right now, x_col counts bytes, not characters, and even less display positions. I don't think we should change that in a hurry. To get basic cursor movement and character deletion to work, i don't think it is needed. What exactly is broken for you without this change? So, here is my version of the patch, proposed for commit: Index: emacs.c =================================================================== RCS file: /cvs/src/bin/ksh/emacs.c,v retrieving revision 1.60 diff -u -p -r1.60 emacs.c --- emacs.c 19 Oct 2015 14:42:16 -0000 1.60 +++ emacs.c 7 Dec 2015 23:52:27 -0000 @@ -82,7 +82,7 @@ static char *xend; /* end input buff static char *xcp; /* current position */ static char *xep; /* current end */ static char *xbp; /* start of visible portion of input buffer */ -static char *xlp; /* last char visible on screen */ +static char *xlp; /* last byte visible on screen */ static int x_adj_ok; /* * we use x_adj_done so that functions can tell @@ -138,6 +138,7 @@ static int x_comment(int); static int x_fold_case(int); static char *x_lastcp(void); static void do_complete(int, Comp_type); +static int isu8cont(unsigned char); /* proto's for keybindings */ static int x_abort(int); @@ -263,6 +264,12 @@ static const struct x_ftab x_ftab[] = { }; int +isu8cont(unsigned char c) +{ + return (c & (0x80 | 0x40)) == 0x80; +} + +int x_emacs(char *buf, size_t len) { struct kb_entry *k, *kmatch = NULL; @@ -468,6 +475,8 @@ x_del_back(int c) } if (x_arg > col) x_arg = col; + while (x_arg < col && isu8cont(xcp[-x_arg])) + x_arg++; x_goto(xcp - x_arg); x_delete(x_arg, false); return KSTD; @@ -484,11 +493,13 @@ x_del_char(int c) } if (x_arg > nleft) x_arg = nleft; + while (x_arg < nleft && isu8cont(xcp[x_arg])) + x_arg++; x_delete(x_arg, false); return KSTD; } -/* Delete nc chars to the right of the cursor (including cursor position) */ +/* Delete nc bytes to the right of the cursor (including cursor position) */ static void x_delete(int nc, int push) { @@ -660,6 +671,8 @@ x_size(int c) return 4; /* Kludge, tabs are always four spaces. */ if (iscntrl(c)) /* control char */ return 2; + if (isu8cont(c)) + return 0; return 1; } @@ -697,6 +710,8 @@ x_mv_back(int c) } if (x_arg > col) x_arg = col; + while (x_arg < col && isu8cont(xcp[-x_arg])) + x_arg++; x_goto(xcp - x_arg); return KSTD; } @@ -712,6 +727,8 @@ x_mv_forw(int c) } if (x_arg > nleft) x_arg = nleft; + while (x_arg < nleft && isu8cont(xcp[x_arg + 1])) + x_arg++; x_goto(xcp + x_arg); return KSTD; } @@ -1119,6 +1136,8 @@ x_kill(int c) x_arg = lastcol; else if (x_arg > lastcol) x_arg = lastcol; + while (x_arg < lastcol && isu8cont(xbuf[x_arg])) + x_arg++; ndel = x_arg - col; if (ndel < 0) { x_goto(xbuf + x_arg); @@ -2108,13 +2127,13 @@ x_fold_case(int c) } /* NAME: - * x_lastcp - last visible char + * x_lastcp - last visible byte * * SYNOPSIS: * x_lastcp() * * DESCRIPTION: - * This function returns a pointer to that char in the + * This function returns a pointer to that byte in the * edit buffer that will be the last displayed on the * screen. The sequence: *
