Hi there, I got annoyed with ksh(1) for messing up my command line after I accidentally entered an umlaut, and decided to take a stab at teaching it some UTF-8. The diff is inspired by Ted Unangst's recent patches for, e.g., rs [0].
It works for my use cases and seems to handle 2-byte (ßüöä) and 3-byte (€) sequences quite well; I hope it does so for longer ones, too. Maybe it's of use for someone else as well. Cheers, Frederic [0] https://marc.info/?l=openbsd-tech&m=144560099607564 Index: emacs.c =================================================================== RCS file: /cvs/src/bin/ksh/emacs.c,v retrieving revision 1.60 diff -u -p -r1.60 emacs.c --- emacs.c 19 Oct 2015 14:42:16 -0000 1.60 +++ emacs.c 10 Nov 2015 12:31:27 -0000 @@ -49,7 +49,7 @@ struct x_ftab { #define is_cfs(c) (c == ' ' || c == '\t' || c == '"' || c == '\'') /* Separator for motion */ -#define is_mfs(c) (!(isalnum((unsigned char)c) || c == '_' || c == '$')) +#define is_mfs(c) (!(isu8lead((unsigned char) c) || isu8cont((unsigned char) c) || isalnum((unsigned char)c) || c == '_' || c == '$')) /* Arguments for do_complete() * 0 = enumerate M-= complete as much as possible and then list @@ -198,6 +198,10 @@ static int x_comment(int); static int x_debug_info(int); #endif +/* utf8 support */ +static int isu8cont(unsigned char); +static int isu8lead(unsigned char); + static const struct x_ftab x_ftab[] = { { x_abort, "abort", 0 }, { x_beg_hist, "beginning-of-history", 0 }, @@ -263,6 +267,25 @@ static const struct x_ftab x_ftab[] = { }; int +isu8cont(unsigned char c) +{ + return ((c & (0x80 | 0x40)) == 0x80); +} + +int +isu8lead(unsigned char c) +{ + if ((c & 0xE0) == 0xC0) + return 1; + if ((c & 0xF0) == 0xE0) + return 2; + if ((c & 0xF8) == 0xF0) + return 3; + + return 0; +} + +int x_emacs(char *buf, size_t len) { struct kb_entry *k, *kmatch = NULL; @@ -468,6 +491,8 @@ x_del_back(int c) } if (x_arg > col) x_arg = col; + while(x_arg <= col && isu8cont(*(xcp - x_arg))) + x_arg++; x_goto(xcp - x_arg); x_delete(x_arg, false); return KSTD; @@ -621,7 +646,7 @@ x_fword(void) static void x_goto(char *cp) { - if (cp < xbp || cp >= (xbp + x_displen)) { + if (cp < xbp || cp >= xlp) { /* we are heading off screen */ xcp = cp; x_adjust(); 
@@ -660,6 +685,8 @@ x_size(int c) return 4; /* Kludge, tabs are always four spaces. */ if (iscntrl(c)) /* control char */ return 2; + if (isu8cont(c)) /* utf8 continuation byte */ + return 0; return 1; } @@ -669,7 +696,8 @@ x_zots(char *str) int adj = x_adj_done; x_lastcp(); - while (*str && str < xlp && adj == x_adj_done) + while (*str && (isu8cont(*str) || str < xlp) + && adj == x_adj_done) x_zotc(*str++); } @@ -697,6 +725,8 @@ x_mv_back(int c) } if (x_arg > col) x_arg = col; + while(x_arg <= col && isu8cont(*(xcp - x_arg))) + x_arg++; x_goto(xcp - x_arg); return KSTD; } @@ -710,6 +740,7 @@ x_mv_forw(int c) x_e_putc(BEL); return KSTD; } + x_arg += isu8lead(*xcp); if (x_arg > nleft) x_arg = nleft; x_goto(xcp + x_arg); @@ -1025,7 +1056,7 @@ x_redraw(int limit) if (xep > xlp) i = 0; /* we fill the line */ else - i = limit - (xlp - xbp); + i = limit - x_col; for (j = 0; j < i && x_col < (xx_cols - 2); j++) x_e_putc(' '); @@ -1821,11 +1852,18 @@ do_complete(int flags, /* XCF_{COMMAND,F static void x_adjust(void) { + int i; x_adj_done++; /* flag the fact that we were called. */ /* * we had a problem if the prompt length > xx_cols / 2 */ - if ((xbp = xcp - (x_displen / 2)) < xbuf) + xbp = xcp; + for(i = 0; i < (x_displen/2);) { + xbp--; + if(!isu8cont(*xbp)) + i++; + } + if (xbp < xbuf) xbp = xbuf; xlp_valid = false; x_redraw(xx_cols); @@ -1863,6 +1901,12 @@ x_e_getc(void) static void x_e_putc(int c) { + static int u8wait = 0; + if(isu8lead(c)) { + u8wait = isu8lead(c); + } else if(isu8cont(c)) { + u8wait--; + } if (c == '\r' || c == '\n') x_col = 0; if (x_col < xx_cols) { @@ -1874,10 +1918,12 @@ x_e_putc(int c) case '\n': break; case '\b': - x_col--; + if(!isu8cont(c)) + x_col--; break; default: - x_col++; + if(!u8wait) + x_col++; break; } }
