Re: start UTF-8 support for ksh(1) vi editing mode

2016-01-17 Thread Ingo Schwarze
Hi Martin, hi Theo,

thank you for your feedback!

Martijn van Duren wrote on Sun, Jan 17, 2016 at 12:06:26PM +0100:

> When applying this patch and running in an environment without an
> UTF-8 LC_ALL or LC_CTYPE the isu8cont gives the reverse problem of
> the current ksh having UTF-8 filenames with UTF-8 enabled.
> It becomes impossible to properly remove the utf-8 characters where
> the screen is badly redrawn, while in the old situation these could
> be removed byte-wise when in POSIX mode.
>
> Example (caret is cursor):
> $ cd ./Muziek/Mot??rhead/
>  ^
> 
> $ cd ./Muziek/Mot??rhead/
>^
> 
> $ ccd ./Muziek/Mot??rhead
> 
> Since (afaik) utf-8 isn't supported on the console this could become an
> annoying situation in some situations.

All those seem valid concerns.  So we should only change behaviour
based on the locale(1), which means that the shell has to call
setlocale(3) - of course only in the non-ramdisk case.

Patch updated in two places:

 1. Call setlocale(3) in main().
 2. Check the locale in isu8cont().

> I reckon it would be cleaner to do it like ls where the character
> is first pulled through mbtowc and based on it's return printed or
> presented as an ? per byte, or at the very least treated as a single
> byte.

Right now, i'm trying to be extremely conservative and not use any
multibyte library functions.  But yes, ultimately, that will be
needed, in particular to support zero-width and double-width
characters.  But i don't deem the time ripe to attempt that just yet.


Theo (tb@) pointed out in private that entering a two-byte UTF-8
character produces the following echo to the terminal with the
first version of the patch:  first byte, backspace, first byte,
second byte.  While that works at least on some terminals, that's
certainly not ideal.  The following updated version of the patch
fixes this by keeping minimal state in display() and avoiding
the backup and reprint of the start byte if nothing unrelated
happened in between.  No changes outside display().

More testing and feedback is certainly welcome.

Yours,
  Ingo


Index: main.c
===
RCS file: /cvs/src/bin/ksh/main.c,v
retrieving revision 1.78
diff -u -p -r1.78 main.c
--- main.c  30 Dec 2015 09:07:00 -  1.78
+++ main.c  17 Jan 2016 16:40:03 -
@@ -8,6 +8,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -153,6 +154,8 @@ main(int argc, char *argv[])
kshname = argv[0];
 
 #ifndef MKNOD
+   setlocale(LC_CTYPE, "");
+
if (pledge("stdio rpath wpath cpath fattr flock getpw proc exec tty",
NULL) == -1) {
perror("pledge");
Index: vi.c
===
RCS file: /cvs/src/bin/ksh/vi.c,v
retrieving revision 1.39
diff -u -p -r1.39 vi.c
--- vi.c22 Dec 2015 08:39:26 -  1.39
+++ vi.c17 Jan 2016 16:40:03 -
@@ -12,6 +12,7 @@
 #include   /* completion */
 
 #include 
+#include 
 #include 
 
 #include "sh.h"
@@ -21,11 +22,11 @@
 #define CTRL(c)(c & 0x1f)
 
 struct edstate {
-   int winleft;
-   char*cbuf;
-   int cbufsize;
-   int linelen;
-   int cursor;
+   char*cbuf;  /* main buffer to build the command line */
+   int cbufsize;   /* number of bytes allocated for cbuf */
+   int linelen;/* current number of bytes in cbuf */
+   int winleft;/* first byte# in cbuf to be displayed */
+   int cursor; /* byte# in cbuf having the cursor */
 };
 
 
@@ -68,6 +69,7 @@ static void   vi_pprompt(int);
 static voidvi_error(void);
 static voidvi_macro_reset(void);
 static int x_vi_putbuf(const char *, size_t);
+static int isu8cont(unsigned char);
 
 #define C_ 0x1 /* a valid command that isn't a M_, E_, U_ */
 #define M_ 0x2 /* movement command (h, l, etc.) */
@@ -148,7 +150,7 @@ static void restore_edstate(struct edst
 static voidfree_edstate(struct edstate *old);
 
 static struct edstate  ebuf;
-static struct edstate  undobuf = { 0, undocbuf, CMDLEN, 0, 0 };
+static struct edstate  undobuf = { undocbuf, CMDLEN, 0, 0, 0 };
 
 static struct edstate  *es;/* current editor state */
 static struct edstate  *undo;
@@ -157,7 +159,7 @@ static char ibuf[CMDLEN];   /* input buff
 static int first_insert;   /* set when starting in insert mode */
 static int saved_inslen;   /* saved inslen for first insert */
 static int inslen; /* length of input buffer */
-static int srchlen;/* length of current search pattern */
+static int srchlen;/* number of bytes in search pattern */
 static charybuf[CMDLEN];   /* yank buffer */
 static int yanklen;/* length of yank buffer 

start UTF-8 support for ksh(1) vi editing mode

2016-01-16 Thread Ingo Schwarze
Hi,

the last few days i basically dug in to focus on understanding the vi
editing mode in ksh(1).  Given the quick partial success with the emacs
mode, i hoped that the usual 20/80 rule might apply.  However, the code
implementing vi mode seems substantially more contorted to me than the
code implementing emacs mode, so patches turn out to be slightly less
pretty.

Here is what might be 5% of the work giving 50% of the final usefulness.

This patch (-79 +145) includes three parts:

 1. Adding comments such that people reading the patch can understand
what is going on (that accounts for 29 of the added lines).

 2. Cleanup by deleting the unused "lastref" global variable.

 3. Implement insertion of UTF-8 characters and basic UTF-8 support
for the following commands: a h i I l x X ^H
The following commands are out of scope for this patch:
b B e E r R w W ^W

If you consider that useful, i can of course split the patch and
propose the three parts seperately.  But i think this is an exception
where it's easier to review as a whole because the comments are
added exactly at the places where you need them to understand the
changes.

What do you think?

Yours,
  Ingo


Index: vi.c
===
RCS file: /cvs/src/bin/ksh/vi.c,v
retrieving revision 1.39
diff -u -p -r1.39 vi.c
--- vi.c22 Dec 2015 08:39:26 -  1.39
+++ vi.c16 Jan 2016 23:37:13 -
@@ -21,11 +21,11 @@
 #define CTRL(c)(c & 0x1f)
 
 struct edstate {
-   int winleft;
-   char*cbuf;
-   int cbufsize;
-   int linelen;
-   int cursor;
+   char*cbuf;  /* main buffer to build the command line */
+   int cbufsize;   /* number of bytes allocated for cbuf */
+   int linelen;/* current number of bytes in cbuf */
+   int winleft;/* first byte# in cbuf to be displayed */
+   int cursor; /* byte# in cbuf having the cursor */
 };
 
 
@@ -68,6 +68,7 @@ static void   vi_pprompt(int);
 static voidvi_error(void);
 static voidvi_macro_reset(void);
 static int x_vi_putbuf(const char *, size_t);
+static int isu8cont(unsigned char);
 
 #define C_ 0x1 /* a valid command that isn't a M_, E_, U_ */
 #define M_ 0x2 /* movement command (h, l, etc.) */
@@ -148,7 +149,7 @@ static void restore_edstate(struct edst
 static voidfree_edstate(struct edstate *old);
 
 static struct edstate  ebuf;
-static struct edstate  undobuf = { 0, undocbuf, CMDLEN, 0, 0 };
+static struct edstate  undobuf = { undocbuf, CMDLEN, 0, 0, 0 };
 
 static struct edstate  *es;/* current editor state */
 static struct edstate  *undo;
@@ -157,7 +158,7 @@ static char ibuf[CMDLEN];   /* input buff
 static int first_insert;   /* set when starting in insert mode */
 static int saved_inslen;   /* saved inslen for first insert */
 static int inslen; /* length of input buffer */
-static int srchlen;/* length of current search pattern */
+static int srchlen;/* number of bytes in search pattern */
 static charybuf[CMDLEN];   /* yank buffer */
 static int yanklen;/* length of yank buffer */
 static int fsavecmd = ' '; /* last find command */
@@ -166,7 +167,7 @@ static char lastcmd[MAXVICMD];  /* last n
 static int lastac; /* argcnt for lastcmd */
 static int lastsearch = ' ';   /* last search command */
 static charsrchpat[SRCHLEN];   /* last search pattern */
-static int insert; /* non-zero in insert mode */
+static int insert; /* mode: INSERT, REPLACE, or 0 */
 static int hnum;   /* position in history */
 static int ohnum;  /* history line copied (after mod) */
 static int hlast;  /* 1 past last position in history */
@@ -399,8 +400,12 @@ vi_hook(int ch)
state = VCMD;
} else if (ch == edchars.erase || ch == CTRL('h')) {
if (srchlen != 0) {
-   srchlen--;
-   es->linelen -= char_len((unsigned 
char)locpat[srchlen]);
+   do {
+   srchlen--;
+   es->linelen -= char_len(
+   (unsigned char)locpat[srchlen]);
+   } while (srchlen > 0 &&
+   isu8cont(locpat[srchlen]));
es->cursor = es->linelen;
refresh(0);
return 0;
@@ -568,25 +573,28 @@ vi_insert(int ch)
vi_error();
return