Hi Dmitrij,

Dmitrij D. Czarkoff wrote on Fri, Oct 14, 2016 at 01:11:16PM +0200:

> I've noticed that in ksh's vi mode ranged operations are performed
> without respect to cursor's position within utf8 byte sequence.  Eg.:
> 
>  1. type "echo тест | hexdump -C"
>  2. leave inseart mode
>  3. "0", "2E", "dh", Enter
>  4. you end up with "те" and 0x82
>     (last byte of letter under cursor).
> 
> This happens because Endword() moves cursor to the whitespace after word
> and decrements cursor position by 1, so that it points to last byte of
> last letter.

That is exactly what i meant when saying "Some polishing may be
useful in tree, in particular adding utf8cont() support to a few
missing commands."  Thank you for paying attention to remaining
issues of this kind, for reporting them, and for writing patches.

> Then del_range() removes bytes between cursor position and
> preceding utf8 start byte, which include start byte of letter under
> cursor.
> 
> My diff makes del_range() (and yank_range() which operates in the same
> manner) always skip to the beginning of utf8 sequence it is in.
> Although this is a more of a bandaid -

I'm not thrilled about the prospect to always have to sanitize the
cursor position before being able to use it.  I expect there will
be very many places in the code needing such sanition if we allow
the cursor to go just anywhere.

> proper fix would be to make sure that cursor never rests
> on continuation byte -

I very strongly agree with that.  That's what the commands h, i, l,
r, and x already do, and what emacs mode does as well.  I think
we should do the same for the remaining commands.  Keeping the
cursor at sane places, we won't need to mess with code only using
the cursor.

> it is less invasive and
> does not hurt code readability too much.

In the long run, i don't think changing all places *using* the
cursor would be less invasive than changing all places *moving*
the cursor.

The case you found here is moving by words: b, B, e, E, w, W,
and we should probably handle | at the same time.

So i prefer the patch below, which has the following chunks:

 * 708, vi_cmd:
   When a movement command moves to the end of the line,
   move back to the last character, not to the last byte.
 * 1180, domove:
   When the movement command after a change, delete, or yank
   command moves forward to the end of a word,
   move on to the next character, not to the next byte.
 * 1266, domove:
   When a move to column command moves to the middle of a character,
   back up to the beginning of that character.
   Note that this is still moving to byte positions, not to
   display columns, but the latter isn't feasible without
   mbtowc(3) and wcwidth(3).
 * 1526, forwword:
   Move forward by characters, not bytes.
 * 1526, backword:
   Move backward by characters, not bytes.
 * 1526, endword:
   Move forward by characters, not bytes.
   Also return the position *after* the word, not the last byte,
   because that is easier to handle in the caller.
 * 1635, Endword:
   Return the position *after* the word, not the last byte,
   because that is easier to handle in the caller.

Most of the word movement functions required complete rewrites,
but the new versions aren't harder to read than the old ones.

I have done some testing, but a bit more might be in order.

OK?
  Ingo


Index: vi.c
===================================================================
RCS file: /cvs/src/bin/ksh/vi.c,v
retrieving revision 1.40
diff -u -p -r1.40 vi.c
--- vi.c        11 Oct 2016 19:52:54 -0000      1.40
+++ vi.c        14 Oct 2016 17:49:41 -0000
@@ -708,7 +708,8 @@ vi_cmd(int argcnt, const char *cmd)
        if (is_move(*cmd)) {
                if ((cur = domove(argcnt, cmd, 0)) >= 0) {
                        if (cur == es->linelen && cur != 0)
-                               cur--;
+                               while (isu8cont(es->cbuf[--cur]))
+                                       continue;
                        es->cursor = cur;
                } else
                        return -1;
@@ -1180,19 +1181,13 @@ domove(int argcnt, const char *cmd, int 
                break;
 
        case 'e':
-               if (!sub && es->cursor + 1 >= es->linelen)
-                       return -1;
-               ncursor = endword(argcnt);
-               if (sub && ncursor < es->linelen)
-                       ncursor++;
-               break;
-
        case 'E':
                if (!sub && es->cursor + 1 >= es->linelen)
                        return -1;
-               ncursor = Endword(argcnt);
-               if (sub && ncursor < es->linelen)
-                       ncursor++;
+               ncursor = (*cmd == 'e' ? endword : Endword)(argcnt);
+               if (!sub)
+                       while (isu8cont((unsigned char)es->cbuf[--ncursor]))
+                               continue;
                break;
 
        case 'f':
@@ -1266,6 +1261,8 @@ domove(int argcnt, const char *cmd, int 
                        ncursor = es->linelen;
                if (ncursor)
                        ncursor--;
+               while (isu8cont(es->cbuf[ncursor]))
+                       ncursor--;
                break;
 
        case '$':
@@ -1526,80 +1523,100 @@ findch(int ch, int cnt, int forw, int in
        return ncursor;
 }
 
+/* Move right one character, and then to the beginning of the next word. */
 static int
 forwword(int argcnt)
 {
-       int     ncursor;
+       int ncursor, skip_space, want_letnum;
+       unsigned char uc;
 
        ncursor = es->cursor;
        while (ncursor < es->linelen && argcnt--) {
-               if (letnum(es->cbuf[ncursor]))
-                       while (letnum(es->cbuf[ncursor]) &&
-                           ncursor < es->linelen)
-                               ncursor++;
-               else if (!isspace((unsigned char)es->cbuf[ncursor]))
-                       while (!letnum(es->cbuf[ncursor]) &&
-                           !isspace((unsigned char)es->cbuf[ncursor]) &&
-                           ncursor < es->linelen)
-                               ncursor++;
-               while (isspace((unsigned char)es->cbuf[ncursor]) &&
-                   ncursor < es->linelen)
-                       ncursor++;
+               skip_space = 0;
+               want_letnum = -1;
+               ncursor--;
+               while (++ncursor < es->linelen) {
+                       uc = es->cbuf[ncursor];
+                       if (isspace(uc)) {
+                               skip_space = 1;
+                               continue;
+                       } else if (skip_space)
+                               break; 
+                       if (uc & 0x80)
+                               continue;
+                       if (want_letnum == -1)
+                               want_letnum = letnum(uc);
+                       else if (want_letnum != letnum(uc))
+                               break;
+               }
        }
        return ncursor;
 }
 
+/* Move left one character, and then to the beginning of the word. */
 static int
 backword(int argcnt)
 {
-       int     ncursor;
+       int ncursor, skip_space, want_letnum;
+       unsigned char uc;
 
        ncursor = es->cursor;
        while (ncursor > 0 && argcnt--) {
-               while (--ncursor > 0 && isspace((unsigned 
char)es->cbuf[ncursor]))
-                       ;
-               if (ncursor > 0) {
-                       if (letnum(es->cbuf[ncursor]))
-                               while (--ncursor >= 0 &&
-                                   letnum(es->cbuf[ncursor]))
-                                       ;
-                       else
-                               while (--ncursor >= 0 &&
-                                   !letnum(es->cbuf[ncursor]) &&
-                                   !isspace((unsigned char)es->cbuf[ncursor]))
-                                       ;
-                       ncursor++;
+               skip_space = 1;
+               want_letnum = -1;
+               while (ncursor-- > 0) {
+                       uc = es->cbuf[ncursor];
+                       if (isspace(uc)) {
+                               if (skip_space)
+                                       continue;
+                               else
+                                       break;
+                       }
+                       skip_space = 0;
+                       if (uc & 0x80)
+                               continue;
+                       if (want_letnum == -1)
+                               want_letnum = letnum(uc);
+                       else if (want_letnum != letnum(uc))
+                               break;
                }
+               ncursor++;
        }
        return ncursor;
 }
 
+/* Move right one character, and then to the byte after the word. */
 static int
 endword(int argcnt)
 {
-       int     ncursor;
+       int ncursor, skip_space, want_letnum;
+       unsigned char uc;
 
        ncursor = es->cursor;
        while (ncursor < es->linelen && argcnt--) {
-               while (++ncursor < es->linelen - 1 &&
-                   isspace((unsigned char)es->cbuf[ncursor]))
-                       ;
-               if (ncursor < es->linelen - 1) {
-                       if (letnum(es->cbuf[ncursor]))
-                               while (++ncursor < es->linelen &&
-                                   letnum(es->cbuf[ncursor]))
-                                       ;
-                       else
-                               while (++ncursor < es->linelen &&
-                                   !letnum(es->cbuf[ncursor]) &&
-                                   !isspace((unsigned char)es->cbuf[ncursor]))
-                                       ;
-                       ncursor--;
+               skip_space = 1;
+               want_letnum = -1;
+               while (++ncursor < es->linelen) {
+                       uc = es->cbuf[ncursor];
+                       if (isspace(uc)) {
+                               if (skip_space)
+                                       continue;
+                               else
+                                       break;
+                       }
+                       skip_space = 0;
+                       if (uc & 0x80)
+                               continue;
+                       if (want_letnum == -1)
+                               want_letnum = letnum(uc);
+                       else if (want_letnum != letnum(uc))
+                               break;
                }
        }
        return ncursor;
 }
 
+/* Move right one character, and then to the beginning of the next big word. */
 static int
 Forwword(int argcnt)
 {
@@ -1617,6 +1634,7 @@ Forwword(int argcnt)
        return ncursor;
 }
 
+/* Move left one character, and then to the beginning of the big word. */
 static int
 Backword(int argcnt)
 {
@@ -1635,22 +1653,20 @@ Backword(int argcnt)
        return ncursor;
 }
 
+/* Move right one character, and then to the byte after the big word. */
 static int
 Endword(int argcnt)
 {
        int     ncursor;
 
        ncursor = es->cursor;
-       while (ncursor < es->linelen - 1 && argcnt--) {
-               while (++ncursor < es->linelen - 1 &&
+       while (ncursor < es->linelen && argcnt--) {
+               while (++ncursor < es->linelen &&
                    isspace((unsigned char)es->cbuf[ncursor]))
                        ;
-               if (ncursor < es->linelen - 1) {
-                       while (++ncursor < es->linelen &&
-                           !isspace((unsigned char)es->cbuf[ncursor]))
-                               ;
-                       ncursor--;
-               }
+               while (ncursor < es->linelen &&
+                   !isspace((unsigned char)es->cbuf[ncursor]))
+                       ncursor++;
        }
        return ncursor;
 }

Reply via email to