Hi,

The command line handling code in less/cmdbuf.c is very complicated.
>From the top level function cmd_char(), the stack can go down nine levels
before finally reaching the bottom level function cmd_step_common().
One of the functions traversed during that descent is the recursive
function cmd_repaint(), and the call tree has many different branches.
UTF-8 handling occurs both in the top level function and in the
bottom level function, and also at some places in between.

So when cleaning up the UTF-8 handling in here, i'm trying to proceed
very slowly and in minimal steps.  Given that i have little hope of
finding a system for testing each and every possible code path, my
plan is to make sure that individual changes do not change behaviour
locally (or only in intended ways).

So, lets get to the first, small step.  Right now, only one place
remains where the function step_char() is called with dir = -1,
i.e. moving the cursor leftward: from cmd_step_left() near the
bottom of the call stack.  That function is used when moving the
cursor left by one character (with or without erase), by one word,
or all the way back to the prompt.  It is also used to delete
leftover characters after repainting the line when the new text is
shorter than the old text.  And it is also used when scrolling
forward.  Given the complexity of the code, there may be even more
situations.

The obvious replacement for step_char(-1) is the new function
mbtowc_left() that i recently introduced.  Using that allows dropping
the "dir" argument from step_char().  Of course, the plan still is
to ultimately get rid fo step_char() altogether.

Let's make sure the new behaviour makes sense:

 * When at the beginning of the buffer, do not move, and use L'\0'
   as a replacement for the character that isn't there.
   Same behaviour as before.
 * When there is no valid UTF-8 character to the left (return value -1),
   back up one byte and use L'\0'.
   This _is_ a change of behaviour.  Current behaviour is to back up
   across all UTF-8 continuation bytes (no matter how many) until
   finding a byte that is not a continuation byte, then take the
   length given in that byte at face value (even if invalid) and
   construct a codepoint using that number of bytes (even if that
   implies ignoring additional garbage bytes in between, and even
   it it means extending the character that is supposedly to the
   left of the cursor to bytes that are to the right of the current
   cursor position).
   I believe making multibyte steps only for valid characters,
   and stepping byte-by-byte across invalid bytes, is safer than 
   the reckless jumping done right now.
   Then again, the practical difference may be small (if any)
   because i'm not quite sure how to get invalid bytes into the
   command buffer in the first place.
 * When there is a NUL byte to the left (return value 0),
   back up by this one byte, and use it as L'\0'.
   Same behaviour as before.

Tested by typing UTF-8 into the command buffer, moving the cursor
left and right across it (with and without deleting), inserting
UTF-8 before it, and scrolling the UTF-8 off the screen both to the
right and to the left.

OK?
  Ingo


Index: charset.c
===================================================================
RCS file: /cvs/src/usr.bin/less/charset.c,v
retrieving revision 1.26
diff -u -p -r1.26 charset.c
--- charset.c   2 Sep 2019 14:07:45 -0000       1.26
+++ charset.c   2 Sep 2019 15:00:50 -0000
@@ -353,7 +353,7 @@ get_wchar(const char *p)
  * Step forward or backward one character in a string.
  */
 LWCHAR
-step_char(char **pp, int dir, char *limit)
+step_char(char **pp, char *limit)
 {
        LWCHAR ch;
        int len;
@@ -361,11 +361,8 @@ step_char(char **pp, int dir, char *limi
 
        if (!utf_mode) {
                /* It's easy if chars are one byte. */
-               if (dir > 0)
-                       ch = (LWCHAR) ((p < limit) ? *p++ : 0);
-               else
-                       ch = (LWCHAR) ((p > limit) ? *--p : 0);
-       } else if (dir > 0) {
+               ch = (LWCHAR) ((p < limit) ? *p++ : 0);
+       } else {
                len = utf_len(*p);
                if (p + len > limit) {
                        ch = 0;
@@ -374,13 +371,6 @@ step_char(char **pp, int dir, char *limi
                        ch = get_wchar(p);
                        p += len;
                }
-       } else {
-               while (p > limit && IS_UTF8_TRAIL(p[-1]))
-                       p--;
-               if (p > limit)
-                       ch = get_wchar(--p);
-               else
-                       ch = 0;
        }
        *pp = p;
        return (ch);
Index: cmdbuf.c
===================================================================
RCS file: /cvs/src/usr.bin/less/cmdbuf.c,v
retrieving revision 1.20
diff -u -p -r1.20 cmdbuf.c
--- cmdbuf.c    2 Sep 2019 14:07:45 -0000       1.20
+++ cmdbuf.c    2 Sep 2019 15:00:51 -0000
@@ -141,7 +141,7 @@ len_cmdbuf(void)
        int len = 0;
 
        while (*s != '\0') {
-               step_char(&s, +1, endline);
+               step_char(&s, endline);
                len++;
        }
        return (len);
@@ -197,7 +197,7 @@ static char *
 cmd_step_right(char **pp, int *pwidth, int *bswidth)
 {
        char *p = *pp;
-       LWCHAR ch = step_char(pp, +1, p + strlen(p));
+       LWCHAR ch = step_char(pp, p + strlen(p));
 
        return (cmd_step_common(p, ch, *pp - p, pwidth, bswidth));
 }
@@ -208,10 +208,18 @@ cmd_step_right(char **pp, int *pwidth, i
 static char *
 cmd_step_left(char **pp, int *pwidth, int *bswidth)
 {
-       char *p = *pp;
-       LWCHAR ch = step_char(pp, -1, cmdbuf);
+       wchar_t ch;
+       int len;
 
-       return (cmd_step_common(*pp, ch, p - *pp, pwidth, bswidth));
+       if (*pp <= cmdbuf) {
+               ch = L'\0';
+               len = 0;
+       } else if ((len = mbtowc_left(&ch, *pp, *pp - cmdbuf)) <= 0) {
+               ch = L'\0';
+               len = 1;
+       }
+       *pp -= len;
+       return cmd_step_common(*pp, ch, len, pwidth, bswidth);
 }
 
 /*
@@ -813,7 +821,7 @@ cmd_istr(char *str)
 
        for (s = str; *s != '\0'; ) {
                char *os = s;
-               step_char(&s, +1, endline);
+               step_char(&s, endline);
                action = cmd_ichar(os, s - os);
                if (action != CC_OK) {
                        ring_bell();
Index: funcs.h
===================================================================
RCS file: /cvs/src/usr.bin/less/funcs.h,v
retrieving revision 1.25
diff -u -p -r1.25 funcs.h
--- funcs.h     2 Sep 2019 14:07:45 -0000       1.25
+++ funcs.h     2 Sep 2019 15:00:51 -0000
@@ -61,7 +61,7 @@ char *prchar(LWCHAR);
 char *prutfchar(LWCHAR);
 int utf_len(char);
 int is_utf8_well_formed(const char *);
-LWCHAR step_char(char **, int, char *);
+LWCHAR step_char(char **, char *);
 int is_composing_char(LWCHAR);
 int is_ubin_char(LWCHAR);
 int is_wide_char(LWCHAR);

Reply via email to