Re: diff: IUTF8 support

2012-04-22 Thread Nicholas Marriott
This makes sense to me.

I think there is no sane way to handle UTF-8 characters of width != 1...


On Thu, Apr 12, 2012 at 05:44:19AM -0700, Matthew Dempsky wrote:
 While reading the mosh research paper[1], I noticed we don't have
 IUTF8, which is necessary for backspace to work correctly in canonical
 mode (ICANON) with UTF-8 characters.
 
 [1] http://mosh.mit.edu/mosh-paper-draft.pdf
 
 I took a quick stab at implementing it, and it didn't seem too bad.
 See kernel diff below along with a corresponding fix to xterm(1) to
 actually make use of it on OpenBSD.
 
 I've only played with it for a little while, but I haven't noticed any
 issues with various mixes of backspace, ^W, and tabs.  I'm going to
 try to write some regress tests for it now, so I'm very interested if
 anyone notices any issues or differences in behavior from Linux.
 
 I suspect Unicode combining characters behave oddly, but Linux doesn't
 worry about those either as far as I can tell.
 
 
 Index: src/sys/sys/termios.h
 ===
 RCS file: /cvs/src/sys/sys/termios.h,v
 retrieving revision 1.11
 diff -u -p -r1.11 termios.h
 --- src/sys/sys/termios.h 26 Dec 2009 09:46:17 -  1.11
 +++ src/sys/sys/termios.h 12 Apr 2012 12:33:04 -
 @@ -101,7 +101,8 @@
  #if __BSD_VISIBLE
  #define  IXANY   0x0800  /* any char will restart after 
 stop */
  #define  IUCLC   0x1000  /* translate upper to lower 
 case */
 -#define IMAXBEL  0x2000  /* ring bell on input queue 
 full */
 +#define  IMAXBEL 0x2000  /* ring bell on input queue 
 full */
 +#define  IUTF8   0x4000  /* input stream is UTF-8 
 encoded */
  #endif /* __BSD_VISIBLE */
  
  /*
 Index: src/sys/kern/tty.c
 ===
 RCS file: /cvs/src/sys/kern/tty.c,v
 retrieving revision 1.94
 diff -u -p -r1.94 tty.c
 --- src/sys/kern/tty.c23 Mar 2012 15:51:26 -  1.94
 +++ src/sys/kern/tty.c12 Apr 2012 12:33:04 -
 @@ -163,6 +163,8 @@ u_char const char_type[] = {
  #define  tolower(c)  ((c) - 'A' + 'a')
  #define  toupper(c)  ((c) - 'a' + 'A')
  
 +#define iscont(tp, c) (((tp)->t_iflag & IUTF8) && (c) >= 0x80 && (c) < 0xc0)
 +
  struct ttylist_head ttylist; /* TAILQ_HEAD */
  int tty_count;
  
 @@ -441,8 +443,12 @@ parmrk:  (void)putc(0377 | 
 TTY_QUOTE, 
* erase (^H / ^?)
*/
   if (CCEQ(cc[VERASE], c)) {
 - if (tp->t_rawq.c_cc)
 - ttyrub(unputc(&tp->t_rawq), tp);
 + do {
 + c = unputc(&tp->t_rawq);
 + if (c == -1)
 + break;
 + ttyrub(c, tp);
 + } while (iscont(tp, c));
   goto endcase;
   }
   /*
 @@ -698,7 +704,8 @@ ttyoutput(int c, struct tty *tp)
   col = 0;
   break;
   case ORDINARY:
 - ++col;
 + if (!iscont(tp, c))
 + ++col;
   break;
   case TAB:
   col = (col + 8) & ~7;
 @@ -1874,6 +1881,8 @@ ttyrub(int c, struct tty *tp)
   int tabc, s;
  
   if (!ISSET(tp->t_lflag, ECHO) || ISSET(tp->t_lflag, EXTPROC))
 + return;
 + if (iscont(tp, c))
   return;
   CLR(tp-t_lflag, FLUSHO);
   if (ISSET(tp-t_lflag, ECHOE)) {
 Index: xenocara/app/xterm/main.c
 ===
 RCS file: /cvs/xenocara/app/xterm/main.c,v
 retrieving revision 1.22
 diff -u -p -r1.22 main.c
 --- xenocara/app/xterm/main.c 16 Nov 2011 21:14:25 -  1.22
 +++ xenocara/app/xterm/main.c 12 Apr 2012 12:33:04 -
 @@ -3836,7 +3836,7 @@ spawnXTerm(XtermWidget xw)
   /* input: nl->nl, don't ignore cr, cr->nl */
   UIntClr(tio.c_iflag, (INLCR | IGNCR));
   tio.c_iflag |= ICRNL;
 -#if OPT_WIDE_CHARS && defined(linux) && defined(IUTF8)
 +#if OPT_WIDE_CHARS && defined(IUTF8)
  #if OPT_LUIT_PROG
   if (command_to_exec_with_luit == 0)
  #endif



diff: IUTF8 support

2012-04-12 Thread Matthew Dempsky
While reading the mosh research paper[1], I noticed we don't have
IUTF8, which is necessary for backspace to work correctly in canonical
mode (ICANON) with UTF-8 characters.

[1] http://mosh.mit.edu/mosh-paper-draft.pdf

I took a quick stab at implementing it, and it didn't seem too bad.
See kernel diff below along with a corresponding fix to xterm(1) to
actually make use of it on OpenBSD.

I've only played with it for a little while, but I haven't noticed any
issues with various mixes of backspace, ^W, and tabs.  I'm going to
try to write some regress tests for it now, so I'm very interested if
anyone notices any issues or differences in behavior from Linux.

I suspect Unicode combining characters behave oddly, but Linux doesn't
worry about those either as far as I can tell.


Index: src/sys/sys/termios.h
===
RCS file: /cvs/src/sys/sys/termios.h,v
retrieving revision 1.11
diff -u -p -r1.11 termios.h
--- src/sys/sys/termios.h   26 Dec 2009 09:46:17 -  1.11
+++ src/sys/sys/termios.h   12 Apr 2012 12:33:04 -
@@ -101,7 +101,8 @@
 #if __BSD_VISIBLE
 #define	IXANY   0x0800  /* any char will restart after stop */
 #define	IUCLC   0x1000  /* translate upper to lower case */
-#define IMAXBEL 0x2000  /* ring bell on input queue full */
+#define	IMAXBEL 0x2000  /* ring bell on input queue full */
+#define	IUTF8   0x4000  /* input stream is UTF-8 encoded */
 #endif /* __BSD_VISIBLE */
 
 /*
Index: src/sys/kern/tty.c
===
RCS file: /cvs/src/sys/kern/tty.c,v
retrieving revision 1.94
diff -u -p -r1.94 tty.c
--- src/sys/kern/tty.c  23 Mar 2012 15:51:26 -  1.94
+++ src/sys/kern/tty.c  12 Apr 2012 12:33:04 -
@@ -163,6 +163,8 @@ u_char const char_type[] = {
 #define	tolower(c)  ((c) - 'A' + 'a')
 #define	toupper(c)  ((c) - 'a' + 'A')
 
+#define iscont(tp, c)  (((tp)->t_iflag & IUTF8) && (c) >= 0x80 && (c) < 0xc0)
+
 struct ttylist_head ttylist;   /* TAILQ_HEAD */
 int tty_count;
 
@@ -441,8 +443,12 @@ parmrk:(void)putc(0377 | 
TTY_QUOTE, 
 * erase (^H / ^?)
 */
if (CCEQ(cc[VERASE], c)) {
-   if (tp->t_rawq.c_cc)
-   ttyrub(unputc(&tp->t_rawq), tp);
+   do {
+   c = unputc(&tp->t_rawq);
+   if (c == -1)
+   break;
+   ttyrub(c, tp);
+   } while (iscont(tp, c));
goto endcase;
}
/*
@@ -698,7 +704,8 @@ ttyoutput(int c, struct tty *tp)
col = 0;
break;
case ORDINARY:
-   ++col;
+   if (!iscont(tp, c))
+   ++col;
break;
case TAB:
col = (col + 8) & ~7;
@@ -1874,6 +1881,8 @@ ttyrub(int c, struct tty *tp)
int tabc, s;
 
if (!ISSET(tp->t_lflag, ECHO) || ISSET(tp->t_lflag, EXTPROC))
+   return;
+   if (iscont(tp, c))
return;
CLR(tp-t_lflag, FLUSHO);
if (ISSET(tp-t_lflag, ECHOE)) {
Index: xenocara/app/xterm/main.c
===
RCS file: /cvs/xenocara/app/xterm/main.c,v
retrieving revision 1.22
diff -u -p -r1.22 main.c
--- xenocara/app/xterm/main.c   16 Nov 2011 21:14:25 -  1.22
+++ xenocara/app/xterm/main.c   12 Apr 2012 12:33:04 -
@@ -3836,7 +3836,7 @@ spawnXTerm(XtermWidget xw)
/* input: nl->nl, don't ignore cr, cr->nl */
UIntClr(tio.c_iflag, (INLCR | IGNCR));
tio.c_iflag |= ICRNL;
-#if OPT_WIDE_CHARS && defined(linux) && defined(IUTF8)
+#if OPT_WIDE_CHARS && defined(IUTF8)
 #if OPT_LUIT_PROG
if (command_to_exec_with_luit == 0)
 #endif



Re: diff: IUTF8 support

2012-04-12 Thread Christian Weisgerber
Matthew Dempsky <matt...@dempsky.org> wrote:

> While reading the mosh research paper[1], I noticed we don't have
> IUTF8, which is necessary for backspace to work correctly in canonical
> mode (ICANON) with UTF-8 characters.

Here's a quick diff for stty(1).

Index: modes.c
===
RCS file: /cvs/src/bin/stty/modes.c,v
retrieving revision 1.10
diff -u -p -r1.10 modes.c
--- modes.c 27 Oct 2009 23:59:22 -  1.10
+++ modes.c 12 Apr 2012 13:45:33 -
@@ -115,6 +115,8 @@ const struct modes imodes[] = {
{ "-decctlq",   IXANY, 0 },
{ "imaxbel",    IMAXBEL, 0 },
{ "-imaxbel",   0, IMAXBEL },
+   { "iutf8",  IUTF8, 0 },
+   { "-iutf8", 0, IUTF8 },
{ NULL },
 };
 
Index: print.c
===
RCS file: /cvs/src/bin/stty/print.c,v
retrieving revision 1.13
diff -u -p -r1.13 print.c
--- print.c 27 Oct 2009 23:59:22 -  1.13
+++ print.c 12 Apr 2012 13:54:25 -
@@ -130,6 +130,7 @@ print(struct termios *tp, struct winsize
put("-ixoff", IXOFF, 0);
put("-ixany", IXANY, 1);
put("-imaxbel", IMAXBEL, 1);
+   put("-iutf8", IUTF8, 0);
put("-ignbrk", IGNBRK, 0);
put("-brkint", BRKINT, 1);
put("-inpck", INPCK, 0);
Index: stty.1
===
RCS file: /cvs/src/bin/stty/stty.1,v
retrieving revision 1.38
diff -u -p -r1.38 stty.1
--- stty.1  3 Sep 2011 22:59:08 -   1.38
+++ stty.1  12 Apr 2012 14:10:59 -
@@ -233,6 +233,8 @@ Otherwise, if
 .Cm imaxbel
 is unset and the input queue is full, the next input character causes
 the entire input and output queues to be discarded.
+.It Cm iutf8 Pq Fl iutf8
+Enable (disable) UTF-8 processing in canonical mode.
 .El
 .Ss Output modes
 This corresponds to the
@@ -267,7 +269,7 @@ Local mode flags (lflags) affect various
 processing.
 Historically the term local pertained to new job control features
 implemented by Jim Kulp on a
-.Tn Pdp 11/70
+.Tn PDP 11/70
 at
 .Tn IIASA .
 Later the driver ran on the first

-- 
Christian naddy Weisgerber  na...@mips.inka.de



Re: diff: IUTF8 support

2012-04-12 Thread Christian Weisgerber
Matthew Dempsky <matt...@dempsky.org> wrote:

> While reading the mosh research paper[1], I noticed we don't have
> IUTF8, which is necessary for backspace to work correctly in canonical
> mode (ICANON) with UTF-8 characters.

In principle, IUTF8 should also be added to ssh(1).

I see that the Debian people already made some noise about that back in
2005
http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=337041
but were told that it was too late to standardize it in what became
RFC4250.

Index: ttymodes.h
===
RCS file: /cvs/src/usr.bin/ssh/ttymodes.h,v
retrieving revision 1.14
diff -u -p -r1.14 ttymodes.h
--- ttymodes.h  25 Mar 2006 22:22:43 -  1.14
+++ ttymodes.h  12 Apr 2012 14:38:26 -
@@ -127,6 +127,9 @@ TTYMODE(IXOFF,  c_iflag, 40)
 #ifdef IMAXBEL
 TTYMODE(IMAXBEL,c_iflag, 41)
 #endif /* IMAXBEL */
+#ifdef IUTF8
+TTYMODE(IUTF8, c_iflag, 42)
+#endif
 
 TTYMODE(ISIG,  c_lflag, 50)
 TTYMODE(ICANON,c_lflag, 51)
-- 
Christian naddy Weisgerber  na...@mips.inka.de