Hello,

In some multibyte encodings, such as Shift_JIS[1], the trailing bytes of
a multibyte character may fall within the 7-bit ASCII range.  That can
confuse the 'read' command in ksh.

  $ /bin/echo -e '\x95\x5c' > x
  $ LANG=ja_JP.SJIS ksh -c 'read a' < x
  zsh: exit 1     LANG=ja_JP.SJIS ksh -c 'read a' < x

Note that the byte sequence '\x95\x5c' represents the multibyte character
U+8868 in Shift_JIS, while '\x5c' on its own is a backslash when
interpreted as a single-byte character, so 'read' appears to mistake the
trailing byte for an escape character.

The attached patch tries to fix this.

Index: ast-ksh.2014-09-29/src/cmd/ksh93/bltins/read.c
===================================================================
--- ast-ksh.2014-09-29.orig/src/cmd/ksh93/bltins/read.c
+++ ast-ksh.2014-09-29/src/cmd/ksh93/bltins/read.c
@@ -728,6 +728,9 @@ int sh_readline(register Shell_t *shp,ch
 	del = 0;
 	while(1)
 	{
+#if SHOPT_MULTIBYTE
+		ssize_t z;
+#endif	/* SHOPT_MULTIBYTE */
 		switch(c)
 		{
 #if SHOPT_MULTIBYTE
@@ -864,11 +867,31 @@ int sh_readline(register Shell_t *shp,ch
 			}
 			/* skip over word characters */
 			wrd = -1;
+#if SHOPT_MULTIBYTE
+			/* skip a preceding multibyte character if any */
+			if(c==0 && (z=mbsize(cp-1))>1)
+				cp += z - 1;
+#endif	/* SHOPT_MULTIBYTE */
 			while(1)
 			{
+#if !SHOPT_MULTIBYTE
 				while((c=shp->ifstable[*cp++])==0)
 					if(!wrd)
 						wrd = 1;
+#else  /* !SHOPT_MULTIBYTE */
+				while(1)
+				{
+					if((c=shp->ifstable[*cp])!=0)
+					{
+						cp++;
+						break;
+					}
+					if((z=mbsize(cp))>=1)
+						cp += z;
+					if(!wrd)
+						wrd = 1;
+				}
+#endif	/* SHOPT_MULTIBYTE */
 				if(inquote)
 				{
 					if(c==S_QUOTE)
Footnotes: 
[1]  https://en.wikipedia.org/wiki/Shift_JIS

Regards,
--
Daiki Ueno
_______________________________________________
ast-developers mailing list
ast-developers@lists.research.att.com
http://lists.research.att.com/mailman/listinfo/ast-developers

Reply via email to