Jungshik Shin <[EMAIL PROTECTED]> writes:

> On Mon, 20 May 2002, Jungshik Shin wrote:
> > On Mon, 20 May 2002, Zvi Har'El wrote:
> 
> > > I am using less on a UTF-8 Redhat Linux 7.3 machine. I am having trouble
> > > using man, because overstriking is not handled properly.
> > > I read the
> 
> >   Anyway, attached is a *simplistic*(not perfect)  patch against
> > less 374(the newest at less home page)
> 
>   Attached is a newer patch. This one works better than the previous
> one although it still has the problem mentioned in my previous
> message(and quoted below). With this, two hyphens in a row(U+2010) overstruck
> with themselves are rendered correctly.  I don't have to be bothered
> by hollow boxes(for U+2010) in man pages any more.

For comparison ... here is a patch I've been working on (against less
358) that's a bit more ambitious... it validates the input and
displays invalid UTF-8 similarly to control characters.

Regards,
                                        Owen

diff -u less-358/charset.c less-358-new/charset.c
--- less-358/charset.c	Sat Jul  8 20:26:43 2000
+++ less-358-new/charset.c	Thu May  9 19:09:31 2002
@@ -274,15 +274,17 @@
 /*
  * Return the printable form of a character.
  * For example, in the "ascii" charset '\3' is printed as "^C".
+ * If force_ascii is 1, always print an ascii representation.
  */
 	public char *
-prchar(c)
+prchar(c, force_ascii)
 	int c;
+	int force_ascii;
 {
 	static char buf[8];
 
 	c &= 0377;
-	if (!control_char(c))
+	if (!force_ascii && !control_char(c))
 		sprintf(buf, "%c", c);
 	else if (c == ESC)
 		sprintf(buf, "ESC");
diff -u less-358/cmdbuf.c less-358-new/cmdbuf.c
--- less-358/cmdbuf.c	Sat Jul  8 20:26:43 2000
+++ less-358-new/cmdbuf.c	Thu May  9 19:08:34 2002
@@ -154,7 +154,7 @@
 	clear_eol();
 	for ( ;  *cp != '\0';  cp++)
 	{
-		p = prchar(*cp);
+		p = prchar(*cp, 0);
 		if (cmd_col + strlen(p) >= sc_width)
 			break;
 		putstr(p);
@@ -201,7 +201,7 @@
 	s = cmdbuf + cmd_offset;
 	cols = 0;
 	while (cols < (sc_width - prompt_col) / 2 && *s != '\0')
-		cols += strlen(prchar(*s++));
+		cols += strlen(prchar(*s++, 0));
 
 	cmd_offset = s - cmdbuf;
 	save_cp = cp;
@@ -229,7 +229,7 @@
 	cols = 0;
 	while (cols < (sc_width - prompt_col) / 2 && s > cmdbuf)
 	{
-		p = prchar(*--s);
+		p = prchar(*--s, 0);
 		cols += strlen(p);
 	}
 
@@ -254,7 +254,7 @@
 		 */
 		return (CC_OK);
 	}
-	p = prchar(*cp);
+	p = prchar(*cp, 0);
 	if (cmd_col + strlen(p) >= sc_width)
 		cmd_lshift();
 	else if (cmd_col + strlen(p) == sc_width - 1 && cp[1] != '\0')
@@ -278,7 +278,7 @@
 		/* Already at the beginning of the line */
 		return (CC_OK);
 	}
-	p = prchar(cp[-1]);
+	p = prchar(cp[-1], 0);
 	if (cmd_col < prompt_col + strlen(p))
 		cmd_rshift();
 	cp--;
diff -u less-358/line.c less-358-new/line.c
--- less-358/line.c	Sat Jul  8 20:26:46 2000
+++ less-358-new/line.c	Thu May  9 22:24:04 2002
@@ -16,8 +16,21 @@
  */
 
 #include "less.h"
+#include <stdlib.h>
+
+static const char utf8_skip[256] = {
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
+};
 
 #define IS_CONT(c)  (((c) & 0xC0) == 0x80)
+#define UTF8_SKIP(c) (utf8_skip[(unsigned char)c])
 #define LINENUM_WIDTH   8       /* Chars to use for line number */
 
 /* Buffer which holds the current output line */
@@ -34,11 +47,13 @@
 static int overstrike;		/* Next char should overstrike previous char */
 static int is_null_line;	/* There is no current line */
 static int lmargin;		/* Left margin */
-static char pendc;
+static char pendc[6];		/* Pending characters; enough for 1 UTF-8 character */
+static int pendcount;		/* Count of pending characters */
 static POSITION pendpos;
 static char *end_ansi_chars;
 
 static int do_append();
+static int do_append_ascii();
 
 extern int bs_mode;
 extern int tabstop;
@@ -78,7 +93,7 @@
 	column = 0;
 	overstrike = 0;
 	is_null_line = 0;
-	pendc = '\0';
+	pendcount = 0;
 	lmargin = 0;
 	if (status_col)
 		lmargin += 1;
@@ -398,30 +413,66 @@
 {
 	int r;
 
-	if (pendc)
+	if (pendcount == 1 && pendc[0] == '\r')
 	{
-		if (do_append(pendc, pendpos))
+		if (do_append(pendc, pendcount, pendpos))
 			/*
 			 * Oops.  We've probably lost the char which
 			 * was in pendc, since caller won't back up.
 			 */
 			return (1);
-		pendc = '\0';
+		pendcount = 0;
 	}
 
-	if (c == '\r' && bs_mode == BS_SPECIAL)
+	if (c == '\r' && bs_mode == BS_SPECIAL && pendcount == 0)
 	{
 		/*
 		 * Don't put the CR into the buffer until we see 
 		 * the next char.  If the next char is a newline,
 		 * discard the CR.
 		 */
-		pendc = c;
+		pendcount = 1;
+		pendc[0] = c;
 		pendpos = pos;
 		return (0);
 	}
 
-	r = do_append(c, pos);
+	if (utf_mode)
+	{
+		if (IS_CONT(c)) {
+			pendc[pendcount++] = c;
+			if (pendcount == 1) {
+				/* Continuation character by itself */
+				r = do_append_ascii (pendc, pendcount, pos);
+				pendcount = 0;
+				return r;
+				
+			}
+		} else {
+			if (pendcount != 0) {
+				/* Incomplete multibyte sequence */
+				(void) do_append_ascii (pendc, pendcount, pendpos);
+				pendcount = 0;
+				
+			}
+
+			pendc[pendcount++] = c;
+			pendpos = pos;
+		}
+		
+		if (pendcount < UTF8_SKIP(pendc[0]))
+			return (0);
+	}
+	else
+	{
+		pendc[0] = c;
+		pendcount = 1;
+		pendpos = pos;
+	}
+
+	r = do_append(pendc, pendcount, pendpos);
+	pendcount = 0;
+	
 	/*
 	 * If we need to shift the line, do it.
 	 * But wait until we get to at least the middle of the screen,
@@ -433,30 +484,70 @@
 	return (r);
 }
 
+#define	STOREC(c,a) \
+	if (storec((c),(a),pos)) return (1); else curr++
+
 	static int
-do_append(c, pos)
-	int c;
+do_append_ascii(c, count, pos)
+	char *c;
+	int count;
 	POSITION pos;
 {
 	register char *s;
 	register int a;
+	register int i;
 
-#define	STOREC(c,a) \
-	if (storec((c),(a),pos)) return (1); else curr++
+	while (count--)
+	{
+		/*
+		 * Convert to printable representation.
+		 */
+		s = prchar(*c, 1);
+		a = binattr;
+		
+		/*
+		 * Make sure we can get the entire representation
+		 * of the character on this line.
+		 */
+		if (column + (int) strlen(s) + 
+		    attr_swidth(a) + attr_ewidth(a) > sc_width)
+			return (1);
+		
+		for ( ;  *s != 0;  s++)
+			STOREC(*s, a);
 
-	if (c == '\b')
+		pos++;
+		c++;
+	}
+
+	return (0);
+}
+	
+/*
+ * Append an unsafe character to the line buffer
+ * Returns 0 if ok, 1 if couldn't fit in buffer.
+ */
+
+	static int
+do_append(c, count, pos)
+        char *c;
+	int count;
+	POSITION pos;
+{
+	int tcount;
+	
+	if (c[0] == '\b')
 	{
 		switch (bs_mode)
 		{
 		case BS_NORMAL:
-			STOREC(c, AT_NORMAL);
+			STOREC(c[0], AT_NORMAL);
 			break;
 		case BS_CONTROL:
 			goto do_control_char;
 		case BS_SPECIAL:
 			if (curr == 0)
 				break;
-			backc();
 			overstrike = 1;
 			break;
 		}
@@ -469,18 +560,47 @@
 		 * bold (if an identical character is overstruck),
 		 * or just deletion of the character in the buffer.
 		 */
-		overstrike = 0;
-		if ((char)c == linebuf[curr])
-			STOREC(linebuf[curr], AT_BOLD);
-		else if (c == '_')
-			STOREC(linebuf[curr], AT_UNDERLINE);
-		else if (linebuf[curr] == '_')
-			STOREC(c, AT_UNDERLINE);
-		else if (control_char(c))
+	  	overstrike=0;
+		if (curr >= count && memcmp(&linebuf[curr-count],c,count) == 0) {
+			tcount = count;
+			while (tcount--)
+				backc();
+			while (count--) {
+				STOREC(*c, AT_BOLD);
+				c++;
+			}
+		}
+		else if (c[0] == '_') {
+			backc();
+			if (utf_mode) {
+				while (IS_CONT(linebuf[curr]))
+					backc();
+			}
+			while (count--) {
+				STOREC(*c, AT_UNDERLINE);
+				c++;
+			}
+		}
+		else if (linebuf[curr] == '_') {
+			backc();
+			while (count--) {
+				STOREC(*c, AT_UNDERLINE);
+				c++;
+			}
+		} else if (control_char(c[0]))
 			goto do_control_char;
-		else
-			STOREC(c, AT_NORMAL);
-	} else if (c == '\t') 
+		else {
+			backc();
+			if (utf_mode) {
+				while (IS_CONT(linebuf[curr]))
+					backc();
+			}
+			while (count--) {
+				STOREC(*c, AT_UNDERLINE);
+				c++;
+			}
+		}
+	} else if (c[0] == '\t') 
 	{
 		/*
 		 * Expand a tab into spaces.
@@ -499,10 +619,10 @@
 			} while (((column + cshift - lmargin) % tabstop) != 0);
 			break;
 		}
-	} else if (control_char(c))
+	} else if (control_char(c[0]))
 	{
 	do_control_char:
-		if (ctldisp == OPT_ON || (ctldisp == OPT_ONPLUS && c == ESC))
+		if (ctldisp == OPT_ON || (ctldisp == OPT_ONPLUS && c[0] == ESC))
 		{
 			/*
 			 * Output as a normal character.
@@ -510,26 +630,14 @@
 			STOREC(c, AT_NORMAL);
 		} else 
 		{
-			/*
-			 * Convert to printable representation.
-			 */
-			s = prchar(c);  
-			a = binattr;
-
-			/*
-			 * Make sure we can get the entire representation
-			 * of the character on this line.
-			 */
-			if (column + (int) strlen(s) + 
-			    attr_swidth(a) + attr_ewidth(a) > sc_width)
-				return (1);
-
-			for ( ;  *s != 0;  s++)
-				STOREC(*s, a);
+			return do_append_ascii(c, count, pos);
 		}
 	} else
 	{
-		STOREC(c, AT_NORMAL);
+		while (count--) {
+			STOREC(*c, AT_NORMAL);
+			c++;
+		}
 	}
 
 	return (0);
@@ -542,13 +650,22 @@
 pdone(endline)
 	int endline;
 {
-	if (pendc && (pendc != '\r' || !endline))
-		/*
-		 * If we had a pending character, put it in the buffer.
-		 * But discard a pending CR if we are at end of line
-		 * (that is, discard the CR in a CR/LF sequence).
-		 */
-		(void) do_append(pendc, pendpos);
+	if (pendcount) {
+		if (pendcount == 1 && pendc[0] == '\r') {
+			/*
+			 * Discard a pending CR if we are at end of line
+			 * (that is, discard the CR in a CR/LF sequence).
+			 */
+
+			if (!endline)
+				(void) do_append(pendc, pendcount, pendpos);
+		} else {
+			/*
+			 * Otherwise, pendc[] holds an incomplete multibyte sequence
+			 */
+			(void) do_append_ascii(pendc, pendcount, pendpos);
+		}
+	}
 
 	/*
 	 * Make sure we've shifted the line, if we need to.
diff -u less-358/option.c less-358-new/option.c
--- less-358/option.c	Sat Jul  8 20:26:47 2000
+++ less-358-new/option.c	Thu May  9 19:08:42 2002
@@ -467,7 +467,7 @@
 {
 	static char buf[8];
 
-	sprintf(buf, "-%s", prchar(c));
+	sprintf(buf, "-%s", prchar(c, 0));
 	return (buf);
 }
 

Reply via email to