Jungshik Shin <[EMAIL PROTECTED]> writes:
> On Mon, 20 May 2002, Jungshik Shin wrote:
> > On Mon, 20 May 2002, Zvi Har'El wrote:
>
> > > I am using less on a UTF-8 Redhat Linux 7.3 machine. I am having trouble
> > > using man, because overstriking is not handled properly.
> > > I read the
>
> > Anyway, attached is a *simplistic* (not perfect) patch against
> > less 374 (the newest at the less home page).
>
> Attached is a newer patch. This one works better than the previous
> one, although it still has the problem mentioned in my previous
> message (and quoted below). With this, two hyphens in a row (U+2010) overstruck
> with themselves are rendered correctly. I don't have to be bothered
> by hollow boxes (for U+2010) in man pages any more.
For comparison, here is a patch I've been working on (against less
358) that's a bit more ambitious: it validates the input and
displays invalid UTF-8 similarly to control characters.
Regards,
Owen
diff -u less-358/charset.c less-358-new/charset.c
--- less-358/charset.c Sat Jul 8 20:26:43 2000
+++ less-358-new/charset.c Thu May 9 19:09:31 2002
@@ -274,15 +274,17 @@
/*
* Return the printable form of a character.
* For example, in the "ascii" charset '\3' is printed as "^C".
+ * If force_ascii is 1, always print an ascii representation.
*/
public char *
-prchar(c)
+prchar(c, force_ascii)
int c;
+ int force_ascii;
{
static char buf[8];
c &= 0377;
- if (!control_char(c))
+ if (!force_ascii && !control_char(c))
sprintf(buf, "%c", c);
else if (c == ESC)
sprintf(buf, "ESC");
diff -u less-358/cmdbuf.c less-358-new/cmdbuf.c
--- less-358/cmdbuf.c Sat Jul 8 20:26:43 2000
+++ less-358-new/cmdbuf.c Thu May 9 19:08:34 2002
@@ -154,7 +154,7 @@
clear_eol();
for ( ; *cp != '\0'; cp++)
{
- p = prchar(*cp);
+ p = prchar(*cp, 0);
if (cmd_col + strlen(p) >= sc_width)
break;
putstr(p);
@@ -201,7 +201,7 @@
s = cmdbuf + cmd_offset;
cols = 0;
while (cols < (sc_width - prompt_col) / 2 && *s != '\0')
- cols += strlen(prchar(*s++));
+ cols += strlen(prchar(*s++, 0));
cmd_offset = s - cmdbuf;
save_cp = cp;
@@ -229,7 +229,7 @@
cols = 0;
while (cols < (sc_width - prompt_col) / 2 && s > cmdbuf)
{
- p = prchar(*--s);
+ p = prchar(*--s, 0);
cols += strlen(p);
}
@@ -254,7 +254,7 @@
*/
return (CC_OK);
}
- p = prchar(*cp);
+ p = prchar(*cp, 0);
if (cmd_col + strlen(p) >= sc_width)
cmd_lshift();
else if (cmd_col + strlen(p) == sc_width - 1 && cp[1] != '\0')
@@ -278,7 +278,7 @@
/* Already at the beginning of the line */
return (CC_OK);
}
- p = prchar(cp[-1]);
+ p = prchar(cp[-1], 0);
if (cmd_col < prompt_col + strlen(p))
cmd_rshift();
cp--;
diff -u less-358/line.c less-358-new/line.c
--- less-358/line.c Sat Jul 8 20:26:46 2000
+++ less-358-new/line.c Thu May 9 22:24:04 2002
@@ -16,8 +16,21 @@
*/
#include "less.h"
+#include <stdlib.h>
+
+static const char utf8_skip[256] = {
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
+};
#define IS_CONT(c) (((c) & 0xC0) == 0x80)
+#define UTF8_SKIP(c) (utf8_skip[(unsigned char)c])
#define LINENUM_WIDTH 8 /* Chars to use for line number */
/* Buffer which holds the current output line */
@@ -34,11 +47,13 @@
static int overstrike; /* Next char should overstrike previous char */
static int is_null_line; /* There is no current line */
static int lmargin; /* Left margin */
-static char pendc;
+static char pendc[6]; /* Pending characters; enough for 1 UTF-8 character */
+static int pendcount; /* Count of pending characters */
static POSITION pendpos;
static char *end_ansi_chars;
static int do_append();
+static int do_append_ascii();
extern int bs_mode;
extern int tabstop;
@@ -78,7 +93,7 @@
column = 0;
overstrike = 0;
is_null_line = 0;
- pendc = '\0';
+ pendcount = 0;
lmargin = 0;
if (status_col)
lmargin += 1;
@@ -398,30 +413,66 @@
{
int r;
- if (pendc)
+ if (pendcount == 1 && pendc[0] == '\r')
{
- if (do_append(pendc, pendpos))
+ if (do_append(pendc, pendcount, pendpos))
/*
* Oops. We've probably lost the char which
* was in pendc, since caller won't back up.
*/
return (1);
- pendc = '\0';
+ pendcount = 0;
}
- if (c == '\r' && bs_mode == BS_SPECIAL)
+ if (c == '\r' && bs_mode == BS_SPECIAL && pendcount == 0)
{
/*
* Don't put the CR into the buffer until we see
* the next char. If the next char is a newline,
* discard the CR.
*/
- pendc = c;
+ pendcount = 1;
+ pendc[0] = c;
pendpos = pos;
return (0);
}
- r = do_append(c, pos);
+ if (utf_mode)
+ {
+ if (IS_CONT(c)) {
+ pendc[pendcount++] = c;
+ if (pendcount == 1) {
+ /* Continuation character by itself */
+ r = do_append_ascii (pendc, pendcount, pos);
+ pendcount = 0;
+ return r;
+
+ }
+ } else {
+ if (pendcount != 0) {
+ /* Incomplete multibyte sequence */
+ (void) do_append_ascii (pendc, pendcount, pendpos);
+ pendcount = 0;
+
+ }
+
+ pendc[pendcount++] = c;
+ pendpos = pos;
+ }
+
+ if (pendcount < UTF8_SKIP(pendc[0]))
+ return (0);
+ }
+ else
+ {
+ pendc[0] = c;
+ pendcount = 1;
+ pendpos = pos;
+ }
+
+ r = do_append(pendc, pendcount, pendpos);
+ pendcount = 0;
+
/*
* If we need to shift the line, do it.
* But wait until we get to at least the middle of the screen,
@@ -433,30 +484,70 @@
return (r);
}
+#define STOREC(c,a) \
+ if (storec((c),(a),pos)) return (1); else curr++
+
static int
-do_append(c, pos)
- int c;
+do_append_ascii(c, count, pos)
+ char *c;
+ int count;
POSITION pos;
{
register char *s;
register int a;
+ register int i;
-#define STOREC(c,a) \
- if (storec((c),(a),pos)) return (1); else curr++
+ while (count--)
+ {
+ /*
+ * Convert to printable representation.
+ */
+ s = prchar(*c, 1);
+ a = binattr;
+
+ /*
+ * Make sure we can get the entire representation
+ * of the character on this line.
+ */
+ if (column + (int) strlen(s) +
+ attr_swidth(a) + attr_ewidth(a) > sc_width)
+ return (1);
+
+ for ( ; *s != 0; s++)
+ STOREC(*s, a);
- if (c == '\b')
+ pos++;
+ c++;
+ }
+
+ return (0);
+}
+
+/*
+ * Append an unsafe character to the line buffer
+ * Returns 0 if ok, 1 if couldn't fit in buffer.
+ */
+
+ static int
+do_append(c, count, pos)
+ char *c;
+ int count;
+ POSITION pos;
+{
+ int tcount;
+
+ if (c[0] == '\b')
{
switch (bs_mode)
{
case BS_NORMAL:
- STOREC(c, AT_NORMAL);
+ STOREC(c[0], AT_NORMAL);
break;
case BS_CONTROL:
goto do_control_char;
case BS_SPECIAL:
if (curr == 0)
break;
- backc();
overstrike = 1;
break;
}
@@ -469,18 +560,47 @@
* bold (if an identical character is overstruck),
* or just deletion of the character in the buffer.
*/
- overstrike = 0;
- if ((char)c == linebuf[curr])
- STOREC(linebuf[curr], AT_BOLD);
- else if (c == '_')
- STOREC(linebuf[curr], AT_UNDERLINE);
- else if (linebuf[curr] == '_')
- STOREC(c, AT_UNDERLINE);
- else if (control_char(c))
+ overstrike=0;
+ if (curr >= count && memcmp(&linebuf[curr-count],c,count) == 0) {
+ tcount = count;
+ while (tcount--)
+ backc();
+ while (count--) {
+ STOREC(*c, AT_BOLD);
+ c++;
+ }
+ }
+ else if (c[0] == '_') {
+ backc();
+ if (utf_mode) {
+ while (IS_CONT(linebuf[curr]))
+ backc();
+ }
+ while (count--) {
+ STOREC(*c, AT_UNDERLINE);
+ c++;
+ }
+ }
+ else if (linebuf[curr] == '_') {
+ backc();
+ while (count--) {
+ STOREC(*c, AT_UNDERLINE);
+ c++;
+ }
+ } else if (control_char(c[0]))
goto do_control_char;
- else
- STOREC(c, AT_NORMAL);
- } else if (c == '\t')
+ else {
+ backc();
+ if (utf_mode) {
+ while (IS_CONT(linebuf[curr]))
+ backc();
+ }
+ while (count--) {
+ STOREC(*c, AT_UNDERLINE);
+ c++;
+ }
+ }
+ } else if (c[0] == '\t')
{
/*
* Expand a tab into spaces.
@@ -499,10 +619,10 @@
} while (((column + cshift - lmargin) % tabstop) != 0);
break;
}
- } else if (control_char(c))
+ } else if (control_char(c[0]))
{
do_control_char:
- if (ctldisp == OPT_ON || (ctldisp == OPT_ONPLUS && c == ESC))
+ if (ctldisp == OPT_ON || (ctldisp == OPT_ONPLUS && c[0] == ESC))
{
/*
* Output as a normal character.
@@ -510,26 +630,14 @@
STOREC(c, AT_NORMAL);
} else
{
- /*
- * Convert to printable representation.
- */
- s = prchar(c);
- a = binattr;
-
- /*
- * Make sure we can get the entire representation
- * of the character on this line.
- */
- if (column + (int) strlen(s) +
- attr_swidth(a) + attr_ewidth(a) > sc_width)
- return (1);
-
- for ( ; *s != 0; s++)
- STOREC(*s, a);
+ return do_append_ascii(c, count, pos);
}
} else
{
- STOREC(c, AT_NORMAL);
+ while (count--) {
+ STOREC(*c, AT_NORMAL);
+ c++;
+ }
}
return (0);
@@ -542,13 +650,22 @@
pdone(endline)
int endline;
{
- if (pendc && (pendc != '\r' || !endline))
- /*
- * If we had a pending character, put it in the buffer.
- * But discard a pending CR if we are at end of line
- * (that is, discard the CR in a CR/LF sequence).
- */
- (void) do_append(pendc, pendpos);
+ if (pendcount) {
+ if (pendcount == 1 && pendc[0] == '\r') {
+ /*
+ * Discard a pending CR if we are at end of line
+ * (that is, discard the CR in a CR/LF sequence).
+ */
+
+ if (!endline)
+ (void) do_append(pendc, pendcount, pendpos);
+ } else {
+ /*
+ * Otherwise, pendc[] holds an incomplete multibyte sequence
+ */
+ (void) do_append_ascii(pendc, pendcount, pendpos);
+ }
+ }
/*
* Make sure we've shifted the line, if we need to.
diff -u less-358/option.c less-358-new/option.c
--- less-358/option.c Sat Jul 8 20:26:47 2000
+++ less-358-new/option.c Thu May 9 19:08:42 2002
@@ -467,7 +467,7 @@
{
static char buf[8];
- sprintf(buf, "-%s", prchar(c));
+ sprintf(buf, "-%s", prchar(c, 0));
return (buf);
}