Bug#211217: most: Fixed UTF-8 output
Hi John, mako, El lun, 05-12-2005 a las 11:52 -0500, John E. Davis escribió: On Mon, 5 Dec 2005 10:39:46 -0500, Benj. Mako Hill [EMAIL PROTECTED] said: UTF-8 enabled Most, with working (and enabled by default during build) UTF-8 compliant RegExp searches. Wonderful! I've test out the patch and it works great on my system. I've applied it in whole. While it is a good start for UTF-8, it will require more work to integrate. For example, the patch to buffer.c:forward_columns does not appear to properly handle tab characters, embedded backspaces, etc. Such backspaces are used by manpages to simulate an overstrike, underline, etc, e.g., The attached patch fixes this case, plus other minor glitches with multi-byte characters. There is only one important glitch that I haven't been able to fix, and seems like a bug in Slang or Gnome-Terminal. The attached file most-test-long is displayed as expected, but some of the characters in most-test-long-fmt wrap to the second line of the buffer. This seems to be an artifact of formatting. As you'll see if you scroll the buffer to the right, the few bold and underlined characters are sometimes displayed with different spacing around them than in the plain version of the document. I've verified with the code below that most_analyse_line extracts at most 80 columns (for my terminal) worth of text. However, the fact that the terminal adds extra spacing when formatting is applied, causes the output to wrap. 
len = most_analyse_line(beg, end, line, attr); p = line; for (i = 0; i strlen(attr); i++) { SLwchar_Type wc; p = SLutf8_decode(p, line + line_len, wc, NULL); if (p) wc_len += SLwchar_wcwidth(wc); } assert(wc_len = SLtt_Screen_Cols); Cheers, -- Javier Kohen [EMAIL PROTECTED] ICQ: blashyrkh #2361802 Jabber: [EMAIL PROTECTED] aáŴÓХथሰਗ਼בּホビХथሰਗ਼בּホビ aáŴÓХथሰਗ਼בּホビХथሰਗ਼בּホビ aáŴÓХथሰਗ਼בּホビХथሰਗ਼בּホビ aáŴÓХथሰਗ਼בּホビХथሰਗ਼בּホビ aáŴÓХथሰਗ਼בּホビХथሰਗ਼בּホビ aáŴÓХथሰਗ਼בּホビХथሰਗ਼בּホビ aáホホビ_ビ_Хथሰਗ਼בּホビ aáŴÓХथሰਗ਼בּホビХथሰਗ਼בּホビ aáŴÓХथሰਗ਼בּホビХथሰਗ਼בּホビ aáŴÓХथሰਗ਼בּホビХथሰਗ਼בּホビ aáŴÓХथሰਗ਼בּホビХथሰਗ਼בּホビ aáŴÓХथሰਗ਼בּホビХथሰਗ਼בּホビ Sólo en most-4.10.2: config.log Sólo en most-4.10.2: config.status diff -ur most-4.10.2-2.debian-orig/debian/changelog most-4.10.2/debian/changelog --- most-4.10.2-2.debian-orig/debian/changelog 2005-12-05 21:51:17.0 -0300 +++ most-4.10.2/debian/changelog 2005-12-06 04:57:47.0 -0300 @@ -1,3 +1,10 @@ +most (4.10.2-2.0.1) unstable; urgency=low + + * Fixed minor glitches when displaying multi-byte characters. + * Fixed wrapped mode when using formatted output. + + -- Javier Kohen [EMAIL PROTECTED] Tue, 6 Dec 2005 04:57:01 -0300 + most (4.10.2-2) unstable; urgency=low * Patch from Javier Kohen to rework (and fix) RegExp searches, so they now Sólo en most-4.10.2: Makefile diff -ur most-4.10.2-2.debian-orig/src/buffer.c most-4.10.2/src/buffer.c --- most-4.10.2-2.debian-orig/src/buffer.c 2005-12-05 21:51:17.0 -0300 +++ most-4.10.2/src/buffer.c 2005-12-06 04:48:06.0 -0300 @@ -62,11 +62,13 @@ if (*pos == '\n') { pos--; /* Skip back the new-line. */ - while ((pos Most_Beg) - (*pos != '\n')) - pos = SLutf8_bskip_char(Most_Beg, pos); + /* This block is UTF-8 safe, because it only scans the + buffer for a new-line, and doesn't count + characters. 
*/ + while ((pos Most_Beg) (*pos != '\n')) + pos--; - if (*pos != '\n') return pos; + if (*pos != '\n') return Most_Beg; /* from here on *pos == '\n' */ if (pos + 1 != cpos) return pos + 1; @@ -77,9 +79,9 @@ if (*pos != '\n') { - while ((pos Most_Beg) - (*pos != '\n')) - pos = SLutf8_bskip_char(Most_Beg, pos); + /* This block is UTF-8 safe. See comment above. */ + while ((pos Most_Beg) (*pos != '\n')) + pos--; if (*pos != '\n') return Most_Beg; /* from here on *pos == '\n' */ return pos + 1; @@ -96,58 +98,6 @@ return pos; } - -static unsigned char *forward_columns (unsigned char *b, unsigned char *e, unsigned int num_cols) -{ - unsigned int col = 0; - - if (Most_UTF8_Mode) - return SLutf8_skip_chars(b, e, num_cols, col, 0); - - while ((b e) - (col num_cols)) - { - unsigned char ch = *b++; - - if (most_isprint(ch)) - { - col++; - continue; - } - - if ((ch == '\b') || (ch == '\t') || (ch == '\r')) - switch (ch) - { - case '\b': - if (Most_V_Opt == 0) - { - if (col 0) col--; - } - else col += 2; - break; - - case '\r': - if (Most_V_Opt == 0) - col = 0; - else - col += 2; - break; - - case '\t': - if (Most_T_Opt == 0) - col = Most_Tab_Width * (col/Most_Tab_Width + 1); - else - col += 2; - break; - } - else if (ch 0x80) - col += 3; - else - col += 2; - } - return b; -} - /* does not move point */ static unsigned char *end_of_line1(void) { @@ -170,6 +120,9 @@ if (*pos !=
Bug#211217: most: Fixed UTF-8 output
quote who=Javier Kohen date=Sun, Dec 04, 2005 at 11:59:06AM -0300 UTF-8 enabled Most, with working (and enabled by default during build) UTF-8 compliant RegExp searches. Wonderful! I've tested out the patch and it works great on my system. I've applied it in whole. Check it out while it's hot. I've tested all I could, including forward and backward searches. Note that non-RegExp searches should not be more broken than before, as I tried to avoid disturbing them. However, they didn't work with UTF-8 or formatted text. I don't think this is a problem, as the user can't switch RegExp searches off at run time. Agreed. Most could use some improvement in general, for instance, to allow backward searches to happen from the end of the buffer, and not just from the beginning of the screen at the end of the buffer. I suggest we get this tested first, and then move on to addressing the remaining issues. Agreed again. By the way, you might want to merge this bug with bug #341187. The bugs are, I think, distinct so I'm going to leave them separate. I will, however, close both bugs with the next upload to Debian which I'll send off in about 5 minutes. JED: Please let me know if you take this patch so I can sync it and remove the patch from the Debian/Ubuntu diff. Regards, Mako -- Benjamin Mako Hill [EMAIL PROTECTED] http://mako.cc/ signature.asc Description: Digital signature
Bug#211217: most: Fixed UTF-8 output
On Mon, 5 Dec 2005 10:39:46 -0500, Benj. Mako Hill [EMAIL PROTECTED] said: UTF-8 enabled Most, with working (and enabled by default during build) UTF-8 compliant RegExp searches. Wonderful! I've tested out the patch and it works great on my system. I've applied it in whole. While it is a good start for UTF-8, it will require more work to integrate. For example, the patch to buffer.c:forward_columns does not appear to properly handle tab characters, embedded backspaces, etc. Such backspaces are used by manpages to simulate an overstrike, underline, etc, e.g., This is B\bBO\bOL\bLD\bD This is BOLD\rBOLD This is U\b_N\b_D\b_E\b_R\b_L\b_I\b_N\b_E\b_D\b_ Accounting for such constructs greatly complicates searches, etc. I welcome the new manpage format that uses ANSI escape sequences but I do not know how widespread their use is. Thanks, --John -- To UNSUBSCRIBE, email to [EMAIL PROTECTED] with a subject of unsubscribe. Trouble? Contact [EMAIL PROTECTED]
Bug#211217: most: Fixed UTF-8 output
El lun, 05-12-2005 a las 11:52 -0500, John E. Davis escribió: On Mon, 5 Dec 2005 10:39:46 -0500, Benj. Mako Hill [EMAIL PROTECTED] said: UTF-8 enabled Most, with working (and enabled by default during build) UTF-8 compliant RegExp searches. Wonderful! I've tested out the patch and it works great on my system. I've applied it in whole. While it is a good start for UTF-8, it will require more work to integrate. For example, the patch to buffer.c:forward_columns does not appear to properly handle tab characters, embedded backspaces, etc. Such backspaces are used by manpages to simulate an overstrike, That's true, and also easy to fix. I think I oversimplified that one. It should be the only case, though, note that SLutf8_skip_char*s* is only being called from forward_columns; the rest of the calls to SLutf8_skip_char are integrated in the loops so that single-byte characters remain handled exactly as before. I'll get back to you with a patch on top of most 4.10.2-2 soon, probably tonight. If you can find more pathological cases, please send me a case to test. Greetings, -- Javier Kohen [EMAIL PROTECTED] ICQ: blashyrkh #2361802 Jabber: [EMAIL PROTECTED] signature.asc Description: This is a digitally signed message part
Bug#211217: most: Fixed UTF-8 output
On Mon, 05 Dec 2005 14:21:43 -0300, Javier Kohen [EMAIL PROTECTED] said: I'll get back to you with a patch on top of most 4.10.2-2 soon, probably tonight. I appreciate it. Thanks, --John -- To UNSUBSCRIBE, email to [EMAIL PROTECTED] with a subject of unsubscribe. Trouble? Contact [EMAIL PROTECTED]
Bug#211217: most: Fixed UTF-8 output
El lun, 28-11-2005 a las 16:01 -0500, Benj. Mako Hill escribió: quote who=Javier Kohen date=Mon, Nov 28, 2005 at 10:52:33AM -0300 I wrote a patch that fixes the UTF-8 output issue. Seems to work fine in non-wrap and wrap modes, with unformatted and formatted text (e.g. bold text from man's output). Give it a try and let me know if you find any bugs. If this works, I owe you several drinks. Hopefully, I'll be able to try it out tonight. I'll do an upload immediately if it works. I'll try to fix the UTF-8 input issues next. Here we have... UTF-8 enabled Most, with working (and enabled by default during build) UTF-8 compliant RegExp searches. Check it out while it's hot. I've tested all I could, including forward and backward searches. Note that non-RegExp searches should not be more broken than before, as I tried to avoid disturbing them. However, they didn't work with UTF-8 or formatted text. I don't think this is a problem, as the user can't switch RegExp searches off in run-time. Most could use some improvement in general, for instance, to allow backward searches to happen from the end of the buffer, and not just from the beginning of the screen at the end of the buffer. I suggest we get this tested first, and then move onto addressing the remaining issues. By the way, you might want to merge this bug with bug #341187. Greetings, -- Javier Kohen [EMAIL PROTECTED] ICQ: blashyrkh #2361802 Jabber: [EMAIL PROTECTED] Sólo en most-4.10.2: build-stamp Sólo en most-4.10.2: config.log Sólo en most-4.10.2: config.status diff -ur most-4.10.2-1.debian-orig/debian/changelog most-4.10.2/debian/changelog --- most-4.10.2-1.debian-orig/debian/changelog 2005-12-04 04:43:47.0 -0300 +++ most-4.10.2/debian/changelog 2005-12-04 04:06:21.0 -0300 @@ -1,3 +1,10 @@ +most (4.10.2-1.0.1) unstable; urgency=low + + * Reworked RegExp searches, so they now work. + * Properly handle UTF-8 data. 
+ + -- Javier Kohen [EMAIL PROTECTED] Mon, 28 Nov 2005 04:07:10 -0300 + most (4.10.2-1) unstable; urgency=low * New upstream release. Sólo en most-4.10.2/debian: files Sólo en most-4.10.2/debian: most Sólo en most-4.10.2/debian: most.substvars Sólo en most-4.10.2: Makefile diff -ur most-4.10.2-1.debian-orig/src/buffer.c most-4.10.2/src/buffer.c --- most-4.10.2-1.debian-orig/src/buffer.c 2005-12-04 04:43:47.0 -0300 +++ most-4.10.2/src/buffer.c 2005-11-29 03:42:09.0 -0300 @@ -61,24 +61,27 @@ { if (*pos == '\n') { - pos--; + pos--; /* Skip back the new-line. */ while ((pos Most_Beg) (*pos != '\n')) - pos--; + pos = SLutf8_bskip_char(Most_Beg, pos); if (*pos != '\n') return pos; + /* from here on *pos == '\n' */ if (pos + 1 != cpos) return pos + 1; } } - else pos--; + else + pos = SLutf8_bskip_char(Most_Beg, pos); if (*pos != '\n') { while ((pos Most_Beg) (*pos != '\n')) - pos--; + pos = SLutf8_bskip_char(Most_Beg, pos); if (*pos != '\n') return Most_Beg; + /* from here on *pos == '\n' */ return pos + 1; } @@ -98,12 +101,15 @@ { unsigned int col = 0; + if (Most_UTF8_Mode) + return SLutf8_skip_chars(b, e, num_cols, col, 0); + while ((b e) (col num_cols)) { unsigned char ch = *b++; - if (((ch = ' ') (ch 0x7F)) - || (ch = SLsmg_Display_Eight_Bit)) + + if (most_isprint(ch)) { col++; continue; @@ -545,7 +551,10 @@ /* Now we have found the line it is on so */ beg = most_beg_of_line(); *c = 1; - while (beg++ pos) *c = *c + 1; + if (Most_UTF8_Mode) + while ((beg = SLutf8_skip_char(beg, pos)) pos) *c = *c + 1; + else + while (beg++ pos) *c = *c + 1; Most_C_Line = save_line; Most_C_Offset = save_offset; } Sólo en most-4.10.2/src: config.h diff -ur most-4.10.2-1.debian-orig/src/keym.c most-4.10.2/src/keym.c --- most-4.10.2-1.debian-orig/src/keym.c 2005-12-04 04:43:47.0 -0300 +++ most-4.10.2/src/keym.c 2005-12-04 04:41:13.0 -0300 @@ -309,7 +309,7 @@ #else Search: , #endif - Most_Search_Str, + (char *) Most_Search_Str, MOST_SEARCH_BUF_LEN ) == -1) return; Most_Curs_Offset = 
Most_C_Offset; @@ -325,7 +325,7 @@ #else Search Backwards: , #endif - Most_Search_Str, + (char *) Most_Search_Str, MOST_SEARCH_BUF_LEN) == -1) return; find_next_cmd(); } diff -ur most-4.10.2-1.debian-orig/src/line.c most-4.10.2/src/line.c --- most-4.10.2-1.debian-orig/src/line.c 2005-12-04 04:43:47.0 -0300 +++ most-4.10.2/src/line.c 2005-11-29 03:51:31.0 -0300 @@ -87,8 +87,7 @@ while (b end) { ch = *b++; - if (((ch = ' ') (ch 0x7F)) - || (ch = SLsmg_Display_Eight_Bit)) + if (most_isprint(ch)) { *s++ = ch; continue; @@ -114,15 +113,22 @@ SLsmg_erase_eol (); } +int most_isprint(unsigned char ch) +{ + /* Can this be directly replaced with isprint? */ +
Bug#211217: most: Fixed UTF-8 output
Package: most Version: 4.10.2-2 Followup-For: Bug #211217 -BEGIN PGP SIGNED MESSAGE- Hash: SHA1 I wrote a patch that fixes the UTF-8 output issue. Seems to work fine in non-wrap and wrap modes, with unformatted and formatted text (e.g. bold text from man's output). Give it a try and let me know if you find any bugs. I'll try to fix the UTF-8 input issues next. - -- System Information: Debian Release: testing/unstable APT prefers unstable APT policy: (500, 'unstable') Architecture: i386 (i686) Shell: /bin/sh linked to /bin/bash Kernel: Linux 2.6.14-ck5 Locale: LANG=es_AR.UTF-8, LC_CTYPE=es_AR.UTF-8 (charmap=UTF-8) Versions of packages most depends on: ii libc6 2.3.5-8GNU C Library: Shared libraries an ii libslang2 2.0.5-1The S-Lang programming library - r most recommends no packages. - -- no debconf information -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.2 (GNU/Linux) iD8DBQFDiwuh823633cP2P8RAmZGAJ9mXBCT6PAPEYlQUgoh6L4lOMd8HACgkPzM 9RnEFzFh7lsmbm3Hm9Gh/MA= =VmtO -END PGP SIGNATURE- Sólo en most-4.10.2.jkohen: build-stamp Sólo en most-4.10.2.jkohen: config.log Sólo en most-4.10.2.jkohen: config.status Sólo en most-4.10.2.jkohen/debian: files Sólo en most-4.10.2.jkohen/debian: most Sólo en most-4.10.2.jkohen/debian: most.substvars Sólo en most-4.10.2.jkohen: Makefile diff -ur most-4.10.2.orig/src/buffer.c most-4.10.2.jkohen/src/buffer.c --- most-4.10.2.orig/src/buffer.c 2005-11-28 04:04:13.0 -0300 +++ most-4.10.2.jkohen/src/buffer.c 2005-11-28 04:00:46.0 -0300 @@ -61,24 +61,27 @@ { if (*pos == '\n') { - pos--; + pos--; /* Skip back the new-line. 
*/ while ((pos Most_Beg) (*pos != '\n')) - pos--; + pos = SLutf8_bskip_char(Most_Beg, pos); if (*pos != '\n') return pos; + /* from here on *pos == '\n' */ if (pos + 1 != cpos) return pos + 1; } } - else pos--; + else + pos = SLutf8_bskip_char(Most_Beg, pos); if (*pos != '\n') { while ((pos Most_Beg) (*pos != '\n')) - pos--; + pos = SLutf8_bskip_char(Most_Beg, pos); if (*pos != '\n') return Most_Beg; + /* from here on *pos == '\n' */ return pos + 1; } @@ -98,12 +101,15 @@ { unsigned int col = 0; + if (Most_UTF8_Mode) + return SLutf8_skip_chars(b, e, num_cols, col, 0); + while ((b e) (col num_cols)) { unsigned char ch = *b++; - if (((ch = ' ') (ch 0x7F)) - || (ch = SLsmg_Display_Eight_Bit)) + + if (most_isprint(ch)) { col++; continue; Sólo en most-4.10.2.jkohen/src: config.h diff -ur most-4.10.2.orig/src/line.c most-4.10.2.jkohen/src/line.c --- most-4.10.2.orig/src/line.c 2005-11-28 04:04:13.0 -0300 +++ most-4.10.2.jkohen/src/line.c 2005-11-28 03:37:34.0 -0300 @@ -87,8 +87,7 @@ while (b end) { ch = *b++; - if (((ch = ' ') (ch 0x7F)) - || (ch = SLsmg_Display_Eight_Bit)) + if (most_isprint(ch)) { *s++ = ch; continue; @@ -114,15 +113,22 @@ SLsmg_erase_eol (); } +int most_isprint(unsigned char ch) +{ + /* Can this be directly replaced with isprint? 
*/ + return (ch = ' ' ch 0x7F) || ch = SLsmg_Display_Eight_Bit; +} + static int most_analyse_line(unsigned char *begg, unsigned char *endd, - char *out, char *attributes) + unsigned char *out, char *attributes) { - unsigned char *beg, *end; + unsigned char *beg, *end, *pout; unsigned int min_col, max_col; unsigned int i, i_max; beg = begg; end = endd; + pout = out; i = i_max = 0; min_col = Most_Column - 1; max_col = min_col + SLtt_Screen_Cols; @@ -130,9 +136,9 @@ while (beg end) { char attr = ' '; - unsigned char ch; + unsigned char ch = *beg++; - if ('\n' == (ch = *beg++)) + if ('\n' == ch) break; if ((ch == '\r') (Most_V_Opt == 0)) @@ -146,7 +152,10 @@ { if (i i_max) i_max = i; if (i 0) - i--; + { + pout = SLutf8_bskip_char(out, pout); + i--; + } continue; } @@ -155,12 +164,12 @@ attr = 'b'; if ((i = min_col) (i max_col)) { - if (out[i-min_col] == '_') + if (*pout == '_') attr = 'u'; else if (ch == '_') { attr = 'u'; - ch = out[i - min_col]; + ch = *pout; } } if (ch == ' ') @@ -170,23 +179,30 @@ } /* drop */ } - - if ((ch = ' ') (ch 0x7F)) - { + + if (Most_UTF8_Mode) { + unsigned char *prev = --beg; + int len; + beg = SLutf8_skip_char(beg, end); + len = beg - prev; + if (len 1) { + /* Non-ASCII char, display it. */ if ((i = min_col) (i max_col)) { - out[i-min_col] = ch; + memcpy(pout, prev, len); + pout += len; attributes[i-min_col] = attr; } i++; continue; - } - - if (ch = SLsmg_Display_Eight_Bit) + } + } + + if
Bug#211217: most: Fixed UTF-8 output
quote who=Javier Kohen date=Mon, Nov 28, 2005 at 10:52:33AM -0300 I wrote a patch that fixes the UTF-8 output issue. Seems to work fine in non-wrap and wrap modes, with unformatted and formatted text (e.g. bold text from man's output). Give it a try and let me know if you find any bugs. If this works, I owe you several drinks. Hopefully, I'll be able to try it out tonight. I'll do an upload immediately if it works. I'll try to fix the UTF-8 input issues next. My hero. Regards, Mako -- Benjamin Mako Hill [EMAIL PROTECTED] http://mako.cc/ -- To UNSUBSCRIBE, email to [EMAIL PROTECTED] with a subject of unsubscribe. Trouble? Contact [EMAIL PROTECTED]