Re: xterm and wcwidth()
Hi, Ted Unangst wrote on Fri, Mar 08, 2019 at 03:24:56PM -0500: > Matthieu Herrb wrote: >> I would prefer a diff that just add a &&!defined(__OpenBSD__) to the >> condition before the definition of systemWcwidthOk(). This will cause >> less risk of conflicts in future updates and clearly show the >> intention. > If you prefer that, I would suggest the following to avoid patching multiple > places. Just short circuit the function. Makes sense, i committed that version. Thanks for reporting and for all the feedback, Ingo >>> #if OPT_WIDE_CHARS >>> -#if defined(HAVE_WCHAR_H) && defined(HAVE_WCWIDTH) >>> -/* >>> - * If xterm is running in a UTF-8 locale, it is still possible to encounter >>> - * old runtime configurations which yield incomplete or inaccurate data. >>> - */ >>> -static Bool >>> -systemWcwidthOk(int samplesize, int samplepass) >>> -{ >>> -wchar_t n; >>> -int oops = 0; > #ifdef __OpenBSD__ > return 1; > #endif >>> - >>> -for (n = 21; n <= 25; ++n) { >>> - wchar_t code = (wchar_t) dec2ucs(NULL, (unsigned) n); >>> - int system_code = wcwidth(code); >>> - int intern_code = mk_wcwidth(code);
Re: xterm and wcwidth()
Matthieu Herrb wrote: > I would prefer a diff that just add a &&!defined(__OpenBSD__) to the > condition before the definition of systemWcwidthOk(). This will cause > less risk of conflicts in future updates and clearly show the > intention. If you prefer that, I would suggest the following to avoid patching multiple places. Just short circuit the function. > > #if OPT_WIDE_CHARS > > -#if defined(HAVE_WCHAR_H) && defined(HAVE_WCWIDTH) > > -/* > > - * If xterm is running in a UTF-8 locale, it is still possible to encounter > > - * old runtime configurations which yield incomplete or inaccurate data. > > - */ > > -static Bool > > -systemWcwidthOk(int samplesize, int samplepass) > > -{ > > -wchar_t n; > > -int oops = 0; #ifdef __OpenBSD__ return 1; #endif > > - > > -for (n = 21; n <= 25; ++n) { > > - wchar_t code = (wchar_t) dec2ucs(NULL, (unsigned) n); > > - int system_code = wcwidth(code); > > - int intern_code = mk_wcwidth(code);
Re: xterm and wcwidth()
On Fri, Mar 08, 2019 at 02:39:32PM +0100, Ingo Schwarze wrote: > Hi, > > Lauri Tirkkonen wrote on Fri, Mar 08, 2019 at 12:43:11PM +0200: > > > I feel like xterm should just use the system wcwidth() to avoid these > > mismatches, so rudimentary diff to do that below. > > Absolutely, i strongly agree with that sentiment. > > Having a local character width table in an application program is > just wrong. Archaic operating systems like Oracle Solaris 11 that > have broken wcwidth(3) implementations should either fix their C > library or go to hell. > > I also agree with the details of the diff. It is minimal in so far > as it changes only one file, keeping the annoyance minimal when merging > new xterm releases. The function systemWcwidthOk() is horrific > enough that deleting it outright makes sense, to make sure that it > can never get called by accident. > > Note that the command line options -cjk_width and -mk_width and the > related resources remain broken. But i'd say people using them get > what they deserve. The disruption to updates that would be caused > by excising them from multiple files is probably better avoided. > > Unchanged diff left in place below for easy reference. > > OK to commit? hi, I would prefer a diff that just add a &&!defined(__OpenBSD__) to the condition before the definition of systemWcwidthOk(). This will cause less risk of conflicts in future updates and clearly show the intention. but either way ok matthieu@ > Ingo > > > diff --git a/app/xterm/util.c b/app/xterm/util.c > index a3de4a005..9b6443683 100644 > --- a/app/xterm/util.c > +++ b/app/xterm/util.c > @@ -4980,61 +4980,6 @@ decode_keyboard_type(XtermWidget xw, XTERM_RESOURCE * > rp) > } > > #if OPT_WIDE_CHARS > -#if defined(HAVE_WCHAR_H) && defined(HAVE_WCWIDTH) > -/* > - * If xterm is running in a UTF-8 locale, it is still possible to encounter > - * old runtime configurations which yield incomplete or inaccurate data. > - */ > -static Bool > -systemWcwidthOk(int samplesize, int samplepass) > -{ > -wchar_t n; > -int oops = 0; > - > -for (n = 21; n <= 25; ++n) { > - wchar_t code = (wchar_t) dec2ucs(NULL, (unsigned) n); > - int system_code = wcwidth(code); > - int intern_code = mk_wcwidth(code); > - > - /* > - * Solaris 10 wcwidth() returns "2" for all of the line-drawing (page > - * 0x2500) and most of the geometric shapes (a few are excluded, just > - * to make it more difficult to use). Do a sanity check to avoid using > - * it. > - */ > - if ((system_code < 0 && intern_code >= 1) > - || (system_code >= 0 && intern_code != system_code)) { > - TRACE(("systemWcwidthOk: broken system line-drawing wcwidth\n")); > - oops += (samplepass + 1); > - break; > - } > -} > - > -for (n = 0; n < (wchar_t) samplesize; ++n) { > - int system_code = wcwidth(n); > - int intern_code = mk_wcwidth(n); > - > - /* > - * When this check was originally implemented, there were few if any > - * libraries with full Unicode coverage. Time passes, and it is > - * possible to make a full comparison of the BMP. There are some > - * differences: mk_wcwidth() marks some codes as combining and some > - * as single-width, differing from GNU libc. > - */ > - if ((system_code < 0 && intern_code >= 1) > - || (system_code >= 0 && intern_code != system_code)) { > - TRACE((".. width(U+%04X) = %d, expected %d\n", > -(unsigned) n, system_code, intern_code)); > - if (++oops > samplepass) > - break; > - } > -} > -TRACE(("systemWcwidthOk: %d/%d mismatches, allowed %d\n", > -oops, (int) n, samplepass)); > -return (oops <= samplepass); > -} > -#endif /* HAVE_WCWIDTH */ > - > void > decode_wcwidth(XtermWidget xw) > { > @@ -5045,8 +4990,7 @@ decode_wcwidth(XtermWidget xw) > switch (mode) { > default: > #if defined(HAVE_WCHAR_H) && defined(HAVE_WCWIDTH) > - if (xtermEnvUTF8() && > - systemWcwidthOk(xw->misc.mk_samplesize, xw->misc.mk_samplepass)) { > + if (xtermEnvUTF8()) { > my_wcwidth = wcwidth; > TRACE(("using system wcwidth() function\n")); > break; -- Matthieu Herrb
Re: xterm and wcwidth()
Hi, Ted Unangst wrote on Fri, Mar 08, 2019 at 12:57:17PM -0500: > Lauri Tirkkonen wrote: >> in other words, xterm is (and was) using its own idea of how many >> columns characters take up. The way this manifested itself to me was >> that I received some email that contained emoji characters in the >> subject, and they look fine in mutt when I use another terminal (st on >> OpenBSD, or termux on my Android phone), but on 'xterm -fa "DejaVu Sans >> Mono"' on OpenBSD, only the left half of these characters is rendered, >> and mutt's index background color indicating the selection doesn't reach >> the right edge of the screen: libc wcwidth() returns 2, but xterm's idea >> seems to be 1 column. > Sigh, here we go again. Some systems have a bad function, so let's replace it > with a slightly better version, and then never update it. If there are bugs in > the libc wcwidth function, we'd very much like to know and fix them, not have > these workarounds. I'm counting that as OK tedu@, but i'll wait a bit before committing in case matthieu@ (or someone else) wants to speak up. Yours, Ingo
Re: xterm and wcwidth()
Lauri Tirkkonen wrote: > in other words, xterm is (and was) using its own idea of how many > columns characters take up. The way this manifested itself to me was > that I received some email that contained emoji characters in the > subject, and they look fine in mutt when I use another terminal (st on > OpenBSD, or termux on my Android phone), but on 'xterm -fa "DejaVu Sans > Mono"' on OpenBSD, only the left half of these characters is rendered, > and mutt's index background color indicating the selection doesn't reach > the right edge of the screen: libc wcwidth() returns 2, but xterm's idea > seems to be 1 column. Sigh, here we go again. Some systems have a bad function, so let's replace it with a slightly better version, and then never update it. If there are bugs in the libc wcwidth function, we'd very much like to know and fix them, not have these workarounds.
Re: xterm and wcwidth()
Hi, Lauri Tirkkonen wrote on Fri, Mar 08, 2019 at 12:43:11PM +0200: > I feel like xterm should just use the system wcwidth() to avoid these > mismatches, so rudimentary diff to do that below. Absolutely, i strongly agree with that sentiment. Having a local character width table in an application program is just wrong. Archaic operating systems like Oracle Solaris 11 that have broken wcwidth(3) implementations should either fix their C library or go to hell. I also agree with the details of the diff. It is minimal in so far as it changes only one file, keeping the annoyance minimal when merging new xterm releases. The function systemWcwidthOk() is horrific enough that deleting it outright makes sense, to make sure that it can never get called by accident. Note that the command line options -cjk_width and -mk_width and the related resources remain broken. But i'd say people using them get what they deserve. The disruption to updates that would be caused by excising them from multiple files is probably better avoided. Unchanged diff left in place below for easy reference. OK to commit? Ingo diff --git a/app/xterm/util.c b/app/xterm/util.c index a3de4a005..9b6443683 100644 --- a/app/xterm/util.c +++ b/app/xterm/util.c @@ -4980,61 +4980,6 @@ decode_keyboard_type(XtermWidget xw, XTERM_RESOURCE * rp) } #if OPT_WIDE_CHARS -#if defined(HAVE_WCHAR_H) && defined(HAVE_WCWIDTH) -/* - * If xterm is running in a UTF-8 locale, it is still possible to encounter - * old runtime configurations which yield incomplete or inaccurate data. - */ -static Bool -systemWcwidthOk(int samplesize, int samplepass) -{ -wchar_t n; -int oops = 0; - -for (n = 21; n <= 25; ++n) { - wchar_t code = (wchar_t) dec2ucs(NULL, (unsigned) n); - int system_code = wcwidth(code); - int intern_code = mk_wcwidth(code); - - /* -* Solaris 10 wcwidth() returns "2" for all of the line-drawing (page -* 0x2500) and most of the geometric shapes (a few are excluded, just -* to make it more difficult to use). Do a sanity check to avoid using -* it. -*/ - if ((system_code < 0 && intern_code >= 1) - || (system_code >= 0 && intern_code != system_code)) { - TRACE(("systemWcwidthOk: broken system line-drawing wcwidth\n")); - oops += (samplepass + 1); - break; - } -} - -for (n = 0; n < (wchar_t) samplesize; ++n) { - int system_code = wcwidth(n); - int intern_code = mk_wcwidth(n); - - /* -* When this check was originally implemented, there were few if any -* libraries with full Unicode coverage. Time passes, and it is -* possible to make a full comparison of the BMP. There are some -* differences: mk_wcwidth() marks some codes as combining and some -* as single-width, differing from GNU libc. -*/ - if ((system_code < 0 && intern_code >= 1) - || (system_code >= 0 && intern_code != system_code)) { - TRACE((".. width(U+%04X) = %d, expected %d\n", - (unsigned) n, system_code, intern_code)); - if (++oops > samplepass) - break; - } -} -TRACE(("systemWcwidthOk: %d/%d mismatches, allowed %d\n", - oops, (int) n, samplepass)); -return (oops <= samplepass); -} -#endif /* HAVE_WCWIDTH */ - void decode_wcwidth(XtermWidget xw) { @@ -5045,8 +4990,7 @@ decode_wcwidth(XtermWidget xw) switch (mode) { default: #if defined(HAVE_WCHAR_H) && defined(HAVE_WCWIDTH) - if (xtermEnvUTF8() && - systemWcwidthOk(xw->misc.mk_samplesize, xw->misc.mk_samplepass)) { + if (xtermEnvUTF8()) { my_wcwidth = wcwidth; TRACE(("using system wcwidth() function\n")); break;
xterm and wcwidth()
Hi, it appears xterm tests the system's wcwidth() function on startup, comparing the results between it and the wcwidth implementation it ships itself (mk_wcwidth()), for each wchar_t value between 0 and 0x. This is done in systemWcwidthOk(), called from decode_wcwidth(). I turned some TRACE()'s into printf()'s in util.c (by hand, since OPT_TRACE didn't compile), and it turns out that xterm deems the system wcwidth() inadequate, both before the unicode10 update: % /usr/xenocara/app/xterm/obj/xterm systemWcwidthOk: 656/55533 mismatches, allowed 655 using MK wcwidth() function and after it: % /usr/xenocara/app/xterm/obj/xterm systemWcwidthOk: 656/55516 mismatches, allowed 655 using MK wcwidth() function in other words, xterm is (and was) using its own idea of how many columns characters take up. The way this manifested itself to me was that I received some email that contained emoji characters in the subject, and they look fine in mutt when I use another terminal (st on OpenBSD, or termux on my Android phone), but on 'xterm -fa "DejaVu Sans Mono"' on OpenBSD, only the left half of these characters is rendered, and mutt's index background color indicating the selection doesn't reach the right edge of the screen: libc wcwidth() returns 2, but xterm's idea seems to be 1 column. Another way this problem manifests is typing one of these mismatched-width characters in bash or zsh, and erasing it with backspace: the character is rendered in one column, but on backspace the cursor goes backward two columns, erasing part of the prompt. ksh doesn't seem to have this problem. I feel like xterm should just use the system wcwidth() to avoid these mismatches, so rudimentary diff to do that below. Using the default font, this makes emojis render as double-wide boxes; using '-fa "DejaVu Sans Mono"', they actually render correctly. In both cases the width mismatches in mutt and bash/zsh are fixed. diff --git a/app/xterm/util.c b/app/xterm/util.c index a3de4a005..9b6443683 100644 --- a/app/xterm/util.c +++ b/app/xterm/util.c @@ -4980,61 +4980,6 @@ decode_keyboard_type(XtermWidget xw, XTERM_RESOURCE * rp) } #if OPT_WIDE_CHARS -#if defined(HAVE_WCHAR_H) && defined(HAVE_WCWIDTH) -/* - * If xterm is running in a UTF-8 locale, it is still possible to encounter - * old runtime configurations which yield incomplete or inaccurate data. - */ -static Bool -systemWcwidthOk(int samplesize, int samplepass) -{ -wchar_t n; -int oops = 0; - -for (n = 21; n <= 25; ++n) { - wchar_t code = (wchar_t) dec2ucs(NULL, (unsigned) n); - int system_code = wcwidth(code); - int intern_code = mk_wcwidth(code); - - /* -* Solaris 10 wcwidth() returns "2" for all of the line-drawing (page -* 0x2500) and most of the geometric shapes (a few are excluded, just -* to make it more difficult to use). Do a sanity check to avoid using -* it. -*/ - if ((system_code < 0 && intern_code >= 1) - || (system_code >= 0 && intern_code != system_code)) { - TRACE(("systemWcwidthOk: broken system line-drawing wcwidth\n")); - oops += (samplepass + 1); - break; - } -} - -for (n = 0; n < (wchar_t) samplesize; ++n) { - int system_code = wcwidth(n); - int intern_code = mk_wcwidth(n); - - /* -* When this check was originally implemented, there were few if any -* libraries with full Unicode coverage. Time passes, and it is -* possible to make a full comparison of the BMP. There are some -* differences: mk_wcwidth() marks some codes as combining and some -* as single-width, differing from GNU libc. -*/ - if ((system_code < 0 && intern_code >= 1) - || (system_code >= 0 && intern_code != system_code)) { - TRACE((".. width(U+%04X) = %d, expected %d\n", - (unsigned) n, system_code, intern_code)); - if (++oops > samplepass) - break; - } -} -TRACE(("systemWcwidthOk: %d/%d mismatches, allowed %d\n", - oops, (int) n, samplepass)); -return (oops <= samplepass); -} -#endif /* HAVE_WCWIDTH */ - void decode_wcwidth(XtermWidget xw) { @@ -5045,8 +4990,7 @@ decode_wcwidth(XtermWidget xw) switch (mode) { default: #if defined(HAVE_WCHAR_H) && defined(HAVE_WCWIDTH) - if (xtermEnvUTF8() && - systemWcwidthOk(xw->misc.mk_samplesize, xw->misc.mk_samplepass)) { + if (xtermEnvUTF8()) { my_wcwidth = wcwidth; TRACE(("using system wcwidth() function\n")); break; -- Lauri Tirkkonen | lotheac @ IRCnet