Re: xterm and wcwidth()

2019-03-08 Thread Ingo Schwarze
Hi,

Ted Unangst wrote on Fri, Mar 08, 2019 at 03:24:56PM -0500:
> Matthieu Herrb wrote:

>> I would prefer a diff that just add a &&!defined(__OpenBSD__) to the
>> condition before the definition of systemWcwidthOk(). This will cause
>> less risk of conflicts in future updates and clearly show the
>> intention.

> If you prefer that, I would suggest the following to avoid patching multiple
> places. Just short circuit the function.

Makes sense, i committed that version.

Thanks for reporting and for all the feedback,
  Ingo

>>>  #if OPT_WIDE_CHARS
>>> -#if defined(HAVE_WCHAR_H) && defined(HAVE_WCWIDTH)
>>> -/*
>>> - * If xterm is running in a UTF-8 locale, it is still possible to encounter
>>> - * old runtime configurations which yield incomplete or inaccurate data.
>>> - */
>>> -static Bool
>>> -systemWcwidthOk(int samplesize, int samplepass)
>>> -{
>>> -wchar_t n;
>>> -int oops = 0;

> #ifdef __OpenBSD__
>   return 1;
> #endif
 
>>> -
>>> -for (n = 21; n <= 25; ++n) {
>>> -   wchar_t code = (wchar_t) dec2ucs(NULL, (unsigned) n);
>>> -   int system_code = wcwidth(code);
>>> -   int intern_code = mk_wcwidth(code);



Re: xterm and wcwidth()

2019-03-08 Thread Ted Unangst
Matthieu Herrb wrote:
> I would prefer a diff that just add a &&!defined(__OpenBSD__) to the
> condition before the definition of systemWcwidthOk(). This will cause
> less risk of conflicts in future updates and clearly show the
> intention.

If you prefer that, I would suggest the following to avoid patching multiple
places. Just short circuit the function.

> >  #if OPT_WIDE_CHARS
> > -#if defined(HAVE_WCHAR_H) && defined(HAVE_WCWIDTH)
> > -/*
> > - * If xterm is running in a UTF-8 locale, it is still possible to encounter
> > - * old runtime configurations which yield incomplete or inaccurate data.
> > - */
> > -static Bool
> > -systemWcwidthOk(int samplesize, int samplepass)
> > -{
> > -wchar_t n;
> > -int oops = 0;

#ifdef __OpenBSD__
return 1;
#endif

> > -
> > -for (n = 21; n <= 25; ++n) {
> > -   wchar_t code = (wchar_t) dec2ucs(NULL, (unsigned) n);
> > -   int system_code = wcwidth(code);
> > -   int intern_code = mk_wcwidth(code);



Re: xterm and wcwidth()

2019-03-08 Thread Matthieu Herrb
On Fri, Mar 08, 2019 at 02:39:32PM +0100, Ingo Schwarze wrote:
> Hi,
> 
> Lauri Tirkkonen wrote on Fri, Mar 08, 2019 at 12:43:11PM +0200:
> 
> > I feel like xterm should just use the system wcwidth() to avoid these
> > mismatches, so rudimentary diff to do that below.
> 
> Absolutely, i strongly agree with that sentiment.
> 
> Having a local character width table in an application program is
> just wrong.  Archaic operating systems like Oracle Solaris 11 that
> have broken wcwidth(3) implementations should either fix their C
> library or go to hell.
> 
> I also agree with the details of the diff.  It is minimal in so far
> as it changes only one file, keeping the annoyance minimal when merging
> new xterm releases.  The function systemWcwidthOk() is horrific
> enough that deleting it outright makes sense, to make sure that it
> can never get called by accident.
> 
> Note that the command line options -cjk_width and -mk_width and the
> related resources remain broken.  But i'd say people using them get
> what they deserve.  The disruption to updates that would be caused
> by excising them from multiple files is probably better avoided.
> 
> Unchanged diff left in place below for easy reference.
> 
> OK to commit?

hi,

I would prefer a diff that just add a &&!defined(__OpenBSD__) to the
condition before the definition of systemWcwidthOk(). This will cause
less risk of conflicts in future updates and clearly show the
intention.

but either way ok matthieu@

>   Ingo
> 
> 
> diff --git a/app/xterm/util.c b/app/xterm/util.c
> index a3de4a005..9b6443683 100644
> --- a/app/xterm/util.c
> +++ b/app/xterm/util.c
> @@ -4980,61 +4980,6 @@ decode_keyboard_type(XtermWidget xw, XTERM_RESOURCE * 
> rp)
>  }
>  
>  #if OPT_WIDE_CHARS
> -#if defined(HAVE_WCHAR_H) && defined(HAVE_WCWIDTH)
> -/*
> - * If xterm is running in a UTF-8 locale, it is still possible to encounter
> - * old runtime configurations which yield incomplete or inaccurate data.
> - */
> -static Bool
> -systemWcwidthOk(int samplesize, int samplepass)
> -{
> -wchar_t n;
> -int oops = 0;
> -
> -for (n = 21; n <= 25; ++n) {
> - wchar_t code = (wchar_t) dec2ucs(NULL, (unsigned) n);
> - int system_code = wcwidth(code);
> - int intern_code = mk_wcwidth(code);
> -
> - /*
> -  * Solaris 10 wcwidth() returns "2" for all of the line-drawing (page
> -  * 0x2500) and most of the geometric shapes (a few are excluded, just
> -  * to make it more difficult to use).  Do a sanity check to avoid using
> -  * it.
> -  */
> - if ((system_code < 0 && intern_code >= 1)
> - || (system_code >= 0 && intern_code != system_code)) {
> - TRACE(("systemWcwidthOk: broken system line-drawing wcwidth\n"));
> - oops += (samplepass + 1);
> - break;
> - }
> -}
> -
> -for (n = 0; n < (wchar_t) samplesize; ++n) {
> - int system_code = wcwidth(n);
> - int intern_code = mk_wcwidth(n);
> -
> - /*
> -  * When this check was originally implemented, there were few if any
> -  * libraries with full Unicode coverage.  Time passes, and it is
> -  * possible to make a full comparison of the BMP.  There are some
> -  * differences: mk_wcwidth() marks some codes as combining and some
> -  * as single-width, differing from GNU libc.
> -  */
> - if ((system_code < 0 && intern_code >= 1)
> - || (system_code >= 0 && intern_code != system_code)) {
> - TRACE((".. width(U+%04X) = %d, expected %d\n",
> -(unsigned) n, system_code, intern_code));
> - if (++oops > samplepass)
> - break;
> - }
> -}
> -TRACE(("systemWcwidthOk: %d/%d mismatches, allowed %d\n",
> -oops, (int) n, samplepass));
> -return (oops <= samplepass);
> -}
> -#endif /* HAVE_WCWIDTH */
> -
>  void
>  decode_wcwidth(XtermWidget xw)
>  {
> @@ -5045,8 +4990,7 @@ decode_wcwidth(XtermWidget xw)
>  switch (mode) {
>  default:
>  #if defined(HAVE_WCHAR_H) && defined(HAVE_WCWIDTH)
> - if (xtermEnvUTF8() &&
> - systemWcwidthOk(xw->misc.mk_samplesize, xw->misc.mk_samplepass)) {
> + if (xtermEnvUTF8()) {
>   my_wcwidth = wcwidth;
>   TRACE(("using system wcwidth() function\n"));
>   break;

-- 
Matthieu Herrb



Re: xterm and wcwidth()

2019-03-08 Thread Ingo Schwarze
Hi,

Ted Unangst wrote on Fri, Mar 08, 2019 at 12:57:17PM -0500:
> Lauri Tirkkonen wrote:

>> in other words, xterm is (and was) using its own idea of how many
>> columns characters take up. The way this manifested itself to me was
>> that I received some email that contained emoji characters in the
>> subject, and they look fine in mutt when I use another terminal (st on
>> OpenBSD, or termux on my Android phone), but on 'xterm -fa "DejaVu Sans
>> Mono"' on OpenBSD, only the left half of these characters is rendered,
>> and mutt's index background color indicating the selection doesn't reach
>> the right edge of the screen: libc wcwidth() returns 2, but xterm's idea
>> seems to be 1 column.

> Sigh, here we go again. Some systems have a bad function, so let's replace it
> with a slightly better version, and then never update it. If there are bugs in
> the libc wcwidth function, we'd very much like to know and fix them, not have
> these workarounds.

I'm counting that as OK tedu@, but i'll wait a bit before committing
in case matthieu@ (or someone else) wants to speak up.

Yours,
  Ingo



Re: xterm and wcwidth()

2019-03-08 Thread Ted Unangst
Lauri Tirkkonen wrote:
> in other words, xterm is (and was) using its own idea of how many
> columns characters take up. The way this manifested itself to me was
> that I received some email that contained emoji characters in the
> subject, and they look fine in mutt when I use another terminal (st on
> OpenBSD, or termux on my Android phone), but on 'xterm -fa "DejaVu Sans
> Mono"' on OpenBSD, only the left half of these characters is rendered,
> and mutt's index background color indicating the selection doesn't reach
> the right edge of the screen: libc wcwidth() returns 2, but xterm's idea
> seems to be 1 column.

Sigh, here we go again. Some systems have a bad function, so let's replace it
with a slightly better version, and then never update it. If there are bugs in
the libc wcwidth function, we'd very much like to know and fix them, not have
these workarounds.



Re: xterm and wcwidth()

2019-03-08 Thread Ingo Schwarze
Hi,

Lauri Tirkkonen wrote on Fri, Mar 08, 2019 at 12:43:11PM +0200:

> I feel like xterm should just use the system wcwidth() to avoid these
> mismatches, so rudimentary diff to do that below.

Absolutely, i strongly agree with that sentiment.

Having a local character width table in an application program is
just wrong.  Archaic operating systems like Oracle Solaris 11 that
have broken wcwidth(3) implementations should either fix their C
library or go to hell.

I also agree with the details of the diff.  It is minimal in so far
as it changes only one file, keeping the annoyance minimal when merging
new xterm releases.  The function systemWcwidthOk() is horrific
enough that deleting it outright makes sense, to make sure that it
can never get called by accident.

Note that the command line options -cjk_width and -mk_width and the
related resources remain broken.  But i'd say people using them get
what they deserve.  The disruption to updates that would be caused
by excising them from multiple files is probably better avoided.

Unchanged diff left in place below for easy reference.

OK to commit?
  Ingo


diff --git a/app/xterm/util.c b/app/xterm/util.c
index a3de4a005..9b6443683 100644
--- a/app/xterm/util.c
+++ b/app/xterm/util.c
@@ -4980,61 +4980,6 @@ decode_keyboard_type(XtermWidget xw, XTERM_RESOURCE * rp)
 }
 
 #if OPT_WIDE_CHARS
-#if defined(HAVE_WCHAR_H) && defined(HAVE_WCWIDTH)
-/*
- * If xterm is running in a UTF-8 locale, it is still possible to encounter
- * old runtime configurations which yield incomplete or inaccurate data.
- */
-static Bool
-systemWcwidthOk(int samplesize, int samplepass)
-{
-wchar_t n;
-int oops = 0;
-
-for (n = 21; n <= 25; ++n) {
-   wchar_t code = (wchar_t) dec2ucs(NULL, (unsigned) n);
-   int system_code = wcwidth(code);
-   int intern_code = mk_wcwidth(code);
-
-   /*
-* Solaris 10 wcwidth() returns "2" for all of the line-drawing (page
-* 0x2500) and most of the geometric shapes (a few are excluded, just
-* to make it more difficult to use).  Do a sanity check to avoid using
-* it.
-*/
-   if ((system_code < 0 && intern_code >= 1)
-   || (system_code >= 0 && intern_code != system_code)) {
-   TRACE(("systemWcwidthOk: broken system line-drawing wcwidth\n"));
-   oops += (samplepass + 1);
-   break;
-   }
-}
-
-for (n = 0; n < (wchar_t) samplesize; ++n) {
-   int system_code = wcwidth(n);
-   int intern_code = mk_wcwidth(n);
-
-   /*
-* When this check was originally implemented, there were few if any
-* libraries with full Unicode coverage.  Time passes, and it is
-* possible to make a full comparison of the BMP.  There are some
-* differences: mk_wcwidth() marks some codes as combining and some
-* as single-width, differing from GNU libc.
-*/
-   if ((system_code < 0 && intern_code >= 1)
-   || (system_code >= 0 && intern_code != system_code)) {
-   TRACE((".. width(U+%04X) = %d, expected %d\n",
-  (unsigned) n, system_code, intern_code));
-   if (++oops > samplepass)
-   break;
-   }
-}
-TRACE(("systemWcwidthOk: %d/%d mismatches, allowed %d\n",
-  oops, (int) n, samplepass));
-return (oops <= samplepass);
-}
-#endif /* HAVE_WCWIDTH */
-
 void
 decode_wcwidth(XtermWidget xw)
 {
@@ -5045,8 +4990,7 @@ decode_wcwidth(XtermWidget xw)
 switch (mode) {
 default:
 #if defined(HAVE_WCHAR_H) && defined(HAVE_WCWIDTH)
-   if (xtermEnvUTF8() &&
-   systemWcwidthOk(xw->misc.mk_samplesize, xw->misc.mk_samplepass)) {
+   if (xtermEnvUTF8()) {
my_wcwidth = wcwidth;
TRACE(("using system wcwidth() function\n"));
break;



xterm and wcwidth()

2019-03-08 Thread Lauri Tirkkonen
Hi,

it appears xterm tests the system's wcwidth() function on startup,
comparing the results between it and the wcwidth implementation it ships
itself (mk_wcwidth()), for each wchar_t value between 0 and 0x. This
is done in systemWcwidthOk(), called from decode_wcwidth().

I turned some TRACE()'s into printf()'s in util.c (by hand, since
OPT_TRACE didn't compile), and it turns out that xterm deems the system
wcwidth() inadequate, both before the unicode10 update:

% /usr/xenocara/app/xterm/obj/xterm
systemWcwidthOk: 656/55533 mismatches, allowed 655
using MK wcwidth() function

and after it:

% /usr/xenocara/app/xterm/obj/xterm
systemWcwidthOk: 656/55516 mismatches, allowed 655
using MK wcwidth() function

in other words, xterm is (and was) using its own idea of how many
columns characters take up. The way this manifested itself to me was
that I received some email that contained emoji characters in the
subject, and they look fine in mutt when I use another terminal (st on
OpenBSD, or termux on my Android phone), but on 'xterm -fa "DejaVu Sans
Mono"' on OpenBSD, only the left half of these characters is rendered,
and mutt's index background color indicating the selection doesn't reach
the right edge of the screen: libc wcwidth() returns 2, but xterm's idea
seems to be 1 column.

Another way this problem manifests is typing one of these
mismatched-width characters in bash or zsh, and erasing it with
backspace: the character is rendered in one column, but on backspace the
cursor goes backward two columns, erasing part of the prompt. ksh
doesn't seem to have this problem.

I feel like xterm should just use the system wcwidth() to avoid these
mismatches, so rudimentary diff to do that below. Using the default
font, this makes emojis render as double-wide boxes; using '-fa "DejaVu
Sans Mono"', they actually render correctly. In both cases the width
mismatches in mutt and bash/zsh are fixed.

diff --git a/app/xterm/util.c b/app/xterm/util.c
index a3de4a005..9b6443683 100644
--- a/app/xterm/util.c
+++ b/app/xterm/util.c
@@ -4980,61 +4980,6 @@ decode_keyboard_type(XtermWidget xw, XTERM_RESOURCE * rp)
 }
 
 #if OPT_WIDE_CHARS
-#if defined(HAVE_WCHAR_H) && defined(HAVE_WCWIDTH)
-/*
- * If xterm is running in a UTF-8 locale, it is still possible to encounter
- * old runtime configurations which yield incomplete or inaccurate data.
- */
-static Bool
-systemWcwidthOk(int samplesize, int samplepass)
-{
-wchar_t n;
-int oops = 0;
-
-for (n = 21; n <= 25; ++n) {
-   wchar_t code = (wchar_t) dec2ucs(NULL, (unsigned) n);
-   int system_code = wcwidth(code);
-   int intern_code = mk_wcwidth(code);
-
-   /*
-* Solaris 10 wcwidth() returns "2" for all of the line-drawing (page
-* 0x2500) and most of the geometric shapes (a few are excluded, just
-* to make it more difficult to use).  Do a sanity check to avoid using
-* it.
-*/
-   if ((system_code < 0 && intern_code >= 1)
-   || (system_code >= 0 && intern_code != system_code)) {
-   TRACE(("systemWcwidthOk: broken system line-drawing wcwidth\n"));
-   oops += (samplepass + 1);
-   break;
-   }
-}
-
-for (n = 0; n < (wchar_t) samplesize; ++n) {
-   int system_code = wcwidth(n);
-   int intern_code = mk_wcwidth(n);
-
-   /*
-* When this check was originally implemented, there were few if any
-* libraries with full Unicode coverage.  Time passes, and it is
-* possible to make a full comparison of the BMP.  There are some
-* differences: mk_wcwidth() marks some codes as combining and some
-* as single-width, differing from GNU libc.
-*/
-   if ((system_code < 0 && intern_code >= 1)
-   || (system_code >= 0 && intern_code != system_code)) {
-   TRACE((".. width(U+%04X) = %d, expected %d\n",
-  (unsigned) n, system_code, intern_code));
-   if (++oops > samplepass)
-   break;
-   }
-}
-TRACE(("systemWcwidthOk: %d/%d mismatches, allowed %d\n",
-  oops, (int) n, samplepass));
-return (oops <= samplepass);
-}
-#endif /* HAVE_WCWIDTH */
-
 void
 decode_wcwidth(XtermWidget xw)
 {
@@ -5045,8 +4990,7 @@ decode_wcwidth(XtermWidget xw)
 switch (mode) {
 default:
 #if defined(HAVE_WCHAR_H) && defined(HAVE_WCWIDTH)
-   if (xtermEnvUTF8() &&
-   systemWcwidthOk(xw->misc.mk_samplesize, xw->misc.mk_samplepass)) {
+   if (xtermEnvUTF8()) {
my_wcwidth = wcwidth;
TRACE(("using system wcwidth() function\n"));
break;

-- 
Lauri Tirkkonen | lotheac @ IRCnet