Re: svn commit: r335836 - head/usr.bin/top

2018-07-09 Thread 後藤大地 via svn-src-all
I summarized in the Phabricator. Check it out please.

https://reviews.freebsd.org/D16203

> 2018/07/05 1:37、Hiroki Sato のメール:
> 
> Hiroki Sato  wrote
>  in <20180703.020956.859981414196673670@allbsd.org>:
> 
> hr> 後藤大地  wrote
> hr>   in <459bd898-8072-426e-a968-96c1382ac...@icloud.com>:
> hr>
> hr> da>
> hr> da>
> hr> da> > 2018/07/02 15:55、Hiroki Sato のメール:
> hr> da> >
> hr> da> > Eitan Adler  wrote
> hr> da> >  in 
> :
> hr> da> >
> hr> da> > li> On 1 July 2018 at 10:08, Conrad Meyer  wrote:
> hr> da> > li> > Hi Daichi,
> hr> da> > li> >
> hr> da> > li> >
> hr> da> > li> >
> hr> da> > li> > I don't think code to decode UTF-8 belongs in top(1).  I 
> don't know
> hr> da> > li> > what the goal of this routine is, but I doubt this is the 
> right way to
> hr> da> > li> > accomplish it.
> hr> da> > li>
> hr> da> > li> For the record, I agree. This is why I didn't click "accept" on 
> the
> hr> da> > li> revision. I don't fully oppose leaving it in top(1) for now as 
> we work
> hr> da> > li> out the API, but long term its the wrong place.
> hr> da> > li>
> hr> da> > li> https://reviews.freebsd.org/D16058 is the review.
> hr> da> >
> hr> da> > I strongly object this kind of encoding-specific routine.  Please
> hr> da> > back out it.  The problem is that top(1) does not support multibyte
> hr> da> > encoding in functions for printing, and using C99 wide/multibyte
> hr> da> > character manipulation API such as iswprint(3) is the way to solve
> hr> da> > it.  Doing getenv("LANG") and assuming an encoding based on it is a
> hr> da> > very bad practice to internationalize software.
> hr> da> >
> hr> da> > -- Hiroki
> hr> da>
> hr> da> I respect what you mean.
> hr> da>
> hr> da> Once I back out, I will begin implementing it in a different way.
> hr> da> Please advise which function should be used for implementation
> hr> da> (iswprint (3) and what other functions should be used?)
> hr>
> hr>  Roughly speaking, POSIX/XPG/C99 I18N model requires the following
> hr>  steps:
> (snip)
> 
> Are you going to back out r335836, or disagree about it?
> 
> If there is no objection in the next 24 hours, I will commit the
> attached patch.  This should be a minimal change to support multibyte
> characters in ARGV array depending on LC_CTYPE, not limited to UTF-8.
> 
> -- Hiroki
> Index: usr.bin/top/display.c
> ===
> --- usr.bin/top/display.c (revision 335957)
> +++ usr.bin/top/display.c (working copy)
> @@ -1248,55 +1248,6 @@
> }
> }
> 
> -/*
> - *  printable(str) - make the string pointed to by "str" into one that is
> - *   printable (i.e.: all ascii), by converting all non-printable
> - *   characters into '?'.  Replacements are done in place and a pointer
> - *   to the original buffer is returned.
> - */
> -
> -char *
> -printable(char str[])
> -{
> - char *ptr;
> - char ch;
> -
> - ptr = str;
> - if (utf8flag) {
> - while ((ch = *ptr) != '\0') {
> - if (0x00 == (0x80 & ch)) {
> - if (!isprint(ch)) {
> - *ptr = '?';
> - }
> - ++ptr;
> - } else if (0xC0 == (0xE0 & ch)) {
> - ++ptr;
> - if ('\0' != *ptr) ++ptr;
> - } else if (0xE0 == (0xF0 & ch)) {
> - ++ptr;
> - if ('\0' != *ptr) ++ptr;
> - if ('\0' != *ptr) ++ptr;
> - } else if (0xF0 == (0xF8 & ch)) {
> - ++ptr;
> - if ('\0' != *ptr) ++ptr;
> - if ('\0' != *ptr) ++ptr;
> - if ('\0' != *ptr) ++ptr;
> - } else {
> - *ptr = '?';
> - ++ptr;
> - }
> - }
> - } else {
> - while ((ch = *ptr) != '\0') {
> - if (!isprint(ch)) {
> - *ptr = '?';
> - }
> - ptr++;
> - }
> - }
> - return(str);
> -}
> -
> void
> i_uptime(struct timeval *bt, time_t *tod)
> {
> Index: usr.bin/top/display.h
> ===
> --- usr.bin/top/display.h (revision 335957)
> +++ usr.bin/top/display.h (working copy)
> @@ -11,7 +11,6 @@
> void   clear_message(void);
> intdisplay_resize(void);
> void   i_header(const char *text);
> -char *printable(char *string);
> void   display_header(int t);
> intdisplay_init(struct statics *statics);
> void   i_arc(int *stats);
> Index: usr.bin/top/machine.c
> ===
> --- usr.bin/top/machine.c (revision 335957)
> +++ usr.bin/top/machine.c 

Re: svn commit: r335836 - head/usr.bin/top

2018-07-04 Thread 後藤大地 via svn-src-all
I think that’s fine.

I have not been committed for a while, so now I am in the state of
being reactive commit bit mentee under George’s mentor. So I sent 
him an e-mail about rollback, but I have not heard back from him yet. 
I have been waiting for a reply mail.

I planed to roll back as soon as permission comes out from him.  
However, I think that there is no problem with your commit.
Thank you.

Best regards,
Daichi

> 2018/07/05 1:37、Hiroki Sato のメール:
> 
> Hiroki Sato  wrote
>  in <20180703.020956.859981414196673670@allbsd.org>:
> 
> hr> 後藤大地  wrote
> hr>   in <459bd898-8072-426e-a968-96c1382ac...@icloud.com>:
> hr>
> hr> da>
> hr> da>
> hr> da> > 2018/07/02 15:55、Hiroki Sato のメール:
> hr> da> >
> hr> da> > Eitan Adler  wrote
> hr> da> >  in 
> :
> hr> da> >
> hr> da> > li> On 1 July 2018 at 10:08, Conrad Meyer  wrote:
> hr> da> > li> > Hi Daichi,
> hr> da> > li> >
> hr> da> > li> >
> hr> da> > li> >
> hr> da> > li> > I don't think code to decode UTF-8 belongs in top(1).  I 
> don't know
> hr> da> > li> > what the goal of this routine is, but I doubt this is the 
> right way to
> hr> da> > li> > accomplish it.
> hr> da> > li>
> hr> da> > li> For the record, I agree. This is why I didn't click "accept" on 
> the
> hr> da> > li> revision. I don't fully oppose leaving it in top(1) for now as 
> we work
> hr> da> > li> out the API, but long term its the wrong place.
> hr> da> > li>
> hr> da> > li> https://reviews.freebsd.org/D16058 is the review.
> hr> da> >
> hr> da> > I strongly object this kind of encoding-specific routine.  Please
> hr> da> > back out it.  The problem is that top(1) does not support multibyte
> hr> da> > encoding in functions for printing, and using C99 wide/multibyte
> hr> da> > character manipulation API such as iswprint(3) is the way to solve
> hr> da> > it.  Doing getenv("LANG") and assuming an encoding based on it is a
> hr> da> > very bad practice to internationalize software.
> hr> da> >
> hr> da> > -- Hiroki
> hr> da>
> hr> da> I respect what you mean.
> hr> da>
> hr> da> Once I back out, I will begin implementing it in a different way.
> hr> da> Please advise which function should be used for implementation
> hr> da> (iswprint (3) and what other functions should be used?)
> hr>
> hr>  Roughly speaking, POSIX/XPG/C99 I18N model requires the following
> hr>  steps:
> (snip)
> 
> Are you going to back out r335836, or disagree about it?
> 
> If there is no objection in the next 24 hours, I will commit the
> attached patch.  This should be a minimal change to support multibyte
> characters in ARGV array depending on LC_CTYPE, not limited to UTF-8.
> 
> -- Hiroki
> Index: usr.bin/top/display.c
> ===
> --- usr.bin/top/display.c (revision 335957)
> +++ usr.bin/top/display.c (working copy)
> @@ -1248,55 +1248,6 @@
> }
> }
> 
> -/*
> - *  printable(str) - make the string pointed to by "str" into one that is
> - *   printable (i.e.: all ascii), by converting all non-printable
> - *   characters into '?'.  Replacements are done in place and a pointer
> - *   to the original buffer is returned.
> - */
> -
> -char *
> -printable(char str[])
> -{
> - char *ptr;
> - char ch;
> -
> - ptr = str;
> - if (utf8flag) {
> - while ((ch = *ptr) != '\0') {
> - if (0x00 == (0x80 & ch)) {
> - if (!isprint(ch)) {
> - *ptr = '?';
> - }
> - ++ptr;
> - } else if (0xC0 == (0xE0 & ch)) {
> - ++ptr;
> - if ('\0' != *ptr) ++ptr;
> - } else if (0xE0 == (0xF0 & ch)) {
> - ++ptr;
> - if ('\0' != *ptr) ++ptr;
> - if ('\0' != *ptr) ++ptr;
> - } else if (0xF0 == (0xF8 & ch)) {
> - ++ptr;
> - if ('\0' != *ptr) ++ptr;
> - if ('\0' != *ptr) ++ptr;
> - if ('\0' != *ptr) ++ptr;
> - } else {
> - *ptr = '?';
> - ++ptr;
> - }
> - }
> - } else {
> - while ((ch = *ptr) != '\0') {
> - if (!isprint(ch)) {
> - *ptr = '?';
> - }
> - ptr++;
> - }
> - }
> - return(str);
> -}
> -
> void
> i_uptime(struct timeval *bt, time_t *tod)
> {
> Index: usr.bin/top/display.h
> ===
> --- usr.bin/top/display.h (revision 335957)
> +++ usr.bin/top/display.h (working copy)
> @@ -11,7 +11,6 @@
> void   clear_message(void);
> intdisplay_resize(void);
> void   i_header(const char *text);
> 

Re: svn commit: r335836 - head/usr.bin/top

2018-07-04 Thread Eitan Adler
On Wed, 4 Jul 2018 at 09:41, Hiroki Sato  wrote:
>
>  Are you going to back out r335836, or disagree about it?
>
>  If there is no objection in the next 24 hours, I will commit the
>  attached patch.  This should be a minimal change to support multibyte
>  characters in ARGV array depending on LC_CTYPE, not limited to UTF-8.

Based on this conversation and my otherwise minimal knowledge about
software localization this LGTM.

Thanks!



-- 
Eitan Adler
___
svn-src-all@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"


Re: svn commit: r335836 - head/usr.bin/top

2018-07-04 Thread Hiroki Sato
Hiroki Sato  wrote
  in <20180703.020956.859981414196673670@allbsd.org>:

hr> 後藤大地  wrote
hr>   in <459bd898-8072-426e-a968-96c1382ac...@icloud.com>:
hr>
hr> da>
hr> da>
hr> da> > 2018/07/02 15:55、Hiroki Sato のメール:
hr> da> >
hr> da> > Eitan Adler  wrote
hr> da> >  in 
:
hr> da> >
hr> da> > li> On 1 July 2018 at 10:08, Conrad Meyer  wrote:
hr> da> > li> > Hi Daichi,
hr> da> > li> >
hr> da> > li> >
hr> da> > li> >
hr> da> > li> > I don't think code to decode UTF-8 belongs in top(1).  I don't 
know
hr> da> > li> > what the goal of this routine is, but I doubt this is the right 
way to
hr> da> > li> > accomplish it.
hr> da> > li>
hr> da> > li> For the record, I agree. This is why I didn't click "accept" on 
the
hr> da> > li> revision. I don't fully oppose leaving it in top(1) for now as we 
work
hr> da> > li> out the API, but long term its the wrong place.
hr> da> > li>
hr> da> > li> https://reviews.freebsd.org/D16058 is the review.
hr> da> >
hr> da> > I strongly object this kind of encoding-specific routine.  Please
hr> da> > back out it.  The problem is that top(1) does not support multibyte
hr> da> > encoding in functions for printing, and using C99 wide/multibyte
hr> da> > character manipulation API such as iswprint(3) is the way to solve
hr> da> > it.  Doing getenv("LANG") and assuming an encoding based on it is a
hr> da> > very bad practice to internationalize software.
hr> da> >
hr> da> > -- Hiroki
hr> da>
hr> da> I respect what you mean.
hr> da>
hr> da> Once I back out, I will begin implementing it in a different way.
hr> da> Please advise which function should be used for implementation
hr> da> (iswprint (3) and what other functions should be used?)
hr>
hr>  Roughly speaking, POSIX/XPG/C99 I18N model requires the following
hr>  steps:
(snip)

 Are you going to back out r335836, or disagree about it?

 If there is no objection in the next 24 hours, I will commit the
 attached patch.  This should be a minimal change to support multibyte
 characters in ARGV array depending on LC_CTYPE, not limited to UTF-8.

-- Hiroki
Index: usr.bin/top/display.c
===
--- usr.bin/top/display.c	(revision 335957)
+++ usr.bin/top/display.c	(working copy)
@@ -1248,55 +1248,6 @@
 }
 }

-/*
- *  printable(str) - make the string pointed to by "str" into one that is
- *	printable (i.e.: all ascii), by converting all non-printable
- *	characters into '?'.  Replacements are done in place and a pointer
- *	to the original buffer is returned.
- */
-
-char *
-printable(char str[])
-{
-	char *ptr;
-	char ch;
-
-	ptr = str;
-	if (utf8flag) {
-		while ((ch = *ptr) != '\0') {
-			if (0x00 == (0x80 & ch)) {
-if (!isprint(ch)) {
-	*ptr = '?';
-}
-++ptr;
-			} else if (0xC0 == (0xE0 & ch)) {
-++ptr;
-if ('\0' != *ptr) ++ptr;
-			} else if (0xE0 == (0xF0 & ch)) {
-++ptr;
-if ('\0' != *ptr) ++ptr;
-if ('\0' != *ptr) ++ptr;
-			} else if (0xF0 == (0xF8 & ch)) {
-++ptr;
-if ('\0' != *ptr) ++ptr;
-if ('\0' != *ptr) ++ptr;
-if ('\0' != *ptr) ++ptr;
-			} else {
-*ptr = '?';
-++ptr;
-			}
-		}
-	} else {
-		while ((ch = *ptr) != '\0') {
-			if (!isprint(ch)) {
-*ptr = '?';
-			}
-			ptr++;
-		}
-	}
-	return(str);
-}
-
 void
 i_uptime(struct timeval *bt, time_t *tod)
 {
Index: usr.bin/top/display.h
===
--- usr.bin/top/display.h	(revision 335957)
+++ usr.bin/top/display.h	(working copy)
@@ -11,7 +11,6 @@
 void	 clear_message(void);
 int		 display_resize(void);
 void	 i_header(const char *text);
-char	*printable(char *string);
 void	 display_header(int t);
 int		 display_init(struct statics *statics);
 void	 i_arc(int *stats);
Index: usr.bin/top/machine.c
===
--- usr.bin/top/machine.c	(revision 335957)
+++ usr.bin/top/machine.c	(working copy)
@@ -986,13 +986,8 @@
 if (*src == '\0')
 	continue;
 len = (argbuflen - (dst - argbuf) - 1) / 4;
-if (utf8flag) {
-	utf8strvisx(dst, src, MIN(strlen(src), len));
-} else {
-	strvisx(dst, src,
-	MIN(strlen(src), len),
-	VIS_NL | VIS_CSTYLE);
-}
+strvisx(dst, src, MIN(strlen(src), len),
+VIS_NL | VIS_CSTYLE | VIS_OCTAL);
 while (*dst != '\0')
 	dst++;
 if ((argbuflen - (dst - argbuf) - 1) / 4 > 0)
@@ -1089,7 +1084,7 @@
 		sbuf_printf(procbuf, "%6s ", format_time(cputime));
 		sbuf_printf(procbuf, "%6.2f%% ", ps.wcpu ? 100.0 * weighted_cpu(PCTCPU(pp), pp) : 100.0 * PCTCPU(pp));
 	}
-	sbuf_printf(procbuf, "%s", printable(cmdbuf));
+	sbuf_printf(procbuf, "%s", cmdbuf);
 	free(cmdbuf);
 	return (sbuf_data(procbuf));
 }
Index: usr.bin/top/top.c
===
--- usr.bin/top/top.c	(revision 335957)
+++ usr.bin/top/top.c	(working copy)
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -69,7 

Re: svn commit: r335836 - head/usr.bin/top

2018-07-03 Thread Hiroki Sato
Jilles Tjoelker  wrote
  in <20180703211002.ga11...@stack.nl>:

ji> >  3. Print the multibyte characters by using strvisx(3) family, which
ji> > supports multibyte character, or swprintf(3) family if you want to
ji> > format wide characters directly.  Note that buffer length for
ji> > strvisx(3) must be calculated by using MB_LEN_MAX.
ji>
ji> In this case, calling setlocale() and then using strvisx() seems the
ji> right solution. If locales differ across processes this may result in
ji> mojibake but that cannot really be helped. Even analyzing other
ji> processes' locale variables is not fully reliable, since strings may be
ji> incorrectly encoded even in the process's real locale, environment
ji> variables cannot be read across users and the environment block may be
ji> overwritten by a program.
ji>
ji> In general, although using conversion to wide characters allows users a
ji> lot of flexibility, I don't think it is the best in all situations:
ji>
ji> * The result of mbstowcs() is a UTF-32 string which consumes a lot of
ji>   memory. A loop with mbrtowc() may also be slow. Many operations can be
ji>   done directly on UTF-8 strings with no or little additional complexity
ji>   compared to byte strings.
ji>
ji> * If there is an invalid multibyte character, there is little
ji>   flexibility to handle this usefully and securely, since so little is
ji>   known about the encoding. The best handling may depend on the context.
ji>
ji> Therefore, in /bin/sh, I have only implemented multibyte support for
ji> UTF-8. All other encodings have bytes treated as characters.
ji>
ji> However, I do agree that getenv("LANG") is bad. Instead, setlocale()
ji> should be used. After that, nl_langinfo(CODESET) can be called and the
ji> result compared to "UTF-8".

 Yes, I agree that using mb->wc conversion is not always the best and
 using strvisx() for cmdbuf, not only for argv, is enough in this
 case.  I thought it was difficult to avoid iswprint() because I was
 not sure of the goal of r335836 and it looked to me that it aimed to
 keep the original printable() function.  And as you mentioned it may
 not be worth to try to correctly detect/support locales in different
 processes, either.  Probably one of the simplest ways would be that
 relying on LC_CTYPE+strvisx() and documenting how top(1) handles
 multibyte characters in the manual page.

-- Hiroki


pgpvYI6YWCefc.pgp
Description: PGP signature


Re: svn commit: r335836 - head/usr.bin/top

2018-07-03 Thread Jilles Tjoelker
On Tue, Jul 03, 2018 at 02:09:56AM +0900, Hiroki Sato wrote:
> 後藤大地  wrote
>   in <459bd898-8072-426e-a968-96c1382ac...@icloud.com>:
> da> > 2018/07/02 15:55、Hiroki Sato のメール:

> da> > Eitan Adler  wrote
> da> >  in 
> :

> da> > li> On 1 July 2018 at 10:08, Conrad Meyer  wrote:

> da> > li> > I don't think code to decode UTF-8 belongs in top(1).  I don't 
> know
> da> > li> > what the goal of this routine is, but I doubt this is the right 
> way to
> da> > li> > accomplish it.
> da> > li>
> da> > li> For the record, I agree. This is why I didn't click "accept" on the
> da> > li> revision. I don't fully oppose leaving it in top(1) for now as we 
> work
> da> > li> out the API, but long term its the wrong place.
> da> > li>
> da> > li> https://reviews.freebsd.org/D16058 is the review.

> da> > I strongly object this kind of encoding-specific routine.  Please
> da> > back out it.  The problem is that top(1) does not support multibyte
> da> > encoding in functions for printing, and using C99 wide/multibyte
> da> > character manipulation API such as iswprint(3) is the way to solve
> da> > it.  Doing getenv("LANG") and assuming an encoding based on it is a
> da> > very bad practice to internationalize software.

> da> I respect what you mean.

> da> Once I back out, I will begin implementing it in a different way.
> da> Please advise which function should be used for implementation
> da> (iswprint (3) and what other functions should be used?)

>  Roughly speaking, POSIX/XPG/C99 I18N model requires the following
>  steps:

>  1. Call setlocale(LC_ALL, "") first.

>  2. Use mbs<->wcs and/or mb<->wc conversion functions in C95/C99 to
> manipulate characters and strings depending on what you want to
> do.  The printable() function should use mbtowc(3) and
> iswprint(3), for example.  And wcslen(3) should be used to
> determine the length of characters to be printed instead of
> strlen().

> Note that if mbs->wcs or mb->wc conversion fails with EILSEQ at
> some point, some of the character(s) are invalid for printing.
> This can happen because command-line parameters in top(1) are not
> always encoded in one specified in LC_CTYPE or LANG.  It should
> also be handled as non-printable.  However, to make matters worse,
> each process does not always use a single, same locale as top(1).
> A process invoked with LANG=ja_JP.eucJP may have EUC-JP characters
> in its ARGV array even if top(1) runs by another user whose LANG
> is en_US.UTF-8.  You have to determine which locale should be used
> before doing mb->wc conversion.  It is not so simple.

>  3. Print the multibyte characters by using strvisx(3) family, which
> supports multibyte character, or swprintf(3) family if you want to
> format wide characters directly.  Note that buffer length for
> strvisx(3) must be calculated by using MB_LEN_MAX.

In this case, calling setlocale() and then using strvisx() seems the
right solution. If locales differ across processes this may result in
mojibake but that cannot really be helped. Even analyzing other
processes' locale variables is not fully reliable, since strings may be
incorrectly encoded even in the process's real locale, environment
variables cannot be read across users and the environment block may be
overwritten by a program.

In general, although using conversion to wide characters allows users a
lot of flexibility, I don't think it is the best in all situations:

* The result of mbstowcs() is a UTF-32 string which consumes a lot of
  memory. A loop with mbrtowc() may also be slow. Many operations can be
  done directly on UTF-8 strings with no or little additional complexity
  compared to byte strings.

* If there is an invalid multibyte character, there is little
  flexibility to handle this usefully and securely, since so little is
  known about the encoding. The best handling may depend on the context.

Therefore, in /bin/sh, I have only implemented multibyte support for
UTF-8. All other encodings have bytes treated as characters.

However, I do agree that getenv("LANG") is bad. Instead, setlocale()
should be used. After that, nl_langinfo(CODESET) can be called and the
result compared to "UTF-8".

-- 
Jilles Tjoelker
___
svn-src-all@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"


Re: svn commit: r335836 - head/usr.bin/top

2018-07-02 Thread Warner Losh
Sato-san

Sorry for the top post, but your message would make an excellent intro to
i18n in one of our developer guides.

Warner

On Mon, Jul 2, 2018, 11:13 AM Hiroki Sato  wrote:

> 後藤大地  wrote
>   in <459bd898-8072-426e-a968-96c1382ac...@icloud.com>:
>
> da>
> da>
> da> > 2018/07/02 15:55、Hiroki Sato のメール:
> da> >
> da> > Eitan Adler  wrote
> da> >  in  n9zjwejh+di800k...@mail.gmail.com>:
> da> >
> da> > li> On 1 July 2018 at 10:08, Conrad Meyer  wrote:
> da> > li> > Hi Daichi,
> da> > li> >
> da> > li> >
> da> > li> >
> da> > li> > I don't think code to decode UTF-8 belongs in top(1).  I don't
> know
> da> > li> > what the goal of this routine is, but I doubt this is the
> right way to
> da> > li> > accomplish it.
> da> > li>
> da> > li> For the record, I agree. This is why I didn't click "accept" on
> the
> da> > li> revision. I don't fully oppose leaving it in top(1) for now as
> we work
> da> > li> out the API, but long term its the wrong place.
> da> > li>
> da> > li> https://reviews.freebsd.org/D16058 is the review.
> da> >
> da> > I strongly object this kind of encoding-specific routine.  Please
> da> > back out it.  The problem is that top(1) does not support multibyte
> da> > encoding in functions for printing, and using C99 wide/multibyte
> da> > character manipulation API such as iswprint(3) is the way to solve
> da> > it.  Doing getenv("LANG") and assuming an encoding based on it is a
> da> > very bad practice to internationalize software.
> da> >
> da> > -- Hiroki
> da>
> da> I respect what you mean.
> da>
> da> Once I back out, I will begin implementing it in a different way.
> da> Please advise which function should be used for implementation
> da> (iswprint (3) and what other functions should be used?)
>
>  Roughly speaking, POSIX/XPG/C99 I18N model requires the following
>  steps:
>
>  1. Call setlocale(LC_ALL, "") first.
>
>  2. Use mbs<->wcs and/or mb<->wc conversion functions in C95/C99 to
> manipulate characters and strings depending on what you want to
> do.  The printable() function should use mbtowc(3) and
> iswprint(3), for example.  And wcslen(3) should be used to
> determine the length of characters to be printed instead of
> strlen().
>
> Note that if mbs->wcs or mb->wc conversion fails with EILSEQ at
> some point, some of the character(s) are invalid for printing.
> This can happen because command-line parameters in top(1) are not
> always encoded in one specified in LC_CTYPE or LANG.  It should
> also be handled as non-printable.  However, to make matters worse,
> each process does not always use a single, same locale as top(1).
> A process invoked with LANG=ja_JP.eucJP may have EUC-JP characters
> in its ARGV array even if top(1) runs by another user whose LANG
> is en_US.UTF-8.  You have to determine which locale should be used
> before doing mb->wc conversion.  It is not so simple.
>
>  3. Print the multibyte characters by using strvisx(3) family, which
> supports multibyte character, or swprintf(3) family if you want to
> format wide characters directly.  Note that buffer length for
> strvisx(3) must be calculated by using MB_LEN_MAX.
>
>  I recommend you to learn about I18N by reading the following
>  documents since this involves an I18N programming model, not just a
>  matter of which function should be used.  While they are quite old
>  and contain system-specific topics, they are still useful to
>  understand general overview of how XPG4 and the relevant C95/C99 APIs
>  work:
>
>  [1] Developer's Guide to Internationalization (801-6660)
>  https://docs.oracle.com/cd/E19457-01/801-6660/801-6660.pdf
>
>  [2] Software Internationalization Guide (526225-002)
>
> https://support.hpe.com/hpsc/doc/public/display?docId=emr_na-c02131936
>
>  [3] ISO/IEC 9899:TC2 draft (p.204, Sec. 7.11 Localization)
>  http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1124.pdf
>
>  [4] Internationalization Guide, Version 2
>  ISBN: 978-0133535419
>
> -- Hiroki
>
___
svn-src-all@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"


Re: svn commit: r335836 - head/usr.bin/top

2018-07-02 Thread Hiroki Sato
後藤大地  wrote
  in <459bd898-8072-426e-a968-96c1382ac...@icloud.com>:

da>
da>
da> > 2018/07/02 15:55、Hiroki Sato のメール:
da> >
da> > Eitan Adler  wrote
da> >  in :
da> >
da> > li> On 1 July 2018 at 10:08, Conrad Meyer  wrote:
da> > li> > Hi Daichi,
da> > li> >
da> > li> >
da> > li> >
da> > li> > I don't think code to decode UTF-8 belongs in top(1).  I don't know
da> > li> > what the goal of this routine is, but I doubt this is the right way 
to
da> > li> > accomplish it.
da> > li>
da> > li> For the record, I agree. This is why I didn't click "accept" on the
da> > li> revision. I don't fully oppose leaving it in top(1) for now as we work
da> > li> out the API, but long term its the wrong place.
da> > li>
da> > li> https://reviews.freebsd.org/D16058 is the review.
da> >
da> > I strongly object this kind of encoding-specific routine.  Please
da> > back out it.  The problem is that top(1) does not support multibyte
da> > encoding in functions for printing, and using C99 wide/multibyte
da> > character manipulation API such as iswprint(3) is the way to solve
da> > it.  Doing getenv("LANG") and assuming an encoding based on it is a
da> > very bad practice to internationalize software.
da> >
da> > -- Hiroki
da>
da> I respect what you mean.
da>
da> Once I back out, I will begin implementing it in a different way.
da> Please advise which function should be used for implementation
da> (iswprint (3) and what other functions should be used?)

 Roughly speaking, POSIX/XPG/C99 I18N model requires the following
 steps:

 1. Call setlocale(LC_ALL, "") first.

 2. Use mbs<->wcs and/or mb<->wc conversion functions in C95/C99 to
manipulate characters and strings depending on what you want to
do.  The printable() function should use mbtowc(3) and
iswprint(3), for example.  And wcslen(3) should be used to
determine the length of characters to be printed instead of
strlen().

Note that if mbs->wcs or mb->wc conversion fails with EILSEQ at
some point, some of the character(s) are invalid for printing.
This can happen because command-line parameters in top(1) are not
always encoded in one specified in LC_CTYPE or LANG.  It should
also be handled as non-printable.  However, to make matters worse,
each process does not always use a single, same locale as top(1).
A process invoked with LANG=ja_JP.eucJP may have EUC-JP characters
in its ARGV array even if top(1) runs by another user whose LANG
is en_US.UTF-8.  You have to determine which locale should be used
before doing mb->wc conversion.  It is not so simple.

 3. Print the multibyte characters by using strvisx(3) family, which
supports multibyte character, or swprintf(3) family if you want to
format wide characters directly.  Note that buffer length for
strvisx(3) must be calculated by using MB_LEN_MAX.

 I recommend you to learn about I18N by reading the following
 documents since this involves an I18N programming model, not just a
 matter of which function should be used.  While they are quite old
 and contain system-specific topics, they are still useful to
 understand general overview of how XPG4 and the relevant C95/C99 APIs
 work:

 [1] Developer's Guide to Internationalization (801-6660)
 https://docs.oracle.com/cd/E19457-01/801-6660/801-6660.pdf

 [2] Software Internationalization Guide (526225-002)
 https://support.hpe.com/hpsc/doc/public/display?docId=emr_na-c02131936

 [3] ISO/IEC 9899:TC2 draft (p.204, Sec. 7.11 Localization)
 http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1124.pdf

 [4] Internationalization Guide, Version 2
 ISBN: 978-0133535419

-- Hiroki


pgpDOusTGA7M1.pgp
Description: PGP signature


Re: svn commit: r335836 - head/usr.bin/top

2018-07-02 Thread 後藤大地


> 2018/07/02 2:08、Conrad Meyer のメール:
> 
> Hi Daichi,
> 
> On Sat, Jun 30, 2018 at 10:32 PM, Daichi GOTO  wrote:
>> Author: daichi
>> Date: Sun Jul  1 05:32:03 2018
>> New Revision: 335836
>> URL: https://svnweb.freebsd.org/changeset/base/335836
>> 
>> Log:
>>  top(1) - support UTF-8 display
>> 
>> ...
>> ==
>> --- head/usr.bin/top/display.c  Sun Jul  1 01:56:40 2018(r335835)
>> +++ head/usr.bin/top/display.c  Sun Jul  1 05:32:03 2018(r335836)
>> @@ -1258,19 +1258,43 @@ line_update(char *old, char *new, int start, int 
>> line)
>> char *
>> printable(char str[])
>> {
>> -char *ptr;
>> -char ch;
>> +   char *ptr;
>> +   char ch;
>> 
>> -ptr = str;
>> -while ((ch = *ptr) != '\0')
>> -{
>> -   if (!isprint(ch))
>> -   {
>> -   *ptr = '?';
>> +   ptr = str;
>> +   if (utf8flag) {
>> +   while ((ch = *ptr) != '\0') {
>> +   if (0x00 == (0x80 & ch)) {
>> +   if (!isprint(ch)) {
>> +   *ptr = '?';
>> +   }
>> +   ++ptr;
>> +   } else if (0xC0 == (0xE0 & ch)) {
>> +   ++ptr;
>> +   if ('\0' != *ptr) ++ptr;
>> +   } else if (0xE0 == (0xF0 & ch)) {
>> +   ++ptr;
>> +   if ('\0' != *ptr) ++ptr;
>> +   if ('\0' != *ptr) ++ptr;
>> +   } else if (0xF0 == (0xF8 & ch)) {
>> +   ++ptr;
>> +   if ('\0' != *ptr) ++ptr;
>> +   if ('\0' != *ptr) ++ptr;
>> +   if ('\0' != *ptr) ++ptr;
>> +   } else {
>> +   *ptr = '?';
>> +   ++ptr;
>> +   }
>> +   }
>> +   } else {
>> +   while ((ch = *ptr) != '\0') {
>> +   if (!isprint(ch)) {
>> +   *ptr = '?';
>> +   }
>> +   ptr++;
>> +   }
>>}
> 
> 
> I don't think code to decode UTF-8 belongs in top(1).  I don't know
> what the goal of this routine is, but I doubt this is the right way to
> accomplish it.
> 
> For the strvisx portion it seems like support should be rolled into
> libc instead.

Hi Conrad,

The purpose is to display UTF-8 string. Certainly, you are right. 
In the end I think I should support it with libc. However, I think that 
it is a problem that UTF-8 can not be used with the top command for a long 
period of time. Although it may be transient, I think that keeping it to 
work is not that bad idea.

Best regards,
Daichi

> (Also, the patch in phabricator does not seem to match what was committed.)
> 
> Best,
> Conrad
> 

___
svn-src-all@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"


Re: svn commit: r335836 - head/usr.bin/top

2018-07-02 Thread 後藤大地


> 2018/07/02 15:55、Hiroki Sato のメール:
> 
> Eitan Adler  wrote
>  in :
> 
> li> On 1 July 2018 at 10:08, Conrad Meyer  wrote:
> li> > Hi Daichi,
> li> >
> li> >
> li> >
> li> > I don't think code to decode UTF-8 belongs in top(1).  I don't know
> li> > what the goal of this routine is, but I doubt this is the right way to
> li> > accomplish it.
> li>
> li> For the record, I agree. This is why I didn't click "accept" on the
> li> revision. I don't fully oppose leaving it in top(1) for now as we work
> li> out the API, but long term its the wrong place.
> li>
> li> https://reviews.freebsd.org/D16058 is the review.
> 
> I strongly object this kind of encoding-specific routine.  Please
> back out it.  The problem is that top(1) does not support multibyte
> encoding in functions for printing, and using C99 wide/multibyte
> character manipulation API such as iswprint(3) is the way to solve
> it.  Doing getenv("LANG") and assuming an encoding based on it is a
> very bad practice to internationalize software.
> 
> -- Hiroki

I respect what you mean.

Once I back out, I will begin implementing it in a different way.
Please advise which function should be used for implementation
(iswprint (3) and what other functions should be used?)

Best regards,
Daichi
___
svn-src-all@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"


Re: svn commit: r335836 - head/usr.bin/top

2018-07-02 Thread Hiroki Sato
Eitan Adler  wrote
  in :

li> On 1 July 2018 at 10:08, Conrad Meyer  wrote:
li> > Hi Daichi,
li> >
li> >
li> >
li> > I don't think code to decode UTF-8 belongs in top(1).  I don't know
li> > what the goal of this routine is, but I doubt this is the right way to
li> > accomplish it.
li>
li> For the record, I agree. This is why I didn't click "accept" on the
li> revision. I don't fully oppose leaving it in top(1) for now as we work
li> out the API, but long term its the wrong place.
li>
li> https://reviews.freebsd.org/D16058 is the review.

 I strongly object this kind of encoding-specific routine.  Please
 back out it.  The problem is that top(1) does not support multibyte
 encoding in functions for printing, and using C99 wide/multibyte
 character manipulation API such as iswprint(3) is the way to solve
 it.  Doing getenv("LANG") and assuming an encoding based on it is a
 very bad practice to internationalize software.

-- Hiroki


pgp7xYnpPuj_n.pgp
Description: PGP signature


Re: svn commit: r335836 - head/usr.bin/top

2018-07-01 Thread Eitan Adler
On 1 July 2018 at 10:08, Conrad Meyer  wrote:
> Hi Daichi,
>
>
>
> I don't think code to decode UTF-8 belongs in top(1).  I don't know
> what the goal of this routine is, but I doubt this is the right way to
> accomplish it.

For the record, I agree. This is why I didn't click "accept" on the
revision. I don't fully oppose leaving it in top(1) for now as we work
out the API, but long term its the wrong place.

https://reviews.freebsd.org/D16058 is the review.


-- 
Eitan Adler
___
svn-src-all@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"


Re: svn commit: r335836 - head/usr.bin/top

2018-07-01 Thread Conrad Meyer
On Sun, Jul 1, 2018 at 10:08 AM, Conrad Meyer  wrote:
> (Also, the patch in phabricator does not seem to match what was committed.)

It seems the right phabricator URL is https://reviews.freebsd.org/D16058 .

Best,
Conrad
___
svn-src-all@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"


Re: svn commit: r335836 - head/usr.bin/top

2018-07-01 Thread Conrad Meyer
Hi Daichi,

On Sat, Jun 30, 2018 at 10:32 PM, Daichi GOTO  wrote:
> Author: daichi
> Date: Sun Jul  1 05:32:03 2018
> New Revision: 335836
> URL: https://svnweb.freebsd.org/changeset/base/335836
>
> Log:
>   top(1) - support UTF-8 display
>
> ...
> ==
> --- head/usr.bin/top/display.c  Sun Jul  1 01:56:40 2018(r335835)
> +++ head/usr.bin/top/display.c  Sun Jul  1 05:32:03 2018(r335836)
> @@ -1258,19 +1258,43 @@ line_update(char *old, char *new, int start, int line)
>  char *
>  printable(char str[])
>  {
> -char *ptr;
> -char ch;
> +   char *ptr;
> +   char ch;
>
> -ptr = str;
> -while ((ch = *ptr) != '\0')
> -{
> -   if (!isprint(ch))
> -   {
> -   *ptr = '?';
> +   ptr = str;
> +   if (utf8flag) {
> +   while ((ch = *ptr) != '\0') {
> +   if (0x00 == (0x80 & ch)) {
> +   if (!isprint(ch)) {
> +   *ptr = '?';
> +   }
> +   ++ptr;
> +   } else if (0xC0 == (0xE0 & ch)) {
> +   ++ptr;
> +   if ('\0' != *ptr) ++ptr;
> +   } else if (0xE0 == (0xF0 & ch)) {
> +   ++ptr;
> +   if ('\0' != *ptr) ++ptr;
> +   if ('\0' != *ptr) ++ptr;
> +   } else if (0xF0 == (0xF8 & ch)) {
> +   ++ptr;
> +   if ('\0' != *ptr) ++ptr;
> +   if ('\0' != *ptr) ++ptr;
> +   if ('\0' != *ptr) ++ptr;
> +   } else {
> +   *ptr = '?';
> +   ++ptr;
> +   }
> +   }
> +   } else {
> +   while ((ch = *ptr) != '\0') {
> +   if (!isprint(ch)) {
> +   *ptr = '?';
> +   }
> +   ptr++;
> +   }
> }


I don't think code to decode UTF-8 belongs in top(1).  I don't know
what the goal of this routine is, but I doubt this is the right way to
accomplish it.

For the strvisx portion it seems like support should be rolled into
libc instead.

(Also, the patch in phabricator does not seem to match what was committed.)

Best,
Conrad
___
svn-src-all@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"


svn commit: r335836 - head/usr.bin/top

2018-07-01 Thread Daichi GOTO
Author: daichi
Date: Sun Jul  1 05:32:03 2018
New Revision: 335836
URL: https://svnweb.freebsd.org/changeset/base/335836

Log:
  top(1) - support UTF-8 display
  
  Reviewed by:  eadler
  Approved by:  gnn (mentor)
  Differential Revision:https://reviews.freebsd.org/D16006

Modified:
  head/usr.bin/top/display.c
  head/usr.bin/top/machine.c
  head/usr.bin/top/top.c
  head/usr.bin/top/top.h
  head/usr.bin/top/utils.c
  head/usr.bin/top/utils.h

Modified: head/usr.bin/top/display.c
==
--- head/usr.bin/top/display.c  Sun Jul  1 01:56:40 2018(r335835)
+++ head/usr.bin/top/display.c  Sun Jul  1 05:32:03 2018(r335836)
@@ -1258,19 +1258,43 @@ line_update(char *old, char *new, int start, int line)
 char *
 printable(char str[])
 {
-char *ptr;
-char ch;
+   char *ptr;
+   char ch;
 
-ptr = str;
-while ((ch = *ptr) != '\0')
-{
-   if (!isprint(ch))
-   {
-   *ptr = '?';
+   ptr = str;
+   if (utf8flag) {
+   while ((ch = *ptr) != '\0') {
+   if (0x00 == (0x80 & ch)) {
+   if (!isprint(ch)) {
+   *ptr = '?';
+   }
+   ++ptr;
+   } else if (0xC0 == (0xE0 & ch)) {
+   ++ptr;
+   if ('\0' != *ptr) ++ptr;
+   } else if (0xE0 == (0xF0 & ch)) {
+   ++ptr;
+   if ('\0' != *ptr) ++ptr;
+   if ('\0' != *ptr) ++ptr;
+   } else if (0xF0 == (0xF8 & ch)) {
+   ++ptr;
+   if ('\0' != *ptr) ++ptr;
+   if ('\0' != *ptr) ++ptr;
+   if ('\0' != *ptr) ++ptr;
+   } else {
+   *ptr = '?';
+   ++ptr;
+   }
+   }
+   } else {
+   while ((ch = *ptr) != '\0') {
+   if (!isprint(ch)) {
+   *ptr = '?';
+   }
+   ptr++;
+   }
}
-   ptr++;
-}
-return(str);
+   return(str);
 }
 
 void

Modified: head/usr.bin/top/machine.c
==
--- head/usr.bin/top/machine.c  Sun Jul  1 01:56:40 2018(r335835)
+++ head/usr.bin/top/machine.c  Sun Jul  1 05:32:03 2018(r335836)
@@ -988,9 +988,13 @@ format_next_process(struct handle * xhandle, char *(*g
if (*src == '\0')
continue;
len = (argbuflen - (dst - argbuf) - 1) / 4;
-   strvisx(dst, src,
-   MIN(strlen(src), len),
-   VIS_NL | VIS_CSTYLE);
+   if (utf8flag) {
+   utf8strvisx(dst, src, MIN(strlen(src), 
len));
+   } else {
+   strvisx(dst, src,
+   MIN(strlen(src), len),
+   VIS_NL | VIS_CSTYLE);
+   }
while (*dst != '\0')
dst++;
if ((argbuflen - (dst - argbuf) - 1) / 4 > 0)

Modified: head/usr.bin/top/top.c
==
--- head/usr.bin/top/top.c  Sun Jul  1 01:56:40 2018(r335835)
+++ head/usr.bin/top/top.c  Sun Jul  1 05:32:03 2018(r335836)
@@ -69,6 +69,7 @@ static int max_topn;  /* maximum displayable processes
 struct process_select ps;
 const char * myname = "top";
 pid_t mypid;
+bool utf8flag = false;
 
 /* pointers to display routines */
 static void (*d_loadave)(int mpid, double *avenrun) = i_loadave;
@@ -605,6 +606,14 @@ main(int argc, char *argv[])
sleep(3 * warnings);
fputc('\n', stderr);
 }
+
+   /* check if you are using UTF-8 */
+   char *env_lang;
+   if (NULL != (env_lang = getenv("LANG")) && 
+   0 != strcmp(env_lang, "") &&
+   NULL != strstr(env_lang, "UTF-8")) {
+   utf8flag = true;
+   }
 
 restart:
 

Modified: head/usr.bin/top/top.h
==
--- head/usr.bin/top/top.h  Sun Jul  1 01:56:40 2018(r335835)
+++ head/usr.bin/top/top.h  Sun Jul  1 05:32:03 2018(r335836)
@@ -8,6 +8,7 @@
 #define TOP_H
 
 #include 
+#include 
 
 /* Number of lines of header information on the