Re: [PATCH v2] fetch: align new ref summary printout in UTF-8 locales

2012-09-12 Thread Nguyen Thai Ngoc Duy
Ping.. what happens to this patch? Do you want to support other
encodings as well via iconv()? I can't test that though.

On Tue, Sep 4, 2012 at 5:39 PM, Nguyễn Thái Ngọc Duy pclo...@gmail.com wrote:
 fetch does printf(%-*s, width, foo) where foo can be a utf-8
 string, but width is in bytes, not columns. For ASCII it's fine as one
 byte takes one column. For utf-8, this may result in misaligned ref
 summary table.

 Introduce gettext_width() function that returns the string length in
 columns (currently only supports utf-8 locales). Make the code use
 TRANSPORT_SUMMARY(x) where the length is compensated properly in
 non-English locales.

 Signed-off-by: Nguyễn Thái Ngọc Duy pclo...@gmail.com
 ---
   - rename gettext_length() to gettext_width()
   - use width instead of letters
   - leave other %-*s places unchanged if they always take ascii
 strings (i.e. no _() calls)
   - note to self, may need to i18n-ize print_ref_status() in
 transport.c, looks like it's used by git-push only

  builtin/fetch.c | 15 +++
  gettext.c   | 15 +--
  gettext.h   |  5 +
  transport.h |  1 +
  4 files changed, 26 insertions(+), 10 deletions(-)

 diff --git a/builtin/fetch.c b/builtin/fetch.c
 index bb9a074..85e291f 100644
 --- a/builtin/fetch.c
 +++ b/builtin/fetch.c
 @@ -255,9 +255,8 @@ static int update_local_ref(struct ref *ref,
 if (!hashcmp(ref-old_sha1, ref-new_sha1)) {
 if (verbosity  0)
 strbuf_addf(display, = %-*s %-*s - %s,
 -   TRANSPORT_SUMMARY_WIDTH,
 -   _([up to date]), REFCOL_WIDTH,
 -   remote, pretty_ref);
 +   TRANSPORT_SUMMARY(_([up to date])),
 +   REFCOL_WIDTH, remote, pretty_ref);
 return 0;
 }

 @@ -271,7 +270,7 @@ static int update_local_ref(struct ref *ref,
  */
 strbuf_addf(display,
 _(! %-*s %-*s - %s  (can't fetch in current 
 branch)),
 -   TRANSPORT_SUMMARY_WIDTH, _([rejected]),
 +   TRANSPORT_SUMMARY(_([rejected])),
 REFCOL_WIDTH, remote, pretty_ref);
 return 1;
 }
 @@ -282,7 +281,7 @@ static int update_local_ref(struct ref *ref,
 r = s_update_ref(updating tag, ref, 0);
 strbuf_addf(display, %c %-*s %-*s - %s%s,
 r ? '!' : '-',
 -   TRANSPORT_SUMMARY_WIDTH, _([tag update]),
 +   TRANSPORT_SUMMARY(_([tag update])),
 REFCOL_WIDTH, remote, pretty_ref,
 r ? _(  (unable to update local ref)) : );
 return r;
 @@ -317,7 +316,7 @@ static int update_local_ref(struct ref *ref,
 r = s_update_ref(msg, ref, 0);
 strbuf_addf(display, %c %-*s %-*s - %s%s,
 r ? '!' : '*',
 -   TRANSPORT_SUMMARY_WIDTH, what,
 +   TRANSPORT_SUMMARY(what),
 REFCOL_WIDTH, remote, pretty_ref,
 r ? _(  (unable to update local ref)) : );
 return r;
 @@ -357,7 +356,7 @@ static int update_local_ref(struct ref *ref,
 return r;
 } else {
 strbuf_addf(display, ! %-*s %-*s - %s  %s,
 -   TRANSPORT_SUMMARY_WIDTH, _([rejected]),
 +   TRANSPORT_SUMMARY(_([rejected])),
 REFCOL_WIDTH, remote, pretty_ref,
 _((non-fast-forward)));
 return 1;
 @@ -554,7 +553,7 @@ static int prune_refs(struct refspec *refs, int 
 ref_count, struct ref *ref_map)
 result |= delete_ref(ref-name, NULL, 0);
 if (verbosity = 0) {
 fprintf(stderr,  x %-*s %-*s - %s\n,
 -   TRANSPORT_SUMMARY_WIDTH, _([deleted]),
 +   TRANSPORT_SUMMARY(_([deleted])),
 REFCOL_WIDTH, _((none)), 
 prettify_refname(ref-name));
 warn_dangling_symref(stderr, dangling_msg, ref-name);
 }
 diff --git a/gettext.c b/gettext.c
 index f75bca7..71e9545 100644
 --- a/gettext.c
 +++ b/gettext.c
 @@ -4,6 +4,8 @@

  #include git-compat-util.h
  #include gettext.h
 +#include strbuf.h
 +#include utf8.h

  #ifndef NO_GETTEXT
  #  include locale.h
 @@ -27,10 +29,9 @@ int use_gettext_poison(void)
  #endif

  #ifndef NO_GETTEXT
 +static const char *charset;
  static void init_gettext_charset(const char *domain)
  {
 -   const char *charset;
 -
 /*
This trick arranges for messages to be emitted in the user's
requested encoding, but avoids 

Re: [PATCH v2] fetch: align new ref summary printout in UTF-8 locales

2012-09-12 Thread Junio C Hamano
Nguyen Thai Ngoc Duy pclo...@gmail.com writes:

 Ping.. what happens to this patch? Do you want to support other
 encodings as well via iconv()? I can't test that though.

I thought I refuted a potential blocker, which was an implied
objection from Torsten, in $gmane/204912 already.  As long as we
make it clear that your patch helps only ASCII/UTF-8 only audience
but we still try to be nicer to 'others' by doing two things I
said in the message, I think we should proceed without iconv() to
keep things simple.
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2] fetch: align new ref summary printout in UTF-8 locales

2012-09-12 Thread Torsten Bögershausen
On 12.09.12 20:02, Junio C Hamano wrote:
 Nguyen Thai Ngoc Duy pclo...@gmail.com writes:
 
 Ping.. what happens to this patch? Do you want to support other
 encodings as well via iconv()? I can't test that though.
 
 I thought I refuted a potential blocker, which was an implied
 objection from Torsten, in $gmane/204912 already.  As long as we
 make it clear that your patch helps only ASCII/UTF-8 only audience
 but we still try to be nicer to 'others' by doing two things I
 said in the message, I think we should proceed without iconv() to
 keep things simple.

Please unblock and proceed (as I can't test other encodings either)

And even special thanks for the excellent lines from Junio,
which explained the update philosophy in git so well, 
that I take the freedom to re-post them here:

 I try to re-phrase my question:

 Do installations still exist which use e.g. BIG5 or any other
 multi byte encoding which is not UTF-8?

Yes.

 Do we want to support other encodings than ASCII or UTF-8?
 (Because then the screen width needs to be calculate different, I think)

That depends on who we are and what timeframe you have in mind.

Do our developers care about these encodings so much that we would
reject ASCCI/UTF-8 only patch and wait until we enhance it to do
the right thing for other encodings that we do not use ourselves?
No.  That does not make any sense, especially when we know we will
not have a good test coverage on the additional parts that we will
not be using ourselves.

This change only helps people with ASCII or UTF-8 and does not help
others alone is never a valid reason to reject a change, but we
still try to be nicer to others that may come after we leave this
topic behind by doing a few things:

 - If the change will make things worse than it currently is for
   others, we would try to minimize the regression for them.

 - If the change will make the code harder to update later to
   enhance with additional change to support others, we would try
   to anticipate what kind of changes are needed on top, and
   structure the code in such a way that future changes can be made
   cleanly.

For the first point, for multi-byte encodings (e.g. ISO-2022), the
display columns and byte length do not match and in general byte
length is longer than the display columns in the current code.  With
the current code that measures the required columns across elements
by taking the maximum of byte length, they will see wrong number of
filler, so they are already getting a wrong alignment.  With the
UTF-8 only change, the required columns and the filler will be
calculated by assuming that the string is in UTF-8, which may make
the computation off in a different way, and if we underestimate the
display columns for a string, they may see the strings truncated,
which is bad.

So as long as gettext_width() punts and returns strlen() for a
malformed UTF-8, it would be OK [*1*].

For the second point, I think the API here is a string, give me the
number of display columns it will occupy, as I am interested in
aligning them is a good abstraction that can be later enhanced to
other encodings fairly easily, so I do not see a problem in the
patch that goes in that direction.



[Footnote]

*1* If the patch used utf_strwidth() (which I didn't bother to go
back and check, though), it should be OK.  The underlying
utf8_width() will reject a malformed UTF-8 sequence and the code
falls back to strlen().

 










--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2] fetch: align new ref summary printout in UTF-8 locales

2012-09-06 Thread Nguyen Thai Ngoc Duy
On Thu, Sep 6, 2012 at 2:20 AM, Torsten Bögershausen tbo...@web.de wrote:
 On 09/05/2012 08:15 PM, Torsten Bögershausen wrote:

 On 04.09.12 12:39, Nguyễn Thái Ngọc Duy wrote:

 +/* return the number of columns of string 's' in current locale */
 +int gettext_width(const char *s)
 +{
 +   static int is_utf8 = -1;
 +   if (is_utf8 == -1)
 +   is_utf8 = !strcmp(charset, UTF-8);
 +
 +   return is_utf8 ? utf8_strwidth(s) : strlen(s);


 Will that work for non-ASCII encodings?
 For ISO-8859-x we can say strlen() == strwidth(),
 but for other encodings using multibytes that doesn't work, does it?

No it does not. I think I mentioned that in the first version that I
was only interested in utf-8. Others can extend the function for their
favourite encodings.

 (Sorry the message went out before completely written)
 Something like that:

 int gettext_width(const char *s) {

   static int is_utf8 = -1;

   if (is_utf8 == -1)

 is_utf8 = !strcmp(charset, UTF-8);

   if (is_utf8)
 return utf8_strwidth(s);
   else  {
 char *s_utf = reencode_string(s, UTF-8, charset);
 if (s_utf) {
   witdh = utf8_strwidth(s_utf);
   free(s_utf);
 } else
   width = strlen(s);

 return width;
 }

Yes, something like that, assuming that column information is intact
after the conversion. Maybe you can make that a new function, int
strwidth(const char *str, const char *charset), and make
gettext_strwidth() a thin wrapper:

int gettext_strwidth(const char *s)
{
   return strwidth(s, charset);
}
-- 
Duy
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2] fetch: align new ref summary printout in UTF-8 locales

2012-09-06 Thread Nguyen Thai Ngoc Duy
On Thu, Sep 6, 2012 at 2:20 AM, Torsten Bögershausen tbo...@web.de wrote:
 Will that work for non-ASCII encodings?
 For ISO-8859-x we can say strlen() == strwidth(),
 but for other encodings using multibytes that doesn't work, does it?

BTW if you are interested in supporting non-utf8 output, you may want
to look at 1452bd6 (branch -v: align even when branch names are in
UTF-8 - 2012-08-26), which assumes branches are in utf-8. So you have
to convert them to output charset before printing.
-- 
Duy
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2] fetch: align new ref summary printout in UTF-8 locales

2012-09-06 Thread Torsten Bögershausen

Am 06.09.2012 um 17:36 schrieb Nguyen Thai Ngoc Duy:

 On Thu, Sep 6, 2012 at 2:20 AM, Torsten Bögershausen tbo...@web.de wrote:
 Will that work for non-ASCII encodings?
 For ISO-8859-x we can say strlen() == strwidth(),
 but for other encodings using multibytes that doesn't work, does it?
 
 BTW if you are interested in supporting non-utf8 output, you may want
 to look at 1452bd6 (branch -v: align even when branch names are in
 UTF-8 - 2012-08-26), which assumes branches are in utf-8. So you have
 to convert them to output charset before printing.
 -- 
 Duy

Thanks,
I try to re-phrase my question:

Do installations still exist which use e.g. BIG5 or any other
multi byte encoding which is not UTF-8?

Do we want to support other encodings than ASCII or UTF-8?
(Because then the screen width needs to be calculate different, I think)

/Torsten



--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2] fetch: align new ref summary printout in UTF-8 locales

2012-09-06 Thread Junio C Hamano
Torsten Bögershausen tbo...@web.de writes:

 I try to re-phrase my question:

 Do installations still exist which use e.g. BIG5 or any other
 multi byte encoding which is not UTF-8?

Yes.

 Do we want to support other encodings than ASCII or UTF-8?
 (Because then the screen width needs to be calculate different, I think)

That depends on who we are and what timeframe you have in mind.

Do our developers care about these encodings so much that we would
reject ASCCI/UTF-8 only patch and wait until we enhance it to do
the right thing for other encodings that we do not use ourselves?
No.  That does not make any sense, especially when we know we will
not have a good test coverage on the additional parts that we will
not be using ourselves.

This change only helps people with ASCII or UTF-8 and does not help
others alone is never a valid reason to reject a change, but we
still try to be nicer to others that may come after we leave this
topic behind by doing a few things:

 - If the change will make things worse than it currently is for
   others, we would try to minimize the regression for them.

 - If the change will make the code harder to update later to
   enhance with additional change to support others, we would try
   to anticipate what kind of changes are needed on top, and
   structure the code in such a way that future changes can be made
   cleanly.

For the first point, for multi-byte encodings (e.g. ISO-2022), the
display columns and byte length do not match and in general byte
length is longer than the display columns in the current code.  With
the current code that measures the required columns across elements
by taking the maximum of byte length, they will see wrong number of
filler, so they are already getting a wrong alignment.  With the
UTF-8 only change, the required columns and the filler will be
calculated by assuming that the string is in UTF-8, which may make
the computation off in a different way, and if we underestimate the
display columns for a string, they may see the strings truncated,
which is bad.

So as long as gettext_width() punts and returns strlen() for a
malformed UTF-8, it would be OK [*1*].

For the second point, I think the API here is a string, give me the
number of display columns it will occupy, as I am interested in
aligning them is a good abstraction that can be later enhanced to
other encodings fairly easily, so I do not see a problem in the
patch that goes in that direction.



[Footnote]

*1* If the patch used utf_strwidth() (which I didn't bother to go
back and check, though), it should be OK.  The underlying
utf8_width() will reject a malformed UTF-8 sequence and the code
falls back to strlen().
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2] fetch: align new ref summary printout in UTF-8 locales

2012-09-05 Thread Torsten Bögershausen

On 09/05/2012 08:15 PM, Torsten Bögershausen wrote:

On 04.09.12 12:39, Nguyễn Thái Ngọc Duy wrote:

+/* return the number of columns of string 's' in current locale */
+int gettext_width(const char *s)
+{
+   static int is_utf8 = -1;
+   if (is_utf8 == -1)
+   is_utf8 = !strcmp(charset, UTF-8);
+
+   return is_utf8 ? utf8_strwidth(s) : strlen(s);


Will that work for non-ASCII encodings?
For ISO-8859-x we can say strlen() == strwidth(),
but for other encodings using multibytes that doesn't work, does it?

(Sorry the message went out before completely written)
Something like that:

int gettext_width(const char *s) {
  static int is_utf8 = -1;

  if (is_utf8 == -1)
is_utf8 = !strcmp(charset, UTF-8);

  if (is_utf8)
return utf8_strwidth(s);
  else  {
char *s_utf = reencode_string(s, UTF-8, charset);
if (s_utf) {
  witdh = utf8_strwidth(s_utf);
  free(s_utf);
} else
  width = strlen(s);

return width;
}

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2] fetch: align new ref summary printout in UTF-8 locales

2012-09-04 Thread Nguyễn Thái Ngọc Duy
fetch does printf(%-*s, width, foo) where foo can be a utf-8
string, but width is in bytes, not columns. For ASCII it's fine as one
byte takes one column. For utf-8, this may result in misaligned ref
summary table.

Introduce gettext_width() function that returns the string length in
columns (currently only supports utf-8 locales). Make the code use
TRANSPORT_SUMMARY(x) where the length is compensated properly in
non-English locales.

Signed-off-by: Nguyễn Thái Ngọc Duy pclo...@gmail.com
---
  - rename gettext_length() to gettext_width()
  - use width instead of letters
  - leave other %-*s places unchanged if they always take ascii
strings (i.e. no _() calls)
  - note to self, may need to i18n-ize print_ref_status() in
transport.c, looks like it's used by git-push only

 builtin/fetch.c | 15 +++
 gettext.c   | 15 +--
 gettext.h   |  5 +
 transport.h |  1 +
 4 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/builtin/fetch.c b/builtin/fetch.c
index bb9a074..85e291f 100644
--- a/builtin/fetch.c
+++ b/builtin/fetch.c
@@ -255,9 +255,8 @@ static int update_local_ref(struct ref *ref,
if (!hashcmp(ref-old_sha1, ref-new_sha1)) {
if (verbosity  0)
strbuf_addf(display, = %-*s %-*s - %s,
-   TRANSPORT_SUMMARY_WIDTH,
-   _([up to date]), REFCOL_WIDTH,
-   remote, pretty_ref);
+   TRANSPORT_SUMMARY(_([up to date])),
+   REFCOL_WIDTH, remote, pretty_ref);
return 0;
}
 
@@ -271,7 +270,7 @@ static int update_local_ref(struct ref *ref,
 */
strbuf_addf(display,
_(! %-*s %-*s - %s  (can't fetch in current 
branch)),
-   TRANSPORT_SUMMARY_WIDTH, _([rejected]),
+   TRANSPORT_SUMMARY(_([rejected])),
REFCOL_WIDTH, remote, pretty_ref);
return 1;
}
@@ -282,7 +281,7 @@ static int update_local_ref(struct ref *ref,
r = s_update_ref(updating tag, ref, 0);
strbuf_addf(display, %c %-*s %-*s - %s%s,
r ? '!' : '-',
-   TRANSPORT_SUMMARY_WIDTH, _([tag update]),
+   TRANSPORT_SUMMARY(_([tag update])),
REFCOL_WIDTH, remote, pretty_ref,
r ? _(  (unable to update local ref)) : );
return r;
@@ -317,7 +316,7 @@ static int update_local_ref(struct ref *ref,
r = s_update_ref(msg, ref, 0);
strbuf_addf(display, %c %-*s %-*s - %s%s,
r ? '!' : '*',
-   TRANSPORT_SUMMARY_WIDTH, what,
+   TRANSPORT_SUMMARY(what),
REFCOL_WIDTH, remote, pretty_ref,
r ? _(  (unable to update local ref)) : );
return r;
@@ -357,7 +356,7 @@ static int update_local_ref(struct ref *ref,
return r;
} else {
strbuf_addf(display, ! %-*s %-*s - %s  %s,
-   TRANSPORT_SUMMARY_WIDTH, _([rejected]),
+   TRANSPORT_SUMMARY(_([rejected])),
REFCOL_WIDTH, remote, pretty_ref,
_((non-fast-forward)));
return 1;
@@ -554,7 +553,7 @@ static int prune_refs(struct refspec *refs, int ref_count, 
struct ref *ref_map)
result |= delete_ref(ref-name, NULL, 0);
if (verbosity = 0) {
fprintf(stderr,  x %-*s %-*s - %s\n,
-   TRANSPORT_SUMMARY_WIDTH, _([deleted]),
+   TRANSPORT_SUMMARY(_([deleted])),
REFCOL_WIDTH, _((none)), 
prettify_refname(ref-name));
warn_dangling_symref(stderr, dangling_msg, ref-name);
}
diff --git a/gettext.c b/gettext.c
index f75bca7..71e9545 100644
--- a/gettext.c
+++ b/gettext.c
@@ -4,6 +4,8 @@
 
 #include git-compat-util.h
 #include gettext.h
+#include strbuf.h
+#include utf8.h
 
 #ifndef NO_GETTEXT
 #  include locale.h
@@ -27,10 +29,9 @@ int use_gettext_poison(void)
 #endif
 
 #ifndef NO_GETTEXT
+static const char *charset;
 static void init_gettext_charset(const char *domain)
 {
-   const char *charset;
-
/*
   This trick arranges for messages to be emitted in the user's
   requested encoding, but avoids setting LC_CTYPE from the
@@ -128,4 +129,14 @@ void git_setup_gettext(void)
init_gettext_charset(git);
textdomain(git);
 }
+
+/* return the number of columns of string 's' in current locale */
+int gettext_width(const char *s)
+{
+   static int is_utf8 = -1;
+   if (is_utf8 == -1)
+