Bruno Haible <[email protected]> writes: > Robbie Harwood wrote: >> From: Vladimir 'phcoder' Serbinenko <[email protected]> >> >> [[email protected]: merge with later fix from [email protected]] >> Signed-off-by: Robbie Harwood <[email protected]> >> --- >> lib/argp-fmtstream.c | 80 +++++++++++++++++++++++++++++++++++++------- >> lib/argp-help.c | 3 +- >> lib/mbswidth.c | 15 +++++++++ >> lib/mbswidth.h | 4 +++ >> 4 files changed, 88 insertions(+), 14 deletions(-) > > This patch will need several more iterations, as it mixes two good > ideas, with - so far - imperfect implementations: > > * The need to use wcwidth in argp-fmtstream arises because the > help strings are internationalized. But for internationalized strings, > line breaking by looking for spaces is just wrong. (It works for > Russian and Greek but not for Chinese.) A much better algorithm > (that works for most languages, except Thai) is found in > GNU libunistring 1.0. But the code in glibc cannot use libunistring; > so there likely will need to be some '#ifdef _LIBC'. > > * The idea to parse escape sequences in a special way, before invoking > wcwidth, it nice. But's it's far from complete. IMO one needs to look > at GNU teseq, the ISO 2022 standard, and other de-facto standards > when implementing that. And it should then be implemented in wcswidth, > mbswidth, etc. uniformly.
I've put together at using libunistring (attached). Unfortunately it doesn't seem to find ulc_width_linebreaks() at link time, and I'm not familiar enough with gnulib to understand why. Feedback on that (or other parts of it) appreciated. Be well, --Robbie
signature.asc
Description: PGP signature
>From 12c09fc6632e295101ee6957c952bbf1b5c0165a Mon Sep 17 00:00:00 2001 From: Robbie Harwood <[email protected]> Date: Mon, 10 Jan 2022 17:17:39 -0500 Subject: [PATCH] argp-fmtstream: use ulc_width_linebreaks() ulc_width_linebreaks() can't be used in _LIBC, so fall back to a best-effort solution there that won't work with Chinese. Signed-off-by: Robbie Harwood <[email protected]> --- lib/argp-fmtstream.c | 361 +++++++++++++++++++------------------------ modules/argp | 1 + 2 files changed, 161 insertions(+), 201 deletions(-) diff --git a/lib/argp-fmtstream.c b/lib/argp-fmtstream.c index 78c29e38c..12d26effb 100644 --- a/lib/argp-fmtstream.c +++ b/lib/argp-fmtstream.c @@ -28,9 +28,11 @@ #include <errno.h> #include <stdarg.h> #include <ctype.h> +#include <wchar.h> #include "argp-fmtstream.h" #include "argp-namefrob.h" +#include "mbswidth.h" #ifndef ARGP_FMTSTREAM_USE_LINEWRAP @@ -42,6 +44,8 @@ # include <wchar.h> # include <libio/libioP.h> # define __vsnprintf(s, l, f, a) _IO_vsnprintf (s, l, f, a) +#else +# include <unilbrk.h> #endif #define INIT_BUF_SIZE 200 @@ -115,233 +119,188 @@ weak_alias (__argp_fmtstream_free, argp_fmtstream_free) #endif #endif -/* Process FS's buffer so that line wrapping is done from POINT_OFFS to the - end of its buffer. This code is mostly from glibc stdio/linewrap.c. */ -void -__argp_fmtstream_update (argp_fmtstream_t fs) +/* Insert suffix and left margin at POINT_OFFS, flushing as needed. */ +static void +line_at_point (argp_fmtstream_t fs, const char *suffix) { - char *buf, *nl; - size_t len; + size_t i, space_needed = fs->wmargin; + char *nl, *queue = fs->buf + fs->point_offs; - /* Scan the buffer for newlines. */ - buf = fs->buf + fs->point_offs; - while (buf < fs->p) + if (suffix) + space_needed += strlen(suffix); + + while (fs->p + space_needed > fs->end) { - size_t r; - - if (fs->point_col == 0 && fs->lmargin != 0) + /* Output the first line so we can use the space. */ + nl = memchr (fs->buf, '\n', fs->point_offs); + if (nl == NULL) { - /* We are starting a new line. Print spaces to the left margin. */ - const size_t pad = fs->lmargin; - if (fs->p + pad < fs->end) - { - /* We can fit in them in the buffer by moving the - buffer text up and filling in the beginning. */ - memmove (buf + pad, buf, fs->p - buf); - fs->p += pad; /* Compensate for bigger buffer. */ - memset (buf, ' ', pad); /* Fill in the spaces. */ - buf += pad; /* Don't bother searching them. */ - } - else - { - /* No buffer space for spaces. Must flush. */ - size_t i; - for (i = 0; i < pad; i++) - { + /* Line longer than buffer - shouldn't happen. Truncate. */ + nl = queue - 1; + *nl = '\n'; + } + #ifdef _LIBC - if (_IO_fwide (fs->stream, 0) > 0) - putwc_unlocked (L' ', fs->stream); - else + __fxprintf (fs->stream, "%.*s\n", + (int) (nl - fs->buf), fs->buf); +#else + if (nl > fs->buf) + fwrite_unlocked (fs->buf, 1, nl - fs->buf, fs->stream); + putc_unlocked ('\n', fs->stream); #endif - putc_unlocked (' ', fs->stream); - } - } - fs->point_col = pad; - } - len = fs->p - buf; - nl = memchr (buf, '\n', len); + memmove (fs->buf, nl + 1, nl + 1 - fs->buf); + fs->p -= nl + 1 - fs->buf; + fs->point_offs -= nl + 1 - fs->buf; + } - if (fs->point_col < 0) - fs->point_col = 0; + memmove (queue + space_needed, queue, fs->p - queue); + if (suffix) + { + memcpy (queue, suffix, strlen (suffix)); + fs->point_offs += strlen (suffix); + } + for (i = 0; i < fs->lmargin && fs->point_col != -1; i++) + fs->buf[fs->point_offs++] = ' '; + fs->point_col = fs->lmargin; +} - if (!nl) +#ifdef _LIBC +# define UC_BREAK_UNDEFINED 0 +# define UC_BREAK_POSSIBLE 1 +# define UC_BREAK_HYPHENATION 2 +# define UC_BREAK_MANDATORY 3 +static int +uwl_shim(const char *s, size_t n, int width, int start_column, + const char *encoding, char *p) +{ + size_t i = 0, cur_col = start_column, last_option = 0, c_len; + mbstate_t ps; + wchar_t c; + + memset(&ps, 0, sizeof (ps)); + memset(p, UC_BREAK_UNDEFINED, n); + + while (i < n) + { + c_len = mbrtowc (&c, s + i, n, &ps); + if (c_len == 0 || c_len == -2) { - /* The buffer ends in a partial line. */ - - if (fs->point_col + len < fs->rmargin) - { - /* The remaining buffer text is a partial line and fits - within the maximum line width. Advance point for the - characters to be written and stop scanning. */ - fs->point_col += len; - break; - } - else - /* Set the end-of-line pointer for the code below to - the end of the buffer. */ - nl = fs->p; + break; } - else if (fs->point_col + (nl - buf) < (ssize_t) fs->rmargin) + else if (c_len == -1) { - /* The buffer contains a full line that fits within the maximum - line width. Reset point and scan the next line. */ - fs->point_col = 0; - buf = nl + 1; + /* Malformed. Walk forward until things make sense again. */ + i++; + cur_col++; continue; } - /* This line is too long. */ - r = fs->rmargin - 1; - - if (fs->wmargin < 0) + if (c == L'\n') { - /* Truncate the line by overwriting the excess with the - newline and anything after it in the buffer. */ - if (nl < fs->p) + p[i] = UC_BREAK_MANDATORY; + last_option = 0; + cur_col = 0; + } + else if (c == L' ' || c == '\t') + { + /* Best effort - won't help with Chinese or Thai. */ + last_option = i; + } + else if (cur_col >= width && last_option != 0) + { + p[last_option] = UC_BREAK_POSSIBLE; + last_option = 0; + cur_col = 0; + } + + i += c_len; + cur_col += wcwidth(c); + } + + return n - last_break; +} +#define ulc_width_linebreaks(s, n, width, start_column, at_end_columns, \ + o, encoding, p) \ + uwl_shim(s, n, width, start_column, encoding, p) +#endif + +/* Process FS's buffer so that line wrapping is done from POINT_OFFS to the + end of its buffer. */ +void +__argp_fmtstream_update (argp_fmtstream_t fs) +{ + char *lns, *nl, *c, *queue = fs->buf + fs->point_offs; + int col; + size_t i, proc_len = fs->p - queue; + + if (queue >= fs->p) + return; + + if (fs->wmargin == -1) + { + while (fs->buf + fs->point_offs < fs->p) + { + nl = memchr (fs->buf + fs->point_offs, '\n', + fs->rmargin - fs->point_col); + if (nl) { - memmove (buf + (r - fs->point_col), nl, fs->p - nl); - fs->p -= buf + (r - fs->point_col) - nl; - /* Reset point for the next line and start scanning it. */ - fs->point_col = 0; - buf += r + 1; /* Skip full line plus \n. */ + fs->point_offs = nl - queue; + line_at_point (fs, NULL); + continue; } - else + + /* Truncate until end of line. */ + fs->point_offs += fs->rmargin - fs->point_col; + nl = memchr (fs->buf + fs->point_offs + fs->rmargin - fs->point_col, + '\n', fs->p - fs->buf - fs->point_offs); + if (! nl) { - /* The buffer ends with a partial line that is beyond the - maximum line width. Advance point for the characters - written, and discard those past the max from the buffer. */ - fs->point_col += len; - fs->p -= fs->point_col - r; - break; + /* This is the last line. */ + fs->buf[fs->point_offs++] = '\n'; + fs->p = fs->buf + fs->point_offs; + line_at_point(fs, NULL); + return; } + + fs->buf[fs->point_offs++] = '\n'; + memmove (fs->buf + fs->point_offs, nl + 1, + nl + 1 - fs->buf - fs->point_offs); + line_at_point(fs, NULL); + } + return; + } + + lns = (char *) malloc (proc_len); + if (! lns) + return; + + col = ulc_width_linebreaks (queue, proc_len, fs->rmargin - fs->wmargin, + fs->point_col == -1 ? 0 : fs->point_col, 0, + NULL, locale_charset (), lns); + for (i = 0; i < proc_len; i++) + { + if (lns[i] == UC_BREAK_HYPHENATION) + { + line_at_point (fs, "-\n"); + } + else if (lns[i] == UC_BREAK_POSSIBLE) + { + line_at_point (fs, "\n"); + } + else if (lns[i] == UC_BREAK_MANDATORY) + { + fs->point_offs++; + line_at_point (fs, NULL); } else { - /* Do word wrap. Go to the column just past the maximum line - width and scan back for the beginning of the word there. - Then insert a line break. */ - - char *p, *nextline; - int i; - - p = buf + (r + 1 - fs->point_col); - while (p >= buf && !isblank ((unsigned char) *p)) - --p; - nextline = p + 1; /* This will begin the next line. */ - - if (nextline > buf) - { - /* Swallow separating blanks. */ - if (p >= buf) - do - --p; - while (p >= buf && isblank ((unsigned char) *p)); - nl = p + 1; /* The newline will replace the first blank. */ - } - else - { - /* A single word that is greater than the maximum line width. - Oh well. Put it on an overlong line by itself. */ - p = buf + (r + 1 - fs->point_col); - /* Find the end of the long word. */ - if (p < nl) - do - ++p; - while (p < nl && !isblank ((unsigned char) *p)); - if (p == nl) - { - /* It already ends a line. No fussing required. */ - fs->point_col = 0; - buf = nl + 1; - continue; - } - /* We will move the newline to replace the first blank. */ - nl = p; - /* Swallow separating blanks. */ - do - ++p; - while (isblank ((unsigned char) *p)); - /* The next line will start here. */ - nextline = p; - } - - /* Note: There are a bunch of tests below for - NEXTLINE == BUF + LEN + 1; this case is where NL happens to fall - at the end of the buffer, and NEXTLINE is in fact empty (and so - we need not be careful to maintain its contents). */ - - if ((nextline == buf + len + 1 - ? fs->end - nl < fs->wmargin + 1 - : nextline - (nl + 1) < fs->wmargin) - && fs->p > nextline) - { - /* The margin needs more blanks than we removed. */ - if (fs->end - fs->p > fs->wmargin + 1) - /* Make some space for them. */ - { - size_t mv = fs->p - nextline; - memmove (nl + 1 + fs->wmargin, nextline, mv); - nextline = nl + 1 + fs->wmargin; - len = nextline + mv - buf; - *nl++ = '\n'; - } - else - /* Output the first line so we can use the space. */ - { -#ifdef _LIBC - __fxprintf (fs->stream, "%.*s\n", - (int) (nl - fs->buf), fs->buf); -#else - if (nl > fs->buf) - fwrite_unlocked (fs->buf, 1, nl - fs->buf, fs->stream); - putc_unlocked ('\n', fs->stream); -#endif - - len += buf - fs->buf; - nl = buf = fs->buf; - } - } - else - /* We can fit the newline and blanks in before - the next word. */ - *nl++ = '\n'; - - if (nextline - nl >= fs->wmargin - || (nextline == buf + len + 1 && fs->end - nextline >= fs->wmargin)) - /* Add blanks up to the wrap margin column. */ - for (i = 0; i < fs->wmargin; ++i) - *nl++ = ' '; - else - for (i = 0; i < fs->wmargin; ++i) -#ifdef _LIBC - if (_IO_fwide (fs->stream, 0) > 0) - putwc_unlocked (L' ', fs->stream); - else -#endif - putc_unlocked (' ', fs->stream); - - /* Copy the tail of the original buffer into the current buffer - position. */ - if (nl < nextline) - memmove (nl, nextline, buf + len - nextline); - len -= nextline - buf; - - /* Continue the scan on the remaining lines in the buffer. */ - buf = nl; - - /* Restore bufp to include all the remaining text. */ - fs->p = nl + len; - - /* Reset the counter of what has been output this line. If wmargin - is 0, we want to avoid the lmargin getting added, so we set - point_col to a magic value of -1 in that case. */ - fs->point_col = fs->wmargin ? fs->wmargin : -1; + fs->point_offs++; } } + fs->point_col = col; - /* Remember that we've scanned as far as the end of the buffer. */ - fs->point_offs = fs->p - fs->buf; + free (lns); } /* Ensure that FS has space for AMOUNT more bytes in its buffer, either by diff --git a/modules/argp b/modules/argp index 85eefe2b1..1a958b1f1 100644 --- a/modules/argp +++ b/modules/argp @@ -36,6 +36,7 @@ stdio strerror memchr memmove +libunistring configure.ac: gl_ARGP -- 2.34.1
