Hi, I have been working a bit on a patch for that problem in psql. The patch is far from being ready for inclusion or whatever, it's just for comments... By the way, can someone tell me how to generate nice patches showing the difference between one's version and the cvs code that has been downloaded ? I'm new to this (I've only used cvs for personal projects so far, and I don't need to send patches to myself ;) ). The good things in this patch : - it works for me :) - I've used Markus Kuhn's implementation of wcwidth.c : it is locale-independent, and is in the public domain. :) [if we keep it, I'll have to tell him, though !] - No dependency on the local libc's UTF-8-awareness ;) [I've seen that psql has no such dependency, at least in print.c, so I haven't added any]. Actually, the change is completely self-contained. - I've made my own utf-8 -> ucs converter, since I haven't found any without a copyright notice yesterday. It checks for invalid and non-optimal UTF-8 sequences, as required by Unicode 3.0.1 (or 3.1, I don't remember). - it works for Japanese (and I believe other "full-width" characters). - if MULTIBYTE is not defined, the code doesn't change from the committed version. The not so good things : - I've made my own utf-8 -> ucs converter... It seems to work fine, but it's not tested well enough, it may not be so robust. - The printf( "%*s", width, utfstr) doesn't work as expected, so I had to fix it by doing printf( "%*s%s", width - utfstrwidth, "", utfstr); - everything is in #ifdef MULTIBYTE/#endif . Since there is no dependency on anything else (including the rest of the multibyte implementation - which I haven't had the time to look at in detail), it doesn't depend on it. - I get this (for each call to pg_mb_utfs_width) and I don't know why : print.c:265: warning: passing arg 1 of `pg_mb_utfs_width' discards qualifiers from pointer target type - If pg_mb_utfs_width finds an invalid UTF-8 string, it truncates it. 
I suppose that's what we want to do, but that's probably not the best place to do it. The bad things : - If MULTIBYTE is defined, the strings must be in UTF-8, it doesn't check any encoding. - it is not integrated at all with the rest of the MB code. - it doesn't respect the indentation policy ;) To do : - integrate better with the rest of the MB (client-side encoding), and with the rest of the code of print.c . - verify utf8-to-ucs robustness seriously. - make the code visually nicer :) - find better function names. And possibly : - consolidate the code, in order to remove the need for the #ifdef's in many places. - make it work with some other multi-width encodings (but then, I don't know anything about these encodings myself !). - also check the utf-8 stream at input time, so that no invalid utf-8 is sent to the backend (at least from psql - the backend will also need strict checking for UTF-8). - add nice UTF-8 borders as an option :) - add a command-line parameter to treat Unicode Ambiguous characters (characters which can be narrow or wide, depending on the terminal) as wide characters, as seems to be the case for CJK terminals (as per TR#11). - What else ? BTW, here is the table I had in the first mail. I would have shown the one with all the weird Unicode characters, but my mutt is configured with iso-8859-15, and I doubt many of you have utf-8 as a default yet :) +------+-------+--------+ | lang | text | text | +------+-------+--------+ | isl | álíta | áleit | | isl | álíta | álitum | | isl | álíta | álitið | | isl | maður | mann | | isl | maður | mönnum | | isl | maður | manna | | isl | óska | -aði | +------+-------+--------+ The attached files : - a diff for pgsql/src/bin/psql/print.c - a diff for pgsql/src/bin/psql/Makefile - two new files : pgsql/src/bin/psql/pg_mb_utf8.c pgsql/src/bin/psql/pg_mb_utf8.h Have fun ! 
Patrice -- Patrice HÉDÉ ------------------------------- patrice à islande org ----- -- Isn't it weird how scientists can imagine all the matter of the universe exploding out of a dot smaller than the head of a pin, but they can't come up with a more evocative name for it than "The Big Bang" ? -- What would _you_ call the creation of the universe ? -- "The HORRENDOUS SPACE KABLOOIE !" - Calvin and Hobbes ------------------------------------------ http://www.islande.org/ -----
*** pgsql/src/bin/psql/print.c~ Wed Aug 1 20:44:54 2001 --- pgsql/src/bin/psql/print.c Wed Sep 26 19:30:42 2001 *************** *** 33,38 **** --- 33,41 ---- #include <termios.h> #endif + #ifdef MULTIBYTE + #include "pg_mb_utf8.h" + #endif /*************************/ /* Unaligned text */ *************** *** 213,218 **** --- 216,227 ---- FILE *fout) { unsigned int col_count = 0; + + #ifdef MULTIBYTE + unsigned int cell_count = 0; + unsigned int *head_w, *cell_w; + #endif + unsigned int i, tmp; unsigned int *widths, *************** *** 230,244 **** exit(EXIT_FAILURE); } /* calc column widths */ ! for (i = 0; i < col_count; i++) ! if ((tmp = strlen(headers[i])) > widths[i]) widths[i] = tmp; /* don't wanna call strlen twice */ ! ! for (i = 0, ptr = cells; *ptr; ptr++, i++) ! if ((tmp = strlen(*ptr)) > widths[i % col_count]) widths[i % col_count] = tmp; ! if (opt_border == 0) total_w = col_count - 1; else if (opt_border == 1) --- 239,289 ---- exit(EXIT_FAILURE); } + #ifdef MULTIBYTE + head_w = calloc(col_count, sizeof(*head_w)); + if (!head_w) { + perror("calloc"); + exit(EXIT_FAILURE); + } + + /* count rows */ + for (ptr = cells; *ptr; ptr++) { + cell_count++; + } + + cell_w = calloc(cell_count, sizeof(*cell_w)); + if (!cell_w) { + perror("calloc"); + exit(EXIT_FAILURE); + } + #endif + + /* calc column widths */ ! for (i = 0; i < col_count; i++) { ! #ifdef MULTIBYTE ! if ((tmp = pg_mb_utfs_width(headers[i], PG_MB_UTF_TRUNC)) > widths[i]) { widths[i] = tmp; /* don't wanna call strlen twice */ ! } ! head_w[i] = tmp; ! #else ! if ((tmp = strlen(headers[i])) > widths[i]) { ! widths[i] = tmp; /* don't wanna call strlen twice */ ! } ! #endif ! } ! for (i = 0, ptr = cells; *ptr; ptr++, i++) { ! #ifdef MULTIBYTE ! if ((tmp = pg_mb_utfs_width(*ptr, PG_MB_UTF_TRUNC)) > widths[i % col_count]) { widths[i % col_count] = tmp; ! } ! cell_w[i] = tmp; ! #else ! if ((tmp = strlen(*ptr)) > widths[i % col_count]) { ! widths[i % col_count] = tmp; ! } ! #endif ! 
} if (opt_border == 0) total_w = col_count - 1; else if (opt_border == 1) *************** *** 252,261 **** /* print title */ if (title && !opt_barebones) { if (strlen(title) >= total_w) fprintf(fout, "%s\n", title); else ! fprintf(fout, "%-*s%s\n", (int) (total_w - strlen(title)) / 2, "", title); } /* print headers */ --- 297,315 ---- /* print title */ if (title && !opt_barebones) { + #ifdef MULTIBYTE + int tlen; + if ((tlen = pg_mb_utfs_width(title, PG_MB_UTF_TRUNC)) >= total_w) + fprintf(fout, "%s\n", title); + else + fprintf(fout, "%-*s%s\n", (int) (total_w - tlen) / 2, "", +title); + #else if (strlen(title) >= total_w) fprintf(fout, "%s\n", title); else ! fprintf(fout, "%-*s%s\n", (int) (total_w - strlen(title)) / 2, "", ! title); ! #endif } /* print headers */ *************** *** 271,280 **** for (i = 0; i < col_count; i++) { /* centered */ ! fprintf(fout, "%-*s%s%-*s", ! (int) floor((widths[i] - strlen(headers[i])) / 2.0), "", ! headers[i], (int) ceil((widths[i] - strlen(headers[i])) / 2.0), ""); if (i < col_count - 1) { --- 325,340 ---- for (i = 0; i < col_count; i++) { + int nbspace; + #ifdef MULTIBYTE + nbspace = widths[i] - head_w[i]; + #else + nbspace = widths[i] - strlen(headers[i]); + #endif + /* centered */ ! fprintf(fout, "%-*s%s%-*s", ! nbspace / 2, "", headers[i], (nbspace+1) / 2, ""); if (i < col_count - 1) { *************** *** 307,320 **** } /* content */ ! if (opt_align[(i) % col_count] == 'r') fprintf(fout, "%*s", widths[i % col_count], cells[i]); else { if ((i + 1) % col_count == 0 && opt_border != 2) fputs(cells[i], fout); ! else fprintf(fout, "%-*s", widths[i % col_count], cells[i]); } /* divider */ --- 367,392 ---- } /* content */ ! if (opt_align[(i) % col_count] == 'r') { ! #ifdef MULTIBYTE ! fprintf(fout, "%*s%s", ! widths[i % col_count] - cell_w[i], "", cells[i] ); ! #else fprintf(fout, "%*s", widths[i % col_count], cells[i]); + #endif + } else { if ((i + 1) % col_count == 0 && opt_border != 2) fputs(cells[i], fout); ! else { ! 
#ifdef MULTIBYTE ! fprintf(fout, "%-s%*s", cells[i], ! widths[i % col_count] - cell_w[i], "" ); ! #else fprintf(fout, "%-*s", widths[i % col_count], cells[i]); + #endif + } } /* divider */ *************** *** 345,350 **** --- 417,426 ---- fputc('\n', fout); /* clean up */ + #ifdef MULTIBYTE + free(cell_w); + free(head_w); + #endif free(widths); } *************** *** 364,369 **** --- 440,449 ---- hwidth = 0, dwidth = 0; char *divider; + #ifdef MULTIBYTE + unsigned int cell_count = 0; + unsigned int *cell_w,*head_w; + #endif if (cells[0] == NULL) { *************** *** 371,376 **** --- 451,490 ---- return; } + #ifdef MULTIBYTE + /* pre-count headers */ + for (ptr = headers; *ptr; ptr++) { + col_count++; + } + head_w = calloc(col_count, sizeof(*head_w)); + if (!head_w) { + perror("calloc"); + exit(EXIT_FAILURE); + } + for (i = 0; i < col_count; i++) + { + if ((tmp = pg_mb_utfs_width(headers[i], PG_MB_UTF_TRUNC)) > hwidth) + hwidth = tmp; /* don't wanna call strlen twice */ + head_w[i] = tmp; + } + for (ptr = cells; *ptr; ptr++) { + cell_count++; + } + + cell_w = calloc(cell_count, sizeof(*cell_w)); + if (!cell_w) { + perror("calloc"); + exit(EXIT_FAILURE); + } + + /* find longest data cell */ + for (i = 0, ptr = cells; *ptr; ptr++, i++) { + if ((tmp = pg_mb_utfs_width(*ptr, PG_MB_UTF_TRUNC)) > dwidth) { + dwidth = tmp; + } + cell_w[i] = tmp; + } + #else /* count columns and find longest header */ for (ptr = headers; *ptr; ptr++) { *************** *** 380,388 **** } /* find longest data cell */ ! for (ptr = cells; *ptr; ptr++) if ((tmp = strlen(*ptr)) > dwidth) dwidth = tmp; /* print title */ if (!opt_barebones && title) --- 494,504 ---- } /* find longest data cell */ ! 
for (ptr = cells; *ptr; ptr++) { if ((tmp = strlen(*ptr)) > dwidth) dwidth = tmp; + } + #endif /* print title */ if (!opt_barebones && title) *************** *** 456,462 **** --- 572,583 ---- if (opt_border == 2) fputs("| ", fout); + #if MULTIBYTE + fprintf(fout, "%-s%*s", headers[i % col_count], + hwidth - head_w[i % col_count], ""); + #else fprintf(fout, "%-*s", hwidth, headers[i % col_count]); + #endif if (opt_border > 0) fputs(" | ", fout); *************** *** 465,472 **** if (opt_border < 2) fprintf(fout, "%s\n", *ptr); ! else fprintf(fout, "%-*s |\n", dwidth, *ptr); } if (opt_border == 2) --- 586,598 ---- if (opt_border < 2) fprintf(fout, "%s\n", *ptr); ! else { ! #ifdef MULTIBYTE ! fprintf(fout, "%-s%*s |\n", *ptr, dwidth - cell_w[i], ""); ! #else fprintf(fout, "%-*s |\n", dwidth, *ptr); + #endif + } } if (opt_border == 2) *************** *** 485,490 **** --- 611,620 ---- fputc('\n', fout); free(divider); + + #ifdef MULTIBYTE + free(cell_w); + #endif }
*** pgsql/src/bin/psql/Makefile~ Tue Feb 27 09:13:27 2001 --- pgsql/src/bin/psql/Makefile Wed Sep 26 01:14:40 2001 *************** *** 19,25 **** OBJS=command.o common.o help.o input.o stringutils.o mainloop.o \ copy.o startup.o prompt.o variables.o large_obj.o print.o describe.o \ ! tab-complete.o all: submake psql --- 19,25 ---- OBJS=command.o common.o help.o input.o stringutils.o mainloop.o \ copy.o startup.o prompt.o variables.o large_obj.o print.o describe.o \ ! tab-complete.o pg_mb_utf8.o all: submake psql
/* $Id: $ */

#include "pg_mb_utf8.h"

/*
 * This is an implementation of wcwidth() and wcswidth() as defined in
 * "The Single UNIX Specification, Version 2, The Open Group, 1997"
 * <http://www.UNIX-systems.org/online.html>
 *
 * Markus Kuhn -- 2001-09-08 -- public domain
 *
 * customised to PostgreSQL by Patrice Hede
 *  - removed dependency to wchar,
 *  - modified and renamed wcswidth to take utf8 strings.
 *
 * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
 */

/* One closed interval [first, last] of code points (BMP only). */
struct pg_mbinterval
{
    unsigned short first;
    unsigned short last;
};

/*
 * Auxiliary function for binary search in an interval table.
 *
 * table must be sorted and non-overlapping; max is the index of its LAST
 * entry (not the entry count).  Returns 1 if ucs lies inside one of the
 * intervals, 0 otherwise.
 */
static int
pg_mbbisearch(pg_wchar ucs, const struct pg_mbinterval *table, int max)
{
    int         min = 0;
    int         mid;

    /* quick rejection of anything outside the table's overall range */
    if (ucs < table[0].first || ucs > table[max].last)
        return 0;

    while (max >= min)
    {
        mid = min + (max - min) / 2;
        if (ucs > table[mid].last)
            min = mid + 1;
        else if (ucs < table[mid].first)
            max = mid - 1;
        else
            return 1;
    }

    return 0;
}

/*
 * The following functions define the column width of an ISO 10646
 * character as follows:
 *
 *    - The null character (U+0000) has a column width of 0.
 *
 *    - Other C0/C1 control characters and DEL will lead to a return
 *      value of -1.
 *
 *    - Non-spacing and enclosing combining characters (general
 *      category code Mn or Me in the Unicode database) have a
 *      column width of 0.
 *
 *    - Other format characters (general category code Cf in the Unicode
 *      database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
 *
 *    - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
 *      have a column width of 0.
 *
 *    - Spacing characters in the East Asian Wide (W) or East Asian
 *      FullWidth (F) category as defined in Unicode Technical
 *      Report #11 have a column width of 2.
 *
 *    - All remaining characters (including all printable
 *      ISO 8859-1 and WGL4 characters, Unicode control characters,
 *      etc.) have a column width of 1.
 *
 * This implementation assumes that pg_wchar values are ISO 10646
 * code points.  Values above U+10FFFF also yield -1.
 */
int
pg_mb_wcwidth(pg_wchar ucs)
{
    /* sorted list of non-overlapping intervals of non-spacing characters */
    static const struct pg_mbinterval combining[] = {
        { 0x0300, 0x034E }, { 0x0360, 0x0362 }, { 0x0483, 0x0486 }, { 0x0488, 0x0489 },
        { 0x0591, 0x05A1 }, { 0x05A3, 0x05B9 }, { 0x05BB, 0x05BD }, { 0x05BF, 0x05BF },
        { 0x05C1, 0x05C2 }, { 0x05C4, 0x05C4 }, { 0x064B, 0x0655 }, { 0x0670, 0x0670 },
        { 0x06D6, 0x06E4 }, { 0x06E7, 0x06E8 }, { 0x06EA, 0x06ED }, { 0x070F, 0x070F },
        { 0x0711, 0x0711 }, { 0x0730, 0x074A }, { 0x07A6, 0x07B0 }, { 0x0901, 0x0902 },
        { 0x093C, 0x093C }, { 0x0941, 0x0948 }, { 0x094D, 0x094D }, { 0x0951, 0x0954 },
        { 0x0962, 0x0963 }, { 0x0981, 0x0981 }, { 0x09BC, 0x09BC }, { 0x09C1, 0x09C4 },
        { 0x09CD, 0x09CD }, { 0x09E2, 0x09E3 }, { 0x0A02, 0x0A02 }, { 0x0A3C, 0x0A3C },
        { 0x0A41, 0x0A42 }, { 0x0A47, 0x0A48 }, { 0x0A4B, 0x0A4D }, { 0x0A70, 0x0A71 },
        { 0x0A81, 0x0A82 }, { 0x0ABC, 0x0ABC }, { 0x0AC1, 0x0AC5 }, { 0x0AC7, 0x0AC8 },
        { 0x0ACD, 0x0ACD }, { 0x0B01, 0x0B01 }, { 0x0B3C, 0x0B3C }, { 0x0B3F, 0x0B3F },
        { 0x0B41, 0x0B43 }, { 0x0B4D, 0x0B4D }, { 0x0B56, 0x0B56 }, { 0x0B82, 0x0B82 },
        { 0x0BC0, 0x0BC0 }, { 0x0BCD, 0x0BCD }, { 0x0C3E, 0x0C40 }, { 0x0C46, 0x0C48 },
        { 0x0C4A, 0x0C4D }, { 0x0C55, 0x0C56 }, { 0x0CBF, 0x0CBF }, { 0x0CC6, 0x0CC6 },
        { 0x0CCC, 0x0CCD }, { 0x0D41, 0x0D43 }, { 0x0D4D, 0x0D4D }, { 0x0DCA, 0x0DCA },
        { 0x0DD2, 0x0DD4 }, { 0x0DD6, 0x0DD6 }, { 0x0E31, 0x0E31 }, { 0x0E34, 0x0E3A },
        { 0x0E47, 0x0E4E }, { 0x0EB1, 0x0EB1 }, { 0x0EB4, 0x0EB9 }, { 0x0EBB, 0x0EBC },
        { 0x0EC8, 0x0ECD }, { 0x0F18, 0x0F19 }, { 0x0F35, 0x0F35 }, { 0x0F37, 0x0F37 },
        { 0x0F39, 0x0F39 }, { 0x0F71, 0x0F7E }, { 0x0F80, 0x0F84 }, { 0x0F86, 0x0F87 },
        { 0x0F90, 0x0F97 }, { 0x0F99, 0x0FBC }, { 0x0FC6, 0x0FC6 }, { 0x102D, 0x1030 },
        { 0x1032, 0x1032 }, { 0x1036, 0x1037 }, { 0x1039, 0x1039 }, { 0x1058, 0x1059 },
        { 0x1160, 0x11FF }, { 0x17B7, 0x17BD }, { 0x17C6, 0x17C6 }, { 0x17C9, 0x17D3 },
        { 0x180B, 0x180E }, { 0x18A9, 0x18A9 }, { 0x200B, 0x200F }, { 0x202A, 0x202E },
        { 0x206A, 0x206F }, { 0x20D0, 0x20E3 }, { 0x302A, 0x302F }, { 0x3099, 0x309A },
        { 0xFB1E, 0xFB1E }, { 0xFE20, 0xFE23 }, { 0xFEFF, 0xFEFF }, { 0xFFF9, 0xFFFB }
    };

    /* test for 8-bit control characters */
    if (ucs == 0)
        return 0;

    if (ucs < 32 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
        return -1;

    /* binary search in table of non-spacing characters */
    if (pg_mbbisearch(ucs, combining,
                      sizeof(combining) / sizeof(combining[0]) - 1))
        return 0;

    /* if we arrive here, ucs is not a combining or C0/C1 control character */
    return 1 +
        (ucs >= 0x1100 &&
         (ucs <= 0x115f ||                      /* Hangul Jamo init. consonants */
          (ucs >= 0x2e80 && ucs <= 0xa4cf && (ucs & ~0x0011) != 0x300a &&
           ucs != 0x303f) ||                    /* CJK ... Yi */
          (ucs >= 0xac00 && ucs <= 0xd7a3) ||   /* Hangul Syllables */
          (ucs >= 0xf900 && ucs <= 0xfaff) ||   /* CJK Compatibility Ideographs */
          (ucs >= 0xfe30 && ucs <= 0xfe6f) ||   /* CJK Compatibility Forms */
          (ucs >= 0xff00 && ucs <= 0xff5f) ||   /* Fullwidth Forms */
          (ucs >= 0xffe0 && ucs <= 0xffe6) ||
          (ucs >= 0x20000 && ucs <= 0x2ffff)));
}

/*
 * Decode the single UTF-8 encoded character starting at c.
 *
 * c : NUL-terminated UTF-8 byte string; only the bytes of the first
 *     character are examined, and the terminator is never read past.
 * l : out parameter.  On success it receives the number of bytes the
 *     character occupies (1 to 4), i.e. the size to skip.  On failure it
 *     receives the number of bytes examined, including the offending
 *     one; that byte may be the string terminator, so callers must not
 *     blindly advance by *l after a failure.
 *
 * Returns the decoded code point, or PG_MB_UTF_INVALID when the sequence
 * is not well-formed: stray continuation byte, lead byte announcing more
 * than 4 bytes, missing continuation byte, non-shortest ("overlong")
 * encoding, UTF-16 surrogate (U+D800-U+DFFF), or a value above U+10FFFF.
 */
pg_wchar
pg_mb_utf_ucs(const unsigned char *c, int *l)
{
    pg_wchar    u;
    pg_wchar    lowest;     /* smallest code point legal for this length */
    int         extra;      /* continuation bytes announced by the lead byte */
    int         i;

    *l = 1;

    if (!(*c & 0x80))
        return *c;          /* plain ASCII */

    if ((*c & 0xe0) == 0xc0)
    {
        u = *c & 0x1f;
        extra = 1;
        lowest = 0x80;
    }
    else if ((*c & 0xf0) == 0xe0)
    {
        u = *c & 0x0f;
        extra = 2;
        lowest = 0x800;
    }
    else if ((*c & 0xf8) == 0xf0)
    {
        u = *c & 0x07;
        extra = 3;
        lowest = 0x10000;
    }
    else
    {
        /* continuation byte in lead position, or a 5+ byte lead */
        return PG_MB_UTF_INVALID;
    }

    for (i = 0; i < extra; i++)
    {
        c++;
        (*l)++;
        /* also catches a string that ends in mid-character */
        if ((*c & 0xc0) != 0x80)
            return PG_MB_UTF_INVALID;
        u = (u << 6) | (*c & 0x3f);
    }

    /* reject overlong forms, surrogates and out-of-range values */
    if (u < lowest || (u >= 0xd800 && u <= 0xdfff) || u > 0x0010ffff)
        return PG_MB_UTF_INVALID;

    return u;
}

/*
 * Compute the display width, in terminal columns, of a UTF-8 string.
 *
 * pwcs     : NUL-terminated UTF-8 string to measure
 * truncate : what to do when a character has no valid width, i.e. when
 *            pg_mb_wcwidth() returns -1 for it.  That covers C0/C1
 *            control characters and DEL as well as, with the default
 *            PG_MB_UTF_INVALID value, undecodable byte sequences:
 *              0 : return -1 (invalid),
 *              non-zero : truncate the string IN PLACE at the offending
 *                  character and return the width of what precedes it.
 *
 * Because of the in-place truncation the argument cannot be const.
 */
int
pg_mb_utfs_width(unsigned char *pwcs, int truncate)
{
    int         width = 0;
    int         len = 0;

    while (*pwcs)
    {
        int         w = pg_mb_wcwidth(pg_mb_utf_ucs(pwcs, &len));

        if (w < 0)
        {
            if (!truncate)
                return -1;
            *pwcs = '\0';
            return width;
        }

        width += w;
        pwcs += len;
    }

    return width;
}
/* $Id: $ */

#ifndef PG_MB_UTF8_H
#define PG_MB_UTF8_H

/*
 * The pg_wchar: holds one ISO 10646 code point.
 *
 * It is defined unconditionally here; an earlier draft made it depend on
 * MULTIBYTE (unsigned int when defined, plain char otherwise).
 */
typedef unsigned int pg_wchar;

/*
 * Column width of one code point: 0, 1 or 2, or -1 for control
 * characters and values above U+10FFFF.
 */
int pg_mb_wcwidth(pg_wchar ucs);

/*
 * Decode the UTF-8 character starting at c and store its byte count
 * in *l.  Returns PG_MB_UTF_INVALID for a malformed sequence.
 */
pg_wchar pg_mb_utf_ucs(const unsigned char *c, int *l);

/*
 * Display width of a UTF-8 string.  truncate is PG_MB_UTF_TRUNC or
 * PG_MB_UTF_RETINV.  The string is not const because PG_MB_UTF_TRUNC
 * overwrites the first invalid character with a terminator.
 */
int pg_mb_utfs_width(unsigned char *pwcs, int truncate);

/*
 * Value returned for an undecodable sequence:
 * choose 0xffffffff to enforce invalid values,
 * or 0x00000020 to count as a space.
 */
#define PG_MB_UTF_INVALID 0xffffffff

/* Values for the truncate argument of pg_mb_utfs_width(). */
#define PG_MB_UTF_TRUNC 1       /* cut the string at the invalid character */
#define PG_MB_UTF_RETINV 0      /* leave the string alone and return -1 */

#endif /* PG_MB_UTF8_H */
---------------------------(end of broadcast)--------------------------- TIP 4: Don't 'kill -9' the postmaster