On Fri, Oct 06, 2023 at 06:50:38PM -0400, Wietse Venema via Postfix-users wrote:
> + } else { > + server->username = mystrdup(serverout); > + printable(server->username, '?'); I might note that when UTF8 is enabled, this does correctly leaves valid UTF8 characters undisturbed. However, I also took a close look at printable(), and noticed that it admits UTF-8 code points with outside the the Unicode range (leader byte up to 254), while the maximum valid UTF-8 lead byte tops out at 244 (0b11110100), for characters in the range U+100000–U+10FFFF. And also, the number of non-leader bytes is not validated, allowing for abitrarily long runs of 0b10xxxxxx octets after the leader byte. I think we can do better: - Limit the leader byte to 244 (which still admits some high code points up to U+13FFFF). - When the leader byte is followed by too few non-leader bytes, resynchronise by replacing the leader byte with '?' and starting at the next byte. This produces more sensible results. - When too many non-leader bytes follow, they're also replaced with '?'. I put together a patch, and small number of test code points, which include some malformed UTF-8, so not easy to post. In git, I tagged the input file as "binary" for "git commit" purposes. I doubt "patch" supports the binary file, so I also attached "printable.in" separately. -- Viktor.
>From 32135d6f447b18c5474730846c69de25ed77adcc Mon Sep 17 00:00:00 2001 From: Viktor Dukhovni <ietf-d...@dukhovni.org> Date: Fri, 6 Oct 2023 21:38:10 -0400 Subject: [PATCH 1/1] Better UTF8 printable --- src/util/.gitattributes | 1 + src/util/Makefile.in | 12 ++++++++++- src/util/printable.c | 46 +++++++++++++++++++++++++++++++++++----- src/util/printable.in | Bin 0 -> 175 bytes src/util/printable.ref | 10 +++++++++ 5 files changed, 63 insertions(+), 6 deletions(-) create mode 100644 postfix/src/util/.gitattributes create mode 100644 postfix/src/util/printable.in create mode 100644 postfix/src/util/printable.ref diff --git a/src/util/.gitattributes b/src/util/.gitattributes new file mode 100644 index 000000000..8d63819a9 --- /dev/null +++ b/src/util/.gitattributes @@ -0,0 +1 @@ +printable.in binary diff --git a/src/util/Makefile.in b/src/util/Makefile.in index d795f69d6..e9982485d 100644 --- a/src/util/Makefile.in +++ b/src/util/Makefile.in @@ -365,6 +365,11 @@ unescape: $(LIB) $(CC) $(CFLAGS) -DTEST -o $@ $@.c $(LIB) $(SYSLIBS) mv junk $@.o +printable: $(LIB) + mv $@.o junk + $(CC) $(CFLAGS) -DTEST -o $@ $@.c $(LIB) $(SYSLIBS) + mv junk $@.o + hex_quote: $(LIB) mv $@.o junk $(CC) $(CFLAGS) -DTEST -o $@ $@.c $(LIB) $(SYSLIBS) @@ -618,7 +623,7 @@ tests: all valid_hostname_test mac_expand_test dict_test unescape_test \ strcasecmp_utf8_test vbuf_print_test miss_endif_cidr_test \ miss_endif_regexp_test split_qnameval_test vstring_test \ vstream_test byte_mask_tests mystrtok_test known_tcp_ports_test \ - binhash_test argv_test inet_prefix_top_test + binhash_test argv_test inet_prefix_top_test printable_test dict_tests: all dict_test \ dict_pcre_tests dict_cidr_test dict_thash_test dict_static_test \ @@ -650,6 +655,11 @@ unescape_test: unescape unescape.in unescape.ref # diff unescape.in unescape.tmp rm -f unescape.tmp +printable_test: printable printable.in + $(SHLIB_ENV) ${VALGRIND} ./printable <printable.in > printable.tmp + diff -b printable.ref printable.tmp + rm -f printable.tmp + hex_quote_test: hex_quote $(SHLIB_ENV) ${VALGRIND} ./hex_quote <hex_quote.c | od -cb >hex_quote.tmp od -cb <hex_quote.c >hex_quote.ref diff --git a/src/util/printable.c b/src/util/printable.c index 6c148fd00..6d44be578 100644 --- a/src/util/printable.c +++ b/src/util/printable.c @@ -76,6 +76,7 @@ char *printable_except(char *string, int replacement, const char *except) { unsigned char *cp; int ch; + int nb; /* * XXX Replace invalid UTF8 sequences (too short, over-long encodings, @@ -85,11 +86,21 @@ char *printable_except(char *string, int replacement, const char *except) while ((ch = *cp) != 0) { if (ISASCII(ch) && (ISPRINT(ch) || (except && strchr(except, ch)))) { /* ok */ - } else if (util_utf8_enable && ch >= 194 && ch <= 254 - && cp[1] >= 128 && cp[1] < 192) { - /* UTF8; skip the rest of the bytes in the character. */ - while (cp[1] >= 128 && cp[1] < 192) - cp++; + } else if (util_utf8_enable && ch >= 194 && ch <= 244) { + /* + * UTF8 leader byte; skip the rest of the bytes in the character. + * If not followed by the expected number of non-leader bytes, + * replace the leader byte and restart at the next, resynchronising + * at the next valid sequence. + */ + for (nb = 1, ch &= 0xf0; ch & 0x40; ch <<= 1, ++nb) { + if (cp[nb] < 128 || cp[nb] >= 192) { + *cp = replacement; + nb = 1; + break; + } + } + cp += nb - 1; } else { /* Not ASCII and not UTF8. */ *cp = replacement; @@ -98,3 +109,28 @@ char *printable_except(char *string, int replacement, const char *except) } return (string); } + + +#ifdef TEST + +#include <stdlib.h> +#include <string.h> +#include <msg.h> +#include <vstring_vstream.h> + +int main(int argc, char **argv) +{ + VSTRING *in = vstring_alloc(10); + + util_utf8_enable = 1; + + while (vstring_fgets_nonl(in, VSTREAM_IN)) { + printable(vstring_str(in), '?'); + vstream_fwrite(VSTREAM_OUT, vstring_str(in), VSTRING_LEN(in)); + VSTREAM_PUTC('\n', VSTREAM_OUT); + } + vstream_fflush(VSTREAM_OUT); + exit(0); +} + +#endif diff --git a/src/util/printable.in b/src/util/printable.in new file mode 100644 index 0000000000000000000000000000000000000000..21e0acdd13fddb4d278550a30583822941562f3c GIT binary patch literal 175 zcmXRY%FHWCOv*{+%FEB=$kQ!A@DmTOF9QlE9tM&ZHeJ|pVb{f`3;QlMfH)U+!8q5a zUY~h=`Slssr(WB0eFfL`smQ!1JJvp(z4ytkNiQ0wJl{L<`R+MS`zJo#IQ4ngwx?}d apYNXkV%@@LT}z)%?BW6`1u4X<jtc;co^AmE literal 0 HcmV?d00001 diff --git a/src/util/printable.ref b/src/util/printable.ref new file mode 100644 index 000000000..a85f3405a --- /dev/null +++ b/src/util/printable.ref @@ -0,0 +1,10 @@ +printable +non?n-printable +naïve +na?ve +виктор +в?к?тор +ויקטוּר +ו?קטוּר +中国互联网络发展状况统计报告 +中?互??网络发展状况统计报? -- 2.41.0
printable.in
Description: Binary data
_______________________________________________ Postfix-users mailing list -- postfix-users@postfix.org To unsubscribe send an email to postfix-users-le...@postfix.org