On Mon, Sep 25, 2023 at 03:13:03PM +0200, Walter Alejandro Iglesias wrote: > This new version, when it detects invalid utf-8 in the body saves a > dead.letter, prints the following message and exits. > > $ mail -s hello user < invalid_utf8.txt > Invalid or incomplete multibyte or wide character > . . . message not sent. > > > > Index: send.c > =================================================================== > RCS file: /cvs/src/usr.bin/mail/send.c,v > retrieving revision 1.26 > diff -u -p -r1.26 send.c > --- send.c 8 Mar 2023 04:43:11 -0000 1.26 > +++ send.c 25 Sep 2023 13:07:17 -0000 > @@ -32,6 +32,11 @@ > > #include "rcv.h" > #include "extern.h" > +#include "locale.h" > + > +/* To check charset of the message and add the appropiate MIME headers */ > +static char nutf8; > +static int not_utf8(FILE *s, int len); > > static volatile sig_atomic_t sendsignal; /* Interrupted by a signal? */ > > @@ -341,6 +346,17 @@ mail1(struct header *hp, int printheader > else > puts("Null message body; hope that's ok"); > } > + > + /* Check non valid UTF-8 characters in the message */ > + nutf8 = not_utf8(mtf, fsize(mtf)); > + rewind(mtf); > + if (nutf8 > 1) { > + savedeadletter(mtf); > + puts("Invalid or incomplete multibyte or wide character"); > + fputs(". . . message not sent.\n", stderr); > + exit(1); > + } > + > /* > * Now, take the user names from the combined > * to and cc lists and do all the alias > @@ -520,15 +536,30 @@ puthead(struct header *hp, FILE *fo, int > gotcha = 0; > from = hp->h_from ? 
hp->h_from : value("from"); > if (from != NULL) > - fprintf(fo, "From: %s\n", from), gotcha++; > + fprintf(fo, "From: %s\n", from), > + gotcha++; > if (hp->h_to != NULL && w & GTO) > - fmt("To:", hp->h_to, fo, w&GCOMMA), gotcha++; > + fmt("To:", hp->h_to, fo, w&GCOMMA), > + gotcha++; > if (hp->h_subject != NULL && w & GSUBJECT) > - fprintf(fo, "Subject: %s\n", hp->h_subject), gotcha++; > + fprintf(fo, "Subject: %s\n", hp->h_subject), > + gotcha++; > + if (nutf8 == 0) > + fprintf(fo, "MIME-Version: 1.0\n" > + "Content-Type: text/plain; charset=us-ascii\n" > + "Content-Transfer-Encoding: 7bit\n"), > + gotcha++; > + else if (nutf8 == 1) > + fprintf(fo, "MIME-Version: 1.0\n" > + "Content-Type: text/plain; charset=utf-8\n" > + "Content-Transfer-Encoding: 8bit\n"), > + gotcha++; > if (hp->h_cc != NULL && w & GCC) > - fmt("Cc:", hp->h_cc, fo, w&GCOMMA), gotcha++; > + fmt("Cc:", hp->h_cc, fo, w&GCOMMA), > + gotcha++; > if (hp->h_bcc != NULL && w & GBCC) > - fmt("Bcc:", hp->h_bcc, fo, w&GCOMMA), gotcha++; > + fmt("Bcc:", hp->h_bcc, fo, w&GCOMMA), > + gotcha++; > if (gotcha && w & GNL) > (void)putc('\n', fo); > return(0); > @@ -609,4 +640,44 @@ sendint(int s) > { > > sendsignal = s; > +} > + > +/* Search non valid UTF-8 characters in the message */ > +static int > +not_utf8(FILE *message, int len) > +{
Nitpick: I would call `message` maybe `fp` or something here. > + int c, count, n, ulen; > + size_t i, resize; > + size_t jump = 100; > + unsigned char *s = NULL; > + > + setlocale(LC_CTYPE, "en_US.UTF-8"); > + Should the locale changed by setlocale() be restored later on? > + if (s == NULL && (s = malloc(jump)) == NULL) > + err(1, NULL); The check if `s` is NULL seems unnecessary here. > + > + i = count = 0; > + while ((c = getc(message)) != EOF) { > + if (s == NULL || count == jump) { The check if `s` is NULL seems unnecessary here. > + if ((s = realloc(s, i + jump + 1)) == NULL) > + err(1, NULL); > + count = 0; > + } > + s[i++] = c; > + count++; > + } > + > + s[i] = '\0'; > + > + ulen = mbstowcs(NULL, s, 0); > + > + if (ulen == len) > + n = 0; > + else if (ulen < 0) > + n = 2; > + else if (ulen < len) > + n = 1; > + > + free(s); > + return n; > } > > > -- > Walter > Since it assumes UTF-8, maybe mbstowcs() is not needed and it can be done in one pass while reading the stream (no need to allocate, slurp the whole file and decode). Just: read the stream byte by byte and return on the first invalid sequence. -- Kind regards, Hiltjo