On Mon, Sep 25, 2023 at 03:13:03PM +0200, Walter Alejandro Iglesias wrote:
> This new version, when it detects invalid utf-8 in the body saves a
> dead.letter, prints the following message and exits.
> 
>   $ mail -s hello user < invalid_utf8.txt
>   Invalid or incomplete multibyte or wide character
>   . . . message not sent.
> 
> 
> 
> Index: send.c
> ===================================================================
> RCS file: /cvs/src/usr.bin/mail/send.c,v
> retrieving revision 1.26
> diff -u -p -r1.26 send.c
> --- send.c    8 Mar 2023 04:43:11 -0000       1.26
> +++ send.c    25 Sep 2023 13:07:17 -0000
> @@ -32,6 +32,11 @@
>  
>  #include "rcv.h"
>  #include "extern.h"
> +#include "locale.h"
> +
> +/* To check charset of the message and add the appropiate MIME headers  */
> +static char nutf8;
> +static int not_utf8(FILE *s, int len);
>  
>  static volatile sig_atomic_t sendsignal;     /* Interrupted by a signal? */
>  
> @@ -341,6 +346,17 @@ mail1(struct header *hp, int printheader
>               else
>                       puts("Null message body; hope that's ok");
>       }
> +
> +     /* Check non valid UTF-8 characters in the message */
> +     nutf8 = not_utf8(mtf, fsize(mtf));
> +     rewind(mtf);
> +     if (nutf8 > 1) {
> +             savedeadletter(mtf);
> +             puts("Invalid or incomplete multibyte or wide character");
> +             fputs(". . . message not sent.\n", stderr);
> +             exit(1);
> +     }
> +
>       /*
>        * Now, take the user names from the combined
>        * to and cc lists and do all the alias
> @@ -520,15 +536,30 @@ puthead(struct header *hp, FILE *fo, int
>       gotcha = 0;
>       from = hp->h_from ? hp->h_from : value("from");
>       if (from != NULL)
> -             fprintf(fo, "From: %s\n", from), gotcha++;
> +             fprintf(fo, "From: %s\n", from),
> +                 gotcha++;
>       if (hp->h_to != NULL && w & GTO)
> -             fmt("To:", hp->h_to, fo, w&GCOMMA), gotcha++;
> +             fmt("To:", hp->h_to, fo, w&GCOMMA),
> +                 gotcha++;
>       if (hp->h_subject != NULL && w & GSUBJECT)
> -             fprintf(fo, "Subject: %s\n", hp->h_subject), gotcha++;
> +             fprintf(fo, "Subject: %s\n", hp->h_subject),
> +                 gotcha++;
> +     if (nutf8 == 0)
> +             fprintf(fo, "MIME-Version: 1.0\n"
> +                 "Content-Type: text/plain; charset=us-ascii\n"
> +                 "Content-Transfer-Encoding: 7bit\n"),
> +                 gotcha++;
> +     else if (nutf8 == 1)
> +             fprintf(fo, "MIME-Version: 1.0\n"
> +                 "Content-Type: text/plain; charset=utf-8\n"
> +                 "Content-Transfer-Encoding: 8bit\n"),
> +                 gotcha++;
>       if (hp->h_cc != NULL && w & GCC)
> -             fmt("Cc:", hp->h_cc, fo, w&GCOMMA), gotcha++;
> +             fmt("Cc:", hp->h_cc, fo, w&GCOMMA),
> +                 gotcha++;
>       if (hp->h_bcc != NULL && w & GBCC)
> -             fmt("Bcc:", hp->h_bcc, fo, w&GCOMMA), gotcha++;
> +             fmt("Bcc:", hp->h_bcc, fo, w&GCOMMA),
> +                 gotcha++;
>       if (gotcha && w & GNL)
>               (void)putc('\n', fo);
>       return(0);
> @@ -609,4 +640,44 @@ sendint(int s)
>  {
>  
>       sendsignal = s;
> +}
> +
> +/* Search non valid UTF-8 characters in the message */
> +static int
> +not_utf8(FILE *message, int len)
> +{

Nitpick: I would rename `message` to something like `fp` here.

> +     int c, count, n, ulen;
> +     size_t i, resize;
> +     size_t jump = 100;
> +     unsigned char *s = NULL;
> +
> +     setlocale(LC_CTYPE, "en_US.UTF-8");
> +

Should the previous locale be restored later on, after this setlocale() call?

> +     if (s == NULL && (s = malloc(jump)) == NULL)
> +             err(1, NULL);

The check for whether `s` is NULL seems unnecessary here.

> +
> +     i = count = 0;
> +     while ((c = getc(message)) != EOF) {
> +             if (s == NULL || count == jump) {

The check for whether `s` is NULL seems unnecessary here.

> +                     if ((s = realloc(s, i + jump + 1)) == NULL)
> +                             err(1, NULL);
> +                     count = 0;
> +             }
> +             s[i++] = c;
> +             count++;
> +     }
> +
> +     s[i] = '\0';
> +
> +     ulen = mbstowcs(NULL, s, 0);
> +
> +     if (ulen == len)
> +             n = 0;
> +     else if (ulen < 0)
> +             n = 2; 
> +     else if (ulen < len)
> +             n = 1;
> +     
> +     free(s);
> +     return n;
>  }
> 
> 
> -- 
> Walter
> 

Since it assumes UTF-8, maybe mbstowcs() is not needed and it can be done in
one pass while reading the stream (no need to allocate, slurp the whole file
and decode). Just read it byte by byte and return on the first invalid sequence.

-- 
Kind regards,
Hiltjo

Reply via email to