On Mon, 25 Sep 2023 19:00:15 +0200, Hiltjo Posthuma wrote:
> On Mon, Sep 25, 2023 at 03:13:03PM +0200, Walter Alejandro Iglesias wrote:
> > This new version, when it detects invalid UTF-8 in the body, saves a
> > dead.letter, prints the following message and exits.
> > 
> >   $ mail -s hello user < invalid_utf8.txt
> >   Invalid or incomplete multibyte or wide character
> >   . . . message not sent.
> > 
> > 
> > 
> > Index: send.c
> > ===================================================================
> > RCS file: /cvs/src/usr.bin/mail/send.c,v
> > retrieving revision 1.26
> > diff -u -p -r1.26 send.c
> > --- send.c  8 Mar 2023 04:43:11 -0000       1.26
> > +++ send.c  25 Sep 2023 13:07:17 -0000
> > @@ -32,6 +32,11 @@
> >  
> >  #include "rcv.h"
> >  #include "extern.h"
> > +#include "locale.h"
> > +
> > +/* To check charset of the message and add the appropriate MIME headers  */
> > +static char nutf8;
> > +static int not_utf8(FILE *s, int len);
> >  
> >  static volatile sig_atomic_t sendsignal;   /* Interrupted by a signal? */
> >  
> > @@ -341,6 +346,17 @@ mail1(struct header *hp, int printheader
> >             else
> >                     puts("Null message body; hope that's ok");
> >     }
> > +
> > +   /* Check non valid UTF-8 characters in the message */
> > +   nutf8 = not_utf8(mtf, fsize(mtf));
> > +   rewind(mtf);
> > +   if (nutf8 > 1) {
> > +           savedeadletter(mtf);
> > +           puts("Invalid or incomplete multibyte or wide character");
> > +           fputs(". . . message not sent.\n", stderr);
> > +           exit(1);
> > +   }
> > +
> >     /*
> >      * Now, take the user names from the combined
> >      * to and cc lists and do all the alias
> > @@ -520,15 +536,30 @@ puthead(struct header *hp, FILE *fo, int
> >     gotcha = 0;
> >     from = hp->h_from ? hp->h_from : value("from");
> >     if (from != NULL)
> > -           fprintf(fo, "From: %s\n", from), gotcha++;
> > +           fprintf(fo, "From: %s\n", from),
> > +               gotcha++;
> >     if (hp->h_to != NULL && w & GTO)
> > -           fmt("To:", hp->h_to, fo, w&GCOMMA), gotcha++;
> > +           fmt("To:", hp->h_to, fo, w&GCOMMA),
> > +               gotcha++;
> >     if (hp->h_subject != NULL && w & GSUBJECT)
> > -           fprintf(fo, "Subject: %s\n", hp->h_subject), gotcha++;
> > +           fprintf(fo, "Subject: %s\n", hp->h_subject),
> > +               gotcha++;
> > +   if (nutf8 == 0)
> > +           fprintf(fo, "MIME-Version: 1.0\n"
> > +               "Content-Type: text/plain; charset=us-ascii\n"
> > +               "Content-Transfer-Encoding: 7bit\n"),
> > +               gotcha++;
> > +   else if (nutf8 == 1)
> > +           fprintf(fo, "MIME-Version: 1.0\n"
> > +               "Content-Type: text/plain; charset=utf-8\n"
> > +               "Content-Transfer-Encoding: 8bit\n"),
> > +               gotcha++;
> >     if (hp->h_cc != NULL && w & GCC)
> > -           fmt("Cc:", hp->h_cc, fo, w&GCOMMA), gotcha++;
> > +           fmt("Cc:", hp->h_cc, fo, w&GCOMMA),
> > +               gotcha++;
> >     if (hp->h_bcc != NULL && w & GBCC)
> > -           fmt("Bcc:", hp->h_bcc, fo, w&GCOMMA), gotcha++;
> > +           fmt("Bcc:", hp->h_bcc, fo, w&GCOMMA),
> > +               gotcha++;
> >     if (gotcha && w & GNL)
> >             (void)putc('\n', fo);
> >     return(0);
> > @@ -609,4 +640,44 @@ sendint(int s)
> >  {
> >  
> >     sendsignal = s;
> > +}
> > +
> > +/* Search non valid UTF-8 characters in the message */
> > +static int
> > +not_utf8(FILE *message, int len)
> > +{
>
> Nitpick: I would call `message` maybe `fp` or something here.
>
> > +   int c, count, n, ulen;
> > +   size_t i, resize;
> > +   size_t jump = 100;
> > +   unsigned char *s = NULL;
> > +
> > +   setlocale(LC_CTYPE, "en_US.UTF-8");
> > +
>
> Should setlocale() be restored later on?
>
> > +   if (s == NULL && (s = malloc(jump)) == NULL)
> > +           err(1, NULL);
>
> The check if `s` is NULL seems unnecessary here.
>
> > +
> > +   i = count = 0;
> > +   while ((c = getc(message)) != EOF) {
> > +           if (s == NULL || count == jump) {
>
> The check if `s` is NULL seems unnecessary here.
>
> > +                   if ((s = realloc(s, i + jump + 1)) == NULL)
> > +                           err(1, NULL);
> > +                   count = 0;
> > +           }
> > +           s[i++] = c;
> > +           count++;
> > +   }
> > +
> > +   s[i] = '\0';
> > +
> > +   ulen = mbstowcs(NULL, s, 0);
> > +
> > +   if (ulen == len)
> > +           n = 0;
> > +   else if (ulen < 0)
> > +           n = 2; 
> > +   else if (ulen < len)
> > +           n = 1;
> > +   
> > +   free(s);
> > +   return n;
> >  }
> > 
> > 
> > -- 
> > Walter
> > 
>
> Since it assumes UTF-8, maybe mbstowcs() is not needed and it can be done in
> one pass while reading the stream (no need to allocate, slurp the whole file
> and decode). Just: read the stream byte by byte and return on the first invalid sequence.

Yours are the first technical, functional corrections I got about the
code.  Thanks!  Let's go back in time, then.  I think that what you're
telling me can be done by simply replacing "break" with "return" in my
original function.  Tell me what you think, please.


(I didn't test the following patch since I'm responding to this from Linux)


--- send.c.orig 2023-09-25 21:01:34.780102611 +0200
+++ send.c      2023-09-25 21:17:11.120117761 +0200
@@ -33,6 +33,10 @@
 #include "rcv.h"
 #include "extern.h"
 
+/* To check charset of the message and add the appropriate MIME headers  */
+static char nutf8;
+static int not_utf8(FILE *s, int len);
+
 static volatile sig_atomic_t sendsignal;       /* Interrupted by a signal? */
 
 /*
@@ -341,6 +345,17 @@
                else
                        puts("Null message body; hope that's ok");
        }
+
+       /* Check non valid UTF-8 characters in the message */
+       nutf8 = not_utf8(mtf, fsize(mtf));
+       rewind(mtf);
+       if (nutf8 > 1) {
+               savedeadletter(mtf);
+               puts("Invalid or incomplete multibyte or wide character");
+               fputs(". . . message not sent.\n", stderr);
+               exit(1);
+       }
+
        /*
         * Now, take the user names from the combined
         * to and cc lists and do all the alias
@@ -369,7 +384,7 @@
        }
        if ((cp = value("record")) != NULL)
                (void)savemail(expand(cp), mtf);
-       
+
        /* Setup sendmail arguments. */
         *ap++ = "sendmail";
         *ap++ = "-i";
@@ -525,6 +540,16 @@
                fmt("To:", hp->h_to, fo, w&GCOMMA), gotcha++;
        if (hp->h_subject != NULL && w & GSUBJECT)
                fprintf(fo, "Subject: %s\n", hp->h_subject), gotcha++;
+       if (nutf8 == 0)
+               fprintf(fo, "MIME-Version: 1.0\n"
+                   "Content-Type: text/plain; charset=us-ascii\n"
+                   "Content-Transfer-Encoding: 7bit\n"),
+                   gotcha++;
+       else if (nutf8 == 1)
+               fprintf(fo, "MIME-Version: 1.0\n"
+                   "Content-Type: text/plain; charset=utf-8\n"
+                   "Content-Transfer-Encoding: 8bit\n"),
+                   gotcha++;
        if (hp->h_cc != NULL && w & GCC)
                fmt("Cc:", hp->h_cc, fo, w&GCOMMA), gotcha++;
        if (hp->h_bcc != NULL && w & GBCC)
@@ -610,3 +635,59 @@
 
        sendsignal = s;
 }
+
+/* Search non valid UTF-8 characters in the message */
+static int
+not_utf8(FILE *fp, int len)
+{
+       int i, n, nonascii;
+       char c;
+       unsigned char s[len];
+
+       i = 0;
+        while ((c = getc(fp)) != EOF)
+               s[i++] = c;
+
+       s[i] = '\0';
+
+       i = n = nonascii = 0;
+       while (s[i] != '\0')
+               if (s[i] <= 0x7f) {
+                       i++;
+               /* Two bytes case */
+               } else if (s[i] >= 0xc2 && s[i] < 0xe0 &&
+                   s[i + 1] >= 0x80 && s[i + 1] <= 0xbf) {
+                       i += 2;
+                       nonascii++;
+               /* Special three bytes case */
+               } else if ((s[i] == 0xe0 &&
+                   s[i + 1] >= 0xa0 && s[i + 1] <= 0xbf &&
+                   s[i + 2] >= 0x80 && s[i + 2] <= 0xbf) ||
+                   /* Three bytes case */
+                   (s[i] > 0xe0 && s[i] < 0xf0 &&
+                   s[i + 1] >= 0x80 && s[i + 1] <= 0xbf &&
+                   s[i + 2] >= 0x80 && s[i + 2] <= 0xbf)) {
+                       i += 3;
+                       nonascii++;
+               /* Special four bytes case */
+               } else if ((s[i] == 0xf0 &&
+                   s[i + 1] >= 0x90 && s[i + 1] <= 0xbf &&
+                   s[i + 2] >= 0x80 && s[i + 2] <= 0xbf &&
+                   s[i + 3] >= 0x80 && s[i + 3] <= 0xbf) ||
+                   /* Four bytes case */
+                   (s[i] > 0xf0 &&
+                   s[i + 1] >= 0x80 && s[i + 1] <= 0xbf &&
+                   s[i + 2] >= 0x80 && s[i + 2] <= 0xbf &&
+                   s[i + 3] >= 0x80 && s[i + 3] <= 0xbf)) {
+                       i += 4;
+                       nonascii++;
+               } else {
+                       n = i + 1;
+                       return n;
+               }
+
+       if (nonascii)
+               n++;
+
+       return n;
+}



>
> -- 
> Kind regards,
> Hiltjo
>

-- 
Walter

Reply via email to