On Thu, Sep 21, 2023 at 11:26:11AM +0200, Omar Polo wrote:
> On 2023/09/21 10:55:47 +0200, Walter Alejandro Iglesias <[email protected]>
> wrote:
> > On Wed, Sep 20, 2023 at 08:36:23PM +0200, Walter Alejandro Iglesias wrote:
> > > On Wed, Sep 20, 2023 at 07:44:12PM +0200, Walter Alejandro Iglesias wrote:
> > > > And this new idea simplifies all to this:
> > >
> > > In case anyone else is worried. Crystal Kolipe already pointed me out
> > > that a better UTF-8 checking is needed, I know, I'll get to that
> > > tomorrow.
> >
> > The following version checks for not valid UTF-8 characters. I could
> > make it fail in this case and send a dead.letter but I imagine that
> > those who really use mail(1) surely do it mostly in a tty console where,
> > at least with a non US keyboard, is too easy to type some non valid utf-8
> > character, hence this feature would be more a hassle than a help, so I
> > chose to make it simply skip adding any MIME header in this case (how it
> > has been used until now and no one complained :-)). If you prefer the
> > other behavior let me know.
> >
> >
> > Index: send.c
> > ===================================================================
> > RCS file: /cvs/src/usr.bin/mail/send.c,v
> > retrieving revision 1.26
> > diff -u -p -r1.26 send.c
> > --- send.c 8 Mar 2023 04:43:11 -0000 1.26
> > +++ send.c 21 Sep 2023 08:40:11 -0000
> > @@ -33,6 +33,15 @@
> > #include "rcv.h"
> > #include "extern.h"
> >
> > +/*
> > + * Variables and functions declared here will be useful to check the
> > + * character set of the message to add the appropiate MIME headers.
> > + */
> > +static char nascii = 0;
> > +static char nutf8 = 0;
>
> There's no need to explicitly zero static (or global) variables.
>
> > +static int not_ascii(struct __sFILE *s);
> > +static int not_utf8(struct __sFILE *s, int len);
>
> I'd use FILE * instead of struct __sFILE
>
> > static volatile sig_atomic_t sendsignal; /* Interrupted by a signal? */
> >
> > /*
> > @@ -341,6 +350,15 @@ mail1(struct header *hp, int printheader
> > else
> > puts("Null message body; hope that's ok");
> > }
> > +
> > + /* Check for non ASCII characters in the message */
> > + nascii = not_ascii(mtf);
> > + rewind(mtf);
> > +
> > + /* Check for non valid UTF-8 characters in the message */
> > + nutf8 = not_utf8(mtf, fsize(mtf));
> > + rewind(mtf);
>
> assuming that we care for this two checks, why not doing everything in
> a single pass?
>
> Do we really need the two checks?
>
> > /*
> > * Now, take the user names from the combined
> > * to and cc lists and do all the alias
> > @@ -525,6 +543,14 @@ puthead(struct header *hp, FILE *fo, int
> > fmt("To:", hp->h_to, fo, w&GCOMMA), gotcha++;
> > if (hp->h_subject != NULL && w & GSUBJECT)
> > fprintf(fo, "Subject: %s\n", hp->h_subject), gotcha++;
> > + if (!nascii)
> > + fprintf(fo, "MIME-Version: 1.0\n"
> > + "Content-Type: text/plain; charset=us-ascii\n"
> > + "Content-Transfer-Encoding: 7bit\n"), gotcha++;
>
> +1 for splitting the string in multiple lines, this is an improvements
> over previous versions, but please
>
> - use four spaces of indentation for continuation lines
>
> - although existing code uses ", gotcha++" I'd split that in a
> separate line for clarity.
>
> > + else if (nutf8 == 0)
> > + fprintf(fo, "MIME-Version: 1.0\n"
> > + "Content-Type: text/plain; charset=utf-8\n"
> > + "Content-Transfer-Encoding: 8bit\n"), gotcha++;
> > if (hp->h_cc != NULL && w & GCC)
> > fmt("Cc:", hp->h_cc, fo, w&GCOMMA), gotcha++;
> > if (hp->h_bcc != NULL && w & GBCC)
> > @@ -609,4 +635,67 @@ sendint(int s)
> > {
> >
> > sendsignal = s;
> > +}
> > +
> > +/* Search non ASCII characters in the message */
> > +static int
> > +not_ascii(struct __sFILE *s)
> > +{
> > + int ch, n;
> > + n = 0;
> > + while ((ch = getc(s)) != EOF)
>
> There are some spacing issues, both here and below.
>
> > + if (ch > 0x7f)
> > + n = 1;
> > +
> > + return n;
> > +}
> > +
> > +/* Search non valid UTF-8 characters in the message */
> > +static int
> > +not_utf8(struct __sFILE *message, int len)
> > +{
> > + int i, nou8;
> > + char c;
> > + unsigned char s[len + 1];
>
> Please don't. Variable length arrays (VLA) with a possibly large len
> are a bad idea. They have more or less the same issues as alloca(3),
> see the CAVEATS section of it to get an idea.
>
> > +
> > + i = 0;
> > + while ((c = getc(message)) != EOF)
> > + s[i++] = c;
>
> and even then, fread() is simpler :-)
>
> > + s[i] = '\0';
> > +
> > + i = nou8 = 0;
> > + while (i != len)
>
> ...and even then, mbtowc is easier to use. See Ingo'
> /src/usr/bin/ls/utf8.c for an example usage.
>
> > + if (s[i] <= 0x7f)
> > + ++i;
> > + /* Two bytes case */
> > + else if (s[i] >= 0xc2 && s[i] < 0xe0 &&
> > + s[i + 1] >= 0x80 && s[i + 1] <= 0xbf)
> > + i += 2;
> > + /* Special three bytes case */
> > + else if ((s[i] == 0xe0 &&
> > + s[i + 1] >= 0xa0 && s[i + 1] <= 0xbf &&
> > + s[i + 2] >= 0x80 && s[i + 2] <= 0xbf) ||
> > [...]
> > }
>
>
> Admittedly I haven't followed the thread too closely, although I hope
> something like this will end up in mail(1) sometimes :-)
Hi Omar,
I corrected many of the things you pointed me, but not all. The
function I use to check utf8 is mine, I use it in a pair of little
programs which I've *hardly* checked for memory leacks. I know my
function looks BIG :-), but I know for sure that it does the job. I
could just filter only 0xfe and 0xff as Crystal suggests, but that lets
pass iso-latin characters as utf-8, if that doesn't bother anyone I can
do that. I don't know the other solutions you mention, I still have to
learn them. :-)
What concerns me now is to add a utf8 check for the subject.
>
> Thanks,
>
> Omar Polo
Index: send.c
===================================================================
RCS file: /cvs/src/usr.bin/mail/send.c,v
retrieving revision 1.26
diff -u -p -r1.26 send.c
--- send.c 8 Mar 2023 04:43:11 -0000 1.26
+++ send.c 21 Sep 2023 11:01:58 -0000
@@ -33,6 +33,10 @@
#include "rcv.h"
#include "extern.h"
+/* To check charset of the message and add the appropiate MIME headers */
+static char nutf8;
+static int not_utf8(FILE *s, int len);
+
static volatile sig_atomic_t sendsignal; /* Interrupted by a signal? */
/*
@@ -341,6 +345,11 @@ mail1(struct header *hp, int printheader
else
puts("Null message body; hope that's ok");
}
+
+ /* Check non valid UTF-8 characters in the message */
+ nutf8 = not_utf8(mtf, fsize(mtf));
+ rewind(mtf);
+
/*
* Now, take the user names from the combined
* to and cc lists and do all the alias
@@ -525,6 +534,14 @@ puthead(struct header *hp, FILE *fo, int
fmt("To:", hp->h_to, fo, w&GCOMMA), gotcha++;
if (hp->h_subject != NULL && w & GSUBJECT)
fprintf(fo, "Subject: %s\n", hp->h_subject), gotcha++;
+ if (nutf8 == 0)
+ fprintf(fo, "MIME-Version: 1.0\n"
+ "Content-Type: text/plain; charset=us-ascii\n"
+ "Content-Transfer-Encoding: 7bit\n"), gotcha++;
+ else if (nutf8 == 1)
+ fprintf(fo, "MIME-Version: 1.0\n"
+ "Content-Type: text/plain; charset=utf-8\n"
+ "Content-Transfer-Encoding: 8bit\n"), gotcha++;
if (hp->h_cc != NULL && w & GCC)
fmt("Cc:", hp->h_cc, fo, w&GCOMMA), gotcha++;
if (hp->h_bcc != NULL && w & GBCC)
@@ -609,4 +626,60 @@ sendint(int s)
{
sendsignal = s;
+}
+
+/* Search non valid UTF-8 characters in the message */
+static int
+not_utf8(FILE *message, int len)
+{
+ int i, n, nonascii;
+ char c;
+ unsigned char s[len + 1];
+
+ i = 0;
+ while ((c = getc(message)) != EOF)
+ s[i++] = c;
+
+ s[i] = '\0';
+
+ i = n = nonascii = 0;
+ while (i != len)
+ if (s[i] <= 0x7f) {
+ i++;
+ /* Two bytes case */
+ } else if (s[i] >= 0xc2 && s[i] < 0xe0 &&
+ s[i + 1] >= 0x80 && s[i + 1] <= 0xbf) {
+ i += 2;
+ nonascii++;
+ /* Special three bytes case */
+ } else if ((s[i] == 0xe0 &&
+ s[i + 1] >= 0xa0 && s[i + 1] <= 0xbf &&
+ s[i + 2] >= 0x80 && s[i + 2] <= 0xbf) ||
+ /* Three bytes case */
+ (s[i] > 0xe0 && s[i] < 0xf0 &&
+ s[i + 1] >= 0x80 && s[i + 1] <= 0xbf &&
+ s[i + 2] >= 0x80 && s[i + 2] <= 0xbf)) {
+ i += 3;
+ nonascii++;
+ /* Special four bytes case */
+ } else if ((s[i] == 0xf0 &&
+ s[i + 1] >= 0x90 && s[i + 1] <= 0xbf &&
+ s[i + 2] >= 0x80 && s[i + 2] <= 0xbf &&
+ s[i + 3] >= 0x80 && s[i + 3] <= 0xbf) ||
+ /* Four bytes case */
+ (s[i] > 0xf0 &&
+ s[i + 1] >= 0x80 && s[i + 1] <= 0xbf &&
+ s[i + 2] >= 0x80 && s[i + 2] <= 0xbf &&
+ s[i + 3] >= 0x80 && s[i + 3] <= 0xbf)) {
+ i += 4;
+ nonascii++;
+ } else {
+ n = i + 1;
+ break;
+ }
+
+ if (nonascii)
+ n++;
+
+ return n;
}
--
Walter