Murray Sargent wrote:
> [...] UTF-8 has the restrictions:
>  
> 1. Shortest UTF-8 form for a 32-bit value is always used; longer forms are
illegal
> 2. The surrogate codes 0xD800 - 0xDFFF are illegal in UTF-8 form
> 3. Only the first 17 planes are legal; codes above these planes, i.e.,
values greater than 0x10FFFF, are illegal.
>  
> These restrictions complicate the code below considerably.

Restriction 1 is already implemented (see field "lval" in struct "Tab"), and
restrictions 2 and 3 only require minor modifications (deleting two entries
from array "tab" and adding a range check in both functions):
 
---------------

// Changes:
// #1 - (Change ID not used!)
// #2 - The surrogate codes 0xD800 - 0xDFFF are illegal in UTF-8 form
// #3 - Only the first 17 planes are legal; codes above these planes, i.e.,
//      values greater than 0x10FFFF, are illegal.

/*
 * One row of the UTF-8 coding table; each row describes the sequences of
 * one particular byte length.
 */
typedef struct {
        int     cmask;  /* mask applied to the lead byte to isolate its length marker */
        int     cval;   /* value of the lead byte under cmask for this length */
        int     shift;  /* right shift that brings the lead byte's payload down when encoding */
        long    lmask;  /* payload mask for the accumulated value; also the largest value of this length */
        long    lval;   /* smallest value that needs this length; anything lower is rejected when decoding */
} Tab;

/*
 * UTF-8 coding table, one row per sequence length, shortest first.
 * Columns: cmask, cval, shift, lmask, lval (see Tab).
 *
 * The 5- and 6-byte rows are intentionally absent: they can only encode
 * values above 0x10FFFF, which are illegal.  The 4-byte row still reaches
 * 0x1FFFFF, so users of the table must range-check the value themselves.
 *
 * Every row is braced and the table ends with an explicit all-zero row;
 * the loops over this table stop at the first row whose cmask is 0.
 */
static
Tab     tab[] =
{
        { 0x80, 0x00,   0*6,    0x7F,           0       },      /* 1 byte sequence */
        { 0xE0, 0xC0,   1*6,    0x7FF,          0x80    },      /* 2 byte sequence */
        { 0xF0, 0xE0,   2*6,    0xFFFF,         0x800   },      /* 3 byte sequence */
        { 0xF8, 0xF0,   3*6,    0x1FFFFF,       0x10000 },      /* 4 byte sequence */
        { 0,    0,      0,      0,              0       },      /* end of table */
};

/*
 * Decode a single UTF-8 sequence.
 *
 *   p - receives the decoded value on success
 *   s - bytes to decode; when null, 0 is returned and nothing is read
 *   n - number of bytes available at s
 *
 * Returns the number of bytes consumed, or -1 when n is 0, the sequence is
 * truncated, a continuation byte is malformed, the lead byte matches no row
 * of tab, the encoding is longer than necessary, or the value is a surrogate
 * (0xD800 - 0xDFFF) or lies above 0x10FFFF.
 */
int
mbtowc(wchar_t *p, char *s, size_t n)
{
        long l;
        int c0, c, nc;
        Tab *t;

        if(s == 0)
                return 0;

        nc = 0;
        if(n == 0)
                return -1;
        c0 = *s & 0xff;
        l = c0;
        for(t=tab; t->cmask; t++) {
                nc++;
                if((c0 & t->cmask) == t->cval) {
                        /* drop the length marker bits of the lead byte */
                        l &= t->lmask;
                        /* change #1: reject non-shortest (overlong) forms */
                        if(l < t->lval)
                                return -1;

                        /*
                         * changes #2 and #3: surrogates and values beyond
                         * plane 16 are illegal.  0x10FFFF itself is the last
                         * legal value, hence '>' and not '>='.
                         */
                        if((l >= 0xD800L && l <= 0xDFFFL) || l > 0x10FFFFL)
                                return -1;

                        *p = l;
                        return nc;
                }
                /* another byte is needed; make sure the caller supplied it */
                if(n <= (size_t)nc)
                        return -1;
                s++;
                /* a continuation byte is 10xxxxxx: after the flip its top two bits must be 0 */
                c = (*s ^ 0x80) & 0xFF;
                if(c & 0xC0)
                        return -1;
                l = (l<<6) | c;
        }
        return -1;
}

/*
 * Encode one value as UTF-8.
 *
 *   s  - output buffer; when null, 0 is returned and nothing is written.
 *        Up to 4 bytes are stored and no terminator is appended.
 *   wc - value to encode
 *
 * Returns the number of bytes written, or -1 when wc is negative, is a
 * surrogate (0xD800 - 0xDFFF) or lies above 0x10FFFF.
 */
int
wctomb(char *s, wchar_t wc)
{
        long l;
        int c, nc;
        Tab *t;

        if(s == 0)
                return 0;

        l = wc;

        /*
         * changes #2 and #3: surrogates and values beyond plane 16 are
         * illegal.  0x10FFFF itself is the last legal value, hence '>' and
         * not '>='.  Negative values (possible where wchar_t is signed) would
         * otherwise pass the "l <= lmask" test of the first row and be
         * emitted as a single bogus byte.
         */
        if(l < 0 || (l >= 0xD800L && l <= 0xDFFFL) || l > 0x10FFFFL)
                return -1;

        nc = 0;
        for(t=tab; t->cmask; t++) {
                nc++;
                /* first row whose range holds l gives the shortest form */
                if(l <= t->lmask) {
                        c = t->shift;
                        /* lead byte: length marker plus the topmost payload bits */
                        *s = t->cval | (l>>c);
                        /* continuation bytes: 10xxxxxx, six payload bits each */
                        while(c > 0) {
                                c -= 6;
                                s++;
                                *s = 0x80 | ((l>>c) & 0x3F);
                        }
                        return nc;
                }
        }
        return -1;
}

---------------

BTW, I wish I had this historical info before: I just delivered a paper
about Unicode where I attributed the invention of UTF-8 to François
Yergeau... And that placemat of the New Jersey diner would have made such a
cute story! :-(

_ Marco
--
Linux-UTF8:   i18n of Linux on all levels
Archive:      http://mail.nl.linux.org/linux-utf8/

Reply via email to