> I am new to UTF-8 and I would like to know how to read/write UTF-8 in a
> C/C++ program.

If I know I want UTF-8, I skip all the locale machinery and encode it
directly.
For encoding, in my opinion, this works fine as long as you do not care
about non-UTF-8 locales.



A simple UTF-8 encoding example:

// Direct mapping from a 31-bit UCS-4 value to UTF-8 (the original 1..6 byte
// form, so values above 0x10FFFF are still accepted).
//
// value: the UCS-4 code value to encode.
// buf:   destination; must have room for the longest sequence (6 bytes).
//        No terminating NUL is written.
//
// Returns the number of bytes stored in buf (1..6), or 0 if value cannot be
// encoded: either it is larger than 0x7FFFFFFF, or it lies in the UTF-16
// surrogate range 0xD800..0xDFFF, which is not a character and must never
// appear in UTF-8. buf is left untouched when 0 is returned.
size_t ucs4toutf8( unsigned long value, unsigned char *buf )
{
    size_t len;
    size_t i;
    unsigned char lead;

    // Surrogates would otherwise fall into the 3-byte case below and produce
    // byte sequences that conforming decoders reject.
    if( value >= 0xD800 && value <= 0xDFFF )
    {
        return 0;
    }

    if( value <= 0x0000007F )
    {
        // Plain ASCII is stored unchanged.
        buf[0] = (unsigned char)value;
        return 1;
    }
    else if( value <= 0x000007FF )
    {
        len = 2;
        lead = 0xC0;
    }
    else if( value <= 0x0000FFFF )
    {
        len = 3;
        lead = 0xE0;
    }
    else if( value <= 0x001FFFFF )
    {
        len = 4;
        lead = 0xF0;
    }
    else if( value <= 0x03FFFFFF )
    {
        len = 5;
        lead = 0xF8;
    }
    else if( value <= 0x7FFFFFFF )
    {
        len = 6;
        lead = 0xFC;
    }
    else
    {
        return 0;
    }

    // Fill the continuation bytes from the end: each one carries the low
    // six bits of what is left of value, tagged with the 10xxxxxx marker.
    for( i = len - 1; i > 0; i-- )
    {
        buf[i] = (unsigned char)((value & 0x3F) | 0x80);
        value >>= 6;
    }

    // The range checks above guarantee the remaining bits fit into the
    // payload part of the lead byte, so no extra mask is needed.
    buf[0] = (unsigned char)(value | lead);
    return len;
}
--
Linux-UTF8:   i18n of Linux on all levels
Archive:      http://mail.nl.linux.org/linux-utf8/

Reply via email to