>I am new in UTF-8 issue and I would like to know how to read/write
UTF-8 in
>c/c++ program.
If I know I want utf-8, I skip all the locale-stuff and format it
directly.
For encoding, in my opinion, if you do not care one whit about non-utf-8
locales, this will work fine.
A simple UTF-8 formatting example:
// Encode a single UCS-4 value (at most 31 bits) as a UTF-8 byte sequence.
//
// value: the code to encode. Anything above 0x7FFFFFFF is rejected.
// buf:   destination; receives between 1 and 6 bytes, so the caller must
//        supply room for 6. No terminating NUL is written.
//
// Returns the number of bytes stored in buf, or 0 (with buf untouched)
// when value does not fit in 31 bits.
//
// This is the plain bit-layout mapping: the surrogate range and values
// beyond 0x10FFFF are encoded like any other value, not filtered out.
size_t ucs4toutf8( unsigned long value, unsigned char *buf )
{
    // Row k describes a sequence of k+1 bytes: the largest value it can
    // carry, the marker bits of its first byte, and the mask selecting the
    // payload bits that remain for that first byte.
    static const unsigned long max_value[6] = {
        0x0000007FUL, 0x000007FFUL, 0x0000FFFFUL,
        0x001FFFFFUL, 0x03FFFFFFUL, 0x7FFFFFFFUL
    };
    static const unsigned char lead_marker[6] = {
        0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
    };
    static const unsigned char lead_mask[6] = {
        0x7F, 0x1F, 0x0F, 0x07, 0x03, 0x01
    };
    size_t row = 0;
    size_t pos;

    // Pick the shortest sequence able to hold the value.
    while( row < 6 && value > max_value[row] )
    {
        ++row;
    }
    if( row == 6 )
    {
        return 0;
    }

    // Fill continuation bytes from the tail forward, six payload bits each.
    for( pos = row; pos > 0; --pos )
    {
        buf[pos] = (unsigned char)( ( value & 0x3F ) | 0x80 );
        value >>= 6;
    }

    // Whatever bits remain go into the first byte under its length marker.
    buf[0] = (unsigned char)( ( value & lead_mask[row] ) | lead_marker[row] );
    return row + 1;
}
--
Linux-UTF8: i18n of Linux on all levels
Archive: http://mail.nl.linux.org/linux-utf8/