I'm hoping someone with C++ knowledge and some spare time can convert this
function to Delphi/Pascal for me. I don't have any C++ knowledge.
It is a function that determines, with "some" degree of certainty, whether text
is UTF-8 encoded as opposed to a European character set with extended characters.
Much appreciated!
/*
 * Check whether a byte buffer is well-formed UTF-8.
 *
 * data: pointer to the first byte of the buffer (need not be NUL-terminated;
 *       embedded NUL bytes are treated as the ordinary character U+0000).
 * size: number of bytes in the buffer.
 *
 * Returns 1 if every byte belongs to a valid UTF-8 sequence, 0 otherwise.
 * An empty buffer (size == 0) is considered valid.
 *
 * Rejected input: stray continuation bytes, lead bytes 0xC0/0xC1 and
 * 0xF5..0xFF, sequences truncated by the end of the buffer, overlong
 * encodings, UTF-16 surrogates (U+D800..U+DFFF) and code points above
 * U+10FFFF.
 */
int isUTF8(const char *data, size_t size)
{
    const unsigned char *str = (const unsigned char *)data;
    const unsigned char *end;

    /* Nothing to scan; also avoids pointer arithmetic on a NULL buffer. */
    if (size == 0) {
        return 1;
    }
    end = str + size;

    while (str != end) {
        unsigned char byte = *str;
        size_t code_length;
        size_t i;
        uint32_t ch;

        if (byte <= 0x7F) {
            /* 1 byte sequence: U+0000..U+007F */
            str += 1;
            continue;
        }

        if (0xC2 <= byte && byte <= 0xDF) {
            /* 0b110xxxxx: 2 bytes sequence (0xC0/0xC1 would be overlong) */
            code_length = 2;
        } else if (0xE0 <= byte && byte <= 0xEF) {
            /* 0b1110xxxx: 3 bytes sequence */
            code_length = 3;
        } else if (0xF0 <= byte && byte <= 0xF4) {
            /* 0b11110xxx: 4 bytes sequence */
            code_length = 4;
        } else {
            /* invalid first byte of a multibyte character */
            return 0;
        }

        /*
         * Truncated sequence: compare the number of remaining bytes with the
         * sequence length instead of computing str + code_length - 1, which
         * could point past one-past-the-end of the buffer (undefined
         * behaviour).
         */
        if ((size_t)(end - str) < code_length) {
            return 0;
        }

        /* Check continuation bytes: bit 7 should be set, bit 6 should be
         * unset (0b10xxxxxx). */
        for (i = 1; i < code_length; i++) {
            if ((str[i] & 0xC0) != 0x80) {
                return 0;
            }
        }

        if (code_length == 2) {
            /*
             * 2 bytes sequence: U+0080..U+07FF.
             * The lead byte range 0xC2..0xDF already guarantees
             * 0x0080 <= code point <= 0x07FF, so nothing more to check.
             */
        } else if (code_length == 3) {
            /* 3 bytes sequence: U+0800..U+FFFF */
            ch = ((uint32_t)(str[0] & 0x0f) << 12) |
                 ((uint32_t)(str[1] & 0x3f) << 6) |
                 (uint32_t)(str[2] & 0x3f);
            /* overlong encoding of a code point below U+0800 */
            if (ch < 0x0800) {
                return 0;
            }
            /* surrogates (U+D800-U+DFFF) are invalid in UTF-8:
             * (ch >> 11) == 0x1b  <=>  0xD800 <= ch && ch <= 0xDFFF */
            if ((ch >> 11) == 0x1b) {
                return 0;
            }
        } else {
            /* 4 bytes sequence: U+10000..U+10FFFF */
            ch = ((uint32_t)(str[0] & 0x07) << 18) |
                 ((uint32_t)(str[1] & 0x3f) << 12) |
                 ((uint32_t)(str[2] & 0x3f) << 6) |
                 (uint32_t)(str[3] & 0x3f);
            /* reject overlong encodings and values beyond the Unicode range */
            if ((ch < 0x10000) || (0x10FFFF < ch)) {
                return 0;
            }
        }

        str += code_length;
    }
    return 1;
}
Regards,
Ross.
_______________________________________________
Delphi mailing list
[email protected]
http://lists.elists.org/cgi-bin/mailman/listinfo/delphi