Arrrgh, fumble-fingered typing: that second comment ("// search for any contraction of UTF-8 encoding") should read: // search for any CONTRADICTION of UTF-8 encoding
Stephen Posey [email protected] -----Original Message----- >From: Ross Levis <[email protected]> >Sent: Jul 23, 2016 12:27 AM >To: 'Moderated List for the Discussion of Delphi Programming excluding > Database-related topics' <[email protected]> >Subject: RE: C++ function convert to Delphi > >Hi Stephen, > >Many thanks for doing that. I won't necessarily know if it's working or not >but I can run some tests on UTF-8 and European text and see if it appears to >work. > >Cheers, >Ross. > >-----Original Message----- >From: Delphi [mailto:[email protected]] On Behalf Of Stephen >Posey >Sent: Saturday, 23 July 2016 1:04 a.m. >To: Moderated List for the Discussion of Delphi Programming excluding >Database-related topics; 'Moderated List for the Discussion of Delphi >Programming excluding Database-related topics' >Subject: Re: C++ function convert to Delphi > >Appended is a fairly literal translation (with some added annotations about >what's going on). > >Compiles in D2007. > >I haven't tested it and can't swear that I've gotten all the pointer >arithmetic and byte calculations 100% correct; but it ought to be close. 
> >HTH > >Stephen Posey >[email protected] > > >function isUTF8(data: PAnsiChar; size: Cardinal): Boolean; var > pStr, pEnd: PAnsiChar; > pb: PChar; > codeLength, byteIdx: Integer; > ch: Integer; >begin > pStr := data; // start of byte buffer > pEnd := data + size; // end of byte buffer > > Result := True; > > // while still bytes to examine > while (pStr <> pEnd) do > // search for any contraction of UTF-8 encoding > begin > pb := @pStr^; > > // bypass any 7-bit ASCII characters > > if (pb[0] <= Chr($7F)) then > // 1-byte 7-bit ASCII sequence: U+0000..U+007F > begin > Inc(pStr); // point to next byte > CONTINUE; // OK so far, next loop iteration > end; > > // check for legal multi-byte sequence > if (pb[0] >= Chr($C2)) and (pb[0] <= Chr($DF)) then > // binary 110xxxxx: 2-byte sequence > codeLength := 2 > else if (pb[0] >= Chr($E0)) and (pb[0] <= Chr($EF)) then > // binary 1110xxxx: 3-byte sequence > codeLength := 3 > else if (pb[0] >= Chr($F0)) and (pb[0] <= Chr($F4)) then > // binary 11110xxx: 4-byte sequence > codeLength := 4 > else begin > // invalid first byte for a multibyte character, definitely not UTF-8 > Result := False; > EXIT; > end; > > // not enough bytes left in byte buffer to account for indicated code >length > if ((pStr + (codeLength - 1)) >= pEnd) then > begin > // truncated string or invalid byte sequence, so not UTF-8 > Result := False; > EXIT; > end; > > // Check continuation bytes: bit 7 should be set, bit 6 should be unset >(binary 10xxxxxx) > for byteIdx := 1 to codeLength - 1 do > begin > if ((Byte(pb[byteIdx]) and $C0) <> $80) then > begin > // not UTF-8 > Result := False; > EXIT; > end; > end; > > // build multi-byte character > if (codeLength = 2) then > begin > // 2-byte sequence: U+0080..U+07FF > ch := ((Byte(pb[0]) and $1F) shl 6) + > (Byte(pb[1]) and $3F); > // str[0] >= $C2, so ch >= $0080. 
str[0] <= $DF, (str[1] and $3F) <= >$3F, so ch <= $07FF > end > > else if (codeLength = 3) then > begin > // 3-byte sequence: U+0800..U+FFFF > ch := ((Byte(pb[0]) and $0F) shl 12) + > ((Byte(pb[1]) and $3F) shl 6) + > (Byte(pb[2]) and $3F); > // ($FF and $0F) shl 12 or ($FF and $3F) shl 6 or ($FF and $3F) = >$FFFF, so ch <= $FFFF > > if (ch < $0800) then > begin > // not UTF-8 > Result := False; > EXIT; > end; > > // surrogates (U+D800-U+DFFF) are invalid in UTF-8: test if (ch >= >$D800 and ch <= $DFFF) > if ((ch shr 11) = $1B) then > begin > // not UTF-8 > Result := False; > EXIT; > end > end > > else if (codeLength = 4) then > begin > // 4-byte sequence: U+10000..U+10FFFF > ch := ((Byte(pb[0]) and $07) shl 18) + > ((Byte(pb[1]) and $3F) shl 12) + > ((Byte(pb[2]) and $3F) shl 6) + > (Byte(pb[3]) and $3F); > if ((ch < $10000) or (ch > $10FFFF)) then > begin > // not UTF-8 > Result := False; > EXIT; > end; > end; > > Inc(pStr, codeLength); // advance past multi-byte code point > end; > > Result := True; >end; > > >-----Original Message----- >>From: Ross Levis <[email protected]> >>Sent: Jul 7, 2016 1:09 AM >>To: 'Moderated List for the Discussion of Delphi Programming excluding >>Database-related topics' <[email protected]> >>Subject: C++ function convert to Delphi >> >>I'm hoping someone with C++ knowledge and some spare time can convert >>this function to Delphi/Pascal for me. I don't have any C++ knowledge. >> >> >> >>It is a method to establish with "some" degree of certainty if text is >>UTF-8 encoded as opposed to a European character set with extended >characters. >> >> >> >>Much appreciated! 
>> >> >> >>int isUTF8(const char *data, size_t size) >> >>{ >> >> const unsigned char *str = (unsigned char*)data; >> >> const unsigned char *end = str + size; >> >> unsigned char byte; >> >> unsigned int code_length, i; >> >> uint32_t ch; >> >> while (str != end) { >> >> byte = *str; >> >> if (byte <= 0x7F) { >> >> /* 1 byte sequence: U+0000..U+007F */ >> >> str += 1; >> >> continue; >> >> } >> >> >> >> if (0xC2 <= byte && byte <= 0xDF) >> >> /* 0b110xxxxx: 2 bytes sequence */ >> >> code_length = 2; >> >> else if (0xE0 <= byte && byte <= 0xEF) >> >> /* 0b1110xxxx: 3 bytes sequence */ >> >> code_length = 3; >> >> else if (0xF0 <= byte && byte <= 0xF4) >> >> /* 0b11110xxx: 4 bytes sequence */ >> >> code_length = 4; >> >> else { >> >> /* invalid first byte of a multibyte character */ >> >> return 0; >> >> } >> >> >> >> if (str + (code_length - 1) >= end) { >> >> /* truncated string or invalid byte sequence */ >> >> return 0; >> >> } >> >> >> >> /* Check continuation bytes: bit 7 should be set, bit 6 should >> be >> >> * unset (b10xxxxxx). */ >> >> for (i=1; i < code_length; i++) { >> >> if ((str[i] & 0xC0) != 0x80) >> >> return 0; >> >> } >> >> >> >> if (code_length == 2) { >> >> /* 2 bytes sequence: U+0080..U+07FF */ >> >> ch = ((str[0] & 0x1f) << 6) + (str[1] & 0x3f); >> >> /* str[0] >= 0xC2, so ch >= 0x0080. 
>> >> str[0] <= 0xDF, (str[1] & 0x3f) <= 0x3f, so ch <= 0x07ff >> */ >> >> } else if (code_length == 3) { >> >> /* 3 bytes sequence: U+0800..U+FFFF */ >> >> ch = ((str[0] & 0x0f) << 12) + ((str[1] & 0x3f) << 6) + >> >> (str[2] & 0x3f); >> >> /* (0xff & 0x0f) << 12 | (0xff & 0x3f) << 6 | (0xff & 0x3f) >>= 0xffff, >> >> so ch <= 0xffff */ >> >> if (ch < 0x0800) >> >> return 0; >> >> >> >> /* surrogates (U+D800-U+DFFF) are invalid in UTF-8: >> >> test if (0xD800 <= ch && ch <= 0xDFFF) */ >> >> if ((ch >> 11) == 0x1b) >> >> return 0; >> >> } else if (code_length == 4) { >> >> /* 4 bytes sequence: U+10000..U+10FFFF */ >> >> ch = ((str[0] & 0x07) << 18) + ((str[1] & 0x3f) << 12) + >> >> ((str[2] & 0x3f) << 6) + (str[3] & 0x3f); >> >> if ((ch < 0x10000) || (0x10FFFF < ch)) >> >> return 0; >> >> } >> >> str += code_length; >> >> } >> >> return 1; >> >>} >> >> >> >> >> >>Regards, >> >>Ross. >> >>_______________________________________________ >>Delphi mailing list >>[email protected] >>http://lists.elists.org/cgi-bin/mailman/listinfo/delphi > >_______________________________________________ >Delphi mailing list >[email protected] >http://lists.elists.org/cgi-bin/mailman/listinfo/delphi > > >_______________________________________________ >Delphi mailing list >[email protected] >http://lists.elists.org/cgi-bin/mailman/listinfo/delphi _______________________________________________ Delphi mailing list [email protected] http://lists.elists.org/cgi-bin/mailman/listinfo/delphi
