I hadn't had time to implement the function, but did so tonight.
Unfortunately, feeding it 'abc' returns True. These are 7-bit characters,
which I see the loop skips. I'm not sure why, or whether the C++ code works.
I seem to have fixed this, and now get correct results for 7-bit, 8-bit
European, and UTF-8 text. See the 2 lines marked with <<<<<<<
begin
pStr := data; // start of byte buffer
pEnd := data + size; // end of byte buffer
Result := False; // <<<<<<<
// while still bytes to examine
while (pStr <> pEnd) do
// search for any contradiction of UTF-8 encoding
begin
pb := pStr;
// bypass any 7-bit ASCII characters
if (pb[0] <= Chr($7F)) then
// 1-byte 7-bit ASCII sequence: U+0000..U+007F
begin
Inc(pStr); // point to next byte
CONTINUE; // OK so far, next loop iteration
end;
Result := True; // <<<<<<<
// check for legal multi-byte sequence
....
Also removed the final Result := True at the end.
Ross.
-----Original Message-----
From: Ross Levis [mailto:[email protected]]
Sent: Wednesday, 27 July 2016 11:14 p.m.
To: 'Moderated List for the Discussion of Delphi Programming excluding
Database-related topics'
Subject: RE: C++ function convert to Delphi
I read contraction and wondered how that was relevant :-)
Cheers.
-----Original Message-----
From: Delphi [mailto:[email protected]] On Behalf Of Stephen
Posey
Sent: Monday, 25 July 2016 6:17 a.m.
To: Moderated List for the Discussion of Delphi Programming excluding
Database-related topics; 'Moderated List for the Discussion of Delphi
Programming excluding Database-related topics'
Subject: RE: C++ function convert to Delphi
Arrrgh, fumble-fingered typing, that second comment should read:
// search for any CONTRADICTION of UTF-8 encoding
Stephen Posey
[email protected]
-----Original Message-----
>From: Ross Levis <[email protected]>
>Sent: Jul 23, 2016 12:27 AM
>To: 'Moderated List for the Discussion of Delphi Programming excluding
>Database-related topics' <[email protected]>
>Subject: RE: C++ function convert to Delphi
>
>Hi Stephen,
>
>Many thanks for doing that. I won't necessarily know if it's working
>or not but I can run some tests on UTF-8 and European text and see if
>it appears to work.
>
>Cheers,
>Ross.
>
>-----Original Message-----
>From: Delphi [mailto:[email protected]] On Behalf Of
>Stephen Posey
>Sent: Saturday, 23 July 2016 1:04 a.m.
>To: Moderated List for the Discussion of Delphi Programming excluding
>Database-related topics; 'Moderated List for the Discussion of Delphi
>Programming excluding Database-related topics'
>Subject: Re: C++ function convert to Delphi
>
>Appended is a fairly literal translation (with some added annotations
>about what's going on).
>
>Compiles in D2007.
>
>I haven't tested it and can't swear that I've gotten all the pointer
>arithmetic and byte calculations 100% correct; but it ought to be close.
>
>HTH
>
>Stephen Posey
>[email protected]
>
>
>// Reports whether the `size` bytes starting at `data` are well-formed
>// UTF-8. This is a translation of the C isUTF8 routine quoted below.
>//
>// Parameters:
>//   data - pointer to the first byte of the buffer to examine
>//   size - number of bytes in the buffer
>//
>// Returns True when nothing in the buffer contradicts UTF-8 encoding
>// (this includes an empty buffer and pure 7-bit ASCII text). Returns
>// False at the first invalid lead byte, truncated sequence, bad
>// continuation byte, overlong encoding, surrogate (U+D800..U+DFFF),
>// or code point above U+10FFFF.
>function isUTF8(data: PAnsiChar; size: Cardinal): Boolean;
>var
>  pStr, pEnd: PAnsiChar;
>  pb: PAnsiChar; // byte pointer; PChar would be PWideChar in D2009+
>  codeLength, byteIdx: Integer;
>  ch: Integer;
>begin
>  // Assume failure so every early Exit below reports "not UTF-8".
>  Result := False;
>
>  pStr := data;        // start of byte buffer
>  pEnd := data + size; // one past the last byte of the buffer
>
>  // while still bytes to examine,
>  // search for any contradiction of UTF-8 encoding
>  while (pStr <> pEnd) do
>  begin
>    pb := pStr;
>
>    // bypass any 7-bit ASCII characters
>    if (Byte(pb[0]) <= $7F) then
>    begin
>      // 1-byte 7-bit ASCII sequence: U+0000..U+007F
>      Inc(pStr); // point to next byte
>      Continue;  // OK so far, next loop iteration
>    end;
>
>    // check for legal multi-byte lead byte
>    // (compared as Byte so the result does not depend on Char width)
>    if (Byte(pb[0]) >= $C2) and (Byte(pb[0]) <= $DF) then
>      // binary 110xxxxx: 2-byte sequence
>      codeLength := 2
>    else if (Byte(pb[0]) >= $E0) and (Byte(pb[0]) <= $EF) then
>      // binary 1110xxxx: 3-byte sequence
>      codeLength := 3
>    else if (Byte(pb[0]) >= $F0) and (Byte(pb[0]) <= $F4) then
>      // binary 11110xxx: 4-byte sequence
>      codeLength := 4
>    else
>      // invalid first byte for a multi-byte character
>      Exit;
>
>    // not enough bytes left in the buffer for the indicated length:
>    // truncated string or invalid byte sequence
>    if ((pStr + (codeLength - 1)) >= pEnd) then
>      Exit;
>
>    // continuation bytes: bit 7 set, bit 6 clear (binary 10xxxxxx)
>    for byteIdx := 1 to codeLength - 1 do
>      if ((Byte(pb[byteIdx]) and $C0) <> $80) then
>        Exit;
>
>    // 2-byte sequence: U+0080..U+07FF. No range check is needed:
>    // lead >= $C2 gives ch >= $0080, and lead <= $DF with a 6-bit
>    // continuation payload gives ch <= $07FF.
>
>    if (codeLength = 3) then
>    begin
>      // 3-byte sequence: U+0800..U+FFFF
>      // (4 + 6 + 6 payload bits, so ch cannot exceed $FFFF)
>      ch := ((Byte(pb[0]) and $0F) shl 12) +
>            ((Byte(pb[1]) and $3F) shl 6) +
>             (Byte(pb[2]) and $3F);
>
>      // overlong encoding of a value below U+0800
>      if (ch < $0800) then
>        Exit;
>
>      // surrogates (U+D800..U+DFFF) are invalid in UTF-8;
>      // (ch shr 11) = $1B is the same test as that range check
>      if ((ch shr 11) = $1B) then
>        Exit;
>    end
>    else if (codeLength = 4) then
>    begin
>      // 4-byte sequence: U+10000..U+10FFFF
>      ch := ((Byte(pb[0]) and $07) shl 18) +
>            ((Byte(pb[1]) and $3F) shl 12) +
>            ((Byte(pb[2]) and $3F) shl 6) +
>             (Byte(pb[3]) and $3F);
>
>      // overlong encoding, or beyond the last Unicode code point
>      if ((ch < $10000) or (ch > $10FFFF)) then
>        Exit;
>    end;
>
>    Inc(pStr, codeLength); // advance past multi-byte code point
>  end;
>
>  // reached the end of the buffer without finding a contradiction
>  Result := True;
>end;
>
>
>-----Original Message-----
>>From: Ross Levis <[email protected]>
>>Sent: Jul 7, 2016 1:09 AM
>>To: 'Moderated List for the Discussion of Delphi Programming excluding
>>Database-related topics' <[email protected]>
>>Subject: C++ function convert to Delphi
>>
>>I'm hoping someone with C++ knowledge and some spare time can convert
>>this function to Delphi/Pascal for me. I don't have any C++ knowledge.
>>
>>
>>
>>It is a method to establish with "some" degree of certainty if text is
>>UTF-8 encoded as opposed to a European character set with extended
>characters.
>>
>>
>>
>>Much appreciated!
>>
>>
>>
>>/*
>> * Report whether the `size` bytes starting at `data` are well-formed
>> * UTF-8.
>> *
>> * Returns 1 when no byte sequence contradicts UTF-8 encoding (this
>> * includes size == 0 and pure 7-bit ASCII input). Returns 0 at the
>> * first invalid lead byte, truncated sequence, bad continuation byte,
>> * overlong encoding, surrogate (U+D800..U+DFFF), or code point above
>> * U+10FFFF.
>> *
>> * Needs <stddef.h> for size_t and <stdint.h> for uint32_t.
>> */
>>int isUTF8(const char *data, size_t size)
>>{
>>    const unsigned char *str = (const unsigned char *)data;
>>    const unsigned char *end = str + size;
>>    unsigned char byte;
>>    unsigned int code_length, i;
>>    uint32_t ch;
>>
>>    while (str != end) {
>>        byte = *str;
>>
>>        if (byte <= 0x7F) {
>>            /* 1 byte sequence: U+0000..U+007F */
>>            str += 1;
>>            continue;
>>        }
>>
>>        if (0xC2 <= byte && byte <= 0xDF)
>>            /* 0b110xxxxx: 2 bytes sequence */
>>            code_length = 2;
>>        else if (0xE0 <= byte && byte <= 0xEF)
>>            /* 0b1110xxxx: 3 bytes sequence */
>>            code_length = 3;
>>        else if (0xF0 <= byte && byte <= 0xF4)
>>            /* 0b11110xxx: 4 bytes sequence */
>>            code_length = 4;
>>        else {
>>            /* invalid first byte of a multibyte character */
>>            return 0;
>>        }
>>
>>        /* Compare the remaining length instead of computing
>>         * str + (code_length - 1): that pointer could lie more than
>>         * one past the end of the buffer, which is undefined. */
>>        if ((size_t)(end - str) < code_length) {
>>            /* truncated string or invalid byte sequence */
>>            return 0;
>>        }
>>
>>        /* Check continuation bytes: bit 7 should be set, bit 6
>>         * should be unset (0b10xxxxxx). */
>>        for (i = 1; i < code_length; i++) {
>>            if ((str[i] & 0xC0) != 0x80)
>>                return 0;
>>        }
>>
>>        if (code_length == 2) {
>>            /* 2 bytes sequence: U+0080..U+07FF */
>>            ch = ((str[0] & 0x1f) << 6) + (str[1] & 0x3f);
>>            /* str[0] >= 0xC2, so ch >= 0x0080.
>>             * str[0] <= 0xDF and (str[1] & 0x3f) <= 0x3f,
>>             * so ch <= 0x07ff: no range check needed. */
>>            (void)ch;
>>        } else if (code_length == 3) {
>>            /* 3 bytes sequence: U+0800..U+FFFF */
>>            ch = ((str[0] & 0x0f) << 12) + ((str[1] & 0x3f) << 6) +
>>                 (str[2] & 0x3f);
>>            /* (0xff & 0x0f) << 12 | (0xff & 0x3f) << 6 |
>>             * (0xff & 0x3f) = 0xffff, so ch <= 0xffff */
>>            if (ch < 0x0800)
>>                return 0;
>>
>>            /* surrogates (U+D800-U+DFFF) are invalid in UTF-8:
>>             * test if (0xD800 <= ch && ch <= 0xDFFF) */
>>            if ((ch >> 11) == 0x1b)
>>                return 0;
>>        } else if (code_length == 4) {
>>            /* 4 bytes sequence: U+10000..U+10FFFF */
>>            ch = ((str[0] & 0x07) << 18) + ((str[1] & 0x3f) << 12) +
>>                 ((str[2] & 0x3f) << 6) + (str[3] & 0x3f);
>>            if ((ch < 0x10000) || (0x10FFFF < ch))
>>                return 0;
>>        }
>>
>>        str += code_length;
>>    }
>>
>>    return 1;
>>}
>>
>>
>>
>>
>>
>>Regards,
>>
>>Ross.
>>
>>_______________________________________________
>>Delphi mailing list
>>[email protected]
>>http://lists.elists.org/cgi-bin/mailman/listinfo/delphi
>
>_______________________________________________
>Delphi mailing list
>[email protected]
>http://lists.elists.org/cgi-bin/mailman/listinfo/delphi
>
>
>_______________________________________________
>Delphi mailing list
>[email protected]
>http://lists.elists.org/cgi-bin/mailman/listinfo/delphi
_______________________________________________
Delphi mailing list
[email protected]
http://lists.elists.org/cgi-bin/mailman/listinfo/delphi
_______________________________________________
Delphi mailing list
[email protected]
http://lists.elists.org/cgi-bin/mailman/listinfo/delphi