According to the UTF-8 standard, straight 7-bit ASCII is legal (every byte in the range $00..$7F is itself a valid one-byte UTF-8 sequence), so the function as defined SHOULD return True for a string like 'abc'.
Not to be presumptuous on your knowledge; but if you need an overview of how UTF-8 works: https://en.wikipedia.org/wiki/UTF-8 http://www.joelonsoftware.com/articles/Unicode.html What is it you're actually looking to accomplish here? Stephen Posey [email protected] -----Original Message----- >From: Ross Levis <[email protected]> >Sent: Jul 27, 2016 7:54 AM >To: 'Moderated List for the Discussion of Delphi Programming excluding > Database-related topics' <[email protected]> >Subject: RE: C++ function convert to Delphi > >I hadn't had time to implement the function but did so tonight. >Unfortunately feeding it 'abc' returns true. These are 7 bit characters >which I see the loop ignores. I'm not sure why or if the C++ code works. > >I seem to have fixed this, getting correct results for 7-bit, 8-bit >European, and UTF-8 text. See the 2 lines with <<<<<< > >begin > pStr := data; // start of byte buffer > pEnd := data + size; // end of byte buffer > > Result := False; <<<<<<< > > // while still bytes to examine > while (pStr <> pEnd) do > // search for any contradiction of UTF-8 encoding > begin > pb := pStr; > > // bypass any 7-bit ASCII characters > > if (pb[0] <= Chr($7F)) then > // 1-byte 7-bit ASCII sequence: U+0000..U+007F > begin > Inc(pStr); // point to next byte > CONTINUE; // OK so far, next loop iteration > end; > > Result := True; <<<<<<< > // check for legal multi-byte sequence > .... > >Also removed the final Result := True at the end. > >Ross. > >-----Original Message----- >From: Ross Levis [mailto:[email protected]] >Sent: Wednesday, 27 July 2016 11:14 p.m. >To: 'Moderated List for the Discussion of Delphi Programming excluding >Database-related topics' >Subject: RE: C++ function convert to Delphi > >I read contraction and wondered how that was relevant :-) > >Cheers. > >-----Original Message----- >From: Delphi [mailto:[email protected]] On Behalf Of Stephen >Posey >Sent: Monday, 25 July 2016 6:17 a.m. 
>To: Moderated List for the Discussion of Delphi Programming excluding >Database-related topics; 'Moderated List for the Discussion of Delphi >Programming excluding Database-related topics' >Subject: RE: C++ function convert to Delphi > >Arrrgh, fumble-fingered typing, that second comment should read: > >// search for any CONTRADICTION of UTF-8 encoding > >Stephen Posey >[email protected] > > >-----Original Message----- >>From: Ross Levis <[email protected]> >>Sent: Jul 23, 2016 12:27 AM >>To: 'Moderated List for the Discussion of Delphi Programming excluding >>Database-related topics' <[email protected]> >>Subject: RE: C++ function convert to Delphi >> >>Hi Stephen, >> >>Many thanks for doing that. I won't necessarily know if it's working >>or not but I can run some tests on UTF-8 and European text and see if >>it appears to work. >> >>Cheers, >>Ross. >> >>-----Original Message----- >>From: Delphi [mailto:[email protected]] On Behalf Of >>Stephen Posey >>Sent: Saturday, 23 July 2016 1:04 a.m. >>To: Moderated List for the Discussion of Delphi Programming excluding >>Database-related topics; 'Moderated List for the Discussion of Delphi >>Programming excluding Database-related topics' >>Subject: Re: C++ function convert to Delphi >> >>Appended is a fairly literal translation (with some added annotations >>about what's going on). >> >>Compiles in D2007. >> >>I haven't tested it and can't swear that I've gotten all the pointer >>arithmetic and byte calculations 100% correct; but it ought to be close. 
>> >>HTH >> >>Stephen Posey >>[email protected] >> >> >>function isUTF8(data: PAnsiChar; size: Cardinal): Boolean; var >> pStr, pEnd: PAnsiChar; >> pb: PChar; >> codeLength, byteIdx: Integer; >> ch: Integer; >>begin >> pStr := data; // start of byte buffer >> pEnd := data + size; // end of byte buffer >> >> Result := True; >> >> // while still bytes to examine >> while (pStr <> pEnd) do >> // search for any contraction of UTF-8 encoding begin >> pb := @pStr^; >> >> // bypass any 7-bit ASCII characters >> >> if (pb[0] <= Chr($7F)) then >> // 1-byte 7-bit ASCII sequence: U+0000..U+007F >> begin >> Inc(pStr); // point to next byte >> CONTINUE; // OK so far, next loop iteration >> end; >> >> // check for legal multi-byte sequence >> if (pb[0] >= Chr($C2)) and (pb[0] <= Chr($DF)) then >> // binary 110xxxxx: 2-byte sequence >> codeLength := 2 >> else if (pb[0] >= Chr($E0)) and (pb[0] <= Chr($EF)) then >> // binary 1110xxxx: 3-byte sequence >> codeLength := 3 >> else if (pb[0] >= Chr($F0)) and (pb[0] <= Chr($F4)) then >> // binary 11110xxx: 4-byte sequence >> codeLength := 4 >> else begin >> // invalid first byte for a multibyte character, definitely not UTF-8 >> Result := False; >> EXIT; >> end; >> >> // not enough bytes left in byte buffer to account for indicated >>code length >> if ((pStr + (codeLength - 1)) >= pEnd) then >> begin >> // truncated string or invalid byte sequence, so not UTF-8 >> Result := False; >> EXIT; >> end; >> >> // Check continuation bytes: bit 7 should be set, bit 6 should be >>unset (binary 10xxxxxx) >> for byteIdx := 1 to codeLength - 1 do >> begin >> if ((Byte(pb[byteIdx]) and $C0) <> $80) then >> begin >> // not UTF-8 >> Result := False; >> EXIT; >> end; >> end; >> >> // build multi-byte character >> if (codeLength = 2) then >> begin >> // 2-byte sequence: U+0080..U+07FF >> ch := ((Byte(pb[0]) and $1F) shl 6) + >> (Byte(pb[1]) and $3F); >> // str[0] >= $C2, so ch >= $0080. 
str[0] <= $DF, (str[1] and $3F) >><= $3F, so ch <= $07FF >> end >> >> else if (codeLength = 3) then >> begin >> // 3-byte sequence: U+0800..U+FFFF >> ch := ((Byte(pb[0]) and $0F) shl 12) + >> ((Byte(pb[1]) and $3F) shl 6) + >> (Byte(pb[2]) and $3F); >> // ($FF and $0F) shl 12 or ($FF and $3F) shl 6 or ($FF and $3F) = >>$FFFF, so ch <= $FFFF >> >> if (ch < $0800) then >> begin >> // not UTF-8 >> Result := False; >> EXIT; >> end; >> >> // surrogates (U+D800-U+DFFF) are invalid in UTF-8: test if (ch >>>= >>$D800 and ch <= $DFFF) >> if ((ch shr 11) = $1B) then >> begin >> // not UTF-8 >> Result := False; >> EXIT; >> end >> end >> >> else if (codeLength = 4) then >> begin >> // 4-byte sequence: U+10000..U+10FFFF >> ch := ((Byte(pb[0]) and $07) shl 18) + >> ((Byte(pb[1]) and $3F) shl 12) + >> ((Byte(pb[2]) and $3F) shl 6) + >> (Byte(pb[3]) and $3F); >> if ((ch < $10000) or (ch > $10FFFF)) then >> begin >> // not UTF-8 >> Result := False; >> EXIT; >> end; >> end; >> >> Inc(pStr, codeLength); // advance past multi-byte code point end; >> >> Result := True; >>end; >> >> >>-----Original Message----- >>>From: Ross Levis <[email protected]> >>>Sent: Jul 7, 2016 1:09 AM >>>To: 'Moderated List for the Discussion of Delphi Programming excluding >>>Database-related topics' <[email protected]> >>>Subject: C++ function convert to Delphi >>> >>>I'm hoping someone with C++ knowledge and some spare time can convert >>>this function to Delphi/Pascal for me. I don't have any C++ knowledge. >>> >>> >>> >>>It is a method to establish with "some" degree of certainty if text is >>>UTF-8 encoded as opposed to a European character set with extended >>characters. >>> >>> >>> >>>Much appreciated! 
>>> >>> >>> >>>int isUTF8(const char *data, size_t size) >>> >>>{ >>> >>> const unsigned char *str = (unsigned char*)data; >>> >>> const unsigned char *end = str + size; >>> >>> unsigned char byte; >>> >>> unsigned int code_length, i; >>> >>> uint32_t ch; >>> >>> while (str != end) { >>> >>> byte = *str; >>> >>> if (byte <= 0x7F) { >>> >>> /* 1 byte sequence: U+0000..U+007F */ >>> >>> str += 1; >>> >>> continue; >>> >>> } >>> >>> >>> >>> if (0xC2 <= byte && byte <= 0xDF) >>> >>> /* 0b110xxxxx: 2 bytes sequence */ >>> >>> code_length = 2; >>> >>> else if (0xE0 <= byte && byte <= 0xEF) >>> >>> /* 0b1110xxxx: 3 bytes sequence */ >>> >>> code_length = 3; >>> >>> else if (0xF0 <= byte && byte <= 0xF4) >>> >>> /* 0b11110xxx: 4 bytes sequence */ >>> >>> code_length = 4; >>> >>> else { >>> >>> /* invalid first byte of a multibyte character */ >>> >>> return 0; >>> >>> } >>> >>> >>> >>> if (str + (code_length - 1) >= end) { >>> >>> /* truncated string or invalid byte sequence */ >>> >>> return 0; >>> >>> } >>> >>> >>> >>> /* Check continuation bytes: bit 7 should be set, bit 6 should >>> be >>> >>> * unset (b10xxxxxx). */ >>> >>> for (i=1; i < code_length; i++) { >>> >>> if ((str[i] & 0xC0) != 0x80) >>> >>> return 0; >>> >>> } >>> >>> >>> >>> if (code_length == 2) { >>> >>> /* 2 bytes sequence: U+0080..U+07FF */ >>> >>> ch = ((str[0] & 0x1f) << 6) + (str[1] & 0x3f); >>> >>> /* str[0] >= 0xC2, so ch >= 0x0080. 
>>> >>> str[0] <= 0xDF, (str[1] & 0x3f) <= 0x3f, so ch <= >>> 0x07ff */ >>> >>> } else if (code_length == 3) { >>> >>> /* 3 bytes sequence: U+0800..U+FFFF */ >>> >>> ch = ((str[0] & 0x0f) << 12) + ((str[1] & 0x3f) << 6) + >>> >>> (str[2] & 0x3f); >>> >>> /* (0xff & 0x0f) << 12 | (0xff & 0x3f) << 6 | (0xff & >>>0x3f) = 0xffff, >>> >>> so ch <= 0xffff */ >>> >>> if (ch < 0x0800) >>> >>> return 0; >>> >>> >>> >>> /* surrogates (U+D800-U+DFFF) are invalid in UTF-8: >>> >>> test if (0xD800 <= ch && ch <= 0xDFFF) */ >>> >>> if ((ch >> 11) == 0x1b) >>> >>> return 0; >>> >>> } else if (code_length == 4) { >>> >>> /* 4 bytes sequence: U+10000..U+10FFFF */ >>> >>> ch = ((str[0] & 0x07) << 18) + ((str[1] & 0x3f) << 12) + >>> >>> ((str[2] & 0x3f) << 6) + (str[3] & 0x3f); >>> >>> if ((ch < 0x10000) || (0x10FFFF < ch)) >>> >>> return 0; >>> >>> } >>> >>> str += code_length; >>> >>> } >>> >>> return 1; >>> >>>} >>> >>> >>> >>> >>> >>>Regards, >>> >>>Ross. >>> >>>_______________________________________________ >>>Delphi mailing list >>>[email protected] >>>http://lists.elists.org/cgi-bin/mailman/listinfo/delphi >> >>_______________________________________________ >>Delphi mailing list >>[email protected] >>http://lists.elists.org/cgi-bin/mailman/listinfo/delphi >> >> >>_______________________________________________ >>Delphi mailing list >>[email protected] >>http://lists.elists.org/cgi-bin/mailman/listinfo/delphi > >_______________________________________________ >Delphi mailing list >[email protected] >http://lists.elists.org/cgi-bin/mailman/listinfo/delphi > > >_______________________________________________ >Delphi mailing list >[email protected] >http://lists.elists.org/cgi-bin/mailman/listinfo/delphi _______________________________________________ Delphi mailing list [email protected] http://lists.elists.org/cgi-bin/mailman/listinfo/delphi
