Hi Stephen,

Many thanks for doing that.  I won't necessarily know if it's working or not
but I can run some tests on UTF-8 and European text and see if it appears to
work.

Cheers,
Ross.

-----Original Message-----
From: Delphi [mailto:[email protected]] On Behalf Of Stephen
Posey
Sent: Saturday, 23 July 2016 1:04 a.m.
To: Moderated List for the Discussion of Delphi Programming excluding
Database-related topics; 'Moderated List for the Discussion of Delphi
Programming excluding Database-related topics'
Subject: Re: C++ function convert to Delphi

Appended is a fairly literal translation (with some added annotations about
what's going on). 

Compiles in D2007.

I haven't tested it and can't swear that I've gotten all the pointer
arithmetic and byte calculations 100% correct; but it ought to be close.

HTH

Stephen Posey
[email protected] 


// Reports whether the `size` bytes starting at `data` are well-formed UTF-8.
//
// Parameters:
//   data - pointer to the first byte of the buffer (not required to be
//          null-terminated; embedded #0 bytes are treated as ASCII).
//   size - number of bytes to examine.
//
// Returns True when every byte belongs to a valid UTF-8 sequence: 7-bit
// ASCII, or a 2/3/4-byte sequence with a legal lead byte, the right number
// of 10xxxxxx continuation bytes, no overlong encoding, no surrogate
// (U+D800..U+DFFF) and no code point above U+10FFFF. An empty buffer is
// valid. A nil `data` is valid only when `size` is 0.
//
// All byte access goes through PAnsiChar and explicit Byte() casts so the
// routine behaves the same whether PChar/Char are 8-bit (Delphi 2007 and
// earlier) or 16-bit (Delphi 2009 and later).
function isUTF8(data: PAnsiChar; size: Cardinal): Boolean;
var
  pStr: PAnsiChar;      // current position in the byte buffer
  remaining: Cardinal;  // bytes not yet validated, counting pStr^ itself
  lead: Byte;           // first byte of the sequence under examination
  codeLength, byteIdx: Integer;
  ch: Integer;          // decoded code point, used for range checks
begin
  // Every early Exit below means "not UTF-8".
  Result := False;

  // A nil buffer is acceptable only when there is nothing to read from it.
  if data = nil then
  begin
    Result := (size = 0);
    Exit;
  end;

  pStr := data;
  remaining := size;

  // Search for any contravention of the UTF-8 encoding rules.
  while remaining > 0 do
  begin
    lead := Byte(pStr[0]);

    // 1-byte 7-bit ASCII sequence: U+0000..U+007F
    if lead <= $7F then
    begin
      Inc(pStr);
      Dec(remaining);
      Continue;
    end;

    // Classify the lead byte of a multi-byte sequence. $80..$C1 (stray
    // continuation bytes and overlong 2-byte leads) and $F5..$FF can never
    // start a valid sequence.
    if (lead >= $C2) and (lead <= $DF) then
      codeLength := 2   // binary 110xxxxx
    else if (lead >= $E0) and (lead <= $EF) then
      codeLength := 3   // binary 1110xxxx
    else if (lead >= $F0) and (lead <= $F4) then
      codeLength := 4   // binary 11110xxx
    else
      Exit;

    // Truncated sequence: fewer bytes left than the lead byte announces.
    // Counting remaining bytes (instead of comparing pStr + n against an end
    // pointer) cannot wrap around the top of the address space.
    if Cardinal(codeLength) > remaining then
      Exit;

    // Continuation bytes must have bit 7 set and bit 6 clear
    // (binary 10xxxxxx).
    for byteIdx := 1 to codeLength - 1 do
      if (Byte(pStr[byteIdx]) and $C0) <> $80 then
        Exit;

    // 2-byte sequences need no further check: a lead byte in $C2..$DF
    // already guarantees the code point lies in U+0080..U+07FF.
    if codeLength = 3 then
    begin
      // 3-byte sequence: U+0800..U+FFFF
      ch := ((Byte(pStr[0]) and $0F) shl 12) +
        ((Byte(pStr[1]) and $3F) shl 6) +
        (Byte(pStr[2]) and $3F);

      // Overlong encoding of a code point that fits in fewer bytes.
      if ch < $0800 then
        Exit;

      // Surrogates (U+D800..U+DFFF) are invalid in UTF-8; all of them, and
      // only them, have $1B in the bits above bit 10.
      if (ch shr 11) = $1B then
        Exit;
    end
    else if codeLength = 4 then
    begin
      // 4-byte sequence: U+10000..U+10FFFF
      ch := ((Byte(pStr[0]) and $07) shl 18) +
        ((Byte(pStr[1]) and $3F) shl 12) +
        ((Byte(pStr[2]) and $3F) shl 6) +
        (Byte(pStr[3]) and $3F);

      // Reject overlong encodings and values beyond the Unicode range.
      if (ch < $10000) or (ch > $10FFFF) then
        Exit;
    end;

    // Advance past the validated multi-byte code point.
    Inc(pStr, codeLength);
    Dec(remaining, codeLength);
  end;

  Result := True;
end;


-----Original Message-----
>From: Ross Levis <[email protected]>
>Sent: Jul 7, 2016 1:09 AM
>To: 'Moderated List for the Discussion of Delphi Programming excluding  
>Database-related topics' <[email protected]>
>Subject: C++ function convert to Delphi
>
>I'm hoping someone with C++ knowledge and some spare time can convert 
>this function to Delphi/Pascal for me.  I don't have any C++ knowledge.
>
> 
>
>It is a method to establish with "some" degree of certainty if text is 
>UTF-8 encoded as opposed to a European character set with extended
characters.
>
> 
>
>Much appreciated!
>
> 
>
>int isUTF8(const char *data, size_t size)
>
>{
>
>    const unsigned char *str = (unsigned char*)data;
>
>    const unsigned char *end = str + size;
>
>    unsigned char byte;
>
>    unsigned int code_length, i;
>
>    uint32_t ch;
>
>    while (str != end) {
>
>        byte = *str;
>
>        if (byte <= 0x7F) {
>
>            /* 1 byte sequence: U+0000..U+007F */
>
>            str += 1;
>
>            continue;
>
>        }
>
> 
>
>        if (0xC2 <= byte && byte <= 0xDF)
>
>            /* 0b110xxxxx: 2 bytes sequence */
>
>            code_length = 2;
>
>        else if (0xE0 <= byte && byte <= 0xEF)
>
>            /* 0b1110xxxx: 3 bytes sequence */
>
>            code_length = 3;
>
>        else if (0xF0 <= byte && byte <= 0xF4)
>
>            /* 0b11110xxx: 4 bytes sequence */
>
>            code_length = 4;
>
>        else {
>
>            /* invalid first byte of a multibyte character */
>
>            return 0;
>
>        }
>
> 
>
>        if (str + (code_length - 1) >= end) {
>
>            /* truncated string or invalid byte sequence */
>
>            return 0;
>
>        }
>
> 
>
>        /* Check continuation bytes: bit 7 should be set, bit 6 should be
>
>         * unset (b10xxxxxx). */
>
>        for (i=1; i < code_length; i++) {
>
>            if ((str[i] & 0xC0) != 0x80)
>
>                return 0;
>
>        }
>
> 
>
>        if (code_length == 2) {
>
>            /* 2 bytes sequence: U+0080..U+07FF */
>
>            ch = ((str[0] & 0x1f) << 6) + (str[1] & 0x3f);
>
>            /* str[0] >= 0xC2, so ch >= 0x0080.
>
>               str[0] <= 0xDF, (str[1] & 0x3f) <= 0x3f, so ch <= 0x07ff 
> */
>
>        } else if (code_length == 3) {
>
>            /* 3 bytes sequence: U+0800..U+FFFF */
>
>            ch = ((str[0] & 0x0f) << 12) + ((str[1] & 0x3f) << 6) +
>
>                  (str[2] & 0x3f);
>
>            /* (0xff & 0x0f) << 12 | (0xff & 0x3f) << 6 | (0xff & 0x3f) = 0xffff,
>
>               so ch <= 0xffff */
>
>            if (ch < 0x0800)
>
>                return 0;
>
> 
>
>            /* surrogates (U+D800-U+DFFF) are invalid in UTF-8:
>
>               test if (0xD800 <= ch && ch <= 0xDFFF) */
>
>            if ((ch >> 11) == 0x1b)
>
>                return 0;
>
>        } else if (code_length == 4) {
>
>            /* 4 bytes sequence: U+10000..U+10FFFF */
>
>            ch = ((str[0] & 0x07) << 18) + ((str[1] & 0x3f) << 12) +
>
>                 ((str[2] & 0x3f) << 6) + (str[3] & 0x3f);
>
>            if ((ch < 0x10000) || (0x10FFFF < ch))
>
>                return 0;
>
>        }
>
>        str += code_length;
>
>    }
>
>    return 1;
>
>}
>
> 
>
> 
>
>Regards,
>
>Ross.
>
>_______________________________________________
>Delphi mailing list
>[email protected]
>http://lists.elists.org/cgi-bin/mailman/listinfo/delphi

_______________________________________________
Delphi mailing list
[email protected]
http://lists.elists.org/cgi-bin/mailman/listinfo/delphi


_______________________________________________
Delphi mailing list
[email protected]
http://lists.elists.org/cgi-bin/mailman/listinfo/delphi

Reply via email to