Hi, Antonio

Please check the function I use to test whether a string is valid UTF-8. I hope it is helpful.

// Reports whether UnknownStr consists entirely of well-formed UTF-8 sequences.
//
// Accepted single bytes are limited to TAB, LF, CR and the printable ASCII
// range #$20..#$7E; any other control character (and #$7F) makes the function
// return false. Multi-byte sequences must be shortest-form (no overlongs),
// must not encode UTF-16 surrogates (U+D800..U+DFFF) and must not exceed
// U+10FFFF. A sequence that is cut off by the end of the string is rejected.
//
// Returns true for the empty string.
function IsUTF8(UnknownStr:string):boolean;
var
    i       :Integer;   // 1-based index of the lead byte being examined
    k       :Integer;   // index of a trailing continuation byte
    Len     :Integer;   // cached string length
    SeqLen  :Integer;   // total bytes in the sequence that starts at i
    Lo, Hi  :Char;      // allowed range for the byte right after the lead byte
begin
    Len:=length(UnknownStr);
    i:=1;
    // "<=" so that the very last byte of the string is validated as well.
    while i<=Len do
    begin
        // Default range for the second byte; narrowed below for the lead
        // bytes where overlongs, surrogates or > U+10FFFF must be excluded.
        Lo:=#$80;
        Hi:=#$BF;
        case UnknownStr[i] of
            // ASCII (text subset only)
            #$09, #$0A, #$0D, #$20..#$7E:
                SeqLen:=1;
            // non-overlong 2-byte
            #$C2..#$DF:
                SeqLen:=2;
            // 3-byte, excluding overlongs
            #$E0:
                begin
                    SeqLen:=3;
                    Lo:=#$A0;
                end;
            // straight 3-byte
            #$E1..#$EC, #$EE..#$EF:
                SeqLen:=3;
            // 3-byte, excluding surrogates
            #$ED:
                begin
                    SeqLen:=3;
                    Hi:=#$9F;
                end;
            // planes 1-3
            #$F0:
                begin
                    SeqLen:=4;
                    Lo:=#$90;
                end;
            // planes 4-15
            #$F1..#$F3:
                SeqLen:=4;
            // plane 16
            #$F4:
                begin
                    SeqLen:=4;
                    Hi:=#$8F;
                end;
        else
            // Stray continuation byte, overlong lead (#$C0/#$C1), lead byte
            // beyond #$F4, or a disallowed control character.
            exit(false);
        end;

        // Reject truncated sequences before touching any following byte, so
        // the string is never indexed past its length.
        if i+SeqLen-1>Len then exit(false);

        if SeqLen>1 then
        begin
            if (UnknownStr[i+1]<Lo) or (UnknownStr[i+1]>Hi) then exit(false);
            for k:=i+2 to i+SeqLen-1 do
                if not (UnknownStr[k] in [#$80..#$BF]) then exit(false);
        end;

        inc(i,SeqLen);
    end;
    exit(true);
end;


2010/2/27 Antônio <[email protected]>

> How to determine whether a string is UTF-8 or not?
>
> --
> _______________________________________________
> Lazarus mailing list
> [email protected]
> http://lists.lazarus.freepascal.org/mailman/listinfo/lazarus
>
--
_______________________________________________
Lazarus mailing list
[email protected]
http://lists.lazarus.freepascal.org/mailman/listinfo/lazarus

Reply via email to