On Thursday 18 April 2013 11:41:24 Graeme Geldenhuys wrote:
> On 2013-04-18 09:31, Martin Schreiber wrote:
> > It counts the number of a known constant Russian character in a random
> > string. In utf-16 and UCS4 this is an operation with numbers and string
> > index,
>
> Does that Russian character have a surrogate pair?

No character of the BMP (Basic Multilingual Plane) needs a surrogate pair. The
Cyrillic alphabet is in the range U+0400–U+04FF; the searched character is
U+042F. BTW even modern Chinese is in the BMP.
The test program is probably this:

"
{ Micro-benchmark: counts occurrences of U+042F ('Я') in the same random text
  held as UTF-8, UTF-16 and UTF-32, and prints the raw performance-counter
  tick difference measured around each scan. }
// $H+ selects long strings for the generic string type; $R- disables range checking.
{$H+,R-}
    // NOTE(review): QueryPerformanceCounter presumably comes from the Windows
    // unit, so this program is Windows-only as written - confirm.
    uses SysUtils, Windows;//strings;
    const
      // Number of times each timed scan is repeated.
      TRIES = 1;
      // Length, in characters, of the random ANSI source string.
      CAPACITY = 1000000;
    var
      ansi: AnsiString;      // random source text
      utf8: Utf8String;      // source text converted to UTF-8
      i, j: LongWord;        // loop counters (i: repetition, j: element index)
      t1, t2: Int64;         // performance-counter readings before/after a scan
      p: PChar;              // scan cursor for the UTF-8 test
      utf16: UnicodeString;  // source text converted to UTF-16
      utf32: UCS4String;     // source text converted to UTF-32
      TestCount: LongWord;   // number of matches found by the current test
      c8:  AnsiString;       // search pattern as a UTF-8 byte sequence
      c16: UCS2Char;         // current UTF-16 code unit
      c32: UCS4Char;         // current UTF-32 code point

    begin
      // Randomize is disabled, so the default random seed is used on every run.
      //Randomize;
      SetLength(ansi, CAPACITY);
      // Fill with random character codes 32..255 (no control characters, no #0).
      for j := 1 to CAPACITY do
        ansi[j] := Char(32 + Random(256 - 32));
      // NOTE(review): the conversion presumably depends on the active ANSI code
      // page, so whether 'Я' can occur at all (e.g. with CP1251) is
      // system-dependent - confirm.
      utf8 := AnsiToUtf8(ansi);
      c8 := #$D0#$AF; // 'Я' in utf8
     
      { UTF-8 test }
      // Substring search with StrPos, restarted just behind each hit until no
      // further match is found.
      QueryPerformanceCounter(t1);
      // TestCount is reset once per test, not once per repetition, so with
      // TRIES > 1 it accumulates the matches of all repetitions.
      TestCount := 0;
      for i := 1 to TRIES do begin
        p := @utf8[1];
        while true do begin
          p := StrPos(p, @c8[1]);
          if p = nil then break;
          // NOTE(review): assumes PChar is a byte pointer here, so this skips
          // only the first byte of the two-byte match; the second byte (#$AF)
          // cannot start a new match, so the count is unaffected - confirm.
          Inc(p); Inc(TestCount);
        end;
      end;
      QueryPerformanceCounter(t2);
      // Prints the raw tick difference; no QueryPerformanceFrequency call is
      // made to convert it to a time unit.
      WriteLn('UTF-8:  ', TestCount, ' entries in ', t2 - t1, ' ticks.');
     
      { UTF-16 test }
      // The conversion happens before the first counter reading, so only the
      // scan itself is timed.
      utf16 := UTF8Decode(UTF8);//**
      QueryPerformanceCounter(t1);
      TestCount := 0;
      for i := 1 to TRIES do begin
        // Plain indexed comparison of each 16-bit code unit against U+042F.
        for j := 1 to Length(utf16) do begin
          c16 := utf16[j];
          if c16 = #$042F then Inc(TestCount);
        end;
      end;
      QueryPerformanceCounter(t2);
      WriteLn('UTF-16: ', TestCount, ' entries in ', t2 - t1, ' ticks.');

      { UTF-32 test }
      // As above, the conversion is done outside the timed region.
      utf32 := UnicodeStringToUCS4String(utf16);//**
      QueryPerformanceCounter(t1);
      TestCount := 0;
      for i := 1 to TRIES do begin
        // UCS4String is indexed from 0.
        // NOTE(review): a UCS4String normally carries a trailing 0 element, so
        // this range presumably visits the terminator as well; harmless for
        // the count since 0 <> $042F - confirm.
        for j := 0 to Length(utf32) - 1 do begin
          c32 := utf32[j];
          if c32 = $042F then Inc(TestCount);
        end;
      end;
      QueryPerformanceCounter(t2);
      WriteLn('UTF-32: ', TestCount, ' entries in ', t2 - t1, ' ticks.');
    end.
"
Martin

------------------------------------------------------------------------------
Precog is a next-generation analytics platform capable of advanced
analytics on semi-structured data. The platform includes APIs for building
apps and a phenomenal toolset for data science. Developers can use
our toolset for easy data analysis & visualization. Get a free account!
http://www2.precog.com/precogplatform/slashdotnewsletter
_______________________________________________
mseide-msegui-talk mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/mseide-msegui-talk

Reply via email to