Fwd: [Lazarus] Faster than popcnt

Martin Frb via fpc-devel Mon, 03 Jan 2022 03:54:56 -0800

Hi Gareth,

not sure if this is of interest to you, but I see you do a lot on theoptimizer....

While testing the attached, I found that one of the functions wasnotable slower when compiled with 3.3.1 (compared to 3.2.3).

So maybe something you are interested in looking at?

The Code in "Utf8LengthFash" (fst) went from around 600ms to 700ms.

3.3.1 from Dec 10th
3.2.3 from Dec 9th

Core I7 8700K
-O4 -Cpcoreavx2

fpc 3.2.3 /   fpc 3.3.1

fst 594       fst 688
fst 578       fst 703
fst 578       fst 687
fst 562       fst 688

pop 485       pop 485
pop 500       pop 500
pop 500       pop 484
pop 484       pop 500

add 594       add 422
add 578       add 438
add 578       add 437
add 594       add 453

//
// (C) 2021 Martin Friebe and Marco van de Voort.
// attempt to accelerate utf8lengthfast which is a length(s) in utf8 codepoints 
without integrity checking
//
// 4 versions.
// - Original,
// - with popcount and
// - the "add" variant that accumulates 127 iterations of ptrints and only adds
//         the intermeidates outside that loop
// - a SSE2 version loosely inspired by the add variant combined with
//        the core of an existing (branchless) binarization routine for the 
main loop.

{$mode objfpc}{$H+}
{$asmmode intel}
{$coperators on}
{define asmdebug}

uses SysUtils,StrUtils;

const   
      mask3       :  array[0..15] of byte  = (   $C0,$C0,$C0,$C0,
                                                 $C0,$C0,$C0,$C0,
                                                 $C0,$C0,$C0,$C0,
                                                 $C0,$C0,$C0,$C0);

      mask4       :  array[0..15] of byte  = (   $80,$80,$80,$80,
                                                 $80,$80,$80,$80,
                                                 $80,$80,$80,$80,
                                                 $80,$80,$80,$80);

  
      mask2       :  array[0..15] of byte  = (   $1,$1,$1,$1,
                                                 $1,$1,$1,$1,
                                                 $1,$1,$1,$1,
                                                 $1,$1,$1,$1);

// Integer arguments are passed in registers RCX, RDX, R8, and R9.
// Floating point arguments are passed in XMM0L, XMM1L, XMM2L, and XMM3L.
// volatile: RAX, RCX, RDX, R8, R9, R10, R11
// nonvolatile RBX, RBP, RDI, RSI, RSP, R12, R13, R14, and R15 are considered 
nonvolatile
// volatile xmm0-xmm3 (params) en xmm4,5
// https://msdn.microsoft.com/en-us/library/ms235286.aspx

{$ifdef asmdebug}
function asmutf8length(const s : pchar;len:integer,res:pbyte):int64;
{$else}
function asmutf8length(const s : pchar;len:integer):int64;{$ifndef 
Windows}assembler; nostackframe;{$endif}
{$endif}

{$ifdef Windows}
begin
{$endif}
 asm
  // tuning for short strings:
  // ------
  {$ifndef Windows}
  // we can't use [s] as an alias for the pointer parameter, because the non 
assembler procedure on Windows
  // changes that into a stack reference. FPC doesn't support non volatile 
frame management for assembler procs like Delphi does.
  mov rcx,s         // rdi
  mov edx,len       // rsi
  {$endif}
  test rax,rax
  je @theend
  cmp rdx,128 // threshold between long and short.
  jl @restbytes

  mov rax,rdx
  mov r10,rcx
  and r10,15
  mov r9,16
  sub r9,r10
  and r9,15
  test r9,r9
  je @nopreloop
  sub rdx,r9
@preloop:              // roughly 2 cycles per iteration on ivy bridge
  movzx r11d, byte [rcx]        // unaligned bytes after sse loop
  mov r10,r11
  shr r10,7
  not r11
  shr r11,6
  and r10,r11
  sub rax,r10
  inc rcx
  dec r9
  jne @preloop
@nopreloop:

  mov r9,rdx
  and r9,15
  shr rdx,4
  pxor xmm5,xmm5   // always zero
  pxor xmm6,xmm6   // dword counts

  // using broadcast etc raises requirements? -> use constant loads.

  movdqu xmm1,[rip+mask3]
  movdqu xmm2,[rip+mask4]
  movdqu xmm3,[rip+mask2]

  test rdx,rdx
  je @restbytes

@outer:
  mov r10,127           // max iterations per inner loop
  cmp r10,rdx           // more or less left?
  jl @last              // more
  mov r10,rdx           // less
  @last:
  sub rdx,r10            // iterations left - iterations to do

  pxor xmm4,xmm4

// process 127 iterations (limit of signed int8)

@inner:                // +/- 2.2 cycles per iteration for 16 bytes on ivy 
bridge
  movdqu xmm0, [rcx]
  pand  xmm0,xmm1      // mask out top 2 bits
  pcmpeqb xmm0,xmm2    // compare with $80. 
  pand  xmm0,xmm3      // change to $1 per byte.
  paddb  xmm4,xmm0     // add to cumulative

  add rcx,16
  dec r10
  jne @inner


  // SSSE3 vertical adds might help this, but increase CPU reqs.

  movdqa xmm0,xmm4

  PUNPCKLBW xmm0,xmm5           // zero extend to words
  PUNPCKHBW xmm4,xmm5
  paddw xmm0,xmm4               // add, now 8 16-bit words.

  movdqa xmm4,xmm0
  PUNPCKLWD xmm0,xmm5           // zero extend to dwords
  paddd xmm6,xmm0
  PUNPCKHWD  xmm4,xmm5
  paddd xmm6,xmm4               // add both L and H to cumulative 4x dword xmm6 
reg
  test rdx,rdx
  jne @outer

  MOVHLPS  xmm4,xmm6            // move high 8 bytes to low (no float->int 
penalty for move only?)

  paddd xmm6,xmm4               // add both 2*dwords (high doesn't matter)

  pshufd xmm4,xmm6,1           // mov 2nd dword in xmm6 to first in xmm4
  paddd xmm6,xmm4              // add

  movd edx,xmm6                // to int alu reg
  sub  rax,rdx                  // subtract from length in bytes.
@restbytes:
  test r9,r9
  je @theend                   // Done!
@restloop:
  movzx edx, byte [rcx]        // unaligned bytes after sse loop
  mov r10,rdx
  shr r10,7
  not rdx
  shr rdx,6
  and r10,rdx
  sub rax,r10
  inc rcx
  dec r9
  jne @restloop

@theend:
{$ifdef Windows}
end['xmm6']; // volatile registers used.
{$endif}
end;

function countmask(nx:int64):integer;

begin
   nx := (nx and $00FF00FF00FF00FF) + ((nx >>  8) and $00FF00FF00FF00FF);
   nx := (nx and $0000FFFF0000FFFF) + ((nx >> 16) and $0000FFFF0000FFFF);
   result := (nx and $00000000FFFFFFFF) + ((nx >> 32) and $00000000FFFFFFFF); 
end;


function UTF8LengthFast(p: PChar; ByteCount: PtrInt): PtrInt;
const
{$ifdef CPU32}
  ONEMASK   =$01010101;
  EIGHTYMASK=$80808080;
{$endif}
{$ifdef CPU64}
  ONEMASK   =$0101010101010101;
  EIGHTYMASK=$8080808080808080;
{$endif}
var
  pnx: PPtrInt absolute p; // To get contents of text in PtrInt blocks. x 
refers to 32 or 64 bits
  pn8: pint8 absolute pnx; // To read text as Int8 in the initial and final 
loops
  ix: PtrInt absolute pnx; // To read text as PtrInt in the block loop
  nx: PtrInt;              // values processed in block loop
  i,cnt,e: PtrInt;
begin
  Result := 0;
  e := ix+ByteCount; // End marker
  // Handle any initial misaligned bytes.
  cnt := (not (ix-1)) and (sizeof(PtrInt)-1);
  if cnt>ByteCount then
    cnt := ByteCount;
  for i := 1 to cnt do
  begin
    // Is this byte NOT the first byte of a character?
    Result += (pn8^ shr 7) and ((not pn8^) shr 6);
    inc(pn8);
  end;
  // Handle complete blocks
  for i := 1 to (ByteCount-cnt) div sizeof(PtrInt) do
  begin
    // Count bytes which are NOT the first byte of a character.
    nx := ((pnx^ and EIGHTYMASK) shr 7) and ((not pnx^) shr 6);
    {$push}{$overflowchecks off} // "nx * ONEMASK" causes an arithmetic 
overflow.
    Result += (nx * ONEMASK) >> ((sizeof(PtrInt) - 1) * 8);
    {$pop}
    inc(pnx);
  end;
  // Take care of any left-over bytes.
  while ix<e do
  begin
    // Is this byte NOT the first byte of a character?
    Result += (pn8^ shr 7) and ((not pn8^) shr 6);
    inc(pn8);
  end;
  Result := ByteCount - Result;
end;

function UTF8LengthPop(p: PChar; ByteCount: PtrInt): PtrInt;
const
{$ifdef CPU32}
  ONEMASK   =$01010101;
  EIGHTYMASK=$80808080;
{$endif}
{$ifdef CPU64}
  ONEMASK   =$0101010101010101;
  EIGHTYMASK=$8080808080808080;
{$endif}
var
  pnx: PPtrInt absolute p; // To get contents of text in PtrInt blocks. x 
refers to 32 or 64 bits
  pn8: pint8 absolute pnx; // To read text as Int8 in the initial and final 
loops
  ix: PtrInt absolute pnx; // To read text as PtrInt in the block loop
  nx: PtrInt;              // values processed in block loop
  i,cnt,e: PtrInt;
begin
  Result := 0;
  e := ix+ByteCount; // End marker
  // Handle any initial misaligned bytes.
  cnt := (not (ix-1)) and (sizeof(PtrInt)-1);
  if cnt>ByteCount then
    cnt := ByteCount;
  for i := 1 to cnt do
  begin
    // Is this byte NOT the first byte of a character?
    Result += (pn8^ shr 7) and ((not pn8^) shr 6);
    inc(pn8);
  end;
  // Handle complete blocks
  for i := 1 to (ByteCount-cnt) div sizeof(PtrInt) do
  begin
    // Count bytes which are NOT the first byte of a character.
    nx := ((pnx^ and EIGHTYMASK) shr 7) and ((not pnx^) shr 6);
    {$push}{$overflowchecks off} // "nx * ONEMASK" causes an arithmetic 
overflow.
    //Result += (nx * ONEMASK) >> ((sizeof(PtrInt) - 1) * 8);
    Result += PopCnt(qword(nx));
    {$pop}
    inc(pnx);
  end;
  // Take care of any left-over bytes.
  while ix<e do
  begin
    // Is this byte NOT the first byte of a character?
    Result += (pn8^ shr 7) and ((not pn8^) shr 6);
    inc(pn8);
  end;
  Result := ByteCount - Result;
end;


function UTF8LengthAdd(p: PChar; ByteCount: PtrInt): PtrInt;
const
{$ifdef CPU32}
  ONEMASK   =$01010101;
  EIGHTYMASK=$80808080;
{$endif}
{$ifdef CPU64}
  ONEMASK   =$0101010101010101;
  EIGHTYMASK=$8080808080808080;
{$endif}
var
  pnx: PPtrInt absolute p; // To get contents of text in PtrInt blocks. x 
refers to 32 or 64 bits
  pn8: pint8 absolute pnx; // To read text as Int8 in the initial and final 
loops
  ix: PtrInt absolute pnx; // To read text as PtrInt in the block loop
  nx: PtrInt;              // values processed in block loop
  i,j,cnt,e, bc: PtrInt;
begin
  Result := 0;
  e := ix+ByteCount; // End marker
  // Handle any initial misaligned bytes.
  cnt := (not (ix-1)) and (sizeof(PtrInt)-1);
  if cnt>ByteCount then
    cnt := ByteCount;
  for i := 1 to cnt do
  begin
    // Is this byte NOT the first byte of a character?
    Result += (pn8^ shr 7) and ((not pn8^) shr 6);
    inc(pn8);
  end;
  // Handle complete blocks

  bc := (ByteCount-cnt) div sizeof(PtrInt);
  for j := 1 to bc >> 7 do begin
    nx := 0;
    for i := 0 to 127 do
    begin
      // Count bytes which are NOT the first byte of a character.
      nx += ((pnx^ and EIGHTYMASK) shr 7) and ((not pnx^) shr 6);
      inc(pnx);
    end;
    nx := (nx and $00FF00FF00FF00FF) + ((nx >>  8) and $00FF00FF00FF00FF);
    nx := (nx and $0000FFFF0000FFFF) + ((nx >> 16) and $0000FFFF0000FFFF);
    nx := (nx and $00000000FFFFFFFF) + ((nx >> 32) and $00000000FFFFFFFF);
    Result := Result + nx;
  end;


  if (bc and 127) > 0 then begin
  nx := 0;
  for i := 1 to bc and 127 do
  begin
    // Count bytes which are NOT the first byte of a character.
    nx += ((pnx^ and EIGHTYMASK) shr 7) and ((not pnx^) shr 6);
    inc(pnx);
  end;
    nx := (nx and $00FF00FF00FF00FF) + ((nx >>  8) and $00FF00FF00FF00FF);
    nx := (nx and $0000FFFF0000FFFF) + ((nx >> 16) and $0000FFFF0000FFFF);
    nx := (nx and $00000000FFFFFFFF) + ((nx >> 32) and $00000000FFFFFFFF);
    Result := Result + nx;
  end;



  // Take care of any left-over bytes.
  while ix<e do
  begin
    // Is this byte NOT the first byte of a character?
    Result += (pn8^ shr 7) and ((not pn8^) shr 6);
    inc(pn8);
  end;
  Result := ByteCount - Result;
end;

// one of each pattern.
const pattern : array[0..3] of char = (chr(%11001001),chr(%10001001),
                                       chr(%00001001),chr(%01001001));

function pseudorandomutf8string(len:integer;var cnt:integer):string;
// random string but keep a count of bytes with high value %10
var lcnt:integer;
    i,j:integer;
begin
  setlength(result,len);

  lcnt:=0;
  for i:=1 to length(result) do
    begin
      j:=random(4);
      //j:=i and 3;
      if j=1 then inc(lcnt);
      result[i]:=pattern[j];
    end;
  cnt:=length(result)-lcnt;
end;

var   r : array[0..10000] of byte; // FPC "registers" dialog is poor, we use 
this for writeln like dumping

procedure testasmutf8length;

const testlen = 64*100;

var s : string;
    cnt : integer;
    rx : int64;

begin
  randomize;

  s:=pseudorandomutf8string(testlen+Random(50),cnt);

  rx:=asmutf8length(pchar(s),length(s){$ifdef asmdebug},@r[0]{$endif});

  writeln(inttohex(cnt,2),' = ',inttohex(rx,2),' ',inttohex(length(s)-cnt,2),' 
= ',inttohex(length(s)-rx,2));  // hex because most register dumps are easier 
in hex.
  {$ifdef asmdebug}
  for i:=0 to 6 do
      begin
        write(i:2,' ');
        for j:=0 to 3 do
           write(inttohex(pdword(@r[i*16+j*4])^,8), ' ');
        writeln;
      end;
  {$endif}
end;

  var
  a: ansistring;
  t: QWord;
  i, j, ii: Integer;
begin
  {$ifdef asmdebug}
     testasmutf8length;
  {$else}
  a := 'اربك تكست هو اول موقع يسمح لزواره الكرام بتحويل الكتابة العربي الى 
كتابة مفهومة من قبل اغلب برامج التصميم مثل الفوتوشوب و';
  a := 
a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A;
  a := 
a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A;
  a := 
a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A;
  a := a+a+A+a+a+A+a+a;

  writeln(Length(a));
  writeln('fst:',UTF8LengthFast(@a[1], Length(a)));
  writeln('pop:',UTF8LengthPop(@a[1], Length(a)));
  writeln('add:',UTF8LengthAdd(@a[1], Length(a)));
  writeln('asm:',asmUTF8Length(@a[1], Length(a){$ifdef 
asmdebug},@r[0]{$endif}));
  WriteLn();
  writeln(Length(a) div 8);

  WriteLn();

  for ii := 0 to 1 do begin
  for i := 0 to 3 do begin
    t := GetTickCount64;
    for j := 0 to 19 do
      UTF8LengthFast(@a[1], Length(a));
    t := GetTickCount64 - t;
    writeln('fst ',t);
  end;

  for i := 0 to 3 do begin
    t := GetTickCount64;
    for j := 0 to 19 do
      UTF8LengthPop(@a[1], Length(a));
    t := GetTickCount64 - t;
    writeln('pop ',t);
  end;

  for i := 0 to 3 do begin
    t := GetTickCount64;
    for j := 0 to 19 do
      UTF8LengthAdd(@a[1], Length(a));
    t := GetTickCount64 - t;
    writeln('add ',t);
  end;

  for i := 0 to 3 do begin
    t := GetTickCount64;
    for j := 0 to 19 do
      asmUTF8Length(@a[1], {$ifdef asmdebug}@r[0],{$endif} Length(a));
    t := GetTickCount64 - t;
    writeln('asm ',t);
  end;

  end;
    {$endif}
 {$ifndef FPC}
 if debughook<>nil then // runtime debugger detection
 {$endif}
   readln;
end.

-- 
_______________________________________________
lazarus mailing list
laza...@lists.lazarus-ide.org
https://lists.lazarus-ide.org/listinfo/lazarus

_______________________________________________
fpc-devel maillist  -  fpc-devel@lists.freepascal.org
https://lists.freepascal.org/cgi-bin/mailman/listinfo/fpc-devel

[fpc-devel] Attn: J. Gareth // 3.3.1 opt = slower // Fwd: [Lazarus] Faster than popcnt

Reply via email to