Hi Gareth,not sure if this is of interest to you, but I see you do a lot on the optimizer....
While testing the attached, I found that one of the functions was notable slower when compiled with 3.3.1 (compared to 3.2.3).
So maybe something you are interested in looking at? The Code in "Utf8LengthFash" (fst) went from around 600ms to 700ms. 3.3.1 from Dec 10th 3.2.3 from Dec 9th Core I7 8700K -O4 -Cpcoreavx2 fpc 3.2.3 / fpc 3.3.1 fst 594 fst 688 fst 578 fst 703 fst 578 fst 687 fst 562 fst 688 pop 485 pop 485 pop 500 pop 500 pop 500 pop 484 pop 484 pop 500 add 594 add 422 add 578 add 438 add 578 add 437 add 594 add 453
// // (C) 2021 Martin Friebe and Marco van de Voort. // attempt to accelerate utf8lengthfast which is a length(s) in utf8 codepoints without integrity checking // // 4 versions. // - Original, // - with popcount and // - the "add" variant that accumulates 127 iterations of ptrints and only adds // the intermeidates outside that loop // - a SSE2 version loosely inspired by the add variant combined with // the core of an existing (branchless) binarization routine for the main loop. {$mode objfpc}{$H+} {$asmmode intel} {$coperators on} {define asmdebug} uses SysUtils,StrUtils; const mask3 : array[0..15] of byte = ( $C0,$C0,$C0,$C0, $C0,$C0,$C0,$C0, $C0,$C0,$C0,$C0, $C0,$C0,$C0,$C0); mask4 : array[0..15] of byte = ( $80,$80,$80,$80, $80,$80,$80,$80, $80,$80,$80,$80, $80,$80,$80,$80); mask2 : array[0..15] of byte = ( $1,$1,$1,$1, $1,$1,$1,$1, $1,$1,$1,$1, $1,$1,$1,$1); // Integer arguments are passed in registers RCX, RDX, R8, and R9. // Floating point arguments are passed in XMM0L, XMM1L, XMM2L, and XMM3L. // volatile: RAX, RCX, RDX, R8, R9, R10, R11 // nonvolatile RBX, RBP, RDI, RSI, RSP, R12, R13, R14, and R15 are considered nonvolatile // volatile xmm0-xmm3 (params) en xmm4,5 // https://msdn.microsoft.com/en-us/library/ms235286.aspx {$ifdef asmdebug} function asmutf8length(const s : pchar;len:integer,res:pbyte):int64; {$else} function asmutf8length(const s : pchar;len:integer):int64;{$ifndef Windows}assembler; nostackframe;{$endif} {$endif} {$ifdef Windows} begin {$endif} asm // tuning for short strings: // ------ {$ifndef Windows} // we can't use [s] as an alias for the pointer parameter, because the non assembler procedure on Windows // changes that into a stack reference. FPC doesn't support non volatile frame management for assembler procs like Delphi does. mov rcx,s // rdi mov edx,len // rsi {$endif} test rax,rax je @theend cmp rdx,128 // threshold between long and short. jl @restbytes mov rax,rdx mov r10,rcx and r10,15 mov r9,16 sub r9,r10 and r9,15 test r9,r9 je @nopreloop sub rdx,r9 @preloop: // roughly 2 cycles per iteration on ivy bridge movzx r11d, byte [rcx] // unaligned bytes after sse loop mov r10,r11 shr r10,7 not r11 shr r11,6 and r10,r11 sub rax,r10 inc rcx dec r9 jne @preloop @nopreloop: mov r9,rdx and r9,15 shr rdx,4 pxor xmm5,xmm5 // always zero pxor xmm6,xmm6 // dword counts // using broadcast etc raises requirements? -> use constant loads. movdqu xmm1,[rip+mask3] movdqu xmm2,[rip+mask4] movdqu xmm3,[rip+mask2] test rdx,rdx je @restbytes @outer: mov r10,127 // max iterations per inner loop cmp r10,rdx // more or less left? jl @last // more mov r10,rdx // less @last: sub rdx,r10 // iterations left - iterations to do pxor xmm4,xmm4 // process 127 iterations (limit of signed int8) @inner: // +/- 2.2 cycles per iteration for 16 bytes on ivy bridge movdqu xmm0, [rcx] pand xmm0,xmm1 // mask out top 2 bits pcmpeqb xmm0,xmm2 // compare with $80. pand xmm0,xmm3 // change to $1 per byte. paddb xmm4,xmm0 // add to cumulative add rcx,16 dec r10 jne @inner // SSSE3 vertical adds might help this, but increase CPU reqs. movdqa xmm0,xmm4 PUNPCKLBW xmm0,xmm5 // zero extend to words PUNPCKHBW xmm4,xmm5 paddw xmm0,xmm4 // add, now 8 16-bit words. movdqa xmm4,xmm0 PUNPCKLWD xmm0,xmm5 // zero extend to dwords paddd xmm6,xmm0 PUNPCKHWD xmm4,xmm5 paddd xmm6,xmm4 // add both L and H to cumulative 4x dword xmm6 reg test rdx,rdx jne @outer MOVHLPS xmm4,xmm6 // move high 8 bytes to low (no float->int penalty for move only?) paddd xmm6,xmm4 // add both 2*dwords (high doesn't matter) pshufd xmm4,xmm6,1 // mov 2nd dword in xmm6 to first in xmm4 paddd xmm6,xmm4 // add movd edx,xmm6 // to int alu reg sub rax,rdx // subtract from length in bytes. @restbytes: test r9,r9 je @theend // Done! @restloop: movzx edx, byte [rcx] // unaligned bytes after sse loop mov r10,rdx shr r10,7 not rdx shr rdx,6 and r10,rdx sub rax,r10 inc rcx dec r9 jne @restloop @theend: {$ifdef Windows} end['xmm6']; // volatile registers used. {$endif} end; function countmask(nx:int64):integer; begin nx := (nx and $00FF00FF00FF00FF) + ((nx >> 8) and $00FF00FF00FF00FF); nx := (nx and $0000FFFF0000FFFF) + ((nx >> 16) and $0000FFFF0000FFFF); result := (nx and $00000000FFFFFFFF) + ((nx >> 32) and $00000000FFFFFFFF); end; function UTF8LengthFast(p: PChar; ByteCount: PtrInt): PtrInt; const {$ifdef CPU32} ONEMASK =$01010101; EIGHTYMASK=$80808080; {$endif} {$ifdef CPU64} ONEMASK =$0101010101010101; EIGHTYMASK=$8080808080808080; {$endif} var pnx: PPtrInt absolute p; // To get contents of text in PtrInt blocks. x refers to 32 or 64 bits pn8: pint8 absolute pnx; // To read text as Int8 in the initial and final loops ix: PtrInt absolute pnx; // To read text as PtrInt in the block loop nx: PtrInt; // values processed in block loop i,cnt,e: PtrInt; begin Result := 0; e := ix+ByteCount; // End marker // Handle any initial misaligned bytes. cnt := (not (ix-1)) and (sizeof(PtrInt)-1); if cnt>ByteCount then cnt := ByteCount; for i := 1 to cnt do begin // Is this byte NOT the first byte of a character? Result += (pn8^ shr 7) and ((not pn8^) shr 6); inc(pn8); end; // Handle complete blocks for i := 1 to (ByteCount-cnt) div sizeof(PtrInt) do begin // Count bytes which are NOT the first byte of a character. nx := ((pnx^ and EIGHTYMASK) shr 7) and ((not pnx^) shr 6); {$push}{$overflowchecks off} // "nx * ONEMASK" causes an arithmetic overflow. Result += (nx * ONEMASK) >> ((sizeof(PtrInt) - 1) * 8); {$pop} inc(pnx); end; // Take care of any left-over bytes. while ix<e do begin // Is this byte NOT the first byte of a character? Result += (pn8^ shr 7) and ((not pn8^) shr 6); inc(pn8); end; Result := ByteCount - Result; end; function UTF8LengthPop(p: PChar; ByteCount: PtrInt): PtrInt; const {$ifdef CPU32} ONEMASK =$01010101; EIGHTYMASK=$80808080; {$endif} {$ifdef CPU64} ONEMASK =$0101010101010101; EIGHTYMASK=$8080808080808080; {$endif} var pnx: PPtrInt absolute p; // To get contents of text in PtrInt blocks. x refers to 32 or 64 bits pn8: pint8 absolute pnx; // To read text as Int8 in the initial and final loops ix: PtrInt absolute pnx; // To read text as PtrInt in the block loop nx: PtrInt; // values processed in block loop i,cnt,e: PtrInt; begin Result := 0; e := ix+ByteCount; // End marker // Handle any initial misaligned bytes. cnt := (not (ix-1)) and (sizeof(PtrInt)-1); if cnt>ByteCount then cnt := ByteCount; for i := 1 to cnt do begin // Is this byte NOT the first byte of a character? Result += (pn8^ shr 7) and ((not pn8^) shr 6); inc(pn8); end; // Handle complete blocks for i := 1 to (ByteCount-cnt) div sizeof(PtrInt) do begin // Count bytes which are NOT the first byte of a character. nx := ((pnx^ and EIGHTYMASK) shr 7) and ((not pnx^) shr 6); {$push}{$overflowchecks off} // "nx * ONEMASK" causes an arithmetic overflow. //Result += (nx * ONEMASK) >> ((sizeof(PtrInt) - 1) * 8); Result += PopCnt(qword(nx)); {$pop} inc(pnx); end; // Take care of any left-over bytes. while ix<e do begin // Is this byte NOT the first byte of a character? Result += (pn8^ shr 7) and ((not pn8^) shr 6); inc(pn8); end; Result := ByteCount - Result; end; function UTF8LengthAdd(p: PChar; ByteCount: PtrInt): PtrInt; const {$ifdef CPU32} ONEMASK =$01010101; EIGHTYMASK=$80808080; {$endif} {$ifdef CPU64} ONEMASK =$0101010101010101; EIGHTYMASK=$8080808080808080; {$endif} var pnx: PPtrInt absolute p; // To get contents of text in PtrInt blocks. x refers to 32 or 64 bits pn8: pint8 absolute pnx; // To read text as Int8 in the initial and final loops ix: PtrInt absolute pnx; // To read text as PtrInt in the block loop nx: PtrInt; // values processed in block loop i,j,cnt,e, bc: PtrInt; begin Result := 0; e := ix+ByteCount; // End marker // Handle any initial misaligned bytes. cnt := (not (ix-1)) and (sizeof(PtrInt)-1); if cnt>ByteCount then cnt := ByteCount; for i := 1 to cnt do begin // Is this byte NOT the first byte of a character? Result += (pn8^ shr 7) and ((not pn8^) shr 6); inc(pn8); end; // Handle complete blocks bc := (ByteCount-cnt) div sizeof(PtrInt); for j := 1 to bc >> 7 do begin nx := 0; for i := 0 to 127 do begin // Count bytes which are NOT the first byte of a character. nx += ((pnx^ and EIGHTYMASK) shr 7) and ((not pnx^) shr 6); inc(pnx); end; nx := (nx and $00FF00FF00FF00FF) + ((nx >> 8) and $00FF00FF00FF00FF); nx := (nx and $0000FFFF0000FFFF) + ((nx >> 16) and $0000FFFF0000FFFF); nx := (nx and $00000000FFFFFFFF) + ((nx >> 32) and $00000000FFFFFFFF); Result := Result + nx; end; if (bc and 127) > 0 then begin nx := 0; for i := 1 to bc and 127 do begin // Count bytes which are NOT the first byte of a character. nx += ((pnx^ and EIGHTYMASK) shr 7) and ((not pnx^) shr 6); inc(pnx); end; nx := (nx and $00FF00FF00FF00FF) + ((nx >> 8) and $00FF00FF00FF00FF); nx := (nx and $0000FFFF0000FFFF) + ((nx >> 16) and $0000FFFF0000FFFF); nx := (nx and $00000000FFFFFFFF) + ((nx >> 32) and $00000000FFFFFFFF); Result := Result + nx; end; // Take care of any left-over bytes. while ix<e do begin // Is this byte NOT the first byte of a character? Result += (pn8^ shr 7) and ((not pn8^) shr 6); inc(pn8); end; Result := ByteCount - Result; end; // one of each pattern. const pattern : array[0..3] of char = (chr(%11001001),chr(%10001001), chr(%00001001),chr(%01001001)); function pseudorandomutf8string(len:integer;var cnt:integer):string; // random string but keep a count of bytes with high value %10 var lcnt:integer; i,j:integer; begin setlength(result,len); lcnt:=0; for i:=1 to length(result) do begin j:=random(4); //j:=i and 3; if j=1 then inc(lcnt); result[i]:=pattern[j]; end; cnt:=length(result)-lcnt; end; var r : array[0..10000] of byte; // FPC "registers" dialog is poor, we use this for writeln like dumping procedure testasmutf8length; const testlen = 64*100; var s : string; cnt : integer; rx : int64; begin randomize; s:=pseudorandomutf8string(testlen+Random(50),cnt); rx:=asmutf8length(pchar(s),length(s){$ifdef asmdebug},@r[0]{$endif}); writeln(inttohex(cnt,2),' = ',inttohex(rx,2),' ',inttohex(length(s)-cnt,2),' = ',inttohex(length(s)-rx,2)); // hex because most register dumps are easier in hex. {$ifdef asmdebug} for i:=0 to 6 do begin write(i:2,' '); for j:=0 to 3 do write(inttohex(pdword(@r[i*16+j*4])^,8), ' '); writeln; end; {$endif} end; var a: ansistring; t: QWord; i, j, ii: Integer; begin {$ifdef asmdebug} testasmutf8length; {$else} a := 'اربك تكست هو اول موقع يسمح لزواره الكرام بتحويل الكتابة العربي الى كتابة مفهومة من قبل اغلب برامج التصميم مثل الفوتوشوب و'; a := a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A; a := a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A; a := a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A; a := a+a+A+a+a+A+a+a; writeln(Length(a)); writeln('fst:',UTF8LengthFast(@a[1], Length(a))); writeln('pop:',UTF8LengthPop(@a[1], Length(a))); writeln('add:',UTF8LengthAdd(@a[1], Length(a))); writeln('asm:',asmUTF8Length(@a[1], Length(a){$ifdef asmdebug},@r[0]{$endif})); WriteLn(); writeln(Length(a) div 8); WriteLn(); for ii := 0 to 1 do begin for i := 0 to 3 do begin t := GetTickCount64; for j := 0 to 19 do UTF8LengthFast(@a[1], Length(a)); t := GetTickCount64 - t; writeln('fst ',t); end; for i := 0 to 3 do begin t := GetTickCount64; for j := 0 to 19 do UTF8LengthPop(@a[1], Length(a)); t := GetTickCount64 - t; writeln('pop ',t); end; for i := 0 to 3 do begin t := GetTickCount64; for j := 0 to 19 do UTF8LengthAdd(@a[1], Length(a)); t := GetTickCount64 - t; writeln('add ',t); end; for i := 0 to 3 do begin t := GetTickCount64; for j := 0 to 19 do asmUTF8Length(@a[1], {$ifdef asmdebug}@r[0],{$endif} Length(a)); t := GetTickCount64 - t; writeln('asm ',t); end; end; {$endif} {$ifndef FPC} if debughook<>nil then // runtime debugger detection {$endif} readln; end.
-- _______________________________________________ lazarus mailing list laza...@lists.lazarus-ide.org https://lists.lazarus-ide.org/listinfo/lazarus
_______________________________________________ fpc-devel maillist - fpc-devel@lists.freepascal.org https://lists.freepascal.org/cgi-bin/mailman/listinfo/fpc-devel