On 2023-10-11 04:15, J. Gareth Moreton via fpc-devel wrote:
Sweet, thank you. Would you be willing to share your modified test's
source? I was worried that if CPUID wasn't present it would cause a
SIGILL.
Sure, attached, but I didn't do anything special. I added a conditional
symbol (DETECTCPU) to the source so that this detection can easily be
disabled for x86 by not defining that symbol, and I was prepared to
recompile with the functionality disabled on the old AMD DX4 if
needed. However, I didn't need to do so - the AMD DX4 machine simply
ignored it and chose the branch used in case of missing support for the
particular CPUID function. I have no idea if this might be due to some
protection in OS/2 Warp 4 (used for compiling and running the test on
that machine) potentially masking that exception, or what was the
reason. Apparently, it should be possible to detect CPUID availability
(albeit not 100% reliably), see https://wiki.osdev.org/CPUID, but I
didn't use that.
Tomas
On 11/10/2023 01:47, Tomas Hajny via fpc-devel wrote:
On 2023-10-10 13:24, J. Gareth Moreton via fpc-devel wrote:
I'm all for receiving results for all kinds of processor, as it helps
me to make more informed choices on flags as well as confirming that
Agner Fog's instruction tables are correct. Also, results for older
processors can be hard to come by sometimes.
Currently, most architectures have a fast LEA, and the default
"Athlon" option lines up with this. Of the Intel architectures, the
speed slows down on COREAVX onwards (COREI is fine), so I added a new
COREX (for 10th generation Core) option between ZEN2 and ZEN3 to mark
the point where LEA is fast again (its 16-bit version is also fast,
unlike Zen 3).
In the meantime I'll be looking at the benchmarking code that Stefan
provided to see if it can and should be integrated.
Thanks again everyone for the results you're giving.
Alright, fine (I modified your test to include the CPU name as well if
possible and added an IFDEFed distinction of 32-bits versus 64-bits):
32-bits:
CPU = AMD A9-9425 RADEON R5, 5 COMPUTE CORES 2C+3G
-----------------------------------------------------
Pascal control case: 0.85 ns/call
Using LEA instruction: 0.56 ns/call
Using ADD instructions: 0.84 ns/call
64-bits:
CPU = AMD A9-9425 RADEON R5, 5 COMPUTE CORES 2C+3G
-----------------------------------------------------
Pascal control case: 0.85 ns/call
Using LEA instruction: 0.56 ns/call
Using ADD instructions: 0.85 ns/call
32-bits:
CPU = AMD Athlon(tm) Processor
------------------------------
Pascal control case: 6.10 ns/call
Using LEA instruction: 3.40 ns/call
Using ADD instructions: 3.40 ns/call
32-bits:
(AMD DX4 100 MHz - no CPUID name)
Pascal control case: 123 ns/call
Using LEA instruction: 72 ns/call
Using ADD instructions: 73 ns/call
Tomas
{ %CPU=i386,x86_64 }
{ Micro-benchmark that times a checksum loop written three ways: plain
  Pascal, a single LEA instruction, and a pair of ADD instructions. }
program blea;
{ Refuse to build for targets where the x86 inline assembly below cannot
  be assembled. }
{$IF not defined(CPUX86) and not defined(CPUX86_64)}
{$FATAL This test program requires an Intel x86 or x64 processor }
{$ENDIF}
{$MODE OBJFPC}
{$ASMMODE Intel}
{ When DETECTCPU is not defined, the 32-bit FillBrandName is compiled
  without any CPUID instruction and always returns False. }
{$DEFINE DETECTCPU}
uses
SysUtils;
type
{ Common signature of the three checksum routines handed to Benchmark. }
TBenchmarkProc = function(const Input, X, Y: LongWord): LongWord;
var
{ Receives the CPU brand string from FillBrandName: 49 characters, i.e.
  room for 48 brand-string bytes plus a trailing #0. }
CPUName: array[0..48] of Char;
{$ifdef CPUX86_64}
{ x86-64 variant: copies the processor brand string returned by CPUID
  leaves $80000002..$80000004 into the global CPUName buffer.
  Returns True (AL = 1) when the string was stored, False (AL = 0) when
  leaf $80000000 reports a maximum extended leaf below $80000004. }
function FillBrandName: Boolean; assembler; nostackframe;
asm
{ CPUID overwrites EBX, so RBX is saved and restored by hand here. }
PUSH RBX
{ Ask for the highest supported extended leaf; it comes back in EAX. }
MOV EAX, $80000000
CPUID
CMP EAX, $80000004
JB @Unavailable
{ R8 holds the destination pointer because CPUID does not touch it. }
LEA R8, [RIP + CPUName]
{ First 16 bytes of the brand string. }
MOV EAX, $80000002
CPUID
MOV [R8], EAX
MOV [R8 + 4], EBX
MOV [R8 + 8], ECX
MOV [R8 + 12], EDX
{ Middle 16 bytes. }
MOV EAX, $80000003
CPUID
MOV [R8 + 16], EAX
MOV [R8 + 20], EBX
MOV [R8 + 24], ECX
MOV [R8 + 28], EDX
{ Last 16 bytes. }
MOV EAX, $80000004
CPUID
MOV [R8 + 32], EAX
MOV [R8 + 36], EBX
MOV [R8 + 40], ECX
MOV [R8 + 44], EDX
{ Terminate the string in the 49th and last element of CPUName. }
MOV BYTE PTR [R8 + 48], 0
MOV AL, 1
JMP @ExitBrand
@Unavailable:
XOR AL, AL
@ExitBrand:
POP RBX
end;
{$else CPUX86_64}
{ i386 variant: copies the processor brand string returned by CPUID
  leaves $80000002..$80000004 into the global CPUName buffer and
  terminates it with #0.
  Returns True when the string was stored. Returns False when DETECTCPU
  is not defined, when the CPU does not let EFLAGS.ID be toggled (taken
  as "no CPUID instruction"), or when the maximum extended leaf reported
  by leaf $80000000 is below $80000004. }
function FillBrandName: Boolean; assembler; nostackframe;
asm
{$IFDEF DETECTCPU}
  { EBX is overwritten by CPUID and ESI is used as the destination
    pointer; both must survive the call, and nostackframe means nothing
    saves them for us. }
  push ebx
  push esi
  { Probe for CPUID: the instruction exists only if bit 21 (ID) of
    EFLAGS can be flipped. Executing CPUID without this check may raise
    an invalid-opcode fault on older processors. }
  pushfd
  pop eax
  mov ecx, eax              { ECX = original EFLAGS }
  xor eax, $00200000        { flip the ID bit }
  push eax
  popfd
  pushfd
  pop eax                   { EAX = EFLAGS as the CPU actually kept them }
  push ecx
  popfd                     { put the original EFLAGS back }
  xor eax, ecx
  test eax, $00200000       { did the ID bit change? }
  jz @not_supported
  { Ask for the highest supported extended leaf; it comes back in EAX. }
  mov eax, $80000000
  cpuid
  cmp eax, $80000004
  jb @not_supported
  lea esi, CPUName
  { First 16 bytes of the brand string. }
  mov eax, 80000002h
  cpuid
  mov [esi], eax
  mov [esi+4], ebx
  mov [esi+8], ecx
  mov [esi+12], edx
  { Middle 16 bytes. }
  mov eax, 80000003h
  cpuid
  mov [esi+16], eax
  mov [esi+20], ebx
  mov [esi+24], ecx
  mov [esi+28], edx
  { Last 16 bytes. }
  mov eax, 80000004h
  cpuid
  mov [esi+32], eax
  mov [esi+36], ebx
  mov [esi+40], ecx
  mov [esi+44], edx
  { Terminate explicitly instead of relying on CPUName still being
    zero-filled. }
  mov byte ptr [esi+48], 0
  mov eax, 1
  jmp @exit
@not_supported:
  xor eax, eax
@exit:
  pop esi
  pop ebx
{$ELSE DETECTCPU}
  xor eax, eax
{$ENDIF DETECTCPU}
end;
{$endif CPUX86_64}
{ Pure Pascal control case for the benchmark.
  Starting from Input, performs Y rounds; each round adds X and the
  constant $87654321 to the running value and then XORs it with the
  number of rounds still outstanding (Y, Y-1, ..., 1).
  Returns Input unchanged when Y is 0. }
function Checksum_PAS(const Input, X, Y: LongWord): LongWord;
var
  Remaining: LongWord;
begin
  Result := Input;
  for Remaining := Y downto 1 do
  begin
    Result := Result + X + $87654321;
    Result := Result xor Remaining;
  end;
end;
{ Assembly version of Checksum_PAS that performs the per-round addition
  with two ADD instructions.
  Input, X and Y are used directly as register operands; Input is the
  running value and Y doubles as the round counter.
  Returns Input unchanged when Y is 0, like Checksum_PAS. }
function Checksum_ADD(const Input, X, Y: LongWord): LongWord; assembler;
nostackframe;
asm
  { The loop below is bottom-tested, so without this guard Y = 0 would
    wrap around and run 2^32 rounds instead of none. }
  TEST Y, Y
  JZ @Done1
@Loop1:
  ADD Input, $87654321
  ADD Input, X
  XOR Input, Y
  DEC Y
  JNZ @Loop1
@Done1:
  MOV Result, Input
end;
{ Assembly version of Checksum_PAS that performs the per-round addition
  with a single LEA (base + index + displacement).
  Input, X and Y are used directly as register operands; Input is the
  running value and Y doubles as the round counter.
  Returns Input unchanged when Y is 0, like Checksum_PAS. }
function Checksum_LEA(const Input, X, Y: LongWord): LongWord; assembler;
nostackframe;
asm
  { The loop below is bottom-tested, so without this guard Y = 0 would
    wrap around and run 2^32 rounds instead of none. }
  TEST Y, Y
  JZ @Done2
@Loop2:
  LEA Input, [Input + X + $87654321]
  XOR Input, Y
  DEC Y
  JNZ @Loop2
@Done2:
  MOV Result, Input
end;
{ Times one checksum routine and prints the result on a single line.
  name - label written in front of the timing
  proc - routine under test
  Z    - initial checksum value
  X    - value passed as the X argument on every call
  Calls proc 100000 times, each time asking for 1000 internal rounds and
  feeding the previous result back in as Input. The printed figure is the
  wall-clock time divided by the total number of rounds, in nanoseconds,
  shown with two decimals below 10 ns and with none otherwise.
  Returns the final checksum so the caller can compare implementations. }
function Benchmark(const name: string; proc: TBenchmarkProc; Z, X: LongWord):
  LongWord;
const
  internal_reps = 1000;
  outer_reps = 100000;
var
  Started: TDateTime;
  NsPerRound: Double;
  Pass: Cardinal;
begin
  Write(name, ': ');
  Result := Z;
  Started := Now;
  for Pass := 1 to outer_reps do
    Result := proc(Result, X, internal_reps);
  { Now counts in days; convert to seconds, then to ns per round. }
  NsPerRound := ((Now - Started) * SecsPerDay) / outer_reps / internal_reps * 1e9;
  if NsPerRound < 10 then
    WriteLn(NsPerRound:0:2, ' ns/call')
  else
    WriteLn(NsPerRound:0:0, ' ns/call');
end;
var
  { Final checksums: [0] Pascal control, [1] LEA, [2] ADD. }
  Checksums: array[0..2] of LongWord;
  Failures, DashIdx: Integer;
begin
  { Say which build is running. }
{$IFDEF CPUX86}
  WriteLn ('32 bits:');
{$ENDIF CPUX86}
{$IFDEF CPUX86_64}
  WriteLn ('64 bits:');
{$ENDIF CPUX86_64}

  { Print the CPU name, when available, and underline it. }
  if FillBrandName then
  begin
    WriteLn('CPU = ', CPUName);
    { Turn every character before the terminating #0 into a dash so the
      same buffer can be printed again as the underline. }
    for DashIdx := Low(CPUName) to High(CPUName) do
    begin
      if CPUName[DashIdx] = #0 then
        Break;
      CPUName[DashIdx] := '-';
    end;
    WriteLn('------', CPUName);
  end;

  { Run the three variants with identical inputs. }
  Checksums[0] := Benchmark(' Pascal control case', @Checksum_PAS, 5000000,
    1000);
  Checksums[1] := Benchmark(' Using LEA instruction', @Checksum_LEA, 5000000,
    1000);
  Checksums[2] := Benchmark('Using ADD instructions', @Checksum_ADD, 5000000,
    1000);

  { Both assembly variants must reproduce the control checksum. The exit
    code is a bit mask: 1 = LEA mismatch, 2 = ADD mismatch. }
  Failures := 0;
  if Checksums[1] <> Checksums[0] then
  begin
    WriteLn('ERROR: Checksum_LEA doesn''t match control case');
    Failures := Failures or 1;
  end;
  if Checksums[2] <> Checksums[0] then
  begin
    WriteLn('ERROR: Checksum_ADD doesn''t match control case');
    Failures := Failures or 2;
  end;
  if Failures <> 0 then
    Halt(Failures);
end.
_______________________________________________
fpc-devel maillist - [email protected]
https://lists.freepascal.org/cgi-bin/mailman/listinfo/fpc-devel