I had the feeling that the compiler does some strange optimizations somewhere, since ChessKISS is slower in its 64-bit version. To prove it, I created the following test:
type
// Test form for a code-generation experiment: besides the event handler it
// only declares four NativeInt fields, which FormCreate reads and writes.
TForm44 = class(TForm)
// Handler with the standard (Sender: TObject) event signature; presumably
// wired to the form's OnCreate event in the .dfm - not visible here.
procedure FormCreate(Sender: TObject);
private
// Instance fields used as intermediate results of the multiply chain in
// FormCreate. NativeInt is pointer-sized (4 bytes on Win32, 8 bytes on
// Win64). The declaration order determines the field layout, so do not
// reorder these if the disassembly is to be compared again.
FA,
FB,
FC,
FD: NativeInt;
{ Private declarations }
public
{ Public declarations }
end;
var
// Unit-level variable of type TForm44; where it gets assigned is not shown
// in this unit (presumably by the application's form-creation code).
Form44: TForm44;
implementation
{$R *.dfm}
// Runs a chain of multiplications by powers of two that alternates between
// local variables (a..d) and instance fields (FA..FD): each field is written
// and then immediately read again by the next statement. This makes it easy
// to see in the generated code how the compiler handles accesses via Self.
// The final value is 2 * 4 * 8 * 16 * 32 * 64 * 128 = 2^28 = 268435456,
// which is then shown as the form's caption.
// Sender is not used.
procedure TForm44.FormCreate(Sender: TObject);
var
a, b, c, d: NativeInt;
begin
a := 1;
FA := a * 2; // local -> field
b := FA * 4; // field -> local (reads back the value just stored)
FB := b * 8; // local -> field
c := FB * 16; // field -> local
FC := c * 32; // local -> field
d := FC * 64; // field -> local
FD := d * 128; // local -> field; FD now holds 2^28
// Display the final result in the form's caption.
Caption := IntToStr(FD);
end;
end.
The optimized Win32 version:
Unit44.pas.31: a := 1;
0050FE73 B801000000 mov eax,$00000001
Unit44.pas.32: fa := a * 2;
0050FE78 8BD0 mov edx,eax
0050FE7A 03D2 add edx,edx
0050FE7C 899390030000 mov [ebx+$00000390],edx
Unit44.pas.33: b := fa * 4;
0050FE82 8BC2 mov eax,edx
0050FE84 03C0 add eax,eax
0050FE86 03C0 add eax,eax
Unit44.pas.34: fb := b * 8;
0050FE88 8BD0 mov edx,eax
0050FE8A 03D2 add edx,edx
0050FE8C 03D2 add edx,edx
0050FE8E 03D2 add edx,edx
0050FE90 899394030000 mov [ebx+$00000394],edx
Unit44.pas.35: c := fb * 16;
0050FE96 8BC2 mov eax,edx
0050FE98 C1E004 shl eax,$04
Unit44.pas.36: fc := c * 32;
0050FE9B 8BD0 mov edx,eax
0050FE9D C1E205 shl edx,$05
0050FEA0 899398030000 mov [ebx+$00000398],edx
Unit44.pas.37: d := fc * 64;
0050FEA6 8BC2 mov eax,edx
0050FEA8 C1E006 shl eax,$06
Unit44.pas.38: fd := d * 128;
0050FEAB 8BF0 mov esi,eax
0050FEAD C1E607 shl esi,$07
0050FEB0 89B39C030000 mov [ebx+$0000039c],esi
A more or less good optimization. Now let's see the Win64 version:
Unit44.pas.31: a := 1;
000000000059C535 C7C001000000 mov eax,$00000001
Unit44.pas.32: FA := a * 2;
000000000059C53B 488D0400 lea rax,[rax+rax]
000000000059C53F 488B4D20 mov rcx,[rbp+$20]
000000000059C543 48898150060000 mov [rcx+$00000650],rax
Unit44.pas.33: b := FA * 4;
000000000059C54A 488B8150060000 mov rax,[rcx+$00000650]
000000000059C551 4803C0 add rax,rax
000000000059C554 4803C0 add rax,rax
Unit44.pas.34: FB := b * 8;
000000000059C557 488D04C500000000 lea rax,[rax*8+$0000]
000000000059C55F 488B4D20 mov rcx,[rbp+$20]
000000000059C563 48898158060000 mov [rcx+$00000658],rax
Unit44.pas.35: c := FB * 16;
000000000059C56A 488B8158060000 mov rax,[rcx+$00000658]
000000000059C571 48C1E004 shl rax,$04
Unit44.pas.36: FC := c * 32;
000000000059C575 48C1E005 shl rax,$05
000000000059C579 488B4D20 mov rcx,[rbp+$20]
000000000059C57D 48898160060000 mov [rcx+$00000660],rax
Unit44.pas.37: d := FC * 64;
000000000059C584 488B8160060000 mov rax,[rcx+$00000660]
000000000059C58B 48C1E006 shl rax,$06
Unit44.pas.38: FD := d * 128;
000000000059C58F 48C1E007 shl rax,$07
000000000059C593 488B4D20 mov rcx,[rbp+$20]
000000000059C597 48898168060000 mov [rcx+$00000668],rax
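At first glance it seems OK, but a closer look (see the instructions highlighted in blue) shows some strange results: accesses through Self are not well optimized (I've also noticed this in ChessKISS). After every field store the compiler reloads the pointer into rcx (mov rcx,[rbp+$20]) and then reads the value it has just written back into rax (e.g. mov [rcx+$00000650],rax followed by mov rax,[rcx+$00000650]), even though that value is still in rax. It looks like both could easily be kept in registers (and you have many of them in x64).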
No comments:
Post a Comment