Monday, September 5, 2011

Trying Delphi XE2, some performance tests, part 2

I had the feeling that the compiler does some strange optimizations somewhere, since ChessKISS is slower in the 64-bit version. To prove it, I created the following test:


type
  { Test form for the code-generation experiment: FormCreate writes the
    four fields below one after another. }
  TForm44 = class(TForm)
    procedure FormCreate(Sender: TObject);
  private
    { Native-sized integer fields; declaration order is FA, FB, FC, FD. }
    FA, FB, FC, FD: NativeInt;
  public
    { Public declarations }
  end;

var
  Form44: TForm44;

implementation

{$R *.dfm}

{ Code-generation test case: a chain of multiplications by increasing powers
  of two (2, 4, 8, ..., 128) that alternates between a local variable and an
  instance field, so every second assignment is a store through Self and the
  statement after it reads that field back.
  Starting from 1, the chain yields 2^(1+2+...+7) = 2^28 = 268435456, which
  is finally shown as the form caption.
  Sender is not used. Presumably this is wired to the form's OnCreate event
  in the .dfm - confirm there. }
procedure TForm44.FormCreate(Sender: TObject);
var
  a, b, c, d: NativeInt;

begin
  a := 1;
  FA := a * 2;    // FA = 2 (field store)

  b := FA * 4;    // b = 8 (field read-back)
  FB := b * 8;    // FB = 64

  c := FB * 16;   // c = 1024
  FC := c * 32;   // FC = 32768

  d := FC * 64;   // d = 2097152
  FD := d * 128;  // FD = 268435456 (2^28, still fits a 32-bit NativeInt)

  // Display the final value; this also makes the result of the chain observable.
  Caption := IntToStr(FD);
end;

end.
The optimized Win32 version:
Unit44.pas.31: a := 1;
0050FE73 B801000000       mov eax,$00000001
Unit44.pas.32: fa := a * 2;
0050FE78 8BD0             mov edx,eax
0050FE7A 03D2             add edx,edx
0050FE7C 899390030000     mov [ebx+$00000390],edx
Unit44.pas.33: b := fa * 4;
0050FE82 8BC2             mov eax,edx
0050FE84 03C0             add eax,eax
0050FE86 03C0             add eax,eax
Unit44.pas.34: fb := b * 8;
0050FE88 8BD0             mov edx,eax
0050FE8A 03D2             add edx,edx
0050FE8C 03D2             add edx,edx
0050FE8E 03D2             add edx,edx
0050FE90 899394030000     mov [ebx+$00000394],edx
Unit44.pas.35: c := fb * 16;
0050FE96 8BC2             mov eax,edx
0050FE98 C1E004           shl eax,$04
Unit44.pas.36: fc := c * 32;
0050FE9B 8BD0             mov edx,eax
0050FE9D C1E205           shl edx,$05
0050FEA0 899398030000     mov [ebx+$00000398],edx
Unit44.pas.37: d := fc * 64;
0050FEA6 8BC2             mov eax,edx
0050FEA8 C1E006           shl eax,$06
Unit44.pas.38: fd := d * 128;
0050FEAB 8BF0             mov esi,eax
0050FEAD C1E607           shl esi,$07
0050FEB0 89B39C030000     mov [ebx+$0000039c],esi
A more or less good optimization. Now let's see the Win64 version:
Unit44.pas.31: a := 1;
000000000059C535 C7C001000000     mov eax,$00000001
Unit44.pas.32: FA := a * 2;
000000000059C53B 488D0400         lea rax,[rax+rax]
000000000059C53F 488B4D20         mov rcx,[rbp+$20]
000000000059C543 48898150060000   mov [rcx+$00000650],rax
Unit44.pas.33: b := FA * 4;
000000000059C54A 488B8150060000   mov rax,[rcx+$00000650]
000000000059C551 4803C0           add rax,rax
000000000059C554 4803C0           add rax,rax
Unit44.pas.34: FB := b * 8;
000000000059C557 488D04C500000000 lea rax,[rax*8+$0000]
000000000059C55F 488B4D20         mov rcx,[rbp+$20]
000000000059C563 48898158060000   mov [rcx+$00000658],rax
Unit44.pas.35: c := FB * 16;
000000000059C56A 488B8158060000   mov rax,[rcx+$00000658]
000000000059C571 48C1E004         shl rax,$04
Unit44.pas.36: FC := c * 32;
000000000059C575 48C1E005         shl rax,$05
000000000059C579 488B4D20         mov rcx,[rbp+$20]
000000000059C57D 48898160060000   mov [rcx+$00000660],rax
Unit44.pas.37: d := FC * 64;
000000000059C584 488B8160060000   mov rax,[rcx+$00000660]
000000000059C58B 48C1E006         shl rax,$06
Unit44.pas.38: FD := d * 128;
000000000059C58F 48C1E007         shl rax,$07
000000000059C593 488B4D20         mov rcx,[rbp+$20]
000000000059C597 48898168060000   mov [rcx+$00000668],rax
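At first glance it seems OK, but a closer look (check the lines in blue) shows some strange results: accesses through Self are not well optimized (I've noticed that in ChessKISS as well). The compiler keeps saving the current value to the field and immediately reading it back into the rax register (for example, mov [rcx+$00000650],rax followed by mov rax,[rcx+$00000650]), and it reloads Self from [rbp+$20] before every store. It looks like both could easily be kept in registers (and you have many of them in x64).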

No comments:

Post a Comment