Monday, September 19, 2011

XE2, strange optimizations when in 64 bits...

 Given this silly code:

type
  { Form class used for the experiment. FormCreate is the only method;
    it operates on the four integer fields declared below. }
  TForm50 = class(TForm)
    procedure FormCreate(Sender: TObject);
  private
    { Four integer counters stored as instance fields, i.e. reached
      through Self. }
    FA, FB, FC, FD: integer;
    { Private declarations }
  public
    { Public declarations }
  end;

var
  Form50: TForm50;

implementation

{$R *.dfm}

{ Field-based version of the test: zeroes the four counter fields and then
  increments each of them on every loop iteration, using steps 1, 2, 3 and 4.
  All reads and writes inside the loop go directly to fields of Self.
  Sender is not used. }
procedure TForm50.FormCreate(Sender: TObject);
var
  i: integer;

begin
  FA := 0;
  FB := 0;
  FC := 0;
  FD := 0;

  { 0..999999 inclusive, i.e. 1,000,000 iterations; i itself is never read
    in the body. }
  for i := 0 to 999999 do
  begin
    Inc(FA);
    Inc(FB, 2);
    Inc(FC, 3);
    Inc(FD, 4);
  end;
end;

The optimized 32-bit code looks like this:

Unit50.pas.31: FA := 0;
0050FE5C 33D2             xor edx,edx
0050FE5E 899090030000     mov [eax+$00000390],edx
Unit50.pas.32: FB := 0;
0050FE64 33D2             xor edx,edx
0050FE66 899094030000     mov [eax+$00000394],edx
Unit50.pas.33: FC := 0;
0050FE6C 33D2             xor edx,edx
0050FE6E 899098030000     mov [eax+$00000398],edx
Unit50.pas.34: FD := 0;
0050FE74 33D2             xor edx,edx
0050FE76 89909C030000     mov [eax+$0000039c],edx

Unit50.pas.36: for i := 0 to 999999 do
0050FE7C BA40420F00       mov edx,$000f4240
Unit50.pas.38: Inc(FA);
0050FE81 FF8090030000     inc dword ptr [eax+$00000390]
Unit50.pas.39: Inc(FB, 2);
0050FE87 83809403000002   add dword ptr [eax+$00000394],$02
Unit50.pas.40: Inc(FC, 3);
0050FE8E 83809803000003   add dword ptr [eax+$00000398],$03
Unit50.pas.41: Inc(FD, 4);
0050FE95 83809C03000004   add dword ptr [eax+$0000039c],$04
Unit50.pas.36: for i := 0 to 999999 do
0050FE9C 4A               dec edx
0050FE9D 75E2             jnz $0050fe81

Quite OK: the fields are accessed directly through the Self pointer held in EAX, and the loop counts down.

and now the 64 bits version:

Unit50.pas.31: FA := 0;
000000000059C520 C7815006000000000000 mov [rcx+$00000650],$00000000
Unit50.pas.32: FB := 0;
000000000059C52A C7815406000000000000 mov [rcx+$00000654],$00000000
Unit50.pas.33: FC := 0;
000000000059C534 C7815806000000000000 mov [rcx+$00000658],$00000000
Unit50.pas.34: FD := 0;
000000000059C53E C7815C06000000000000 mov [rcx+$0000065c],$00000000
Unit50.pas.36: for i := 0 to 999999 do
000000000059C548 4833C0           xor rax,rax
Unit50.pas.38: Inc(FA);
000000000059C54B 488D9150060000   lea rdx,[rcx+$00000650]
000000000059C552 830201           add dword ptr [rdx],$01
Unit50.pas.39: Inc(FB, 2);
000000000059C555 488D9154060000   lea rdx,[rcx+$00000654]
000000000059C55C 830202           add dword ptr [rdx],$02
Unit50.pas.40: Inc(FC, 3);
000000000059C55F 488D9158060000   lea rdx,[rcx+$00000658]
000000000059C566 830203           add dword ptr [rdx],$03
Unit50.pas.41: Inc(FD, 4);
000000000059C569 488D915C060000   lea rdx,[rcx+$0000065c]
000000000059C570 830204           add dword ptr [rdx],$04
Unit50.pas.42: end;
000000000059C573 83C001           add eax,$01
000000000059C576 81F840420F00     cmp eax,$000f4240
000000000059C57C 75CD             jnz $000000000059c54b

What is this? Why is a pointer loaded into rdx for every single access to a Self field? Is there some restriction in x64 assembly that I'm not aware of? Can't one simply do add dword ptr [rcx+$0000065c], 4? And what about the loop, which now increments and compares instead of counting down?

Disappointing? Well, if you are able to modify your code so that it accesses Self fields as little as possible, then the optimization works better. Let's see with this slight modification:

{ Local-variable version of the test: performs the same 1,000,000 increments
  (steps 1, 2, 3 and 4) on local integers and only writes the results to the
  FA..FD fields once, after the loop. The loop body therefore does not touch
  Self at all. Sender is not used. }
procedure TForm50.FormCreate(Sender: TObject);
var
  a, b, c, d,
  i: integer;

begin
  a := 0;
  b := 0;
  c := 0;
  d := 0;

  { 0..999999 inclusive, i.e. 1,000,000 iterations; i itself is never read
    in the body. }
  for i := 0 to 999999 do
  begin
    Inc(a);
    Inc(b, 2);
    Inc(c, 3);
    Inc(d, 4);
  end;

  { Single write-back of the accumulated values to the instance fields. }
  FA := a;
  FB := b;
  FC := c;
  FD := d;
end;

and now the assembler code shows:

Unit50.pas.32: a := 0;
000000000059C520 4833C0           xor rax,rax
Unit50.pas.33: b := 0;
000000000059C523 4833D2           xor rdx,rdx
Unit50.pas.34: c := 0;
000000000059C526 4D33C0           xor r8,r8
Unit50.pas.35: d := 0;
000000000059C529 4D33C9           xor r9,r9
Unit50.pas.37: for i := 0 to 999999 do
000000000059C52C 4D33D2           xor r10,r10
Unit50.pas.39: Inc(a);
000000000059C52F 83C001           add eax,$01
Unit50.pas.40: Inc(b, 2);
000000000059C532 83C202           add edx,$02
Unit50.pas.41: Inc(c, 3);
000000000059C535 4183C003         add r8d,$03
Unit50.pas.42: Inc(d, 4);
000000000059C539 4183C104         add r9d,$04
Unit50.pas.43: end;
000000000059C53D 4183C201         add r10d,$01
000000000059C541 4181FA40420F00   cmp r10d,$000f4240
000000000059C548 75E5             jnz $000000000059c52f
000000000059C54A 90               nop
Unit50.pas.45: FA := a;
000000000059C54B 898150060000     mov [rcx+$00000650],eax
Unit50.pas.46: FB := b;
000000000059C551 899154060000     mov [rcx+$00000654],edx
Unit50.pas.47: FC := c;
000000000059C557 44898158060000   mov [rcx+$00000658],r8d
Unit50.pas.48: FD := d;
000000000059C55E 4489895C060000   mov [rcx+$0000065c],r9d

The code looks much better now, although the loop is still odd (add + cmp). The thing is that, in order to speed up the 64-bit version of ChessKISS, I have to remove the use of class fields as much as possible, which is not an easy task. Let's see if I have the energy to do so and whether it yields the expected results.

No comments:

Post a Comment