Monday, September 19, 2011

XE2, strange optimizations when in 64 bits...

 Given this silly code:

type
  TForm50 = class(TForm)
    procedure FormCreate(Sender: TObject);
  private
    FA, FB, FC, FD: integer;
    { Private declarations }
  public
    { Public declarations }
  end;

var
  Form50: TForm50;

implementation

{$R *.dfm}

procedure TForm50.FormCreate(Sender: TObject);
var
  i: integer;

begin
  FA := 0;
  FB := 0;
  FC := 0;
  FD := 0;

  for i := 0 to 999999 do
  begin
    Inc(FA);
    Inc(FB, 2);
    Inc(FC, 3);
    Inc(FD, 4);
  end;
end;

The optimized 32 bits code looks like:

Unit50.pas.31: FA := 0;
0050FE5C 33D2             xor edx,edx
0050FE5E 899090030000     mov [eax+$00000390],edx
Unit50.pas.32: FB := 0;
0050FE64 33D2             xor edx,edx
0050FE66 899094030000     mov [eax+$00000394],edx
Unit50.pas.33: FC := 0;
0050FE6C 33D2             xor edx,edx
0050FE6E 899098030000     mov [eax+$00000398],edx
Unit50.pas.34: FD := 0;
0050FE74 33D2             xor edx,edx
0050FE76 89909C030000     mov [eax+$0000039c],edx

Unit50.pas.36: for i := 0 to 999999 do
0050FE7C BA40420F00       mov edx,$000f4240
Unit50.pas.38: Inc(FA);
0050FE81 FF8090030000     inc dword ptr [eax+$00000390]
Unit50.pas.39: Inc(FB, 2);
0050FE87 83809403000002   add dword ptr [eax+$00000394],$02
Unit50.pas.40: Inc(FC, 3);
0050FE8E 83809803000003   add dword ptr [eax+$00000398],$03
Unit50.pas.41: Inc(FD, 4);
0050FE95 83809C03000004   add dword ptr [eax+$0000039c],$04
Unit50.pas.36: for i := 0 to 999999 do
0050FE9C 4A               dec edx
0050FE9D 75E2             jnz $0050fe81

Quite ok, direct access to the Self pointer located in EAX and decreasing loop.

and now the 64 bits version:

Unit50.pas.31: FA := 0;
000000000059C520 C7815006000000000000 mov [rcx+$00000650],$00000000
Unit50.pas.32: FB := 0;
000000000059C52A C7815406000000000000 mov [rcx+$00000654],$00000000
Unit50.pas.33: FC := 0;
000000000059C534 C7815806000000000000 mov [rcx+$00000658],$00000000
Unit50.pas.34: FD := 0;
000000000059C53E C7815C06000000000000 mov [rcx+$0000065c],$00000000
Unit50.pas.36: for i := 0 to 999999 do
000000000059C548 4833C0           xor rax,rax
Unit50.pas.38: Inc(FA);
000000000059C54B 488D9150060000   lea rdx,[rcx+$00000650]
000000000059C552 830201           add dword ptr [rdx],$01
Unit50.pas.39: Inc(FB, 2);
000000000059C555 488D9154060000   lea rdx,[rcx+$00000654]
000000000059C55C 830202           add dword ptr [rdx],$02
Unit50.pas.40: Inc(FC, 3);
000000000059C55F 488D9158060000   lea rdx,[rcx+$00000658]
000000000059C566 830203           add dword ptr [rdx],$03
Unit50.pas.41: Inc(FD, 4);
000000000059C569 488D915C060000   lea rdx,[rcx+$0000065c]
000000000059C570 830204           add dword ptr [rdx],$04
Unit50.pas.42: end;
000000000059C573 83C001           add eax,$01
000000000059C576 81F840420F00     cmp eax,$000f4240
000000000059C57C 75CD             jnz $000000000059c54b

What is this?, why for every single self access a pointer in rdx is loaded?, is that an issue in the x64 assembler code that I'm not aware?, one cannot do add dword prt [rcx+$000065c], 4?, and what about the increasing plus comparing loop?

Dissapointing?, well, if you are able to modify your code to access self variables as less as possible then the optimization works better, let's see with this slightly modification:

procedure TForm50.FormCreate(Sender: TObject);
var
  a, b, c, d,
  i: integer;

begin
  a := 0;
  b := 0;
  c := 0;
  d := 0;

  for i := 0 to 999999 do
  begin
    Inc(a);
    Inc(b, 2);
    Inc(c, 3);
    Inc(d, 4);
  end;

  FA := a;
  FB := b;
  FC := c;
  FD := d;
end;

and now the assembler code shows:

Unit50.pas.32: a := 0;
000000000059C520 4833C0           xor rax,rax
Unit50.pas.33: b := 0;
000000000059C523 4833D2           xor rdx,rdx
Unit50.pas.34: c := 0;
000000000059C526 4D33C0           xor r8,r8
Unit50.pas.35: d := 0;
000000000059C529 4D33C9           xor r9,r9
Unit50.pas.37: for i := 0 to 999999 do
000000000059C52C 4D33D2           xor r10,r10
Unit50.pas.39: Inc(a);
000000000059C52F 83C001           add eax,$01
Unit50.pas.40: Inc(b, 2);
000000000059C532 83C202           add edx,$02
Unit50.pas.41: Inc(c, 3);
000000000059C535 4183C003         add r8d,$03
Unit50.pas.42: Inc(d, 4);
000000000059C539 4183C104         add r9d,$04
Unit50.pas.43: end;
000000000059C53D 4183C201         add r10d,$01
000000000059C541 4181FA40420F00   cmp r10d,$000f4240
000000000059C548 75E5             jnz $000000000059c52f
000000000059C54A 90               nop
Unit50.pas.45: FA := a;
000000000059C54B 898150060000     mov [rcx+$00000650],eax
Unit50.pas.46: FB := b;
000000000059C551 899154060000     mov [rcx+$00000654],edx
Unit50.pas.47: FC := c;
000000000059C557 44898158060000   mov [rcx+$00000658],r8d
Unit50.pas.48: FD := d;
000000000059C55E 4489895C060000   mov [rcx+$0000065c],r9d

The code looks much better now although the loop still funny (add + cmp). The thing is that in order to speed up ChessKISS in the 64 bits version, I've to remove as much as possible the use of class fields, which is not an easy task, let's see if I got the energy to do so and it yields the expected results.

New code formatter

 I know I'm a bit lazy when we talk about polishing my blog entries, I will try to improve the quality of the HTML visualization of the code submitted, as an example I'm posting the full 2d collision unit, which BTW let's you test the collision of two bitmaps taking into account zoom and angle.

unit BB.Screen.Collisions;

interface

uses
  Windows, Classes,
  BB.Screen.Surfaces, BB.Screen.Interfaces, BB.E3D.Interfaces, BB.Types, BB.Screen.Types, BB.E3D.Polygons;

type
  TBoundingBox = (bbFull, bbManual, bbAuto);

  TCollision = class
  private
    FCollisionShape: TCollisionShape;
    FCollider: ICollider;
    FBB,
    FSiluoette,
    FShape,
    FCurrentShape: IShape;
    FAccuracy: TCollisionAccuracy;
    FBoundingBox: TBoundingBox;

    function GetEdges: IShape;
    procedure CheckCollider;
    function GetAccuracy(aTotal: integer): integer;
    function GetRect(aShape: IShape): TRect;
    function GetShape(aShape: TCollisionShape): IShape;
    function GetBB: IShape;
    procedure SetCollisionShape(const aValue: TCollisionShape);
    function TestBB(aCollision: TCollision; out aRectA, aRectB, aOverlap: TRect): boolean;
    function TestColorKey(const aRectA, aRectB, aOverlap: TRect; aCollision: TCollision; out aX, aY: integer): boolean;
    procedure SetCurrentShapeAsBB;
    procedure SetCurrentShapeAsCustom;
  public
    constructor Create(aCollider: ICollider);
    destructor Destroy; override;
    function Test(aCollision: TCollision; out aX, aY: integer): boolean;
    procedure Reset;
    function GetCurrentShape: IShape;

    property Collider: ICollider read FCollider;
    property CollisionShape: TCollisionShape read FCollisionShape write SetCollisionShape;
    property Accuracy: TCollisionAccuracy read FAccuracy write FAccuracy;
    property BoundingBox: TBoundingBox read FBoundingBox write FBoundingBox;
  end;

implementation

uses
  SysUtils, {Types, }Generics.Collections,
  BB.Math.Matrix, BB.Math.Vector, BB.E3D.Vertex, BB.Math, BB.Interfaces, BB.Screen, BB.Colors, BB.Utils;

{ TCollision }

constructor TCollision.Create(aCollider: ICollider);
begin
  FCurrentShape := nil;
  FShape := nil;
  FBB := nil;
  FSiluoette := nil;
  FCollider := aCollider;
  FAccuracy := caLow;
  FBoundingBox := bbFull;

  CollisionShape := csRectangle;
end;

destructor TCollision.Destroy;
begin
  FSiluoette := nil;
  FBB := nil;
  FShape := nil;

  inherited;
end;

function TCollision.GetEdges: IShape;
begin
  Exit(IShape(FCollider.GetSurface.GetShape));
end;

function TCollision.GetShape(aShape: TCollisionShape): IShape;
var
  nx, ny, ScaledWidth, ScaledHeight, cx, cy,
  angle, x, y, z, zoom: TFloat;
  matrix: TMatrix;
  vertex, vector: TVector;
  accuracy, i, width, height: integer;
  surface: ISurface;
  //bb: TRect;

begin
  FCollider.GetInfo(x, y, z, zoom, angle);
  surface := FCollider.GetSurface;

  //width := (bb.Right - bb.Left);
  //height := (bb.Bottom - bb.Top);
  width := surface.GetWidth;
  height := surface.GetHeight;
  nx := zoom / 100;
  ny := zoom / 100;

  case FBoundingBox  of
    bbFull:
    begin
      ScaledWidth := width * nx / 2;
      ScaledHeight := height * ny / 2;
    end;

    bbManual, bbAuto:
    begin
      ScaledWidth := 0;
      ScaledHeight := 0;
      { TODO : bbAuto 0,0,31,31 }
    end;
  end;

  //cx := bb.Left + ScaledWidth;
  //cy := bb.Top + ScaledHeight;
  cx := x + ScaledWidth;
  cy := y + ScaledHeight;
  matrix := TMatrix.Create(0, 0, -angle);

  case aShape of
    csPrecise:
    begin
      if FSiluoette = nil then
      begin
        FSiluoette := GetEdges;
        if FSiluoette = nil then
          Exit(GetShape(csRectangle));
        FShape := TPolygon.Create(FSiluoette.GetSides);
      end;

      for i := 0 to FSiluoette.GetSides - 1 do
      begin
        vector := FSiluoette.GetVertex(i).ClippedVertex;
        vector.X := vector.X - (Width / 2);//ScaledWidth;
        vector.Y := vector.Y - (Height / 2);//ScaledHeight;
        vertex.Rotate(vector, matrix);

        FShape.GetVertex(i).ClippedVertex := TVector.Create(cx + (vertex.X * nx), cy + (vertex.Y * ny), z);
      end;
    end;

    csRhombus:
    begin
      //  1
      //4   2
      //  3

      //1
      vertex.Rotate(TVector.Create(0, -ScaledHeight, 0), matrix);
      FShape.GetVertex(0).ClippedVertex := TVector.Create(cx + vertex.X, cy + vertex.Y, z);

      //2
      vertex.Rotate(TVector.Create(ScaledWidth, 0, 0), matrix);
      FShape.GetVertex(1).ClippedVertex := TVector.Create(cx + vertex.X, cy + vertex.Y, z);

      //3
      vertex.Rotate(TVector.Create(0, ScaledHeight, 0), matrix);
      FShape.GetVertex(2).ClippedVertex := TVector.Create(cx + vertex.X, cy + vertex.Y, z);

      //4
      vertex.Rotate(TVector.Create(-ScaledWidth, 0, 0), matrix);
      FShape.GetVertex(3).ClippedVertex := TVector.Create(cx + vertex.X, cy + vertex.Y, z);
    end;

    csRectangle:
    begin
      FShape := FBB;
    end;

    csCircle:
    begin
      accuracy := 360 div GetAccuracy(360);
      for i := 0 to FShape.GetSides - 1 do
      begin
        vertex.Rotate(TVector.Create(ScaledWidth * Cos(i * accuracy * RAD), ScaledHeight * Sin(i * accuracy * RAD), 0), matrix);
        FShape.GetVertex(i).ClippedVertex := TVector.Create(cx + vertex.X, cy + vertex.Y, z);
      end;
    end;

    csTriangle:
    begin
      //  1
      //
      //3   2

      //1
      vertex.Rotate(TVector.Create(0, -ScaledHeight, 0), matrix);
      FShape.GetVertex(0).ClippedVertex := TVector.Create(cx + vertex.X, cy + vertex.Y, z);

      //2
      vertex.Rotate(TVector.Create(ScaledWidth, ScaledHeight, 0), matrix);
      FShape.GetVertex(1).ClippedVertex := TVector.Create(cx + vertex.X, cy + vertex.Y, z);

      //3
      vertex.Rotate(TVector.Create(-ScaledWidth, ScaledHeight, 0), matrix);
      FShape.GetVertex(2).ClippedVertex := TVector.Create(cx + vertex.X, cy + vertex.Y, z);
    end;

    csColorKey:
    begin
    end;
  end;

  result := FShape;
end;

procedure TCollision.SetCollisionShape(const aValue: TCollisionShape);
begin
  FCollisionShape := aValue;
  if FShape <> nil then
  begin
    FBB := nil;
    FShape := nil;

    Reset;
  end;

  //Always use Bounding Boxes
  FBB := TPolygon.Create(4);
  FCurrentShape := FBB;

  case FCollisionShape of
    csRectangle:
    begin
      FShape := FBB;
    end;

    csRhombus:
    begin
      FShape := TPolygon.Create(4);
    end;

    csCircle:
    begin
      FShape := TPolygon.Create(GetAccuracy(360));
    end;

    csTriangle:
    begin
      FShape := TPolygon.Create(3);
    end;
  end;
end;

function TCollision.Test(aCollision: TCollision; out aX, aY: integer): boolean;
var
  sa, sb: IShape;
  RectA, RectB, overlap: TRect;

begin
  CheckCollider;
  aCollision.CheckCollider;

  aX := -1;
  aY := -1;

  if (CollisionShape = csNone) or (aCollision.CollisionShape = csNone) or (aCollision = self) then
    Exit(False);

  //Fast bounding boxes
  SetCurrentShapeAsBB;
  aCollision.SetCurrentShapeAsBB;
  if not TestBB(aCollision, RectA, RectB, overlap) then
    Exit(False);

  if (CollisionShape = csColorKey) and (aCollision.CollisionShape = csColorKey) then
    Exit(TestColorKey(RectA, RectB, overlap, aCollision, aX, aY));

  //Custom collision test
  SetCurrentShapeAsCustom;
  aCollision.SetCurrentShapeAsCustom;
  sa := GetShape(FCollisionShape);
  sb := aCollision.GetShape(aCollision.FCollisionShape);
  if (sa = nil) or (sb = nil) then
    Exit(False);
  result := sa.Collision(sb, aX, aY) > -1;
end;

function TCollision.TestBB(aCollision: TCollision; out aRectA, aRectB, aOverlap: TRect): boolean;
begin
  aRectA := GetRect(GetBB);
  aRectB := GetRect(aCollision.GetBB);
  result := IntersectRect(aOverlap, aRectA, aRectB);
end;

function TCollision.TestColorKey(const aRectA, aRectB, aOverlap: TRect; aCollision: TCollision; out aX, aY: integer): boolean;
{ TODO :
-bpp
-optimize
}
var
  xb, yb, xa, ya,
  i, j, width, height: integer;
  BufferB, BufferA: pointer;
  ColorKeyB, ColorKeyA: cardinal;
  SurfaceB, SurfaceA: ISurface;

begin
  result := False;
  width := aOverlap.Right - aOverlap.Left - 1;
  height := aOverlap.Bottom - aOverlap.Top - 1;

  //Self
  if aRectA.Left < aOverlap.Left  then
    xa := aOverlap.Left - aRectA.Left
  else
    xa := 0;

  if aRectA.Top < aOverlap.Top  then
    ya := aOverlap.Top - aRectA.Top
  else
    ya := 0;

  //Other
  if aRectB.Left < aOverlap.Left  then
    xb := aOverlap.Left - aRectB.Left
  else
    xb := 0;

  if aRectB.Top < aOverlap.Top  then
    yb := aOverlap.Top - aRectB.Top
  else
    yb := 0;

  SurfaceB := aCollision.FCollider.GetSurface;
  SurfaceA := FCollider.GetSurface;

  SurfaceA.Lock([lfRead], nil);
  SurfaceB.Lock([lfRead], nil);
  try
    ColorKeyA := SurfaceA.GetColorKey;
    ColorKeyB := SurfaceB.GetColorKey;

    j := height;
    while j >= 0 do
    begin
      i := width;
      while i >= 0 do
      begin
        BufferA := SurfaceA.GetAddress(xa + i, ya + j);
        BufferB := SurfaceB.GetAddress(xb + i, yb + j);

        if (PCardinal(BufferA)^ <> ColorKeyA) and (PCardinal(BufferB)^ <> ColorKeyB) then
        begin
          aX := aOverlap.Left + i;
          aY := aOverlap.Top + j;
          Exit(True);
        end;

        Dec(i);
      end;

      Dec(j);
    end;

  finally
    SurfaceB.UnLock(nil);
    SurfaceA.Unlock(nil);
  end;
end;

function TCollision.GetAccuracy(aTotal: integer): integer;
begin
  case FAccuracy of
    caLow:
    begin
      result := Round(aTotal * 0.15);
    end;

    caHigh:
    begin
      result := Round(aTotal * 0.65);
    end;

    caMax:
    begin
      result := aTotal;
    end
    else
      result := Round(aTotal * 0.40);
  end;
end;

function TCollision.GetBB: IShape;
var
  nx, ny, ScaledWidth, ScaledHeight, cx, cy,
  angle, x, y, z, zoom: TFloat;
  matrix: TMatrix;
  vector: TVector;
  surface: ISurface;
  //bb: TRect;
  width, height: integer;

begin
  FCollider.GetInfo(x, y, z, zoom, angle);
  surface := FCollider.GetSurface;

  //width := (bb.Right - bb.Left);
  //height := (bb.Bottom - bb.Top);
  width := surface.GetWidth;
  height := surface.GetHeight;
  nx := zoom / 100;
  ny := zoom / 100;
  ScaledWidth := width * nx / 2;
  ScaledHeight := height * ny / 2;
  //cx := bb.Left + ScaledWidth;
  //cy := bb.Top + ScaledHeight;
  cx := x + ScaledWidth;
  cy := y + ScaledHeight;
  matrix := TMatrix.Create(0, 0, -angle);

  //1   2
  //
  //4   3

  //1
  vector.Rotate(TVector.Create(-ScaledWidth, -ScaledHeight, 0), matrix);
  FBB.GetVertex(0).ClippedVertex := TVector.Create(cx + vector.X, cy + vector.Y, z);

  //2
  vector.Rotate(TVector.Create(ScaledWidth, -ScaledHeight, 0), matrix);
  FBB.GetVertex(1).ClippedVertex := TVector.Create(cx + vector.X, cy + vector.Y, z);

  //3
  vector.Rotate(TVector.Create(ScaledWidth, ScaledHeight, 0), matrix);
  FBB.GetVertex(2).ClippedVertex := TVector.Create(cx + vector.X, cy + vector.Y, z);

  //4
  vector.Rotate(TVector.Create(-ScaledWidth, ScaledHeight, 0), matrix);
  FBB.GetVertex(3).ClippedVertex := TVector.Create(cx + vector.X, cy + vector.Y, z);

  result := FBB;
end;

function TCollision.GetCurrentShape: IShape;
begin
  Exit(FCurrentShape);
end;

procedure TCollision.SetCurrentShapeAsCustom;
begin
  FCurrentShape := FShape;
end;

procedure TCollision.SetCurrentShapeAsBB;
begin
  //Fast Bounding Boxes
  FCurrentShape := FBB;
end;

function TCollision.GetRect(aShape: IShape): TRect;
var
  v: TVector;
  i: Integer;

begin
  v := aShape.GetVertex(0).ClippedVertex;

  result.Left := Trunc(v.X);
  result.Right := result.Left;
  result.Top := Trunc(v.Y);
  result.Bottom := result.Top;

  for i := 1 to aShape.GetSides - 1 do
  begin
    v := aShape.GetVertex(i).ClippedVertex;

    if v.X < result.Left then
      result.Left := Trunc(v.X)
    else
      if v.X > result.Right then
        result.Right := Trunc(v.X);

    if v.Y < result.Top then
      result.Top := Trunc(v.Y)
    else
      if v.Y > result.Bottom then
        result.Bottom := Trunc(v.Y);
  end;
end;

procedure TCollision.Reset;
begin
  FSiluoette := nil;
end;

procedure TCollision.CheckCollider;
begin
  if FCollider = nil then
    raise Exception.Create('Collider not assigned');
end;

end.

Thanks to SyntaxHighlighter http://alexgorbatchev.com/SyntaxHighlighter/manual/themes/

What do you think mr. stranger?

Sunday, September 18, 2011

x64 release

 I've update the following links:

  • ChessKISS binaries and sources to 1.3a
  • BB Library to 1.2
  • Demos
Now they all compile in the 64 bits version of Delphi, everything seems to work except the DirectX stuff which I guess has a problem with the headers (not yet resolved), also some code is being fixed in order to compile under MACOS.

BTW, the 64 bits version of ChessKISS is not faster than the 32 bits even if lot's of stuff is using int64 (see a previous post), well is even slower...

Thursday, September 15, 2011

ChessKISS 64 bits finally works as expected!

I've finally found the error that provokes a different behaviour between the 32 and 64 bits version of ChessKISS, see this highlighted code in EvalRook():

//Check to see if the king has been forced to move and has trapped a rook at a1/b1/g1/h1,
  //if so, then penalize the trapped rook to help extricate it
  if FStage <= sMiddle then
  begin
    x := AllowedCols[col];
    if (x > 0) and (aPiece.GetRank = 1) then
    begin
      //blocked up?
      if (FBoard.GetPiece(aPiece.Forward) <> nil) and
      //Blocked left or right
         (FBoard.GetPieceType(aPiece.Index + Deltas[x]) = Kings[me]) then
        Dec(pos, TRAPPED_ROOK_PENALTY);
    end
end;

That is new and also was having an unknown behaviour in 32 bits in the trapped rook scenario:

  • Stage before ending
  • Columns 1 or 2 or 7 or 8
  • Next square occupied
  • Left or right squares, occupied by own king
But this should only be checked in rank 1!!!, in the other ranks it does not make sense, but even worst on the last rank you can only get garbage since the piece is out of the board...

Well, I will upload a fix for this and also the brand new 64 bits version

Saturday, September 10, 2011

Orient, the demo that was never released

 In case you want to know what was inside my brain 14 years ago...


(thankfully never released :-)

Old tools (really old)

 I've added some old tools used for the old demos (compression, colors, textures, 3d files, etc)

Find it in the download section

Another old release

 Every time I check the Legacy/ folder I found funny stuff, this time an intro for Micro Palma (an ancient BBS...)


It is strange to look at your old code, it can be very much crappy (being very polite...)

Anyway, I've added to the download section

Borland pascal 5.1, old libraries

 I've realized that although I've released the source of the old demos, I haven't release the source of the libraries, thus, you won't be able to compile the projects, ok, I've fix this issue, now there is a new link under the label "Borland Pascal 5.1 library"

Enjoy?

Friday, September 9, 2011

Trying Delphi XE2, testing Mac OSX

 I'm trying the Mac stuff with no success, I've followed all instructions:

  • Windows 7 IP is 192.168.2.2
  • Install VirtualBox
  • Install Mac OSX via VirtualBox 
  • Install in Mac the paserver thing from Embarcadero
  • Launch pasever at port 64211
  • VirtualBox guest IP is 192.168.2.69 forwared to 192.168.2.2 port 50000

Ping to remote OS works, but telnet on port 50000 doesn't (neither on port 64211), therefore in Delphi it does not work..., it shows: [PAClient Error] Error: EIdSocketError: Socket Error # 10061

The configuration in VirtualBox:


And the configuration in Delphi:


With Windows Firewall disabled it also does not work, so I'm really lost, somebody has been success with this kind of configuration?

Monday, September 5, 2011

Trying Delphi XE2, some performance tests, part 2

I had the feeling that the compiler does some strange optimizations somewhere since ChessKISS is slower in the 64 bits version, in order to prove that I've created the following test:


type
  TForm44 = class(TForm)
    procedure FormCreate(Sender: TObject);
  private
    FA,
    FB,
    FC,
    FD: NativeInt;
    { Private declarations }
  public
    { Public declarations }
  end;

var
  Form44: TForm44;

implementation

{$R *.dfm}

procedure TForm44.FormCreate(Sender: TObject);
var
  a, b, c, d: NativeInt;

begin
  a := 1;
  FA := a * 2;

  b := FA * 4;
  FB := b * 8;

  c := FB * 16;
  FC := c * 32;

  d := FC * 64;
  FD := d * 128;

  Caption := IntToStr(FD);
end;

end.
The optimized Win32 version:
Unit44.pas.31: a := 1;
0050FE73 B801000000       mov eax,$00000001
Unit44.pas.32: fa := a * 2;
0050FE78 8BD0             mov edx,eax
0050FE7A 03D2             add edx,edx
0050FE7C 899390030000     mov [ebx+$00000390],edx
Unit44.pas.33: b := fa * 4;
0050FE82 8BC2             mov eax,edx
0050FE84 03C0             add eax,eax
0050FE86 03C0             add eax,eax
Unit44.pas.34: fb := b * 8;
0050FE88 8BD0             mov edx,eax
0050FE8A 03D2             add edx,edx
0050FE8C 03D2             add edx,edx
0050FE8E 03D2             add edx,edx
0050FE90 899394030000     mov [ebx+$00000394],edx
Unit44.pas.35: c := fb * 16;
0050FE96 8BC2             mov eax,edx
0050FE98 C1E004           shl eax,$04
Unit44.pas.36: fc := c * 32;
0050FE9B 8BD0             mov edx,eax
0050FE9D C1E205           shl edx,$05
0050FEA0 899398030000     mov [ebx+$00000398],edx
Unit44.pas.37: d := fc * 64;
0050FEA6 8BC2             mov eax,edx
0050FEA8 C1E006           shl eax,$06
Unit44.pas.38: fd := d * 128;
0050FEAB 8BF0             mov esi,eax
0050FEAD C1E607           shl esi,$07
0050FEB0 89B39C030000     mov [ebx+$0000039c],esi
A more or less good optimization, let's see the win64 version:
Unit44.pas.31: a := 1;
000000000059C535 C7C001000000     mov eax,$00000001
Unit44.pas.32: FA := a * 2;
000000000059C53B 488D0400         lea rax,[rax+rax]
000000000059C53F 488B4D20         mov rcx,[rbp+$20]
000000000059C543 48898150060000   mov [rcx+$00000650],rax
Unit44.pas.33: b := FA * 4;
000000000059C54A 488B8150060000   mov rax,[rcx+$00000650]
000000000059C551 4803C0           add rax,rax
000000000059C554 4803C0           add rax,rax
Unit44.pas.34: FB := b * 8;
000000000059C557 488D04C500000000 lea rax,[rax*8+$0000]
000000000059C55F 488B4D20         mov rcx,[rbp+$20]
000000000059C563 48898158060000   mov [rcx+$00000658],rax
Unit44.pas.35: c := FB * 16;
000000000059C56A 488B8158060000   mov rax,[rcx+$00000658]
000000000059C571 48C1E004         shl rax,$04
Unit44.pas.36: FC := c * 32;
000000000059C575 48C1E005         shl rax,$05
000000000059C579 488B4D20         mov rcx,[rbp+$20]
000000000059C57D 48898160060000   mov [rcx+$00000660],rax
Unit44.pas.37: d := FC * 64;
000000000059C584 488B8160060000   mov rax,[rcx+$00000660]
000000000059C58B 48C1E006         shl rax,$06
Unit44.pas.38: FD := d * 128;
000000000059C58F 48C1E007         shl rax,$07
000000000059C593 488B4D20         mov rcx,[rbp+$20]
000000000059C597 48898168060000   mov [rcx+$00000668],rax
At a first glance it seems ok, but a closer look (check in blue) shows some strange results, Self accesses are not well optimized (I've noticed that also in ChessKISS), it keeps saving/reading the current value in rax register, looks like it could be easily kept on some register (and you have many in x64)

Saturday, September 3, 2011

Trying Delphi XE2, some performance tests....

 Given the following program I've measure the speed with combinations of WIN32/64 and Debug/Release:

TEST 1)


var
  i: integer;
  x, y: int64;

begin
  x := 0;
  y := 0;
  for i := 0 to MaxLongInt -1 do
  begin
    Inc(x, y);
    Inc(y);
  end;
end;
And the results expressed in milliseconds:
Debug Release
Win32 6.703 5.395
Win64 4.005 1.380
Clearly the Win64 + Release is almost four times faster than the optimized 32 bits  version, there is not big difference between non optimized versions of Win32 and Win64.
TEST 2)
Another test, this time implies multiplications:
var
  i: integer;
  x, y: int64;

begin
  x := 0;
  y := 0;
  for i := 0 to MaxLongInt -1 do
  begin
    x := x * y;
    Inc(y, i);
  end;
end;
And again the results:
Debug Release
Win32 14368 14461
Win64 5718 2398
The optimized Win64 version is seven times faster than optimized Win32, there is  no difference between optimized and non optimized versions of Win32, the conclussion is that Delphi does not know how to optimize x64 assembler when in 32 bits mode.
But it seems that they are doing a good job with the optimizations of the 64 bits, let's see if I'm able to compare with other compilers.

Trying Delphi XE2, ChessKISS 64 bits vs 32 bits

 Well, it was not difficult to make it work under 64 bits, but a bit to tune the WIN32 assembler versions that are not longer valid. I had to change the following functions:


function TBit.FirstBitSet(aValue: integer): integer;
{$IFDEF WIN32}
//EDX
ASM
  BSF EAX, EDX
END;
{$ELSE}
const
  Mod37BitPosition: array[0..36] of integer =
  (
    32, 0, 1, 26, 2, 23, 27, 0, 3, 16, 24, 30, 28, 11, 0, 13, 4, 7, 17,
    0, 25, 22, 31, 15, 29, 10, 12, 6, 0, 21, 14, 9, 5, 20, 8, 19, 18
  );

begin
  {
  result := 0;

  for i := 0 to 31 do
  begin
    if _GetBit(aValue, i) then
      Exit(i);
  end;
  }

  result := Mod37BitPosition[(-aValue and aValue) mod 37];
end;
{$ENDIF}

function TBit.LastBitSet(aValue: integer): integer;
{$IFDEF WIN32}
//EDX
ASM
  BSR EAX, EDX
END;
{$ELSE}
const
  MultiplyDeBruijnBitPosition: array[0..31] of integer =
    (0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9);

begin
  {
  result := 0;

  for i := 31 downto 0 do
  begin
    if _GetBit(aValue, i) then
      Exit(i);
  end;
  }
  result := MultiplyDeBruijnBitPosition[cardinal(aValue * $077cb531) shr 27];
end;
{$ENDIF}

function TBit.PopFirstBitSet(var aValue: integer): integer;
//In = [EDX] Out = EAX
{$IFDEF WIN32}
ASM
  PUSH ECX
  MOV ECX, [EDX]
  BSF EAX, ECX
  BTR [EDX], EAX
  POP ECX
END;
{$ELSE}
begin
  result := FirstBitSet(aValue);
  _Clear(aValue, result);
end;
{$ENDIF}

Coming from Delphi 2010 is a bit strange this build configuration, I've set it up to release, this has boost again the performance so is not longer two times slower, silly me...


Some tests from initial position.

Win32)
02/09/2011 23:52:29, Send: 1 77 0 57 Nb1c3 
02/09/2011 23:52:29, Send: 2 5 0 228 Nb1c3 Nb8c6 
02/09/2011 23:52:29, Send: 3 69 0 359 Nb1c3 Nb8c6 Ng1f3 
02/09/2011 23:52:29, Send: 4 5 1 5309 Nb1c3 Nb8c6 Ng1f3 Ng8f6 
02/09/2011 23:52:29, Send: 5 52 0 7137 Nb1c3 Nb8c6 Ng1f3 Ng8f6 d4 
02/09/2011 23:52:29, Send: 6 39 1 12584 Nb1c3 Nb8c6 d3 Ng8f6 Bc1e3 d5 
02/09/2011 23:52:29, Send: 7 56 4 30821 Nb1c3 d6 Ng1f3 Bc8e6 e4 Nb8c6 Bf1b5 
02/09/2011 23:52:30, Send: 8 13 25 107775 Ng1f3 d5 e3 Bc8e6 Bf1b5 Nb8c6 Nf3e5 Ng8f6 Nb1c3 
02/09/2011 23:52:30, Send: 9 9 35 220814 Ng1f3 d5 e3 Nb8c6 Bf1b5 Ng8f6 Nb1c3 Bc8g4 d4 Bg4xf3 
02/09/2011 23:52:31, Send: 10 7 78 451966 Ng1f3 d5 e3 Nb8c6 Bf1b5 Ng8f6 Nb1c3 Bc8g4 d4 e6 
02/09/2011 23:52:32, Send: 11 12 157 905577 Ng1f3 d5 e3 Nb8c6 Bf1b5 Ng8f6 Nb1c3 Bc8d7 d4 a6 Bb5d3 
02/09/2011 23:52:39, Send: 12 16 650 2598910 e4 e5 Nb1c3 Nb8c6 Ng1f3 Ng8f6 Bf1b5 Bf8d6 d3 O-O Bc1g5 Kg8h8 
02/09/2011 23:52:48, Send: 13 23 858 4698110 e4 Ng8f6 Nb1c3 d5 e4xd5 Nf6xd5 Ng1f3 Nb8c6 Bf1b5 Bc8g4 O-O a6 Bb5xc6 b7xc6 
02/09/2011 23:52:58, Send: 14 21 1085 7551076 e4 Ng8f6 Nb1c3 d5 e4xd5 Nf6xd5 Ng1f3 Nb8c6 Bf1b5 Nd5xc3 b2xc3 Qd8d5 Ra1b1 Bc8d7 
Depth 14 reached in 32 seconds.
Win64)
03/09/2011 0:01:30, Send: 1 77 0 57 Nb1c3 
03/09/2011 0:01:30, Send: 2 5 0 228 Nb1c3 Nb8c6 
03/09/2011 0:01:30, Send: 3 69 0 359 Nb1c3 Nb8c6 Ng1f3 
03/09/2011 0:01:30, Send: 4 5 1 5309 Nb1c3 Nb8c6 Ng1f3 Ng8f6 
03/09/2011 0:01:30, Send: 5 52 0 7137 Nb1c3 Nb8c6 Ng1f3 Ng8f6 d4 
03/09/2011 0:01:30, Send: 6 39 1 12584 Nb1c3 Nb8c6 d3 Ng8f6 Bc1e3 d5 
03/09/2011 0:01:30, Send: 7 56 6 30821 Nb1c3 d6 Ng1f3 Bc8e6 e4 Nb8c6 Bf1b5 
03/09/2011 0:01:31, Send: 8 13 32 107775 Ng1f3 d5 e3 Bc8e6 Bf1b5 Nb8c6 Nf3e5 Ng8f6 Nb1c3 
03/09/2011 0:01:31, Send: 9 9 51 220817 Ng1f3 d5 e3 Nb8c6 Bf1b5 Ng8f6 Nb1c3 Bc8g4 d4 Bg4xf3 
03/09/2011 0:01:32, Send: 10 7 106 451687 Ng1f3 d5 e3 Nb8c6 Bf1b5 Ng8f6 Nb1c3 Bc8g4 d4 e6 
03/09/2011 0:01:34, Send: 11 12 215 908965 Ng1f3 d5 e3 Nb8c6 Bf1b5 Ng8f6 Nb1c3 Bc8d7 d4 a6 Bb5d3 
03/09/2011 0:01:43, Send: 12 19 900 2607087 e4 e5 Nb1c3 Nb8c6 Ng1f3 Bf8d6 Bf1b5 Ng8e7 O-O a6 Bb5d3 Bd6c5 
03/09/2011 0:01:53, Send: 13 23 931 4361400 e4 Ng8f6 Nb1c3 d5 e4xd5 Nf6xd5 Ng1f3 Nb8c6 Bf1b5 Bc8g4 O-O a6 Bb5xc6 b7xc6 
03/09/2011 0:02:10, Send: 14 21 1780 7805174 e4 Ng8f6 Nb1c3 d5 e4xd5 Nf6xd5 Ng1f3 Nb8c6 Bf1b5 Nd5xc3 b2xc3 Qd8d5 Ra1b1 Bc8d7 Bb5d3
Depth 14 reached in 40 seconds, 10 seconds slower!, but the worst thing is that they are different, OMG!
02/09/2011 23:52:29, Send: 1 77 0 57 Nb1c3 
03/09/2011 0:01:30,  Send: 1 77 0 57 Nb1c3 
02/09/2011 23:52:29, Send: 2 5 0 228 Nb1c3 Nb8c6 
03/09/2011 0:01:30,  Send: 2 5 0 228 Nb1c3 Nb8c6 
02/09/2011 23:52:29, Send: 3 69 0 359 Nb1c3 Nb8c6 Ng1f3 
03/09/2011 0:01:30,  Send: 3 69 0 359 Nb1c3 Nb8c6 Ng1f3 
02/09/2011 23:52:29, Send: 4 5 1 5309 Nb1c3 Nb8c6 Ng1f3 Ng8f6 
03/09/2011 0:01:30,  Send: 4 5 1 5309 Nb1c3 Nb8c6 Ng1f3 Ng8f6 
02/09/2011 23:52:29, Send: 5 52 0 7137 Nb1c3 Nb8c6 Ng1f3 Ng8f6 d4 
03/09/2011 0:01:30,  Send: 5 52 0 7137 Nb1c3 Nb8c6 Ng1f3 Ng8f6 d4 
02/09/2011 23:52:29, Send: 6 39 1 12584 Nb1c3 Nb8c6 d3 Ng8f6 Bc1e3 d5 
03/09/2011 0:01:30,  Send: 6 39 1 12584 Nb1c3 Nb8c6 d3 Ng8f6 Bc1e3 d5 
02/09/2011 23:52:29, Send: 7 56 4 30821 Nb1c3 d6 Ng1f3 Bc8e6 e4 Nb8c6 Bf1b5 
03/09/2011 0:01:30,  Send: 7 56 6 30821 Nb1c3 d6 Ng1f3 Bc8e6 e4 Nb8c6 Bf1b5 
02/09/2011 23:52:30, Send: 8 13 25 107775 Ng1f3 d5 e3 Bc8e6 Bf1b5 Nb8c6 Nf3e5 Ng8f6 Nb1c3 
03/09/2011 0:01:31,  Send: 8 13 32 107775 Ng1f3 d5 e3 Bc8e6 Bf1b5 Nb8c6 Nf3e5 Ng8f6 Nb1c3 
02/09/2011 23:52:30, Send: 9 9 35 220814 Ng1f3 d5 e3 Nb8c6 Bf1b5 Ng8f6 Nb1c3 Bc8g4 d4 Bg4xf3 
03/09/2011 0:01:31,  Send: 9 9 51 220817 Ng1f3 d5 e3 Nb8c6 Bf1b5 Ng8f6 Nb1c3 Bc8g4 d4 Bg4xf3 
02/09/2011 23:52:31, Send: 10 7 78 451966 Ng1f3 d5 e3 Nb8c6 Bf1b5 Ng8f6 Nb1c3 Bc8g4 d4 e6 
03/09/2011 0:01:32,  Send: 10 7 106 451687 Ng1f3 d5 e3 Nb8c6 Bf1b5 Ng8f6 Nb1c3 Bc8g4 d4 e6 
Different number of searches!

02/09/2011 23:52:32, Send: 11 12 157 905577 Ng1f3 d5 e3 Nb8c6 Bf1b5 Ng8f6 Nb1c3 Bc8d7 d4 a6 Bb5d3 
03/09/2011 0:01:34,  Send: 11 12 215 908965 Ng1f3 d5 e3 Nb8c6 Bf1b5 Ng8f6 Nb1c3 Bc8d7 d4 a6 Bb5d3 
Different number of searches!

02/09/2011 23:52:39, Send: 12 16 650 2598910 e4 e5 Nb1c3 Nb8c6 Ng1f3 Ng8f6 Bf1b5 Bf8d6 d3 O-O Bc1g5 Kg8h8 
03/09/2011 0:01:43,  Send: 12 19 900 2607087 e4 e5 Nb1c3 Nb8c6 Ng1f3 Bf8d6 Bf1b5 Ng8e7 O-O a6 Bb5d3 Bd6c5 

02/09/2011 23:52:48, Send: 13 23 858 4698110 e4 Ng8f6 Nb1c3 d5 e4xd5 Nf6xd5 Ng1f3 Nb8c6 Bf1b5 Bc8g4 O-O a6 Bb5xc6 b7xc6 
03/09/2011 0:01:53,  Send: 13 23 931 4361400 e4 Ng8f6 Nb1c3 d5 e4xd5 Nf6xd5 Ng1f3 Nb8c6 Bf1b5 Bc8g4 O-O a6 Bb5xc6 b7xc6 

02/09/2011 23:52:58, Send: 14 21 1085 7551076 e4 Ng8f6 Nb1c3 d5 e4xd5 Nf6xd5 Ng1f3 Nb8c6 Bf1b5 Nd5xc3 b2xc3 Qd8d5 Ra1b1 Bc8d7 
03/09/2011 0:02:10,  Send: 14 21 1780 7805174 e4 Ng8f6 Nb1c3 d5 e4xd5 Nf6xd5 Ng1f3 Nb8c6 Bf1b5 Nd5xc3 b2xc3 Qd8d5 Ra1b1 Bc8d7 Bb5d3
Hooray!, more not so funny debugging sessions in order to find nasty bugs, let's see how I will tackle this...

Friday, September 2, 2011

Trying Delphi XE2, first impressions...

 Yesterday I downladed the 30 days demo of the brand new Delphi XE2 with 64 bits and Mac support, I want to share my feelings.

I Don't like:
-No assembler, I had to create quite a few WIN32/64 defines and then create de Pascal versions of those heavily optimized versions.
-My DirectX stuff does not work, I've to find out why...
-64 bits generated assembler code seems to be terrible!, ChessKISS is two times slower in 64 bits when it should be faster since a lot of int64 are used, I will create a more specific post about this.

I like:
-99% of the sutff compiles
-Finds unused variables that Delphi 2010 does not.
-Win64 support
-Mac support (even if I cannot try it)

Considerations:
-Casting pointers, objects or interfaces as integer does not work, one must cast them into int64

Bugs?
-D3DXVector3(-1, 0, 1) returns  "[DCC Error] BB.Screen.D3D.pas(895): E2029 ')' expected but ',' found", WTF?!, this function looks like this


//--------------------------
// 3D Vector
//--------------------------

function D3DXVector3(_x, _y, _z: Single): TD3DXVector3; inline;
begin
  with Result do
  begin
    x:= _x; y:= _y; z:=_z;
  end;
end;
-The following code gives the internal error "[DCC Fatal Error] BB.Sync.pas(1182): F2084 Internal Error: AV0A1B31D1-RFFFFFFFF-0"
constructor TFuture<T>.Create(aEvent: TNotification<T>; aPriority: TThreadPriority);
var
  e: TNotification<T>;

begin
  FResult := Default(T);
  FError := '';

  FWorker := TFutureThread<T>.Create(aEvent);
  FWorker.Priority := aPriority;
  FWorker.FreeOnTerminate := False;
end
with this workaround seems to at least compile...
constructor TFuture<T>.Create(aEvent: TNotification<T>; aPriority: TThreadPriority);
var
  e: TNotification<T>;

begin
  FResult := Default(T);
  FError := '';

  //Bloddy XE...
  e := aEvent;
  FWorker := TFutureThread<T>.Create(e);
  FWorker.Priority := aPriority;
  FWorker.FreeOnTerminate := False;
end;


-The following code does not compile, but I guess is something to do with the types of the windows function.


function GetCPUCount: integer;
var
  ProcessMask: DWORD;
  SystemMask: DWORD;

begin
  Result := 1;
  if GetProcessAffinityMask(GetCurrentProcess, ProcessMask, SystemMask) then
  begin
    while (ProcessMask <> 0) do
    begin
      if Odd(ProcessMask) then
        Inc(Result);
      ProcessMask := ProcessMask shr 1;
    end;
    Dec(Result);
  end;
end;

[DCC Error] BB.Utils.pas(1031): E2033 Types of actual and formal var parameters must be identical.