Two example cases:
loop1.d
---------
void main()
{
for (int i = 0; i < int.max; i++)
{
}
}
loop2.d
---------
void main()
{
int i = 0;
while(i < int.max)
{
i++;
}
}
$ dmd loop1.d
$ time ./loop1
real 0m2.914s
user 0m2.884s
sys 0m0.012s
$ ./dmd loop1.d -O
$ time ./loop1
real 0m5.695s
user 0m5.684s
sys 0m0.004s
$ ./dmd loop2.d
$ time ./loop2
real 0m2.912s
user 0m2.892s
sys 0m0.004s
$ ./dmd loop2.d -O
$ time ./loop2
real 0m5.703s
user 0m5.688s
sys 0m0.004s
Execution time almost doubles when optimisations are turned on. Something isn't right here...
Comment #1 by spam — 2010-12-01T16:50:43Z
for the lazy #5100 is
Comment #2 by spam — 2010-12-01T16:51:37Z
(In reply to comment #1)
> for the lazy #5100 is
Oops, very sorry: this comment was meant for bug #5294, because I think it is related.
Comment #3 by clugdbug — 2010-12-06T01:20:51Z
Cannot reproduce. On Windows, for both test cases, without -O it's about 5 seconds (does an INC and CMP of a stack variable). With -O it is about 1 second (just does INC and CMP of EAX).
Comment #4 by ibuclaw — 2010-12-06T14:51:32Z
objdump without -O on Linux:
push %ebp
mov %esp,%ebp
sub $0x4,%esp
movl $0x0,-0x4(%ebp)
cmpl $0x7fffffff,-0x4(%ebp)
jge 1c <_Dmain+0x1c>
addl $0x1,-0x4(%ebp)
jmp d <_Dmain+0xd>
xor %eax,%eax
leave
ret
objdump with -O on Linux:
push %ebp
mov %esp,%ebp
xor %eax,%eax
add $0x1,%eax
cmp $0x7fffffff,%eax
jb 5 <_Dmain+0x5>
pop %ebp
xor %eax,%eax
ret
Looks to be the same as what Don said he saw on his Windows box.
Wonder why Linux is slower... (must be a quirk; either that, or my Intel Atom CPU is to blame).
Comment #5 by ibuclaw — 2010-12-07T00:59:45Z
Been playing about with GCC; this seems to perform better:
Objdump:
push %ebp
mov %esp,%ebp
and $0xfffffff0,%esp
push %eax
sub $0xc,%esp
lea 0x0(%esi),%esi
add $0x1,%eax
cmp $0x7fffffff,%eax
jne 30 <_Dmain+0x10>
add $0xc,%esp
mov %ebp,%esp
pop %ebp
ret
GCC assembly:
.globl _Dmain
.type _Dmain, @function
_Dmain:
.LFB0:
.cfi_startproc
.cfi_personality 0x0,__gdc_personality_v0
pushl %ebp
.cfi_def_cfa_offset 8
movl %esp, %ebp
.cfi_offset 5, -8
.cfi_def_cfa_register 5
andl $-16, %esp
pushl %eax
.cfi_escape 0x10,0x3,0x7,0x55,0x9,0xf0,0x1a,0x9,0xfc,0x22
subl $12, %esp
.p2align 4,,7
.p2align 3
.L4:
addl $1, %eax
cmpl $2147483647, %eax
jne .L4
addl $12, %esp
movl %ebp, %esp
popl %ebp
ret
.cfi_endproc
.LFE0:
.size _Dmain, .-_Dmain
Can attach the full .s file if needed.
Regards
Comment #6 by bugzilla — 2012-01-19T14:08:23Z
Perhaps it's because gcc is doing:
ADD EAX,1
instead of:
INC EAX
Comment #7 by ibuclaw — 2012-01-19T15:33:26Z
Maybe not...
I actually get the reverse on my new laptop with 2.057,
$ dmd loop2.d
$ objdump loop2.o -d
push %ebp
mov %esp,%ebp
sub $0x4,%esp
movl $0x0,-0x4(%ebp)
cmpl $0x7fffffff,-0x4(%ebp)
jge 1b <_Dmain+0x1b>
incl -0x4(%ebp)
jmp d <_Dmain+0xd>
xor %eax,%eax
leave
ret
$ time ./loop2
real 0m11.780s
user 0m11.769s
sys 0m0.004s
$ dmd loop2.d -O
$ objdump loop2.o -d
push %ebp
mov %esp,%ebp
xor %eax,%eax
inc %eax
cmp $0x7fffffff,%eax
jb 5 <_Dmain+0x5>
pop %ebp
xor %eax,%eax
ret
$ time ./loop2
real 0m3.936s
user 0m3.924s
sys 0m0.008s
Comment #8 by ibuclaw — 2012-01-19T15:39:32Z
And on my netbook:
$ dmd loop2.d
$ time ./loop2
real 0m2.948s
user 0m2.924s
sys 0m0.012s
$ dmd loop2.d -O
$ time ./loop2
real 0m5.725s
user 0m5.688s
sys 0m0.012s
Specs of Netbook:
processor : 0
vendor_id : GenuineIntel
cpu family : 6
model : 28
model name : Intel(R) Atom(TM) CPU N270 @ 1.60GHz
stepping : 2
cpu MHz : 800.000
cache size : 512 KB
cpu cores : 1
Specs of Laptop:
processor : 0
vendor_id : AuthenticAMD
cpu family : 20
model : 2
model name : AMD E-450 APU with Radeon(tm) HD Graphics
stepping : 0
cpu MHz : 825.000
cache size : 512 KB
cpu cores : 2
Regards
Comment #9 by ibuclaw — 2012-01-19T15:48:04Z
My gut feeling is that the main source of it slowing down is the needless push and pop of the frame pointer.
Comment #10 by ibuclaw — 2015-08-09T06:27:51Z
For a while now I've been thinking that the bottleneck probably has to do with alignment, but I'd have to get out my (now two generations old) Atom netbook to investigate further.
Comment #11 by robert.schadek — 2024-12-13T17:53:59Z