Comment #0 by bearophile_hugs — 2013-06-06T17:39:54Z
This program runs about 6 times faster compiled with ldc2 compared to dmd (I run it with 100000000 command line argument):
import core.stdc.stdio, std.random, std.conv;
void main(in string[] args) {
immutable uint N = (args.length == 2) ?
args[1].to!uint :
1_000;
auto rng = Xorshift(0);
uint total = 0;
for (uint i = 0; i < N; i++)
total += uniform(0, 10, rng);
printf("%u\n", total);
}
If I replace this line:
total += uniform(0, 10, rng);
with:
total += rng.front; rng.popFront;
Then the code compiled with ldc2 is only about 30% faster.
dmd 2.064alpha, -O -release -inline -noboundscheck.
Ldc2 V. 0.11.0, based on DMD v2.062 and LLVM 3.3svn, -O5 -release -profile-verifier-noassert.
Comment #1 by bearophile_hugs — 2013-06-06T17:54:27Z
This alternative version of the program is handy to produce a more readable asm output:
import core.stdc.stdio: printf;
import core.stdc.stdlib: atoi;
import std.random: Xorshift, uniform;
uint randomSum(in uint n, Xorshift rng) {
uint total = 0;
for (uint i = 0; i < n; i++)
total += uniform(0, 10, rng);
return total;
}
void main(in string[] args) {
immutable uint N = (args.length == 2) ?
atoi((args[1] ~ '\0').ptr) :
1_000;
auto rng = Xorshift(0);
immutable result = randomSum(N, rng);
printf("%u\n", result);
}
LDC2 produces for the randomSum() function (the inner loop is 26 instructions long, with no call instructions):
__D4temp9randomSumFxkS3std6random38__T14XorshiftEngineTkVi128Vi11Vi8Vi19Z14XorshiftEngineZk:
pushl %ebp
pushl %ebx
pushl %edi
pushl %esi
pushl %eax
xorl %eax, %eax
cmpl $0, 40(%esp)
je LBB0_5
leal 24(%esp), %ecx
movl 8(%ecx), %esi
movl (%ecx), %eax
movl 4(%ecx), %edx
movl 12(%ecx), %ebp
xorl %ebx, %ebx
movl $0, (%esp)
.align 16, 0x90
LBB0_2:
movl %esi, %edi
movl %edx, %ecx
movl %ebp, %esi
movl %eax, %ebp
shll $11, %ebp
xorl %eax, %ebp
movl %esi, %eax
shrl $19, %eax
xorl %esi, %eax
xorl %ebp, %eax
shrl $8, %ebp
xorl %eax, %ebp
cmpl $-7, %esi
movl %edi, %edx
movl %ecx, %eax
ja LBB0_2
movl %esi, %eax
movl $671088641, %edx
mull %edx
shrl $26, %edx
addl %edx, (%esp)
incl %ebx
cmpl 40(%esp), %ebx
movl %edi, %edx
movl %ecx, %eax
jne LBB0_2
leal 24(%esp), %eax
movl %ebp, 12(%eax)
movl %ecx, (%eax)
movl %edi, 4(%eax)
movl %esi, 8(%eax)
movl (%esp), %eax
LBB0_5:
addl $4, %esp
popl %esi
popl %edi
popl %ebx
popl %ebp
ret $20
Maybe it's also a matter of missed inlining.
Comment #2 by robert.schadek — 2024-12-13T18:07:46Z