#include <stdio.h>
#include <math.h>
/* Operands of the computation in main(); file-scope so they appear as
   named symbols in the generated assembly. */
double a;
double b;

/* Square of x. The argument text is expanded twice, so an argument
   with side effects would be evaluated twice. */
#define SQR(x) ((x)*(x))

/* Computes sqrt(sin(a)^2 + cos(b)^2) for a = b = 0.5 and prints it. */
int main()
{
    double sin_sq, cos_sq, sum;

    /* Both operands hold the same angle. */
    a = b = 0.5;

    /* Build the two squared terms separately, then combine them. */
    sin_sq = SQR(sin(a));
    cos_sq = SQR(cos(b));
    sum = sqrt(sin_sq + cos_sq);

    printf("sum = %f\n", sum);
    return 0;
}
#include <math.h>
/* Operands of the computation in main(); file-scope so they appear as
   named symbols in the generated assembly. */
double a;
double b;

/* Square of x. The argument text is expanded twice, so an argument
   with side effects would be evaluated twice. */
#define SQR(x) ((x)*(x))

/* Computes sqrt(sin(a)^2 + cos(b)^2) for a = b = 0.5 and prints it. */
int main()
{
    double sin_sq, cos_sq, sum;

    /* Both operands hold the same angle. */
    a = b = 0.5;

    /* Build the two squared terms separately, then combine them. */
    sin_sq = SQR(sin(a));
    cos_sq = SQR(cos(b));
    sum = sqrt(sin_sq + cos_sq);

    printf("sum = %f\n", sum);
    return 0;
}
CFLAGS is set to "-mtune=nocona -mfpmath=sse -msse3 -O3 -ffast-math"
The source code above, after being compiled with GCC (e.g., gcc -S $CFLAGS test.c), gives:
# Compiler-generated listing: 32-bit x86, GAS / AT&T syntax (source, destination).
# It implements main(): sum = sqrt(sin(a)^2 + cos(b)^2) with a = b = 0.5,
# followed by printf("sum = %f\n", sum) and return 0.
.file "ssetest.c"
.def ___main; .scl 2; .type 32; .endef
.section .rdata,"dr"
# LC1: the printf format string; \12 is the octal escape for newline and
# \0 is the explicit terminator.
LC1:
.ascii "sum = %f\12\0"
.align 8
# LC2: one 8-byte double stored as two little-endian 32-bit words.
# High word 1071644672 = 0x3FE00000, low word 0, i.e. the IEEE-754
# double 0.5. This single constant serves both a and b.
LC2:
.long 0
.long 1071644672
.text
.globl _main
.def _main; .scl 2; .type 32; .endef
_main:
# Prologue: establish the %ebp frame, reserve 24 bytes of locals and
# round %esp down to a 16-byte boundary.
# %eax = 16 is loaded before the call to __alloca.
# NOTE(review): presumably __alloca takes its size argument in %eax and
# extends the stack by that amount - confirm against the toolchain runtime.
pushl %ebp
movl $16, %eax
movl %esp, %ebp
subl $24, %esp
andl $-16, %esp
call __alloca
# NOTE(review): ___main is presumably the runtime's startup hook that the
# compiler inserts at the top of main - confirm.
call ___main
# Load 0.5 onto the x87 stack: st0 = 0.5.
fldl LC2
# First printf argument: address of the format string at (%esp).
movl $LC1, (%esp)
# Duplicate the value: st0 = 0.5, st1 = 0.5.
fld %st(0)
# a = 0.5; b = 0.5 (fstl stores without popping).
fstl _a
fstl _b
# x87 stack choreography, net effect st0 = sin(0.5), st1 = cos(0.5):
#   fxch  -> swap the two copies of 0.5
#   fsin  -> st0 = sin(0.5)
#   fxch  -> st0 = 0.5, st1 = sin(0.5)
#   fcos  -> st0 = cos(0.5)
#   fxch  -> st0 = sin(0.5), st1 = cos(0.5)
fxch %st(1)
fsin
fxch %st(1)
fcos
fxch %st(1)
# Move both x87 results into SSE registers through the 8-byte scratch
# slot at -8(%ebp); each fstpl stores and pops.
# xmm2 = sin(0.5)
fstpl -8(%ebp)
movsd -8(%ebp), %xmm2
# xmm0 = cos(0.5); the x87 stack is empty after this pop.
fstpl -8(%ebp)
movsd -8(%ebp), %xmm0
# Scalar double SSE arithmetic:
#   xmm2 = sin^2, xmm0 = cos^2, xmm2 = sin^2 + cos^2, xmm1 = sqrt(xmm2)
mulsd %xmm2, %xmm2
mulsd %xmm0, %xmm0
addsd %xmm0, %xmm2
sqrtsd %xmm2, %xmm1
# Second printf argument: the 8-byte double 'sum' at 4(%esp), directly
# after the format-string pointer.
movsd %xmm1, 4(%esp)
call _printf
# return 0; 'leave' restores %esp and %ebp from the frame.
xorl %eax, %eax
leave
ret
# Common (uninitialised) storage for the globals a and b.
# NOTE(review): 16 bytes are reserved for each; the trailing "# 8" emitted
# by the compiler presumably records the object's real size (an 8-byte
# double) - confirm.
.comm _a, 16 # 8
.comm _b, 16 # 8
.def _printf; .scl 3; .type 32; .endef
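The code is quite efficient. fsin/fcos compute the sine and cosine in CPU hardware (no library call or software emulation). It also uses the SSE registers (xmm0, xmm1, xmm2) for the arithmetic; these are not MMX registers, which are mm0-mm7. Memory traffic is small but not zero: each x87 result is stored to the stack slot at -8(%ebp) and reloaded into an SSE register before the multiply, add and square root.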