#include <stdio.h>

#define VECTOR_SIZE 4

/* Vector of VECTOR_SIZE single-precision floats (GCC/Clang vector extension). */
typedef float v4sf __attribute__ ((vector_size(sizeof(float) * VECTOR_SIZE)));

/*
 * Overlays the vector value with a plain float array so that individual
 * lanes can be read back by index.
 */
typedef union f4vector {
    v4sf v;
    float f[VECTOR_SIZE];
} f4vector;

/*
 * Print every lane of *v on one line, formatted with "%f" and separated by
 * commas, followed by a newline.
 */
void print_vector(const f4vector *v)
{
    for (int i = 0; i < VECTOR_SIZE; i++) {
        /* No separator before the first lane. */
        printf("%s%f", i ? "," : "", v->f[i]);
    }
    printf("\n");
}

/* Build two vectors, add them lane by lane, and print operands and result. */
int main(void)
{
    union f4vector a, b, c;

    a.v = (v4sf){1., 2., 3., 4.};
    b.v = (v4sf){5., 6., 7., 8.};

    /* A single vector expression adds all lanes of a and b. */
    c.v = a.v + b.v;

    print_vector(&a);
    print_vector(&b);
    print_vector(&c);
    return 0;
}
Compile with the following command:
gcc -ggdb -mtune=pentium3 -march=pentium3 -c -O3 -ffast-math -mfpmath=sse -msse5 sse.c
To test, just link the object code into a binary and run it:
gcc -lm sse.o -o sse
$ ./sse
1.000000,2.000000,3.000000,4.000000
5.000000,6.000000,7.000000,8.000000
6.000000,8.000000,10.000000,12.000000
The disassembled code:
$ objdump -dS ./sse.o | grep -2 c.v | tail -8
  7c:   0f 58 c1                addps  %xmm1,%xmm0
  7f:   0f 29 45 c8             movaps %xmm0,-0x38(%ebp)
--
 120:   f2 0f 11 44 24 04       movsd  %xmm0,0x4(%esp)
 126:   e8 00 00 00 00          call   12b <_main+0xdb>
    c.v = a.v + b.v;
    print_vector(&a);
As we can see, the code is well optimized: adding the four components of vectors a and b is done in a single SSE instruction (addps), instead of the multiple instructions we would get without -msse and -mfpmath=sse.
How fast is the program?
$ time ./sse
1.000000,2.000000,3.000000,4.000000
5.000000,6.000000,7.000000,8.000000
6.000000,8.000000,10.000000,12.000000

real    0m0.109s
user    0m0.046s
sys     0m0.030s