Thursday, December 30, 2010

SSE built-in functions in GCC

int __builtin_ia32_comieq (v4sf, v4sf)
int __builtin_ia32_comineq (v4sf, v4sf)
int __builtin_ia32_comilt (v4sf, v4sf)
int __builtin_ia32_comile (v4sf, v4sf)
int __builtin_ia32_comigt (v4sf, v4sf)
int __builtin_ia32_comige (v4sf, v4sf)
int __builtin_ia32_ucomieq (v4sf, v4sf)
int __builtin_ia32_ucomineq (v4sf, v4sf)
int __builtin_ia32_ucomilt (v4sf, v4sf)
int __builtin_ia32_ucomile (v4sf, v4sf)
int __builtin_ia32_ucomigt (v4sf, v4sf)
int __builtin_ia32_ucomige (v4sf, v4sf)
v4sf __builtin_ia32_addps (v4sf, v4sf)
v4sf __builtin_ia32_subps (v4sf, v4sf)
v4sf __builtin_ia32_mulps (v4sf, v4sf)
v4sf __builtin_ia32_divps (v4sf, v4sf)
v4sf __builtin_ia32_addss (v4sf, v4sf)
v4sf __builtin_ia32_subss (v4sf, v4sf)
v4sf __builtin_ia32_mulss (v4sf, v4sf)
v4sf __builtin_ia32_divss (v4sf, v4sf)
v4si __builtin_ia32_cmpeqps (v4sf, v4sf)
v4si __builtin_ia32_cmpltps (v4sf, v4sf)
v4si __builtin_ia32_cmpleps (v4sf, v4sf)
v4si __builtin_ia32_cmpgtps (v4sf, v4sf)
v4si __builtin_ia32_cmpgeps (v4sf, v4sf)
v4si __builtin_ia32_cmpunordps (v4sf, v4sf)
v4si __builtin_ia32_cmpneqps (v4sf, v4sf)
v4si __builtin_ia32_cmpnltps (v4sf, v4sf)
v4si __builtin_ia32_cmpnleps (v4sf, v4sf)
v4si __builtin_ia32_cmpngtps (v4sf, v4sf)
v4si __builtin_ia32_cmpngeps (v4sf, v4sf)
v4si __builtin_ia32_cmpordps (v4sf, v4sf)
v4si __builtin_ia32_cmpeqss (v4sf, v4sf)
v4si __builtin_ia32_cmpltss (v4sf, v4sf)
v4si __builtin_ia32_cmpless (v4sf, v4sf)
v4si __builtin_ia32_cmpunordss (v4sf, v4sf)
v4si __builtin_ia32_cmpneqss (v4sf, v4sf)
v4si __builtin_ia32_cmpnlts (v4sf, v4sf)
v4si __builtin_ia32_cmpnless (v4sf, v4sf)
v4si __builtin_ia32_cmpordss (v4sf, v4sf)
v4sf __builtin_ia32_maxps (v4sf, v4sf)
v4sf __builtin_ia32_maxss (v4sf, v4sf)
v4sf __builtin_ia32_minps (v4sf, v4sf)
v4sf __builtin_ia32_minss (v4sf, v4sf)
v4sf __builtin_ia32_andps (v4sf, v4sf)
v4sf __builtin_ia32_andnps (v4sf, v4sf)
v4sf __builtin_ia32_orps (v4sf, v4sf)
v4sf __builtin_ia32_xorps (v4sf, v4sf)
v4sf __builtin_ia32_movss (v4sf, v4sf)
v4sf __builtin_ia32_movhlps (v4sf, v4sf)
v4sf __builtin_ia32_movlhps (v4sf, v4sf)
v4sf __builtin_ia32_unpckhps (v4sf, v4sf)
v4sf __builtin_ia32_unpcklps (v4sf, v4sf)
v4sf __builtin_ia32_cvtpi2ps (v4sf, v2si)
v4sf __builtin_ia32_cvtsi2ss (v4sf, int)
v2si __builtin_ia32_cvtps2pi (v4sf)
int __builtin_ia32_cvtss2si (v4sf)
v2si __builtin_ia32_cvttps2pi (v4sf)
int __builtin_ia32_cvttss2si (v4sf)
v4sf __builtin_ia32_rcpps (v4sf)
v4sf __builtin_ia32_rsqrtps (v4sf)
v4sf __builtin_ia32_sqrtps (v4sf)
v4sf __builtin_ia32_rcpss (v4sf)
v4sf __builtin_ia32_rsqrtss (v4sf)
v4sf __builtin_ia32_sqrtss (v4sf)
v4sf __builtin_ia32_shufps (v4sf, v4sf, int)
void __builtin_ia32_movntps (float *, v4sf)
int __builtin_ia32_movmskps (v4sf)

The following built-in functions are available when -msse is used.

v4sf __builtin_ia32_loadaps (float *)
Generates the movaps machine instruction as a load from memory. 
void __builtin_ia32_storeaps (float *, v4sf)
Generates the movaps machine instruction as a store to memory. 
v4sf __builtin_ia32_loadups (float *)
Generates the movups machine instruction as a load from memory. 
void __builtin_ia32_storeups (float *, v4sf)
Generates the movups machine instruction as a store to memory. 
v4sf __builtin_ia32_loadsss (float *)
Generates the movss machine instruction as a load from memory. 
void __builtin_ia32_storess (float *, v4sf)
Generates the movss machine instruction as a store to memory. 
v4sf __builtin_ia32_loadhps (v4sf, v2si *)
Generates the movhps machine instruction as a load from memory. 
v4sf __builtin_ia32_loadlps (v4sf, v2si *)
Generates the movlps machine instruction as a load from memory 
void __builtin_ia32_storehps (v4sf, v2si *)
Generates the movhps machine instruction as a store to memory. 
void __builtin_ia32_storelps (v4sf, v2si *)
Generates the movlps machine instruction as a store to memory.

Wednesday, December 29, 2010

Vector Addition using SIMD

#include 

#define VECTOR_SIZE         4
typedef float v4sf __attribute__ ((vector_size(sizeof(float)*VECTOR_SIZE))); // vector of four singl
e floats

typedef union f4vector
{
    v4sf    v;
    float   f[VECTOR_SIZE];
} f4vector;


void print_vector (f4vector *v)
{
    printf("%f,%f,%f,%f\n", v->f[0], v->f[1], v->f[2], v->f[3]);
}

int main()
{
    union f4vector a, b, c;

    a.v = (v4sf){1., 2., 3., 4.};
    b.v = (v4sf){5., 6., 7., 8.};
    c.v = a.v + b.v;

    print_vector(&a);
    print_vector(&b);
    print_vector(&c);
}

Compile with the following command:
gcc -ggdb -mtune=pentium3 -march=pentium3 -c -O3 -ffast-math -mfpmath=sse -msse5 sse.c

To test, just link the object code to binary:

gcc -lm sse.o -o sse

$ ./sse
1.000000,2.000000,3.000000,4.000000
5.000000,6.000000,7.000000,8.000000
6.000000,8.000000,10.000000,12.000000

The assembled code:

$ objdump -dS ./sse.o | grep -2 c.v | tail -8
  7c:   0f 58 c1                addps  %xmm1,%xmm0
  7f:   0f 29 45 c8             movaps %xmm0,-0x38(%ebp)
--
 120:   f2 0f 11 44 24 04       movsd  %xmm0,0x4(%esp)
 126:   e8 00 00 00 00          call   12b <_main+0xdb>
        c.v = a.v + b.v;

        print_vector(&a);

As we can see, it's very optimized where adding 4 components of vector a and b is done in one SSE instruction (addps) instead of multiple instructions if we don't use -msse and -mfpmath=sse



How fast is the program?

$ time ./sse
1.000000,2.000000,3.000000,4.000000
5.000000,6.000000,7.000000,8.000000
6.000000,8.000000,10.000000,12.000000

real    0m0.109s
user    0m0.046s
sys     0m0.030s

Thursday, December 23, 2010

Mac OSX Lion: Another Windows remake?

Apple has just updated its website and now it announces that they plan to release another Mac OS-X named "Lion".  A sneak peak to the features, some of the features are not really "wow" me and even seems too-old to be a breakthrough.  For example, "LauchPad".  Windows xx has had it for long time as "Desktop icons".  Another one is "Mission Control" which the similar feature has been in Windows 7 for awhile.

Unfortunately, Apple has not revealed all the features they plan to put in OS-X.  Not sure if the upgrade worth the cost of upgrade (well, if it is only $25 upgrade I'll just go ahead and upgrade mine).

Sunday, November 14, 2010

Downsampling MP3 file

#!/usr/bin/tclsh

set ffmpeg [exec which ffmpeg]
puts "ffmpeg  = $ffmpeg"
if { [llength $argv] < 2 } {
    puts "\n$argv0  n bps\n"
    exit -1
}
set fi [lindex $argv 0]
set br [lindex $argv 1]
puts "$fi"
puts "target $br bps"

if { [regexp -all {(.*)\.([mM][pP]3)} $fi a b c] } {
    set fo "${b}_${br}bps.$c"
    puts "$fi ==> $fo, ext=$c, new bitrate=$br"
    set id3 [exec id3v2 -l $fi]
    set id3par ""
    if { [regexp -line {TIT2.*: (.*)\n} $id3 dummy title c] } { puts Title=$title }
    if { [regexp -line {TPE1.*: (.*)\n} $id3 dummy singer c] } { puts singer=$singer }
    if { [regexp -line {TALB.*: (.*)\n} $id3 dummy album c] } { puts album=$album }
    if { [regexp -line {TYER.*: (.*)\n} $id3 dummy year c] } { puts year=$year }
    if { [regexp -line {TCON.*\(([0-9]+)\)\n} $id3 dummy genre c] } { puts genre=$genre }
    set cmd "$ffmpeg -threads 16 -y -ab $br -i $fi $fo"
    if {![file exists $fo]} {
        if { [catch { set res [eval exec $cmd] fid }] } {
            #puts stderr "Could not execute $cmd"
            if {[info exists fid] } { puts stderr $fid" }
            #exit 1
        }
    }
    if {[info exists title]} { append idpar " --TIT2 \"$title\"" }
    if {[info exists singer]} { append idpar " --TPE1 \"$singer\"" }
    if {[info exists album]} { append idpar " --TALB \"$album\"" }
    if {[info exists year]} { append idpar " --TYER \"$year\"" }
    if {[info exists genre]} { append idpar " --TCON \"$genre\"" }
    set cmd "id3v2 $idpar $fo"
    puts $cmd
    if { [file exists $fo]} {
        eval exec $cmd
    }
} else {
    puts "Not an MP3 file"
}


PAM Security explained

PAM Security: http://articles.techrepublic.com.com/5100-10878_11-1055269.html