diff options
author | IOhannes m zmölnig <zmoelnig@users.sourceforge.net> | 2005-12-21 17:15:41 +0000 |
---|---|---|
committer | IOhannes m zmölnig <zmoelnig@users.sourceforge.net> | 2005-12-21 17:15:41 +0000 |
commit | 9d7d02e7eb868ca1a054c0861c0cc7030553750d (patch) | |
tree | b2eb1ee84986445a9cee5bcda0b311da4133feea | |
parent | f0f2e78feae74ba5ea2faf3516fdc0711cc7fe3d (diff) |
Only do 8x loop unrolling, since it seems to be more efficient
(well, the gain is rather a joke: 4096 [abs~] need about 24% with 16x unrolling and about 23-24% with 8x unrolling).
svn path=/trunk/externals/zexy/; revision=4276
-rw-r--r-- | src/sgn~.c | 18 |
1 file changed, 3 insertions, 15 deletions
@@ -79,7 +79,7 @@ static t_int *sgnTilde_performSSE(t_int *w) __m128 *out = (__m128 *)(w[2]); __m128 val; - int n = (int)(w[3])>>4; // yea, we do 16x loop-unrolling + int n = (int)(w[3])>>3; // we do 8x loop-unrolling const __m128 sgnmask= _mm_loadu_ps((float*)l_bitmask); const __m128 zero = _mm_setzero_ps(); @@ -101,20 +101,8 @@ static t_int *sgnTilde_performSSE(t_int *w) xmm0 = _mm_and_ps (xmm0, one); out[1]= _mm_or_ps (xmm1, xmm0); - val=in[2]; - xmm0 = _mm_cmpneq_ps(val , zero); - xmm1 = _mm_and_ps (val, sgnmask); - xmm0 = _mm_and_ps (xmm0, one); - out[2]= _mm_or_ps (xmm1, xmm0); - - val=in[3]; - xmm0 = _mm_cmpneq_ps(val , zero); - xmm1 = _mm_and_ps (val, sgnmask); - xmm0 = _mm_and_ps (xmm0, one); - out[3]= _mm_or_ps (xmm1, xmm0); - - in +=4; - out+=4; + in +=2; + out+=2; } return (w+4); |