From 9d7d02e7eb868ca1a054c0861c0cc7030553750d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?IOhannes=20m=20zm=C3=B6lnig?=
Date: Wed, 21 Dec 2005 17:15:41 +0000
Subject: only do 8x loop unrolling since it seems to be more efficient (well, rather a joke: 4096 [abs~] need about 24% with 16x unrolling and about 23-24% with 8x unrolling)

svn path=/trunk/externals/zexy/; revision=4276
---
 src/sgn~.c | 18 +++---------------
 1 file changed, 3 insertions(+), 15 deletions(-)

(limited to 'src/sgn~.c')

diff --git a/src/sgn~.c b/src/sgn~.c
index b390144..a18c539 100644
--- a/src/sgn~.c
+++ b/src/sgn~.c
@@ -79,7 +79,7 @@ static t_int *sgnTilde_performSSE(t_int *w)
   __m128 *out = (__m128 *)(w[2]);
   __m128 val;
 
-  int n = (int)(w[3])>>4; // yea, we do 16x loop-unrolling
+  int n = (int)(w[3])>>3; // we do 8x loop-unrolling
   const __m128 sgnmask= _mm_loadu_ps((float*)l_bitmask);
   const __m128 zero = _mm_setzero_ps();
 
@@ -101,20 +101,8 @@ static t_int *sgnTilde_performSSE(t_int *w)
     xmm0 = _mm_and_ps (xmm0, one);
     out[1]= _mm_or_ps (xmm1, xmm0);
 
-    val=in[2];
-    xmm0 = _mm_cmpneq_ps(val , zero);
-    xmm1 = _mm_and_ps (val, sgnmask);
-    xmm0 = _mm_and_ps (xmm0, one);
-    out[2]= _mm_or_ps (xmm1, xmm0);
-
-    val=in[3];
-    xmm0 = _mm_cmpneq_ps(val , zero);
-    xmm1 = _mm_and_ps (val, sgnmask);
-    xmm0 = _mm_and_ps (xmm0, one);
-    out[3]= _mm_or_ps (xmm1, xmm0);
-
-    in +=4;
-    out+=4;
+    in +=2;
+    out+=2;
   }
 
   return (w+4);
-- 
cgit v1.2.1