From 4934f05ba8e55f58ed5762180ea317da50a00a05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?IOhannes=20m=20zm=C3=B6lnig?= Date: Tue, 19 Jan 2010 14:56:40 +0000 Subject: SIMD code hopefully now also works on 64bit svn path=/trunk/externals/zexy/; revision=13045 --- src/0x260x260x7e.c | 4 ++-- src/0x3c0x7e.c | 2 +- src/0x3d0x3d0x7e.c | 2 +- src/0x3e0x7e.c | 2 +- src/0x7c0x7c0x7e.c | 2 +- src/absgn~.c | 17 ++++++------- src/abs~.c | 14 +++++------ src/sgn~.c | 12 +++++----- src/zexy.h | 11 +-------- src/zexySIMD.h | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 10 files changed, 99 insertions(+), 37 deletions(-) create mode 100644 src/zexySIMD.h (limited to 'src') diff --git a/src/0x260x260x7e.c b/src/0x260x260x7e.c index 8a37cc9..ace6fe7 100644 --- a/src/0x260x260x7e.c +++ b/src/0x260x260x7e.c @@ -20,7 +20,7 @@ 1302:forum::für::umläute:2000 */ -#include "zexy.h" +#include "zexySIMD.h" /* ------------------------ logical~ ----------------------------- */ @@ -125,7 +125,7 @@ static t_int *scalarandand_tilde_perf8(t_int *w) } #ifdef __SSE__ -static long l_bitmask[]={0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}; +static int l_bitmask[]={0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}; static t_int *andand_tilde_performSSE(t_int *w) { diff --git a/src/0x3c0x7e.c b/src/0x3c0x7e.c index 24c0a90..313a5d4 100644 --- a/src/0x3c0x7e.c +++ b/src/0x3c0x7e.c @@ -20,7 +20,7 @@ 1302:forum::für::umläute:2000 */ -#include "zexy.h" +#include "zexySIMD.h" /* ------------------------ relational~ ----------------------------- */ diff --git a/src/0x3d0x3d0x7e.c b/src/0x3d0x3d0x7e.c index f605df1..9cbcd1c 100644 --- a/src/0x3d0x3d0x7e.c +++ b/src/0x3d0x3d0x7e.c @@ -20,7 +20,7 @@ 1302:forum::für::umläute:2000 */ -#include "zexy.h" +#include "zexySIMD.h" /* ----------------------------- eq_tilde ----------------------------- */ static t_class *eq_tilde_class, *scalareq_tilde_class; diff --git a/src/0x3e0x7e.c b/src/0x3e0x7e.c index ae997c6..7719ac5 100644 --- a/src/0x3e0x7e.c +++ 
b/src/0x3e0x7e.c @@ -20,7 +20,7 @@ 1302:forum::für::umläute:2000 */ -#include "zexy.h" +#include "zexySIMD.h" /* ------------------------ relational~ ----------------------------- */ diff --git a/src/0x7c0x7c0x7e.c b/src/0x7c0x7c0x7e.c index 1927df8..4511f8e 100644 --- a/src/0x7c0x7c0x7e.c +++ b/src/0x7c0x7c0x7e.c @@ -20,7 +20,7 @@ 1302:forum::für::umläute:2000 */ -#include "zexy.h" +#include "zexySIMD.h" /* ----------------------------- oror_tilde ----------------------------- */ static t_class *oror_tilde_class, *scalaroror_tilde_class; diff --git a/src/absgn~.c b/src/absgn~.c index 2279ade..73a0450 100644 --- a/src/absgn~.c +++ b/src/absgn~.c @@ -11,7 +11,7 @@ * ******************************************************/ -#include "zexy.h" +#include "zexySIMD.h" typedef struct _absgn { @@ -46,8 +46,8 @@ static t_int *sigABSGN_perform(t_int *w) } #ifdef __SSE__ -static long l_bitmask[] ={0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}; -static long l_sgnbitmask[]={0x80000000, 0x80000000, 0x80000000, 0x80000000}; +static int l_bitmask[] ={0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}; +static int l_sgnbitmask[]={0x80000000, 0x80000000, 0x80000000, 0x80000000}; static t_int *sigABSGN_performSSE(t_int *w) { __m128 *in = (__m128 *)(w[1]); @@ -91,11 +91,12 @@ static void sigABSGN_dsp(t_absgn *x, t_signal **sp) ZEXY_USEVAR(x); #ifdef __SSE__ if( - Z_SIMD_CHKBLOCKSIZE(sp[0]->s_n)&& - Z_SIMD_CHKALIGN(sp[0]->s_vec)&& - Z_SIMD_CHKALIGN(sp[1]->s_vec)&& - Z_SIMD_CHKALIGN(sp[2]->s_vec)&& - ZEXY_TYPE_EQUAL(t_sample, float) + ZEXY_TYPE_EQUAL(t_sample, float) && /* currently SSE2 code is only for float (not for double) */ + Z_SIMD_CHKBLOCKSIZE(sp[0]->s_n) && + Z_SIMD_CHKALIGN(sp[0]->s_vec) && + Z_SIMD_CHKALIGN(sp[1]->s_vec) && + Z_SIMD_CHKALIGN(sp[2]->s_vec) && + zexy_testSSE(sigABSGN_perform, sigABSGN_performSSE, 1, 2) ) { dsp_add(sigABSGN_performSSE, 4, sp[0]->s_vec, sp[1]->s_vec, sp[2]->s_vec, sp[0]->s_n); diff --git a/src/abs~.c b/src/abs~.c index c83e3da..f1ed564 100644 --- 
a/src/abs~.c +++ b/src/abs~.c @@ -20,7 +20,7 @@ 2112:forum::für::umläute:2005 */ -#include "zexy.h" +#include "zexySIMD.h" typedef struct _abs { @@ -45,10 +45,10 @@ static t_int *sigABS_perform(t_int *w) } #ifdef __SSE__ -static long l_bitmask[]={0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}; +static int l_bitmask[]={0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}; static t_int *sigABS_performSSE(t_int *w) { - __m128 *in = (__m128 *)(w[1]); + __m128 *in = (__m128 *)(w[1]); __m128 *out = (__m128 *)(w[2]); int n = (int)(w[3])>>4; @@ -121,10 +121,10 @@ static void sigABS_dsp(t_abs *x, t_signal **sp) { #ifdef __SSE__ if( - Z_SIMD_CHKBLOCKSIZE(sp[0]->s_n)&& - Z_SIMD_CHKALIGN(sp[0]->s_vec)&& - Z_SIMD_CHKALIGN(sp[1]->s_vec)&& - ZEXY_TYPE_EQUAL(t_sample, float) + ZEXY_TYPE_EQUAL(t_sample, float) && + zexy_testSSE(sigABS_perform, + sigABS_performSSE, + 1, 1) ) { dsp_add(sigABS_performSSE, 3, sp[0]->s_vec, sp[1]->s_vec, sp[0]->s_n); diff --git a/src/sgn~.c b/src/sgn~.c index 04076c2..03ffbf1 100644 --- a/src/sgn~.c +++ b/src/sgn~.c @@ -20,7 +20,7 @@ 2112:forum::für::umläute:2005 */ -#include "zexy.h" +#include "zexySIMD.h" typedef struct _sgnTilde { @@ -73,7 +73,7 @@ static t_int *sgnTilde_perform8(t_int *w) } #ifdef __SSE__ -static long l_bitmask[]={0x80000000, 0x80000000, 0x80000000, 0x80000000}; /* sign bitmask */ +static int l_bitmask[]={0x80000000, 0x80000000, 0x80000000, 0x80000000}; /* sign bitmask */ static t_int *sgnTilde_performSSE(t_int *w) { __m128 *in = (__m128 *)(w[1]); @@ -114,10 +114,10 @@ static void sgnTilde_dsp(t_sgnTilde *x, t_signal **sp) { #ifdef __SSE__ if( - Z_SIMD_CHKBLOCKSIZE(sp[0]->s_n)&& - Z_SIMD_CHKALIGN(sp[0]->s_vec)&& - Z_SIMD_CHKALIGN(sp[1]->s_vec)&& - ZEXY_TYPE_EQUAL(t_sample, float) /* currently SSE2 code is only for float (not for double) */ + ZEXY_TYPE_EQUAL(t_sample, float) && /* currently SSE2 code is only for float (not for double) */ + zexy_testSSE(sgnTilde_perform, + sgnTilde_performSSE, + 1,1) ) { dsp_add(sgnTilde_performSSE, 3, 
sp[0]->s_vec, sp[1]->s_vec, sp[0]->s_n);
diff --git a/src/zexy.h b/src/zexy.h
index 33e9c7d..a56f868 100644
--- a/src/zexy.h
+++ b/src/zexy.h
@@ -50,16 +50,6 @@
 
 #include "m_pd.h"
 
-#ifdef __SSE__
-# include <xmmintrin.h>
-# define Z_SIMD_BLOCK 16 /* must be a power of 2 */
-# define Z_SIMD_BYTEALIGN (128/8) /* assume 128 bits */
-# define Z_SIMD_CHKBLOCKSIZE(n) (!(n&(Z_SIMD_BLOCK-1)))
-# define Z_SIMD_CHKALIGN(ptr) ( ((unsigned long)(ptr) & (Z_SIMD_BYTEALIGN-1)) == 0 )
-#endif /* __SSE__ */
-
-#include <math.h>
-
 #define VERSION "2.2.3"
 
 /* these pragmas are only used for MSVC, not MinGW or Cygwin */
@@ -74,6 +64,7 @@
 # define HEARTSYMBOL 64
 #endif
 
+#include <math.h>
 
 #ifdef __WIN32__
 # define STATIC_INLINE
diff --git a/src/zexySIMD.h b/src/zexySIMD.h
new file mode 100644
index 0000000..893a857
--- /dev/null
+++ b/src/zexySIMD.h
@@ -0,0 +1,70 @@
+#include "zexy.h"
+
+#ifdef __SSE__
+
+#include <xmmintrin.h>
+#define Z_SIMD_BLOCK 16 /* must be a power of 2 */
+#define Z_SIMD_BYTEALIGN (128/8) /* assume 128 bits */
+#define Z_SIMD_CHKBLOCKSIZE(n) (!((n)&(Z_SIMD_BLOCK-1)))
+#define Z_SIMD_CHKALIGN(ptr) ( ((unsigned long)(ptr) & (Z_SIMD_BYTEALIGN-1)) == 0 )
+
+typedef union {
+  __m128 vec;
+  t_sample f[4];
+} t_sample4;
+
+/**
+ * runs a check whether the SSE-optimized perform routine returns the same result as the generic routine;
+ * returns 1 if the SSE code may be used, 0 if the results differ (SSE code probably broken: fall back to the generic code)
+ */
+static int zexy_testSSE(t_perfroutine genericperf, t_perfroutine sseperf, unsigned int numinchannels, unsigned int numoutchannels)
+{
+/* this currently only works with single input, single output; any other channel layout is accepted untested */
+/* LATER make it work truly multichannel */
+  if(1==numinchannels && 1==numoutchannels) {
+    t_int w1[4], w2[4];
+    t_sample4 in, in1[4], in2[4], out1[4], out2[4];
+    int i, j;
+
+    z_verbose(2, "checking for SSE compatibility");
+
+    in.f[0]=0.;
+    in.f[1]=-0.5;
+    in.f[2]=0.5;
+    in.f[3]=5.;
+    /* all four test values must be set: value i is replicated into every lane of input vector i */
+    for(i=0; i<4; i++) {
+      in1[i].f[0]=in.f[i]; in1[i].f[1]=in.f[i]; in1[i].f[3]=in.f[i]; in1[i].f[2]=in.f[i];
+      out1[i].f[0]=out1[i].f[1]=out1[i].f[2]=out1[i].f[3]=0.f;
+
+      in2[i].f[0]=in.f[i]; in2[i].f[1]=in.f[i]; in2[i].f[3]=in.f[i]; in2[i].f[2]=in.f[i];
+      out2[i].f[0]=out2[i].f[1]=out2[i].f[2]=out2[i].f[3]=0.f;
+    }
+    /* argument vectors: w[1]=input, w[2]=output, w[3]=number of samples (16 = 4 vectors of 4 samples) */
+    w1[0]=(t_int)0; w1[1]=(t_int)&in1; w1[2]=(t_int)&out1; w1[3]=(t_int)16; (*genericperf)(w1);
+    w2[0]=(t_int)0; w2[1]=(t_int)&in2; w2[2]=(t_int)&out2; w2[3]=(t_int)16; (*sseperf)(w2);
+
+    /* compare both outputs sample by sample; the first mismatch disables the optimization */
+    for(i=0; i<4; i++) {
+      for(j=0; j<4; j++) {
+        if(fabsf(out1[i].f[j]-out2[i].f[j])>1e-17) {
+          z_verbose(2, "generic and optimized routines return different results: skipping optimization");
+          z_verbose(2, "[%d,%d]: ((%f->%f)!=(%f->%f))",
+                    i, j,
+                    in1[i].f[j], out1[i].f[j],
+                    in2[i].f[j], out2[i].f[j]
+                    );
+          return 0;
+        }
+      }
+    }
+  } else {
+    /* no tests yet */
+  }
+  z_verbose(2, "using SSE optimization");
+  return 1;
+}
+
+#endif /* __SSE__ */
+
+
-- 
cgit v1.2.1