SIMD code hopefully now also works on 64bit

svn path=/trunk/externals/zexy/; revision=13045
author: IOhannes m zmölnig <zmoelnig@users.sourceforge.net> 2010-01-19 14:56:40 +0000
committer: IOhannes m zmölnig <zmoelnig@users.sourceforge.net> 2010-01-19 14:56:40 +0000
commit: 4934f05ba8e55f58ed5762180ea317da50a00a05 (patch)
tree: d1568a29be25660351692ab6a11fc785b179b158 /src
parent: 6d3d4743b01e15a82444b2ca900b60094e3c1800 (diff)
10 files changed, 99 insertions, 37 deletions
diff --git a/src/0x260x260x7e.c b/src/0x260x260x7e.c
index 8a37cc9..ace6fe7 100644
--- a/src/0x260x260x7e.c
+++ b/src/0x260x260x7e.c
@@ -20,7 +20,7 @@
   1302:forum::für::umläute:2000
 */
 
-#include "zexy.h"
+#include "zexySIMD.h"
 
 /* ------------------------ logical~ ----------------------------- */
 
@@ -125,7 +125,7 @@ static t_int *scalarandand_tilde_perf8(t_int *w)
 }
 
 #ifdef __SSE__
-static long l_bitmask[]={0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
+static int l_bitmask[]={0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
 
 static t_int *andand_tilde_performSSE(t_int *w)
 {
diff --git a/src/0x3c0x7e.c b/src/0x3c0x7e.c
index 24c0a90..313a5d4 100644
--- a/src/0x3c0x7e.c
+++ b/src/0x3c0x7e.c
@@ -20,7 +20,7 @@
 	1302:forum::für::umläute:2000
 */
 
-#include "zexy.h"
+#include "zexySIMD.h"
 
 /* ------------------------ relational~ ----------------------------- */
 
diff --git a/src/0x3d0x3d0x7e.c b/src/0x3d0x3d0x7e.c
index f605df1..9cbcd1c 100644
--- a/src/0x3d0x3d0x7e.c
+++ b/src/0x3d0x3d0x7e.c
@@ -20,7 +20,7 @@
 	1302:forum::für::umläute:2000
 */
 
-#include "zexy.h"
+#include "zexySIMD.h"
 
 /* ----------------------------- eq_tilde ----------------------------- */
 static t_class *eq_tilde_class, *scalareq_tilde_class;
diff --git a/src/0x3e0x7e.c b/src/0x3e0x7e.c
index ae997c6..7719ac5 100644
--- a/src/0x3e0x7e.c
+++ b/src/0x3e0x7e.c
@@ -20,7 +20,7 @@
 	1302:forum::für::umläute:2000
 */
 
-#include "zexy.h"
+#include "zexySIMD.h"
 
 /* ------------------------ relational~ ----------------------------- */
 
diff --git a/src/0x7c0x7c0x7e.c b/src/0x7c0x7c0x7e.c
index 1927df8..4511f8e 100644
--- a/src/0x7c0x7c0x7e.c
+++ b/src/0x7c0x7c0x7e.c
@@ -20,7 +20,7 @@
 	1302:forum::für::umläute:2000
 */
 
-#include "zexy.h"
+#include "zexySIMD.h"
 
 /* ----------------------------- oror_tilde ----------------------------- */
 static t_class *oror_tilde_class, *scalaroror_tilde_class;
diff --git a/src/absgn~.c b/src/absgn~.c
index 2279ade..73a0450 100644
--- a/src/absgn~.c
+++ b/src/absgn~.c
@@ -11,7 +11,7 @@
  *
  ******************************************************/
 
-#include "zexy.h"
+#include "zexySIMD.h"
 
 typedef struct _absgn
 {
@@ -46,8 +46,8 @@ static t_int *sigABSGN_perform(t_int *w)
 }
 
 #ifdef __SSE__
-static long l_bitmask[]   ={0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
-static long l_sgnbitmask[]={0x80000000, 0x80000000, 0x80000000, 0x80000000};
+static int l_bitmask[]   ={0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
+static int l_sgnbitmask[]={0x80000000, 0x80000000, 0x80000000, 0x80000000};
 static t_int *sigABSGN_performSSE(t_int *w)
 {
   __m128 *in = (__m128 *)(w[1]);
@@ -91,11 +91,12 @@ static void sigABSGN_dsp(t_absgn *x, t_signal **sp)
   ZEXY_USEVAR(x);
 #ifdef __SSE__
   if(
-     Z_SIMD_CHKBLOCKSIZE(sp[0]->s_n)&&
-     Z_SIMD_CHKALIGN(sp[0]->s_vec)&&
-     Z_SIMD_CHKALIGN(sp[1]->s_vec)&&
-     Z_SIMD_CHKALIGN(sp[2]->s_vec)&&
-     ZEXY_TYPE_EQUAL(t_sample, float)
+     ZEXY_TYPE_EQUAL(t_sample, float) && /*  currently SSE2 code is only for float (not for double) */
+     Z_SIMD_CHKBLOCKSIZE(sp[0]->s_n) &&
+     Z_SIMD_CHKALIGN(sp[0]->s_vec) &&
+     Z_SIMD_CHKALIGN(sp[1]->s_vec) &&
+     Z_SIMD_CHKALIGN(sp[2]->s_vec) &&
+     zexy_testSSE(sigABSGN_perform, sigABSGN_performSSE, 1, 2)
      )
     {
       dsp_add(sigABSGN_performSSE, 4, sp[0]->s_vec, sp[1]->s_vec, sp[2]->s_vec, sp[0]->s_n);
diff --git a/src/abs~.c b/src/abs~.c
index c83e3da..f1ed564 100644
--- a/src/abs~.c
+++ b/src/abs~.c
@@ -20,7 +20,7 @@
   2112:forum::für::umläute:2005
 */
 
-#include "zexy.h"
+#include "zexySIMD.h"
 
 typedef struct _abs
 {
@@ -45,10 +45,10 @@ static t_int *sigABS_perform(t_int *w)
 }
 
 #ifdef __SSE__
-static long l_bitmask[]={0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
+static int l_bitmask[]={0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
 static t_int *sigABS_performSSE(t_int *w)
 {
-  __m128 *in = (__m128 *)(w[1]);
+  __m128 *in =  (__m128 *)(w[1]);
   __m128 *out = (__m128 *)(w[2]);
   int n = (int)(w[3])>>4;
 
@@ -121,10 +121,10 @@ static void sigABS_dsp(t_abs *x, t_signal **sp)
 {
 #ifdef __SSE__
   if(
-     Z_SIMD_CHKBLOCKSIZE(sp[0]->s_n)&&
-     Z_SIMD_CHKALIGN(sp[0]->s_vec)&&
-     Z_SIMD_CHKALIGN(sp[1]->s_vec)&&
-     ZEXY_TYPE_EQUAL(t_sample, float)
+     ZEXY_TYPE_EQUAL(t_sample, float) && 
+     zexy_testSSE(sigABS_perform,
+		  sigABS_performSSE, 
+		  1, 1)
      )
     {
       dsp_add(sigABS_performSSE, 3, sp[0]->s_vec, sp[1]->s_vec, sp[0]->s_n);
diff --git a/src/sgn~.c b/src/sgn~.c
index 04076c2..03ffbf1 100644
--- a/src/sgn~.c
+++ b/src/sgn~.c
@@ -20,7 +20,7 @@
   2112:forum::für::umläute:2005
 */
 
-#include "zexy.h"
+#include "zexySIMD.h"
 
 typedef struct _sgnTilde
 {
@@ -73,7 +73,7 @@ static t_int *sgnTilde_perform8(t_int *w)
 }
 
 #ifdef __SSE__
-static long l_bitmask[]={0x80000000, 0x80000000, 0x80000000, 0x80000000}; /* sign bitmask */
+static int l_bitmask[]={0x80000000, 0x80000000, 0x80000000, 0x80000000}; /* sign bitmask */
 static t_int *sgnTilde_performSSE(t_int *w)
 {
   __m128 *in = (__m128 *)(w[1]);
@@ -114,10 +114,10 @@ static void sgnTilde_dsp(t_sgnTilde *x, t_signal **sp)
 {
 #ifdef __SSE__
   if(
-     Z_SIMD_CHKBLOCKSIZE(sp[0]->s_n)&&
-     Z_SIMD_CHKALIGN(sp[0]->s_vec)&&
-     Z_SIMD_CHKALIGN(sp[1]->s_vec)&&
-     ZEXY_TYPE_EQUAL(t_sample, float) /*  currently SSE2 code is only for float (not for double) */
+     ZEXY_TYPE_EQUAL(t_sample, float) && /*  currently SSE2 code is only for float (not for double) */
+     zexy_testSSE(sgnTilde_perform,
+		  sgnTilde_performSSE, 
+		  1,1)
      )
     {
       dsp_add(sgnTilde_performSSE, 3, sp[0]->s_vec, sp[1]->s_vec, sp[0]->s_n);
diff --git a/src/zexy.h b/src/zexy.h
index 33e9c7d..a56f868 100644
--- a/src/zexy.h
+++ b/src/zexy.h
@@ -50,16 +50,6 @@
 
 #include "m_pd.h"
 
-#ifdef __SSE__
-# include <xmmintrin.h>
-# define Z_SIMD_BLOCK 16  /* must be a power of 2 */
-# define Z_SIMD_BYTEALIGN (128/8)   /* assume 128 bits */
-# define Z_SIMD_CHKBLOCKSIZE(n) (!(n&(Z_SIMD_BLOCK-1)))
-# define Z_SIMD_CHKALIGN(ptr) ( ((unsigned long)(ptr) & (Z_SIMD_BYTEALIGN-1)) == 0 )
-#endif /* __SSE__ */
-
-#include <math.h>
-
 #define VERSION "2.2.3"
 
 /* these pragmas are only used for MSVC, not MinGW or Cygwin */
@@ -74,6 +64,7 @@
 # define HEARTSYMBOL 64
 #endif
 
+#include <math.h>
 
 #ifdef __WIN32__
 # define STATIC_INLINE
diff --git a/src/zexySIMD.h b/src/zexySIMD.h
new file mode 100644
index 0000000..893a857
--- /dev/null
+++ b/src/zexySIMD.h
@@ -0,0 +1,70 @@
+#include "zexy.h"
+
+#ifdef __SSE__
+
+#include <xmmintrin.h>
+#define Z_SIMD_BLOCK 16  /* block-size granularity for the SIMD routines; must be a power of 2 */
+#define Z_SIMD_BYTEALIGN (128/8)   /* required byte alignment; assume 128 bits */
+#define Z_SIMD_CHKBLOCKSIZE(n) (!((n)&(Z_SIMD_BLOCK-1)))  /* non-zero if n is a multiple of Z_SIMD_BLOCK; argument parenthesized so expressions expand safely */
+#define Z_SIMD_CHKALIGN(ptr) ( ((unsigned long)(ptr) & (Z_SIMD_BYTEALIGN-1)) == 0 )  /* non-zero if ptr is aligned to Z_SIMD_BYTEALIGN bytes */
+
+typedef union {    /* one SSE vector that can also be addressed sample by sample */
+  __m128 vec;      /* the storage viewed as an SSE vector */
+  t_sample f[4];   /* the same storage viewed as four individual samples */
+} t_sample4;
+
+/**
+ * Runs genericperf and sseperf on identical 16-sample test data and compares their outputs.
+ * Returns 0 if any output sample differs (the caller should fall back to generic code), 1 otherwise.
+ */
+static int zexy_testSSE(t_perfroutine genericperf, t_perfroutine sseperf, unsigned int numinchannels, unsigned int numoutchannels)
+{
+/* the comparison is currently only implemented for single input, single output; */
+/* any other channel layout is accepted untested (LATER make it work truly multichannel) */
+  if(1==numinchannels && 1==numoutchannels) {
+    t_int w1[4], w2[4];
+    t_sample4 in, in1[4], in2[4], out1[4], out2[4];
+    int i, j;
+
+    z_verbose(2, "checking for SSE compatibility");
+    /* four distinct probe values; every element of in.f must be set, since all four are read below */
+    in.f[0]=0.;
+    in.f[1]=-0.5;
+    in.f[2]=0.5;
+    in.f[3]=5.;
+    /* row i of each input buffer is filled with probe value i; each routine gets its own copy */
+    for(i=0; i<4; i++) {
+      in1[i].f[0]=in.f[i]; in1[i].f[1]=in.f[i]; in1[i].f[3]=in.f[i]; in1[i].f[2]=in.f[i];
+      out1[i].f[0]=out1[i].f[1]=out1[i].f[2]=out1[i].f[3]=0.f;
+
+      in2[i].f[0]=in.f[i]; in2[i].f[1]=in.f[i]; in2[i].f[3]=in.f[i]; in2[i].f[2]=in.f[i];
+      out2[i].f[0]=out2[i].f[1]=out2[i].f[2]=out2[i].f[3]=0.f;
+    }
+    /* argument vectors for the perform routines: w[1]=input, w[2]=output, w[3]=number of samples */
+    w1[0]=(t_int)0; w1[1]=(t_int)&in1; w1[2]=(t_int)&out1; w1[3]=(t_int)16; (*genericperf)(w1);
+    w2[0]=(t_int)0; w2[1]=(t_int)&in2; w2[2]=(t_int)&out2; w2[3]=(t_int)16; (*sseperf)(w2);
+
+    /* a single mismatching sample disqualifies the SSE routine */
+    for(i=0; i<4; i++) {
+      for(j=0; j<4; j++) {
+	if(fabsf(out1[i].f[j]-out2[i].f[j])>1e-17) {
+	  z_verbose(2, "generic and optimized routines return different results: skipping optimization");
+	  z_verbose(2, "[%d,%d]: ((%f->%f)!=(%f->%f))",
+		    i, j,
+		    in1[i].f[j], out1[i].f[j],
+		    in2[i].f[j], out2[i].f[j]
+		    );
+	  return 0;
+	}
+      }
+    }
+  } else {
+    /* no tests yet */
+  }
+  z_verbose(2, "using SSE optimization");
+  return 1;
+}
+
+#endif /* __SSE__ */
+
+
author	IOhannes m zmölnig <zmoelnig@users.sourceforge.net>	2010-01-19 14:56:40 +0000
committer	IOhannes m zmölnig <zmoelnig@users.sourceforge.net>	2010-01-19 14:56:40 +0000
commit	4934f05ba8e55f58ed5762180ea317da50a00a05 (patch)
tree	d1568a29be25660351692ab6a11fc785b179b158 /src
parent	6d3d4743b01e15a82444b2ca900b60094e3c1800 (diff)