From 2f98df88850ab893c7acf8ea2b9000c03c2e17da Mon Sep 17 00:00:00 2001
From: Tim Blechmann <timblech@users.sourceforge.net>
Date: Tue, 28 Dec 2004 15:48:19 +0000
Subject: simd-optimized ramp

svn path=/trunk/externals/tb/; revision=2435
---
 volctl~/makefile   |   2 +-
 volctl~/readme.txt |   3 +-
 volctl~/volctl~.c  | 381 +++++++++++++++++++++++++++++++----------------------
 3 files changed, 229 insertions(+), 157 deletions(-)

(limited to 'volctl~')

diff --git a/volctl~/makefile b/volctl~/makefile
index 9b4cc04..3457263 100644
--- a/volctl~/makefile
+++ b/volctl~/makefile
@@ -62,7 +62,7 @@ pd_linux: $(NAME).pd_linux
 
 LINUXCFLAGS = -DPD -O3 -fPIC -funroll-loops -fomit-frame-pointer \
     -Wall -W -Wshadow -Wstrict-prototypes -Werror \
-    -Wno-unused -Wno-parentheses -Wno-switch 
+    -Wno-unused -Wno-parentheses -Wno-switch
 
 LINUXINCLUDE =  -I../../src
 
diff --git a/volctl~/readme.txt b/volctl~/readme.txt
index 5707467..4efff34 100644
--- a/volctl~/readme.txt
+++ b/volctl~/readme.txt
@@ -8,7 +8,7 @@ volctl~ is doing more or less the same as
      |line~ 0 10		
 |    |              |     |     |
 |*~ 0|            = |volctl 0 10|
-|		    |
+|		            |
 
 except that it is faster
 
@@ -20,5 +20,4 @@ volctl~ will only probably only compile against pd>=devel_0_37 with gcc.
 i'm not planing to do a port to win/osx or any pd without aligned dsp blocks...
 
 todo:
-- write complete volctl_perform_simd function in assembler
 - check icc's segfault
\ No newline at end of file
diff --git a/volctl~/volctl~.c b/volctl~/volctl~.c
index 9ef96de..60abfac 100644
--- a/volctl~/volctl~.c
+++ b/volctl~/volctl~.c
@@ -1,31 +1,30 @@
 /* Copyright (c) 2004 Tim Blechmann.
- *For information on usage and redistribution, and for a DISCLAIMER OF ALL
- *WARRANTIES, see the file, "gpl.txt" in this distribution.
+ * For information on usage and redistribution, and for a DISCLAIMER OF ALL
+ * WARRANTIES, see the file, "gpl.txt" in this distribution.
  *
- *This program is free software; you can redistribute it and/or
- *modify it under the terms of the GNU General Public License
- *as published by the Free Software Foundation; either version 2
- *of the License, or (at your option) any later version.
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
  *
- *See file LICENSE for further informations on licensing terms.
+ * See file LICENSE for further informations on licensing terms.
  *
- *This program is distributed in the hope that it will be useful,
- *but WITHOUT ANY WARRANTY; without even the implied warranty of
- *MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
  *
- *You should have received a copy of the GNU General Public License
- *along with this program; if not, write to the Free Software
- *Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  *
- *Based on PureData by Miller Puckette and others.
+ * Based on PureData by Miller Puckette and others.
  *
- *  coded while listening to: Julien Ottavi: Nervure Magnetique
+ * coded while listening to: Julien Ottavi: Nervure Magnetique
  *                                                                    */
 
 
 #include "m_pd.h"
-
 #include "m_simd.h"
 
 
@@ -36,15 +35,16 @@ static t_class *volctl_class;
 typedef struct _volctl
 {
     t_object x_obj;
-    float x_f;
+    t_float x_f; //field handed to CLASS_MAINSIGNALIN in volctl_tilde_setup
 
-    float x_h; //interpolation time
-    float x_value; //current factor
+    t_float x_h; //interpolation time in ms
+    t_float x_value; //current factor the input signal is multiplied by
     
     int x_ticksleft; //ticks to go
-    float x_mspersample; //ms per sample
-    float x_slope; //slope
-
+    t_float x_samples_per_ms; //samples per ms (sample rate / 1000)
+    t_float x_slope; //per-sample increment of x_value while ramping
+	t_float * x_slopes; //aligned array of 4 floats holding {0,1,2,3}*slope, lane offsets for the simd ramp
+	t_float x_slope_step; //4*slope: amount the simd ramp advances per 4-sample vector
     int x_line; 
 
 } t_volctl;
@@ -54,7 +54,7 @@ void *volctl_new(t_symbol *s, int argc, t_atom *argv)
     if (argc > 2) post("volctl~: extra arguments ignored");
 
     t_volctl *x = (t_volctl *)pd_new(volctl_class);
-    inlet_new(&x->x_obj, &x->x_obj.ob_pd, gensym("float"), gensym("f1"));
+    inlet_new(&x->x_obj, &x->x_obj.ob_pd, &s_float, gensym("f1"));
     inlet_settip(x->x_obj.ob_inlet,gensym("factor"));
     x->x_value = atom_getfloatarg(0, argc, argv);
     
@@ -62,13 +62,22 @@ void *volctl_new(t_symbol *s, int argc, t_atom *argv)
     inlet_settip(time,gensym("interpolation_time"));
     x->x_h = atom_getfloatarg(1, argc, argv);
 
-    x->x_mspersample = 1000.f / 44100; // assume default samplerate
+    x->x_samples_per_ms = 44100.f / 1000.f; // assume default samplerate
 
     outlet_new(&x->x_obj, &s_signal);
     x->x_f = 0;
+	
+	x->x_slopes = getalignedbytes(4*sizeof(t_float));
+
     return (x);
 }
 
+static void volctl_free(t_volctl *x) /* passed to class_new in volctl_tilde_setup; presumably run when the object is destroyed */
+{
+	freealignedbytes(x->x_slopes, 4*sizeof(t_float)); /* release the 4-float buffer obtained with getalignedbytes in volctl_new; same size as allocated */
+}
+
+
 t_int *volctl_perform(t_int *w)
 {
     t_volctl * x = (t_volctl *)(w[1]);
@@ -76,40 +85,40 @@ t_int *volctl_perform(t_int *w)
     t_float *out = (t_float *)(w[3]);
     int n = (int)(w[4]);
     
-    float f = x->x_value;
+    t_float f = x->x_value;
 
     if (x->x_ticksleft)
     {
-	float x_slope = x->x_slope;
-	if (x->x_ticksleft < n)
-	{
-	    int remain = x->x_ticksleft;
-	    n-=remain;
-	    while (remain--)
-	    {
-		f+=x_slope;
-		*out++ = *in++ * f;
-	    }
-	    while (n--)
-	    {
-		*out++ = *in++ * f;
-	    }
-	    x->x_value = f;
-	    x->x_ticksleft = 0;
-	}
-	else
-	{
-	    x->x_ticksleft -=n;
-	    while (n--)
-	    {
-		f+=x_slope;
-		*out++ = *in++ * f;
-	    }
-	    x->x_value = f;
-	}
+		t_float x_slope = x->x_slope;
+		if (x->x_ticksleft < n)
+		{
+			int remain = x->x_ticksleft;
+			n-=remain;
+			while (remain--)
+			{
+				f+=x_slope;
+				*out++ = *in++ * f;
+			}
+			while (n--)
+			{
+				*out++ = *in++ * f;
+			}
+			x->x_value = f;
+			x->x_ticksleft = 0;
+		}
+		else
+		{
+			x->x_ticksleft -=n;
+			while (n--)
+			{
+				f+=x_slope;
+				*out++ = *in++ * f;
+			}
+			x->x_value = f;
+		}
     }
     else
-	while (n--) *out++ = *in++ * f; 
+		while (n--) *out++ = *in++ * f; 
 	
     return (w+5);
 }
@@ -122,48 +131,63 @@ t_int *volctl_perf8(t_int *w)
     t_float *out = (t_float *)(w[3]);
     int n = (int)(w[4]);
 
-    float f = x->x_value;
+    t_float f = x->x_value;
 
     if (x->x_ticksleft)
     {
-	float x_slope = x->x_slope;
-	if (x->x_ticksleft < n)
-	{
-	    int remain = x->x_ticksleft;
-	    n-=remain;
-	    while (remain--)
-	    {
-		*out++ = *in++ * f;
-		f+=x_slope;
-	    }
-	    while (n--)
-	    {
-		*out++ = *in++ * f;
-	    }
-	    x->x_value = f;
-	    x->x_ticksleft = 0;
-	}
-	else
-	{
-	    x->x_ticksleft -= n;
-	    while (n--)
-	    {
-		*out++ = *in++ * f;
-		f+=x_slope;
-	    }
-	    x->x_value = f;
-	}
+		t_float x_slope = x->x_slope;
+		if (x->x_ticksleft < n)
+		{
+			int remain = x->x_ticksleft;
+			n-=remain;
+			while (remain--)
+			{
+				*out++ = *in++ * f;
+				f+=x_slope;
+			}
+			while (n--)
+			{
+				*out++ = *in++ * f;
+			}
+			x->x_value = f;
+			x->x_ticksleft = 0;
+		}
+		else
+		{
+			x->x_ticksleft -= n;
+			n = n>>3;
+			while (n--)
+			{
+				*out++ = *in++ * f;
+				f+=x_slope;
+				*out++ = *in++ * f;
+				f+=x_slope;
+				*out++ = *in++ * f;
+				f+=x_slope;
+				*out++ = *in++ * f;
+				f+=x_slope;
+				*out++ = *in++ * f;
+				f+=x_slope;
+				*out++ = *in++ * f;
+				f+=x_slope;
+				*out++ = *in++ * f;
+				f+=x_slope;
+				*out++ = *in++ * f;
+				f+=x_slope;
+			}
+			x->x_value = f;
+		}
     }
     else
     {
-	for (; n; n -= 8, in += 8, out += 8)
-	{
-	    float f0 = in[0], f1 = in[1], f2 = in[2], f3 = in[3];
-	    float f4 = in[4], f5 = in[5], f6 = in[6], f7 = in[7];
+		for (; n; n -= 8, in += 8, out += 8)
+		{
+			t_float f0 = in[0], f1 = in[1], f2 = in[2], f3 = in[3];
+			t_float f4 = in[4], f5 = in[5], f6 = in[6], f7 = in[7];
 	    
-	    out[0] = f0 * f; out[1] = f1 * f; out[2] = f2 * f; out[3] = f3 * f;
-	    out[4] = f4 * f; out[5] = f5 * f; out[6] = f6 * f; out[7] = f7 * f;
-	}
+			out[0] = f0 * f; out[1] = f1 * f; out[2] = f2 * f; out[3] = f3 * f;
+			out[4] = f4 * f; out[5] = f5 * f; out[6] = f6 * f; out[7] = f7 * f;
+		}
     }
     return (w+5);
 }
@@ -176,67 +200,106 @@ t_int *volctl_perf_simd(t_int *w)
 
     if (x->x_ticksleft)
     {
-	int n = (int)(w[4]);
+		int n = (int)(w[4]);
 	
-	float f = x->x_value;
+		t_float x_slope = x->x_slope;
+		if (x->x_ticksleft < n)
+		{
+			t_float f = x->x_value;
+			
+			int remain = x->x_ticksleft;
+			n-=remain;
+			while (remain--)
+			{
+				*out++ = *in++ * f;
+				f+=x_slope;
+			}
+			while (n--)
+			{
+				*out++ = *in++ * f;
+			}
+			x->x_value = f;
+			x->x_ticksleft = 0;
+		}
+		else
+		{
+			x->x_ticksleft -= n;
+			
+			asm(
+				".set T_FLOAT,4                          \n"
+				"movss     (%3),%%xmm0                   \n" /* value */
+				"shufps    $0, %%xmm0, %%xmm0            \n"
+				"movaps    (%4), %%xmm1                  \n" /* x_slopes */
+				"addps     %%xmm0, %%xmm1                \n"
 
-	float x_slope = x->x_slope;
-	if (x->x_ticksleft < n)
-	{
-	    int remain = x->x_ticksleft;
-	    n-=remain;
-	    while (remain--)
-	    {
-		*out++ = *in++ * f;
-		f+=x_slope;
-	    }
-	    while (n--)
-	    {
-		*out++ = *in++ * f;
-	    }
-	    x->x_value = f;
-	    x->x_ticksleft = 0;
-	}
-	else
-	{
-	    x->x_ticksleft -= n;
-	    while (n--)
-	    {
-		*out++ = *in++ * f;
-		f+=x_slope;
-	    }
-	    x->x_value = f;
-	}
+				"movss     (%5), %%xmm0                  \n"
+				"shufps    $0, %%xmm0, %%xmm0            \n" /* x_slope_step */
+
+				"shrl      $4, %2                        \n" /* n>>4 */
+				
+				"1:                                      \n"
+				"movaps    (%0), %%xmm2                  \n"
+				"mulps     %%xmm1, %%xmm2                \n"
+				"movaps    %%xmm2, (%1)                  \n"
+				"addps     %%xmm0, %%xmm1                \n"
+
+				"movaps    4*T_FLOAT(%0), %%xmm2         \n"
+				"mulps     %%xmm1, %%xmm2                \n"
+				"movaps    %%xmm2, 4*T_FLOAT(%1)         \n"
+				"addps     %%xmm0, %%xmm1                \n"
+
+				"movaps    8*T_FLOAT(%0), %%xmm2         \n"
+				"mulps     %%xmm1, %%xmm2                \n"
+				"movaps    %%xmm2, 8*T_FLOAT(%1)         \n"
+				"addps     %%xmm0, %%xmm1                \n"
+
+				"movaps    12*T_FLOAT(%0), %%xmm2        \n"
+				"mulps     %%xmm1, %%xmm2                \n"
+				"movaps    %%xmm2, 12*T_FLOAT(%1)        \n"
+				"addps     %%xmm0, %%xmm1                \n" /* one instr. obsolete */
+
+				"addl      $16*T_FLOAT, %0               \n"
+				"addl      $16*T_FLOAT, %1               \n"
+				"loop      1b                            \n"
+
+				:
+				:"r"(in), "r"(out), "c"(n), "r"(&(t_float)(x->x_value)),
+				"r"((t_float*)x->x_slopes), "r"(&(t_float)(x->x_slope_step))
+				:"%xmm0", "%xmm1", "%xmm2");
+			
+/* 			post("value %f", x->x_value); */
+			x->x_value += n*(x->x_slope);
+		}
     }
     else
     {
-	asm(
-	    ".set T_FLOAT,4                            \n"
-
-	    "movss     (%3), %%xmm0                   \n"
-	    "shufps    $0, %%xmm0, %%xmm0                \n"
-	    "shrl      $4, %2                        \n"
-  
-	    "volctl_loop:                              \n"
-	    "movaps    (%0), %%xmm1                   \n"
-	    "mulps     %%xmm0, %%xmm1                    \n"
-	    "movaps    %%xmm1, (%1)                   \n" 
-	    "movaps    4*T_FLOAT(%0), %%xmm2          \n"
-	    "mulps     %%xmm0, %%xmm2                    \n"
-	    "movaps    %%xmm2, 4*T_FLOAT(%1)          \n"
-	    "movaps    8*T_FLOAT(%0), %%xmm3          \n"
-	    "mulps     %%xmm0, %%xmm3                    \n"
-	    "movaps    %%xmm3, 8*T_FLOAT(%1)          \n"
-	    "movaps    12*T_FLOAT(%0), %%xmm4         \n"
-	    "mulps     %%xmm0, %%xmm4                    \n"
-	    "movaps    %%xmm4, 12*T_FLOAT(%1)         \n"
-	    "addl      $64, %0                       \n"
-	    "addl      $64, %1                       \n"
-	    "loop      volctl_loop                     \n"
-	    :
-	    : "r"(in), "r"(out),
-	    "a"(w[4]),"r"(&(x->x_value))
-	    : "%xmm0","%xmm1","%xmm2","%xmm3","%xmm4");
+		asm(
+			".set T_FLOAT,4                          \n"
+			
+			"movss     (%3), %%xmm0                  \n"
+			"shufps    $0, %%xmm0, %%xmm0            \n"
+			"shrl      $4, %2                        \n"
+			
+			"volctl_loop:                            \n"
+			"movaps    (%0), %%xmm1                  \n"
+			"mulps     %%xmm0, %%xmm1                \n"
+			"movaps    %%xmm1, (%1)                  \n" 
+			"movaps    4*T_FLOAT(%0), %%xmm2         \n"
+			"mulps     %%xmm0, %%xmm2                \n"
+			"movaps    %%xmm2, 4*T_FLOAT(%1)         \n"
+			"movaps    8*T_FLOAT(%0), %%xmm3         \n"
+			"mulps     %%xmm0, %%xmm3                \n"
+			"movaps    %%xmm3, 8*T_FLOAT(%1)         \n"
+			"movaps    12*T_FLOAT(%0), %%xmm4        \n"
+			"mulps     %%xmm0, %%xmm4                \n"
+			"movaps    %%xmm4, 12*T_FLOAT(%1)        \n"
+			"addl      $16*T_FLOAT, %0               \n"
+			"addl      $16*T_FLOAT, %1               \n"
+			"loop      volctl_loop                   \n"
+			:
+			: "r"(in), "r"(out),
+			"c"(w[4]),"r"(&(t_float)(x->x_value))
+			: "%xmm0", "%xmm1","%xmm2","%xmm3","%xmm4");
     }
     return (w+5);
 }
@@ -244,8 +307,18 @@ t_int *volctl_perf_simd(t_int *w)
 
 void volctl_set(t_volctl *x, t_float f)
 {
-    x->x_ticksleft = x->x_h / x->x_mspersample;
-    x->x_slope = (f - x->x_value) / x->x_ticksleft;
+    /* Start a linear ramp of the factor from x_value towards f lasting x_h milliseconds. */
+    t_float slope = 0; /* stays 0 when there is nothing to ramp */
+    int i, ticks = x->x_h * x->x_samples_per_ms; /* ramp length in samples */
+
+    if (ticks > 0)
+        slope = (f - x->x_value) / ticks;
+    else /* zero or negative time: jump to f; avoids a division by zero and a negative tick count */
+        x->x_value = f;
+    x->x_ticksleft = ticks > 0 ? ticks : 0;
+    x->x_slope = slope;
+    for (i = 0; i != 4; ++i) x->x_slopes[i] = i*slope; /* per-lane offsets the simd ramp adds to x_value */
+    x->x_slope_step = 4*slope; /* advance per 4-sample simd vector */
 }
 
 void volctl_dsp(t_volctl *x, t_signal **sp)
@@ -255,18 +328,18 @@ void volctl_dsp(t_volctl *x, t_signal **sp)
     	dsp_add(volctl_perform, 4, x, sp[0]->s_vec, sp[1]->s_vec, n);
     else 
     {
-	if(SIMD_CHECK2(n,sp[0]->s_vec,sp[1]->s_vec))
-	    dsp_add(volctl_perf_simd, 4, x, sp[0]->s_vec, sp[1]->s_vec, n);
-	else
-	dsp_add(volctl_perf8, 4, x, sp[0]->s_vec, sp[1]->s_vec, n);
+		if(SIMD_CHECK2(n,sp[0]->s_vec,sp[1]->s_vec))
+			dsp_add(volctl_perf_simd, 4, x, sp[0]->s_vec, sp[1]->s_vec, n);
+		else
+			dsp_add(volctl_perf8, 4, x, sp[0]->s_vec, sp[1]->s_vec, n);
     }
-    x->x_mspersample = 1000.f / sp[0]->s_sr;
+    x->x_samples_per_ms = sp[0]->s_sr / 1000.f;
 }
 
 void volctl_tilde_setup(void)
 {
-    volctl_class = class_new(gensym("volctl~"), (t_newmethod)volctl_new, 0,
-			     sizeof(t_volctl), 0, A_GIMME, 0);
+    volctl_class = class_new(gensym("volctl~"), (t_newmethod)volctl_new, 
+							 (t_method)volctl_free, sizeof(t_volctl), 0, A_GIMME, 0);
     CLASS_MAINSIGNALIN(volctl_class, t_volctl, x_f);
     class_addmethod(volctl_class, (t_method)volctl_dsp, gensym("dsp"), 0);
     class_addmethod(volctl_class, (t_method)volctl_set, gensym("f1"),A_FLOAT,0);
-- 
cgit v1.2.1