From 2f98df88850ab893c7acf8ea2b9000c03c2e17da Mon Sep 17 00:00:00 2001 From: Tim Blechmann Date: Tue, 28 Dec 2004 15:48:19 +0000 Subject: simd-optimized ramp svn path=/trunk/externals/tb/; revision=2435 --- volctl~/makefile | 2 +- volctl~/readme.txt | 3 +- volctl~/volctl~.c | 381 +++++++++++++++++++++++++++++++---------------------- 3 files changed, 229 insertions(+), 157 deletions(-) (limited to 'volctl~') diff --git a/volctl~/makefile b/volctl~/makefile index 9b4cc04..3457263 100644 --- a/volctl~/makefile +++ b/volctl~/makefile @@ -62,7 +62,7 @@ pd_linux: $(NAME).pd_linux LINUXCFLAGS = -DPD -O3 -fPIC -funroll-loops -fomit-frame-pointer \ -Wall -W -Wshadow -Wstrict-prototypes -Werror \ - -Wno-unused -Wno-parentheses -Wno-switch + -Wno-unused -Wno-parentheses -Wno-switch LINUXINCLUDE = -I../../src diff --git a/volctl~/readme.txt b/volctl~/readme.txt index 5707467..4efff34 100644 --- a/volctl~/readme.txt +++ b/volctl~/readme.txt @@ -8,7 +8,7 @@ volctl~ is doing more or less the same as |line~ 0 10 | | | | | |*~ 0| = |volctl 0 10| -| | +| | except that it is faster @@ -20,5 +20,4 @@ volctl~ will only probably only compile against pd>=devel_0_37 with gcc. i'm not planing to do a port to win/osx or any pd without aligned dsp blocks... todo: -- write complete volctl_perform_simd function in assembler - check icc's segfault \ No newline at end of file diff --git a/volctl~/volctl~.c b/volctl~/volctl~.c index 9ef96de..60abfac 100644 --- a/volctl~/volctl~.c +++ b/volctl~/volctl~.c @@ -1,31 +1,30 @@ /* Copyright (c) 2004 Tim Blechmann. - *For information on usage and redistribution, and for a DISCLAIMER OF ALL - *WARRANTIES, see the file, "gpl.txt" in this distribution. + * For information on usage and redistribution, and for a DISCLAIMER OF ALL + * WARRANTIES, see the file, "gpl.txt" in this distribution. * - *This program is free software; you can redistribute it and/or - *modify it under the terms of the GNU General Public License - *as published by the Free Software Foundation; either version 2 - *of the License, or (at your option) any later version. + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. * - *See file LICENSE for further informations on licensing terms. + * See file LICENSE for further informations on licensing terms. * - *This program is distributed in the hope that it will be useful, - *but WITHOUT ANY WARRANTY; without even the implied warranty of - *MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - *GNU General Public License for more details. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. * - *You should have received a copy of the GNU General Public License - *along with this program; if not, write to the Free Software - *Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * - *Based on PureData by Miller Puckette and others. + * Based on PureData by Miller Puckette and others. * - * coded while listening to: Julien Ottavi: Nervure Magnetique + * coded while listening to: Julien Ottavi: Nervure Magnetique * */ #include "m_pd.h" - #include "m_simd.h" @@ -36,15 +35,16 @@ static t_class *volctl_class; typedef struct _volctl { t_object x_obj; - float x_f; + t_float x_f; - float x_h; //interpolation time - float x_value; //current factor + t_float x_h; //interpolation time + t_float x_value; //current factor int x_ticksleft; //ticks to go - float x_mspersample; //ms per sample - float x_slope; //slope - + t_float x_samples_per_ms; //ms per sample + t_float x_slope; //slope + t_float * x_slopes; //slopes for simd + t_float x_slope_step; int x_line; } t_volctl; @@ -54,7 +54,7 @@ void *volctl_new(t_symbol *s, int argc, t_atom *argv) if (argc > 2) post("volctl~: extra arguments ignored"); t_volctl *x = (t_volctl *)pd_new(volctl_class); - inlet_new(&x->x_obj, &x->x_obj.ob_pd, gensym("float"), gensym("f1")); + inlet_new(&x->x_obj, &x->x_obj.ob_pd, &s_float, gensym("f1")); inlet_settip(x->x_obj.ob_inlet,gensym("factor")); x->x_value = atom_getfloatarg(0, argc, argv); @@ -62,13 +62,22 @@ void *volctl_new(t_symbol *s, int argc, t_atom *argv) inlet_settip(time,gensym("interpolation_time")); x->x_h = atom_getfloatarg(1, argc, argv); - x->x_mspersample = 1000.f / 44100; // assume default samplerate + x->x_samples_per_ms = 44100.f / 1000.f; // assume default samplerate outlet_new(&x->x_obj, &s_signal); x->x_f = 0; + + x->x_slopes = getalignedbytes(4*sizeof(t_float)); + return (x); } +static void volctl_free(t_volctl *x) +{ + freealignedbytes(x->x_slopes, 4*sizeof(t_float)); +} + + t_int *volctl_perform(t_int *w) { t_volctl * x = (t_volctl *)(w[1]); @@ -76,40 +85,40 @@ t_int *volctl_perform(t_int *w) t_float *out = (t_float *)(w[3]); int n = (int)(w[4]); - float f = x->x_value; + t_float f = x->x_value; if (x->x_ticksleft) { - float x_slope = x->x_slope; - if (x->x_ticksleft < n) - { - int remain = x->x_ticksleft; - n-=remain; - while (remain--) - { - f+=x_slope; - *out++ = *in++ * f; - } - while (n--) - { - *out++ = *in++ * f; - } - x->x_value = f; - x->x_ticksleft = 0; - } - else - { - x->x_ticksleft -=n; - while (n--) - { - f+=x_slope; - *out++ = *in++ * f; - } - x->x_value = f; - } + t_float x_slope = x->x_slope; + if (x->x_ticksleft < n) + { + int remain = x->x_ticksleft; + n-=remain; + while (remain--) + { + f+=x_slope; + *out++ = *in++ * f; + } + while (n--) + { + *out++ = *in++ * f; + } + x->x_value = f; + x->x_ticksleft = 0; + } + else + { + x->x_ticksleft -=n; + while (n--) + { + f+=x_slope; + *out++ = *in++ * f; + } + x->x_value = f; + } } else - while (n--) *out++ = *in++ * f; + while (n--) *out++ = *in++ * f; return (w+5); } @@ -122,48 +131,63 @@ t_int *volctl_perf8(t_int *w) t_float *out = (t_float *)(w[3]); int n = (int)(w[4]); - float f = x->x_value; + t_float f = x->x_value; if (x->x_ticksleft) { - float x_slope = x->x_slope; - if (x->x_ticksleft < n) - { - int remain = x->x_ticksleft; - n-=remain; - while (remain--) - { - *out++ = *in++ * f; - f+=x_slope; - } - while (n--) - { - *out++ = *in++ * f; - } - x->x_value = f; - x->x_ticksleft = 0; - } - else - { - x->x_ticksleft -= n; - while (n--) - { - *out++ = *in++ * f; - f+=x_slope; - } - x->x_value = f; - } + t_float x_slope = x->x_slope; + if (x->x_ticksleft < n) + { + int remain = x->x_ticksleft; + n-=remain; + while (remain--) + { + *out++ = *in++ * f; + f+=x_slope; + } + while (n--) + { + *out++ = *in++ * f; + } + x->x_value = f; + x->x_ticksleft = 0; + } + else + { + x->x_ticksleft -= n; + n = n>>3; + while (n--) + { + *out++ = *in++ * f; + f+=x_slope; + *out++ = *in++ * f; + f+=x_slope; + *out++ = *in++ * f; + f+=x_slope; + *out++ = *in++ * f; + f+=x_slope; + *out++ = *in++ * f; + f+=x_slope; + *out++ = *in++ * f; + f+=x_slope; + *out++ = *in++ * f; + f+=x_slope; + *out++ = *in++ * f; + f+=x_slope; + } + x->x_value = f; + } } else { - for (; n; n -= 8, in += 8, out += 8) - { - float f0 = in[0], f1 = in[1], f2 = in[2], f3 = in[3]; - float f4 = in[4], f5 = in[5], f6 = in[6], f7 = in[7]; + for (; n; n -= 8, in += 8, out += 8) + { + t_float f0 = in[0], f1 = in[1], f2 = in[2], f3 = in[3]; + t_float f4 = in[4], f5 = in[5], f6 = in[6], f7 = in[7]; - out[0] = f0 * f; out[1] = f1 * f; out[2] = f2 * f; out[3] = f3 * f; - out[4] = f4 * f; out[5] = f5 * f; out[6] = f6 * f; out[7] = f7 * f; - } + out[0] = f0 * f; out[1] = f1 * f; out[2] = f2 * f; out[3] = f3 * f; + out[4] = f4 * f; out[5] = f5 * f; out[6] = f6 * f; out[7] = f7 * f; + } } return (w+5); } @@ -176,67 +200,106 @@ t_int *volctl_perf_simd(t_int *w) if (x->x_ticksleft) { - int n = (int)(w[4]); + int n = (int)(w[4]); - float f = x->x_value; + t_float x_slope = x->x_slope; + if (x->x_ticksleft < n) + { + t_float f = x->x_value; + + int remain = x->x_ticksleft; + n-=remain; + while (remain--) + { + *out++ = *in++ * f; + f+=x_slope; + } + while (n--) + { + *out++ = *in++ * f; + } + x->x_value = f; + x->x_ticksleft = 0; + } + else + { + x->x_ticksleft -= n; + + asm( + ".set T_FLOAT,4 \n" + "movss (%3),%%xmm0 \n" /* value */ + "shufps $0, %%xmm0, %%xmm0 \n" + "movaps (%4), %%xmm1 \n" /* x_slopes */ + "addps %%xmm0, %%xmm1 \n" - float x_slope = x->x_slope; - if (x->x_ticksleft < n) - { - int remain = x->x_ticksleft; - n-=remain; - while (remain--) - { - *out++ = *in++ * f; - f+=x_slope; - } - while (n--) - { - *out++ = *in++ * f; - } - x->x_value = f; - x->x_ticksleft = 0; - } - else - { - x->x_ticksleft -= n; - while (n--) - { - *out++ = *in++ * f; - f+=x_slope; - } - x->x_value = f; - } + "movss (%5), %%xmm0 \n" + "shufps $0, %%xmm0, %%xmm0 \n" /* x_slope_step */ + + "shrl $4, %2 \n" /* n>>4 */ + + "1: \n" + "movaps (%0), %%xmm2 \n" + "mulps %%xmm1, %%xmm2 \n" + "movaps %%xmm2, (%1) \n" + "addps %%xmm0, %%xmm1 \n" + + "movaps 4*T_FLOAT(%0), %%xmm2 \n" + "mulps %%xmm1, %%xmm2 \n" + "movaps %%xmm2, 4*T_FLOAT(%1) \n" + "addps %%xmm0, %%xmm1 \n" + + "movaps 8*T_FLOAT(%0), %%xmm2 \n" + "mulps %%xmm1, %%xmm2 \n" + "movaps %%xmm2, 8*T_FLOAT(%1) \n" + "addps %%xmm0, %%xmm1 \n" + + "movaps 12*T_FLOAT(%0), %%xmm2 \n" + "mulps %%xmm1, %%xmm2 \n" + "movaps %%xmm2, 12*T_FLOAT(%1) \n" + "addps %%xmm0, %%xmm1 \n" /* one instr. obsolete */ + + "addl $16*T_FLOAT, %0 \n" + "addl $16*T_FLOAT, %1 \n" + "loop 1b \n" + + : + :"r"(in), "r"(out), "c"(n), "r"(&(t_float)(x->x_value)), + "r"((t_float*)x->x_slopes), "r"(&(t_float)(x->x_slope_step)) + :"%xmm0", "%xmm1", "%xmm2"); + +/* post("value %f", x->x_value); */ + x->x_value += n*(x->x_slope); + } } else { - asm( - ".set T_FLOAT,4 \n" - - "movss (%3), %%xmm0 \n" - "shufps $0, %%xmm0, %%xmm0 \n" - "shrl $4, %2 \n" - - "volctl_loop: \n" - "movaps (%0), %%xmm1 \n" - "mulps %%xmm0, %%xmm1 \n" - "movaps %%xmm1, (%1) \n" - "movaps 4*T_FLOAT(%0), %%xmm2 \n" - "mulps %%xmm0, %%xmm2 \n" - "movaps %%xmm2, 4*T_FLOAT(%1) \n" - "movaps 8*T_FLOAT(%0), %%xmm3 \n" - "mulps %%xmm0, %%xmm3 \n" - "movaps %%xmm3, 8*T_FLOAT(%1) \n" - "movaps 12*T_FLOAT(%0), %%xmm4 \n" - "mulps %%xmm0, %%xmm4 \n" - "movaps %%xmm4, 12*T_FLOAT(%1) \n" - "addl $64, %0 \n" - "addl $64, %1 \n" - "loop volctl_loop \n" - : - : "r"(in), "r"(out), - "a"(w[4]),"r"(&(x->x_value)) - : "%xmm0","%xmm1","%xmm2","%xmm3","%xmm4"); + asm( + ".set T_FLOAT,4 \n" + + "movss (%3), %%xmm0 \n" + "shufps $0, %%xmm0, %%xmm0 \n" + "shrl $4, %2 \n" + + "volctl_loop: \n" + "movaps (%0), %%xmm1 \n" + "mulps %%xmm0, %%xmm1 \n" + "movaps %%xmm1, (%1) \n" + "movaps 4*T_FLOAT(%0), %%xmm2 \n" + "mulps %%xmm0, %%xmm2 \n" + "movaps %%xmm2, 4*T_FLOAT(%1) \n" + "movaps 8*T_FLOAT(%0), %%xmm3 \n" + "mulps %%xmm0, %%xmm3 \n" + "movaps %%xmm3, 8*T_FLOAT(%1) \n" + "movaps 12*T_FLOAT(%0), %%xmm4 \n" + "mulps %%xmm0, %%xmm4 \n" + "movaps %%xmm4, 12*T_FLOAT(%1) \n" + "addl $16*T_FLOAT, %0 \n" + "addl $16*T_FLOAT, %1 \n" + "loop volctl_loop \n" + : + : "r"(in), "r"(out), + "c"(w[4]),"r"(&(t_float)(x->x_value)) + : "%xmm0", "%xmm1","%xmm2","%xmm3","%xmm4"); } return (w+5); } @@ -244,8 +307,18 @@ t_int *volctl_perf_simd(t_int *w) void volctl_set(t_volctl *x, t_float f) { - x->x_ticksleft = x->x_h / x->x_mspersample; - x->x_slope = (f - x->x_value) / x->x_ticksleft; + t_float slope; + int i; + + x->x_ticksleft = x->x_h * x->x_samples_per_ms; + slope = (f - x->x_value) / x->x_ticksleft; + x->x_slope = slope; + + for (i = 0; i != 4; ++i) + { + x->x_slopes[i] = i*slope; + } + x->x_slope_step = 4*slope; } void volctl_dsp(t_volctl *x, t_signal **sp) @@ -255,18 +328,18 @@ void volctl_dsp(t_volctl *x, t_signal **sp) dsp_add(volctl_perform, 4, x, sp[0]->s_vec, sp[1]->s_vec, n); else { - if(SIMD_CHECK2(n,sp[0]->s_vec,sp[1]->s_vec)) - dsp_add(volctl_perf_simd, 4, x, sp[0]->s_vec, sp[1]->s_vec, n); - else - dsp_add(volctl_perf8, 4, x, sp[0]->s_vec, sp[1]->s_vec, n); + if(SIMD_CHECK2(n,sp[0]->s_vec,sp[1]->s_vec)) + dsp_add(volctl_perf_simd, 4, x, sp[0]->s_vec, sp[1]->s_vec, n); + else + dsp_add(volctl_perf8, 4, x, sp[0]->s_vec, sp[1]->s_vec, n); } - x->x_mspersample = 1000.f / sp[0]->s_sr; + x->x_samples_per_ms = sp[0]->s_sr / 1000.f; } void volctl_tilde_setup(void) { - volctl_class = class_new(gensym("volctl~"), (t_newmethod)volctl_new, 0, - sizeof(t_volctl), 0, A_GIMME, 0); + volctl_class = class_new(gensym("volctl~"), (t_newmethod)volctl_new, + (t_method)volctl_free, sizeof(t_volctl), 0, A_GIMME, 0); CLASS_MAINSIGNALIN(volctl_class, t_volctl, x_f); class_addmethod(volctl_class, (t_method)volctl_dsp, gensym("dsp"), 0); class_addmethod(volctl_class, (t_method)volctl_set, gensym("f1"),A_FLOAT,0); -- cgit v1.2.1