aboutsummaryrefslogtreecommitdiff
path: root/partconv~.c
diff options
context:
space:
mode:
Diffstat (limited to 'partconv~.c')
-rw-r--r--partconv~.c382
1 files changed, 382 insertions, 0 deletions
diff --git a/partconv~.c b/partconv~.c
new file mode 100644
index 0000000..60175f2
--- /dev/null
+++ b/partconv~.c
@@ -0,0 +1,382 @@
+/* Copyright 2003-2005 Benjamin R. Saylor <bensaylor@fastmail.fm>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+// Thu Feb 17 22:29:27 CST 2005 - Changed outbuf loops to use comparison instead of modulo (suggested by c. clepper) - faster
+// Sat Jul 30 19:59:08 AKDT 2005 - Completed modification for re-blocking and dividing up the work between calls.
+// This has eliminated dropouts due to lots of work on large block boundaries.
+// Fri Aug 5 10:46:33 AKDT 2005 - accumulate in the frequency domain, so only 1 IFFT is needed per input block,
+// rather than <nparts> IFFTs. Big performance boost.
+// Fri Aug 5 20:27:05 AKDT 2005 - should work properly with arbitrary (2^n) blocksize <= partsize
+// Fri Aug 12 00:32:29 AKDT 2005 - added altivec code by Chris Clepper
+
+// TODO
+// SSE version
+// multichannel (multiple IRs)? probably wouldn't gain much from this
+// divide work more evenly? (0, 15, 7, 23, 3, 11, 19, 27, ...)
+// someday, an SSE3 version (supposed to make complex math fast)
+
+#include <math.h>
+#include <string.h>
+#include <fftw3.h>
+#include "m_pd.h"
+
+#define MAXPARTS 256 // max number of partitions
+
+#ifdef USE_SSE
+typedef float v4sf __attribute__ ((vector_size (16)));
+#endif
+
+static t_class *partconv_class;
+
+struct sumbuffer {
+ int index;
+ fftwf_complex *fd;
+ float *td;
+ fftwf_plan plan;
+ int readpos;
+ struct sumbuffer *next, *prev;
+};
+
+typedef struct _partconv {
+ t_object x_obj;
+ t_symbol *arrayname;
+ int partsize;
+ int fftsize;
+ float scale;
+ int paddedsize;
+ int nbins;
+ int nparts;
+ int ir_prepared;
+ int pd_blocksize;
+
+ // partitions of impulse response
+ fftwf_plan irpart_plan;
+ float *irpart_td[MAXPARTS];
+ fftwf_complex *irpart_fd[MAXPARTS];
+
+ // input
+ fftwf_plan input_plan;
+ float *inbuf;
+ int inbufpos;
+ float *input_td;
+ fftwf_complex *input_fd;
+
+ // circular array/list of buffers for accumulating results of convolution
+ struct sumbuffer sumbufs[MAXPARTS+2];
+ int nsumbufs; // number of sumbufs
+ struct sumbuffer *sumbuf; // the current sumbuf corresponding to the first partition of the IR
+
+ // dividing up the work between calls to perform()
+ int parts_per_call[MAXPARTS]; // parts_per_call[c] is the number of partitions to convolve during perform() call c
+ int curcall; // current call, counted from the beginning of the current cycle (input buffer full)
+ int curpart; // current partition to convolve
+} t_partconv;
+
+// Determine how to divide the work as evenly as possible between calls to perform().
+static void partconv_deal_work(t_partconv *x)
+{
+ int calls_per_cycle;
+ int parts_to_distribute;
+ int i;
+
+ // Like dealing cards.
+ // One cycle is defined as the time it takes to fill the input buffer (whose size is the user-given partition size)
+ calls_per_cycle = x->partsize / x->pd_blocksize;
+ for (i = 0; i < calls_per_cycle; i++) {
+ x->parts_per_call[i] = 0;
+ }
+ i = 0;
+ parts_to_distribute = x->nparts;
+ while (parts_to_distribute) {
+ x->parts_per_call[i]++;
+ parts_to_distribute--;
+ i = (i + 1) % calls_per_cycle;
+ }
+ /*
+ for (i = 0; i < calls_per_cycle; i++) {
+ printf("parts_per_call[%d] = %d\n", i, x->parts_per_call[i]);
+ }
+ */
+}
+
+#ifdef __VEC__
+#include "altivec-perform.inc.c"
+#else
+
+static t_int *partconv_perform(t_int *w)
+{
+ t_partconv *x = (t_partconv *)(w[1]);
+ t_float *in = (t_float *)(w[2]);
+ t_float *out = (t_float *)(w[3]);
+ int n = (int)(w[4]);
+ int i;
+ int j;
+ int k; // bin
+ int p; // partition
+ int endpart;
+
+#ifdef USE_SSE
+ int v1;
+ int v2;
+ int nvecs;
+ v4sf *cursumbuf_fd;
+ v4sf *input_fd;
+ v4sf *irpart_fd;
+#else
+ fftwf_complex *cursumbuf_fd;
+ fftwf_complex *input_fd;
+ fftwf_complex *irpart_fd;
+#endif
+
+ float *sumbuf1ptr;
+ float *sumbuf2ptr;
+
+ memcpy(&(x->inbuf[x->inbufpos]), in, n*sizeof(float)); // gather a block of input into input buffer
+ x->inbufpos += n;
+ if (x->inbufpos >= x->partsize) {
+ // input buffer is full, so we begin a new cycle
+
+ if (x->pd_blocksize != n) {
+ // the patch's blocksize has change since we last dealt the work
+ x->pd_blocksize = n;
+ partconv_deal_work(x);
+ }
+
+ x->inbufpos = 0;
+ x->curcall = 0;
+ x->curpart = 0;
+ memcpy(x->input_td, x->inbuf, x->partsize * sizeof(float)); // copy 'gathering' input buffer into 'transform' buffer
+ memset(&(x->input_td[x->partsize]), 0, (x->paddedsize - x->partsize) * sizeof(float)); // pad
+
+ fftwf_execute(x->input_plan); // transform the input
+
+ // everything has been read out of prev sumbuf, so clear it
+ memset(x->sumbuf->prev->td, 0, x->paddedsize * sizeof(float));
+
+ // advance sumbuf pointers
+ x->sumbuf = x->sumbuf->next;
+ x->sumbuf->readpos = 0;
+ x->sumbuf->prev->readpos = x->partsize;
+ }
+
+ // convolve this call's portion of partitions
+ endpart = x->curpart + x->parts_per_call[x->curcall];
+ if (endpart > x->nparts) // FIXME does this ever happen?
+ endpart = x->nparts;
+ for (p = x->curpart; p < endpart; p++) {
+ // multiply the input block by the partition, accumulating the result in the appropriate sumbuf
+#ifdef USE_SSE
+#include "sse-conv.inc.c"
+#else
+ cursumbuf_fd = x->sumbufs[(x->sumbuf->index + p) % x->nsumbufs].fd;
+ input_fd = x->input_fd;
+ irpart_fd = x->irpart_fd[p];
+
+ for (k = 0; k < x->nbins; k++) {
+
+ cursumbuf_fd[k][0]
+ +=
+ ( input_fd[k][0] * irpart_fd[k][0]
+ - input_fd[k][1] * irpart_fd[k][1]);
+
+ cursumbuf_fd[k][1]
+ +=
+ ( input_fd[k][0] * irpart_fd[k][1]
+ + input_fd[k][1] * irpart_fd[k][0]);
+ }
+#endif
+ }
+ x->curpart = p;
+
+ // The convolution of the fresh block of input with the first partition of the IR
+ // is the last thing that gets summed into the current sumbuf before it gets IFFTed and starts being output.
+ // This happens during the first call of every cycle.
+ if (x->curcall == 0) {
+ // current sumbuf has been filled, so transform it
+ // Output loop will begin to read it and sum it with the last one
+ fftwf_execute(x->sumbuf->plan);
+ }
+
+ // we're summing and outputting the first half of the most recently IFFTed sumbuf
+ // and the second half of the previous one
+ sumbuf1ptr = &(x->sumbuf->td[x->sumbuf->readpos]);
+ sumbuf2ptr = &(x->sumbuf->prev->td[x->sumbuf->prev->readpos]);
+ for (i = 0; i < n; i++) {
+ out[i] = (sumbuf1ptr[i] + sumbuf2ptr[i]) * x->scale;
+ }
+ x->sumbuf->readpos += n;
+ x->sumbuf->prev->readpos += n;
+
+ x->curcall++;
+
+ return (w+5);
+}
+
+#endif // __VEC__
+
+static void partconv_free(t_partconv *x)
+{
+ int i;
+
+ fftwf_free(x->inbuf);
+ for (i = 0; i < x->nparts; i++)
+ fftwf_free(x->irpart_td[i]);
+ fftwf_free(x->input_td);
+ fftwf_destroy_plan(x->input_plan);
+ for (i = 0; i < x->nsumbufs; i++) {
+ fftwf_free(x->sumbufs[i].fd);
+ fftwf_destroy_plan(x->sumbufs[i].plan);
+ }
+}
+
+static void partconv_set(t_partconv *x, t_symbol *s)
+{
+ int i;
+ int j;
+ t_garray *arrayobj;
+ t_float *array;
+ int arraysize;
+ int arraypos;
+
+ // get the array from pd
+ x->arrayname = s;
+ if ( ! (arrayobj = (t_garray *)pd_findbyclass(x->arrayname, garray_class))) {
+ if (*x->arrayname->s_name) {
+ pd_error(x, "partconv~: %s: no such array", x->arrayname->s_name);
+ return;
+ }
+ } else if ( ! garray_getfloatarray(arrayobj, &arraysize, &array)) {
+ pd_error(x, "%s: bad template", x->arrayname->s_name);
+ return;
+ }
+
+ // if the IR has already been prepared, free everything first
+ if (x->ir_prepared == 1) {
+ partconv_free(x);
+ }
+
+ // caculate number of partitions
+ x->nparts = arraysize / x->partsize;
+ if (arraysize % x->partsize != 0)
+ x->nparts++;
+ if (x->nparts > MAXPARTS)
+ x->nparts = MAXPARTS;
+
+ // allocate, fill, pad, and transform each IR partition
+ for (arraypos = 0, i = 0; i < x->nparts; i++) {
+ x->irpart_td[i] = fftwf_malloc(sizeof(float) * x->paddedsize);
+ x->irpart_fd[i] = (fftwf_complex *) x->irpart_td[i];
+ x->irpart_plan = fftwf_plan_dft_r2c_1d(x->fftsize, x->irpart_td[i], x->irpart_fd[i], FFTW_MEASURE);
+ for (j = 0; j < x->partsize && arraypos < arraysize; j++, arraypos++) {
+ x->irpart_td[i][j] = array[arraypos];
+ }
+ for ( ; j < x->paddedsize; j++) {
+ x->irpart_td[i][j] = 0;
+ }
+ fftwf_execute(x->irpart_plan);
+ fftwf_destroy_plan(x->irpart_plan);
+ // now, x->irpart[i] contains the DFT of the ith partition of the impulse response.
+ }
+
+ x->inbuf = fftwf_malloc(sizeof(float) * x->partsize);
+ x->inbufpos = 0;
+
+ // allocate buffer for DFT of padded input
+ x->input_td = fftwf_malloc(sizeof(float) * x->paddedsize); // float array into which input block is copied and padded
+ x->input_fd = (fftwf_complex *) x->input_td; // fftwf_complex pointer to the same array to facilitate in-place fft
+ x->input_plan = fftwf_plan_dft_r2c_1d(x->fftsize, x->input_td, x->input_fd, FFTW_MEASURE);
+
+ // set up circular list/array of buffers for accumulating results of convolution
+ x->nsumbufs = x->nparts + 2;
+ x->sumbuf = &(x->sumbufs[0]);
+ for (i = 0; i < x->nsumbufs; i++) {
+ x->sumbufs[i].index = i;
+ x->sumbufs[i].fd = fftwf_malloc(sizeof(float) * x->paddedsize);
+ memset(x->sumbufs[i].fd, 0, sizeof(float) * x->paddedsize);
+ x->sumbufs[i].td = (float *) x->sumbufs[i].fd;
+ x->sumbufs[i].plan = fftwf_plan_dft_c2r_1d(x->fftsize, x->sumbufs[i].fd, x->sumbufs[i].td, FFTW_MEASURE);
+ x->sumbufs[i].readpos = 0;
+ }
+ x->sumbufs[0].next = &(x->sumbufs[1]);
+ x->sumbufs[0].prev = &(x->sumbufs[x->nsumbufs - 1]);
+ for (i = 1; i < x->nsumbufs; i++) {
+ x->sumbufs[i].next = &(x->sumbufs[(i + 1) % x->nsumbufs]);
+ x->sumbufs[i].prev = &(x->sumbufs[i - 1]);
+ }
+
+ partconv_deal_work(x);
+ x->curcall = 0;
+ x->curpart = 0;
+
+ post("partconv~: using %s in %d partitions with FFT-size %d", x->arrayname->s_name, x->nparts, x->fftsize);
+ x->ir_prepared = 1;
+}
+
+static void partconv_dsp(t_partconv *x, t_signal **sp)
+{
+ // if the ir array has not been prepared, prepare it
+ if (x->ir_prepared == 0) {
+ partconv_set(x, x->arrayname);
+ }
+
+ dsp_add(partconv_perform, 4, x, sp[0]->s_vec, sp[1]->s_vec, sp[0]->s_n);
+}
+
+static void *partconv_new(t_symbol *s, int argc, t_atom *argv)
+{
+ t_partconv *x = (t_partconv *)pd_new(partconv_class);
+
+ outlet_new(&x->x_obj, gensym("signal"));
+
+ if (argc != 2) {
+ post("argc = %d", argc);
+ error("partconv~: usage: [partconv~ <arrayname> <partsize>]\n\t- partition size must be a power of 2 >= blocksize");
+ return NULL;
+ }
+
+ x->arrayname = atom_getsymbol(argv);
+ x->partsize = atom_getfloatarg(1, argc, argv);
+ if (x->partsize <= 0 || x->partsize != (1 << ilog2(x->partsize)))
+ {
+ error("partconv~: partition size not a power of 2");
+ return NULL;
+ }
+ x->fftsize = 2 * x->partsize;
+ x->scale = 1 / ((float) x->fftsize);
+
+ // need 2*(n/2+1) float array for in-place transform, where n is fftsize.
+#ifdef USE_SSE
+ // for sse, make it a multiple of 8, because we pull in 4 bins at a time and don't want to get a segfault
+ x->paddedsize = 2 * (x->fftsize / 2 + 4);
+#else
+ x->paddedsize = 2 * (x->fftsize / 2 + 1);
+#endif
+ x->nbins = x->fftsize / 2 + 1;
+ x->ir_prepared = 0;
+ x->pd_blocksize = sys_getblksize();
+
+ return (x);
+}
+
+void partconv_tilde_setup(void)
+{
+ partconv_class = class_new(gensym("partconv~"), (t_newmethod)partconv_new,
+ (t_method)partconv_free, sizeof(t_partconv), 0, A_GIMME, 0);
+ class_addmethod(partconv_class, nullfn, gensym("signal"), 0);
+ class_addmethod(partconv_class, (t_method) partconv_dsp, gensym("dsp"), 0);
+ class_addmethod(partconv_class, (t_method) partconv_set, gensym("set"), A_DEFSYMBOL, 0);
+}