/* Copyright 2003-2005 Benjamin R. Saylor * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ // Thu Feb 17 22:29:27 CST 2005 - Changed outbuf loops to use comparison instead of modulo (suggested by c. clepper) - faster // Sat Jul 30 19:59:08 AKDT 2005 - Completed modification for re-blocking and dividing up the work between calls. // This has eliminated dropouts due to lots of work on large block boundaries. // Fri Aug 5 10:46:33 AKDT 2005 - accumulate in the frequency domain, so only 1 IFFT is needed per input block, // rather than IFFTs. Big performance boost. // Fri Aug 5 20:27:05 AKDT 2005 - should work properly with arbitrary (2^n) blocksize <= partsize // Fri Aug 12 00:32:29 AKDT 2005 - added altivec code by Chris Clepper // TODO // SSE version // multichannel (multiple IRs)? probably wouldn't gain much from this // divide work more evenly? (0, 15, 7, 23, 3, 11, 19, 27, ...) // someday, an SSE3 version (supposed to make complex math fast) #include #include #include #include "m_pd.h" #ifdef __VEC__ #include #endif #define MAXPARTS 256 // max number of partitions #ifdef USE_SSE typedef float v4sf __attribute__ ((vector_size (16))); #endif static t_class *partconv_class; struct sumbuffer { int index; fftwf_complex *fd; float *td; fftwf_plan plan; int readpos; struct sumbuffer *next, *prev; }; typedef struct _partconv { t_object x_obj; t_symbol *arrayname; int partsize; int fftsize; float scale; int paddedsize; int nbins; int nparts; int ir_prepared; int pd_blocksize; // partitions of impulse response fftwf_plan irpart_plan; float *irpart_td[MAXPARTS]; fftwf_complex *irpart_fd[MAXPARTS]; // input fftwf_plan input_plan; float *inbuf; int inbufpos; float *input_td; fftwf_complex *input_fd; // circular array/list of buffers for accumulating results of convolution struct sumbuffer sumbufs[MAXPARTS+2]; int nsumbufs; // number of sumbufs struct sumbuffer *sumbuf; // the current sumbuf corresponding to the first partition of the IR // dividing up the work between calls to perform() int parts_per_call[MAXPARTS]; // parts_per_call[c] is the number of partitions to convolve during perform() call c int curcall; // current call, counted from the beginning of the current cycle (input buffer full) int curpart; // current partition to convolve } t_partconv; // Determine how to divide the work as evenly as possible between calls to perform(). static void partconv_deal_work(t_partconv *x) { int calls_per_cycle; int parts_to_distribute; int i; // Like dealing cards. // One cycle is defined as the time it takes to fill the input buffer (whose size is the user-given partition size) calls_per_cycle = x->partsize / x->pd_blocksize; for (i = 0; i < calls_per_cycle; i++) { x->parts_per_call[i] = 0; } i = 0; parts_to_distribute = x->nparts; while (parts_to_distribute) { x->parts_per_call[i]++; parts_to_distribute--; i = (i + 1) % calls_per_cycle; } /* for (i = 0; i < calls_per_cycle; i++) { printf("parts_per_call[%d] = %d\n", i, x->parts_per_call[i]); } */ } #ifdef __VEC__ #include "altivec-perform.inc.c" #else static t_int *partconv_perform(t_int *w) { t_partconv *x = (t_partconv *)(w[1]); t_float *in = (t_float *)(w[2]); t_float *out = (t_float *)(w[3]); int n = (int)(w[4]); int i; int j; int k; // bin int p; // partition int endpart; #ifdef USE_SSE int v1; int v2; int nvecs; v4sf *cursumbuf_fd; v4sf *input_fd; v4sf *irpart_fd; #else fftwf_complex *cursumbuf_fd; fftwf_complex *input_fd; fftwf_complex *irpart_fd; #endif float *sumbuf1ptr; float *sumbuf2ptr; memcpy(&(x->inbuf[x->inbufpos]), in, n*sizeof(float)); // gather a block of input into input buffer x->inbufpos += n; if (x->inbufpos >= x->partsize) { // input buffer is full, so we begin a new cycle if (x->pd_blocksize != n) { // the patch's blocksize has change since we last dealt the work x->pd_blocksize = n; partconv_deal_work(x); } x->inbufpos = 0; x->curcall = 0; x->curpart = 0; memcpy(x->input_td, x->inbuf, x->partsize * sizeof(float)); // copy 'gathering' input buffer into 'transform' buffer memset(&(x->input_td[x->partsize]), 0, (x->paddedsize - x->partsize) * sizeof(float)); // pad fftwf_execute(x->input_plan); // transform the input // everything has been read out of prev sumbuf, so clear it memset(x->sumbuf->prev->td, 0, x->paddedsize * sizeof(float)); // advance sumbuf pointers x->sumbuf = x->sumbuf->next; x->sumbuf->readpos = 0; x->sumbuf->prev->readpos = x->partsize; } // convolve this call's portion of partitions endpart = x->curpart + x->parts_per_call[x->curcall]; if (endpart > x->nparts) // FIXME does this ever happen? endpart = x->nparts; for (p = x->curpart; p < endpart; p++) { // multiply the input block by the partition, accumulating the result in the appropriate sumbuf #ifdef USE_SSE #include "sse-conv.inc.c" #else cursumbuf_fd = x->sumbufs[(x->sumbuf->index + p) % x->nsumbufs].fd; input_fd = x->input_fd; irpart_fd = x->irpart_fd[p]; for (k = 0; k < x->nbins; k++) { cursumbuf_fd[k][0] += ( input_fd[k][0] * irpart_fd[k][0] - input_fd[k][1] * irpart_fd[k][1]); cursumbuf_fd[k][1] += ( input_fd[k][0] * irpart_fd[k][1] + input_fd[k][1] * irpart_fd[k][0]); } #endif } x->curpart = p; // The convolution of the fresh block of input with the first partition of the IR // is the last thing that gets summed into the current sumbuf before it gets IFFTed and starts being output. // This happens during the first call of every cycle. if (x->curcall == 0) { // current sumbuf has been filled, so transform it // Output loop will begin to read it and sum it with the last one fftwf_execute(x->sumbuf->plan); } // we're summing and outputting the first half of the most recently IFFTed sumbuf // and the second half of the previous one sumbuf1ptr = &(x->sumbuf->td[x->sumbuf->readpos]); sumbuf2ptr = &(x->sumbuf->prev->td[x->sumbuf->prev->readpos]); for (i = 0; i < n; i++) { out[i] = (sumbuf1ptr[i] + sumbuf2ptr[i]) * x->scale; } x->sumbuf->readpos += n; x->sumbuf->prev->readpos += n; x->curcall++; return (w+5); } #endif // __VEC__ static void partconv_free(t_partconv *x) { int i; fftwf_free(x->inbuf); for (i = 0; i < x->nparts; i++) fftwf_free(x->irpart_td[i]); fftwf_free(x->input_td); fftwf_destroy_plan(x->input_plan); for (i = 0; i < x->nsumbufs; i++) { fftwf_free(x->sumbufs[i].fd); fftwf_destroy_plan(x->sumbufs[i].plan); } } static void partconv_set(t_partconv *x, t_symbol *s) { int i; int j; t_garray *arrayobj; t_word *array; int arraysize; int arraypos; // get the array from pd x->arrayname = s; if ( ! (arrayobj = (t_garray *)pd_findbyclass(x->arrayname, garray_class))) { if (*x->arrayname->s_name) { pd_error(x, "partconv~: %s: no such array", x->arrayname->s_name); return; } } else if ( ! garray_getfloatwords(arrayobj, &arraysize, &array)) { pd_error(x, "%s: bad template", x->arrayname->s_name); return; } // if the IR has already been prepared, free everything first if (x->ir_prepared == 1) { partconv_free(x); } // caculate number of partitions x->nparts = arraysize / x->partsize; if (arraysize % x->partsize != 0) x->nparts++; if (x->nparts > MAXPARTS) x->nparts = MAXPARTS; // allocate, fill, pad, and transform each IR partition for (arraypos = 0, i = 0; i < x->nparts; i++) { x->irpart_td[i] = fftwf_malloc(sizeof(float) * x->paddedsize); x->irpart_fd[i] = (fftwf_complex *) x->irpart_td[i]; x->irpart_plan = fftwf_plan_dft_r2c_1d(x->fftsize, x->irpart_td[i], x->irpart_fd[i], FFTW_MEASURE); for (j = 0; j < x->partsize && arraypos < arraysize; j++, arraypos++) { x->irpart_td[i][j] = array[arraypos].w_float; } for ( ; j < x->paddedsize; j++) { x->irpart_td[i][j] = 0; } fftwf_execute(x->irpart_plan); fftwf_destroy_plan(x->irpart_plan); // now, x->irpart[i] contains the DFT of the ith partition of the impulse response. } x->inbuf = fftwf_malloc(sizeof(float) * x->partsize); x->inbufpos = 0; // allocate buffer for DFT of padded input x->input_td = fftwf_malloc(sizeof(float) * x->paddedsize); // float array into which input block is copied and padded x->input_fd = (fftwf_complex *) x->input_td; // fftwf_complex pointer to the same array to facilitate in-place fft x->input_plan = fftwf_plan_dft_r2c_1d(x->fftsize, x->input_td, x->input_fd, FFTW_MEASURE); // set up circular list/array of buffers for accumulating results of convolution x->nsumbufs = x->nparts + 2; x->sumbuf = &(x->sumbufs[0]); for (i = 0; i < x->nsumbufs; i++) { x->sumbufs[i].index = i; x->sumbufs[i].fd = fftwf_malloc(sizeof(float) * x->paddedsize); memset(x->sumbufs[i].fd, 0, sizeof(float) * x->paddedsize); x->sumbufs[i].td = (float *) x->sumbufs[i].fd; x->sumbufs[i].plan = fftwf_plan_dft_c2r_1d(x->fftsize, x->sumbufs[i].fd, x->sumbufs[i].td, FFTW_MEASURE); x->sumbufs[i].readpos = 0; } x->sumbufs[0].next = &(x->sumbufs[1]); x->sumbufs[0].prev = &(x->sumbufs[x->nsumbufs - 1]); for (i = 1; i < x->nsumbufs; i++) { x->sumbufs[i].next = &(x->sumbufs[(i + 1) % x->nsumbufs]); x->sumbufs[i].prev = &(x->sumbufs[i - 1]); } partconv_deal_work(x); x->curcall = 0; x->curpart = 0; post("partconv~: using %s in %d partitions with FFT-size %d", x->arrayname->s_name, x->nparts, x->fftsize); x->ir_prepared = 1; } static void partconv_dsp(t_partconv *x, t_signal **sp) { // if the ir array has not been prepared, prepare it if (x->ir_prepared == 0) { partconv_set(x, x->arrayname); } dsp_add(partconv_perform, 4, x, sp[0]->s_vec, sp[1]->s_vec, sp[0]->s_n); } static void *partconv_new(t_symbol *s, int argc, t_atom *argv) { t_partconv *x = (t_partconv *)pd_new(partconv_class); outlet_new(&x->x_obj, gensym("signal")); if (argc != 2) { post("argc = %d", argc); error("partconv~: usage: [partconv~ ]\n\t- partition size must be a power of 2 >= blocksize"); return NULL; } x->arrayname = atom_getsymbol(argv); x->partsize = atom_getfloatarg(1, argc, argv); if (x->partsize <= 0 || x->partsize != (1 << ilog2(x->partsize))) { error("partconv~: partition size not a power of 2"); return NULL; } x->fftsize = 2 * x->partsize; x->scale = 1 / ((float) x->fftsize); // need 2*(n/2+1) float array for in-place transform, where n is fftsize. #ifdef USE_SSE // for sse, make it a multiple of 8, because we pull in 4 bins at a time and don't want to get a segfault x->paddedsize = 2 * (x->fftsize / 2 + 4); #else x->paddedsize = 2 * (x->fftsize / 2 + 1); #endif x->nbins = x->fftsize / 2 + 1; x->ir_prepared = 0; x->pd_blocksize = sys_getblksize(); return (x); } void partconv_tilde_setup(void) { partconv_class = class_new(gensym("partconv~"), (t_newmethod)partconv_new, (t_method)partconv_free, sizeof(t_partconv), 0, A_GIMME, 0); class_addmethod(partconv_class, nullfn, gensym("signal"), 0); class_addmethod(partconv_class, (t_method) partconv_dsp, gensym("dsp"), 0); class_addmethod(partconv_class, (t_method) partconv_set, gensym("set"), A_DEFSYMBOL, 0); }