aboutsummaryrefslogtreecommitdiff
path: root/partconv~.c
blob: 373d58036c830eea7b3345489735f31a62af2da9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
/* Copyright 2003-2005 Benjamin R. Saylor <bensaylor@fastmail.fm>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

// Thu Feb 17 22:29:27 CST 2005 - Changed outbuf loops to use comparison instead of modulo (suggested by c. clepper) - faster
// Sat Jul 30 19:59:08 AKDT 2005 - Completed modification for re-blocking and dividing up the work between calls.
// 					This has eliminated dropouts due to lots of work on large block boundaries.
// Fri Aug  5 10:46:33 AKDT 2005 - accumulate in the frequency domain, so only 1 IFFT is needed per input block,
// 					rather than <nparts> IFFTs.  Big performance boost.
// Fri Aug  5 20:27:05 AKDT 2005 - should work properly with arbitrary (2^n) blocksize <= partsize
// Fri Aug 12 00:32:29 AKDT 2005 - added altivec code by Chris Clepper

// TODO
// SSE version
// multichannel (multiple IRs)?  probably wouldn't gain much from this
// divide work more evenly? (0, 15, 7, 23, 3, 11, 19, 27, ...)
// someday, an SSE3 version (supposed to make complex math fast)

#include <math.h>
#include <string.h>
#include <fftw3.h>
#include "m_pd.h"

#ifdef __VEC__
#include <altivec.h>
#endif

#define MAXPARTS 256	// max number of partitions

#ifdef USE_SSE
typedef float v4sf __attribute__ ((vector_size (16)));
#endif

static t_class *partconv_class;

struct sumbuffer {
	int index;
	fftwf_complex *fd;
	float *td;
	fftwf_plan plan;
	int readpos;
	struct sumbuffer *next, *prev;
};

typedef struct _partconv {
	t_object x_obj;
	t_symbol *arrayname;
	int partsize;
	int fftsize;
	float scale;
	int paddedsize;
	int nbins;
	int nparts;
	int ir_prepared;
	int pd_blocksize;

	// partitions of impulse response
	fftwf_plan irpart_plan;
	float *irpart_td[MAXPARTS];
	fftwf_complex *irpart_fd[MAXPARTS];

	// input
	fftwf_plan input_plan;
	float *inbuf;
	int inbufpos;
	float *input_td;
	fftwf_complex *input_fd;

	// circular array/list of buffers for accumulating results of convolution
	struct sumbuffer sumbufs[MAXPARTS+2];
	int nsumbufs;  // number of sumbufs
	struct sumbuffer *sumbuf;  // the current sumbuf corresponding to the first partition of the IR

	// dividing up the work between calls to perform()
	int parts_per_call[MAXPARTS];	// parts_per_call[c] is the number of partitions to convolve during perform() call c
	int curcall;			// current call, counted from the beginning of the current cycle (input buffer full)
	int curpart;			// current partition to convolve
} t_partconv;

// Determine how to divide the work as evenly as possible between calls to perform().
static void partconv_deal_work(t_partconv *x)
{
	int calls_per_cycle;
	int parts_to_distribute;
	int i;

	// Like dealing cards.
	// One cycle is defined as the time it takes to fill the input buffer (whose size is the user-given partition size)
	calls_per_cycle = x->partsize / x->pd_blocksize;
	for (i = 0; i < calls_per_cycle; i++) {
		x->parts_per_call[i] = 0;
	}
	i = 0;
	parts_to_distribute = x->nparts;
	while (parts_to_distribute) {
		x->parts_per_call[i]++;
		parts_to_distribute--;
		i = (i + 1) % calls_per_cycle;
	}
	/*
	for (i = 0; i < calls_per_cycle; i++) {
		printf("parts_per_call[%d] = %d\n", i, x->parts_per_call[i]);
	}
	*/
}

#ifdef __VEC__
#include "altivec-perform.inc.c"
#else

static t_int *partconv_perform(t_int *w)
{
	t_partconv *x = (t_partconv *)(w[1]);
	t_float *in = (t_float *)(w[2]);
	t_float *out = (t_float *)(w[3]);
	int n = (int)(w[4]);
	int i;
	int j;
	int k;	// bin
	int p;	// partition
	int endpart;

#ifdef USE_SSE
	int v1;
	int v2;
	int nvecs;
	v4sf *cursumbuf_fd;
	v4sf *input_fd;
	v4sf *irpart_fd;
#else
	fftwf_complex *cursumbuf_fd;
	fftwf_complex *input_fd;
	fftwf_complex *irpart_fd;
#endif

	float *sumbuf1ptr;
	float *sumbuf2ptr;

	memcpy(&(x->inbuf[x->inbufpos]), in, n*sizeof(float));  // gather a block of input into input buffer
	x->inbufpos += n;
	if (x->inbufpos >= x->partsize) {
		// input buffer is full, so we begin a new cycle
		
		if (x->pd_blocksize != n) {
			// the patch's blocksize has change since we last dealt the work
			x->pd_blocksize = n;
			partconv_deal_work(x);
		}
		
		x->inbufpos = 0;
		x->curcall = 0;
		x->curpart = 0;
		memcpy(x->input_td, x->inbuf, x->partsize * sizeof(float));  // copy 'gathering' input buffer into 'transform' buffer
		memset(&(x->input_td[x->partsize]), 0, (x->paddedsize - x->partsize) * sizeof(float));  // pad

		fftwf_execute(x->input_plan);  // transform the input

		// everything has been read out of prev sumbuf, so clear it
		memset(x->sumbuf->prev->td, 0,  x->paddedsize * sizeof(float));

		// advance sumbuf pointers
		x->sumbuf = x->sumbuf->next;
		x->sumbuf->readpos = 0;
		x->sumbuf->prev->readpos = x->partsize;
	}

	// convolve this call's portion of partitions
	endpart = x->curpart + x->parts_per_call[x->curcall];
	if (endpart > x->nparts)  // FIXME does this ever happen?
		endpart = x->nparts;
	for (p = x->curpart; p < endpart; p++) {
		// multiply the input block by the partition, accumulating the result in the appropriate sumbuf
#ifdef USE_SSE
#include "sse-conv.inc.c"
#else
		cursumbuf_fd = x->sumbufs[(x->sumbuf->index + p) % x->nsumbufs].fd;
		input_fd = x->input_fd;
		irpart_fd = x->irpart_fd[p];

		for (k = 0; k < x->nbins; k++) {

			cursumbuf_fd[k][0]
				+=
				(  input_fd[k][0] * irpart_fd[k][0]
				 - input_fd[k][1] * irpart_fd[k][1]);

			cursumbuf_fd[k][1]
				+=
				(  input_fd[k][0] * irpart_fd[k][1]
				 + input_fd[k][1] * irpart_fd[k][0]);
		}
#endif
	}
	x->curpart = p;

	// The convolution of the fresh block of input with the first partition of the IR
	// is the last thing that gets summed into the current sumbuf before it gets IFFTed and starts being output.
	// This happens during the first call of every cycle.
	if (x->curcall == 0) {
		// current sumbuf has been filled, so transform it
		// Output loop will begin to read it and sum it with the last one
		fftwf_execute(x->sumbuf->plan);
	}

	// we're summing and outputting the first half of the most recently IFFTed sumbuf
	// and the second half of the previous one
	sumbuf1ptr = &(x->sumbuf->td[x->sumbuf->readpos]);
	sumbuf2ptr = &(x->sumbuf->prev->td[x->sumbuf->prev->readpos]);
	for (i = 0; i < n; i++) {
		out[i] = (sumbuf1ptr[i] + sumbuf2ptr[i]) * x->scale;
	}
	x->sumbuf->readpos += n;
	x->sumbuf->prev->readpos += n;

	x->curcall++;

	return (w+5);
}

#endif // __VEC__

static void partconv_free(t_partconv *x)
{
	int i;

	fftwf_free(x->inbuf);
	for (i = 0; i < x->nparts; i++)
		fftwf_free(x->irpart_td[i]);
	fftwf_free(x->input_td);
	fftwf_destroy_plan(x->input_plan);
	for (i  = 0; i < x->nsumbufs; i++) {
		fftwf_free(x->sumbufs[i].fd);
		fftwf_destroy_plan(x->sumbufs[i].plan);
	}
}

static void partconv_set(t_partconv *x, t_symbol *s)
{
	int i;
	int j;
	t_garray *arrayobj;
	t_float *array;
	int arraysize;
	int arraypos;

	// get the array from pd
	x->arrayname = s;
	if ( ! (arrayobj = (t_garray *)pd_findbyclass(x->arrayname, garray_class))) {
		if (*x->arrayname->s_name) {
			pd_error(x, "partconv~: %s: no such array", x->arrayname->s_name);
			return;
		}
	} else if ( ! garray_getfloatarray(arrayobj, &arraysize, &array)) {
		pd_error(x, "%s: bad template", x->arrayname->s_name);
		return;
	}

	// if the IR has already been prepared, free everything first
	if (x->ir_prepared == 1) {
		partconv_free(x);
	}

	// caculate number of partitions
	x->nparts = arraysize / x->partsize;
	if (arraysize % x->partsize != 0)
		x->nparts++;
	if (x->nparts > MAXPARTS)
		x->nparts = MAXPARTS;

	// allocate, fill, pad, and transform each IR partition
	for (arraypos = 0, i = 0; i < x->nparts; i++) {
		x->irpart_td[i] = fftwf_malloc(sizeof(float) * x->paddedsize);
		x->irpart_fd[i] = (fftwf_complex *) x->irpart_td[i];
		x->irpart_plan = fftwf_plan_dft_r2c_1d(x->fftsize, x->irpart_td[i], x->irpart_fd[i], FFTW_MEASURE);
		for (j = 0; j < x->partsize && arraypos < arraysize; j++, arraypos++) {
			x->irpart_td[i][j] = array[arraypos];
		}
		for ( ; j < x->paddedsize; j++) {
			x->irpart_td[i][j] = 0;
		}
		fftwf_execute(x->irpart_plan);
		fftwf_destroy_plan(x->irpart_plan);
		// now, x->irpart[i] contains the DFT of the ith partition of the impulse response.
	}

	x->inbuf = fftwf_malloc(sizeof(float) * x->partsize);
	x->inbufpos = 0;
	
	// allocate buffer for DFT of padded input
	x->input_td = fftwf_malloc(sizeof(float) * x->paddedsize);	// float array into which input block is copied and padded
	x->input_fd = (fftwf_complex *) x->input_td;			// fftwf_complex pointer to the same array to facilitate in-place fft
	x->input_plan =  fftwf_plan_dft_r2c_1d(x->fftsize, x->input_td, x->input_fd, FFTW_MEASURE);

	// set up circular list/array of buffers for accumulating results of convolution
	x->nsumbufs = x->nparts + 2;
	x->sumbuf = &(x->sumbufs[0]);
	for (i  = 0; i < x->nsumbufs; i++) {
		x->sumbufs[i].index = i;
		x->sumbufs[i].fd = fftwf_malloc(sizeof(float) * x->paddedsize);
		memset(x->sumbufs[i].fd, 0, sizeof(float) * x->paddedsize);
		x->sumbufs[i].td = (float *) x->sumbufs[i].fd;
		x->sumbufs[i].plan = fftwf_plan_dft_c2r_1d(x->fftsize, x->sumbufs[i].fd, x->sumbufs[i].td, FFTW_MEASURE);
		x->sumbufs[i].readpos = 0;
	}
	x->sumbufs[0].next = &(x->sumbufs[1]);
	x->sumbufs[0].prev = &(x->sumbufs[x->nsumbufs - 1]);
	for (i = 1; i < x->nsumbufs; i++) {
		x->sumbufs[i].next = &(x->sumbufs[(i + 1) % x->nsumbufs]);
		x->sumbufs[i].prev = &(x->sumbufs[i - 1]);
	}

	partconv_deal_work(x);
	x->curcall = 0;
	x->curpart = 0;

	post("partconv~: using %s in %d partitions with FFT-size %d", x->arrayname->s_name, x->nparts, x->fftsize);
	x->ir_prepared = 1;
}

static void partconv_dsp(t_partconv *x, t_signal **sp)
{
	// if the ir array has not been prepared, prepare it
	if (x->ir_prepared == 0) {
		partconv_set(x, x->arrayname);
	}

	dsp_add(partconv_perform, 4, x, sp[0]->s_vec, sp[1]->s_vec, sp[0]->s_n);
}

static void *partconv_new(t_symbol *s, int argc, t_atom *argv)
{
	t_partconv *x = (t_partconv *)pd_new(partconv_class);

	outlet_new(&x->x_obj, gensym("signal"));

	if (argc != 2) {
		post("argc = %d", argc);
		error("partconv~: usage: [partconv~ <arrayname> <partsize>]\n\t- partition size must be a power of 2 >= blocksize");
		return NULL;
	}

	x->arrayname = atom_getsymbol(argv);
	x->partsize = atom_getfloatarg(1, argc, argv);
	if (x->partsize <= 0 || x->partsize != (1 << ilog2(x->partsize)))
	{
		error("partconv~: partition size not a power of 2");
		return NULL;
	}
	x->fftsize = 2 * x->partsize;
	x->scale = 1 / ((float) x->fftsize);

	// need 2*(n/2+1) float array for in-place transform, where n is fftsize.
#ifdef USE_SSE
	// for sse, make it a multiple of 8, because we pull in 4 bins at a time and don't want to get a segfault
	x->paddedsize = 2 * (x->fftsize / 2 + 4);
#else
	x->paddedsize = 2 * (x->fftsize / 2 + 1);
#endif
	x->nbins = x->fftsize / 2 + 1;
	x->ir_prepared = 0;
	x->pd_blocksize = sys_getblksize();

	return (x);
}

void partconv_tilde_setup(void)
{
	partconv_class = class_new(gensym("partconv~"), (t_newmethod)partconv_new,
			(t_method)partconv_free, sizeof(t_partconv), 0, A_GIMME, 0);
	class_addmethod(partconv_class, nullfn, gensym("signal"), 0);
	class_addmethod(partconv_class, (t_method) partconv_dsp, gensym("dsp"), 0);
	class_addmethod(partconv_class, (t_method) partconv_set, gensym("set"), A_DEFSYMBOL, 0);
}