author     Hans-Christoph Steiner <eighthave@users.sourceforge.net>    2006-02-07 18:23:33 +0000
committer  IOhannes m zmölnig <zmoelnig@iem.at>                        2015-10-14 13:36:00 +0200
commit     2a5a823d12708ca0fa5c56347593afd9ce3b9b83 (patch)
tree       eeb5906131003278b368711eb1a5238f39cd3f50 /altivec-perform.inc.c
parent     dac2e8e0cb5980d41dc3879338816d6f52a5ff8a (diff)
added Ben Saylor's pvoc~ and partconv~ from his sources so they can be added to Pd-extended
svn path=/trunk/externals/bsaylor/; revision=4566
Diffstat (limited to 'altivec-perform.inc.c')
-rw-r--r--  altivec-perform.inc.c  283
1 file changed, 283 insertions(+), 0 deletions(-)
diff --git a/altivec-perform.inc.c b/altivec-perform.inc.c
new file mode 100644
index 0000000..3ce1cf0
--- /dev/null
+++ b/altivec-perform.inc.c
@@ -0,0 +1,283 @@
+// AltiVec version by Chris Clepper
+//
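+// Overview: this perform routine does partitioned convolution.  Input samples
+// are gathered until one partition is full, transformed with FFTW, multiplied
+// bin-by-bin against each stored IR partition, and the products are summed
+// into a ring of summing buffers that are inverse-transformed and overlap-added
+// on output.  The AltiVec loop below works on four complex bins per iteration,
+// using vec_perm to split the interleaved fftwf_complex data into real and
+// imaginary vectors and to interleave the results again afterwards.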
+static t_int *partconv_perform(t_int *w)
+{
+ t_partconv *x = (t_partconv *)(w[1]);
+ t_float *in = (t_float *)(w[2]);
+ t_float *out = (t_float *)(w[3]);
+ int n = (int)(w[4]);
+ int i;
+ int j;
+ int k; // bin
+ int p; // partition
+ int endpart;
+ fftwf_complex *cursumbuf_fd;
+ float *sumbuf1ptr;
+ float *sumbuf2ptr;
+
+ union {
+ unsigned char c[16];
+ vector unsigned char v;
+ } permfill;
+
+ union {
+ float f[4];
+ vector float v;
+ } floatfill;
+
+ vector float *load_input, *load_irpart;
+ vector float store_multbuf1,store_multbuf2;
+ vector float vinput_fd0, vinput_fd4; //input vectors
+ vector float virpart_fd0, virpart_fd4; //ir partition vectors
+ vector float permtemp1357, permtemp0246;
+ vector float vzero;// vscale;
+ vector unsigned char input_0022, input_1133, perm_0246, perm_1357, perm_0123,perm_4567;
+ vector float vtemp1, vtemp2, vtemp3, vtemp4, vtemp5, vtemp6, vtemp7, vtemp8;
+
+ floatfill.f[0] = 0.f;
+ floatfill.f[1] = 0.f;
+ floatfill.f[2] = 0.f;
+ floatfill.f[3] = 0.f;
+ vzero = floatfill.v;
+
+ //store_multbuf = vzero;
+
+ floatfill.f[0] = x->scale;
+ floatfill.f[1] = x->scale;
+ floatfill.f[2] = x->scale;
+ floatfill.f[3] = x->scale;
+ //vscale = floatfill.v;
+
+ //fill the permute buffer for the first input_fd multiply
+ permfill.c[0] = 0; permfill.c[1] = 1; permfill.c[2] = 2; permfill.c[3] = 3; //first float
+ permfill.c[4] = 0; permfill.c[5] = 1; permfill.c[6] = 2; permfill.c[7] = 3; //second float
+ permfill.c[8] = 8; permfill.c[9] = 9; permfill.c[10] = 10; permfill.c[11] = 11; //third float
+ permfill.c[12] = 8; permfill.c[13] = 9; permfill.c[14] = 10; permfill.c[15] = 11; //fourth float
+
+ input_0022 = permfill.v;
+
+ permfill.c[0] = 4; permfill.c[1] = 5; permfill.c[2] = 6; permfill.c[3] = 7; //first float
+ permfill.c[4] = 4; permfill.c[5] = 5; permfill.c[6] = 6; permfill.c[7] = 7; //second float
+ permfill.c[8] = 12; permfill.c[9] = 13; permfill.c[10] = 14; permfill.c[11] = 15; //third float
+ permfill.c[12] = 12; permfill.c[13] = 13; permfill.c[14] = 14; permfill.c[15] = 15; //fourth float
+
+ input_1133 = permfill.v;
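+ // input_0022 picks floats [0,0,2,2] of its first operand, i.e. the real part
+ // of each of two interleaved complex bins duplicated into both element slots.
+ // input_1133 picks floats [1,1,3,3], the duplicated imaginary parts (it is
+ // built here, but the loop below extracts the imaginary parts via perm_1357
+ // instead).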
+
+ //perm_0246
+ //0,1,2,3, 8,9,10,11, 16,17,18,19, 24,25,26,27
+ permfill.c[0] = 0; permfill.c[1] = 1; permfill.c[2] = 2; permfill.c[3] = 3; //first float
+ permfill.c[4] = 8; permfill.c[5] = 9; permfill.c[6] = 10; permfill.c[7] = 11; //second float
+ permfill.c[8] = 16; permfill.c[9] = 17; permfill.c[10] = 18; permfill.c[11] = 19; //third float
+ permfill.c[12] = 24; permfill.c[13] = 25; permfill.c[14] = 26; permfill.c[15] = 27; //fourth float
+
+ perm_0246 = permfill.v;
+
+ // perm_1357
+ // 4,5,6,7, 12,13,14,15, 20,21,22,23, 28,29,30,31
+ permfill.c[0] = 4; permfill.c[1] = 5; permfill.c[2] = 6; permfill.c[3] = 7; //first float
+ permfill.c[4] = 12; permfill.c[5] = 13; permfill.c[6] = 14; permfill.c[7] = 15; //second float
+ permfill.c[8] = 20; permfill.c[9] = 21; permfill.c[10] = 22; permfill.c[11] = 23; //third float
+ permfill.c[12] = 28; permfill.c[13] = 29; permfill.c[14] = 30; permfill.c[15] = 31; //fourth float
+
+ perm_1357 = permfill.v;
+
+ // perm_0123 from [0,2,4,6] and [1,3,5,7]
+ // 0,1,2,3 16,17,18,19 4,5,6,7 20,21,22,23
+ permfill.c[0] = 0; permfill.c[1] = 1; permfill.c[2] = 2; permfill.c[3] = 3; //first float
+ permfill.c[4] = 16; permfill.c[5] = 17; permfill.c[6] = 18; permfill.c[7] = 19; //second float
+ permfill.c[8] = 4; permfill.c[9] = 5; permfill.c[10] = 6; permfill.c[11] = 7; //third float
+ permfill.c[12] = 20; permfill.c[13] = 21; permfill.c[14] = 22; permfill.c[15] = 23; //fourth float
+
+ perm_0123 = permfill.v;
+
+ // perm_4567 from [0,2,4,6] and [1,3,5,7]
+ // 8,9,10,11 24,25,26,27 12,13,14,15 28,29,30,31
+ permfill.c[0] = 8; permfill.c[1] = 9; permfill.c[2] = 10; permfill.c[3] = 11; //first float
+ permfill.c[4] = 24; permfill.c[5] = 25; permfill.c[6] = 26; permfill.c[7] = 27; //second float
+ permfill.c[8] = 12; permfill.c[9] = 13; permfill.c[10] = 14; permfill.c[11] = 15; //third float
+ permfill.c[12] = 28; permfill.c[13] = 29; permfill.c[14] = 30; permfill.c[15] = 31; //fourth float
+
+ perm_4567 = permfill.v;
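+ // Taken together: perm_0246 / perm_1357 pull the even (real) or odd (imaginary)
+ // floats out of a pair of vectors, deinterleaving four complex bins, while
+ // perm_0123 / perm_4567 merge a real-parts vector and an imaginary-parts vector
+ // back into interleaved (re,im) fftwf_complex order.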
+
+
+ memcpy(&(x->inbuf[x->inbufpos]), in, n*sizeof(float)); // gather a block of input into input buffer
+ x->inbufpos += n;
+ if (x->inbufpos >= x->partsize) {
+ // input buffer is full, so we begin a new cycle
+
+ if (x->pd_blocksize != n) {
+ // the patch's blocksize has changed since we last dealt out the work
+ x->pd_blocksize = n;
+ partconv_deal_work(x);
+ }
+
+ x->inbufpos = 0;
+ x->curcall = 0;
+ x->curpart = 0;
+ memcpy(x->input_td, x->inbuf, x->partsize * sizeof(float)); // copy 'gathering' input buffer into 'transform' buffer
+ memset(&(x->input_td[x->partsize]), 0, (x->paddedsize - x->partsize) * sizeof(float)); // pad
+
+ fftwf_execute(x->input_plan); // transform the input
+
+ // everything has been read out of prev sumbuf, so clear it
+ memset(x->sumbuf->prev->td, 0, x->paddedsize * sizeof(float));
+
+ // advance sumbuf pointers
+ x->sumbuf = x->sumbuf->next;
+ x->sumbuf->readpos = 0;
+ x->sumbuf->prev->readpos = x->partsize;
+ }
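+
+ // Each perform call handles only a slice of the partitions (parts_per_call,
+ // indexed by curcall), so the cost of the convolution is spread across the
+ // calls between input-buffer fills instead of hitting all at once.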
+
+ // convolve this call's portion of partitions
+ endpart = x->curpart + x->parts_per_call[x->curcall];
+ if (endpart > x->nparts) // FIXME does this ever happen?
+ endpart = x->nparts;
+ for (p = x->curpart; p < endpart; p++) {
+ //printf("convolving with partition %d\n", p);
+ //
+ // multiply the input block by the partition, accumulating the result in the appropriate sumbuf
+ //
+
+ // FIXME do this in a circular list-type fashion so we don't need "index"
+ cursumbuf_fd = x->sumbufs[(x->sumbuf->index + p) % x->nsumbufs].fd;
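+ // Partition p accumulates into the sumbuf p slots ahead of the current one in
+ // the ring, so its contribution is output p cycles later, matching the time
+ // offset of that partition of the impulse response.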
+
+ for (k = 0; k < x->nbins; k+=4) {
+
+ load_input = (vector float *)&x->input_fd[k][0];
+ vinput_fd0 = vec_ld(0, (vector float *) load_input);
+
+ vtemp1 = vec_perm(load_input[0],vzero,input_0022);
+
+ load_input = (vector float *)&x->input_fd[k][4];
+ //load input_fd[k][4]
+ //vector will have input_fd[4,5,6,7]
+ vinput_fd4 = vec_ld(0, (vector float *) &x->input_fd[k][4]);
+
+ vtemp3 = vec_perm(load_input[0],vzero,input_0022);
+
+ //vec_ld irpart[p][k][0]
+ //vector will have irpart_fd[0,1,2,3]
+
+ load_irpart = (vector float *) &x->irpart_fd[p][k][0];
+
+ virpart_fd0 = vec_ld(0,&x->irpart_fd[p][k][0]);
+ vtemp1 = vec_madd(vtemp1,load_irpart[0],vzero);
+
+ load_irpart = (vector float *) &x->irpart_fd[p][k][4];
+ virpart_fd4 = vec_ld(0,&x->irpart_fd[p][k][4]);
+ vtemp3 = vec_madd(vtemp3,load_irpart[0],vzero);
+
+
+ store_multbuf1 = vec_ld(0,&cursumbuf_fd[k][0]);
+
+ store_multbuf2 = vec_ld(0,&cursumbuf_fd[k][4]);
+
+
+ //vec_perm to line up the elements
+ // irpart is fine
+ // make vector of input_fd[0] [2] and [4] [6]
+ //make vector of input_fd[1] [3] and [5] [7]
+ //
+ // permute only works on bytes, so the first float is bytes 0,1,2,3, the second is 4,5,6,7, etc.
+ //
+ // 0,1,2,3, 8,9,10,11, 16,17,18,19, 24,25,26,27
+ //
+ // 4,5,6,7, 12,13,14,15, 20,21,22,23, 28,29,30,31
+
+
+ //vec_perm temp1 and temp3 into [0,2,4,6]
+ permtemp0246 = vec_perm(vtemp1,vtemp3,perm_0246);
+
+ //and [1,3,5,7]
+ permtemp1357 = vec_perm(vtemp1,vtemp3,perm_1357);
+
+ //vinput_fd[1,3,5,7]
+ vtemp2 = vec_perm(vinput_fd0,vinput_fd4,perm_1357);
+
+ //irpart[1,3,5,7]
+ vtemp4 = vec_perm(virpart_fd0,virpart_fd4,perm_1357);
+
+ //irpart[0,2,4,6]
+ vtemp5 = vec_perm(virpart_fd0,virpart_fd4,perm_0246);
+
+ //vec_nmsub input_fd[1,3,5,7] irpart[1,3,5,7] temp[0,2,4,6]
+ vtemp6 = vec_nmsub(vtemp2,vtemp4,permtemp0246);
+
+ //vec_madd input_fd[1,3,5,7] irpart[0,2,4,6] temp[1,3,5,7]
+ vtemp7 = vec_madd(vtemp2,vtemp5,permtemp1357);
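+
+ // Together these two ops form the complex multiply (a+ib)(c+id) = (ac-bd) + i(ad+bc)
+ // for four bins at once:
+ // vtemp6 = permtemp0246 - vtemp2*vtemp4 = input_re*ir_re - input_im*ir_im (real parts)
+ // vtemp7 = permtemp1357 + vtemp2*vtemp5 = input_re*ir_im + input_im*ir_re (imaginary parts)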
+
+ //vec_madd all by scale - this is now done after the loop
+ // vtemp6 = vec_madd(vtemp6,vscale,vzero);
+
+ // vtemp7 = vec_madd(vtemp7,vscale,vzero);
+
+
+ //vec_perm data back into place - tricky!
+
+ //vec_perm nmsub_result[0,2,4,6] madd_result [1,3,5,7]
+ // results will be [0,1,2,3] [4,5,6,7]
+ vtemp1 = vec_perm(vtemp6,vtemp7,perm_0123);
+
+ vtemp2 = vec_perm(vtemp6,vtemp7,perm_4567);
+
+
+ //vec_st
+
+ store_multbuf1 = vec_add(store_multbuf1,vtemp1);
+ store_multbuf2 = vec_add(store_multbuf2,vtemp2);
+
+ vec_st(store_multbuf1,0,&cursumbuf_fd[k][0]);
+
+ vec_st(store_multbuf2,0,&cursumbuf_fd[k][4]);
+
+
+ /*
+ cursumbuf_fd[k][0]
+ +=
+ ( x->input_fd[k][0] * x->irpart_fd[p][k][0]
+ - x->input_fd[k][1] * x->irpart_fd[p][k][1]);
+
+ cursumbuf_fd[k][1]
+ +=
+ ( x->input_fd[k][0] * x->irpart_fd[p][k][1]
+ + x->input_fd[k][1] * x->irpart_fd[p][k][0]);*/
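+ // The scalar reference above covers a single bin; the vector code handles bins
+ // k..k+3 per pass, and the scaling by x->scale is applied once in the output
+ // loop below instead of per bin here.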
+ }
+ }
+ x->curpart = p;
+
+ // The convolution of the fresh block of input with the first partition of the IR
+ // is the last thing that gets summed into the current sumbuf before it gets IFFTed and starts being output.
+ // This happens during the first call of every cycle.
+ if (x->curcall == 0) {
+ // current sumbuf has been filled, so transform it (FD to TD).
+ // Output loop will begin to read it and sum it with the last one
+ fftwf_execute(x->sumbuf->plan);
+ }
+
+ // we're summing and outputting the first half of the most recently IFFTed sumbuf
+ // and the second half of the previous one
+ sumbuf1ptr = &(x->sumbuf->td[x->sumbuf->readpos]);
+ sumbuf2ptr = &(x->sumbuf->prev->td[x->sumbuf->prev->readpos]);
+ for (i = 0; i < n; i++) {
+ *(out++) = (*(sumbuf1ptr++) + *(sumbuf2ptr++)) * x->scale;
+ }
+ x->sumbuf->readpos += n;
+ x->sumbuf->prev->readpos += n;
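+ // Each call reads n samples from both halves; by the time the input buffer is
+ // full again, the first half of the current sumbuf and the remaining half of
+ // the previous one have been consumed, and the next cycle rotates the ring.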
+
+ x->curcall++;
+
+ return (w+5);
+}