From 2a5a823d12708ca0fa5c56347593afd9ce3b9b83 Mon Sep 17 00:00:00 2001 From: Hans-Christoph Steiner Date: Tue, 7 Feb 2006 18:23:33 +0000 Subject: added Ben Saylor's pvoc~ and partconv~ from his sources so they can be added to Pd-extended svn path=/trunk/externals/bsaylor/; revision=4566 --- altivec-perform.inc.c | 283 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 283 insertions(+) create mode 100644 altivec-perform.inc.c (limited to 'altivec-perform.inc.c') diff --git a/altivec-perform.inc.c b/altivec-perform.inc.c new file mode 100644 index 0000000..3ce1cf0 --- /dev/null +++ b/altivec-perform.inc.c @@ -0,0 +1,283 @@ +//altivec version by Chris Clepper +// +static t_int *partconv_perform(t_int *w) +{ + t_partconv *x = (t_partconv *)(w[1]); + t_float *in = (t_float *)(w[2]); + t_float *out = (t_float *)(w[3]); + int n = (int)(w[4]); + int i; + int j; + int k; // bin + int p; // partition + int endpart; + fftwf_complex *cursumbuf_fd; + float *sumbuf1ptr; + float *sumbuf2ptr; + + union { + unsigned char c[16]; + vector unsigned char v; + }permfill; + + union { + float f[4]; + vector float v; + }floatfill; + + vector float *load_input, *load_irpart; + vector float store_multbuf1,store_multbuf2; + vector float vinput_fd0, vinput_fd4; //input vectors + vector float virpart_fd0, virpart_fd4; //ir partition vectors + vector float permtemp1357, permtemp0246; + vector float vzero;// vscale; + vector unsigned char input_0022, input_1133, perm_0246, perm_1357, perm_0123,perm_4567; + vector float vtemp1, vtemp2, vtemp3, vtemp4, vtemp5, vtemp6, vtemp7, vtemp8; + + floatfill.f[0] = 0.f; + floatfill.f[1] = 0.f; + floatfill.f[2] = 0.f; + floatfill.f[3] = 0.f; + vzero = floatfill.v; + + //store_multbuf = vzero; + + floatfill.f[0] = x->scale; + floatfill.f[1] = x->scale; + floatfill.f[2] = x->scale; + floatfill.f[3] = x->scale; + //vscale = floatfill.v; + + //fill the permute buffer for the first input_fd multiply + permfill.c[0] = 0; permfill.c[1] = 1; permfill.c[2] = 2; permfill.c[3] = 3; //first float + permfill.c[4] = 0; permfill.c[5] = 1; permfill.c[6] = 2; permfill.c[7] = 3; //second float + permfill.c[8] = 8; permfill.c[9] = 9; permfill.c[10] = 10; permfill.c[11] = 11; //third float + permfill.c[12] = 8; permfill.c[13] = 9; permfill.c[14] = 10; permfill.c[15] = 11; //fourth float + + input_0022 = permfill.v; + + permfill.c[0] = 4; permfill.c[1] = 5; permfill.c[2] = 6; permfill.c[3] = 7; //first float + permfill.c[4] = 4; permfill.c[5] = 5; permfill.c[6] = 6; permfill.c[7] = 7; //second float + permfill.c[8] = 12; permfill.c[9] = 13; permfill.c[10] = 14; permfill.c[11] = 15; //third float + permfill.c[12] = 12; permfill.c[13] = 13; permfill.c[14] = 14; permfill.c[15] = 15; //fourth float + + input_1133 = permfill.v; + + //perm_0246 + //0,1,2,3, 8,9,10,11, 16,17,18,19, 24,25,26,27 + permfill.c[0] = 0; permfill.c[1] = 1; permfill.c[2] = 2; permfill.c[3] = 3; //first float + permfill.c[4] = 8; permfill.c[5] = 9; permfill.c[6] = 10; permfill.c[7] = 11; //second float + permfill.c[8] = 16; permfill.c[9] = 17; permfill.c[10] = 18; permfill.c[11] = 19; //third float + permfill.c[12] = 24; permfill.c[13] = 25; permfill.c[14] = 26; permfill.c[15] = 27; //fourth float + + perm_0246 = permfill.v; + + // perm_1357 + // 4,5,6,7, 12,13,14,15, 20,21,22,23, 28,29,30,31 + permfill.c[0] = 4; permfill.c[1] = 5; permfill.c[2] = 6; permfill.c[3] = 7; //first float + permfill.c[4] = 12; permfill.c[5] = 13; permfill.c[6] = 14; permfill.c[7] = 15; //second float + permfill.c[8] = 20; permfill.c[9] = 21; permfill.c[10] = 22; permfill.c[11] = 23; //third float + permfill.c[12] = 28; permfill.c[13] = 29; permfill.c[14] = 30; permfill.c[15] = 31; //fourth float + + perm_1357 = permfill.v; + + // perm_0123 from [0,2,4,6] and [1,3,5,7] + // 0,1,2,3 16,17,18,19 4,5,6,7 20,21,22,23 + permfill.c[0] = 0; permfill.c[1] = 1; permfill.c[2] = 2; permfill.c[3] = 3; //first float + permfill.c[4] = 16; permfill.c[5] = 17; permfill.c[6] = 18; permfill.c[7] = 19; //second float + permfill.c[8] = 4; permfill.c[9] = 5; permfill.c[10] = 6; permfill.c[11] = 7; //third float + permfill.c[12] = 20; permfill.c[13] = 21; permfill.c[14] = 22; permfill.c[15] = 23; //fourth float + + perm_0123 = permfill.v; + + // perm_4567 from [0,2,4,6] and [1,3,5,7] + // 8.9.10.11 24,25,26,27 12,13,14,15 28,29,30,31 + permfill.c[0] = 8; permfill.c[1] = 9; permfill.c[2] = 10; permfill.c[3] = 11; //first float + permfill.c[4] = 24; permfill.c[5] = 25; permfill.c[6] = 26; permfill.c[7] = 27; //second float + permfill.c[8] = 12; permfill.c[9] = 13; permfill.c[10] = 14; permfill.c[11] = 15; //third float + permfill.c[12] = 28; permfill.c[13] = 29; permfill.c[14] = 30; permfill.c[15] = 31; //fourth float + + // perm_4567 from [0,2,4,6] and [1,3,5,7] + // 8.9.10.11 24,25,26,27 12,13,14,15 28,29,30,31 + permfill.c[0] = 8; permfill.c[1] = 9; permfill.c[2] = 10; permfill.c[3] = 11; //first float + permfill.c[4] = 24; permfill.c[5] = 25; permfill.c[6] = 26; permfill.c[7] = 27; //second float + permfill.c[8] = 12; permfill.c[9] = 13; permfill.c[10] = 14; permfill.c[11] = 15; //third float + permfill.c[12] = 28; permfill.c[13] = 29; permfill.c[14] = 30; permfill.c[15] = 31; //fourth float + + perm_4567 = permfill.v; + + + memcpy(&(x->inbuf[x->inbufpos]), in, n*sizeof(float)); // gather a block of input into input buffer + x->inbufpos += n; + if (x->inbufpos >= x->partsize) { + // input buffer is full, so we begin a new cycle + + if (x->pd_blocksize != n) { + // the patch's blocksize has change since we last dealt the work + x->pd_blocksize = n; + partconv_deal_work(x); + } + + x->inbufpos = 0; + x->curcall = 0; + x->curpart = 0; + memcpy(x->input_td, x->inbuf, x->partsize * sizeof(float)); // copy 'gathering' input buffer into 'transform' buffer + memset(&(x->input_td[x->partsize]), 0, (x->paddedsize - x->partsize) * sizeof(float)); // pad + + fftwf_execute(x->input_plan); // transform the input + + // everything has been read out of prev sumbuf, so clear it + memset(x->sumbuf->prev->td, 0, x->paddedsize * sizeof(float)); + + // advance sumbuf pointers + x->sumbuf = x->sumbuf->next; + x->sumbuf->readpos = 0; + x->sumbuf->prev->readpos = x->partsize; + } + + // convolve this call's portion of partitions + endpart = x->curpart + x->parts_per_call[x->curcall]; + if (endpart > x->nparts) // FIXME does this ever happen? + endpart = x->nparts; + for (p = x->curpart; p < endpart; p++) { + //printf("convolving with partition %d\n", p); + // + // multiply the input block by the partition, accumulating the result in the appropriate sumbuf + // + + // FIXME do this in a circular list-type fashion so we don't need "index" + cursumbuf_fd = x->sumbufs[(x->sumbuf->index + p) % x->nsumbufs].fd; + + for (k = 0; k < x->nbins; k+=4) { + + + + + load_input = (vector float *)&x->input_fd[k][0]; + vinput_fd0 = vec_ld(0, (vector float *) load_input); + + vtemp1 = vec_perm(load_input[0],vzero,input_0022); + + load_input = (vector float *)&x->input_fd[k][4]; + //load input_fd[k][4] + //vector will have input_fd[4,5,6,7] + vinput_fd4 = vec_ld(0, (vector float *) &x->input_fd[k][4]); + + vtemp3 = vec_perm(load_input[0],vzero,input_0022); + + //vec_ld irpart[p][k][0] + //vector will have irpart_fd[0,1,2,3] + + load_irpart = (vector float *) &x->irpart_fd[p][k][0]; + + virpart_fd0 = vec_ld(0,&x->irpart_fd[p][k][0]); + vtemp1 = vec_madd(vtemp1,load_irpart[0],vzero); + + load_irpart = (vector float *) &x->irpart_fd[p][k][4]; + virpart_fd4 = vec_ld(0,&x->irpart_fd[p][k][4]); + vtemp3 = vec_madd(vtemp3,load_irpart[0],vzero); + + + store_multbuf1 = vec_ld(0,&cursumbuf_fd[k][0]); + + store_multbuf2 = vec_ld(0,&cursumbuf_fd[k][4]); + + + //vec_perm to line up the elements + // irpart is fine + // make vector of input_fd[0] [2] and [4] [6] + //make vector of input_fd[1] [3] and [5] [7] + // + // permute only works on bytes so the first float is bytes 0,1,2,3 the second is 4,5,6,7 etc + // + // 0,1,2,3, 8,9,10,11, 16,17,18,19, 24,25,26,27 + // + // 4,5,6,7, 12,13,14,15, 20,21,22,23, 28,29,30,31 + + + //vec_perm temp1 and temp3 into [0,2,4,6] + permtemp0246 = vec_perm(vtemp1,vtemp3,perm_0246); + + //and [1,3,5,7] + permtemp1357 = vec_perm(vtemp1,vtemp3,perm_1357); + + //vinput_fd[1,3,5,7] + vtemp2 = vec_perm(vinput_fd0,vinput_fd4,perm_1357); + + //irpart[1,3,5,7] + vtemp4 = vec_perm(virpart_fd0,virpart_fd4,perm_1357); + + //irpart[0,2,4,6] + vtemp5 = vec_perm(virpart_fd0,virpart_fd4,perm_0246); + + //vec_nmsub input_fd[1,3,5,7] irpart[1,3,5,7] temp[0,2,4,6] + vtemp6 = vec_nmsub(vtemp2,vtemp4,permtemp0246); + + //vec_madd input_fd[1,3,5,7] irpart[0,2,4,6] temp[1,3,5,7] + vtemp7 = vec_madd(vtemp2,vtemp5,permtemp1357); + + + + + //vec_madd all by scale - this is now done after the loop + // vtemp6 = vec_madd(vtemp6,vscale,vzero); + + // vtemp7 = vec_madd(vtemp7,vscale,vzero); + + + //vec_perm data back into place - tricky! + + //vec_perm nmsub_result[0,2,4,6] madd_result [1,3,5,7] + // results will be [0,1,2,3] [4,5,6,7] + vtemp1 = vec_perm(vtemp6,vtemp7,perm_0123); + + vtemp2 = vec_perm(vtemp6,vtemp7,perm_4567); + + + //vec_st + + store_multbuf1 = vec_add(store_multbuf1,vtemp1); + store_multbuf2 = vec_add(store_multbuf2,vtemp2); + + vec_st(store_multbuf1,0,&cursumbuf_fd[k][0]); + + vec_st(store_multbuf2,0,&cursumbuf_fd[k][4]); + + + /* + cursumbuf_fd[k][0] + += + ( x->input_fd[k][0] * x->irpart_fd[p][k][0] + - x->input_fd[k][1] * x->irpart_fd[p][k][1]); + + cursumbuf_fd[k][1] + += + ( x->input_fd[k][0] * x->irpart_fd[p][k][1] + + x->input_fd[k][1] * x->irpart_fd[p][k][0]);*/ + } + } + x->curpart = p; + + // The convolution of the fresh block of input with the first partition of the IR + // is the last thing that gets summed into the current sumbuf before it gets IFFTed and starts being output. + // This happens during the first call of every cycle. + if (x->curcall == 0) { + // current sumbuf has been filled, so transform it (TD to FD). + // Output loop will begin to read it and sum it with the last one + fftwf_execute(x->sumbuf->plan); + } + + // we're summing and outputting the first half of the most recently IFFTed sumbuf + // and the second half of the previous one + sumbuf1ptr = &(x->sumbuf->td[x->sumbuf->readpos]); + sumbuf2ptr = &(x->sumbuf->prev->td[x->sumbuf->prev->readpos]); + for (i = 0; i < n; i++) { + *(out++) = (*(sumbuf1ptr++) + *(sumbuf2ptr++)) * x->scale; + } + x->sumbuf->readpos += n; + x->sumbuf->prev->readpos += n; + + x->curcall++; + + return (w+5); +} -- cgit v1.2.1