From 2a5a823d12708ca0fa5c56347593afd9ce3b9b83 Mon Sep 17 00:00:00 2001
From: Hans-Christoph Steiner <eighthave@users.sourceforge.net>
Date: Tue, 7 Feb 2006 18:23:33 +0000
Subject: added Ben Saylor's pvoc~ and partconv~ from his sources so they can
 be added to Pd-extended

svn path=/trunk/externals/bsaylor/; revision=4566
---
 altivec-perform.inc.c | 283 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 283 insertions(+)
 create mode 100644 altivec-perform.inc.c

(limited to 'altivec-perform.inc.c')

diff --git a/altivec-perform.inc.c b/altivec-perform.inc.c
new file mode 100644
index 0000000..3ce1cf0
--- /dev/null
+++ b/altivec-perform.inc.c
@@ -0,0 +1,283 @@
+//altivec version by Chris Clepper
+// partconv_perform: gathers n input samples, multiply-accumulates the input spectrum with this call's IR partitions, writes n output samples
+static t_int *partconv_perform(t_int *w)
+{
+    t_partconv *x = (t_partconv *)(w[1]);  // w[1]: object state
+    t_float *in = (t_float *)(w[2]);  // w[2]: input block, n samples are copied from it
+    t_float *out = (t_float *)(w[3]);  // w[3]: output block, n samples are written to it
+    int n = (int)(w[4]);  // w[4]: block size in samples
+    int i;  // output sample counter
+    int j;  // NOTE(review): never used in this function
+    int k;	// bin
+    int p;	// partition
+    int endpart;  // one past the last partition convolved in this call
+    fftwf_complex *cursumbuf_fd;  // frequency-domain accumulator that partition p sums into
+    float *sumbuf1ptr;  // read cursor into the current sumbuf's td data
+    float *sumbuf2ptr;  // read cursor into the previous sumbuf's td data
+    // the two unions below let vector constants be built one element at a time
+    union {
+        unsigned char c[16];
+        vector unsigned char v;
+    }permfill;
+
+    union {
+        float f[4];
+        vector float v;
+    }floatfill;
+
+    vector float *load_input, *load_irpart;
+    vector float store_multbuf1,store_multbuf2;
+    vector float vinput_fd0, vinput_fd4; //input vectors
+    vector float virpart_fd0, virpart_fd4;  //ir partition vectors
+    vector float permtemp1357, permtemp0246;
+    vector float vzero;// vscale;
+    vector unsigned char input_0022, input_1133, perm_0246, perm_1357, perm_0123,perm_4567;
+    vector float vtemp1, vtemp2, vtemp3, vtemp4, vtemp5, vtemp6, vtemp7, vtemp8;  // NOTE(review): vtemp8 is never used
+    // vzero: addend for the plain multiplies done with vec_madd; also the (never selected) second source of the input_0022 permutes
+    floatfill.f[0] = 0.f;
+    floatfill.f[1] = 0.f;
+    floatfill.f[2] = 0.f;
+    floatfill.f[3] = 0.f;
+    vzero = floatfill.v;
+
+    //store_multbuf = vzero;
+    // NOTE(review): the scale values below are stored but never read here; vscale is commented out
+    floatfill.f[0] = x->scale;
+    floatfill.f[1] = x->scale;
+    floatfill.f[2] = x->scale;
+    floatfill.f[3] = x->scale;
+    //vscale = floatfill.v;
+
+    //fill the permute buffer for the first input_fd multiply: selects floats [0,0,2,2] of the first source
+    permfill.c[0] = 0; permfill.c[1] = 1; permfill.c[2] = 2; permfill.c[3] = 3; //first float
+    permfill.c[4] = 0; permfill.c[5] = 1; permfill.c[6] = 2; permfill.c[7] = 3; //second float
+    permfill.c[8] = 8; permfill.c[9] = 9; permfill.c[10] = 10; permfill.c[11] = 11; //third float
+    permfill.c[12] = 8; permfill.c[13] = 9; permfill.c[14] = 10; permfill.c[15] = 11; //fourth float
+
+    input_0022 = permfill.v;
+    // input_1133 selects floats [1,1,3,3]; NOTE(review): it is assigned but never used below
+    permfill.c[0] = 4; permfill.c[1] = 5; permfill.c[2] = 6; permfill.c[3] = 7; //first float
+    permfill.c[4] = 4; permfill.c[5] = 5; permfill.c[6] = 6; permfill.c[7] = 7; //second float
+    permfill.c[8] = 12; permfill.c[9] = 13; permfill.c[10] = 14; permfill.c[11] = 15; //third float
+    permfill.c[12] = 12; permfill.c[13] = 13; permfill.c[14] = 14; permfill.c[15] = 15; //fourth float
+
+    input_1133 = permfill.v;
+    // perm_0246 gathers the even-numbered floats of the two sources taken together as eight floats
+    //perm_0246
+    //0,1,2,3,        8,9,10,11,          16,17,18,19,          24,25,26,27
+    permfill.c[0] = 0; permfill.c[1] = 1; permfill.c[2] = 2; permfill.c[3] = 3; //first float
+    permfill.c[4] = 8; permfill.c[5] = 9; permfill.c[6] = 10; permfill.c[7] = 11; //second float
+    permfill.c[8] = 16; permfill.c[9] = 17; permfill.c[10] = 18; permfill.c[11] = 19; //third float
+    permfill.c[12] = 24; permfill.c[13] = 25; permfill.c[14] = 26; permfill.c[15] = 27; //fourth float
+
+    perm_0246 = permfill.v;
+    // perm_1357 gathers the odd-numbered floats of the two sources taken together as eight floats
+    // perm_1357
+    //         4,5,6,7,         12,13,14,15,           20,21,22,23,         28,29,30,31
+    permfill.c[0] = 4; permfill.c[1] = 5; permfill.c[2] = 6; permfill.c[3] = 7; //first float
+    permfill.c[4] = 12; permfill.c[5] = 13; permfill.c[6] = 14; permfill.c[7] = 15; //second float
+    permfill.c[8] = 20; permfill.c[9] = 21; permfill.c[10] = 22; permfill.c[11] = 23; //third float
+    permfill.c[12] = 28; permfill.c[13] = 29; permfill.c[14] = 30; permfill.c[15] = 31; //fourth float
+
+    perm_1357 = permfill.v;
+    // perm_0123 interleaves floats 0,1 of the first source with floats 0,1 of the second source
+    // perm_0123  from [0,2,4,6] and [1,3,5,7]
+    //         0,1,2,3	16,17,18,19	4,5,6,7	20,21,22,23
+    permfill.c[0] = 0; permfill.c[1] = 1; permfill.c[2] = 2; permfill.c[3] = 3; //first float
+    permfill.c[4] = 16; permfill.c[5] = 17; permfill.c[6] = 18; permfill.c[7] = 19; //second float
+    permfill.c[8] = 4; permfill.c[9] = 5; permfill.c[10] = 6; permfill.c[11] = 7; //third float
+    permfill.c[12] = 20; permfill.c[13] = 21; permfill.c[14] = 22; permfill.c[15] = 23; //fourth float
+
+    perm_0123 = permfill.v;
+    // perm_4567 interleaves floats 2,3 of the first source with floats 2,3 of the second source
+    // perm_4567  from [0,2,4,6] and [1,3,5,7]
+    //        8.9.10.11      24,25,26,27              12,13,14,15         28,29,30,31
+    permfill.c[0] = 8; permfill.c[1] = 9; permfill.c[2] = 10; permfill.c[3] = 11; //first float
+    permfill.c[4] = 24; permfill.c[5] = 25; permfill.c[6] = 26; permfill.c[7] = 27; //second float
+    permfill.c[8] = 12; permfill.c[9] = 13; permfill.c[10] = 14; permfill.c[11] = 15; //third float
+    permfill.c[12] = 28; permfill.c[13] = 29; permfill.c[14] = 30; permfill.c[15] = 31; //fourth float
+    // NOTE(review): the four assignments below repeat the fill just above verbatim (redundant, same values)
+    // perm_4567  from [0,2,4,6] and [1,3,5,7]
+    //        8.9.10.11      24,25,26,27              12,13,14,15         28,29,30,31
+    permfill.c[0] = 8; permfill.c[1] = 9; permfill.c[2] = 10; permfill.c[3] = 11; //first float
+    permfill.c[4] = 24; permfill.c[5] = 25; permfill.c[6] = 26; permfill.c[7] = 27; //second float
+    permfill.c[8] = 12; permfill.c[9] = 13; permfill.c[10] = 14; permfill.c[11] = 15; //third float
+    permfill.c[12] = 28; permfill.c[13] = 29; permfill.c[14] = 30; permfill.c[15] = 31; //fourth float
+
+    perm_4567 = permfill.v;
+    // NOTE(review): no bounds check here - assumes inbufpos + n never exceeds the capacity of inbuf; confirm against the allocation
+
+    memcpy(&(x->inbuf[x->inbufpos]), in, n*sizeof(float));  // gather a block of input into input buffer
+    x->inbufpos += n;
+    if (x->inbufpos >= x->partsize) {
+        // input buffer is full, so we begin a new cycle
+
+        if (x->pd_blocksize != n) {
+            // the patch's blocksize has changed since we last dealt the work
+            x->pd_blocksize = n;
+            partconv_deal_work(x);
+        }
+
+        x->inbufpos = 0;
+        x->curcall = 0;
+        x->curpart = 0;
+        memcpy(x->input_td, x->inbuf, x->partsize * sizeof(float));  // copy 'gathering' input buffer into 'transform' buffer
+        memset(&(x->input_td[x->partsize]), 0, (x->paddedsize - x->partsize) * sizeof(float));  // pad
+
+        fftwf_execute(x->input_plan);  // transform the input
+
+        // everything has been read out of prev sumbuf, so clear it
+        memset(x->sumbuf->prev->td, 0,  x->paddedsize * sizeof(float));
+
+        // advance sumbuf pointers
+        x->sumbuf = x->sumbuf->next;
+        x->sumbuf->readpos = 0;
+        x->sumbuf->prev->readpos = x->partsize;
+    }
+
+    // convolve this call's portion of partitions
+    endpart = x->curpart + x->parts_per_call[x->curcall];
+    if (endpart > x->nparts)  // FIXME does this ever happen?
+        endpart = x->nparts;
+    for (p = x->curpart; p < endpart; p++) {
+        //printf("convolving with partition %d\n", p);
+        //
+        // multiply the input block by the partition, accumulating the result in the appropriate sumbuf
+        //
+
+        // FIXME do this in a circular list-type fashion so we don't need "index"
+        cursumbuf_fd =  x->sumbufs[(x->sumbuf->index + p) % x->nsumbufs].fd;
+
+        for (k = 0; k < x->nbins; k+=4) {  // each pass handles eight floats, i.e. bins k..k+3 assuming fftwf_complex is float[2]
+            // NOTE(review): if nbins is not a multiple of 4 the last pass touches bins past nbins - confirm the buffers are padded
+            // NOTE(review): vec_ld/vec_st expect 16-byte aligned addresses - presumably the fd buffers are so aligned; confirm
+            // first vector: floats 0..3 counted from bin k; it is loaded twice, raw into vinput_fd0 and permuted into vtemp1
+            // vtemp1 = floats [0,0,2,2] of that vector; vzero is never selected because every input_0022 index is below 16
+            load_input = (vector float *)&x->input_fd[k][0];
+            vinput_fd0 = vec_ld(0, (vector float *) load_input);
+
+            vtemp1 = vec_perm(load_input[0],vzero,input_0022);
+
+            load_input = (vector float *)&x->input_fd[k][4];
+            //load input_fd[k][4]
+            //vector will have input_fd[4,5,6,7]
+            vinput_fd4 = vec_ld(0, (vector float *) &x->input_fd[k][4]);
+
+            vtemp3 = vec_perm(load_input[0],vzero,input_0022);
+
+            //vec_ld irpart[p][k][0]
+            //vector will have irpart_fd[0,1,2,3]
+
+            load_irpart = (vector float *) &x->irpart_fd[p][k][0];
+
+            virpart_fd0 = vec_ld(0,&x->irpart_fd[p][k][0]);
+            vtemp1 = vec_madd(vtemp1,load_irpart[0],vzero);  // input[0,0,2,2] * irpart[0,1,2,3] (+ 0)
+
+            load_irpart = (vector float *) &x->irpart_fd[p][k][4];
+            virpart_fd4 = vec_ld(0,&x->irpart_fd[p][k][4]);
+            vtemp3 = vec_madd(vtemp3,load_irpart[0],vzero);  // same product for floats 4..7
+
+
+            store_multbuf1 = vec_ld(0,&cursumbuf_fd[k][0]);  // current accumulator contents, floats 0..3
+
+            store_multbuf2 = vec_ld(0,&cursumbuf_fd[k][4]);  // current accumulator contents, floats 4..7
+
+
+            //vec_perm to line up the elements
+            // irpart is fine
+            // make vector of input_fd[0] [2] and [4] [6]
+            //make vector of input_fd[1] [3] and [5] [7]
+            //
+            // permute only works on bytes so the first float is bytes 0,1,2,3 the second is 4,5,6,7 etc
+            //
+            // 0,1,2,3,        8,9,10,11,          16,17,18,19,          24,25,26,27
+            //
+            //         4,5,6,7,         12,13,14,15,           20,21,22,23,         28,29,30,31
+
+
+            //vec_perm temp1 and temp3 into [0,2,4,6]
+            permtemp0246 = vec_perm(vtemp1,vtemp3,perm_0246);
+
+            //and [1,3,5,7]
+            permtemp1357 = vec_perm(vtemp1,vtemp3,perm_1357);
+
+            //vinput_fd[1,3,5,7]
+            vtemp2 = vec_perm(vinput_fd0,vinput_fd4,perm_1357);
+
+            //irpart[1,3,5,7]
+            vtemp4 = vec_perm(virpart_fd0,virpart_fd4,perm_1357);
+
+            //irpart[0,2,4,6]
+            vtemp5 = vec_perm(virpart_fd0,virpart_fd4,perm_0246);
+
+            //vec_nmsub  input_fd[1,3,5,7]  irpart[1,3,5,7] temp[0,2,4,6]
+            vtemp6 = vec_nmsub(vtemp2,vtemp4,permtemp0246);  // temp[0,2,4,6] - input[1,3,5,7]*irpart[1,3,5,7]: the [k][0] terms
+
+            //vec_madd  input_fd[1,3,5,7] irpart[0,2,4,6] temp[1,3,5,7]
+            vtemp7 = vec_madd(vtemp2,vtemp5,permtemp1357);  // input[1,3,5,7]*irpart[0,2,4,6] + temp[1,3,5,7]: the [k][1] terms
+
+
+            
+
+            //vec_madd  all by scale - this is now done after the loop
+          //  vtemp6 = vec_madd(vtemp6,vscale,vzero);
+
+           // vtemp7 = vec_madd(vtemp7,vscale,vzero);
+
+
+            //vec_perm data back into place - tricky!
+
+            //vec_perm nmsub_result[0,2,4,6] madd_result [1,3,5,7]
+            // results will be [0,1,2,3] [4,5,6,7]
+            vtemp1 = vec_perm(vtemp6,vtemp7,perm_0123);
+
+            vtemp2 = vec_perm(vtemp6,vtemp7,perm_4567);
+
+
+            //vec_st
+            
+            store_multbuf1 = vec_add(store_multbuf1,vtemp1);
+            store_multbuf2 = vec_add(store_multbuf2,vtemp2);
+
+            vec_st(store_multbuf1,0,&cursumbuf_fd[k][0]);
+            
+            vec_st(store_multbuf2,0,&cursumbuf_fd[k][4]);
+            
+            // scalar form of the same multiply-accumulate, written out for a single bin k:
+            /*
+            cursumbuf_fd[k][0]
+            +=
+            (  x->input_fd[k][0] * x->irpart_fd[p][k][0]
+               - x->input_fd[k][1] * x->irpart_fd[p][k][1]);
+
+            cursumbuf_fd[k][1]
+                +=
+                (  x->input_fd[k][0] * x->irpart_fd[p][k][1]
+                   + x->input_fd[k][1] * x->irpart_fd[p][k][0]);*/
+        }
+    }
+    x->curpart = p;  // the next call resumes with this partition
+
+    // The convolution of the fresh block of input with the first partition of the IR
+    // is the last thing that gets summed into the current sumbuf before it gets IFFTed and starts being output.
+    // This happens during the first call of every cycle.
+    if (x->curcall == 0) {
+        // current sumbuf has been filled, so transform it (presumably FD to TD, since ->td is read right below; the plan is set up elsewhere).
+        // Output loop will begin to read it and sum it with the last one
+        fftwf_execute(x->sumbuf->plan);
+    }
+
+    // we're summing and outputting the first half of the most recently IFFTed sumbuf
+    // and the second half of the previous one
+    sumbuf1ptr = &(x->sumbuf->td[x->sumbuf->readpos]);
+    sumbuf2ptr = &(x->sumbuf->prev->td[x->sumbuf->prev->readpos]);
+    for (i = 0; i < n; i++) {
+        *(out++) = (*(sumbuf1ptr++) + *(sumbuf2ptr++)) * x->scale;  // scaling is applied here, once per output sample
+    }
+    x->sumbuf->readpos += n;
+    x->sumbuf->prev->readpos += n;
+
+    x->curcall++;
+
+    return (w+5);  // w[1]..w[4] were consumed above
+}
-- 
cgit v1.2.1