//altivec version by Chris Clepper // static t_int *partconv_perform(t_int *w) { t_partconv *x = (t_partconv *)(w[1]); t_float *in = (t_float *)(w[2]); t_float *out = (t_float *)(w[3]); int n = (int)(w[4]); int i; int j; int k; // bin int p; // partition int endpart; fftwf_complex *cursumbuf_fd; float *sumbuf1ptr; float *sumbuf2ptr; union { unsigned char c[16]; vector unsigned char v; }permfill; union { float f[4]; vector float v; }floatfill; vector float *load_input, *load_irpart; vector float store_multbuf1,store_multbuf2; vector float vinput_fd0, vinput_fd4; //input vectors vector float virpart_fd0, virpart_fd4; //ir partition vectors vector float permtemp1357, permtemp0246; vector float vzero;// vscale; vector unsigned char input_0022, input_1133, perm_0246, perm_1357, perm_0123,perm_4567; vector float vtemp1, vtemp2, vtemp3, vtemp4, vtemp5, vtemp6, vtemp7, vtemp8; floatfill.f[0] = 0.f; floatfill.f[1] = 0.f; floatfill.f[2] = 0.f; floatfill.f[3] = 0.f; vzero = floatfill.v; //store_multbuf = vzero; floatfill.f[0] = x->scale; floatfill.f[1] = x->scale; floatfill.f[2] = x->scale; floatfill.f[3] = x->scale; //vscale = floatfill.v; //fill the permute buffer for the first input_fd multiply permfill.c[0] = 0; permfill.c[1] = 1; permfill.c[2] = 2; permfill.c[3] = 3; //first float permfill.c[4] = 0; permfill.c[5] = 1; permfill.c[6] = 2; permfill.c[7] = 3; //second float permfill.c[8] = 8; permfill.c[9] = 9; permfill.c[10] = 10; permfill.c[11] = 11; //third float permfill.c[12] = 8; permfill.c[13] = 9; permfill.c[14] = 10; permfill.c[15] = 11; //fourth float input_0022 = permfill.v; permfill.c[0] = 4; permfill.c[1] = 5; permfill.c[2] = 6; permfill.c[3] = 7; //first float permfill.c[4] = 4; permfill.c[5] = 5; permfill.c[6] = 6; permfill.c[7] = 7; //second float permfill.c[8] = 12; permfill.c[9] = 13; permfill.c[10] = 14; permfill.c[11] = 15; //third float permfill.c[12] = 12; permfill.c[13] = 13; permfill.c[14] = 14; permfill.c[15] = 15; //fourth float input_1133 = permfill.v; //perm_0246 //0,1,2,3, 8,9,10,11, 16,17,18,19, 24,25,26,27 permfill.c[0] = 0; permfill.c[1] = 1; permfill.c[2] = 2; permfill.c[3] = 3; //first float permfill.c[4] = 8; permfill.c[5] = 9; permfill.c[6] = 10; permfill.c[7] = 11; //second float permfill.c[8] = 16; permfill.c[9] = 17; permfill.c[10] = 18; permfill.c[11] = 19; //third float permfill.c[12] = 24; permfill.c[13] = 25; permfill.c[14] = 26; permfill.c[15] = 27; //fourth float perm_0246 = permfill.v; // perm_1357 // 4,5,6,7, 12,13,14,15, 20,21,22,23, 28,29,30,31 permfill.c[0] = 4; permfill.c[1] = 5; permfill.c[2] = 6; permfill.c[3] = 7; //first float permfill.c[4] = 12; permfill.c[5] = 13; permfill.c[6] = 14; permfill.c[7] = 15; //second float permfill.c[8] = 20; permfill.c[9] = 21; permfill.c[10] = 22; permfill.c[11] = 23; //third float permfill.c[12] = 28; permfill.c[13] = 29; permfill.c[14] = 30; permfill.c[15] = 31; //fourth float perm_1357 = permfill.v; // perm_0123 from [0,2,4,6] and [1,3,5,7] // 0,1,2,3 16,17,18,19 4,5,6,7 20,21,22,23 permfill.c[0] = 0; permfill.c[1] = 1; permfill.c[2] = 2; permfill.c[3] = 3; //first float permfill.c[4] = 16; permfill.c[5] = 17; permfill.c[6] = 18; permfill.c[7] = 19; //second float permfill.c[8] = 4; permfill.c[9] = 5; permfill.c[10] = 6; permfill.c[11] = 7; //third float permfill.c[12] = 20; permfill.c[13] = 21; permfill.c[14] = 22; permfill.c[15] = 23; //fourth float perm_0123 = permfill.v; // perm_4567 from [0,2,4,6] and [1,3,5,7] // 8.9.10.11 24,25,26,27 12,13,14,15 28,29,30,31 permfill.c[0] = 8; permfill.c[1] = 9; permfill.c[2] = 10; permfill.c[3] = 11; //first float permfill.c[4] = 24; permfill.c[5] = 25; permfill.c[6] = 26; permfill.c[7] = 27; //second float permfill.c[8] = 12; permfill.c[9] = 13; permfill.c[10] = 14; permfill.c[11] = 15; //third float permfill.c[12] = 28; permfill.c[13] = 29; permfill.c[14] = 30; permfill.c[15] = 31; //fourth float // perm_4567 from [0,2,4,6] and [1,3,5,7] // 8.9.10.11 24,25,26,27 12,13,14,15 28,29,30,31 permfill.c[0] = 8; permfill.c[1] = 9; permfill.c[2] = 10; permfill.c[3] = 11; //first float permfill.c[4] = 24; permfill.c[5] = 25; permfill.c[6] = 26; permfill.c[7] = 27; //second float permfill.c[8] = 12; permfill.c[9] = 13; permfill.c[10] = 14; permfill.c[11] = 15; //third float permfill.c[12] = 28; permfill.c[13] = 29; permfill.c[14] = 30; permfill.c[15] = 31; //fourth float perm_4567 = permfill.v; memcpy(&(x->inbuf[x->inbufpos]), in, n*sizeof(float)); // gather a block of input into input buffer x->inbufpos += n; if (x->inbufpos >= x->partsize) { // input buffer is full, so we begin a new cycle if (x->pd_blocksize != n) { // the patch's blocksize has change since we last dealt the work x->pd_blocksize = n; partconv_deal_work(x); } x->inbufpos = 0; x->curcall = 0; x->curpart = 0; memcpy(x->input_td, x->inbuf, x->partsize * sizeof(float)); // copy 'gathering' input buffer into 'transform' buffer memset(&(x->input_td[x->partsize]), 0, (x->paddedsize - x->partsize) * sizeof(float)); // pad fftwf_execute(x->input_plan); // transform the input // everything has been read out of prev sumbuf, so clear it memset(x->sumbuf->prev->td, 0, x->paddedsize * sizeof(float)); // advance sumbuf pointers x->sumbuf = x->sumbuf->next; x->sumbuf->readpos = 0; x->sumbuf->prev->readpos = x->partsize; } // convolve this call's portion of partitions endpart = x->curpart + x->parts_per_call[x->curcall]; if (endpart > x->nparts) // FIXME does this ever happen? endpart = x->nparts; for (p = x->curpart; p < endpart; p++) { //printf("convolving with partition %d\n", p); // // multiply the input block by the partition, accumulating the result in the appropriate sumbuf // // FIXME do this in a circular list-type fashion so we don't need "index" cursumbuf_fd = x->sumbufs[(x->sumbuf->index + p) % x->nsumbufs].fd; for (k = 0; k < x->nbins; k+=4) { load_input = (vector float *)&x->input_fd[k][0]; vinput_fd0 = vec_ld(0, (vector float *) load_input); vtemp1 = vec_perm(load_input[0],vzero,input_0022); load_input = (vector float *)&x->input_fd[k][4]; //load input_fd[k][4] //vector will have input_fd[4,5,6,7] vinput_fd4 = vec_ld(0, (vector float *) &x->input_fd[k][4]); vtemp3 = vec_perm(load_input[0],vzero,input_0022); //vec_ld irpart[p][k][0] //vector will have irpart_fd[0,1,2,3] load_irpart = (vector float *) &x->irpart_fd[p][k][0]; virpart_fd0 = vec_ld(0,&x->irpart_fd[p][k][0]); vtemp1 = vec_madd(vtemp1,load_irpart[0],vzero); load_irpart = (vector float *) &x->irpart_fd[p][k][4]; virpart_fd4 = vec_ld(0,&x->irpart_fd[p][k][4]); vtemp3 = vec_madd(vtemp3,load_irpart[0],vzero); store_multbuf1 = vec_ld(0,&cursumbuf_fd[k][0]); store_multbuf2 = vec_ld(0,&cursumbuf_fd[k][4]); //vec_perm to line up the elements // irpart is fine // make vector of input_fd[0] [2] and [4] [6] //make vector of input_fd[1] [3] and [5] [7] // // permute only works on bytes so the first float is bytes 0,1,2,3 the second is 4,5,6,7 etc // // 0,1,2,3, 8,9,10,11, 16,17,18,19, 24,25,26,27 // // 4,5,6,7, 12,13,14,15, 20,21,22,23, 28,29,30,31 //vec_perm temp1 and temp3 into [0,2,4,6] permtemp0246 = vec_perm(vtemp1,vtemp3,perm_0246); //and [1,3,5,7] permtemp1357 = vec_perm(vtemp1,vtemp3,perm_1357); //vinput_fd[1,3,5,7] vtemp2 = vec_perm(vinput_fd0,vinput_fd4,perm_1357); //irpart[1,3,5,7] vtemp4 = vec_perm(virpart_fd0,virpart_fd4,perm_1357); //irpart[0,2,4,6] vtemp5 = vec_perm(virpart_fd0,virpart_fd4,perm_0246); //vec_nmsub input_fd[1,3,5,7] irpart[1,3,5,7] temp[0,2,4,6] vtemp6 = vec_nmsub(vtemp2,vtemp4,permtemp0246); //vec_madd input_fd[1,3,5,7] irpart[0,2,4,6] temp[1,3,5,7] vtemp7 = vec_madd(vtemp2,vtemp5,permtemp1357); //vec_madd all by scale - this is now done after the loop // vtemp6 = vec_madd(vtemp6,vscale,vzero); // vtemp7 = vec_madd(vtemp7,vscale,vzero); //vec_perm data back into place - tricky! //vec_perm nmsub_result[0,2,4,6] madd_result [1,3,5,7] // results will be [0,1,2,3] [4,5,6,7] vtemp1 = vec_perm(vtemp6,vtemp7,perm_0123); vtemp2 = vec_perm(vtemp6,vtemp7,perm_4567); //vec_st store_multbuf1 = vec_add(store_multbuf1,vtemp1); store_multbuf2 = vec_add(store_multbuf2,vtemp2); vec_st(store_multbuf1,0,&cursumbuf_fd[k][0]); vec_st(store_multbuf2,0,&cursumbuf_fd[k][4]); /* cursumbuf_fd[k][0] += ( x->input_fd[k][0] * x->irpart_fd[p][k][0] - x->input_fd[k][1] * x->irpart_fd[p][k][1]); cursumbuf_fd[k][1] += ( x->input_fd[k][0] * x->irpart_fd[p][k][1] + x->input_fd[k][1] * x->irpart_fd[p][k][0]);*/ } } x->curpart = p; // The convolution of the fresh block of input with the first partition of the IR // is the last thing that gets summed into the current sumbuf before it gets IFFTed and starts being output. // This happens during the first call of every cycle. if (x->curcall == 0) { // current sumbuf has been filled, so transform it (TD to FD). // Output loop will begin to read it and sum it with the last one fftwf_execute(x->sumbuf->plan); } // we're summing and outputting the first half of the most recently IFFTed sumbuf // and the second half of the previous one sumbuf1ptr = &(x->sumbuf->td[x->sumbuf->readpos]); sumbuf2ptr = &(x->sumbuf->prev->td[x->sumbuf->prev->readpos]); for (i = 0; i < n; i++) { *(out++) = (*(sumbuf1ptr++) + *(sumbuf2ptr++)) * x->scale; } x->sumbuf->readpos += n; x->sumbuf->prev->readpos += n; x->curcall++; return (w+5); }