| author    | Thomas Grill <xovo@users.sourceforge.net> | 2005-01-27 04:57:32 +0000 |
|-----------|---|---|
| committer | Thomas Grill <xovo@users.sourceforge.net> | 2005-01-27 04:57:32 +0000 |
| commit    | 3e86a4527748e5de5c091ffcb4ae5ac2dd8abd8a (patch) | |
| tree      | 915cbff77a3b863a1fe6bfe0c44c6597c62f7d85 /externals/grill/flext/source | |
| parent    | 22e2955b51ce16f5a550ef845f54241f7ec4d585 (diff) | |
Fixes for Mac
fixed autoconf files
updates for batch mode
some more SIMD optimized functions
svn path=/trunk/; revision=2543
Diffstat (limited to 'externals/grill/flext/source')
-rwxr-xr-x | externals/grill/flext/source/flsimd.cpp | 454 |
1 file changed, 390 insertions(+), 64 deletions(-)
```diff
diff --git a/externals/grill/flext/source/flsimd.cpp b/externals/grill/flext/source/flsimd.cpp
index 7b35903b..fdb9c2d2 100755
--- a/externals/grill/flext/source/flsimd.cpp
+++ b/externals/grill/flext/source/flsimd.cpp
@@ -246,6 +246,36 @@ notamd:
     return feature;
 }
 
+
+inline bool IsVectorAligned(const void *where)
+{
+    return (reinterpret_cast<size_t>(where)&(__alignof(__m128)-1)) == 0;
+}
+
+inline bool VectorsAligned(const void *v1,const void *v2)
+{
+    return (
+        (reinterpret_cast<size_t>(v1)|reinterpret_cast<size_t>(v2))
+        &(__alignof(__m128)-1)
+    ) == 0;
+}
+
+inline bool VectorsAligned(const void *v1,const void *v2,const void *v3)
+{
+    return (
+        (reinterpret_cast<size_t>(v1)|reinterpret_cast<size_t>(v2)|reinterpret_cast<size_t>(v3))
+        &(__alignof(__m128)-1)
+    ) == 0;
+}
+
+inline bool VectorsAligned(const void *v1,const void *v2,const void *v3,const void *v4)
+{
+    return (
+        (reinterpret_cast<size_t>(v1)|reinterpret_cast<size_t>(v2)|reinterpret_cast<size_t>(v3)|reinterpret_cast<size_t>(v4))
+        &(__alignof(__m128)-1)
+    ) == 0;
+}
+
 #else    // not MSVC
 
 static int _cpuid (_p_info *pinfo)
@@ -347,6 +377,30 @@ inline bool IsVectorAligned(const void *where)
     return (reinterpret_cast<size_t>(where)&(sizeof(vector float)-1)) == 0;
 }
 
+inline bool VectorsAligned(const void *v1,const void *v2)
+{
+    return (
+        (reinterpret_cast<size_t>(v1)|reinterpret_cast<size_t>(v2))
+        &(sizeof(vector float)-1)
+    ) == 0;
+}
+
+inline bool VectorsAligned(const void *v1,const void *v2,const void *v3)
+{
+    return (
+        (reinterpret_cast<size_t>(v1)|reinterpret_cast<size_t>(v2)|reinterpret_cast<size_t>(v3))
+        &(sizeof(vector float)-1)
+    ) == 0;
+}
+
+inline bool VectorsAligned(const void *v1,const void *v2,const void *v3,const void *v4)
+{
+    return (
+        (reinterpret_cast<size_t>(v1)|reinterpret_cast<size_t>(v2)|reinterpret_cast<size_t>(v3)|reinterpret_cast<size_t>(v4))
+        &(sizeof(vector float)-1)
+    ) == 0;
+}
+
 inline vector float LoadValue(const float &f)
 {
     return IsVectorAligned(&f)?vec_splat(vec_ld(0,(vector float *)&f),0):LoadUnaligned(&f);
```
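The helpers added above replace the alignment casts that were previously written out inline at every call site. The `VectorsAligned` overloads OR the pointer values together before masking with the vector alignment minus one (16 bytes for both `__m128` and `vector float`), so several pointers can be checked with a single AND and compare: the OR keeps a low bit set if any one of the addresses is misaligned. A minimal standalone sketch of the idiom, using illustrative names and a hard-coded 16-byte alignment instead of flext's compiler-specific expressions:

```cpp
#include <cstdint>
#include <cstdio>

// Illustrative constant: __alignof(__m128) and sizeof(vector float) are both 16.
static const std::uintptr_t VEC_ALIGN = 16;

inline bool is_vector_aligned(const void *p)
{
    return (reinterpret_cast<std::uintptr_t>(p) & (VEC_ALIGN - 1)) == 0;
}

// OR the addresses first: a misaligned low bit in either pointer survives the OR,
// so one mask-and-compare covers both arguments.
inline bool vectors_aligned(const void *a, const void *b)
{
    return ((reinterpret_cast<std::uintptr_t>(a) |
             reinterpret_cast<std::uintptr_t>(b)) & (VEC_ALIGN - 1)) == 0;
}

int main()
{
    alignas(16) float buf[8] = {0};
    // buf is 16-byte aligned, buf+1 is offset by 4 bytes -> prints "1 0"
    std::printf("%d %d\n", is_vector_aligned(buf), vectors_aligned(buf, buf + 1));
    return 0;
}
```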
```diff
@@ -385,8 +439,8 @@ void flext::CopySamples(t_sample *dst,const t_sample *src,int cnt)
             prefetcht0 [eax+32]
         }
 
-        if((reinterpret_cast<size_t>(src)&(__alignof(__m128)-1)) == 0) {
-            if((reinterpret_cast<size_t>(dst)&(__alignof(__m128)-1)) == 0) {
+        if(IsVectorAligned(src)) {
+            if(IsVectorAligned(dst)) {
                 // aligned src, aligned dst
                 __asm {
                     mov eax,dword ptr [src]
@@ -434,7 +488,7 @@ loopau:
             }
         }
         else {
-            if((reinterpret_cast<size_t>(dst)&(__alignof(__m128)-1)) == 0) {
+            if(IsVectorAligned(dst)) {
                 // unaligned src, aligned dst
                 __asm {
                     mov eax,dword ptr [src]
@@ -638,24 +692,45 @@ static void ScaleAltivec(t_sample *dst,const t_sample *src,t_sample opmul,t_samp
     cnt -= n<<4;
 
     for(; n--; src += 16,dst += 16) {
-        vector float a1 = vec_ld( 0,src);
-        vector float a2 = vec_ld(16,src);
-        vector float a3 = vec_ld(32,src);
-        vector float a4 = vec_ld(48,src);
-
-        a1 = vec_madd(a1,argmul,argadd);
-        a2 = vec_madd(a2,argmul,argadd);
-        a3 = vec_madd(a3,argmul,argadd);
-        a4 = vec_madd(a4,argmul,argadd);
-
-        vec_st(a1, 0,dst);
-        vec_st(a2,16,dst);
-        vec_st(a3,32,dst);
-        vec_st(a4,48,dst);
+        vec_st(vec_madd(vec_ld( 0,src),argmul,argadd), 0,dst);
+        vec_st(vec_madd(vec_ld(16,src),argmul,argadd),16,dst);
+        vec_st(vec_madd(vec_ld(32,src),argmul,argadd),32,dst);
+        vec_st(vec_madd(vec_ld(48,src),argmul,argadd),48,dst);
     }
 
     while(cnt--) *(dst++) = *(src++)*opmul+opadd;
 }
+
+static void ScaleAltivec(t_sample *dst,const t_sample *src,t_sample opmul,const t_sample *add,int cnt)
+{
+    const vector float argmul = LoadValue(opmul);
+    int n = cnt>>4;
+    cnt -= n<<4;
+
+    for(; n--; src += 16,dst += 16,add += 16) {
+        vec_st(vec_madd(vec_ld( 0,src),argmul,vec_ld( 0,add)), 0,dst);
+        vec_st(vec_madd(vec_ld(16,src),argmul,vec_ld(16,add)),16,dst);
+        vec_st(vec_madd(vec_ld(32,src),argmul,vec_ld(32,add)),32,dst);
+        vec_st(vec_madd(vec_ld(48,src),argmul,vec_ld(48,add)),48,dst);
+    }
+
+    while(cnt--) *(dst++) = *(src++) * opmul + *(add++);
+}
+
+static void ScaleAltivec(t_sample *dst,const t_sample *src,const t_sample *mul,const t_sample *add,int cnt)
+{
+    int n = cnt>>4;
+    cnt -= n<<4;
+
+    for(; n--; src += 16,dst += 16,mul += 16,add += 16) {
+        vec_st(vec_madd(vec_ld( 0,src),vec_ld( 0,mul),vec_ld( 0,add)), 0,dst);
+        vec_st(vec_madd(vec_ld(16,src),vec_ld(16,mul),vec_ld(16,add)),16,dst);
+        vec_st(vec_madd(vec_ld(32,src),vec_ld(32,mul),vec_ld(32,add)),32,dst);
+        vec_st(vec_madd(vec_ld(48,src),vec_ld(48,mul),vec_ld(48,add)),48,dst);
+    }
+
+    while(cnt--) *(dst++) = *(src++) * *(mul++) + *(add++);
+}
 #endif
```
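The rewritten `ScaleAltivec` collapses the load/madd/store triples into single expressions and gains two overloads (per-sample add vector, and per-sample mul and add vectors), all following the same blocking scheme: `cnt>>4` full blocks of 16 samples (four 4-float Altivec vectors) handled with `vec_madd`, then a scalar loop for the remainder. A portable sketch of that decomposition, assuming `t_sample` is `float` and using a plain inner loop where the real code uses vector intrinsics:

```cpp
typedef float t_sample;  // assumption for this sketch; flext configures t_sample itself

// dst[i] = src[i] * opmul + add[i], blocked the same way as the Altivec routines:
// full blocks of 16 samples first, then a scalar tail of 0..15 samples.
static void scale_blocked(t_sample *dst, const t_sample *src,
                          t_sample opmul, const t_sample *add, int cnt)
{
    int n = cnt >> 4;   // number of complete 16-sample blocks
    cnt -= n << 4;      // samples left for the scalar tail

    for(; n--; src += 16, dst += 16, add += 16)
        for(int i = 0; i < 16; ++i)             // the real code does 4 lanes per vec_madd
            dst[i] = src[i] * opmul + add[i];   // multiply-add per sample

    while(cnt--) *dst++ = *src++ * opmul + *add++;
}
```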
```diff
@@ -682,7 +757,7 @@ void flext::SetSamples(t_sample *dst,int cnt,t_sample s)
             shufps xmm0,xmm0,0
         }
 
-        if((reinterpret_cast<size_t>(dst)&(__alignof(__m128)-1)) == 0) {
+        if(IsVectorAligned(dst)) {
             // aligned version
             __asm {
                 mov ecx,[n]
@@ -768,9 +843,7 @@ void flext::MulSamples(t_sample *dst,const t_sample *src,t_sample op,int cnt)
             shufps xmm0,xmm0,0
         }
 
-        if((reinterpret_cast<size_t>(src)&(__alignof(__m128)-1)) == 0
-            && (reinterpret_cast<size_t>(dst)&(__alignof(__m128)-1)) == 0
-        ) {
+        if(VectorsAligned(src,dst)) {
             // aligned version
             __asm {
                 mov ecx,[n]
@@ -842,7 +915,7 @@ zero:
     }
     else
 #elif FLEXT_CPU == FLEXT_CPU_PPC && defined(__VEC__)
-    if(GetSIMDCapabilities()&simd_altivec && IsVectorAligned(src) && IsVectorAligned(dst))
+    if(GetSIMDCapabilities()&simd_altivec && VectorsAligned(src,dst))
         MulAltivec(dst,src,op,cnt);
     else
 #endif // _MSC_VER
@@ -904,10 +977,8 @@ void flext::MulSamples(t_sample *dst,const t_sample *src,const t_sample *op,int
             prefetcht0 [ebx+32]
         }
 
-        if((reinterpret_cast<size_t>(src)&(__alignof(__m128)-1)) == 0
-            && (reinterpret_cast<size_t>(dst)&(__alignof(__m128)-1)) == 0
-        ) {
-            if((reinterpret_cast<size_t>(op)&(__alignof(__m128)-1)) == 0) {
+        if(VectorsAligned(src,dst)) {
+            if(IsVectorAligned(op)) {
                 __asm {
                     mov ecx,[n]
                     mov eax,dword ptr [src]
@@ -985,7 +1056,7 @@ void flext::MulSamples(t_sample *dst,const t_sample *src,const t_sample *op,int
             }
         }
         else {
-            if((reinterpret_cast<size_t>(op)&(__alignof(__m128)-1)) == 0) {
+            if(IsVectorAligned(op)) {
                 __asm {
                     mov ecx,[n]
                     mov eax,dword ptr [src]
@@ -1072,7 +1143,7 @@ zero:
     }
     else
 #elif FLEXT_CPU == FLEXT_CPU_PPC && defined(__VEC__)
-    if(GetSIMDCapabilities()&simd_altivec && IsVectorAligned(src) && IsVectorAligned(op) && IsVectorAligned(dst))
+    if(GetSIMDCapabilities()&simd_altivec && VectorsAligned(src,op,dst))
         MulAltivec(dst,src,op,cnt);
     else
 #endif // _MSC_VER
@@ -1134,9 +1205,7 @@ void flext::AddSamples(t_sample *dst,const t_sample *src,t_sample op,int cnt)
             shufps xmm0,xmm0,0
         }
 
-        if((reinterpret_cast<size_t>(src)&(__alignof(__m128)-1)) == 0
-            && (reinterpret_cast<size_t>(dst)&(__alignof(__m128)-1)) == 0
-        ) {
+        if(VectorsAligned(src,dst)) {
             // aligned version
             __asm {
                 mov ecx,[n]
@@ -1202,7 +1271,7 @@ loopu:
     }
     else
 #elif FLEXT_CPU == FLEXT_CPU_PPC && defined(__VEC__)
-    if(GetSIMDCapabilities()&simd_altivec && IsVectorAligned(src) && IsVectorAligned(dst))
+    if(GetSIMDCapabilities()&simd_altivec && VectorsAligned(src,dst))
         AddAltivec(dst,src,op,cnt);
     else
 #endif // _MSC_VER
@@ -1263,10 +1332,8 @@ void flext::AddSamples(t_sample *dst,const t_sample *src,const t_sample *op,int
         int n = cnt>>4;
         cnt -= n<<4;
 
-        if((reinterpret_cast<size_t>(src)&(__alignof(__m128)-1)) == 0
-            && (reinterpret_cast<size_t>(dst)&(__alignof(__m128)-1)) == 0
-        ) {
-            if((reinterpret_cast<size_t>(op)&(__alignof(__m128)-1)) == 0) {
+        if(VectorsAligned(src,dst)) {
+            if(IsVectorAligned(op)) {
                 __asm {
                     mov ecx,dword ptr [n]
                     mov eax,dword ptr [src]
@@ -1344,7 +1411,7 @@ void flext::AddSamples(t_sample *dst,const t_sample *src,const t_sample *op,int
             }
         }
         else {
-            if((reinterpret_cast<size_t>(op)&(__alignof(__m128)-1)) == 0) {
+            if(IsVectorAligned(op)) {
                 __asm {
                     mov ecx,dword ptr [n]
                     mov eax,dword ptr [src]
@@ -1430,7 +1497,7 @@ void flext::AddSamples(t_sample *dst,const t_sample *src,const t_sample *op,int
     }
     else
 #elif FLEXT_CPU == FLEXT_CPU_PPC && defined(__VEC__)
-    if(GetSIMDCapabilities()&simd_altivec && IsVectorAligned(src) && IsVectorAligned(op) && IsVectorAligned(dst))
+    if(GetSIMDCapabilities()&simd_altivec && VectorsAligned(src,op,dst))
         AddAltivec(dst,src,op,cnt);
     else
 #endif // _MSC_VER
@@ -1496,9 +1563,7 @@ void flext::ScaleSamples(t_sample *dst,const t_sample *src,t_sample opmul,t_samp
             shufps xmm1,xmm1,0
         }
 
-        if((reinterpret_cast<size_t>(src)&(__alignof(__m128)-1)) == 0
-            && (reinterpret_cast<size_t>(dst)&(__alignof(__m128)-1)) == 0
-        ) {
+        if(VectorsAligned(src,dst)) {
             // aligned version
             __asm {
                 mov ecx,dword ptr [n]
@@ -1572,7 +1637,7 @@ loopu:
     }
     else
 #elif FLEXT_CPU == FLEXT_CPU_PPC && defined(__VEC__)
-    if(GetSIMDCapabilities()&simd_altivec && IsVectorAligned(src) && IsVectorAligned(dst))
+    if(GetSIMDCapabilities()&simd_altivec && VectorsAligned(src,dst))
         ScaleAltivec(dst,src,opmul,opadd,cnt);
     else
 #endif // _MSC_VER
```
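The hunks above are mostly mechanical: every SSE kernel that used to spell out the `reinterpret_cast`/mask test now calls `IsVectorAligned` or `VectorsAligned`, and the Altivec dispatch lines do the same. The common shape is: broadcast the scalar operand into all four lanes (`movss` followed by `shufps xmm0,xmm0,0`), then run either the `movaps` (aligned) or `movups` (unaligned) loop over the buffer. A hedged intrinsics sketch of that idea, not the shipped inline-assembly code, with the block size reduced to one vector for brevity:

```cpp
#include <cstdint>
#include <xmmintrin.h>   // SSE intrinsics

typedef float t_sample;  // assumption for this sketch

// dst[i] = src[i] * op: broadcast op (the shufps trick), pick aligned or unaligned
// loads/stores from the pointer test, finish with a scalar tail.
void mul_samples_sse(t_sample *dst, const t_sample *src, t_sample op, int cnt)
{
    const __m128 k = _mm_set1_ps(op);           // movss + shufps xmm0,xmm0,0
    const bool aligned =
        ((reinterpret_cast<std::uintptr_t>(src) |
          reinterpret_cast<std::uintptr_t>(dst)) & 15) == 0;

    int n = cnt >> 2;
    cnt -= n << 2;

    for(; n--; src += 4, dst += 4) {
        __m128 v = aligned ? _mm_load_ps(src) : _mm_loadu_ps(src);  // movaps / movups
        v = _mm_mul_ps(v, k);                                       // mulps
        if(aligned) _mm_store_ps(dst, v);
        else        _mm_storeu_ps(dst, v);
    }

    while(cnt--) *dst++ = *src++ * op;          // remainder
}
```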
```diff
@@ -1592,12 +1657,135 @@ loopu:
 #endif
 }
 
-void flext::ScaleSamples(t_sample *dst,const t_sample *src,t_sample opmul,const t_sample *add,int cnt)
+void flext::ScaleSamples(t_sample *dst,const t_sample *src,t_sample opmul,const t_sample *opadd,int cnt)
 {
+#ifdef FLEXT_USE_IPP
+    if(sizeof(t_sample) == 4) {
+        ippsMulC_32f((const float *)src,(float)opmul,(float *)dst,cnt);
+        ippsAdd_32f_I((float *)opadd,(float *)dst,cnt);
+    }
+    else if(sizeof(t_sample) == 8) {
+        ippsMulC_64f((const double *)src,(double)opmul,(double *)dst,cnt);
+        ippsAdd_64f_I((double *)opadd,(double *)dst,cnt);
+    }
+    else
+        ERRINTERNAL();
+#else
+#ifdef FLEXT_USE_SIMD
+#ifdef _MSC_VER
+    if(GetSIMDCapabilities()&simd_sse) {
+        // single precision
+        int n = cnt>>4;
+        cnt -= n<<4;
+
+        __asm {
+            mov eax,dword ptr [src]
+            prefetcht0 [eax+0]
+            prefetcht0 [eax+32]
+
+            movss xmm0,xmmword ptr [opmul]
+            shufps xmm0,xmm0,0
+        }
+
+        if(VectorsAligned(src,dst,opadd)) {
+            // aligned version
+            __asm {
+                mov ecx,dword ptr [n]
+                mov eax,dword ptr [src]
+                mov edx,dword ptr [dst]
+                mov ebx,dword ptr [opadd]
+loopa:
+                prefetcht0 [eax+64]
+                prefetcht0 [ebx+64]
+                prefetcht0 [eax+96]
+                prefetcht0 [ebx+96]
+
+                movaps xmm2,xmmword ptr[eax]
+                movaps xmm1,xmmword ptr[ebx]
+                mulps xmm2,xmm0
+                addps xmm2,xmm1
+                movaps xmmword ptr[edx],xmm2
+
+                movaps xmm3,xmmword ptr[eax+4*4]
+                movaps xmm1,xmmword ptr[ebx+4*4]
+                mulps xmm3,xmm0
+                addps xmm3,xmm1
+                movaps xmmword ptr[edx+4*4],xmm3
+
+                movaps xmm4,xmmword ptr[eax+8*4]
+                movaps xmm1,xmmword ptr[ebx+8*4]
+                mulps xmm4,xmm0
+                addps xmm4,xmm1
+                movaps xmmword ptr[edx+8*4],xmm4
+
+                movaps xmm5,xmmword ptr[eax+12*4]
+                movaps xmm1,xmmword ptr[ebx+12*4]
+                mulps xmm5,xmm0
+                addps xmm5,xmm1
+                movaps xmmword ptr[edx+12*4],xmm5
+
+                add eax,16*4
+                add edx,16*4
+                add ebx,16*4
+                loop loopa
+            }
+        }
+        else {
+            // unaligned version
+            __asm {
+                mov ecx,dword ptr [n]
+                mov eax,dword ptr [src]
+                mov edx,dword ptr [dst]
+                mov ebx,dword ptr [opadd]
+loopu:
+                prefetcht0 [eax+64]
+                prefetcht0 [ebx+64]
+                prefetcht0 [eax+96]
+                prefetcht0 [ebx+96]
+
+                movups xmm2,xmmword ptr[eax]
+                movups xmm1,xmmword ptr[ebx]
+                mulps xmm2,xmm0
+                addps xmm2,xmm1
+                movups xmmword ptr[edx],xmm2
+
+                movups xmm3,xmmword ptr[eax+4*4]
+                movups xmm1,xmmword ptr[ebx+4*4]
+                mulps xmm3,xmm0
+                addps xmm3,xmm1
+                movups xmmword ptr[edx+4*4],xmm3
+
+                movups xmm4,xmmword ptr[eax+8*4]
+                movups xmm1,xmmword ptr[ebx+8*4]
+                mulps xmm4,xmm0
+                addps xmm4,xmm1
+                movups xmmword ptr[edx+8*4],xmm4
+
+                movups xmm5,xmmword ptr[eax+12*4]
+                movups xmm1,xmmword ptr[ebx+12*4]
+                mulps xmm5,xmm0
+                addps xmm5,xmm1
+                movups xmmword ptr[edx+12*4],xmm5
+
+                add eax,16*4
+                add edx,16*4
+                add ebx,16*4
+                loop loopu
+            }
+        }
+        while(cnt--) *(dst++) = *(src++) * opmul + *(opadd++);
+    }
+    else
+#elif FLEXT_CPU == FLEXT_CPU_PPC && defined(__VEC__)
+    if(GetSIMDCapabilities()&simd_altivec && VectorsAligned(src,dst,opadd))
+        ScaleAltivec(dst,src,opmul,opadd,cnt);
+    else
+#endif // _MSC_VER
+#endif // FLEXT_USE_SIMD
     {
         int n = cnt>>3;
         cnt -= n<<3;
 
-        if(dst == add) {
+        if(dst == opadd) {
             while(n--) {
                 dst[0] += src[0]*opmul; dst[1] += src[1]*opmul;
                 dst[2] += src[2]*opmul; dst[3] += src[3]*opmul;
```
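The new SIMD body for `ScaleSamples(dst, src, opmul, opadd[], cnt)` computes `dst[i] = src[i]*opmul + opadd[i]` in a single pass, while the IPP branch has to compose it from two library calls: `ippsMulC_32f` writes the scaled signal into `dst`, then `ippsAdd_32f_I` adds `opadd` in place, touching `dst` twice. A scalar sketch of the two formulations (plain C++, not IPP):

```cpp
typedef float t_sample;  // assumption for this sketch

// Mirrors the IPP branch: scale first, then add in place (two passes over dst).
static void scale_two_pass(t_sample *dst, const t_sample *src,
                           t_sample opmul, const t_sample *opadd, int cnt)
{
    for(int i = 0; i < cnt; ++i) dst[i] = src[i] * opmul;   // ippsMulC_32f equivalent
    for(int i = 0; i < cnt; ++i) dst[i] += opadd[i];        // ippsAdd_32f_I equivalent
}

// Mirrors the SSE/Altivec kernels and the scalar fallback: one fused pass.
static void scale_one_pass(t_sample *dst, const t_sample *src,
                           t_sample opmul, const t_sample *opadd, int cnt)
{
    for(int i = 0; i < cnt; ++i) dst[i] = src[i] * opmul + opadd[i];
}
```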
```diff
@@ -1609,41 +1797,179 @@ void flext::ScaleSamples(t_sample *dst,const t_sample *src,t_sample opmul,const
         }
         else {
             while(n--) {
-                dst[0] = src[0]*opmul+add[0]; dst[1] = src[1]*opmul+add[1];
-                dst[2] = src[2]*opmul+add[2]; dst[3] = src[3]*opmul+add[3];
-                dst[4] = src[4]*opmul+add[4]; dst[5] = src[5]*opmul+add[5];
-                dst[6] = src[6]*opmul+add[6]; dst[7] = src[7]*opmul+add[7];
-                src += 8,dst += 8,add += 8;
+                dst[0] = src[0]*opmul+opadd[0]; dst[1] = src[1]*opmul+opadd[1];
+                dst[2] = src[2]*opmul+opadd[2]; dst[3] = src[3]*opmul+opadd[3];
+                dst[4] = src[4]*opmul+opadd[4]; dst[5] = src[5]*opmul+opadd[5];
+                dst[6] = src[6]*opmul+opadd[6]; dst[7] = src[7]*opmul+opadd[7];
+                src += 8,dst += 8,opadd += 8;
             }
-            while(cnt--) *(dst++) = *(src++)*opmul+*(add++);
+            while(cnt--) *(dst++) = *(src++)*opmul+*(opadd++);
         }
     }
+#endif
 }
 
-void flext::ScaleSamples(t_sample *dst,const t_sample *src,const t_sample *mul,const t_sample *add,int cnt)
+void flext::ScaleSamples(t_sample *dst,const t_sample *src,const t_sample *opmul,const t_sample *opadd,int cnt)
 {
+#ifdef FLEXT_USE_IPP
+    if(sizeof(t_sample) == 4) {
+        ippsMul_32f((const float *)src,(const float *)opmul,(float *)dst,cnt);
+        ippsAdd_32f_I((const float *)opadd,(float *)dst,cnt);
+    }
+    else if(sizeof(t_sample) == 8) {
+        ippsMul_64f((const double *)src,(const double *)opmul,(double *)dst,cnt);
+        ippsAdd_64f_I((const double *)opadd,(double *)dst,cnt);
+    }
+    else
+        ERRINTERNAL();
+#else
+#ifdef FLEXT_USE_SIMD
+#ifdef _MSC_VER
+    if(GetSIMDCapabilities()&simd_sse) {
+        // single precision
+        int n = cnt>>4;
+        cnt -= n<<4;
+
+        __asm {
+            mov eax,dword ptr [src]
+            prefetcht0 [eax+0]
+            prefetcht0 [eax+32]
+        }
+
+        if(VectorsAligned(src,dst,opmul,opadd)) {
+            // aligned version
+            __asm {
+                mov ecx,dword ptr [n]
+                mov eax,dword ptr [src]
+                mov edx,dword ptr [dst]
+                mov esi,dword ptr [opmul]
+                mov ebx,dword ptr [opadd]
+loopa:
+                prefetcht0 [eax+64]
+                prefetcht0 [ebx+64]
+                prefetcht0 [esi+64]
+                prefetcht0 [eax+96]
+                prefetcht0 [ebx+96]
+                prefetcht0 [esi+96]
+
+                movaps xmm2,xmmword ptr[eax]
+                movaps xmm0,xmmword ptr[esi]
+                movaps xmm1,xmmword ptr[ebx]
+                mulps xmm2,xmm0
+                addps xmm2,xmm1
+                movaps xmmword ptr[edx],xmm2
+
+                movaps xmm3,xmmword ptr[eax+4*4]
+                movaps xmm0,xmmword ptr[esi+4*4]
+                movaps xmm1,xmmword ptr[ebx+4*4]
+                mulps xmm3,xmm0
+                addps xmm3,xmm1
+                movaps xmmword ptr[edx+4*4],xmm3
+
+                movaps xmm4,xmmword ptr[eax+8*4]
+                movaps xmm0,xmmword ptr[esi+8*4]
+                movaps xmm1,xmmword ptr[ebx+8*4]
+                mulps xmm4,xmm0
+                addps xmm4,xmm1
+                movaps xmmword ptr[edx+8*4],xmm4
+
+                movaps xmm5,xmmword ptr[eax+12*4]
+                movaps xmm0,xmmword ptr[esi+12*4]
+                movaps xmm1,xmmword ptr[ebx+12*4]
+                mulps xmm5,xmm0
+                addps xmm5,xmm1
+                movaps xmmword ptr[edx+12*4],xmm5
+
+                add eax,16*4
+                add edx,16*4
+                add ebx,16*4
+                add esi,16*4
+                loop loopa
+            }
+        }
+        else {
+            // unaligned version
+            __asm {
+                mov ecx,dword ptr [n]
+                mov eax,dword ptr [src]
+                mov edx,dword ptr [dst]
+                mov esi,dword ptr [opmul]
+                mov ebx,dword ptr [opadd]
+loopu:
+                prefetcht0 [eax+64]
+                prefetcht0 [ebx+64]
+                prefetcht0 [esi+64]
+                prefetcht0 [eax+96]
+                prefetcht0 [ebx+96]
+                prefetcht0 [esi+96]
+
+                movups xmm2,xmmword ptr[eax]
+                movups xmm0,xmmword ptr[esi]
+                movups xmm1,xmmword ptr[ebx]
+                mulps xmm2,xmm0
+                addps xmm2,xmm1
+                movups xmmword ptr[edx],xmm2
+
+                movups xmm3,xmmword ptr[eax+4*4]
+                movups xmm0,xmmword ptr[esi+4*4]
+                movups xmm1,xmmword ptr[ebx+4*4]
+                mulps xmm3,xmm0
+                addps xmm3,xmm1
+                movups xmmword ptr[edx+4*4],xmm3
+
+                movups xmm4,xmmword ptr[eax+8*4]
+                movups xmm0,xmmword ptr[esi+8*4]
+                movups xmm1,xmmword ptr[ebx+8*4]
+                mulps xmm4,xmm0
+                addps xmm4,xmm1
+                movups xmmword ptr[edx+8*4],xmm4
+
+                movups xmm5,xmmword ptr[eax+12*4]
+                movups xmm0,xmmword ptr[esi+12*4]
+                movups xmm1,xmmword ptr[ebx+12*4]
+                mulps xmm5,xmm0
+                addps xmm5,xmm1
+                movups xmmword ptr[edx+12*4],xmm5
+
+                add eax,16*4
+                add edx,16*4
+                add ebx,16*4
+                add esi,16*4
+                loop loopu
+            }
+        }
+        while(cnt--) *(dst++) = *(src++) * *(opmul++) + *(opadd++);
+    }
+    else
+#elif FLEXT_CPU == FLEXT_CPU_PPC && defined(__VEC__)
+    if(GetSIMDCapabilities()&simd_altivec && VectorsAligned(src,dst,opmul,opadd))
+        ScaleAltivec(dst,src,opmul,opadd,cnt);
+    else
+#endif // _MSC_VER
+#endif // FLEXT_USE_SIMD
     {
         int n = cnt>>3;
         cnt -= n<<3;
 
-        if(dst == add) {
+        if(dst == opadd) {
             while(n--) {
-                dst[0] += src[0]*mul[0]; dst[1] += src[1]*mul[1];
-                dst[2] += src[2]*mul[2]; dst[3] += src[3]*mul[3];
-                dst[4] += src[4]*mul[4]; dst[5] += src[5]*mul[5];
-                dst[6] += src[6]*mul[6]; dst[7] += src[7]*mul[7];
-                src += 8,dst += 8,mul += 8;
+                dst[0] += src[0]*opmul[0]; dst[1] += src[1]*opmul[1];
+                dst[2] += src[2]*opmul[2]; dst[3] += src[3]*opmul[3];
+                dst[4] += src[4]*opmul[4]; dst[5] += src[5]*opmul[5];
+                dst[6] += src[6]*opmul[6]; dst[7] += src[7]*opmul[7];
+                src += 8,dst += 8,opmul += 8;
             }
-            while(cnt--) *(dst++) += *(src++) * *(mul++);
+            while(cnt--) *(dst++) += *(src++) * *(opmul++);
         }
         else {
             while(n--) {
-                dst[0] = src[0]*mul[0]+add[0]; dst[1] = src[1]*mul[1]+add[1];
-                dst[2] = src[2]*mul[2]+add[2]; dst[3] = src[3]*mul[3]+add[3];
-                dst[4] = src[4]*mul[4]+add[4]; dst[5] = src[5]*mul[5]+add[5];
-                dst[6] = src[6]*mul[6]+add[6]; dst[7] = src[7]*mul[7]+add[7];
-                src += 8,dst += 8,mul += 8,add += 8;
+                dst[0] = src[0]*opmul[0]+opadd[0]; dst[1] = src[1]*opmul[1]+opadd[1];
+                dst[2] = src[2]*opmul[2]+opadd[2]; dst[3] = src[3]*opmul[3]+opadd[3];
+                dst[4] = src[4]*opmul[4]+opadd[4]; dst[5] = src[5]*opmul[5]+opadd[5];
+                dst[6] = src[6]*opmul[6]+opadd[6]; dst[7] = src[7]*opmul[7]+opadd[7];
+                src += 8,dst += 8,opmul += 8,opadd += 8;
            }
-            while(cnt--) *(dst++) = *(src++)* *(mul++) + *(add++);
+            while(cnt--) *(dst++) = *(src++)* *(opmul++) + *(opadd++);
         }
     }
+#endif
 }
```
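The final hunk gives the fully vectorized `ScaleSamples(dst, src, opmul[], opadd[], cnt)` the same treatment; the scalar fallback also keeps its special case for `dst == opadd`, where the expression collapses to an in-place accumulate. For reference, the unaligned SSE path maps almost one-to-one onto compiler intrinsics; the following is a hedged modern sketch, not the 32-bit MSVC inline assembly that flext actually ships:

```cpp
#include <xmmintrin.h>   // SSE intrinsics

typedef float t_sample;  // assumption for this sketch

// dst[i] = src[i] * opmul[i] + opadd[i], four floats per step with unaligned
// loads/stores (movups/mulps/addps), then a scalar tail for the last 0..3 samples.
void scale_samples_sse(t_sample *dst, const t_sample *src,
                       const t_sample *opmul, const t_sample *opadd, int cnt)
{
    int n = cnt >> 2;
    cnt -= n << 2;

    for(; n--; src += 4, dst += 4, opmul += 4, opadd += 4) {
        __m128 s = _mm_loadu_ps(src);
        __m128 m = _mm_loadu_ps(opmul);
        __m128 a = _mm_loadu_ps(opadd);
        _mm_storeu_ps(dst, _mm_add_ps(_mm_mul_ps(s, m), a));
    }

    while(cnt--) *dst++ = *src++ * *opmul++ + *opadd++;
}
```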