/*

flext - C++ layer for Max/MSP and pd (pure data) externals

Copyright (c) 2001-2005 Thomas Grill (gr@grrrr.org)
For information on usage and redistribution, and for a DISCLAIMER OF ALL
WARRANTIES, see the file, "license.txt," in this distribution.

*/

/*! \file flsimd.cpp
    \brief flext SIMD support functions

    If FLEXT_USE_SIMD is defined at compilation, SIMD instructions are used wherever feasible.
    If used with MSVC++ 6 the "Processor Pack" must be installed.

    If FLEXT_USE_IPP is defined the Intel Performance Package is used.
*/

#include "flext.h"
#include <string.h>

#if FLEXT_OS == FLEXT_OS_WIN
#include <windows.h>
#endif

#ifdef FLEXT_USE_IPP
#include <ipps.h>
#endif

#ifdef FLEXT_USE_SIMD
    #ifdef _MSC_VER
        // include MSVC SIMD header files
        #include <mmintrin.h> // MMX
        #include <xmmintrin.h> // SSE
        #include <emmintrin.h> // SSE2
        #include <mm3dnow.h> // 3DNow!
    #elif FLEXT_CPU == FLEXT_CPU_PPC && defined(__MWERKS__) && defined(__VEC__)
        #if FLEXT_OSAPI == FLEXT_OSAPI_MAC_MACH
            #include <sys/sysctl.h>
            #include <vDSP.h>
        #else
            #include <Gestalt.h>
        #endif

        #pragma altivec_model on

        #include <altivec.h>
        #include <vectorOps.h>
    #elif FLEXT_CPU == FLEXT_CPU_PPC && defined(__GNUC__) && defined(__VEC__)
        #include <sys/sysctl.h>
        #include <vecLib/vecLib.h>
    #endif
#endif // FLEXT_USE_SIMD

static unsigned long setsimdcaps();

/*! \brief Holds SIMD capability flags
    \internal
*/
unsigned long flext::simdcaps = setsimdcaps();

//! Return the SIMD capability flags determined once at static initialization
unsigned long flext::GetSIMDCapabilities() { return simdcaps; }


#ifdef FLEXT_USE_SIMD

// NOTE(review): this guard tests FLEXT_CPU_X84_64 while setsimdcaps() below tests
// FLEXT_CPU_AMD64. The macro names cannot be verified from this file; confirm
// against the flext prefix header and make both guards agree.
#if FLEXT_CPU == FLEXT_CPU_IA32 || FLEXT_CPU == FLEXT_CPU_X84_64

#define _CPU_FEATURE_MMX    0x0001
#define _CPU_FEATURE_SSE    0x0002
#define _CPU_FEATURE_SSE2   0x0004
#define _CPU_FEATURE_3DNOW  0x0008

//! Result record filled by _cpuid()
typedef struct _processor_info {
    int family;         // family of the processor
                        // e.g. 6 = Pentium-Pro architecture
    int model;          // model of processor
                        // e.g. 1 = Pentium-Pro for family = 6
    int stepping;       // processor revision number
    int feature;        // processor feature
                        // (same as return value from _cpuid)
    int os_support;     // does OS Support the feature?
    int checks;         // mask of checked bits in feature
                        // and os_support fields
} _p_info;

// These are the bit flags that get set on calling cpuid
// with register eax set to 1
#define _MMX_FEATURE_BIT        0x00800000
#define _SSE_FEATURE_BIT        0x02000000
#define _SSE2_FEATURE_BIT       0x04000000

// This bit is set when cpuid is called with
// register set to 80000001h (only applicable to AMD)
#define _3DNOW_FEATURE_BIT      0x80000000

#ifdef _MSC_VER

/*! \brief Probe whether the cpuid instruction can be executed
    \return 1 if cpuid ran without raising a structured exception, 0 otherwise
*/
static int IsCPUID()
{
    __try {
        _asm {
            xor eax, eax
            cpuid
        }
    }
    __except (EXCEPTION_EXECUTE_HANDLER) {
        return 0;
    }
    return 1;
}

/*! \brief Probe whether an instruction of the given feature set can be executed
    \param feature one of the _CPU_FEATURE_* constants
    \return 1 if the probe instruction ran (or the feature is unknown to the switch),
            0 if any structured exception was raised
*/
static int _os_support(int feature)
{
    __try {
        switch (feature) {
        case _CPU_FEATURE_SSE:
            __asm {
                xorps xmm0, xmm0        // executing SSE instruction
            }
            break;
        case _CPU_FEATURE_SSE2:
            __asm {
                xorpd xmm0, xmm0        // executing SSE2 instruction
            }
            break;
        case _CPU_FEATURE_3DNOW:
            __asm {
                pfrcp mm0, mm0          // executing 3DNow! instruction
                emms
            }
            break;
        case _CPU_FEATURE_MMX:
            __asm {
                pxor mm0, mm0           // executing MMX instruction
                emms
            }
            break;
        }
    }
    __except (EXCEPTION_EXECUTE_HANDLER) {
        // every exception (illegal instruction or anything else) means "not usable"
        return 0;
    }
    return 1;
}

/*! \brief Query CPU features via cpuid and test each one for usability
    \param pinfo optional record to fill (may be NULL)
    \return mask of _CPU_FEATURE_* bits reported by the CPU;
            0 if cpuid is unavailable - in that case *pinfo is NOT written
*/
static int _cpuid (_p_info *pinfo)
{
    DWORD dwStandard = 0;
    DWORD dwFeature = 0;
    DWORD dwMax = 0;
    DWORD dwExt = 0;
    int feature = 0;
    int os_support = 0;
    union {
        struct {
            DWORD dw0;
            DWORD dw1;
            DWORD dw2;
        } s;
    } Ident;

    if (!IsCPUID()) {
        return 0;
    }

    _asm {
        push ebx
        push ecx
        push edx

        // get the vendor string
        xor eax, eax
        cpuid
        mov dwMax, eax
        mov Ident.s.dw0, ebx
        mov Ident.s.dw1, edx
        mov Ident.s.dw2, ecx

        // get the Standard bits
        mov eax, 1
        cpuid
        mov dwStandard, eax
        mov dwFeature, edx

        // get AMD-specials
        mov eax, 80000000h
        cpuid
        // leaf 80000001h exists only if the reported maximum is above 80000000h
        cmp eax, 80000000h
        jbe notamd
        mov eax, 80000001h
        cpuid
        mov dwExt, edx

notamd:
        // restore in reverse order of the pushes above
        pop edx
        pop ecx
        pop ebx
    }

    if (dwFeature & _MMX_FEATURE_BIT) {
        feature |= _CPU_FEATURE_MMX;
        if (_os_support(_CPU_FEATURE_MMX))
            os_support |= _CPU_FEATURE_MMX;
    }
    if (dwExt & _3DNOW_FEATURE_BIT) {
        feature |= _CPU_FEATURE_3DNOW;
        if (_os_support(_CPU_FEATURE_3DNOW))
            os_support |= _CPU_FEATURE_3DNOW;
    }
    if (dwFeature & _SSE_FEATURE_BIT) {
        feature |= _CPU_FEATURE_SSE;
        if (_os_support(_CPU_FEATURE_SSE))
            os_support |= _CPU_FEATURE_SSE;
    }
    if (dwFeature & _SSE2_FEATURE_BIT) {
        feature |= _CPU_FEATURE_SSE2;
        if (_os_support(_CPU_FEATURE_SSE2))
            os_support |= _CPU_FEATURE_SSE2;
    }

    if (pinfo) {
        memset(pinfo, 0, sizeof(_p_info));

        pinfo->os_support = os_support;
        pinfo->feature = feature;
        // NOTE(review): the extended family/model bit extraction below is kept as found;
        // verify the shifts and masks against the CPUID specification before relying on it.
        pinfo->family = (dwStandard >> 8) & 0xF; // retrieve family
        if (pinfo->family == 15) {               // retrieve extended family
            pinfo->family |= (dwStandard >> 16) & 0xFF0;
        }
        pinfo->model = (dwStandard >> 4) & 0xF;  // retrieve model
        if (pinfo->model == 15) {                // retrieve extended model
            pinfo->model |= (dwStandard >> 12) & 0xF;
        }
        pinfo->stepping = (dwStandard) & 0xF;    // retrieve stepping

        pinfo->checks = _CPU_FEATURE_MMX |
                        _CPU_FEATURE_SSE |
                        _CPU_FEATURE_SSE2 |
                        _CPU_FEATURE_3DNOW;
    }

    return feature;
}

//! True if the address is a multiple of the alignment of __m128
inline bool IsVectorAligned(const void *where)
{
    return (reinterpret_cast<size_t>(where)&(__alignof(__m128)-1)) == 0;
}

//! True if both addresses are multiples of the alignment of __m128
inline bool VectorsAligned(const void *v1,const void *v2)
{
    return (
        (reinterpret_cast<size_t>(v1)|reinterpret_cast<size_t>(v2))
        &(__alignof(__m128)-1)
    ) == 0;
}

//! True if all three addresses are multiples of the alignment of __m128
inline bool VectorsAligned(const void *v1,const void *v2,const void *v3)
{
    return (
        (reinterpret_cast<size_t>(v1)|reinterpret_cast<size_t>(v2)|reinterpret_cast<size_t>(v3))
        &(__alignof(__m128)-1)
    ) == 0;
}

//! True if all four addresses are multiples of the alignment of __m128
inline bool VectorsAligned(const void *v1,const void *v2,const void *v3,const void *v4)
{
    return (
        (reinterpret_cast<size_t>(v1)|reinterpret_cast<size_t>(v2)|reinterpret_cast<size_t>(v3)|reinterpret_cast<size_t>(v4))
        &(__alignof(__m128)-1)
    ) == 0;
}

#else
// not MSVC

//! Stub: no detection is done, the record is zeroed and no feature is reported
static int _cpuid (_p_info *pinfo)
{
    if(pinfo) memset(pinfo,0,sizeof *pinfo);
    return 0;
}
#endif

#endif


/*! \brief Determine SIMD capabilities
    \internal
*/
static unsigned long setsimdcaps()
{
    unsigned long simdflags = flext::simd_none;

#if FLEXT_CPU == FLEXT_CPU_IA32 || FLEXT_CPU == FLEXT_CPU_AMD64
    _p_info cpuinfo;
    // the MSVC _cpuid() returns early without writing the record when cpuid
    // is unavailable, so start from a zeroed record
    memset(&cpuinfo,0,sizeof cpuinfo);
    _cpuid(&cpuinfo);

    if(cpuinfo.os_support&_CPU_FEATURE_MMX) simdflags += flext::simd_mmx;
    if(cpuinfo.os_support&_CPU_FEATURE_3DNOW) simdflags += flext::simd_3dnow;
    if(cpuinfo.os_support&_CPU_FEATURE_SSE) simdflags += flext::simd_sse;
    if(cpuinfo.os_support&_CPU_FEATURE_SSE2) simdflags += flext::simd_sse2;
#elif FLEXT_CPU == FLEXT_CPU_PPC && defined(__VEC__)

    #if FLEXT_OSAPI == FLEXT_OSAPI_MAC_MACH

    int selectors[2] = { CTL_HW, HW_VECTORUNIT };
    int hasVectorUnit = 0;
    size_t length = sizeof(hasVectorUnit);
    int error = sysctl(selectors, 2, &hasVectorUnit, &length, NULL, 0);

    if(!error && hasVectorUnit != 0) simdflags += flext::simd_altivec;

    #else

    long cpuAttributes;
    OSErr err = Gestalt( gestaltPowerPCProcessorFeatures, &cpuAttributes );

    if( noErr == err )
        if(( 1 << gestaltPowerPCHasVectorInstructions) & cpuAttributes) simdflags += flext::simd_altivec;

    #endif
#endif
    return simdflags;
}


#if FLEXT_CPU == FLEXT_CPU_PPC && defined(__VEC__)

/* functions for misaligned vector data - taken from the Altivec tutorial of Ian Ollmann, Ph.D. */

//! Load a vector from an unaligned location in memory
inline vector unsigned char LoadUnaligned( vector unsigned char *v )
{
    vector unsigned char permuteVector = vec_lvsl( 0, (int*) v );
    vector unsigned char low = vec_ld( 0, v );
    vector unsigned char high = vec_ld( 15, v );
    return vec_perm( low, high, permuteVector );
}

/*
//! Store a vector to an unaligned location in memory
inline void StoreUnaligned( vector unsigned char v, vector unsigned char *where)
{
    // Load the surrounding area
    vector unsigned char low = vec_ld( 0, where );
    vector unsigned char high = vec_ld( 16, where );
    // Prepare the constants that we need
    vector unsigned char permuteVector = vec_lvsr( 0, (int*) where );
    vector unsigned char oxFF = (vector unsigned char)vec_splat_s8( -1 );
    vector unsigned char ox00 = (vector unsigned char)vec_splat_s8( 0 );
    // Make a mask for which parts of the vectors to swap out
    vector unsigned char mask = vec_perm( ox00, oxFF, permuteVector );
    // Right rotate our input data
    v = vec_perm( v, v, permuteVector );
    // Insert our data into the low and high vectors
    low = vec_sel( v, low, mask );
    high = vec_sel( high, v, mask );
    // Store the two aligned result vectors
    vec_st( low, 0, where );
    vec_st( high, 16, where );
}
*/

//! Load four floats from an unaligned location in memory
inline vector float LoadUnaligned(const float *v )
{
    return (vector float)LoadUnaligned((vector unsigned char *)v);
}

/*
inline void StoreUnaligned( vector float v,float *where)
{
    return StoreUnaligned((vector unsigned char)v,(vector unsigned char *)where);
}
*/

//! True if the address is a multiple of sizeof(vector float)
inline bool IsVectorAligned(const void *where)
{
    return (reinterpret_cast<size_t>(where)&(sizeof(vector float)-1)) == 0;
}

//! True if both addresses are multiples of sizeof(vector float)
inline bool VectorsAligned(const void *v1,const void *v2)
{
    return (
        (reinterpret_cast<size_t>(v1)|reinterpret_cast<size_t>(v2))
        &(sizeof(vector float)-1)
    ) == 0;
}

//! True if all three addresses are multiples of sizeof(vector float)
inline bool VectorsAligned(const void *v1,const void *v2,const void *v3)
{
    return (
        (reinterpret_cast<size_t>(v1)|reinterpret_cast<size_t>(v2)|reinterpret_cast<size_t>(v3))
        &(sizeof(vector float)-1)
    ) == 0;
}

//! True if all four addresses are multiples of sizeof(vector float)
inline bool VectorsAligned(const void *v1,const void *v2,const void *v3,const void *v4)
{
    return (
        (reinterpret_cast<size_t>(v1)|reinterpret_cast<size_t>(v2)|reinterpret_cast<size_t>(v3)|reinterpret_cast<size_t>(v4))
        &(sizeof(vector float)-1)
    ) == 0;
}

//! Build a vector with all elements equal to f
inline vector float LoadValue(const float &f)
{
    return vec_splat(IsVectorAligned(&f)?vec_ld(0,(vector float *)&f):LoadUnaligned(&f),0);
}
#endif


#else // FLEXT_USE_SIMD
//! Without FLEXT_USE_SIMD no capabilities are reported
static unsigned long setsimdcaps() { return 0; }
#endif // FLEXT_USE_SIMD


/*! \brief Copy cnt samples from src to dst
    \param dst destination buffer
    \param src source buffer
    \param cnt number of samples

    With FLEXT_USE_IPP the IPP copy routine matching sizeof(t_sample) is called.
    Otherwise an SSE (MSVC) or vectorOps (PPC) path handles whole blocks and a
    scalar loop copies the remainder; the final fallback is an 8-way unrolled loop.
    The SSE path uses hard-coded 4-byte element offsets (single precision).
*/
void flext::CopySamples(t_sample *dst,const t_sample *src,int cnt)
{
#ifdef FLEXT_USE_IPP
    if(sizeof(t_sample) == 4)
        ippsCopy_32f((const float *)src,(float *)dst,cnt);
    else if(sizeof(t_sample) == 8)
        ippsCopy_64f((const double *)src,(double *)dst,cnt);
    else
        ERRINTERNAL();
#else
#ifdef FLEXT_USE_SIMD
#ifdef _MSC_VER
    if(GetSIMDCapabilities()&simd_sse) {
        // single precision

        int n = cnt>>4;     // number of 16-sample blocks
        if(!n) goto zero;
        cnt -= n<<4;

        __asm {
            mov     eax,dword ptr [src]
            prefetcht0 [eax+0]
            prefetcht0 [eax+32]
        }

        if(IsVectorAligned(src)) {
            if(IsVectorAligned(dst)) {
                // aligned src, aligned dst
                __asm {
                    mov     eax,dword ptr [src]
                    mov     edx,dword ptr [dst]
                    mov     ecx,[n]
loopaa:
                    prefetcht0 [eax+64]
                    prefetcht0 [eax+96]
                    movaps  xmm0,xmmword ptr[eax]
                    movaps  xmmword ptr[edx],xmm0
                    movaps  xmm1,xmmword ptr[eax+4*4]
                    movaps  xmmword ptr[edx+4*4],xmm1
                    movaps  xmm2,xmmword ptr[eax+8*4]
                    movaps  xmmword ptr[edx+8*4],xmm2
                    movaps  xmm3,xmmword ptr[eax+12*4]
                    movaps  xmmword ptr[edx+12*4],xmm3

                    add     eax,16*4
                    add     edx,16*4
                    loop    loopaa
                }
            }
            else {
                // aligned src, unaligned dst
                __asm {
                    mov     eax,dword ptr [src]
                    mov     edx,dword ptr [dst]
                    mov     ecx,[n]
loopau:
                    prefetcht0 [eax+64]
                    prefetcht0 [eax+96]
                    movaps  xmm0,xmmword ptr[eax]
                    movups  xmmword ptr[edx],xmm0
                    movaps  xmm1,xmmword ptr[eax+4*4]
                    movups  xmmword ptr[edx+4*4],xmm1
                    movaps  xmm2,xmmword ptr[eax+8*4]
                    movups  xmmword ptr[edx+8*4],xmm2
                    movaps  xmm3,xmmword ptr[eax+12*4]
                    movups  xmmword ptr[edx+12*4],xmm3

                    add     eax,16*4
                    add     edx,16*4
                    loop    loopau
                }
            }
        }
        else {
            if(IsVectorAligned(dst)) {
                // unaligned src, aligned dst
                __asm {
                    mov     eax,dword ptr [src]
                    mov     edx,dword ptr [dst]
                    mov     ecx,[n]
loopua:
                    prefetcht0 [eax+64]
                    prefetcht0 [eax+96]
                    movups  xmm0,xmmword ptr[eax]
                    movaps  xmmword ptr[edx],xmm0
                    movups  xmm1,xmmword ptr[eax+4*4]
                    movaps  xmmword ptr[edx+4*4],xmm1
                    movups  xmm2,xmmword ptr[eax+8*4]
                    movaps  xmmword ptr[edx+8*4],xmm2
                    movups  xmm3,xmmword ptr[eax+12*4]
                    movaps  xmmword ptr[edx+12*4],xmm3

                    add     eax,16*4
                    add     edx,16*4
                    loop    loopua
                }
            }
            else {
                // unaligned src, unaligned dst
                __asm {
                    mov     eax,dword ptr [src]
                    mov     edx,dword ptr [dst]
                    mov     ecx,[n]
loopuu:
                    prefetcht0 [eax+64]
                    prefetcht0 [eax+96]
                    movups  xmm0,xmmword ptr[eax]
                    movups  xmmword ptr[edx],xmm0
                    movups  xmm1,xmmword ptr[eax+4*4]
                    movups  xmmword ptr[edx+4*4],xmm1
                    movups  xmm2,xmmword ptr[eax+8*4]
                    movups  xmmword ptr[edx+8*4],xmm2
                    movups  xmm3,xmmword ptr[eax+12*4]
                    movups  xmmword ptr[edx+12*4],xmm3

                    add     eax,16*4
                    add     edx,16*4
                    loop    loopuu
                }
            }
        }

        src += n<<4,dst += n<<4;
zero:
        // remaining (cnt mod 16) samples
        while(cnt--) *(dst++) = *(src++);
    }
    else
#elif FLEXT_CPU == FLEXT_CPU_PPC && defined(__VECTOROPS__)
    if(true) {
        int n = cnt>>2,n4 = n<<2;
        vScopy(n4,(vector float *)src,(vector float *)dst);
        cnt -= n4,src += n4,dst += n4;
        while(cnt--) *(dst++) = *(src++);
    }
    else
#endif // _MSC_VER
#endif // FLEXT_USE_SIMD
    {
        // generic fallback: 8 samples per iteration, then the remainder
        int n = cnt>>3;
        cnt -= n<<3;
        while(n--) {
            dst[0] = src[0]; dst[1] = src[1];
            dst[2] = src[2]; dst[3] = src[3];
            dst[4] = src[4]; dst[5] = src[5];
            dst[6] = src[6]; dst[7] = src[7];
            src += 8,dst += 8;
        }
        while(cnt--) *(dst++) = *(src++);
    }
#endif
}

#if defined(FLEXT_USE_SIMD) && FLEXT_CPU == FLEXT_CPU_PPC && defined(__VEC__)
// because of some frame code Altivec stuff should be in separate functions....
static const vector float zero = (vector float)(0); static void SetAltivec(t_sample *dst,int cnt,t_sample s) { vector float svec = LoadValue(s); int n = cnt>>4; cnt -= n<<4; while(n--) { vec_st(svec,0,dst); vec_st(svec,16,dst); vec_st(svec,32,dst); vec_st(svec,48,dst); dst += 16; } while(cnt--) *(dst++) = s; } static void MulAltivec(t_sample *dst,const t_sample *src,t_sample op,int cnt) { const vector float arg = LoadValue(op); int n = cnt>>4; cnt -= n<<4; for(; n--; src += 16,dst += 16) { vector float a1 = vec_ld( 0,src); vector float a2 = vec_ld(16,src); vector float a3 = vec_ld(32,src); vector float a4 = vec_ld(48,src); a1 = vec_madd(a1,arg,zero); a2 = vec_madd(a2,arg,zero); a3 = vec_madd(a3,arg,zero); a4 = vec_madd(a4,arg,zero); vec_st(a1, 0,dst); vec_st(a2,16,dst); vec_st(a3,32,dst); vec_st(a4,48,dst); } while(cnt--) *(dst++) = *(src++)*op; } static void MulAltivec(t_sample *dst,const t_sample *src,const t_sample *op,int cnt) { int n = cnt>>4; cnt -= n<<4; for(; n--; src += 16,op += 16,dst += 16) { vector float a1 = vec_ld( 0,src),b1 = vec_ld( 0,op); vector float a2 = vec_ld(16,src),b2 = vec_ld(16,op); vector float a3 = vec_ld(32,src),b3 = vec_ld(32,op); vector float a4 = vec_ld(48,src),b4 = vec_ld(48,op); a1 = vec_madd(a1,b1,zero); a2 = vec_madd(a2,b2,zero); a3 = vec_madd(a3,b3,zero); a4 = vec_madd(a4,b4,zero); vec_st(a1, 0,dst); vec_st(a2,16,dst); vec_st(a3,32,dst); vec_st(a4,48,dst); } while(cnt--) *(dst++) = *(src++) * *(op++); } static void AddAltivec(t_sample *dst,const t_sample *src,t_sample op,int cnt) { const vector float arg = LoadValue(op); int n = cnt>>4; cnt -= n<<4; for(; n--; src += 16,dst += 16) { vector float a1 = vec_ld( 0,src); vector float a2 = vec_ld(16,src); vector float a3 = vec_ld(32,src); vector float a4 = vec_ld(48,src); a1 = vec_add(a1,arg); a2 = vec_add(a2,arg); a3 = vec_add(a3,arg); a4 = vec_add(a4,arg); vec_st(a1, 0,dst); vec_st(a2,16,dst); vec_st(a3,32,dst); vec_st(a4,48,dst); } while(cnt--) *(dst++) = *(src++)+op; } static void 
AddAltivec(t_sample *dst,const t_sample *src,const t_sample *op,int cnt) { int n = cnt>>4; cnt -= n<<4; for(; n--; src += 16,op += 16,dst += 16) { vector float a1 = vec_ld( 0,src),b1 = vec_ld( 0,op); vector float a2 = vec_ld(16,src),b2 = vec_ld(16,op); vector float a3 = vec_ld(32,src),b3 = vec_ld(32,op); vector float a4 = vec_ld(48,src),b4 = vec_ld(48,op); a1 = vec_add(a1,b1); a2 = vec_add(a2,b2); a3 = vec_add(a3,b3); a4 = vec_add(a4,b4); vec_st(a1, 0,dst); vec_st(a2,16,dst); vec_st(a3,32,dst); vec_st(a4,48,dst); } while(cnt--) *(dst++) = *(src++) + *(op++); } static void ScaleAltivec(t_sample *dst,const t_sample *src,t_sample opmul,t_sample opadd,int cnt) { const vector float argmul = LoadValue(opmul); const vector float argadd = LoadValue(opadd); int n = cnt>>4; cnt -= n<<4; for(; n--; src += 16,dst += 16) { vec_st(vec_madd(vec_ld( 0,src),argmul,argadd), 0,dst); vec_st(vec_madd(vec_ld(16,src),argmul,argadd),16,dst); vec_st(vec_madd(vec_ld(32,src),argmul,argadd),32,dst); vec_st(vec_madd(vec_ld(48,src),argmul,argadd),48,dst); } while(cnt--) *(dst++) = *(src++)*opmul+opadd; } static void ScaleAltivec(t_sample *dst,const t_sample *src,t_sample opmul,const t_sample *add,int cnt) { const vector float argmul = LoadValue(opmul); int n = cnt>>4; cnt -= n<<4; for(; n--; src += 16,dst += 16,add += 16) { vec_st(vec_madd(vec_ld( 0,src),argmul,vec_ld( 0,add)), 0,dst); vec_st(vec_madd(vec_ld(16,src),argmul,vec_ld(16,add)),16,dst); vec_st(vec_madd(vec_ld(32,src),argmul,vec_ld(32,add)),32,dst); vec_st(vec_madd(vec_ld(48,src),argmul,vec_ld(48,add)),48,dst); } while(cnt--) *(dst++) = *(src++) * opmul + *(add++); } static void ScaleAltivec(t_sample *dst,const t_sample *src,const t_sample *mul,const t_sample *add,int cnt) { int n = cnt>>4; cnt -= n<<4; for(; n--; src += 16,dst += 16,mul += 16,add += 16) { vec_st(vec_madd(vec_ld( 0,src),vec_ld( 0,mul),vec_ld( 0,add)), 0,dst); vec_st(vec_madd(vec_ld(16,src),vec_ld(16,mul),vec_ld(16,add)),16,dst); 
vec_st(vec_madd(vec_ld(32,src),vec_ld(32,mul),vec_ld(32,add)),32,dst); vec_st(vec_madd(vec_ld(48,src),vec_ld(48,mul),vec_ld(48,add)),48,dst); } while(cnt--) *(dst++) = *(src++) * *(mul++) + *(add++); } #endif void flext::SetSamples(t_sample *dst,int cnt,t_sample s) { #ifdef FLEXT_USE_IPP if(sizeof(t_sample) == 4) ippsSet_32f((float)s,(float *)dst,cnt); else if(sizeof(t_sample) == 8) ippsSet_64f((double)s,(double *)dst,cnt); else ERRINTERNAL(); #else #ifdef FLEXT_USE_SIMD #ifdef _MSC_VER if(GetSIMDCapabilities()&simd_sse) { // single precision int n = cnt>>4; if(!n) goto zero; cnt -= n<<4; __asm { movss xmm0,xmmword ptr [s] shufps xmm0,xmm0,0 } if(IsVectorAligned(dst)) { // aligned version __asm { mov ecx,[n] mov edx,dword ptr [dst] loopa: movaps xmmword ptr[edx],xmm0 movaps xmmword ptr[edx+4*4],xmm0 movaps xmmword ptr[edx+8*4],xmm0 movaps xmmword ptr[edx+12*4],xmm0 add edx,16*4 loop loopa } } else { // unaligned version __asm { mov ecx,[n] mov edx,dword ptr [dst] loopu: movups xmmword ptr[edx],xmm0 movups xmmword ptr[edx+4*4],xmm0 movups xmmword ptr[edx+8*4],xmm0 movups xmmword ptr[edx+12*4],xmm0 add edx,16*4 loop loopu } } dst += n<<4; zero: while(cnt--) *(dst++) = s; } else #elif FLEXT_CPU == FLEXT_CPU_PPC && defined(__VEC__) if(GetSIMDCapabilities()&simd_altivec && IsVectorAligned(dst)) SetAltivec(dst,cnt,s); else #endif #endif // FLEXT_USE_SIMD { int n = cnt>>3; cnt -= n<<3; while(n--) { dst[0] = dst[1] = dst[2] = dst[3] = dst[4] = dst[5] = dst[6] = dst[7] = s; dst += 8; } while(cnt--) *(dst++) = s; } #endif } void flext::MulSamples(t_sample *dst,const t_sample *src,t_sample op,int cnt) { #ifdef FLEXT_USE_IPP if(sizeof(t_sample) == 4) { ippsMulC_32f((const float *)src,(float)op,(float *)dst,cnt); } else if(sizeof(t_sample) == 8) { ippsMulC_64f((const double *)src,(double)op,(double *)dst,cnt); } else ERRINTERNAL(); #else #ifdef FLEXT_USE_SIMD #ifdef _MSC_VER if(GetSIMDCapabilities()&simd_sse) { // single precision __m128 a = _mm_load1_ps(&op); int n = cnt>>4; 
if(!n) goto zero; cnt -= n<<4; __asm { mov eax,dword ptr [src] prefetcht0 [eax+0] prefetcht0 [eax+32] movss xmm0,xmmword ptr [op] shufps xmm0,xmm0,0 } if(VectorsAligned(src,dst)) { // aligned version __asm { mov ecx,[n] mov eax,dword ptr [src] mov edx,dword ptr [dst] loopa: prefetcht0 [eax+64] prefetcht0 [eax+96] movaps xmm1,xmmword ptr[eax] mulps xmm1,xmm0 movaps xmmword ptr[edx],xmm1 movaps xmm2,xmmword ptr[eax+4*4] mulps xmm2,xmm0 movaps xmmword ptr[edx+4*4],xmm2 movaps xmm3,xmmword ptr[eax+8*4] mulps xmm3,xmm0 movaps xmmword ptr[edx+8*4],xmm3 movaps xmm4,xmmword ptr[eax+12*4] mulps xmm4,xmm0 movaps xmmword ptr[edx+12*4],xmm4 add eax,16*4 add edx,16*4 loop loopa } } else { // unaligned version __asm { mov ecx,[n] mov eax,dword ptr [src] mov edx,dword ptr [dst] loopu: prefetcht0 [eax+64] prefetcht0 [eax+96] movups xmm1,xmmword ptr[eax] mulps xmm1,xmm0 movups xmmword ptr[edx],xmm1 movups xmm2,xmmword ptr[eax+4*4] mulps xmm2,xmm0 movups xmmword ptr[edx+4*4],xmm2 movups xmm3,xmmword ptr[eax+8*4] mulps xmm3,xmm0 movups xmmword ptr[edx+8*4],xmm3 movups xmm4,xmmword ptr[eax+12*4] mulps xmm4,xmm0 movups xmmword ptr[edx+12*4],xmm4 add eax,16*4 add edx,16*4 loop loopu } } src += n<<4,dst += n<<4; zero: while(cnt--) *(dst++) = *(src++)*op; } else #elif FLEXT_CPU == FLEXT_CPU_PPC && defined(__VDSP__) if(true) { vsmul(src,1,&op,dst,1,cnt); } else #elif FLEXT_CPU == FLEXT_CPU_PPC && defined(__VEC__) if(GetSIMDCapabilities()&simd_altivec && VectorsAligned(src,dst)) MulAltivec(dst,src,op,cnt); else #endif // _MSC_VER #endif // FLEXT_USE_SIMD { int n = cnt>>3; cnt -= n<<3; if(src == dst) { while(n--) { dst[0] *= op; dst[1] *= op; dst[2] *= op; dst[3] *= op; dst[4] *= op; dst[5] *= op; dst[6] *= op; dst[7] *= op; dst += 8; } while(cnt--) *(dst++) *= op; } else { while(n--) { dst[0] = src[0]*op; dst[1] = src[1]*op; dst[2] = src[2]*op; dst[3] = src[3]*op; dst[4] = src[4]*op; dst[5] = src[5]*op; dst[6] = src[6]*op; dst[7] = src[7]*op; src += 8,dst += 8; } while(cnt--) *(dst++) = 
*(src++)*op; } } #endif } void flext::MulSamples(t_sample *dst,const t_sample *src,const t_sample *op,int cnt) { #ifdef FLEXT_USE_IPP if(sizeof(t_sample) == 4) { ippsMul_32f((const float *)src,(const float *)op,(float *)dst,cnt); } else if(sizeof(t_sample) == 8) { ippsMul_32f((const double *)src,(const double *)op,(double *)dst,cnt); } else ERRINTERNAL(); #else #ifdef FLEXT_USE_SIMD #ifdef _MSC_VER if(GetSIMDCapabilities()&simd_sse) { // single precision int n = cnt>>4; if(!n) goto zero; cnt -= n<<4; __asm { mov eax,[src] mov ebx,[op] prefetcht0 [eax+0] prefetcht0 [ebx+0] prefetcht0 [eax+32] prefetcht0 [ebx+32] } if(VectorsAligned(src,dst)) { if(IsVectorAligned(op)) { __asm { mov ecx,[n] mov eax,dword ptr [src] mov edx,dword ptr [dst] mov ebx,dword ptr [op] loopaa: prefetcht0 [eax+64] prefetcht0 [ebx+64] prefetcht0 [eax+96] prefetcht0 [ebx+96] movaps xmm0,xmmword ptr[eax] movaps xmm1,xmmword ptr[ebx] mulps xmm0,xmm1 movaps xmmword ptr[edx],xmm0 movaps xmm2,xmmword ptr[eax+4*4] movaps xmm3,xmmword ptr[ebx+4*4] mulps xmm2,xmm3 movaps xmmword ptr[edx+4*4],xmm2 movaps xmm4,xmmword ptr[eax+8*4] movaps xmm5,xmmword ptr[ebx+8*4] mulps xmm4,xmm5 movaps xmmword ptr[edx+8*4],xmm4 movaps xmm6,xmmword ptr[eax+12*4] movaps xmm7,xmmword ptr[ebx+12*4] mulps xmm6,xmm7 movaps xmmword ptr[edx+12*4],xmm6 add eax,16*4 add ebx,16*4 add edx,16*4 loop loopaa } } else { __asm { mov ecx,[n] mov eax,dword ptr [src] mov edx,dword ptr [dst] mov ebx,dword ptr [op] loopau: prefetcht0 [eax+64] prefetcht0 [ebx+64] prefetcht0 [eax+96] prefetcht0 [ebx+96] movaps xmm0,xmmword ptr[eax] movups xmm1,xmmword ptr[ebx] mulps xmm0,xmm1 movaps xmmword ptr[edx],xmm0 movaps xmm2,xmmword ptr[eax+4*4] movups xmm3,xmmword ptr[ebx+4*4] mulps xmm2,xmm3 movaps xmmword ptr[edx+4*4],xmm2 movaps xmm4,xmmword ptr[eax+8*4] movups xmm5,xmmword ptr[ebx+8*4] mulps xmm4,xmm5 movaps xmmword ptr[edx+8*4],xmm4 movaps xmm6,xmmword ptr[eax+12*4] movups xmm7,xmmword ptr[ebx+12*4] mulps xmm6,xmm7 movaps xmmword ptr[edx+12*4],xmm6 
add eax,16*4 add ebx,16*4 add edx,16*4 loop loopau } } } else { if(IsVectorAligned(op)) { __asm { mov ecx,[n] mov eax,dword ptr [src] mov edx,dword ptr [dst] mov ebx,dword ptr [op] loopua: prefetcht0 [eax+64] prefetcht0 [ebx+64] prefetcht0 [eax+96] prefetcht0 [ebx+96] movups xmm0,xmmword ptr[eax] movaps xmm1,xmmword ptr[ebx] mulps xmm0,xmm1 movups xmmword ptr[edx],xmm0 movups xmm2,xmmword ptr[eax+4*4] movaps xmm3,xmmword ptr[ebx+4*4] mulps xmm2,xmm3 movups xmmword ptr[edx+4*4],xmm2 movups xmm4,xmmword ptr[eax+8*4] movaps xmm5,xmmword ptr[ebx+8*4] mulps xmm4,xmm5 movups xmmword ptr[edx+8*4],xmm4 movups xmm6,xmmword ptr[eax+12*4] movaps xmm7,xmmword ptr[ebx+12*4] mulps xmm6,xmm7 movups xmmword ptr[edx+12*4],xmm6 add eax,16*4 add ebx,16*4 add edx,16*4 loop loopua } } else { __asm { mov ecx,[n] mov eax,dword ptr [src] mov edx,dword ptr [dst] mov ebx,dword ptr [op] loopuu: prefetcht0 [eax+64] prefetcht0 [ebx+64] prefetcht0 [eax+96] prefetcht0 [ebx+96] movups xmm0,xmmword ptr[eax] movups xmm1,xmmword ptr[ebx] mulps xmm0,xmm1 movups xmmword ptr[edx],xmm0 movups xmm2,xmmword ptr[eax+4*4] movups xmm3,xmmword ptr[ebx+4*4] mulps xmm2,xmm3 movups xmmword ptr[edx+4*4],xmm2 movups xmm4,xmmword ptr[eax+8*4] movups xmm5,xmmword ptr[ebx+8*4] mulps xmm4,xmm5 movups xmmword ptr[edx+8*4],xmm4 movups xmm6,xmmword ptr[eax+12*4] movups xmm7,xmmword ptr[ebx+12*4] mulps xmm6,xmm7 movups xmmword ptr[edx+12*4],xmm6 add eax,16*4 add ebx,16*4 add edx,16*4 loop loopuu } } } src += n<<4,dst += n<<4,op += n<<4; zero: while(cnt--) *(dst++) = *(src++) * *(op++); } else #elif FLEXT_CPU == FLEXT_CPU_PPC && defined(__VDSP__) if(true) { vmul(src,1,op,1,dst,1,cnt); } else #elif FLEXT_CPU == FLEXT_CPU_PPC && defined(__VEC__) if(GetSIMDCapabilities()&simd_altivec && VectorsAligned(src,op,dst)) MulAltivec(dst,src,op,cnt); else #endif // _MSC_VER #endif // FLEXT_USE_SIMD { int n = cnt>>3; cnt -= n<<3; if(src == dst) { while(n--) { dst[0] *= op[0]; dst[1] *= op[1]; dst[2] *= op[2]; dst[3] *= op[3]; dst[4] *= 
op[4]; dst[5] *= op[5]; dst[6] *= op[6]; dst[7] *= op[7]; dst += 8,op += 8; } while(cnt--) *(dst++) *= *(op++); } else { while(n--) { dst[0] = src[0]*op[0]; dst[1] = src[1]*op[1]; dst[2] = src[2]*op[2]; dst[3] = src[3]*op[3]; dst[4] = src[4]*op[4]; dst[5] = src[5]*op[5]; dst[6] = src[6]*op[6]; dst[7] = src[7]*op[7]; src += 8,dst += 8,op += 8; } while(cnt--) *(dst++) = *(src++) * *(op++); } } #endif } void flext::AddSamples(t_sample *dst,const t_sample *src,t_sample op,int cnt) { #ifdef FLEXT_USE_IPP if(sizeof(t_sample) == 4) { ippsAddC_32f((const float *)src,(float)op,(float *)dst,cnt); } else if(sizeof(t_sample) == 8) { ippsAddC_64f_I((const double *)src,(double)op,(double *)dst,cnt); } else ERRINTERNAL(); #else #ifdef FLEXT_USE_SIMD #ifdef _MSC_VER if(GetSIMDCapabilities()&simd_sse) { // single precision int n = cnt>>4; if(!n) goto zero; cnt -= n<<4; __asm { mov eax,[src] prefetcht0 [eax+0] prefetcht0 [eax+32] movss xmm0,xmmword ptr [op] shufps xmm0,xmm0,0 } if(VectorsAligned(src,dst)) { // aligned version __asm { mov ecx,[n] mov eax,dword ptr [src] mov edx,dword ptr [dst] loopa: prefetcht0 [eax+64] prefetcht0 [eax+96] movaps xmm1,xmmword ptr[eax] addps xmm1,xmm0 movaps xmmword ptr[edx],xmm1 movaps xmm2,xmmword ptr[eax+4*4] addps xmm2,xmm0 movaps xmmword ptr[edx+4*4],xmm2 movaps xmm3,xmmword ptr[eax+8*4] addps xmm3,xmm0 movaps xmmword ptr[edx+8*4],xmm3 movaps xmm4,xmmword ptr[eax+12*4] addps xmm4,xmm0 movaps xmmword ptr[edx+12*4],xmm4 add eax,16*4 add edx,16*4 loop loopa } } else { // unaligned version __asm { mov ecx,[n] mov eax,dword ptr [src] mov edx,dword ptr [dst] loopu: prefetcht0 [eax+64] prefetcht0 [eax+96] movups xmm1,xmmword ptr[eax] addps xmm1,xmm0 movups xmmword ptr[edx],xmm1 movups xmm2,xmmword ptr[eax+4*4] addps xmm2,xmm0 movups xmmword ptr[edx+4*4],xmm2 movups xmm3,xmmword ptr[eax+8*4] addps xmm3,xmm0 movups xmmword ptr[edx+8*4],xmm3 movups xmm4,xmmword ptr[eax+12*4] addps xmm4,xmm0 movups xmmword ptr[edx+12*4],xmm4 add eax,16*4 add edx,16*4 loop 
loopu } } src += n<<4,dst += n<<4,op += n<<4; zero: while(cnt--) *(dst++) = *(src++)+op; } else #elif FLEXT_CPU == FLEXT_CPU_PPC && defined(__VEC__) if(GetSIMDCapabilities()&simd_altivec && VectorsAligned(src,dst)) AddAltivec(dst,src,op,cnt); else #endif // _MSC_VER #endif // FLEXT_USE_SIMD { int n = cnt>>3; cnt -= n<<3; if(src == dst) { while(n--) { dst[0] += op; dst[1] += op; dst[2] += op; dst[3] += op; dst[4] += op; dst[5] += op; dst[6] += op; dst[7] += op; dst += 8; } while(cnt--) *(dst++) += op; } else { while(n--) { dst[0] = src[0]+op; dst[1] = src[1]+op; dst[2] = src[2]+op; dst[3] = src[3]+op; dst[4] = src[4]+op; dst[5] = src[5]+op; dst[6] = src[6]+op; dst[7] = src[7]+op; src += 8,dst += 8; } while(cnt--) *(dst++) = *(src++)+op; } } #endif } void flext::AddSamples(t_sample *dst,const t_sample *src,const t_sample *op,int cnt) { #ifdef FLEXT_USE_IPP if(sizeof(t_sample) == 4) { ippsAdd_32f((const float *)src,(const float *)op,(float *)dst,cnt); } else if(sizeof(t_sample) == 8) { ippsAdd_64f((const double *)src,(const double *)op,(double *)dst,cnt); } else ERRINTERNAL(); #else #ifdef FLEXT_USE_SIMD #ifdef _MSC_VER if(GetSIMDCapabilities()&simd_sse) { // Prefetch cache __asm { mov eax,dword ptr [src] mov ebx,dword ptr [op] prefetcht0 [eax] prefetcht0 [ebx] prefetcht0 [eax+32] prefetcht0 [ebx+32] } // single precision int n = cnt>>4; if(!n) goto zero; cnt -= n<<4; if(VectorsAligned(src,dst)) { if(IsVectorAligned(op)) { __asm { mov ecx,dword ptr [n] mov eax,dword ptr [src] mov edx,dword ptr [dst] mov ebx,dword ptr [op] loopaa: prefetcht0 [eax+64] prefetcht0 [ebx+64] prefetcht0 [eax+96] prefetcht0 [ebx+96] movaps xmm0,xmmword ptr[eax] movaps xmm1,xmmword ptr[ebx] addps xmm0,xmm1 movaps xmmword ptr[edx],xmm0 movaps xmm2,xmmword ptr[eax+4*4] movaps xmm3,xmmword ptr[ebx+4*4] addps xmm2,xmm3 movaps xmmword ptr[edx+4*4],xmm2 movaps xmm4,xmmword ptr[eax+8*4] movaps xmm5,xmmword ptr[ebx+8*4] addps xmm4,xmm5 movaps xmmword ptr[edx+8*4],xmm4 movaps xmm6,xmmword ptr[eax+12*4] 
movaps xmm7,xmmword ptr[ebx+12*4] addps xmm6,xmm7 movaps xmmword ptr[edx+12*4],xmm6 add eax,16*4 add ebx,16*4 add edx,16*4 loop loopaa } } else { __asm { mov ecx,dword ptr [n] mov eax,dword ptr [src] mov edx,dword ptr [dst] mov ebx,dword ptr [op] loopau: prefetcht0 [eax+64] prefetcht0 [ebx+64] prefetcht0 [eax+96] prefetcht0 [ebx+96] movaps xmm0,xmmword ptr[eax] movups xmm1,xmmword ptr[ebx] addps xmm0,xmm1 movaps xmmword ptr[edx],xmm0 movaps xmm2,xmmword ptr[eax+4*4] movups xmm3,xmmword ptr[ebx+4*4] addps xmm2,xmm3 movaps xmmword ptr[edx+4*4],xmm2 movaps xmm4,xmmword ptr[eax+8*4] movups xmm5,xmmword ptr[ebx+8*4] addps xmm4,xmm5 movaps xmmword ptr[edx+8*4],xmm4 movaps xmm6,xmmword ptr[eax+12*4] movups xmm7,xmmword ptr[ebx+12*4] addps xmm6,xmm7 movaps xmmword ptr[edx+12*4],xmm6 add eax,16*4 add ebx,16*4 add edx,16*4 loop loopau } } } else { if(IsVectorAligned(op)) { __asm { mov ecx,dword ptr [n] mov eax,dword ptr [src] mov edx,dword ptr [dst] mov ebx,dword ptr [op] loopua: prefetcht0 [eax+64] prefetcht0 [ebx+64] prefetcht0 [eax+96] prefetcht0 [ebx+96] movups xmm0,xmmword ptr[eax] movaps xmm1,xmmword ptr[ebx] addps xmm0,xmm1 movups xmmword ptr[edx],xmm0 movups xmm2,xmmword ptr[eax+4*4] movaps xmm3,xmmword ptr[ebx+4*4] addps xmm2,xmm3 movups xmmword ptr[edx+4*4],xmm2 movups xmm4,xmmword ptr[eax+8*4] movaps xmm5,xmmword ptr[ebx+8*4] addps xmm4,xmm5 movups xmmword ptr[edx+8*4],xmm4 movups xmm6,xmmword ptr[eax+12*4] movaps xmm7,xmmword ptr[ebx+12*4] addps xmm6,xmm7 movups xmmword ptr[edx+12*4],xmm6 add eax,16*4 add ebx,16*4 add edx,16*4 loop loopua } } else { __asm { mov ecx,dword ptr [n] mov eax,dword ptr [src] mov edx,dword ptr [dst] mov ebx,dword ptr [op] loopuu: prefetcht0 [eax+64] prefetcht0 [ebx+64] prefetcht0 [eax+96] prefetcht0 [ebx+96] movups xmm0,xmmword ptr[eax] movups xmm1,xmmword ptr[ebx] addps xmm0,xmm1 movups xmmword ptr[edx],xmm0 movups xmm2,xmmword ptr[eax+4*4] movups xmm3,xmmword ptr[ebx+4*4] addps xmm2,xmm3 movups xmmword ptr[edx+4*4],xmm2 movups 
xmm4,xmmword ptr[eax+8*4] movups xmm5,xmmword ptr[ebx+8*4] addps xmm4,xmm5 movups xmmword ptr[edx+8*4],xmm4 movups xmm6,xmmword ptr[eax+12*4] movups xmm7,xmmword ptr[ebx+12*4] addps xmm6,xmm7 movups xmmword ptr[edx+12*4],xmm6 add eax,16*4 add ebx,16*4 add edx,16*4 loop loopuu } } } src += n<<4,dst += n<<4,op += n<<4; zero: while(cnt--) *(dst++) = *(src++) + *(op++); } else #elif FLEXT_CPU == FLEXT_CPU_PPC && defined(__VDSP__) if(true) { vadd(src,1,op,1,dst,1,cnt); } else #elif FLEXT_CPU == FLEXT_CPU_PPC && defined(__VEC__) if(GetSIMDCapabilities()&simd_altivec && VectorsAligned(src,op,dst)) AddAltivec(dst,src,op,cnt); else #endif // _MSC_VER #endif // FLEXT_USE_SIMD { int n = cnt>>3; cnt -= n<<3; if(dst == src) { while(n--) { dst[0] += op[0]; dst[1] += op[1]; dst[2] += op[2]; dst[3] += op[3]; dst[4] += op[4]; dst[5] += op[5]; dst[6] += op[6]; dst[7] += op[7]; dst += 8,op += 8; } while(cnt--) *(dst++) += *(op++); } else { while(n--) { dst[0] = src[0]+op[0]; dst[1] = src[1]+op[1]; dst[2] = src[2]+op[2]; dst[3] = src[3]+op[3]; dst[4] = src[4]+op[4]; dst[5] = src[5]+op[5]; dst[6] = src[6]+op[6]; dst[7] = src[7]+op[7]; src += 8,dst += 8,op += 8; } while(cnt--) *(dst++) = *(src++) + *(op++); } } #endif } void flext::ScaleSamples(t_sample *dst,const t_sample *src,t_sample opmul,t_sample opadd,int cnt) { #ifdef FLEXT_USE_IPP if(sizeof(t_sample) == 4) { ippsMulC_32f((const float *)src,(float)opmul,(float *)dst,cnt); ippsAddC_32f_I((float)opadd,(float *)dst,cnt); } else if(sizeof(t_sample) == 8) { ippsMulC_64f((const double *)src,(double)opmul,(double *)dst,cnt); ippsAddC_64f_I((double)opadd,(double *)dst,cnt); } else ERRINTERNAL(); #else #ifdef FLEXT_USE_SIMD #ifdef _MSC_VER if(GetSIMDCapabilities()&simd_sse) { // single precision int n = cnt>>4; if(!n) goto zero; cnt -= n<<4; __asm { mov eax,dword ptr [src] prefetcht0 [eax+0] prefetcht0 [eax+32] movss xmm0,xmmword ptr [opadd] shufps xmm0,xmm0,0 movss xmm1,xmmword ptr [opmul] shufps xmm1,xmm1,0 } if(VectorsAligned(src,dst)) 
{ // aligned version __asm { mov ecx,dword ptr [n] mov eax,dword ptr [src] mov edx,dword ptr [dst] loopa: prefetcht0 [eax+64] prefetcht0 [eax+96] movaps xmm2,xmmword ptr[eax] mulps xmm2,xmm1 addps xmm2,xmm0 movaps xmmword ptr[edx],xmm2 movaps xmm3,xmmword ptr[eax+4*4] mulps xmm3,xmm1 addps xmm3,xmm0 movaps xmmword ptr[edx+4*4],xmm3 movaps xmm4,xmmword ptr[eax+8*4] mulps xmm4,xmm1 addps xmm4,xmm0 movaps xmmword ptr[edx+8*4],xmm4 movaps xmm5,xmmword ptr[eax+12*4] mulps xmm5,xmm1 addps xmm5,xmm0 movaps xmmword ptr[edx+12*4],xmm5 add eax,16*4 add edx,16*4 loop loopa } } else { // unaligned version __asm { mov ecx,dword ptr [n] mov eax,dword ptr [src] mov edx,dword ptr [dst] loopu: prefetcht0 [eax+64] prefetcht0 [eax+96] movups xmm2,xmmword ptr[eax] mulps xmm2,xmm1 addps xmm2,xmm0 movups xmmword ptr[edx],xmm2 movups xmm3,xmmword ptr[eax+4*4] mulps xmm3,xmm1 addps xmm3,xmm0 movups xmmword ptr[edx+4*4],xmm3 movups xmm4,xmmword ptr[eax+8*4] mulps xmm4,xmm1 addps xmm4,xmm0 movups xmmword ptr[edx+8*4],xmm4 movups xmm5,xmmword ptr[eax+12*4] mulps xmm5,xmm1 addps xmm5,xmm0 movups xmmword ptr[edx+12*4],xmm5 add eax,16*4 add edx,16*4 loop loopu } } src += n<<4,dst += n<<4; zero: while(cnt--) *(dst++) = *(src++)*opmul+opadd; } else #elif FLEXT_CPU == FLEXT_CPU_PPC && defined(__VEC__) if(GetSIMDCapabilities()&simd_altivec && VectorsAligned(src,dst)) ScaleAltivec(dst,src,opmul,opadd,cnt); else #endif // _MSC_VER #endif // FLEXT_USE_SIMD { int n = cnt>>3; cnt -= n<<3; while(n--) { dst[0] = src[0]*opmul+opadd; dst[1] = src[1]*opmul+opadd; dst[2] = src[2]*opmul+opadd; dst[3] = src[3]*opmul+opadd; dst[4] = src[4]*opmul+opadd; dst[5] = src[5]*opmul+opadd; dst[6] = src[6]*opmul+opadd; dst[7] = src[7]*opmul+opadd; src += 8,dst += 8; } while(cnt--) *(dst++) = *(src++)*opmul+opadd; } #endif } void flext::ScaleSamples(t_sample *dst,const t_sample *src,t_sample opmul,const t_sample *opadd,int cnt) { #ifdef FLEXT_USE_IPP if(sizeof(t_sample) == 4) { ippsMulC_32f((const float 
*)src,(float)opmul,(float *)dst,cnt); ippsAdd_32f_I((float *)opadd,(float *)dst,cnt); } else if(sizeof(t_sample) == 8) { ippsMulC_64f((const double *)src,(double)opmul,(double *)dst,cnt); ippsAdd_64f_I((double *)opadd,(double *)dst,cnt); } else ERRINTERNAL(); #else #ifdef FLEXT_USE_SIMD #ifdef _MSC_VER if(GetSIMDCapabilities()&simd_sse) { // single precision int n = cnt>>4; if(!n) goto zero; cnt -= n<<4; __asm { mov eax,dword ptr [src] prefetcht0 [eax+0] prefetcht0 [eax+32] movss xmm0,xmmword ptr [opmul] shufps xmm0,xmm0,0 } if(VectorsAligned(src,dst,opadd)) { // aligned version __asm { mov ecx,dword ptr [n] mov eax,dword ptr [src] mov edx,dword ptr [dst] mov ebx,dword ptr [opadd] loopa: prefetcht0 [eax+64] prefetcht0 [ebx+64] prefetcht0 [eax+96] prefetcht0 [ebx+96] movaps xmm2,xmmword ptr[eax] movaps xmm1,xmmword ptr[ebx] mulps xmm2,xmm0 addps xmm2,xmm1 movaps xmmword ptr[edx],xmm2 movaps xmm3,xmmword ptr[eax+4*4] movaps xmm1,xmmword ptr[ebx+4*4] mulps xmm3,xmm0 addps xmm3,xmm1 movaps xmmword ptr[edx+4*4],xmm3 movaps xmm4,xmmword ptr[eax+8*4] movaps xmm1,xmmword ptr[ebx+8*4] mulps xmm4,xmm0 addps xmm4,xmm1 movaps xmmword ptr[edx+8*4],xmm4 movaps xmm5,xmmword ptr[eax+12*4] movaps xmm1,xmmword ptr[ebx+12*4] mulps xmm5,xmm0 addps xmm5,xmm1 movaps xmmword ptr[edx+12*4],xmm5 add eax,16*4 add edx,16*4 add ebx,16*4 loop loopa } } else { // unaligned version __asm { mov ecx,dword ptr [n] mov eax,dword ptr [src] mov edx,dword ptr [dst] mov ebx,dword ptr [opadd] loopu: prefetcht0 [eax+64] prefetcht0 [ebx+64] prefetcht0 [eax+96] prefetcht0 [ebx+96] movups xmm2,xmmword ptr[eax] movups xmm1,xmmword ptr[ebx] mulps xmm2,xmm0 addps xmm2,xmm1 movups xmmword ptr[edx],xmm2 movups xmm3,xmmword ptr[eax+4*4] movups xmm1,xmmword ptr[ebx+4*4] mulps xmm3,xmm0 addps xmm3,xmm1 movups xmmword ptr[edx+4*4],xmm3 movups xmm4,xmmword ptr[eax+8*4] movups xmm1,xmmword ptr[ebx+8*4] mulps xmm4,xmm0 addps xmm4,xmm1 movups xmmword ptr[edx+8*4],xmm4 movups xmm5,xmmword ptr[eax+12*4] movups xmm1,xmmword 
ptr[ebx+12*4] mulps xmm5,xmm0 addps xmm5,xmm1 movups xmmword ptr[edx+12*4],xmm5 add eax,16*4 add edx,16*4 add ebx,16*4 loop loopu } } src += n<<4,dst += n<<4,opadd += n<<4; zero: while(cnt--) *(dst++) = *(src++) * opmul + *(opadd++); } else #elif FLEXT_CPU == FLEXT_CPU_PPC && defined(__VEC__) if(GetSIMDCapabilities()&simd_altivec && VectorsAligned(src,dst,opadd)) ScaleAltivec(dst,src,opmul,opadd,cnt); else #endif // _MSC_VER #endif // FLEXT_USE_SIMD { int n = cnt>>3; cnt -= n<<3; if(dst == opadd) { while(n--) { dst[0] += src[0]*opmul; dst[1] += src[1]*opmul; dst[2] += src[2]*opmul; dst[3] += src[3]*opmul; dst[4] += src[4]*opmul; dst[5] += src[5]*opmul; dst[6] += src[6]*opmul; dst[7] += src[7]*opmul; src += 8,dst += 8; } while(cnt--) *(dst++) += *(src++)*opmul; } else { while(n--) { dst[0] = src[0]*opmul+opadd[0]; dst[1] = src[1]*opmul+opadd[1]; dst[2] = src[2]*opmul+opadd[2]; dst[3] = src[3]*opmul+opadd[3]; dst[4] = src[4]*opmul+opadd[4]; dst[5] = src[5]*opmul+opadd[5]; dst[6] = src[6]*opmul+opadd[6]; dst[7] = src[7]*opmul+opadd[7]; src += 8,dst += 8,opadd += 8; } while(cnt--) *(dst++) = *(src++)*opmul+*(opadd++); } } #endif } void flext::ScaleSamples(t_sample *dst,const t_sample *src,const t_sample *opmul,const t_sample *opadd,int cnt) { #ifdef FLEXT_USE_IPP if(sizeof(t_sample) == 4) { ippsMul_32f((const float *)src,(const float *)opmul,(float *)dst,cnt); ippsAdd_32f_I((const float *)opadd,(float *)dst,cnt); } else if(sizeof(t_sample) == 8) { ippsMul_64f((const double *)src,(const double *)opmul,(double *)dst,cnt); ippsAdd_64f_I((const double *)opadd,(double *)dst,cnt); } else ERRINTERNAL(); #else #ifdef FLEXT_USE_SIMD #ifdef _MSC_VER if(GetSIMDCapabilities()&simd_sse) { // single precision int n = cnt>>4; if(!n) goto zero; cnt -= n<<4; __asm { mov eax,dword ptr [src] prefetcht0 [eax+0] prefetcht0 [eax+32] } if(VectorsAligned(src,dst,opmul,opadd)) { // aligned version __asm { mov ecx,dword ptr [n] mov eax,dword ptr [src] mov edx,dword ptr [dst] mov esi,dword ptr 
[opmul] mov ebx,dword ptr [opadd] loopa: prefetcht0 [eax+64] prefetcht0 [ebx+64] prefetcht0 [esi+64] prefetcht0 [eax+96] prefetcht0 [ebx+96] prefetcht0 [esi+96] movaps xmm2,xmmword ptr[eax] movaps xmm0,xmmword ptr[esi] movaps xmm1,xmmword ptr[ebx] mulps xmm2,xmm0 addps xmm2,xmm1 movaps xmmword ptr[edx],xmm2 movaps xmm3,xmmword ptr[eax+4*4] movaps xmm0,xmmword ptr[esi+4*4] movaps xmm1,xmmword ptr[ebx+4*4] mulps xmm3,xmm0 addps xmm3,xmm1 movaps xmmword ptr[edx+4*4],xmm3 movaps xmm4,xmmword ptr[eax+8*4] movaps xmm0,xmmword ptr[esi+8*4] movaps xmm1,xmmword ptr[ebx+8*4] mulps xmm4,xmm0 addps xmm4,xmm1 movaps xmmword ptr[edx+8*4],xmm4 movaps xmm5,xmmword ptr[eax+12*4] movaps xmm0,xmmword ptr[esi+12*4] movaps xmm1,xmmword ptr[ebx+12*4] mulps xmm5,xmm0 addps xmm5,xmm1 movaps xmmword ptr[edx+12*4],xmm5 add eax,16*4 add edx,16*4 add ebx,16*4 add esi,16*4 loop loopa } } else { // unaligned version __asm { mov ecx,dword ptr [n] mov eax,dword ptr [src] mov edx,dword ptr [dst] mov esi,dword ptr [opmul] mov ebx,dword ptr [opadd] loopu: prefetcht0 [eax+64] prefetcht0 [ebx+64] prefetcht0 [esi+64] prefetcht0 [eax+96] prefetcht0 [ebx+96] prefetcht0 [esi+96] movups xmm2,xmmword ptr[eax] movups xmm0,xmmword ptr[esi] movups xmm1,xmmword ptr[ebx] mulps xmm2,xmm0 addps xmm2,xmm1 movups xmmword ptr[edx],xmm2 movups xmm3,xmmword ptr[eax+4*4] movups xmm0,xmmword ptr[esi+4*4] movups xmm1,xmmword ptr[ebx+4*4] mulps xmm3,xmm0 addps xmm3,xmm1 movups xmmword ptr[edx+4*4],xmm3 movups xmm4,xmmword ptr[eax+8*4] movups xmm0,xmmword ptr[esi+8*4] movups xmm1,xmmword ptr[ebx+8*4] mulps xmm4,xmm0 addps xmm4,xmm1 movups xmmword ptr[edx+8*4],xmm4 movups xmm5,xmmword ptr[eax+12*4] movups xmm0,xmmword ptr[esi+12*4] movups xmm1,xmmword ptr[ebx+12*4] mulps xmm5,xmm0 addps xmm5,xmm1 movups xmmword ptr[edx+12*4],xmm5 add eax,16*4 add edx,16*4 add ebx,16*4 add esi,16*4 loop loopu } } src += n<<4,dst += n<<4,opmul += n<<4,opadd += n<<4; zero: while(cnt--) *(dst++) = *(src++) * *(opmul++) + *(opadd++); } else #elif 
FLEXT_CPU == FLEXT_CPU_PPC && defined(__VEC__) if(GetSIMDCapabilities()&simd_altivec && VectorsAligned(src,dst,opmul,opadd)) ScaleAltivec(dst,src,opmul,opadd,cnt); else #endif // _MSC_VER #endif // FLEXT_USE_SIMD { int n = cnt>>3; cnt -= n<<3; if(dst == opadd) { while(n--) { dst[0] += src[0]*opmul[0]; dst[1] += src[1]*opmul[1]; dst[2] += src[2]*opmul[2]; dst[3] += src[3]*opmul[3]; dst[4] += src[4]*opmul[4]; dst[5] += src[5]*opmul[5]; dst[6] += src[6]*opmul[6]; dst[7] += src[7]*opmul[7]; src += 8,dst += 8,opmul += 8; } while(cnt--) *(dst++) += *(src++) * *(opmul++); } else { while(n--) { dst[0] = src[0]*opmul[0]+opadd[0]; dst[1] = src[1]*opmul[1]+opadd[1]; dst[2] = src[2]*opmul[2]+opadd[2]; dst[3] = src[3]*opmul[3]+opadd[3]; dst[4] = src[4]*opmul[4]+opadd[4]; dst[5] = src[5]*opmul[5]+opadd[5]; dst[6] = src[6]*opmul[6]+opadd[6]; dst[7] = src[7]*opmul[7]+opadd[7]; src += 8,dst += 8,opmul += 8,opadd += 8; } while(cnt--) *(dst++) = *(src++)* *(opmul++) + *(opadd++); } } #endif }