Diffstat (limited to 'externals/grill/flext/source/flsimd.cpp')
-rw-r--r--  externals/grill/flext/source/flsimd.cpp  2002
1 file changed, 0 insertions(+), 2002 deletions(-)
diff --git a/externals/grill/flext/source/flsimd.cpp b/externals/grill/flext/source/flsimd.cpp
deleted file mode 100644
index 8ed13a4a..00000000
--- a/externals/grill/flext/source/flsimd.cpp
+++ /dev/null
@@ -1,2002 +0,0 @@
-/*
-
-flext - C++ layer for Max/MSP and pd (pure data) externals
-
-Copyright (c) 2001-2005 Thomas Grill (gr@grrrr.org)
-For information on usage and redistribution, and for a DISCLAIMER OF ALL
-WARRANTIES, see the file, "license.txt," in this distribution.
-
-*/
-
-/*! \file flsimd.cpp
- \brief flext SIMD support functions
-
- If FLEXT_USE_SIMD is defined at compilation, SIMD instructions are used wherever feasible.
- If used with MSVC++ 6, the "Processor Pack" must be installed.
-
- If FLEXT_USE_IPP is defined, the Intel Integrated Performance Primitives (IPP) library is used.
-*/
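-
-/*
- A minimal usage sketch of the sample buffer helpers defined below, assuming
- dst and src point to valid buffers of at least 1024 samples (the helpers
- pick an SSE, AltiVec, IPP or plain C path at runtime):
-
- flext::CopySamples(dst,src,1024); // dst[i] = src[i]
- flext::ScaleSamples(dst,src,0.5f,0.f,1024); // dst[i] = src[i]*0.5+0
-*/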
-
-#include "flext.h"
-#include <string.h>
-
-#if FLEXT_OS == FLEXT_OS_WIN
-#include <windows.h>
-#endif
-
-#ifdef FLEXT_USE_IPP
-#include <ipps.h>
-#endif
-
-#ifdef FLEXT_USE_SIMD
- #ifdef _MSC_VER
- // include MSVC SIMD header files
- #include <mmintrin.h> // MMX
- #include <xmmintrin.h> // SSE
- #include <emmintrin.h> // SSE2
- #include <mm3dnow.h> // 3DNow!
- #elif FLEXT_CPU == FLEXT_CPU_PPC && defined(__MWERKS__) && defined(__VEC__)
- #if FLEXT_OSAPI == FLEXT_OSAPI_MAC_MACH
- #include <sys/sysctl.h>
- #include <vDSP.h>
- #else
- #include <Gestalt.h>
- #endif
-
- #pragma altivec_model on
-
- #include <altivec.h>
- #include <vectorOps.h>
- #elif FLEXT_CPU == FLEXT_CPU_PPC && defined(__GNUC__) && defined(__VEC__)
- #include <sys/sysctl.h>
- #include <vecLib/vecLib.h>
- #endif
-
-#endif // FLEXT_USE_SIMD
-
-static unsigned long setsimdcaps();
-
-/*! \brief Holds SIMD capability flags
- \internal
-*/
-unsigned long flext::simdcaps = setsimdcaps();
-
-unsigned long flext::GetSIMDCapabilities() { return simdcaps; }
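-
-/*
- A minimal capability check, assuming client code wants to branch on SSE
- support itself; simd_sse is one of the flags set in setsimdcaps() below:
-
- if(flext::GetSIMDCapabilities()&flext::simd_sse) {
- // CPU and OS both support SSE
- }
-*/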
-
-
-#ifdef FLEXT_USE_SIMD
-
-#if FLEXT_CPU == FLEXT_CPU_IA32 || FLEXT_CPU == FLEXT_CPU_X86_64
-
-#define _CPU_FEATURE_MMX 0x0001
-#define _CPU_FEATURE_SSE 0x0002
-#define _CPU_FEATURE_SSE2 0x0004
-#define _CPU_FEATURE_3DNOW 0x0008
-
-typedef struct _processor_info {
- int family; // family of the processor
- // e.g. 6 = Pentium-Pro architecture
- int model; // model of processor
- // e.g. 1 = Pentium-Pro for family = 6
- int stepping; // processor revision number
- int feature; // processor feature
- // (same as return value from _cpuid)
- int os_support; // does OS Support the feature?
- int checks; // mask of checked bits in feature
- // and os_support fields
-} _p_info;
-
-// These are the bit flags that get set on calling cpuid
-// with register eax set to 1
-#define _MMX_FEATURE_BIT 0x00800000
-#define _SSE_FEATURE_BIT 0x02000000
-#define _SSE2_FEATURE_BIT 0x04000000
-
-// This bit is set when cpuid is called with
-// register eax set to 80000001h (only applicable to AMD)
-#define _3DNOW_FEATURE_BIT 0x80000000
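-
-/*
- For reference: MSVC 2005 and later provide a __cpuid intrinsic in <intrin.h>
- that could replace the inline assembly below; a rough sketch, assuming only
- the standard leaf-1 feature bits are needed:
-
- int regs[4]; // eax, ebx, ecx, edx
- __cpuid(regs,1); // standard feature leaf
- bool mmx = (regs[3]&_MMX_FEATURE_BIT) != 0;
- bool sse = (regs[3]&_SSE_FEATURE_BIT) != 0;
-*/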
-
-#ifdef _MSC_VER
-static int IsCPUID()
-{
- __try {
- _asm {
- xor eax, eax
- cpuid
- }
- }
- __except (EXCEPTION_EXECUTE_HANDLER) {
- return 0;
- }
- return 1;
-}
-
-static int _os_support(int feature)
-{
- __try {
- switch (feature) {
- case _CPU_FEATURE_SSE:
- __asm {
- xorps xmm0, xmm0 // executing SSE instruction
- }
- break;
- case _CPU_FEATURE_SSE2:
- __asm {
- xorpd xmm0, xmm0 // executing SSE2 instruction
- }
- break;
- case _CPU_FEATURE_3DNOW:
- __asm {
- pfrcp mm0, mm0 // executing 3DNow! instruction
- emms
- }
- break;
- case _CPU_FEATURE_MMX:
- __asm {
- pxor mm0, mm0 // executing MMX instruction
- emms
- }
- break;
- }
- }
- __except (EXCEPTION_EXECUTE_HANDLER) {
- // an illegal-instruction fault means the feature is unsupported
- return 0;
- }
- return 1;
-}
-
-static int _cpuid (_p_info *pinfo)
-{
- DWORD dwStandard = 0;
- DWORD dwFeature = 0;
- DWORD dwMax = 0;
- DWORD dwExt = 0;
- int feature = 0;
- int os_support = 0;
- union {
- struct {
- DWORD dw0;
- DWORD dw1;
- DWORD dw2;
- } s;
- } Ident;
-
- if (!IsCPUID()) {
- return 0;
- }
-
- _asm {
- push ebx
- push ecx
- push edx
-
- // get the vendor string
- xor eax, eax
- cpuid
- mov dwMax, eax
- mov Ident.s.dw0, ebx
- mov Ident.s.dw1, edx
- mov Ident.s.dw2, ecx
-
- // get the Standard bits
- mov eax, 1
- cpuid
- mov dwStandard, eax
- mov dwFeature, edx
-
- // get AMD-specials
- mov eax, 80000000h
- cpuid
- cmp eax, 80000000h
- jc notamd
- mov eax, 80000001h
- cpuid
- mov dwExt, edx
-
-notamd:
- // restore in reverse order of the pushes above
- pop edx
- pop ecx
- pop ebx
- }
-
- if (dwFeature & _MMX_FEATURE_BIT) {
- feature |= _CPU_FEATURE_MMX;
- if (_os_support(_CPU_FEATURE_MMX))
- os_support |= _CPU_FEATURE_MMX;
- }
- if (dwExt & _3DNOW_FEATURE_BIT) {
- feature |= _CPU_FEATURE_3DNOW;
- if (_os_support(_CPU_FEATURE_3DNOW))
- os_support |= _CPU_FEATURE_3DNOW;
- }
- if (dwFeature & _SSE_FEATURE_BIT) {
- feature |= _CPU_FEATURE_SSE;
- if (_os_support(_CPU_FEATURE_SSE))
- os_support |= _CPU_FEATURE_SSE;
- }
- if (dwFeature & _SSE2_FEATURE_BIT) {
- feature |= _CPU_FEATURE_SSE2;
- if (_os_support(_CPU_FEATURE_SSE2))
- os_support |= _CPU_FEATURE_SSE2;
- }
-
- if (pinfo) {
- memset(pinfo, 0, sizeof(_p_info));
-
- pinfo->os_support = os_support;
- pinfo->feature = feature;
- pinfo->family = (dwStandard >> 8) & 0xF; // retrieve family
- if (pinfo->family == 15) { // retrieve extended family
- pinfo->family |= (dwStandard >> 16) & 0xFF0;
- }
- pinfo->model = (dwStandard >> 4) & 0xF; // retrieve model
- if (pinfo->model == 15) { // retrieve extended model
- pinfo->model |= (dwStandard >> 12) & 0xF0; // extended model is bits 19:16, shifted into place
- }
- pinfo->stepping = (dwStandard) & 0xF; // retrieve stepping
-
- pinfo->checks = _CPU_FEATURE_MMX |
- _CPU_FEATURE_SSE |
- _CPU_FEATURE_SSE2 |
- _CPU_FEATURE_3DNOW;
- }
-
- return feature;
-}
-
-inline bool IsVectorAligned(const void *where)
-{
- return (reinterpret_cast<size_t>(where)&(__alignof(__m128)-1)) == 0;
-}
-
-inline bool VectorsAligned(const void *v1,const void *v2)
-{
- return (
- (reinterpret_cast<size_t>(v1)|reinterpret_cast<size_t>(v2))
- &(__alignof(__m128)-1)
- ) == 0;
-}
-
-inline bool VectorsAligned(const void *v1,const void *v2,const void *v3)
-{
- return (
- (reinterpret_cast<size_t>(v1)|reinterpret_cast<size_t>(v2)|reinterpret_cast<size_t>(v3))
- &(__alignof(__m128)-1)
- ) == 0;
-}
-
-inline bool VectorsAligned(const void *v1,const void *v2,const void *v3,const void *v4)
-{
- return (
- (reinterpret_cast<size_t>(v1)|reinterpret_cast<size_t>(v2)|reinterpret_cast<size_t>(v3)|reinterpret_cast<size_t>(v4))
- &(__alignof(__m128)-1)
- ) == 0;
-}
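-
-/*
- The multi-pointer checks above OR the addresses together before masking with
- __alignof(__m128)-1 == 15: a low bit set in any address survives the OR, so a
- single mask tests all pointers at once. For example, v1 = 0x1000 and
- v2 = 0x1008 give (v1|v2)&15 == 8, hence the pair is rejected as unaligned.
-*/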
-
-#else
-// not MSVC
-static int _cpuid (_p_info *pinfo)
-{
- if(pinfo) memset(pinfo,0,sizeof *pinfo);
- return 0;
-}
-#endif
-
-#endif
-
-
-/*! \brief Determine SIMD capabilities
- \internal
-*/
-static unsigned long setsimdcaps()
-{
- unsigned long simdflags = flext::simd_none;
-#if FLEXT_CPU == FLEXT_CPU_IA32 || FLEXT_CPU == FLEXT_CPU_X86_64
- _p_info cpuinfo;
- int feature = _cpuid(&cpuinfo);
- if(cpuinfo.os_support&_CPU_FEATURE_MMX) simdflags += flext::simd_mmx;
- if(cpuinfo.os_support&_CPU_FEATURE_3DNOW) simdflags += flext::simd_3dnow;
- if(cpuinfo.os_support&_CPU_FEATURE_SSE) simdflags += flext::simd_sse;
- if(cpuinfo.os_support&_CPU_FEATURE_SSE2) simdflags += flext::simd_sse2;
-#elif FLEXT_CPU == FLEXT_CPU_PPC && defined(__VEC__)
- #if FLEXT_OSAPI == FLEXT_OSAPI_MAC_MACH
-
- int selectors[2] = { CTL_HW, HW_VECTORUNIT };
- int hasVectorUnit = 0;
- size_t length = sizeof(hasVectorUnit);
- int error = sysctl(selectors, 2, &hasVectorUnit, &length, NULL, 0);
-
- if(!error && hasVectorUnit != 0) simdflags += flext::simd_altivec;
-
- #else
-
- long cpuAttributes;
- Boolean hasAltiVec = false;
- OSErr err = Gestalt( gestaltPowerPCProcessorFeatures, &cpuAttributes );
-
- if( noErr == err )
- if(( 1 << gestaltPowerPCHasVectorInstructions) & cpuAttributes) simdflags += flext::simd_altivec;
-
- #endif
-#endif
- return simdflags;
-}
-
-
-#if FLEXT_CPU == FLEXT_CPU_PPC && defined(__VEC__)
-
-/* functions for misaligned vector data - taken from the AltiVec tutorial by Ian Ollmann, Ph.D. */
-
-//! Load a vector from an unaligned location in memory
-inline vector unsigned char LoadUnaligned( vector unsigned char *v )
-{
- vector unsigned char permuteVector = vec_lvsl( 0, (int*) v );
- vector unsigned char low = vec_ld( 0, v );
- vector unsigned char high = vec_ld( 15, v );
- return vec_perm( low, high, permuteVector );
-}
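-
-/*
- How the load above works: vec_lvsl builds a permute mask from the low address
- bits of v, the two aligned loads at offsets 0 and 15 fetch the 32-byte window
- covering the unaligned vector, and vec_perm selects the 16 wanted bytes.
-*/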
-
-/*
-//! Store a vector to an unaligned location in memory
-inline void StoreUnaligned( vector unsigned char v, vector unsigned char *where)
-{
- // Load the surrounding area
- vector unsigned char low = vec_ld( 0, where );
- vector unsigned char high = vec_ld( 16, where );
- // Prepare the constants that we need
- vector unsigned char permuteVector = vec_lvsr( 0, (int*) where );
-
- vector unsigned char oxFF = (vector unsigned char)vec_splat_s8( -1 );
- vector unsigned char ox00 = (vector unsigned char)vec_splat_s8( 0 );
- // Make a mask for which parts of the vectors to swap out
- vector unsigned char mask = vec_perm( ox00, oxFF, permuteVector );
- // Right rotate our input data
- v = vec_perm( v, v, permuteVector );
- // Insert our data into the low and high vectors
- low = vec_sel( v, low, mask );
- high = vec_sel( high, v, mask );
- // Store the two aligned result vectors
- vec_st( low, 0, where );
- vec_st( high, 16, where );
-}
-*/
-
-inline vector float LoadUnaligned(const float *v )
-{
- return (vector float)LoadUnaligned((vector unsigned char *)v);
-}
-
-/*
-inline void StoreUnaligned( vector float v,float *where)
-{
- return StoreUnaligned((vector unsigned char)v,(vector unsigned char *)where);
-}
-*/
-
-inline bool IsVectorAligned(const void *where)
-{
- return (reinterpret_cast<size_t>(where)&(sizeof(vector float)-1)) == 0;
-}
-
-inline bool VectorsAligned(const void *v1,const void *v2)
-{
- return (
- (reinterpret_cast<size_t>(v1)|reinterpret_cast<size_t>(v2))
- &(sizeof(vector float)-1)
- ) == 0;
-}
-
-inline bool VectorsAligned(const void *v1,const void *v2,const void *v3)
-{
- return (
- (reinterpret_cast<size_t>(v1)|reinterpret_cast<size_t>(v2)|reinterpret_cast<size_t>(v3))
- &(sizeof(vector float)-1)
- ) == 0;
-}
-
-inline bool VectorsAligned(const void *v1,const void *v2,const void *v3,const void *v4)
-{
- return (
- (reinterpret_cast<size_t>(v1)|reinterpret_cast<size_t>(v2)|reinterpret_cast<size_t>(v3)|reinterpret_cast<size_t>(v4))
- &(sizeof(vector float)-1)
- ) == 0;
-}
-
-inline vector float LoadValue(const float &f)
-{
- return vec_splat(IsVectorAligned(&f)?vec_ld(0,(vector float *)&f):LoadUnaligned(&f),0);
-}
-#endif
-
-
-#else // FLEXT_USE_SIMD
-static unsigned long setsimdcaps() { return 0; }
-#endif // FLEXT_USE_SIMD
-
-
-void flext::CopySamples(t_sample *dst,const t_sample *src,int cnt)
-{
-#ifdef FLEXT_USE_IPP
- if(sizeof(t_sample) == 4)
- ippsCopy_32f((const float *)src,(float *)dst,cnt);
- else if(sizeof(t_sample) == 8)
- ippsCopy_64f((const double *)src,(double *)dst,cnt);
- else
- ERRINTERNAL();
-#else
-#ifdef FLEXT_USE_SIMD
-#ifdef _MSC_VER
- if(GetSIMDCapabilities()&simd_sse) {
- // single precision
-
- int n = cnt>>4;
- if(!n) goto zero;
- cnt -= n<<4;
-
- __asm {
- mov eax,dword ptr [src]
- prefetcht0 [eax+0]
- prefetcht0 [eax+32]
- }
-
- if(IsVectorAligned(src)) {
- if(IsVectorAligned(dst)) {
- // aligned src, aligned dst
- __asm {
- mov eax,dword ptr [src]
- mov edx,dword ptr [dst]
- mov ecx,[n]
-loopaa:
- prefetcht0 [eax+64]
- prefetcht0 [eax+96]
- movaps xmm0,xmmword ptr[eax]
- movaps xmmword ptr[edx],xmm0
- movaps xmm1,xmmword ptr[eax+4*4]
- movaps xmmword ptr[edx+4*4],xmm1
- movaps xmm2,xmmword ptr[eax+8*4]
- movaps xmmword ptr[edx+8*4],xmm2
- movaps xmm3,xmmword ptr[eax+12*4]
- movaps xmmword ptr[edx+12*4],xmm3
-
- add eax,16*4
- add edx,16*4
- loop loopaa
- }
- }
- else {
- // aligned src, unaligned dst
- __asm {
- mov eax,dword ptr [src]
- mov edx,dword ptr [dst]
- mov ecx,[n]
-loopau:
- prefetcht0 [eax+64]
- prefetcht0 [eax+96]
- movaps xmm0,xmmword ptr[eax]
- movups xmmword ptr[edx],xmm0
- movaps xmm1,xmmword ptr[eax+4*4]
- movups xmmword ptr[edx+4*4],xmm1
- movaps xmm2,xmmword ptr[eax+8*4]
- movups xmmword ptr[edx+8*4],xmm2
- movaps xmm3,xmmword ptr[eax+12*4]
- movups xmmword ptr[edx+12*4],xmm3
-
- add eax,16*4
- add edx,16*4
- loop loopau
- }
- }
- }
- else {
- if(IsVectorAligned(dst)) {
- // unaligned src, aligned dst
- __asm {
- mov eax,dword ptr [src]
- mov edx,dword ptr [dst]
- mov ecx,[n]
-loopua:
- prefetcht0 [eax+64]
- prefetcht0 [eax+96]
- movups xmm0,xmmword ptr[eax]
- movaps xmmword ptr[edx],xmm0
- movups xmm1,xmmword ptr[eax+4*4]
- movaps xmmword ptr[edx+4*4],xmm1
- movups xmm2,xmmword ptr[eax+8*4]
- movaps xmmword ptr[edx+8*4],xmm2
- movups xmm3,xmmword ptr[eax+12*4]
- movaps xmmword ptr[edx+12*4],xmm3
-
- add eax,16*4
- add edx,16*4
- loop loopua
- }
- }
- else {
- // unaligned src, unaligned dst
- __asm {
- mov eax,dword ptr [src]
- mov edx,dword ptr [dst]
- mov ecx,[n]
-loopuu:
- prefetcht0 [eax+64]
- prefetcht0 [eax+96]
- movups xmm0,xmmword ptr[eax]
- movups xmmword ptr[edx],xmm0
- movups xmm1,xmmword ptr[eax+4*4]
- movups xmmword ptr[edx+4*4],xmm1
- movups xmm2,xmmword ptr[eax+8*4]
- movups xmmword ptr[edx+8*4],xmm2
- movups xmm3,xmmword ptr[eax+12*4]
- movups xmmword ptr[edx+12*4],xmm3
-
- add eax,16*4
- add edx,16*4
- loop loopuu
- }
- }
- }
-
- src += n<<4,dst += n<<4;
-zero:
- while(cnt--) *(dst++) = *(src++);
- }
- else
-#elif FLEXT_CPU == FLEXT_CPU_PPC && defined(__VECTOROPS__)
- if(true) {
- int n = cnt>>2,n4 = n<<2;
- vScopy(n4,(vector float *)src,(vector float *)dst);
- cnt -= n4,src += n4,dst += n4;
- while(cnt--) *(dst++) = *(src++);
- }
- else
-#endif // _MSC_VER
-#endif // FLEXT_USE_SIMD
- {
- int n = cnt>>3;
- cnt -= n<<3;
- while(n--) {
- dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3];
- dst[4] = src[4]; dst[5] = src[5]; dst[6] = src[6]; dst[7] = src[7];
- src += 8,dst += 8;
- }
- while(cnt--) *(dst++) = *(src++);
- }
-#endif
-}
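-
-/*
- Usage sketch, assuming in and out point to 16-byte aligned buffers of 256
- samples so that the fast SIMD paths are taken:
-
- flext::SetSamples(in,256,1); // in[i] = 1
- flext::CopySamples(out,in,256); // out[i] = in[i]
-*/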
-
-#if defined(FLEXT_USE_SIMD) && FLEXT_CPU == FLEXT_CPU_PPC && defined(__VEC__)
-// because the compiler emits AltiVec stack frame setup code, the AltiVec routines are kept in separate functions
-
-static const vector float zero = (vector float)(0);
-
-static void SetAltivec(t_sample *dst,int cnt,t_sample s)
-{
- vector float svec = LoadValue(s);
- int n = cnt>>4;
- cnt -= n<<4;
-
- while(n--) {
- vec_st(svec,0,dst);
- vec_st(svec,16,dst);
- vec_st(svec,32,dst);
- vec_st(svec,48,dst);
- dst += 16;
- }
-
- while(cnt--) *(dst++) = s;
-}
-
-static void MulAltivec(t_sample *dst,const t_sample *src,t_sample op,int cnt)
-{
- const vector float arg = LoadValue(op);
- int n = cnt>>4;
- cnt -= n<<4;
-
- for(; n--; src += 16,dst += 16) {
- vector float a1 = vec_ld( 0,src);
- vector float a2 = vec_ld(16,src);
- vector float a3 = vec_ld(32,src);
- vector float a4 = vec_ld(48,src);
-
- a1 = vec_madd(a1,arg,zero);
- a2 = vec_madd(a2,arg,zero);
- a3 = vec_madd(a3,arg,zero);
- a4 = vec_madd(a4,arg,zero);
-
- vec_st(a1, 0,dst);
- vec_st(a2,16,dst);
- vec_st(a3,32,dst);
- vec_st(a4,48,dst);
- }
-
- while(cnt--) *(dst++) = *(src++)*op;
-}
-
-static void MulAltivec(t_sample *dst,const t_sample *src,const t_sample *op,int cnt)
-{
- int n = cnt>>4;
- cnt -= n<<4;
-
- for(; n--; src += 16,op += 16,dst += 16) {
- vector float a1 = vec_ld( 0,src),b1 = vec_ld( 0,op);
- vector float a2 = vec_ld(16,src),b2 = vec_ld(16,op);
- vector float a3 = vec_ld(32,src),b3 = vec_ld(32,op);
- vector float a4 = vec_ld(48,src),b4 = vec_ld(48,op);
-
- a1 = vec_madd(a1,b1,zero);
- a2 = vec_madd(a2,b2,zero);
- a3 = vec_madd(a3,b3,zero);
- a4 = vec_madd(a4,b4,zero);
-
- vec_st(a1, 0,dst);
- vec_st(a2,16,dst);
- vec_st(a3,32,dst);
- vec_st(a4,48,dst);
- }
- while(cnt--) *(dst++) = *(src++) * *(op++);
-}
-
-static void AddAltivec(t_sample *dst,const t_sample *src,t_sample op,int cnt)
-{
- const vector float arg = LoadValue(op);
- int n = cnt>>4;
- cnt -= n<<4;
-
- for(; n--; src += 16,dst += 16) {
- vector float a1 = vec_ld( 0,src);
- vector float a2 = vec_ld(16,src);
- vector float a3 = vec_ld(32,src);
- vector float a4 = vec_ld(48,src);
-
- a1 = vec_add(a1,arg);
- a2 = vec_add(a2,arg);
- a3 = vec_add(a3,arg);
- a4 = vec_add(a4,arg);
-
- vec_st(a1, 0,dst);
- vec_st(a2,16,dst);
- vec_st(a3,32,dst);
- vec_st(a4,48,dst);
- }
-
- while(cnt--) *(dst++) = *(src++)+op;
-}
-
-static void AddAltivec(t_sample *dst,const t_sample *src,const t_sample *op,int cnt)
-{
- int n = cnt>>4;
- cnt -= n<<4;
-
- for(; n--; src += 16,op += 16,dst += 16) {
- vector float a1 = vec_ld( 0,src),b1 = vec_ld( 0,op);
- vector float a2 = vec_ld(16,src),b2 = vec_ld(16,op);
- vector float a3 = vec_ld(32,src),b3 = vec_ld(32,op);
- vector float a4 = vec_ld(48,src),b4 = vec_ld(48,op);
-
- a1 = vec_add(a1,b1);
- a2 = vec_add(a2,b2);
- a3 = vec_add(a3,b3);
- a4 = vec_add(a4,b4);
-
- vec_st(a1, 0,dst);
- vec_st(a2,16,dst);
- vec_st(a3,32,dst);
- vec_st(a4,48,dst);
- }
- while(cnt--) *(dst++) = *(src++) + *(op++);
-}
-
-static void ScaleAltivec(t_sample *dst,const t_sample *src,t_sample opmul,t_sample opadd,int cnt)
-{
- const vector float argmul = LoadValue(opmul);
- const vector float argadd = LoadValue(opadd);
- int n = cnt>>4;
- cnt -= n<<4;
-
- for(; n--; src += 16,dst += 16) {
- vec_st(vec_madd(vec_ld( 0,src),argmul,argadd), 0,dst);
- vec_st(vec_madd(vec_ld(16,src),argmul,argadd),16,dst);
- vec_st(vec_madd(vec_ld(32,src),argmul,argadd),32,dst);
- vec_st(vec_madd(vec_ld(48,src),argmul,argadd),48,dst);
- }
-
- while(cnt--) *(dst++) = *(src++)*opmul+opadd;
-}
-
-static void ScaleAltivec(t_sample *dst,const t_sample *src,t_sample opmul,const t_sample *add,int cnt)
-{
- const vector float argmul = LoadValue(opmul);
- int n = cnt>>4;
- cnt -= n<<4;
-
- for(; n--; src += 16,dst += 16,add += 16) {
- vec_st(vec_madd(vec_ld( 0,src),argmul,vec_ld( 0,add)), 0,dst);
- vec_st(vec_madd(vec_ld(16,src),argmul,vec_ld(16,add)),16,dst);
- vec_st(vec_madd(vec_ld(32,src),argmul,vec_ld(32,add)),32,dst);
- vec_st(vec_madd(vec_ld(48,src),argmul,vec_ld(48,add)),48,dst);
- }
-
- while(cnt--) *(dst++) = *(src++) * opmul + *(add++);
-}
-
-static void ScaleAltivec(t_sample *dst,const t_sample *src,const t_sample *mul,const t_sample *add,int cnt)
-{
- int n = cnt>>4;
- cnt -= n<<4;
-
- for(; n--; src += 16,dst += 16,mul += 16,add += 16) {
- vec_st(vec_madd(vec_ld( 0,src),vec_ld( 0,mul),vec_ld( 0,add)), 0,dst);
- vec_st(vec_madd(vec_ld(16,src),vec_ld(16,mul),vec_ld(16,add)),16,dst);
- vec_st(vec_madd(vec_ld(32,src),vec_ld(32,mul),vec_ld(32,add)),32,dst);
- vec_st(vec_madd(vec_ld(48,src),vec_ld(48,mul),vec_ld(48,add)),48,dst);
- }
-
- while(cnt--) *(dst++) = *(src++) * *(mul++) + *(add++);
-}
-#endif
-
-void flext::SetSamples(t_sample *dst,int cnt,t_sample s)
-{
-#ifdef FLEXT_USE_IPP
- if(sizeof(t_sample) == 4)
- ippsSet_32f((float)s,(float *)dst,cnt);
- else if(sizeof(t_sample) == 8)
- ippsSet_64f((double)s,(double *)dst,cnt);
- else
- ERRINTERNAL();
-#else
-#ifdef FLEXT_USE_SIMD
-#ifdef _MSC_VER
- if(GetSIMDCapabilities()&simd_sse) {
- // single precision
-
- int n = cnt>>4;
- if(!n) goto zero;
- cnt -= n<<4;
-
- __asm {
- movss xmm0,xmmword ptr [s]
- shufps xmm0,xmm0,0
- }
-
- if(IsVectorAligned(dst)) {
- // aligned version
- __asm {
- mov ecx,[n]
- mov edx,dword ptr [dst]
-loopa:
- movaps xmmword ptr[edx],xmm0
- movaps xmmword ptr[edx+4*4],xmm0
- movaps xmmword ptr[edx+8*4],xmm0
- movaps xmmword ptr[edx+12*4],xmm0
-
- add edx,16*4
- loop loopa
- }
- }
- else {
- // unaligned version
- __asm {
- mov ecx,[n]
- mov edx,dword ptr [dst]
-loopu:
- movups xmmword ptr[edx],xmm0
- movups xmmword ptr[edx+4*4],xmm0
- movups xmmword ptr[edx+8*4],xmm0
- movups xmmword ptr[edx+12*4],xmm0
-
- add edx,16*4
- loop loopu
- }
- }
-
- dst += n<<4;
-zero:
- while(cnt--) *(dst++) = s;
- }
- else
-#elif FLEXT_CPU == FLEXT_CPU_PPC && defined(__VEC__)
- if(GetSIMDCapabilities()&simd_altivec && IsVectorAligned(dst))
- SetAltivec(dst,cnt,s);
- else
-#endif
-#endif // FLEXT_USE_SIMD
- {
- int n = cnt>>3;
- cnt -= n<<3;
- while(n--) {
- dst[0] = dst[1] = dst[2] = dst[3] = dst[4] = dst[5] = dst[6] = dst[7] = s;
- dst += 8;
- }
-
- while(cnt--) *(dst++) = s;
- }
-#endif
-}
-
-
-void flext::MulSamples(t_sample *dst,const t_sample *src,t_sample op,int cnt)
-{
-#ifdef FLEXT_USE_IPP
- if(sizeof(t_sample) == 4) {
- ippsMulC_32f((const float *)src,(float)op,(float *)dst,cnt);
- }
- else if(sizeof(t_sample) == 8) {
- ippsMulC_64f((const double *)src,(double)op,(double *)dst,cnt);
- }
- else
- ERRINTERNAL();
-#else
-#ifdef FLEXT_USE_SIMD
-#ifdef _MSC_VER
- if(GetSIMDCapabilities()&simd_sse) {
- // single precision
- // op is broadcast to all four SSE lanes by the movss/shufps below
-
- int n = cnt>>4;
- if(!n) goto zero;
- cnt -= n<<4;
-
- __asm {
- mov eax,dword ptr [src]
- prefetcht0 [eax+0]
- prefetcht0 [eax+32]
-
- movss xmm0,xmmword ptr [op]
- shufps xmm0,xmm0,0
- }
-
- if(VectorsAligned(src,dst)) {
- // aligned version
- __asm {
- mov ecx,[n]
- mov eax,dword ptr [src]
- mov edx,dword ptr [dst]
-loopa:
- prefetcht0 [eax+64]
- prefetcht0 [eax+96]
-
- movaps xmm1,xmmword ptr[eax]
- mulps xmm1,xmm0
- movaps xmmword ptr[edx],xmm1
-
- movaps xmm2,xmmword ptr[eax+4*4]
- mulps xmm2,xmm0
- movaps xmmword ptr[edx+4*4],xmm2
-
- movaps xmm3,xmmword ptr[eax+8*4]
- mulps xmm3,xmm0
- movaps xmmword ptr[edx+8*4],xmm3
-
- movaps xmm4,xmmword ptr[eax+12*4]
- mulps xmm4,xmm0
- movaps xmmword ptr[edx+12*4],xmm4
-
- add eax,16*4
- add edx,16*4
- loop loopa
- }
- }
- else {
- // unaligned version
- __asm {
- mov ecx,[n]
- mov eax,dword ptr [src]
- mov edx,dword ptr [dst]
-loopu:
- prefetcht0 [eax+64]
- prefetcht0 [eax+96]
-
- movups xmm1,xmmword ptr[eax]
- mulps xmm1,xmm0
- movups xmmword ptr[edx],xmm1
-
- movups xmm2,xmmword ptr[eax+4*4]
- mulps xmm2,xmm0
- movups xmmword ptr[edx+4*4],xmm2
-
- movups xmm3,xmmword ptr[eax+8*4]
- mulps xmm3,xmm0
- movups xmmword ptr[edx+8*4],xmm3
-
- movups xmm4,xmmword ptr[eax+12*4]
- mulps xmm4,xmm0
- movups xmmword ptr[edx+12*4],xmm4
-
- add eax,16*4
- add edx,16*4
- loop loopu
- }
- }
-
- src += n<<4,dst += n<<4;
-zero:
- while(cnt--) *(dst++) = *(src++)*op;
- }
- else
-#elif FLEXT_CPU == FLEXT_CPU_PPC && defined(__VDSP__)
- if(true) {
- vsmul(src,1,&op,dst,1,cnt);
- }
- else
-#elif FLEXT_CPU == FLEXT_CPU_PPC && defined(__VEC__)
- if(GetSIMDCapabilities()&simd_altivec && VectorsAligned(src,dst))
- MulAltivec(dst,src,op,cnt);
- else
-#endif // _MSC_VER
-#endif // FLEXT_USE_SIMD
- {
- int n = cnt>>3;
- cnt -= n<<3;
-
- if(src == dst) {
- while(n--) {
- dst[0] *= op; dst[1] *= op; dst[2] *= op; dst[3] *= op;
- dst[4] *= op; dst[5] *= op; dst[6] *= op; dst[7] *= op;
- dst += 8;
- }
- while(cnt--) *(dst++) *= op;
- }
- else {
- while(n--) {
- dst[0] = src[0]*op; dst[1] = src[1]*op;
- dst[2] = src[2]*op; dst[3] = src[3]*op;
- dst[4] = src[4]*op; dst[5] = src[5]*op;
- dst[6] = src[6]*op; dst[7] = src[7]*op;
- src += 8,dst += 8;
- }
- while(cnt--) *(dst++) = *(src++)*op;
- }
- }
-#endif
-}
-
-
-void flext::MulSamples(t_sample *dst,const t_sample *src,const t_sample *op,int cnt)
-{
-#ifdef FLEXT_USE_IPP
- if(sizeof(t_sample) == 4) {
- ippsMul_32f((const float *)src,(const float *)op,(float *)dst,cnt);
- }
- else if(sizeof(t_sample) == 8) {
- ippsMul_64f((const double *)src,(const double *)op,(double *)dst,cnt);
- }
- else
- ERRINTERNAL();
-#else
-#ifdef FLEXT_USE_SIMD
-#ifdef _MSC_VER
- if(GetSIMDCapabilities()&simd_sse) {
- // single precision
- int n = cnt>>4;
- if(!n) goto zero;
- cnt -= n<<4;
-
- __asm {
- mov eax,[src]
- mov ebx,[op]
- prefetcht0 [eax+0]
- prefetcht0 [ebx+0]
- prefetcht0 [eax+32]
- prefetcht0 [ebx+32]
- }
-
- if(VectorsAligned(src,dst)) {
- if(IsVectorAligned(op)) {
- __asm {
- mov ecx,[n]
- mov eax,dword ptr [src]
- mov edx,dword ptr [dst]
- mov ebx,dword ptr [op]
- loopaa:
- prefetcht0 [eax+64]
- prefetcht0 [ebx+64]
- prefetcht0 [eax+96]
- prefetcht0 [ebx+96]
-
- movaps xmm0,xmmword ptr[eax]
- movaps xmm1,xmmword ptr[ebx]
- mulps xmm0,xmm1
- movaps xmmword ptr[edx],xmm0
-
- movaps xmm2,xmmword ptr[eax+4*4]
- movaps xmm3,xmmword ptr[ebx+4*4]
- mulps xmm2,xmm3
- movaps xmmword ptr[edx+4*4],xmm2
-
- movaps xmm4,xmmword ptr[eax+8*4]
- movaps xmm5,xmmword ptr[ebx+8*4]
- mulps xmm4,xmm5
- movaps xmmword ptr[edx+8*4],xmm4
-
- movaps xmm6,xmmword ptr[eax+12*4]
- movaps xmm7,xmmword ptr[ebx+12*4]
- mulps xmm6,xmm7
- movaps xmmword ptr[edx+12*4],xmm6
-
- add eax,16*4
- add ebx,16*4
- add edx,16*4
- loop loopaa
- }
- }
- else {
- __asm {
- mov ecx,[n]
- mov eax,dword ptr [src]
- mov edx,dword ptr [dst]
- mov ebx,dword ptr [op]
- loopau:
- prefetcht0 [eax+64]
- prefetcht0 [ebx+64]
- prefetcht0 [eax+96]
- prefetcht0 [ebx+96]
-
- movaps xmm0,xmmword ptr[eax]
- movups xmm1,xmmword ptr[ebx]
- mulps xmm0,xmm1
- movaps xmmword ptr[edx],xmm0
-
- movaps xmm2,xmmword ptr[eax+4*4]
- movups xmm3,xmmword ptr[ebx+4*4]
- mulps xmm2,xmm3
- movaps xmmword ptr[edx+4*4],xmm2
-
- movaps xmm4,xmmword ptr[eax+8*4]
- movups xmm5,xmmword ptr[ebx+8*4]
- mulps xmm4,xmm5
- movaps xmmword ptr[edx+8*4],xmm4
-
- movaps xmm6,xmmword ptr[eax+12*4]
- movups xmm7,xmmword ptr[ebx+12*4]
- mulps xmm6,xmm7
- movaps xmmword ptr[edx+12*4],xmm6
-
- add eax,16*4
- add ebx,16*4
- add edx,16*4
- loop loopau
- }
- }
- }
- else {
- if(IsVectorAligned(op)) {
- __asm {
- mov ecx,[n]
- mov eax,dword ptr [src]
- mov edx,dword ptr [dst]
- mov ebx,dword ptr [op]
- loopua:
- prefetcht0 [eax+64]
- prefetcht0 [ebx+64]
- prefetcht0 [eax+96]
- prefetcht0 [ebx+96]
-
- movups xmm0,xmmword ptr[eax]
- movaps xmm1,xmmword ptr[ebx]
- mulps xmm0,xmm1
- movups xmmword ptr[edx],xmm0
-
- movups xmm2,xmmword ptr[eax+4*4]
- movaps xmm3,xmmword ptr[ebx+4*4]
- mulps xmm2,xmm3
- movups xmmword ptr[edx+4*4],xmm2
-
- movups xmm4,xmmword ptr[eax+8*4]
- movaps xmm5,xmmword ptr[ebx+8*4]
- mulps xmm4,xmm5
- movups xmmword ptr[edx+8*4],xmm4
-
- movups xmm6,xmmword ptr[eax+12*4]
- movaps xmm7,xmmword ptr[ebx+12*4]
- mulps xmm6,xmm7
- movups xmmword ptr[edx+12*4],xmm6
-
- add eax,16*4
- add ebx,16*4
- add edx,16*4
- loop loopua
- }
- }
- else {
- __asm {
- mov ecx,[n]
- mov eax,dword ptr [src]
- mov edx,dword ptr [dst]
- mov ebx,dword ptr [op]
-loopuu:
- prefetcht0 [eax+64]
- prefetcht0 [ebx+64]
- prefetcht0 [eax+96]
- prefetcht0 [ebx+96]
-
- movups xmm0,xmmword ptr[eax]
- movups xmm1,xmmword ptr[ebx]
- mulps xmm0,xmm1
- movups xmmword ptr[edx],xmm0
-
- movups xmm2,xmmword ptr[eax+4*4]
- movups xmm3,xmmword ptr[ebx+4*4]
- mulps xmm2,xmm3
- movups xmmword ptr[edx+4*4],xmm2
-
- movups xmm4,xmmword ptr[eax+8*4]
- movups xmm5,xmmword ptr[ebx+8*4]
- mulps xmm4,xmm5
- movups xmmword ptr[edx+8*4],xmm4
-
- movups xmm6,xmmword ptr[eax+12*4]
- movups xmm7,xmmword ptr[ebx+12*4]
- mulps xmm6,xmm7
- movups xmmword ptr[edx+12*4],xmm6
-
- add eax,16*4
- add ebx,16*4
- add edx,16*4
- loop loopuu
- }
- }
- }
-
- src += n<<4,dst += n<<4,op += n<<4;
-zero:
- while(cnt--) *(dst++) = *(src++) * *(op++);
- }
- else
-#elif FLEXT_CPU == FLEXT_CPU_PPC && defined(__VDSP__)
- if(true) {
- vmul(src,1,op,1,dst,1,cnt);
- }
- else
-#elif FLEXT_CPU == FLEXT_CPU_PPC && defined(__VEC__)
- if(GetSIMDCapabilities()&simd_altivec && VectorsAligned(src,op,dst))
- MulAltivec(dst,src,op,cnt);
- else
-#endif // _MSC_VER
-#endif // FLEXT_USE_SIMD
- {
- int n = cnt>>3;
- cnt -= n<<3;
-
- if(src == dst) {
- while(n--) {
- dst[0] *= op[0]; dst[1] *= op[1];
- dst[2] *= op[2]; dst[3] *= op[3];
- dst[4] *= op[4]; dst[5] *= op[5];
- dst[6] *= op[6]; dst[7] *= op[7];
- dst += 8,op += 8;
- }
- while(cnt--) *(dst++) *= *(op++);
- }
- else {
- while(n--) {
- dst[0] = src[0]*op[0]; dst[1] = src[1]*op[1];
- dst[2] = src[2]*op[2]; dst[3] = src[3]*op[3];
- dst[4] = src[4]*op[4]; dst[5] = src[5]*op[5];
- dst[6] = src[6]*op[6]; dst[7] = src[7]*op[7];
- src += 8,dst += 8,op += 8;
- }
- while(cnt--) *(dst++) = *(src++) * *(op++);
- }
- }
-#endif
-}
-
-
-void flext::AddSamples(t_sample *dst,const t_sample *src,t_sample op,int cnt)
-{
-#ifdef FLEXT_USE_IPP
- if(sizeof(t_sample) == 4) {
- ippsAddC_32f((const float *)src,(float)op,(float *)dst,cnt);
- }
- else if(sizeof(t_sample) == 8) {
- ippsAddC_64f((const double *)src,(double)op,(double *)dst,cnt);
- }
- else
- ERRINTERNAL();
-#else
-#ifdef FLEXT_USE_SIMD
-#ifdef _MSC_VER
- if(GetSIMDCapabilities()&simd_sse) {
- // single precision
- int n = cnt>>4;
- if(!n) goto zero;
- cnt -= n<<4;
-
- __asm {
- mov eax,[src]
- prefetcht0 [eax+0]
- prefetcht0 [eax+32]
-
- movss xmm0,xmmword ptr [op]
- shufps xmm0,xmm0,0
- }
-
- if(VectorsAligned(src,dst)) {
- // aligned version
- __asm {
- mov ecx,[n]
- mov eax,dword ptr [src]
- mov edx,dword ptr [dst]
-loopa:
- prefetcht0 [eax+64]
- prefetcht0 [eax+96]
-
- movaps xmm1,xmmword ptr[eax]
- addps xmm1,xmm0
- movaps xmmword ptr[edx],xmm1
-
- movaps xmm2,xmmword ptr[eax+4*4]
- addps xmm2,xmm0
- movaps xmmword ptr[edx+4*4],xmm2
-
- movaps xmm3,xmmword ptr[eax+8*4]
- addps xmm3,xmm0
- movaps xmmword ptr[edx+8*4],xmm3
-
- movaps xmm4,xmmword ptr[eax+12*4]
- addps xmm4,xmm0
- movaps xmmword ptr[edx+12*4],xmm4
-
- add eax,16*4
- add edx,16*4
- loop loopa
- }
- }
- else {
- // unaligned version
- __asm {
- mov ecx,[n]
- mov eax,dword ptr [src]
- mov edx,dword ptr [dst]
-loopu:
- prefetcht0 [eax+64]
- prefetcht0 [eax+96]
-
- movups xmm1,xmmword ptr[eax]
- addps xmm1,xmm0
- movups xmmword ptr[edx],xmm1
-
- movups xmm2,xmmword ptr[eax+4*4]
- addps xmm2,xmm0
- movups xmmword ptr[edx+4*4],xmm2
-
- movups xmm3,xmmword ptr[eax+8*4]
- addps xmm3,xmm0
- movups xmmword ptr[edx+8*4],xmm3
-
- movups xmm4,xmmword ptr[eax+12*4]
- addps xmm4,xmm0
- movups xmmword ptr[edx+12*4],xmm4
-
- add eax,16*4
- add edx,16*4
- loop loopu
- }
- }
- src += n<<4,dst += n<<4; // op is a scalar and must not be advanced
-zero:
- while(cnt--) *(dst++) = *(src++)+op;
- }
- else
-#elif FLEXT_CPU == FLEXT_CPU_PPC && defined(__VEC__)
- if(GetSIMDCapabilities()&simd_altivec && VectorsAligned(src,dst))
- AddAltivec(dst,src,op,cnt);
- else
-#endif // _MSC_VER
-#endif // FLEXT_USE_SIMD
- {
- int n = cnt>>3;
- cnt -= n<<3;
-
- if(src == dst) {
- while(n--) {
- dst[0] += op; dst[1] += op; dst[2] += op; dst[3] += op;
- dst[4] += op; dst[5] += op; dst[6] += op; dst[7] += op;
- dst += 8;
- }
- while(cnt--) *(dst++) += op;
- }
- else {
- while(n--) {
- dst[0] = src[0]+op; dst[1] = src[1]+op;
- dst[2] = src[2]+op; dst[3] = src[3]+op;
- dst[4] = src[4]+op; dst[5] = src[5]+op;
- dst[6] = src[6]+op; dst[7] = src[7]+op;
- src += 8,dst += 8;
- }
- while(cnt--) *(dst++) = *(src++)+op;
- }
- }
-#endif
-}
-
-
-void flext::AddSamples(t_sample *dst,const t_sample *src,const t_sample *op,int cnt)
-{
-#ifdef FLEXT_USE_IPP
- if(sizeof(t_sample) == 4) {
- ippsAdd_32f((const float *)src,(const float *)op,(float *)dst,cnt);
- }
- else if(sizeof(t_sample) == 8) {
- ippsAdd_64f((const double *)src,(const double *)op,(double *)dst,cnt);
- }
- else
- ERRINTERNAL();
-#else
-#ifdef FLEXT_USE_SIMD
-#ifdef _MSC_VER
- if(GetSIMDCapabilities()&simd_sse) {
- // Prefetch cache
- __asm {
- mov eax,dword ptr [src]
- mov ebx,dword ptr [op]
- prefetcht0 [eax]
- prefetcht0 [ebx]
- prefetcht0 [eax+32]
- prefetcht0 [ebx+32]
- }
-
- // single precision
- int n = cnt>>4;
- if(!n) goto zero;
- cnt -= n<<4;
-
- if(VectorsAligned(src,dst)) {
- if(IsVectorAligned(op)) {
- __asm {
- mov ecx,dword ptr [n]
- mov eax,dword ptr [src]
- mov edx,dword ptr [dst]
- mov ebx,dword ptr [op]
- loopaa:
- prefetcht0 [eax+64]
- prefetcht0 [ebx+64]
- prefetcht0 [eax+96]
- prefetcht0 [ebx+96]
-
- movaps xmm0,xmmword ptr[eax]
- movaps xmm1,xmmword ptr[ebx]
- addps xmm0,xmm1
- movaps xmmword ptr[edx],xmm0
-
- movaps xmm2,xmmword ptr[eax+4*4]
- movaps xmm3,xmmword ptr[ebx+4*4]
- addps xmm2,xmm3
- movaps xmmword ptr[edx+4*4],xmm2
-
- movaps xmm4,xmmword ptr[eax+8*4]
- movaps xmm5,xmmword ptr[ebx+8*4]
- addps xmm4,xmm5
- movaps xmmword ptr[edx+8*4],xmm4
-
- movaps xmm6,xmmword ptr[eax+12*4]
- movaps xmm7,xmmword ptr[ebx+12*4]
- addps xmm6,xmm7
- movaps xmmword ptr[edx+12*4],xmm6
-
- add eax,16*4
- add ebx,16*4
- add edx,16*4
- loop loopaa
- }
- }
- else {
- __asm {
- mov ecx,dword ptr [n]
- mov eax,dword ptr [src]
- mov edx,dword ptr [dst]
- mov ebx,dword ptr [op]
- loopau:
- prefetcht0 [eax+64]
- prefetcht0 [ebx+64]
- prefetcht0 [eax+96]
- prefetcht0 [ebx+96]
-
- movaps xmm0,xmmword ptr[eax]
- movups xmm1,xmmword ptr[ebx]
- addps xmm0,xmm1
- movaps xmmword ptr[edx],xmm0
-
- movaps xmm2,xmmword ptr[eax+4*4]
- movups xmm3,xmmword ptr[ebx+4*4]
- addps xmm2,xmm3
- movaps xmmword ptr[edx+4*4],xmm2
-
- movaps xmm4,xmmword ptr[eax+8*4]
- movups xmm5,xmmword ptr[ebx+8*4]
- addps xmm4,xmm5
- movaps xmmword ptr[edx+8*4],xmm4
-
- movaps xmm6,xmmword ptr[eax+12*4]
- movups xmm7,xmmword ptr[ebx+12*4]
- addps xmm6,xmm7
- movaps xmmword ptr[edx+12*4],xmm6
-
- add eax,16*4
- add ebx,16*4
- add edx,16*4
- loop loopau
- }
- }
- }
- else {
- if(IsVectorAligned(op)) {
- __asm {
- mov ecx,dword ptr [n]
- mov eax,dword ptr [src]
- mov edx,dword ptr [dst]
- mov ebx,dword ptr [op]
- loopua:
- prefetcht0 [eax+64]
- prefetcht0 [ebx+64]
- prefetcht0 [eax+96]
- prefetcht0 [ebx+96]
-
- movups xmm0,xmmword ptr[eax]
- movaps xmm1,xmmword ptr[ebx]
- addps xmm0,xmm1
- movups xmmword ptr[edx],xmm0
-
- movups xmm2,xmmword ptr[eax+4*4]
- movaps xmm3,xmmword ptr[ebx+4*4]
- addps xmm2,xmm3
- movups xmmword ptr[edx+4*4],xmm2
-
- movups xmm4,xmmword ptr[eax+8*4]
- movaps xmm5,xmmword ptr[ebx+8*4]
- addps xmm4,xmm5
- movups xmmword ptr[edx+8*4],xmm4
-
- movups xmm6,xmmword ptr[eax+12*4]
- movaps xmm7,xmmword ptr[ebx+12*4]
- addps xmm6,xmm7
- movups xmmword ptr[edx+12*4],xmm6
-
- add eax,16*4
- add ebx,16*4
- add edx,16*4
- loop loopua
- }
- }
- else {
- __asm {
- mov ecx,dword ptr [n]
- mov eax,dword ptr [src]
- mov edx,dword ptr [dst]
- mov ebx,dword ptr [op]
- loopuu:
- prefetcht0 [eax+64]
- prefetcht0 [ebx+64]
- prefetcht0 [eax+96]
- prefetcht0 [ebx+96]
-
- movups xmm0,xmmword ptr[eax]
- movups xmm1,xmmword ptr[ebx]
- addps xmm0,xmm1
- movups xmmword ptr[edx],xmm0
-
- movups xmm2,xmmword ptr[eax+4*4]
- movups xmm3,xmmword ptr[ebx+4*4]
- addps xmm2,xmm3
- movups xmmword ptr[edx+4*4],xmm2
-
- movups xmm4,xmmword ptr[eax+8*4]
- movups xmm5,xmmword ptr[ebx+8*4]
- addps xmm4,xmm5
- movups xmmword ptr[edx+8*4],xmm4
-
- movups xmm6,xmmword ptr[eax+12*4]
- movups xmm7,xmmword ptr[ebx+12*4]
- addps xmm6,xmm7
- movups xmmword ptr[edx+12*4],xmm6
-
- add eax,16*4
- add ebx,16*4
- add edx,16*4
- loop loopuu
- }
- }
- }
-
- src += n<<4,dst += n<<4,op += n<<4;
-zero:
- while(cnt--) *(dst++) = *(src++) + *(op++);
- }
- else
-#elif FLEXT_CPU == FLEXT_CPU_PPC && defined(__VDSP__)
- if(true) {
- vadd(src,1,op,1,dst,1,cnt);
- }
- else
-#elif FLEXT_CPU == FLEXT_CPU_PPC && defined(__VEC__)
- if(GetSIMDCapabilities()&simd_altivec && VectorsAligned(src,op,dst))
- AddAltivec(dst,src,op,cnt);
- else
-#endif // _MSC_VER
-#endif // FLEXT_USE_SIMD
- {
- int n = cnt>>3;
- cnt -= n<<3;
-
- if(dst == src) {
- while(n--) {
- dst[0] += op[0]; dst[1] += op[1];
- dst[2] += op[2]; dst[3] += op[3];
- dst[4] += op[4]; dst[5] += op[5];
- dst[6] += op[6]; dst[7] += op[7];
- dst += 8,op += 8;
- }
- while(cnt--) *(dst++) += *(op++);
- }
- else {
- while(n--) {
- dst[0] = src[0]+op[0]; dst[1] = src[1]+op[1];
- dst[2] = src[2]+op[2]; dst[3] = src[3]+op[3];
- dst[4] = src[4]+op[4]; dst[5] = src[5]+op[5];
- dst[6] = src[6]+op[6]; dst[7] = src[7]+op[7];
- src += 8,dst += 8,op += 8;
- }
- while(cnt--) *(dst++) = *(src++) + *(op++);
- }
- }
-#endif
-}
-
-
-void flext::ScaleSamples(t_sample *dst,const t_sample *src,t_sample opmul,t_sample opadd,int cnt)
-{
-#ifdef FLEXT_USE_IPP
- if(sizeof(t_sample) == 4) {
- ippsMulC_32f((const float *)src,(float)opmul,(float *)dst,cnt);
- ippsAddC_32f_I((float)opadd,(float *)dst,cnt);
- }
- else if(sizeof(t_sample) == 8) {
- ippsMulC_64f((const double *)src,(double)opmul,(double *)dst,cnt);
- ippsAddC_64f_I((double)opadd,(double *)dst,cnt);
- }
- else
- ERRINTERNAL();
-#else
-#ifdef FLEXT_USE_SIMD
-#ifdef _MSC_VER
- if(GetSIMDCapabilities()&simd_sse) {
- // single precision
- int n = cnt>>4;
- if(!n) goto zero;
- cnt -= n<<4;
-
- __asm {
- mov eax,dword ptr [src]
- prefetcht0 [eax+0]
- prefetcht0 [eax+32]
-
- movss xmm0,xmmword ptr [opadd]
- shufps xmm0,xmm0,0
- movss xmm1,xmmword ptr [opmul]
- shufps xmm1,xmm1,0
- }
-
- if(VectorsAligned(src,dst)) {
- // aligned version
- __asm {
- mov ecx,dword ptr [n]
- mov eax,dword ptr [src]
- mov edx,dword ptr [dst]
-loopa:
- prefetcht0 [eax+64]
- prefetcht0 [eax+96]
-
- movaps xmm2,xmmword ptr[eax]
- mulps xmm2,xmm1
- addps xmm2,xmm0
- movaps xmmword ptr[edx],xmm2
-
- movaps xmm3,xmmword ptr[eax+4*4]
- mulps xmm3,xmm1
- addps xmm3,xmm0
- movaps xmmword ptr[edx+4*4],xmm3
-
- movaps xmm4,xmmword ptr[eax+8*4]
- mulps xmm4,xmm1
- addps xmm4,xmm0
- movaps xmmword ptr[edx+8*4],xmm4
-
- movaps xmm5,xmmword ptr[eax+12*4]
- mulps xmm5,xmm1
- addps xmm5,xmm0
- movaps xmmword ptr[edx+12*4],xmm5
-
- add eax,16*4
- add edx,16*4
- loop loopa
- }
- }
- else {
- // unaligned version
- __asm {
- mov ecx,dword ptr [n]
- mov eax,dword ptr [src]
- mov edx,dword ptr [dst]
-loopu:
- prefetcht0 [eax+64]
- prefetcht0 [eax+96]
-
- movups xmm2,xmmword ptr[eax]
- mulps xmm2,xmm1
- addps xmm2,xmm0
- movups xmmword ptr[edx],xmm2
-
- movups xmm3,xmmword ptr[eax+4*4]
- mulps xmm3,xmm1
- addps xmm3,xmm0
- movups xmmword ptr[edx+4*4],xmm3
-
- movups xmm4,xmmword ptr[eax+8*4]
- mulps xmm4,xmm1
- addps xmm4,xmm0
- movups xmmword ptr[edx+8*4],xmm4
-
- movups xmm5,xmmword ptr[eax+12*4]
- mulps xmm5,xmm1
- addps xmm5,xmm0
- movups xmmword ptr[edx+12*4],xmm5
-
- add eax,16*4
- add edx,16*4
- loop loopu
- }
- }
-
- src += n<<4,dst += n<<4;
-zero:
- while(cnt--) *(dst++) = *(src++)*opmul+opadd;
- }
- else
-#elif FLEXT_CPU == FLEXT_CPU_PPC && defined(__VEC__)
- if(GetSIMDCapabilities()&simd_altivec && VectorsAligned(src,dst))
- ScaleAltivec(dst,src,opmul,opadd,cnt);
- else
-#endif // _MSC_VER
-#endif // FLEXT_USE_SIMD
- {
- int n = cnt>>3;
- cnt -= n<<3;
- while(n--) {
- dst[0] = src[0]*opmul+opadd; dst[1] = src[1]*opmul+opadd;
- dst[2] = src[2]*opmul+opadd; dst[3] = src[3]*opmul+opadd;
- dst[4] = src[4]*opmul+opadd; dst[5] = src[5]*opmul+opadd;
- dst[6] = src[6]*opmul+opadd; dst[7] = src[7]*opmul+opadd;
- src += 8,dst += 8;
- }
- while(cnt--) *(dst++) = *(src++)*opmul+opadd;
- }
-#endif
-}
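-
-/*
- Usage sketch for the scalar variant above, assuming gain and offset are plain
- t_sample values and both buffers hold cnt samples:
-
- flext::ScaleSamples(out,in,gain,offset,cnt); // out[i] = in[i]*gain+offset
-*/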
-
-void flext::ScaleSamples(t_sample *dst,const t_sample *src,t_sample opmul,const t_sample *opadd,int cnt)
-{
-#ifdef FLEXT_USE_IPP
- if(sizeof(t_sample) == 4) {
- ippsMulC_32f((const float *)src,(float)opmul,(float *)dst,cnt);
- ippsAdd_32f_I((const float *)opadd,(float *)dst,cnt);
- }
- else if(sizeof(t_sample) == 8) {
- ippsMulC_64f((const double *)src,(double)opmul,(double *)dst,cnt);
- ippsAdd_64f_I((const double *)opadd,(double *)dst,cnt);
- }
- else
- ERRINTERNAL();
-#else
-#ifdef FLEXT_USE_SIMD
-#ifdef _MSC_VER
- if(GetSIMDCapabilities()&simd_sse) {
- // single precision
- int n = cnt>>4;
- if(!n) goto zero;
- cnt -= n<<4;
-
- __asm {
- mov eax,dword ptr [src]
- prefetcht0 [eax+0]
- prefetcht0 [eax+32]
-
- movss xmm0,xmmword ptr [opmul]
- shufps xmm0,xmm0,0
- }
-
- if(VectorsAligned(src,dst,opadd)) {
- // aligned version
- __asm {
- mov ecx,dword ptr [n]
- mov eax,dword ptr [src]
- mov edx,dword ptr [dst]
- mov ebx,dword ptr [opadd]
-loopa:
- prefetcht0 [eax+64]
- prefetcht0 [ebx+64]
- prefetcht0 [eax+96]
- prefetcht0 [ebx+96]
-
- movaps xmm2,xmmword ptr[eax]
- movaps xmm1,xmmword ptr[ebx]
- mulps xmm2,xmm0
- addps xmm2,xmm1
- movaps xmmword ptr[edx],xmm2
-
- movaps xmm3,xmmword ptr[eax+4*4]
- movaps xmm1,xmmword ptr[ebx+4*4]
- mulps xmm3,xmm0
- addps xmm3,xmm1
- movaps xmmword ptr[edx+4*4],xmm3
-
- movaps xmm4,xmmword ptr[eax+8*4]
- movaps xmm1,xmmword ptr[ebx+8*4]
- mulps xmm4,xmm0
- addps xmm4,xmm1
- movaps xmmword ptr[edx+8*4],xmm4
-
- movaps xmm5,xmmword ptr[eax+12*4]
- movaps xmm1,xmmword ptr[ebx+12*4]
- mulps xmm5,xmm0
- addps xmm5,xmm1
- movaps xmmword ptr[edx+12*4],xmm5
-
- add eax,16*4
- add edx,16*4
- add ebx,16*4
- loop loopa
- }
- }
- else {
- // unaligned version
- __asm {
- mov ecx,dword ptr [n]
- mov eax,dword ptr [src]
- mov edx,dword ptr [dst]
- mov ebx,dword ptr [opadd]
-loopu:
- prefetcht0 [eax+64]
- prefetcht0 [ebx+64]
- prefetcht0 [eax+96]
- prefetcht0 [ebx+96]
-
- movups xmm2,xmmword ptr[eax]
- movups xmm1,xmmword ptr[ebx]
- mulps xmm2,xmm0
- addps xmm2,xmm1
- movups xmmword ptr[edx],xmm2
-
- movups xmm3,xmmword ptr[eax+4*4]
- movups xmm1,xmmword ptr[ebx+4*4]
- mulps xmm3,xmm0
- addps xmm3,xmm1
- movups xmmword ptr[edx+4*4],xmm3
-
- movups xmm4,xmmword ptr[eax+8*4]
- movups xmm1,xmmword ptr[ebx+8*4]
- mulps xmm4,xmm0
- addps xmm4,xmm1
- movups xmmword ptr[edx+8*4],xmm4
-
- movups xmm5,xmmword ptr[eax+12*4]
- movups xmm1,xmmword ptr[ebx+12*4]
- mulps xmm5,xmm0
- addps xmm5,xmm1
- movups xmmword ptr[edx+12*4],xmm5
-
- add eax,16*4
- add edx,16*4
- add ebx,16*4
- loop loopu
- }
- }
-
- src += n<<4,dst += n<<4,opadd += n<<4;
-zero:
- while(cnt--) *(dst++) = *(src++) * opmul + *(opadd++);
- }
- else
-#elif FLEXT_CPU == FLEXT_CPU_PPC && defined(__VEC__)
- if(GetSIMDCapabilities()&simd_altivec && VectorsAligned(src,dst,opadd))
- ScaleAltivec(dst,src,opmul,opadd,cnt);
- else
-#endif // _MSC_VER
-#endif // FLEXT_USE_SIMD
- {
- int n = cnt>>3;
- cnt -= n<<3;
- if(dst == opadd) {
- while(n--) {
- dst[0] += src[0]*opmul; dst[1] += src[1]*opmul;
- dst[2] += src[2]*opmul; dst[3] += src[3]*opmul;
- dst[4] += src[4]*opmul; dst[5] += src[5]*opmul;
- dst[6] += src[6]*opmul; dst[7] += src[7]*opmul;
- src += 8,dst += 8;
- }
- while(cnt--) *(dst++) += *(src++)*opmul;
- }
- else {
- while(n--) {
- dst[0] = src[0]*opmul+opadd[0]; dst[1] = src[1]*opmul+opadd[1];
- dst[2] = src[2]*opmul+opadd[2]; dst[3] = src[3]*opmul+opadd[3];
- dst[4] = src[4]*opmul+opadd[4]; dst[5] = src[5]*opmul+opadd[5];
- dst[6] = src[6]*opmul+opadd[6]; dst[7] = src[7]*opmul+opadd[7];
- src += 8,dst += 8,opadd += 8;
- }
- while(cnt--) *(dst++) = *(src++)*opmul+*(opadd++);
- }
- }
-#endif
-}
-
-void flext::ScaleSamples(t_sample *dst,const t_sample *src,const t_sample *opmul,const t_sample *opadd,int cnt)
-{
-#ifdef FLEXT_USE_IPP
- if(sizeof(t_sample) == 4) {
- ippsMul_32f((const float *)src,(const float *)opmul,(float *)dst,cnt);
- ippsAdd_32f_I((const float *)opadd,(float *)dst,cnt);
- }
- else if(sizeof(t_sample) == 8) {
- ippsMul_64f((const double *)src,(const double *)opmul,(double *)dst,cnt);
- ippsAdd_64f_I((const double *)opadd,(double *)dst,cnt);
- }
- else
- ERRINTERNAL();
-#else
-#ifdef FLEXT_USE_SIMD
-#ifdef _MSC_VER
- if(GetSIMDCapabilities()&simd_sse) {
- // single precision
- int n = cnt>>4;
- if(!n) goto zero;
- cnt -= n<<4;
-
- __asm {
- mov eax,dword ptr [src]
- prefetcht0 [eax+0]
- prefetcht0 [eax+32]
- }
-
- if(VectorsAligned(src,dst,opmul,opadd)) {
- // aligned version
- __asm {
- mov ecx,dword ptr [n]
- mov eax,dword ptr [src]
- mov edx,dword ptr [dst]
- mov esi,dword ptr [opmul]
- mov ebx,dword ptr [opadd]
-loopa:
- prefetcht0 [eax+64]
- prefetcht0 [ebx+64]
- prefetcht0 [esi+64]
- prefetcht0 [eax+96]
- prefetcht0 [ebx+96]
- prefetcht0 [esi+96]
-
- movaps xmm2,xmmword ptr[eax]
- movaps xmm0,xmmword ptr[esi]
- movaps xmm1,xmmword ptr[ebx]
- mulps xmm2,xmm0
- addps xmm2,xmm1
- movaps xmmword ptr[edx],xmm2
-
- movaps xmm3,xmmword ptr[eax+4*4]
- movaps xmm0,xmmword ptr[esi+4*4]
- movaps xmm1,xmmword ptr[ebx+4*4]
- mulps xmm3,xmm0
- addps xmm3,xmm1
- movaps xmmword ptr[edx+4*4],xmm3
-
- movaps xmm4,xmmword ptr[eax+8*4]
- movaps xmm0,xmmword ptr[esi+8*4]
- movaps xmm1,xmmword ptr[ebx+8*4]
- mulps xmm4,xmm0
- addps xmm4,xmm1
- movaps xmmword ptr[edx+8*4],xmm4
-
- movaps xmm5,xmmword ptr[eax+12*4]
- movaps xmm0,xmmword ptr[esi+12*4]
- movaps xmm1,xmmword ptr[ebx+12*4]
- mulps xmm5,xmm0
- addps xmm5,xmm1
- movaps xmmword ptr[edx+12*4],xmm5
-
- add eax,16*4
- add edx,16*4
- add ebx,16*4
- add esi,16*4
- loop loopa
- }
- }
- else {
- // unaligned version
- __asm {
- mov ecx,dword ptr [n]
- mov eax,dword ptr [src]
- mov edx,dword ptr [dst]
- mov esi,dword ptr [opmul]
- mov ebx,dword ptr [opadd]
-loopu:
- prefetcht0 [eax+64]
- prefetcht0 [ebx+64]
- prefetcht0 [esi+64]
- prefetcht0 [eax+96]
- prefetcht0 [ebx+96]
- prefetcht0 [esi+96]
-
- movups xmm2,xmmword ptr[eax]
- movups xmm0,xmmword ptr[esi]
- movups xmm1,xmmword ptr[ebx]
- mulps xmm2,xmm0
- addps xmm2,xmm1
- movups xmmword ptr[edx],xmm2
-
- movups xmm3,xmmword ptr[eax+4*4]
- movups xmm0,xmmword ptr[esi+4*4]
- movups xmm1,xmmword ptr[ebx+4*4]
- mulps xmm3,xmm0
- addps xmm3,xmm1
- movups xmmword ptr[edx+4*4],xmm3
-
- movups xmm4,xmmword ptr[eax+8*4]
- movups xmm0,xmmword ptr[esi+8*4]
- movups xmm1,xmmword ptr[ebx+8*4]
- mulps xmm4,xmm0
- addps xmm4,xmm1
- movups xmmword ptr[edx+8*4],xmm4
-
- movups xmm5,xmmword ptr[eax+12*4]
- movups xmm0,xmmword ptr[esi+12*4]
- movups xmm1,xmmword ptr[ebx+12*4]
- mulps xmm5,xmm0
- addps xmm5,xmm1
- movups xmmword ptr[edx+12*4],xmm5
-
- add eax,16*4
- add edx,16*4
- add ebx,16*4
- add esi,16*4
- loop loopu
- }
- }
- src += n<<4,dst += n<<4,opmul += n<<4,opadd += n<<4;
-zero:
- while(cnt--) *(dst++) = *(src++) * *(opmul++) + *(opadd++);
- }
- else
-#elif FLEXT_CPU == FLEXT_CPU_PPC && defined(__VEC__)
- if(GetSIMDCapabilities()&simd_altivec && VectorsAligned(src,dst,opmul,opadd))
- ScaleAltivec(dst,src,opmul,opadd,cnt);
- else
-#endif // _MSC_VER
-#endif // FLEXT_USE_SIMD
- {
- int n = cnt>>3;
- cnt -= n<<3;
- if(dst == opadd) {
- while(n--) {
- dst[0] += src[0]*opmul[0]; dst[1] += src[1]*opmul[1];
- dst[2] += src[2]*opmul[2]; dst[3] += src[3]*opmul[3];
- dst[4] += src[4]*opmul[4]; dst[5] += src[5]*opmul[5];
- dst[6] += src[6]*opmul[6]; dst[7] += src[7]*opmul[7];
- src += 8,dst += 8,opmul += 8;
- }
- while(cnt--) *(dst++) += *(src++) * *(opmul++);
- }
- else {
- while(n--) {
- dst[0] = src[0]*opmul[0]+opadd[0]; dst[1] = src[1]*opmul[1]+opadd[1];
- dst[2] = src[2]*opmul[2]+opadd[2]; dst[3] = src[3]*opmul[3]+opadd[3];
- dst[4] = src[4]*opmul[4]+opadd[4]; dst[5] = src[5]*opmul[5]+opadd[5];
- dst[6] = src[6]*opmul[6]+opadd[6]; dst[7] = src[7]*opmul[7]+opadd[7];
- src += 8,dst += 8,opmul += 8,opadd += 8;
- }
- while(cnt--) *(dst++) = *(src++)* *(opmul++) + *(opadd++);
- }
- }
-#endif
-}