From b84b298888b12a3c00ffd6cabd5b39a5937e1347 Mon Sep 17 00:00:00 2001 From: Thomas Grill Date: Sat, 17 May 2003 02:32:51 +0000 Subject: "" svn path=/trunk/; revision=625 --- externals/grill/flext/source/flsimd.cpp | 209 ++++++++++++++++++++++++++++---- 1 file changed, 185 insertions(+), 24 deletions(-) (limited to 'externals/grill/flext/source/flsimd.cpp') diff --git a/externals/grill/flext/source/flsimd.cpp b/externals/grill/flext/source/flsimd.cpp index f5db9f5e..bebf3d97 100755 --- a/externals/grill/flext/source/flsimd.cpp +++ b/externals/grill/flext/source/flsimd.cpp @@ -9,7 +9,12 @@ WARRANTIES, see the file, "license.txt," in this distribution. */ /*! \file flsimd.cpp - \brief flext SIMD support functions + \brief flext SIMD support functions + + If FLEXT_USE_SIMD is defined at compilation, SIMD instructions are used wherever feasible. + If used with MSVC++ the "Processor Pack" must be installed. + + If FLEXT_USE_IPP is defined the Intel Performance Package is used. */ #include "flext.h" @@ -19,14 +24,22 @@ WARRANTIES, see the file, "license.txt," in this distribution. #include #endif -#if FLEXT_CPU == FLEXT_CPU_PPC && defined(__MWERKS__) -#include "Altivec.h" -#endif - #ifdef FLEXT_USE_IPP #include #endif +#ifdef FLEXT_USE_SIMD + #ifdef _MSC_VER + // include MSVC SIMD header files + #include // MMX + #include // SSE + #include // SSE2 + #include // 3DNow! + #elif FLEXT_CPU == FLEXT_CPU_PPC && defined(__MWERKS__) + #include "Altivec.h" + #endif +#endif // FLEXT_USE_SIMD + static unsigned long setsimdcaps(); /*! \brief Holds SIMD capability flags @@ -37,6 +50,7 @@ unsigned long flext::simdcaps = setsimdcaps(); unsigned long flext::GetSIMDCapabilities() { return simdcaps; } +#ifdef FLEXT_USE_SIMD #if FLEXT_CPU == FLEXT_CPU_INTEL @@ -247,6 +261,10 @@ static unsigned long setsimdcaps() return simdflags; } +#else // FLEXT_USE_SIMD +static unsigned long setsimdcaps() { return 0; } +#endif // FLEXT_USE_SIMD + void flext::CopySamples(t_sample *dst,const t_sample *src,int cnt) { @@ -258,17 +276,89 @@ void flext::CopySamples(t_sample *dst,const t_sample *src,int cnt) else ERRINTERNAL(); #else - int n = cnt>>3; - cnt -= n<<3; - while(n--) { - dst[0] = src[0]; dst[1] = src[1]; - dst[2] = src[2]; dst[3] = src[3]; - dst[4] = src[4]; dst[5] = src[5]; - dst[6] = src[6]; dst[7] = src[7]; - src += 8,dst += 8; - } - - while(cnt--) *(dst++) = *(src++); +#ifdef FLEXT_USE_SIMD +#ifdef _MSC_VER +#if 1 // t_sample is float + if(GetSIMDCapabilities()&simd_sse) { + // single precision + + int n = cnt>>4; + cnt -= n<<4; + + if((reinterpret_cast(src)&(__alignof(t_sample)-1)) == 0 + && (reinterpret_cast(dst)&(__alignof(t_sample)-1)) == 0 + ) { + // aligned version + while(n--) { + _mm_store_ps(dst+0,_mm_load_ps(src+0)); + _mm_store_ps(dst+4,_mm_load_ps(src+4)); + _mm_store_ps(dst+8,_mm_load_ps(src+8)); + _mm_store_ps(dst+12,_mm_load_ps(src+12)); + src += 16,dst += 16; + } + } + else { + // unaligned version + while(n--) { + _mm_storeu_ps(dst+0,_mm_loadu_ps(src+0)); + _mm_storeu_ps(dst+4,_mm_loadu_ps(src+4)); + _mm_storeu_ps(dst+8,_mm_loadu_ps(src+8)); + _mm_storeu_ps(dst+12,_mm_loadu_ps(src+12)); + src += 16,dst += 16; + } + } + while(cnt--) *(dst++) = *(src++); + } + else +#elif 0 // t_sample is double + if(GetSIMDCapabilities()&simd_sse2) { + // double precision + + int n = cnt>>3; + cnt -= n<<3; + + if((reinterpret_cast(src)&(__alignof(t_sample)-1)) == 0 + && (reinterpret_cast(dst)&(__alignof(t_sample)-1)) == 0 + ) { + // aligned version + while(n--) { + _mm_store_pd(dst+0,_mm_load_pd(src+0)); + _mm_store_pd(dst+2,_mm_load_pd(src+2)); + _mm_store_pd(dst+4,_mm_load_pd(src+4)); + _mm_store_pd(dst+6,_mm_load_pd(src+6)); + src += 8,dst += 8; + } + } + else { + // unaligned version + while(n--) { + _mm_storeu_pd(dst+0,_mm_loadu_pd(src+0)); + _mm_storeu_pd(dst+2,_mm_loadu_pd(src+2)); + _mm_storeu_pd(dst+4,_mm_loadu_pd(src+4)); + _mm_storeu_pd(dst+6,_mm_loadu_pd(src+6)); + src += 8,dst += 8; + } + } + while(cnt--) *(dst++) = *(src++); + } + else +#else + #error t_sample data type has illegal size +#endif +#endif // _MSC_VER +#endif // FLEXT_USE_SIMD + { + int n = cnt>>3; + cnt -= n<<3; + while(n--) { + dst[0] = src[0]; dst[1] = src[1]; + dst[2] = src[2]; dst[3] = src[3]; + dst[4] = src[4]; dst[5] = src[5]; + dst[6] = src[6]; dst[7] = src[7]; + src += 8,dst += 8; + } + while(cnt--) *(dst++) = *(src++); + } #endif } @@ -282,13 +372,84 @@ void flext::SetSamples(t_sample *dst,int cnt,t_sample s) else ERRINTERNAL(); #else - int n = cnt>>3; - cnt -= n<<3; - while(n--) { - dst[0] = dst[1] = dst[2] = dst[3] = dst[4] = dst[5] = dst[6] = dst[7] = s; - dst += 8; - } - - while(cnt--) *(dst++) = s; +#ifdef FLEXT_USE_SIMD +#ifdef _MSC_VER +#if 1 // t_sample is float + if(GetSIMDCapabilities()&simd_sse) { + // single precision + + __m128 v = _mm_load1_ps(&s); + int n = cnt>>4; + cnt -= n<<4; + + if((reinterpret_cast(dst)&(__alignof(t_sample)-1)) == 0) { + // aligned version + while(n--) { + _mm_store_ps(dst+0,v); + _mm_store_ps(dst+4,v); + _mm_store_ps(dst+8,v); + _mm_store_ps(dst+12,v); + dst += 16; + } + } + else { + // unaligned version + while(n--) { + _mm_storeu_ps(dst+0,v); + _mm_storeu_ps(dst+4,v); + _mm_storeu_ps(dst+8,v); + _mm_storeu_ps(dst+12,v); + dst += 16; + } + } + while(cnt--) *(dst++) = s; + } + else +#elif 0 // t_sample is double + if(GetSIMDCapabilities()&simd_sse2) { + // double precision + + __m128 v = _mm_load1_pd(&s); + int n = cnt>>3; + cnt -= n<<3; + + if((reinterpret_cast(dst)&(__alignof(t_sample)-1)) == 0) { + // aligned version + while(n--) { + _mm_store_pd(dst+0,v); + _mm_store_pd(dst+2,v); + _mm_store_pd(dst+4,v); + _mm_store_pd(dst+8,v); + dst += 8; + } + } + else { + // unaligned version + while(n--) { + _mm_storeu_pd(dst+0,v); + _mm_storeu_pd(dst+2,v); + _mm_storeu_pd(dst+4,v); + _mm_storeu_pd(dst+8,v); + dst += 8; + } + } + while(cnt--) *(dst++) = s; + } + else +#else + #error t_sample data type has illegal size +#endif +#endif // _MSC_VER +#endif // FLEXT_USE_SIMD + { + int n = cnt>>3; + cnt -= n<<3; + while(n--) { + dst[0] = dst[1] = dst[2] = dst[3] = dst[4] = dst[5] = dst[6] = dst[7] = s; + dst += 8; + } + + while(cnt--) *(dst++) = s; + } #endif } -- cgit v1.2.1