From 44d97258734b898c3cb0f8cdfd199352de2bfc2d Mon Sep 17 00:00:00 2001 From: Thomas Grill Date: Mon, 21 Jul 2003 02:37:30 +0000 Subject: "" svn path=/trunk/; revision=782 --- externals/grill/flext/source/flattr_ed.cpp | 57 +--- externals/grill/flext/source/flbase.h | 7 + externals/grill/flext/source/flprefix.h | 1 - externals/grill/flext/source/flsimd.cpp | 471 +++++++++++++++++++++-------- 4 files changed, 362 insertions(+), 174 deletions(-) (limited to 'externals/grill/flext/source') diff --git a/externals/grill/flext/source/flattr_ed.cpp b/externals/grill/flext/source/flattr_ed.cpp index ec096ef5..d138a97b 100644 --- a/externals/grill/flext/source/flattr_ed.cpp +++ b/externals/grill/flext/source/flattr_ed.cpp @@ -12,7 +12,7 @@ WARRANTIES, see the file, "license.txt," in this distribution. \brief Attribute editor (property dialog) for PD */ -#include "flprefix.h" +#include "flext.h" #if FLEXT_SYS == FLEXT_SYS_PD && !defined(FLEXT_NOATTREDIT) @@ -20,64 +20,13 @@ WARRANTIES, see the file, "license.txt," in this distribution. #pragma warning( disable : 4091 ) #endif +// This is problematic... non-public headers! #include -#include "flext.h" +#include #include #include -#ifdef __MWERKS__ -#define STD std -#else -#define STD -#endif - - -#if !defined(PD_VERSION_MAJOR) - /* PD version 0.36 or below */ - - /* Call this to get a gobj's bounding rectangle in pixels */ - typedef void (*t_getrectfn)(t_gobj *x, struct _glist *glist, - int *x1, int *y1, int *x2, int *y2); - /* and this to displace a gobj: */ - typedef void (*t_displacefn)(t_gobj *x, struct _glist *glist, int dx, int dy); - /* change color to show selection: */ - typedef void (*t_selectfn)(t_gobj *x, struct _glist *glist, int state); - /* change appearance to show activation/deactivation: */ - typedef void (*t_activatefn)(t_gobj *x, struct _glist *glist, int state); - /* warn a gobj it's about to be deleted */ - typedef void (*t_deletefn)(t_gobj *x, struct _glist *glist); - /* making visible or invisible */ - typedef void (*t_visfn)(t_gobj *x, struct _glist *glist, int flag); - /* field a mouse click (when not in "edit" mode) */ - typedef int (*t_clickfn)(t_gobj *x, struct _glist *glist, - int xpix, int ypix, int shift, int alt, int dbl, int doit); - /* save to a binbuf */ - typedef void (*t_savefn)(t_gobj *x, t_binbuf *b); - /* open properties dialog */ - typedef void (*t_propertiesfn)(t_gobj *x, struct _glist *glist); - /* ... and later, resizing; getting/setting font or color... */ - - struct _widgetbehavior - { - t_getrectfn w_getrectfn; - t_displacefn w_displacefn; - t_selectfn w_selectfn; - t_activatefn w_activatefn; - t_deletefn w_deletefn; - t_visfn w_visfn; - t_clickfn w_clickfn; - t_savefn w_savefn; - t_propertiesfn w_propertiesfn; - }; - -#elif !defined(PD_VERSION_MINOR) - #error Flext cannot be compiled with this version! -#else - #include -#endif - - static t_widgetbehavior widgetbehavior; static void (*ori_vis)(t_gobj *c, t_glist *, int vis) = NULL; diff --git a/externals/grill/flext/source/flbase.h b/externals/grill/flext/source/flbase.h index 795c0673..eec2e520 100644 --- a/externals/grill/flext/source/flbase.h +++ b/externals/grill/flext/source/flbase.h @@ -21,6 +21,13 @@ WARRANTIES, see the file, "license.txt," in this distribution. #include "flsupport.h" +// ----- disable attribute editor for PD version < devel_0_36 or 0.37 +#ifndef PD_MAJOR_VERSION +#undef FLEXT_NOATTREDIT +#define FLEXT_NOATTREDIT +#endif + + class FLEXT_SHARE FLEXT_CLASSDEF(flext_obj); typedef class FLEXT_CLASSDEF(flext_obj) flext_obj; diff --git a/externals/grill/flext/source/flprefix.h b/externals/grill/flext/source/flprefix.h index 7ab517df..c27d1a67 100755 --- a/externals/grill/flext/source/flprefix.h +++ b/externals/grill/flext/source/flprefix.h @@ -371,7 +371,6 @@ WARRANTIES, see the file, "license.txt," in this distribution. #define FLEXT_CLASSDEF(CL) CL##_single #endif - // std namespace #ifdef __MWERKS__ #define STD std diff --git a/externals/grill/flext/source/flsimd.cpp b/externals/grill/flext/source/flsimd.cpp index 3dcee887..88cbdb89 100755 --- a/externals/grill/flext/source/flsimd.cpp +++ b/externals/grill/flext/source/flsimd.cpp @@ -287,6 +287,12 @@ void flext::CopySamples(t_sample *dst,const t_sample *src,int cnt) if(GetSIMDCapabilities()&simd_sse) { // single precision + __asm { + mov eax,dword ptr [src] + prefetcht0 [eax+0] + prefetcht0 [eax+32] + } + int n = cnt>>4; cnt -= n<<4; @@ -295,10 +301,11 @@ void flext::CopySamples(t_sample *dst,const t_sample *src,int cnt) // aligned src, aligned dst __asm { mov eax,dword ptr [src] - prefetcht0 [eax] mov edx,dword ptr [dst] mov ecx,[n] - loopaa: +loopaa: + prefetcht0 [eax+64] + prefetcht0 [eax+96] movaps xmm0,xmmword ptr[eax] movaps xmmword ptr[edx],xmm0 movaps xmm1,xmmword ptr[eax+4*4] @@ -317,10 +324,11 @@ void flext::CopySamples(t_sample *dst,const t_sample *src,int cnt) // aligned src, unaligned dst __asm { mov eax,dword ptr [src] - prefetcht0 [eax] mov edx,dword ptr [dst] mov ecx,[n] - loopau: +loopau: + prefetcht0 [eax+64] + prefetcht0 [eax+96] movaps xmm0,xmmword ptr[eax] movups xmmword ptr[edx],xmm0 movaps xmm1,xmmword ptr[eax+4*4] @@ -341,10 +349,11 @@ void flext::CopySamples(t_sample *dst,const t_sample *src,int cnt) // unaligned src, aligned dst __asm { mov eax,dword ptr [src] - prefetcht0 [eax] mov edx,dword ptr [dst] mov ecx,[n] loopua: + prefetcht0 [eax+64] + prefetcht0 [eax+96] movups xmm0,xmmword ptr[eax] movaps xmmword ptr[edx],xmm0 movups xmm1,xmmword ptr[eax+4*4] @@ -363,10 +372,11 @@ loopua: // unaligned src, unaligned dst __asm { mov eax,dword ptr [src] - prefetcht0 [eax] mov edx,dword ptr [dst] mov ecx,[n] loopuu: + prefetcht0 [eax+64] + prefetcht0 [eax+96] movups xmm0,xmmword ptr[eax] movups xmmword ptr[edx],xmm0 movups xmm1,xmmword ptr[eax+4*4] @@ -502,6 +512,10 @@ void flext::MulSamples(t_sample *dst,const t_sample *src,t_sample op,int cnt) cnt -= n<<4; __asm { + mov eax,dword ptr [src] + prefetcht0 [eax+0] + prefetcht0 [eax+32] + movss xmm0,xmmword ptr [op] shufps xmm0,xmm0,0 } @@ -515,6 +529,9 @@ void flext::MulSamples(t_sample *dst,const t_sample *src,t_sample op,int cnt) mov eax,dword ptr [src] mov edx,dword ptr [dst] loopa: + prefetcht0 [eax+64] + prefetcht0 [eax+96] + movaps xmm1,xmmword ptr[eax] mulps xmm1,xmm0 movaps xmmword ptr[edx],xmm1 @@ -543,6 +560,9 @@ loopa: mov eax,dword ptr [src] mov edx,dword ptr [dst] loopu: + prefetcht0 [eax+64] + prefetcht0 [eax+96] + movups xmm1,xmmword ptr[eax] mulps xmm1,xmm0 movups xmmword ptr[edx],xmm1 @@ -619,75 +639,171 @@ void flext::MulSamples(t_sample *dst,const t_sample *src,const t_sample *op,int int n = cnt>>4; cnt -= n<<4; + __asm { + mov eax,[src] + mov ebx,[op] + prefetcht0 [eax+0] + prefetcht0 [ebx+0] + prefetcht0 [eax+32] + prefetcht0 [ebx+32] + } + if((reinterpret_cast(src)&(__alignof(__m128)-1)) == 0 && (reinterpret_cast(dst)&(__alignof(__m128)-1)) == 0 - && (reinterpret_cast(op)&(__alignof(__m128)-1)) == 0 - ) { - // aligned version - __asm { - mov ecx,[n] - mov eax,dword ptr [src] - mov edx,dword ptr [dst] - mov ebx,dword ptr [op] -loopa: - movaps xmm0,xmmword ptr[eax] - movaps xmm1,xmmword ptr[ebx] - mulps xmm0,xmm1 - movaps xmmword ptr[edx],xmm0 + ) { + if((reinterpret_cast(op)&(__alignof(__m128)-1)) == 0) { + __asm { + mov ecx,[n] + mov eax,dword ptr [src] + mov edx,dword ptr [dst] + mov ebx,dword ptr [op] + loopaa: + prefetcht0 [eax+64] + prefetcht0 [ebx+64] + prefetcht0 [eax+96] + prefetcht0 [ebx+96] - movaps xmm2,xmmword ptr[eax+4*4] - movaps xmm3,xmmword ptr[ebx+4*4] - mulps xmm2,xmm3 - movaps xmmword ptr[edx+4*4],xmm2 + movaps xmm0,xmmword ptr[eax] + movaps xmm1,xmmword ptr[ebx] + mulps xmm0,xmm1 + movaps xmmword ptr[edx],xmm0 - movaps xmm4,xmmword ptr[eax+8*4] - movaps xmm5,xmmword ptr[ebx+8*4] - mulps xmm4,xmm5 - movaps xmmword ptr[edx+8*4],xmm4 + movaps xmm2,xmmword ptr[eax+4*4] + movaps xmm3,xmmword ptr[ebx+4*4] + mulps xmm2,xmm3 + movaps xmmword ptr[edx+4*4],xmm2 - movaps xmm6,xmmword ptr[eax+12*4] - movaps xmm7,xmmword ptr[ebx+12*4] - mulps xmm6,xmm7 - movaps xmmword ptr[edx+12*4],xmm6 + movaps xmm4,xmmword ptr[eax+8*4] + movaps xmm5,xmmword ptr[ebx+8*4] + mulps xmm4,xmm5 + movaps xmmword ptr[edx+8*4],xmm4 - add eax,16*4 - add ebx,16*4 - add edx,16*4 - loop loopa + movaps xmm6,xmmword ptr[eax+12*4] + movaps xmm7,xmmword ptr[ebx+12*4] + mulps xmm6,xmm7 + movaps xmmword ptr[edx+12*4],xmm6 + + add eax,16*4 + add ebx,16*4 + add edx,16*4 + loop loopaa + } } - } + else { + __asm { + mov ecx,[n] + mov eax,dword ptr [src] + mov edx,dword ptr [dst] + mov ebx,dword ptr [op] + loopau: + prefetcht0 [eax+64] + prefetcht0 [ebx+64] + prefetcht0 [eax+96] + prefetcht0 [ebx+96] + + movaps xmm0,xmmword ptr[eax] + movups xmm1,xmmword ptr[ebx] + mulps xmm0,xmm1 + movaps xmmword ptr[edx],xmm0 + + movaps xmm2,xmmword ptr[eax+4*4] + movups xmm3,xmmword ptr[ebx+4*4] + mulps xmm2,xmm3 + movaps xmmword ptr[edx+4*4],xmm2 + + movaps xmm4,xmmword ptr[eax+8*4] + movups xmm5,xmmword ptr[ebx+8*4] + mulps xmm4,xmm5 + movaps xmmword ptr[edx+8*4],xmm4 + + movaps xmm6,xmmword ptr[eax+12*4] + movups xmm7,xmmword ptr[ebx+12*4] + mulps xmm6,xmm7 + movaps xmmword ptr[edx+12*4],xmm6 + + add eax,16*4 + add ebx,16*4 + add edx,16*4 + loop loopau + } + } + } else { - // unaligned version - __asm { - mov ecx,[n] - mov eax,dword ptr [src] - mov edx,dword ptr [dst] - mov ebx,dword ptr [op] -loopu: - movups xmm0,xmmword ptr[eax] - movups xmm1,xmmword ptr[ebx] - mulps xmm0,xmm1 - movups xmmword ptr[edx],xmm0 + if((reinterpret_cast(op)&(__alignof(__m128)-1)) == 0) { + __asm { + mov ecx,[n] + mov eax,dword ptr [src] + mov edx,dword ptr [dst] + mov ebx,dword ptr [op] + loopua: + prefetcht0 [eax+64] + prefetcht0 [ebx+64] + prefetcht0 [eax+96] + prefetcht0 [ebx+96] - movups xmm2,xmmword ptr[eax+4*4] - movups xmm3,xmmword ptr[ebx+4*4] - mulps xmm2,xmm3 - movups xmmword ptr[edx+4*4],xmm2 + movups xmm0,xmmword ptr[eax] + movaps xmm1,xmmword ptr[ebx] + mulps xmm0,xmm1 + movups xmmword ptr[edx],xmm0 - movups xmm4,xmmword ptr[eax+8*4] - movups xmm5,xmmword ptr[ebx+8*4] - mulps xmm4,xmm5 - movups xmmword ptr[edx+8*4],xmm4 + movups xmm2,xmmword ptr[eax+4*4] + movaps xmm3,xmmword ptr[ebx+4*4] + mulps xmm2,xmm3 + movups xmmword ptr[edx+4*4],xmm2 - movups xmm6,xmmword ptr[eax+12*4] - movups xmm7,xmmword ptr[ebx+12*4] - mulps xmm6,xmm7 - movups xmmword ptr[edx+12*4],xmm6 + movups xmm4,xmmword ptr[eax+8*4] + movaps xmm5,xmmword ptr[ebx+8*4] + mulps xmm4,xmm5 + movups xmmword ptr[edx+8*4],xmm4 - add eax,16*4 - add ebx,16*4 - add edx,16*4 - loop loopu + movups xmm6,xmmword ptr[eax+12*4] + movaps xmm7,xmmword ptr[ebx+12*4] + mulps xmm6,xmm7 + movups xmmword ptr[edx+12*4],xmm6 + + add eax,16*4 + add ebx,16*4 + add edx,16*4 + loop loopua + } + } + else { + __asm { + mov ecx,[n] + mov eax,dword ptr [src] + mov edx,dword ptr [dst] + mov ebx,dword ptr [op] +loopuu: + prefetcht0 [eax+64] + prefetcht0 [ebx+64] + prefetcht0 [eax+96] + prefetcht0 [ebx+96] + + movups xmm0,xmmword ptr[eax] + movups xmm1,xmmword ptr[ebx] + mulps xmm0,xmm1 + movups xmmword ptr[edx],xmm0 + + movups xmm2,xmmword ptr[eax+4*4] + movups xmm3,xmmword ptr[ebx+4*4] + mulps xmm2,xmm3 + movups xmmword ptr[edx+4*4],xmm2 + + movups xmm4,xmmword ptr[eax+8*4] + movups xmm5,xmmword ptr[ebx+8*4] + mulps xmm4,xmm5 + movups xmmword ptr[edx+8*4],xmm4 + + movups xmm6,xmmword ptr[eax+12*4] + movups xmm7,xmmword ptr[ebx+12*4] + mulps xmm6,xmm7 + movups xmmword ptr[edx+12*4],xmm6 + + add eax,16*4 + add ebx,16*4 + add edx,16*4 + loop loopuu + } } } while(cnt--) *(dst++) = *(src++) * *(op++); @@ -748,6 +864,10 @@ void flext::AddSamples(t_sample *dst,const t_sample *src,t_sample op,int cnt) cnt -= n<<4; __asm { + mov eax,[src] + prefetcht0 [eax+0] + prefetcht0 [eax+32] + movss xmm0,xmmword ptr [op] shufps xmm0,xmm0,0 } @@ -761,6 +881,9 @@ void flext::AddSamples(t_sample *dst,const t_sample *src,t_sample op,int cnt) mov eax,dword ptr [src] mov edx,dword ptr [dst] loopa: + prefetcht0 [eax+64] + prefetcht0 [eax+96] + movaps xmm1,xmmword ptr[eax] addps xmm1,xmm0 movaps xmmword ptr[edx],xmm1 @@ -789,6 +912,9 @@ loopa: mov eax,dword ptr [src] mov edx,dword ptr [dst] loopu: + prefetcht0 [eax+64] + prefetcht0 [eax+96] + movups xmm1,xmmword ptr[eax] addps xmm1,xmm0 movups xmmword ptr[edx],xmm1 @@ -867,79 +993,176 @@ void flext::AddSamples(t_sample *dst,const t_sample *src,const t_sample *op,int #ifdef FLEXT_USE_SIMD #ifdef _MSC_VER if(GetSIMDCapabilities()&simd_sse) { + // Prefetch cache + __asm { + mov eax,dword ptr [src] + mov ebx,dword ptr [op] + prefetcht0 [eax] + prefetcht0 [ebx] + prefetcht0 [eax+32] + prefetcht0 [ebx+32] + } + // single precision int n = cnt>>4; cnt -= n<<4; if((reinterpret_cast(src)&(__alignof(__m128)-1)) == 0 && (reinterpret_cast(dst)&(__alignof(__m128)-1)) == 0 - && (reinterpret_cast(op)&(__alignof(__m128)-1)) == 0 - ) { - // aligned version - __asm { - mov ecx,dword ptr [n] - mov eax,dword ptr [src] - mov edx,dword ptr [dst] - mov ebx,dword ptr [op] -loopa: - movaps xmm0,xmmword ptr[eax] - movaps xmm1,xmmword ptr[ebx] - addps xmm0,xmm1 - movaps xmmword ptr[edx],xmm0 + ) { + if((reinterpret_cast(op)&(__alignof(__m128)-1)) == 0) { + __asm { + mov ecx,dword ptr [n] + mov eax,dword ptr [src] + mov edx,dword ptr [dst] + mov ebx,dword ptr [op] + loopaa: + prefetcht0 [eax+64] + prefetcht0 [ebx+64] + prefetcht0 [eax+96] + prefetcht0 [ebx+96] - movaps xmm2,xmmword ptr[eax+4*4] - movaps xmm3,xmmword ptr[ebx+4*4] - addps xmm2,xmm3 - movaps xmmword ptr[edx+4*4],xmm2 + movaps xmm0,xmmword ptr[eax] + movaps xmm1,xmmword ptr[ebx] + addps xmm0,xmm1 + movaps xmmword ptr[edx],xmm0 - movaps xmm4,xmmword ptr[eax+8*4] - movaps xmm5,xmmword ptr[ebx+8*4] - addps xmm4,xmm5 - movaps xmmword ptr[edx+8*4],xmm4 + movaps xmm2,xmmword ptr[eax+4*4] + movaps xmm3,xmmword ptr[ebx+4*4] + addps xmm2,xmm3 + movaps xmmword ptr[edx+4*4],xmm2 - movaps xmm6,xmmword ptr[eax+12*4] - movaps xmm7,xmmword ptr[ebx+12*4] - addps xmm6,xmm7 - movaps xmmword ptr[edx+12*4],xmm6 + movaps xmm4,xmmword ptr[eax+8*4] + movaps xmm5,xmmword ptr[ebx+8*4] + addps xmm4,xmm5 + movaps xmmword ptr[edx+8*4],xmm4 - add eax,16*4 - add ebx,16*4 - add edx,16*4 - loop loopa + movaps xmm6,xmmword ptr[eax+12*4] + movaps xmm7,xmmword ptr[ebx+12*4] + addps xmm6,xmm7 + movaps xmmword ptr[edx+12*4],xmm6 + + add eax,16*4 + add ebx,16*4 + add edx,16*4 + loop loopaa + } + } + else { + __asm { + mov ecx,dword ptr [n] + mov eax,dword ptr [src] + mov edx,dword ptr [dst] + mov ebx,dword ptr [op] + loopau: + prefetcht0 [eax+64] + prefetcht0 [ebx+64] + prefetcht0 [eax+96] + prefetcht0 [ebx+96] + + movaps xmm0,xmmword ptr[eax] + movups xmm1,xmmword ptr[ebx] + addps xmm0,xmm1 + movaps xmmword ptr[edx],xmm0 + + movaps xmm2,xmmword ptr[eax+4*4] + movups xmm3,xmmword ptr[ebx+4*4] + addps xmm2,xmm3 + movaps xmmword ptr[edx+4*4],xmm2 + + movaps xmm4,xmmword ptr[eax+8*4] + movups xmm5,xmmword ptr[ebx+8*4] + addps xmm4,xmm5 + movaps xmmword ptr[edx+8*4],xmm4 + + movaps xmm6,xmmword ptr[eax+12*4] + movups xmm7,xmmword ptr[ebx+12*4] + addps xmm6,xmm7 + movaps xmmword ptr[edx+12*4],xmm6 + + add eax,16*4 + add ebx,16*4 + add edx,16*4 + loop loopau + } } } else { - // unaligned version - __asm { - mov ecx,dword ptr [n] - mov eax,dword ptr [src] - mov edx,dword ptr [dst] - mov ebx,dword ptr [op] -loopu: - movups xmm0,xmmword ptr[eax] - movups xmm1,xmmword ptr[ebx] - addps xmm0,xmm1 - movups xmmword ptr[edx],xmm0 + if((reinterpret_cast(op)&(__alignof(__m128)-1)) == 0) { + __asm { + mov ecx,dword ptr [n] + mov eax,dword ptr [src] + mov edx,dword ptr [dst] + mov ebx,dword ptr [op] + loopua: + prefetcht0 [eax+64] + prefetcht0 [ebx+64] + prefetcht0 [eax+96] + prefetcht0 [ebx+96] - movups xmm2,xmmword ptr[eax+4*4] - movups xmm3,xmmword ptr[ebx+4*4] - addps xmm2,xmm3 - movups xmmword ptr[edx+4*4],xmm2 + movups xmm0,xmmword ptr[eax] + movaps xmm1,xmmword ptr[ebx] + addps xmm0,xmm1 + movups xmmword ptr[edx],xmm0 - movups xmm4,xmmword ptr[eax+8*4] - movups xmm5,xmmword ptr[ebx+8*4] - addps xmm4,xmm5 - movups xmmword ptr[edx+8*4],xmm4 + movups xmm2,xmmword ptr[eax+4*4] + movaps xmm3,xmmword ptr[ebx+4*4] + addps xmm2,xmm3 + movups xmmword ptr[edx+4*4],xmm2 - movups xmm6,xmmword ptr[eax+12*4] - movups xmm7,xmmword ptr[ebx+12*4] - addps xmm6,xmm7 - movups xmmword ptr[edx+12*4],xmm6 + movups xmm4,xmmword ptr[eax+8*4] + movaps xmm5,xmmword ptr[ebx+8*4] + addps xmm4,xmm5 + movups xmmword ptr[edx+8*4],xmm4 - add eax,16*4 - add ebx,16*4 - add edx,16*4 - loop loopu + movups xmm6,xmmword ptr[eax+12*4] + movaps xmm7,xmmword ptr[ebx+12*4] + addps xmm6,xmm7 + movups xmmword ptr[edx+12*4],xmm6 + + add eax,16*4 + add ebx,16*4 + add edx,16*4 + loop loopua + } + } + else { + __asm { + mov ecx,dword ptr [n] + mov eax,dword ptr [src] + mov edx,dword ptr [dst] + mov ebx,dword ptr [op] + loopuu: + prefetcht0 [eax+64] + prefetcht0 [ebx+64] + prefetcht0 [eax+96] + prefetcht0 [ebx+96] + + movups xmm0,xmmword ptr[eax] + movups xmm1,xmmword ptr[ebx] + addps xmm0,xmm1 + movups xmmword ptr[edx],xmm0 + + movups xmm2,xmmword ptr[eax+4*4] + movups xmm3,xmmword ptr[ebx+4*4] + addps xmm2,xmm3 + movups xmmword ptr[edx+4*4],xmm2 + + movups xmm4,xmmword ptr[eax+8*4] + movups xmm5,xmmword ptr[ebx+8*4] + addps xmm4,xmm5 + movups xmmword ptr[edx+8*4],xmm4 + + movups xmm6,xmmword ptr[eax+12*4] + movups xmm7,xmmword ptr[ebx+12*4] + addps xmm6,xmm7 + movups xmmword ptr[edx+12*4],xmm6 + + add eax,16*4 + add ebx,16*4 + add edx,16*4 + loop loopuu + } } } while(cnt--) *(dst++) = *(src++) + *(op++); @@ -1008,6 +1231,10 @@ void flext::ScaleSamples(t_sample *dst,const t_sample *src,t_sample opmul,t_samp cnt -= n<<4; __asm { + mov eax,dword ptr [src] + prefetcht0 [eax+0] + prefetcht0 [eax+32] + movss xmm0,xmmword ptr [opadd] shufps xmm0,xmm0,0 movss xmm1,xmmword ptr [opmul] @@ -1023,6 +1250,9 @@ void flext::ScaleSamples(t_sample *dst,const t_sample *src,t_sample opmul,t_samp mov eax,dword ptr [src] mov edx,dword ptr [dst] loopa: + prefetcht0 [eax+64] + prefetcht0 [eax+96] + movaps xmm2,xmmword ptr[eax] mulps xmm2,xmm1 addps xmm2,xmm0 @@ -1055,6 +1285,9 @@ loopa: mov eax,dword ptr [src] mov edx,dword ptr [dst] loopu: + prefetcht0 [eax+64] + prefetcht0 [eax+96] + movups xmm2,xmmword ptr[eax] mulps xmm2,xmm1 addps xmm2,xmm0 -- cgit v1.2.1