From 44d97258734b898c3cb0f8cdfd199352de2bfc2d Mon Sep 17 00:00:00 2001
From: Thomas Grill <xovo@users.sourceforge.net>
Date: Mon, 21 Jul 2003 02:37:30 +0000
Subject:  ""

svn path=/trunk/; revision=782
---
 externals/grill/flext/source/flattr_ed.cpp |  57 +---
 externals/grill/flext/source/flbase.h      |   7 +
 externals/grill/flext/source/flprefix.h    |   1 -
 externals/grill/flext/source/flsimd.cpp    | 471 +++++++++++++++++++++--------
 4 files changed, 362 insertions(+), 174 deletions(-)

(limited to 'externals/grill/flext/source')

diff --git a/externals/grill/flext/source/flattr_ed.cpp b/externals/grill/flext/source/flattr_ed.cpp
index ec096ef5..d138a97b 100644
--- a/externals/grill/flext/source/flattr_ed.cpp
+++ b/externals/grill/flext/source/flattr_ed.cpp
@@ -12,7 +12,7 @@ WARRANTIES, see the file, "license.txt," in this distribution.
     \brief Attribute editor (property dialog) for PD
 */
 
-#include "flprefix.h"
+#include "flext.h"
 
 #if FLEXT_SYS == FLEXT_SYS_PD && !defined(FLEXT_NOATTREDIT)
 
@@ -20,64 +20,13 @@ WARRANTIES, see the file, "license.txt," in this distribution.
 #pragma warning( disable : 4091 ) 
 #endif
 
+// NOTE: the following includes are problematic: m_imp.h and g_canvas.h are non-public PD headers.
 #include <m_imp.h>
-#include "flext.h"
+#include <g_canvas.h>
 
 #include <string.h>
 #include <stdio.h>
 
-#ifdef __MWERKS__
-#define STD std
-#else
-#define STD
-#endif
-
-
-#if !defined(PD_VERSION_MAJOR)
-	/* PD version 0.36 or below */
-
-	/* Call this to get a gobj's bounding rectangle in pixels */
-	typedef void (*t_getrectfn)(t_gobj *x, struct _glist *glist,
-		int *x1, int *y1, int *x2, int *y2);
-    		/* and this to displace a gobj: */
-	typedef void (*t_displacefn)(t_gobj *x, struct _glist *glist, int dx, int dy);
-    		/* change color to show selection: */
-	typedef void (*t_selectfn)(t_gobj *x, struct _glist *glist, int state);
-    		/* change appearance to show activation/deactivation: */
-	typedef void (*t_activatefn)(t_gobj *x, struct _glist *glist, int state);
-    		/* warn a gobj it's about to be deleted */
-	typedef void (*t_deletefn)(t_gobj *x, struct _glist *glist);
-    		/*  making visible or invisible */
-	typedef void (*t_visfn)(t_gobj *x, struct _glist *glist, int flag);
-    		/* field a mouse click (when not in "edit" mode) */
-	typedef int (*t_clickfn)(t_gobj *x, struct _glist *glist,
-		int xpix, int ypix, int shift, int alt, int dbl, int doit);
-    		/*  save to a binbuf */
-	typedef void (*t_savefn)(t_gobj *x, t_binbuf *b);
-    		/*  open properties dialog */
-	typedef void (*t_propertiesfn)(t_gobj *x, struct _glist *glist);
-    		/* ... and later, resizing; getting/setting font or color... */
-
-	struct _widgetbehavior
-	{
-		t_getrectfn w_getrectfn;
-		t_displacefn w_displacefn;
-		t_selectfn w_selectfn;
-		t_activatefn w_activatefn;
-		t_deletefn w_deletefn;
-		t_visfn w_visfn;
-		t_clickfn w_clickfn;
-		t_savefn w_savefn;
-		t_propertiesfn w_propertiesfn;
-	};
-	
-#elif !defined(PD_VERSION_MINOR)
-	#error Flext cannot be compiled with this version!
-#else
-	#include <g_canvas.h>
-#endif
-
-
 static t_widgetbehavior widgetbehavior; 
 static void (*ori_vis)(t_gobj *c, t_glist *, int vis) = NULL;
 
diff --git a/externals/grill/flext/source/flbase.h b/externals/grill/flext/source/flbase.h
index 795c0673..eec2e520 100644
--- a/externals/grill/flext/source/flbase.h
+++ b/externals/grill/flext/source/flbase.h
@@ -21,6 +21,13 @@ WARRANTIES, see the file, "license.txt," in this distribution.
 #include "flsupport.h"
 
 
+// ----- disable attribute editor for PD versions older than devel_0_36 or 0.37 (detected by PD_MAJOR_VERSION being undefined)
+#ifndef PD_MAJOR_VERSION
+#undef FLEXT_NOATTREDIT
+#define FLEXT_NOATTREDIT
+#endif
+
+
 class FLEXT_SHARE FLEXT_CLASSDEF(flext_obj);
 typedef class FLEXT_CLASSDEF(flext_obj) flext_obj;
 
diff --git a/externals/grill/flext/source/flprefix.h b/externals/grill/flext/source/flprefix.h
index 7ab517df..c27d1a67 100755
--- a/externals/grill/flext/source/flprefix.h
+++ b/externals/grill/flext/source/flprefix.h
@@ -371,7 +371,6 @@ WARRANTIES, see the file, "license.txt," in this distribution.
 	#define FLEXT_CLASSDEF(CL) CL##_single
 #endif
 
-
 // std namespace
 #ifdef __MWERKS__
 #define STD std
diff --git a/externals/grill/flext/source/flsimd.cpp b/externals/grill/flext/source/flsimd.cpp
index 3dcee887..88cbdb89 100755
--- a/externals/grill/flext/source/flsimd.cpp
+++ b/externals/grill/flext/source/flsimd.cpp
@@ -287,6 +287,12 @@ void flext::CopySamples(t_sample *dst,const t_sample *src,int cnt)
     if(GetSIMDCapabilities()&simd_sse) {
         // single precision
 
+		__asm {
+			mov		eax,dword ptr [src]
+			prefetcht0 [eax+0]
+			prefetcht0 [eax+32]
+		}
+
    	    int n = cnt>>4;
         cnt -= n<<4;
 
@@ -295,10 +301,11 @@ void flext::CopySamples(t_sample *dst,const t_sample *src,int cnt)
 				// aligned src, aligned dst
 				__asm {
 					mov		eax,dword ptr [src]
-					prefetcht0 [eax]
 					mov		edx,dword ptr [dst]
 					mov		ecx,[n]
-	loopaa:
+loopaa:
+					prefetcht0 [eax+64]
+					prefetcht0 [eax+96]
 					movaps	xmm0,xmmword ptr[eax]
 					movaps	xmmword ptr[edx],xmm0
 					movaps	xmm1,xmmword ptr[eax+4*4]
@@ -317,10 +324,11 @@ void flext::CopySamples(t_sample *dst,const t_sample *src,int cnt)
 				// aligned src, unaligned dst
 				__asm {
 					mov		eax,dword ptr [src]
-					prefetcht0 [eax]
 					mov		edx,dword ptr [dst]
 					mov		ecx,[n]
-	loopau:
+loopau:
+					prefetcht0 [eax+64]
+					prefetcht0 [eax+96]
 					movaps	xmm0,xmmword ptr[eax]
 					movups	xmmword ptr[edx],xmm0
 					movaps	xmm1,xmmword ptr[eax+4*4]
@@ -341,10 +349,11 @@ void flext::CopySamples(t_sample *dst,const t_sample *src,int cnt)
 				// unaligned src, aligned dst
 				__asm {
 					mov		eax,dword ptr [src]
-					prefetcht0 [eax]
 					mov		edx,dword ptr [dst]
 					mov		ecx,[n]
 loopua:
+					prefetcht0 [eax+64]
+					prefetcht0 [eax+96]
 					movups	xmm0,xmmword ptr[eax]
 					movaps	xmmword ptr[edx],xmm0
 					movups	xmm1,xmmword ptr[eax+4*4]
@@ -363,10 +372,11 @@ loopua:
 				// unaligned src, unaligned dst
 				__asm {
 					mov		eax,dword ptr [src]
-					prefetcht0 [eax]
 					mov		edx,dword ptr [dst]
 					mov		ecx,[n]
 loopuu:
+					prefetcht0 [eax+64]
+					prefetcht0 [eax+96]
 					movups	xmm0,xmmword ptr[eax]
 					movups	xmmword ptr[edx],xmm0
 					movups	xmm1,xmmword ptr[eax+4*4]
@@ -502,6 +512,10 @@ void flext::MulSamples(t_sample *dst,const t_sample *src,t_sample op,int cnt)
         cnt -= n<<4;
 
         __asm {
+			mov		eax,dword ptr [src]
+			prefetcht0 [eax+0]
+			prefetcht0 [eax+32]
+
 			movss	xmm0,xmmword ptr [op]
 			shufps	xmm0,xmm0,0
 		}
@@ -515,6 +529,9 @@ void flext::MulSamples(t_sample *dst,const t_sample *src,t_sample op,int cnt)
 				mov		eax,dword ptr [src]
 				mov		edx,dword ptr [dst]
 loopa:
+				prefetcht0 [eax+64]
+				prefetcht0 [eax+96]
+
 				movaps	xmm1,xmmword ptr[eax]
 				mulps	xmm1,xmm0
 				movaps	xmmword ptr[edx],xmm1
@@ -543,6 +560,9 @@ loopa:
 				mov		eax,dword ptr [src]
 				mov		edx,dword ptr [dst]
 loopu:
+				prefetcht0 [eax+64]
+				prefetcht0 [eax+96]
+
 				movups	xmm1,xmmword ptr[eax]
 				mulps	xmm1,xmm0
 				movups	xmmword ptr[edx],xmm1
@@ -619,75 +639,171 @@ void flext::MulSamples(t_sample *dst,const t_sample *src,const t_sample *op,int
    	    int n = cnt>>4;
         cnt -= n<<4;
 
+		__asm {
+			mov		eax,[src]
+			mov		ebx,[op]
+			prefetcht0 [eax+0]
+			prefetcht0 [ebx+0]
+			prefetcht0 [eax+32]
+			prefetcht0 [ebx+32]
+		}
+
         if((reinterpret_cast<unsigned long>(src)&(__alignof(__m128)-1)) == 0
             && (reinterpret_cast<unsigned long>(dst)&(__alignof(__m128)-1)) == 0
-            && (reinterpret_cast<unsigned long>(op)&(__alignof(__m128)-1)) == 0
-        ) {
-            // aligned version
-	        __asm {
-				mov		ecx,[n]
-				mov		eax,dword ptr [src]
-				mov		edx,dword ptr [dst]
-				mov		ebx,dword ptr [op]
-loopa:
-				movaps	xmm0,xmmword ptr[eax]
-				movaps	xmm1,xmmword ptr[ebx]
-				mulps	xmm0,xmm1
-				movaps	xmmword ptr[edx],xmm0
+		) {
+			if((reinterpret_cast<unsigned long>(op)&(__alignof(__m128)-1)) == 0) {
+				__asm {
+					mov		ecx,[n]
+					mov		eax,dword ptr [src]
+					mov		edx,dword ptr [dst]
+					mov		ebx,dword ptr [op]
+	loopaa:
+					prefetcht0 [eax+64]
+					prefetcht0 [ebx+64]
+					prefetcht0 [eax+96]
+					prefetcht0 [ebx+96]
 
-				movaps	xmm2,xmmword ptr[eax+4*4]
-				movaps	xmm3,xmmword ptr[ebx+4*4]
-				mulps	xmm2,xmm3
-				movaps	xmmword ptr[edx+4*4],xmm2
+					movaps	xmm0,xmmword ptr[eax]
+					movaps	xmm1,xmmword ptr[ebx]
+					mulps	xmm0,xmm1
+					movaps	xmmword ptr[edx],xmm0
 
-				movaps	xmm4,xmmword ptr[eax+8*4]
-				movaps	xmm5,xmmword ptr[ebx+8*4]
-				mulps	xmm4,xmm5
-				movaps	xmmword ptr[edx+8*4],xmm4
+					movaps	xmm2,xmmword ptr[eax+4*4]
+					movaps	xmm3,xmmword ptr[ebx+4*4]
+					mulps	xmm2,xmm3
+					movaps	xmmword ptr[edx+4*4],xmm2
 
-				movaps	xmm6,xmmword ptr[eax+12*4]
-				movaps	xmm7,xmmword ptr[ebx+12*4]
-				mulps	xmm6,xmm7
-				movaps	xmmword ptr[edx+12*4],xmm6
+					movaps	xmm4,xmmword ptr[eax+8*4]
+					movaps	xmm5,xmmword ptr[ebx+8*4]
+					mulps	xmm4,xmm5
+					movaps	xmmword ptr[edx+8*4],xmm4
 
-				add		eax,16*4
-				add		ebx,16*4
-				add		edx,16*4
-				loop	loopa
+					movaps	xmm6,xmmword ptr[eax+12*4]
+					movaps	xmm7,xmmword ptr[ebx+12*4]
+					mulps	xmm6,xmm7
+					movaps	xmmword ptr[edx+12*4],xmm6
+
+					add		eax,16*4
+					add		ebx,16*4
+					add		edx,16*4
+					loop	loopaa
+				}
 			}
-        }
+			else {
+				__asm {
+					mov		ecx,[n]
+					mov		eax,dword ptr [src]
+					mov		edx,dword ptr [dst]
+					mov		ebx,dword ptr [op]
+	loopau:
+					prefetcht0 [eax+64]
+					prefetcht0 [ebx+64]
+					prefetcht0 [eax+96]
+					prefetcht0 [ebx+96]
+
+					movaps	xmm0,xmmword ptr[eax]
+					movups	xmm1,xmmword ptr[ebx]
+					mulps	xmm0,xmm1
+					movaps	xmmword ptr[edx],xmm0
+
+					movaps	xmm2,xmmword ptr[eax+4*4]
+					movups	xmm3,xmmword ptr[ebx+4*4]
+					mulps	xmm2,xmm3
+					movaps	xmmword ptr[edx+4*4],xmm2
+
+					movaps	xmm4,xmmword ptr[eax+8*4]
+					movups	xmm5,xmmword ptr[ebx+8*4]
+					mulps	xmm4,xmm5
+					movaps	xmmword ptr[edx+8*4],xmm4
+
+					movaps	xmm6,xmmword ptr[eax+12*4]
+					movups	xmm7,xmmword ptr[ebx+12*4]
+					mulps	xmm6,xmm7
+					movaps	xmmword ptr[edx+12*4],xmm6
+
+					add		eax,16*4
+					add		ebx,16*4
+					add		edx,16*4
+					loop	loopau
+				}
+			}
+		}
         else {
-            // unaligned version
-            __asm {
-				mov		ecx,[n]
-				mov		eax,dword ptr [src]
-				mov		edx,dword ptr [dst]
-				mov		ebx,dword ptr [op]
-loopu:
-				movups	xmm0,xmmword ptr[eax]
-				movups	xmm1,xmmword ptr[ebx]
-				mulps	xmm0,xmm1
-				movups	xmmword ptr[edx],xmm0
+			if((reinterpret_cast<unsigned long>(op)&(__alignof(__m128)-1)) == 0) {
+				__asm {
+					mov		ecx,[n]
+					mov		eax,dword ptr [src]
+					mov		edx,dword ptr [dst]
+					mov		ebx,dword ptr [op]
+	loopua:
+					prefetcht0 [eax+64]
+					prefetcht0 [ebx+64]
+					prefetcht0 [eax+96]
+					prefetcht0 [ebx+96]
 
-				movups	xmm2,xmmword ptr[eax+4*4]
-				movups	xmm3,xmmword ptr[ebx+4*4]
-				mulps	xmm2,xmm3
-				movups	xmmword ptr[edx+4*4],xmm2
+					movups	xmm0,xmmword ptr[eax]
+					movaps	xmm1,xmmword ptr[ebx]
+					mulps	xmm0,xmm1
+					movups	xmmword ptr[edx],xmm0
 
-				movups	xmm4,xmmword ptr[eax+8*4]
-				movups	xmm5,xmmword ptr[ebx+8*4]
-				mulps	xmm4,xmm5
-				movups	xmmword ptr[edx+8*4],xmm4
+					movups	xmm2,xmmword ptr[eax+4*4]
+					movaps	xmm3,xmmword ptr[ebx+4*4]
+					mulps	xmm2,xmm3
+					movups	xmmword ptr[edx+4*4],xmm2
 
-				movups	xmm6,xmmword ptr[eax+12*4]
-				movups	xmm7,xmmword ptr[ebx+12*4]
-				mulps	xmm6,xmm7
-				movups	xmmword ptr[edx+12*4],xmm6
+					movups	xmm4,xmmword ptr[eax+8*4]
+					movaps	xmm5,xmmword ptr[ebx+8*4]
+					mulps	xmm4,xmm5
+					movups	xmmword ptr[edx+8*4],xmm4
 
-				add		eax,16*4
-				add		ebx,16*4
-				add		edx,16*4
-				loop	loopu
+					movups	xmm6,xmmword ptr[eax+12*4]
+					movaps	xmm7,xmmword ptr[ebx+12*4]
+					mulps	xmm6,xmm7
+					movups	xmmword ptr[edx+12*4],xmm6
+
+					add		eax,16*4
+					add		ebx,16*4
+					add		edx,16*4
+					loop	loopua
+				}
+			}
+			else {
+				__asm {
+					mov		ecx,[n]
+					mov		eax,dword ptr [src]
+					mov		edx,dword ptr [dst]
+					mov		ebx,dword ptr [op]
+loopuu:
+					prefetcht0 [eax+64]
+					prefetcht0 [ebx+64]
+					prefetcht0 [eax+96]
+					prefetcht0 [ebx+96]
+
+					movups	xmm0,xmmword ptr[eax]
+					movups	xmm1,xmmword ptr[ebx]
+					mulps	xmm0,xmm1
+					movups	xmmword ptr[edx],xmm0
+
+					movups	xmm2,xmmword ptr[eax+4*4]
+					movups	xmm3,xmmword ptr[ebx+4*4]
+					mulps	xmm2,xmm3
+					movups	xmmword ptr[edx+4*4],xmm2
+
+					movups	xmm4,xmmword ptr[eax+8*4]
+					movups	xmm5,xmmword ptr[ebx+8*4]
+					mulps	xmm4,xmm5
+					movups	xmmword ptr[edx+8*4],xmm4
+
+					movups	xmm6,xmmword ptr[eax+12*4]
+					movups	xmm7,xmmword ptr[ebx+12*4]
+					mulps	xmm6,xmm7
+					movups	xmmword ptr[edx+12*4],xmm6
+
+					add		eax,16*4
+					add		ebx,16*4
+					add		edx,16*4
+					loop	loopuu
+				}
 			}
         }
 	    while(cnt--) *(dst++) = *(src++) * *(op++); 
@@ -748,6 +864,10 @@ void flext::AddSamples(t_sample *dst,const t_sample *src,t_sample op,int cnt)
         cnt -= n<<4;
 
         __asm {
+			mov		eax,[src]
+			prefetcht0 [eax+0]
+			prefetcht0 [eax+32]
+
 			movss	xmm0,xmmword ptr [op]
 			shufps	xmm0,xmm0,0
 		}
@@ -761,6 +881,9 @@ void flext::AddSamples(t_sample *dst,const t_sample *src,t_sample op,int cnt)
 				mov		eax,dword ptr [src]
 				mov		edx,dword ptr [dst]
 loopa:
+				prefetcht0 [eax+64]
+				prefetcht0 [eax+96]
+
 				movaps	xmm1,xmmword ptr[eax]
 				addps	xmm1,xmm0
 				movaps	xmmword ptr[edx],xmm1
@@ -789,6 +912,9 @@ loopa:
 				mov		eax,dword ptr [src]
 				mov		edx,dword ptr [dst]
 loopu:
+				prefetcht0 [eax+64]
+				prefetcht0 [eax+96]
+
 				movups	xmm1,xmmword ptr[eax]
 				addps	xmm1,xmm0
 				movups	xmmword ptr[edx],xmm1
@@ -867,79 +993,176 @@ void flext::AddSamples(t_sample *dst,const t_sample *src,const t_sample *op,int
 #ifdef FLEXT_USE_SIMD
 #ifdef _MSC_VER
     if(GetSIMDCapabilities()&simd_sse) {
+		// Prefetch the beginning of the src and op buffers into the cache
+		__asm {
+			mov		eax,dword ptr [src]
+			mov		ebx,dword ptr [op]
+			prefetcht0 [eax]
+			prefetcht0 [ebx]
+			prefetcht0 [eax+32]
+			prefetcht0 [ebx+32]
+		}
+
         // single precision
    	    int n = cnt>>4;
         cnt -= n<<4;
 
         if((reinterpret_cast<unsigned long>(src)&(__alignof(__m128)-1)) == 0
             && (reinterpret_cast<unsigned long>(dst)&(__alignof(__m128)-1)) == 0
-            && (reinterpret_cast<unsigned long>(op)&(__alignof(__m128)-1)) == 0
-        ) {
-            // aligned version
-	        __asm {
-				mov		ecx,dword ptr [n]
-				mov		eax,dword ptr [src]
-				mov		edx,dword ptr [dst]
-				mov		ebx,dword ptr [op]
-loopa:
-				movaps	xmm0,xmmword ptr[eax]
-				movaps	xmm1,xmmword ptr[ebx]
-				addps	xmm0,xmm1
-				movaps	xmmword ptr[edx],xmm0
+		) {
+			if((reinterpret_cast<unsigned long>(op)&(__alignof(__m128)-1)) == 0) {
+				__asm {
+					mov		ecx,dword ptr [n]
+					mov		eax,dword ptr [src]
+					mov		edx,dword ptr [dst]
+					mov		ebx,dword ptr [op]
+	loopaa:
+					prefetcht0 [eax+64]
+					prefetcht0 [ebx+64]
+					prefetcht0 [eax+96]
+					prefetcht0 [ebx+96]
 
-				movaps	xmm2,xmmword ptr[eax+4*4]
-				movaps	xmm3,xmmword ptr[ebx+4*4]
-				addps	xmm2,xmm3
-				movaps	xmmword ptr[edx+4*4],xmm2
+					movaps	xmm0,xmmword ptr[eax]
+					movaps	xmm1,xmmword ptr[ebx]
+					addps	xmm0,xmm1
+					movaps	xmmword ptr[edx],xmm0
 
-				movaps	xmm4,xmmword ptr[eax+8*4]
-				movaps	xmm5,xmmword ptr[ebx+8*4]
-				addps	xmm4,xmm5
-				movaps	xmmword ptr[edx+8*4],xmm4
+					movaps	xmm2,xmmword ptr[eax+4*4]
+					movaps	xmm3,xmmword ptr[ebx+4*4]
+					addps	xmm2,xmm3
+					movaps	xmmword ptr[edx+4*4],xmm2
 
-				movaps	xmm6,xmmword ptr[eax+12*4]
-				movaps	xmm7,xmmword ptr[ebx+12*4]
-				addps	xmm6,xmm7
-				movaps	xmmword ptr[edx+12*4],xmm6
+					movaps	xmm4,xmmword ptr[eax+8*4]
+					movaps	xmm5,xmmword ptr[ebx+8*4]
+					addps	xmm4,xmm5
+					movaps	xmmword ptr[edx+8*4],xmm4
 
-				add		eax,16*4
-				add		ebx,16*4
-				add		edx,16*4
-				loop	loopa
+					movaps	xmm6,xmmword ptr[eax+12*4]
+					movaps	xmm7,xmmword ptr[ebx+12*4]
+					addps	xmm6,xmm7
+					movaps	xmmword ptr[edx+12*4],xmm6
+
+					add		eax,16*4
+					add		ebx,16*4
+					add		edx,16*4
+					loop	loopaa
+				}
+			}
+			else {
+				__asm {
+					mov		ecx,dword ptr [n]
+					mov		eax,dword ptr [src]
+					mov		edx,dword ptr [dst]
+					mov		ebx,dword ptr [op]
+	loopau:
+					prefetcht0 [eax+64]
+					prefetcht0 [ebx+64]
+					prefetcht0 [eax+96]
+					prefetcht0 [ebx+96]
+
+					movaps	xmm0,xmmword ptr[eax]
+					movups	xmm1,xmmword ptr[ebx]
+					addps	xmm0,xmm1
+					movaps	xmmword ptr[edx],xmm0
+
+					movaps	xmm2,xmmword ptr[eax+4*4]
+					movups	xmm3,xmmword ptr[ebx+4*4]
+					addps	xmm2,xmm3
+					movaps	xmmword ptr[edx+4*4],xmm2
+
+					movaps	xmm4,xmmword ptr[eax+8*4]
+					movups	xmm5,xmmword ptr[ebx+8*4]
+					addps	xmm4,xmm5
+					movaps	xmmword ptr[edx+8*4],xmm4
+
+					movaps	xmm6,xmmword ptr[eax+12*4]
+					movups	xmm7,xmmword ptr[ebx+12*4]
+					addps	xmm6,xmm7
+					movaps	xmmword ptr[edx+12*4],xmm6
+
+					add		eax,16*4
+					add		ebx,16*4
+					add		edx,16*4
+					loop	loopau
+				}
 			}
         }
         else {
-            // unaligned version
-	        __asm {
-				mov		ecx,dword ptr [n]
-				mov		eax,dword ptr [src]
-				mov		edx,dword ptr [dst]
-				mov		ebx,dword ptr [op]
-loopu:
-				movups	xmm0,xmmword ptr[eax]
-				movups	xmm1,xmmword ptr[ebx]
-				addps	xmm0,xmm1
-				movups	xmmword ptr[edx],xmm0
+			if((reinterpret_cast<unsigned long>(op)&(__alignof(__m128)-1)) == 0) {
+				__asm {
+					mov		ecx,dword ptr [n]
+					mov		eax,dword ptr [src]
+					mov		edx,dword ptr [dst]
+					mov		ebx,dword ptr [op]
+	loopua:
+					prefetcht0 [eax+64]
+					prefetcht0 [ebx+64]
+					prefetcht0 [eax+96]
+					prefetcht0 [ebx+96]
 
-				movups	xmm2,xmmword ptr[eax+4*4]
-				movups	xmm3,xmmword ptr[ebx+4*4]
-				addps	xmm2,xmm3
-				movups	xmmword ptr[edx+4*4],xmm2
+					movups	xmm0,xmmword ptr[eax]
+					movaps	xmm1,xmmword ptr[ebx]
+					addps	xmm0,xmm1
+					movups	xmmword ptr[edx],xmm0
 
-				movups	xmm4,xmmword ptr[eax+8*4]
-				movups	xmm5,xmmword ptr[ebx+8*4]
-				addps	xmm4,xmm5
-				movups	xmmword ptr[edx+8*4],xmm4
+					movups	xmm2,xmmword ptr[eax+4*4]
+					movaps	xmm3,xmmword ptr[ebx+4*4]
+					addps	xmm2,xmm3
+					movups	xmmword ptr[edx+4*4],xmm2
 
-				movups	xmm6,xmmword ptr[eax+12*4]
-				movups	xmm7,xmmword ptr[ebx+12*4]
-				addps	xmm6,xmm7
-				movups	xmmword ptr[edx+12*4],xmm6
+					movups	xmm4,xmmword ptr[eax+8*4]
+					movaps	xmm5,xmmword ptr[ebx+8*4]
+					addps	xmm4,xmm5
+					movups	xmmword ptr[edx+8*4],xmm4
 
-				add		eax,16*4
-				add		ebx,16*4
-				add		edx,16*4
-				loop	loopu
+					movups	xmm6,xmmword ptr[eax+12*4]
+					movaps	xmm7,xmmword ptr[ebx+12*4]
+					addps	xmm6,xmm7
+					movups	xmmword ptr[edx+12*4],xmm6
+
+					add		eax,16*4
+					add		ebx,16*4
+					add		edx,16*4
+					loop	loopua
+				}
+			}
+			else {
+				__asm {
+					mov		ecx,dword ptr [n]
+					mov		eax,dword ptr [src]
+					mov		edx,dword ptr [dst]
+					mov		ebx,dword ptr [op]
+	loopuu:
+					prefetcht0 [eax+64]
+					prefetcht0 [ebx+64]
+					prefetcht0 [eax+96]
+					prefetcht0 [ebx+96]
+
+					movups	xmm0,xmmword ptr[eax]
+					movups	xmm1,xmmword ptr[ebx]
+					addps	xmm0,xmm1
+					movups	xmmword ptr[edx],xmm0
+
+					movups	xmm2,xmmword ptr[eax+4*4]
+					movups	xmm3,xmmword ptr[ebx+4*4]
+					addps	xmm2,xmm3
+					movups	xmmword ptr[edx+4*4],xmm2
+
+					movups	xmm4,xmmword ptr[eax+8*4]
+					movups	xmm5,xmmword ptr[ebx+8*4]
+					addps	xmm4,xmm5
+					movups	xmmword ptr[edx+8*4],xmm4
+
+					movups	xmm6,xmmword ptr[eax+12*4]
+					movups	xmm7,xmmword ptr[ebx+12*4]
+					addps	xmm6,xmm7
+					movups	xmmword ptr[edx+12*4],xmm6
+
+					add		eax,16*4
+					add		ebx,16*4
+					add		edx,16*4
+					loop	loopuu
+				}
 			}
         }
 	    while(cnt--) *(dst++) = *(src++) + *(op++); 
@@ -1008,6 +1231,10 @@ void flext::ScaleSamples(t_sample *dst,const t_sample *src,t_sample opmul,t_samp
         cnt -= n<<4;
 
         __asm {
+			mov		eax,dword ptr [src]
+			prefetcht0 [eax+0]
+			prefetcht0 [eax+32]
+
 			movss	xmm0,xmmword ptr [opadd]
 			shufps	xmm0,xmm0,0
 			movss	xmm1,xmmword ptr [opmul]
@@ -1023,6 +1250,9 @@ void flext::ScaleSamples(t_sample *dst,const t_sample *src,t_sample opmul,t_samp
 				mov		eax,dword ptr [src]
 				mov		edx,dword ptr [dst]
 loopa:
+				prefetcht0 [eax+64]
+				prefetcht0 [eax+96]
+
 				movaps	xmm2,xmmword ptr[eax]
 				mulps	xmm2,xmm1
 				addps	xmm2,xmm0
@@ -1055,6 +1285,9 @@ loopa:
 				mov		eax,dword ptr [src]
 				mov		edx,dword ptr [dst]
 loopu:
+				prefetcht0 [eax+64]
+				prefetcht0 [eax+96]
+
 				movups	xmm2,xmmword ptr[eax]
 				mulps	xmm2,xmm1
 				addps	xmm2,xmm0
-- 
cgit v1.2.1