Diffstat (limited to 'system/mmx')
-rw-r--r--   system/mmx/Makefile                  29
-rw-r--r--   system/mmx/pdp_mmx_test.c            62
-rw-r--r--   system/mmx/pixel_add_s16.s           55
-rw-r--r--   system/mmx/pixel_affine_s16.s        59
-rw-r--r--   system/mmx/pixel_biquad_dirI_s16.s  361
-rw-r--r--   system/mmx/pixel_biquad_s16.s       451
-rw-r--r--   system/mmx/pixel_ca_s1.s            189
-rw-r--r--   system/mmx/pixel_cascade_s16.s      330
-rw-r--r--   system/mmx/pixel_conv_hor_s16.s     134
-rw-r--r--   system/mmx/pixel_conv_ver_s16.s     128
-rw-r--r--   system/mmx/pixel_crot_s16.s         153
-rw-r--r--   system/mmx/pixel_gain.s              83
-rw-r--r--   system/mmx/pixel_gain_s16.s          71
-rw-r--r--   system/mmx/pixel_mix_s16.s           68
-rw-r--r--   system/mmx/pixel_mul_s16.s           56
-rw-r--r--   system/mmx/pixel_pack_s16u8.s       126
-rw-r--r--   system/mmx/pixel_rand_s16.s          76
-rw-r--r--   system/mmx/pixel_randmix_s16.s       91
-rw-r--r--   system/mmx/pixel_s1.s               201
-rw-r--r--   system/mmx/pixel_unpack_u8s16.s     113
20 files changed, 2836 insertions, 0 deletions
diff --git a/system/mmx/Makefile b/system/mmx/Makefile
new file mode 100644
index 0000000..0f8f836
--- /dev/null
+++ b/system/mmx/Makefile
@@ -0,0 +1,29 @@
+include ../../Makefile.config
+
+OBJ = \
+pixel_pack_s16u8.o \
+pixel_unpack_u8s16.o \
+pixel_add_s16.o \
+pixel_mul_s16.o \
+pixel_mix_s16.o \
+pixel_randmix_s16.o \
+pixel_conv_hor_s16.o \
+pixel_conv_ver_s16.o \
+pixel_affine_s16.o \
+pixel_biquad_s16.o \
+pixel_ca_s1.o \
+pixel_rand_s16.o \
+pixel_crot_s16.o \
+pixel_gain_s16.o
+
+all: $(OBJ)
+
+test: pdp_mmx_test.o $(OBJ)
+	gcc -o pdp_mmx_test pdp_mmx_test.o $(OBJ) -g
+
+clean:
+	rm -f *.o
+	rm -f *~
+	rm -f pdp_mmx.a
+	rm -f pdp_mmx_test
+
diff --git a/system/mmx/pdp_mmx_test.c b/system/mmx/pdp_mmx_test.c
new file mode 100644
index 0000000..e93539f
--- /dev/null
+++ b/system/mmx/pdp_mmx_test.c
@@ -0,0 +1,62 @@
+#include <stdio.h>
+#include "pdp_mmx.h"
+
+#define FP(x) ((short int)(((float)(x) * 2 * 256.0f)))
+
+#define nbp 256
+
+short int a1[4] = {0x0100,0x0100,0x0100,0x0100};
+short int a2[4] = {0x0100,0x0100,0x0100,0x0100};
+short int b0[4] = {0x0100,0x0100,0x0100,0x0100};
+short int b1[4] = {0x0100,0x0100,0x0100,0x0100};
+short int b2[4] = {0x0100,0x0100,0x0100,0x0100};
+
+short int u1[4] = {0x0100,0x0100,0x0100,0x0100};
+short int u2[4] = {0x0100,0x0100,0x0100,0x0100};
+
+short int x0[4] = {0x0100,0x0100,0x0100,0x0100};
+short int x1[4] = {0x0100,0x0100,0x0100,0x0100};
+short int x2[4] = {0x0100,0x0100,0x0100,0x0100};
+short int x3[4] = {0x0100,0x0100,0x0100,0x0100};
+
+void print_pixel(unsigned int i)
+{
+    if (i) printf("x ");
+    else printf(". ");
+}
+
+void print_line(void)
+{
+    printf("\n");
+}
+
+void print_square(unsigned char *c)
+{
+    int i,j;
+
+    for(j=7; j>=0; j--){
+        for(i=0; i<8; i++) print_pixel(c[j] & (1<<(7-i)));
+        printf("\n");
+    }
+}
+
+int main(void)
+{
+    unsigned char src[16]={1,2,3,4,5,6,7,8,-1,-2,-3,-4,-5,-6,-7,-8};
+    unsigned char dst[8];
+
+    print_square(src);
+    print_line();
+    print_square(src+8);
+    print_line();
+
+    pixel_test_s1(dst,src,1,1);
+
+    print_square(dst);
+    print_line();
+
+    return 0;
+}
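The routines in this directory share a simple C calling convention: a pointer to a packed pixel buffer plus a vector count (4 signed 16 bit pixels per MMX vector). As a quick orientation, here is a minimal sketch of how one of them is declared and driven from C, using the prototype given in the signature comment of the .s file (the real pdp_mmx.h may declare these differently):

    /* prototype as commented in pixel_add_s16.s */
    void pixel_add_s16(int *left, int *right, int nb_4pixel_vectors);

    #define W 320
    #define H 240

    static short int plane_a[W * H], plane_b[W * H];  /* s16 pixel planes */

    static void add_planes(void)
    {
        /* each MMX vector holds 4 short ints, so pass the count divided by 4 */
        pixel_add_s16((int *)plane_a, (int *)plane_b, (W * H) / 4);
    }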
diff --git a/system/mmx/pixel_add_s16.s b/system/mmx/pixel_add_s16.s
new file mode 100644
index 0000000..8d4c7df
--- /dev/null
+++ b/system/mmx/pixel_add_s16.s
@@ -0,0 +1,55 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_add_s16
+.type pixel_add_s16,@function
+
+# simple add
+# void pixel_add_s16(int *left, int *right, int nb_4pixel_vectors)
+
+pixel_add_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %esi
+	push %edi
+
+	movl 8(%ebp), %edi	# left array
+	movl 12(%ebp), %esi	# right array
+	movl 16(%ebp), %ecx	# pixel count
+
+	.align 16
+	.loop_mix:
+
+#	prefetch 128(%esi)
+	movq (%esi), %mm1	# load right 4 pixels from memory
+	movq (%edi), %mm0	# load 4 left pixels from memory
+	paddsw %mm1, %mm0	# add with saturation
+	movq %mm0, (%edi)
+	addl $8, %esi
+	addl $8, %edi
+	decl %ecx
+	jnz .loop_mix		# loop
+
+	emms
+
+	pop %edi
+	pop %esi
+	leave
+	ret
+
diff --git a/system/mmx/pixel_affine_s16.s b/system/mmx/pixel_affine_s16.s
new file mode 100644
index 0000000..b357de3
--- /dev/null
+++ b/system/mmx/pixel_affine_s16.s
@@ -0,0 +1,59 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_affine_s16
+.type pixel_affine_s16,@function
+
+# void pixel_affine_s16(int *buf, int nb_8pixel_vectors, short int gain[4], short int offset[4])
+
+pixel_affine_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %esi
+	push %edi
+
+	movl 20(%ebp), %edi
+	movq (%edi), %mm6	# get offset vector
+
+	movl 16(%ebp), %edi
+	movq (%edi), %mm7	# get gain vector
+
+	movl 8(%ebp), %esi	# input array
+	movl 12(%ebp), %ecx	# pixel count
+
+	.align 16
+	.loop_affine:
+
+#	prefetch 128(%esi)
+	movq (%esi), %mm0	# load 4 pixels from memory
+	pmulhw %mm7, %mm0	# apply gain (s).15 fixed point
+	psllw $1, %mm0		# apply correction shift
+	paddsw %mm6, %mm0	# add offset
+	movq %mm0, (%esi)	# store result in memory
+
+	addl $8, %esi		# increment source pointer
+	decl %ecx
+	jnz .loop_affine	# loop
+
+	emms
+
+	pop %edi
+	pop %esi
+	leave
+	ret
+
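In scalar terms each 16 bit element in the affine loop is transformed as y = sat(2 * ((x * gain) >> 16) + offset): pmulhw keeps the high word of the signed product, psllw $1 undoes the halving inherent in the s.15 coefficient format, and paddsw adds the offset with saturation. A C model of one lane (a sketch; the helper names are not part of the code above):

    #include <stdint.h>

    static int16_t sat16(int32_t v)
    {
        return v > 32767 ? 32767 : (v < -32768 ? -32768 : (int16_t)v);
    }

    /* gain is s.15 fixed point (0x7fff ~ +1.0), offset is a plain s16 */
    static int16_t affine_one(int16_t x, int16_t gain, int16_t offset)
    {
        int16_t g = (int16_t)(((int32_t)x * gain) >> 16);  /* pmulhw */
        g <<= 1;                                           /* psllw $1 */
        return sat16((int32_t)g + offset);                 /* paddsw */
    }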
diff --git a/system/mmx/pixel_biquad_dirI_s16.s b/system/mmx/pixel_biquad_dirI_s16.s
new file mode 100644
index 0000000..1729502
--- /dev/null
+++ b/system/mmx/pixel_biquad_dirI_s16.s
@@ -0,0 +1,361 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+
+	# TODO: MOVE TO DIRECT FORM II
+	# y[k]  = b0 * x[k] + u1[k-1]
+	# u1[k] = b1 * x[k] + u2[k-1] - a1 * y[k]
+	# u2[k] = b2 * x[k] - a2 * y[k]
+
+	# input in register:
+	# %mm0-%mm3: input 4x4 pixels {x0 x1 x2 x3}
+	# %esi: coef memory (a1, a2, b0, b1, b2)
+	# %edi: state memory (u1, u2)
+
+	# return in register:
+	# %mm0-%mm3: 4x4 pixels result
+
+	.biquad_4x4_pixels:
+	.align 16
+	# prescale
+	movq -8(%esi), %mm4
+	pmulhw %mm4, %mm0
+	pmulhw %mm4, %mm1
+	pmulhw %mm4, %mm2
+	pmulhw %mm4, %mm3
+	psllw $1, %mm0
+	psllw $1, %mm1
+	psllw $1, %mm2
+	psllw $1, %mm3
+
+	# first vector
+	movq 0(%edi), %mm4	# mm4 <- u[-1]
+	movq 8(%edi), %mm5	# mm5 <- u[-2]
+	movq %mm4, %mm6
+	movq %mm5, %mm7
+
+	pmulhw 0(%esi), %mm6	# multiply by a1
+	pmulhw 8(%esi), %mm7	# multiply by a2
+
+	paddsw %mm6, %mm0	# accumulate
+	paddsw %mm7, %mm0	# accumulate
+	paddsw %mm0, %mm0	# scale by 2 (since all fixed point muls are x*y/2)
+
+	movq %mm0, %mm6		# mm6 <- u[0]
+	movq %mm4, %mm7		# mm7 <- u[-1]
+	pmulhw 16(%esi), %mm0	# multiply by b0
+	pmulhw 24(%esi), %mm4	# multiply by b1
+	pmulhw 32(%esi), %mm5	# multiply by b2
+
+	paddsw %mm4, %mm0	# accumulate
+	paddsw %mm5, %mm0	# accumulate
+
+	# mm0 is result 0
+
+	# second vector
+	movq %mm6, %mm4		# mm4 <- u[0]
+	movq %mm7, %mm5		# mm5 <- u[-1]
+
+	pmulhw 0(%esi), %mm6	# multiply by a1
+	pmulhw 8(%esi), %mm7	# multiply by a2
+
+	paddsw %mm6, %mm1	# accumulate
+	paddsw %mm7, %mm1	# accumulate
+	paddsw %mm1, %mm1	# scale by 2
+
+	movq %mm1, %mm6		# mm6 <- u[1]
+	movq %mm4, %mm7		# mm7 <- u[0]
+	pmulhw 16(%esi), %mm1	# multiply by b0
+	pmulhw 24(%esi), %mm4	# multiply by b1
+	pmulhw 32(%esi), %mm5	# multiply by b2
+
+	paddsw %mm4, %mm1	# accumulate
+	paddsw %mm5, %mm1	# accumulate
+
+	# mm1 is result 1
+
+	# third vector
+	movq %mm6, %mm4		# mm4 <- u[1]
+	movq %mm7, %mm5		# mm5 <- u[0]
+
+	pmulhw 0(%esi), %mm6	# multiply by a1
+	pmulhw 8(%esi), %mm7	# multiply by a2
+
+	paddsw %mm6, %mm2	# accumulate
+	paddsw %mm7, %mm2	# accumulate
+	paddsw %mm2, %mm2	# scale by 2
+
+	movq %mm2, %mm6		# mm6 <- u[2]
+	movq %mm4, %mm7		# mm7 <- u[1]
+	pmulhw 16(%esi), %mm2	# multiply by b0
+	pmulhw 24(%esi), %mm4	# multiply by b1
+	pmulhw 32(%esi), %mm5	# multiply by b2
+
+	paddsw %mm4, %mm2	# accumulate
+	paddsw %mm5, %mm2	# accumulate
+
+	# mm2 is result 2
+
+	# fourth vector
+	movq %mm6, %mm4		# mm4 <- u[2]
+	movq %mm7, %mm5		# mm5 <- u[1]
+
+	pmulhw 0(%esi), %mm6	# multiply by a1
+	pmulhw 8(%esi), %mm7	# multiply by a2
+
+	paddsw %mm6, %mm3	# accumulate
+	paddsw %mm7, %mm3	# accumulate
+	paddsw %mm3, %mm3	# scale by 2
+
+	movq %mm3, 0(%edi)	# store u[3]
+	movq %mm4, 8(%edi)	# store u[2]
+	pmulhw 16(%esi), %mm3	# multiply by b0
+	pmulhw 24(%esi), %mm4	# multiply by b1
+	pmulhw 32(%esi), %mm5	# multiply by b2
+
+	paddsw %mm4, %mm3	# accumulate
+	paddsw %mm5, %mm3	# accumulate
+
+	# mm3 is result 3
+
+	ret
+
+	# in order to use the 4 line parallel biquad routine on horizontal
+	# lines, we need to reorder (rotate or transpose) the matrix, since
+	# images are scanline encoded, and we want to work in parallel
+	# on 4 lines.
+	#
+	# since the 4 lines are independent, it doesn't matter in which order
+	# the vector elements are present.
+	#
+	# this allows us to use the same routine for left->right and right->left
+	# processing.
+	#
+	# some comments on the non-abelian group of square isometries consisting of
+	# (I)  identity
+	# (H)  horizontal axis mirror
+	# (V)  vertical axis mirror
+	# (T)  transpose (diagonal axis mirror)
+	# (A)  antitranspose (antidiagonal axis mirror)
+	# (R1) 90deg anticlockwise rotation
+	# (R2) 180deg rotation
+	# (R3) 90deg clockwise rotation
+	#
+	# we basically have two options: (R1,R3) or (T,A)
+	# we opt for T and A because they are self inverting, which improves locality
+	#
+	# use antitranspose for right to left and transpose
+	# for left to right (little endian)
+
+	# antitranspose 4x4
+
+	# input
+	# %mm3 == {d0 d1 d2 d3}
+	# %mm2 == {c0 c1 c2 c3}
+	# %mm1 == {b0 b1 b2 b3}
+	# %mm0 == {a0 a1 a2 a3}
+
+	# output
+	# %mm3 == {a3 b3 c3 d3}
+	# %mm2 == {a2 b2 c2 d2}
+	# %mm1 == {a1 b1 c1 d1}
+	# %mm0 == {a0 b0 c0 d0}
+
+	.antitranspose_4x4:
+	.align 16
+	movq %mm3, %mm4
+	punpcklwd %mm1, %mm4	# mm4 <- {b2 d2 b3 d3}
+	movq %mm3, %mm5
+	punpckhwd %mm1, %mm5	# mm5 <- {b0 d0 b1 d1}
+
+	movq %mm2, %mm6
+	punpcklwd %mm0, %mm6	# mm6 <- {a2 c2 a3 c3}
+	movq %mm2, %mm7
+	punpckhwd %mm0, %mm7	# mm7 <- {a0 c0 a1 c1}
+
+	movq %mm4, %mm3
+	punpcklwd %mm6, %mm3	# mm3 <- {a3 b3 c3 d3}
+	movq %mm4, %mm2
+	punpckhwd %mm6, %mm2	# mm2 <- {a2 b2 c2 d2}
+
+	movq %mm5, %mm1
+	punpcklwd %mm7, %mm1	# mm1 <- {a1 b1 c1 d1}
+	movq %mm5, %mm0
+	punpckhwd %mm7, %mm0	# mm0 <- {a0 b0 c0 d0}
+
+	ret
+
+	# transpose 4x4
+
+	# input
+	# %mm3 == {d3 d2 d1 d0}
+	# %mm2 == {c3 c2 c1 c0}
+	# %mm1 == {b3 b2 b1 b0}
+	# %mm0 == {a3 a2 a1 a0}
+
+	# output
+	# %mm3 == {d3 c3 b3 a3}
+	# %mm2 == {d2 c2 b2 a2}
+	# %mm1 == {d1 c1 b1 a1}
+	# %mm0 == {d0 c0 b0 a0}
+
+	.transpose_4x4:
+	.align 16
+	movq %mm0, %mm4
+	punpcklwd %mm2, %mm4	# mm4 <- {c1 a1 c0 a0}
+	movq %mm0, %mm5
+	punpckhwd %mm2, %mm5	# mm5 <- {c3 a3 c2 a2}
+
+	movq %mm1, %mm6
+	punpcklwd %mm3, %mm6	# mm6 <- {d1 b1 d0 b0}
+	movq %mm1, %mm7
+	punpckhwd %mm3, %mm7	# mm7 <- {d3 b3 d2 b2}
+
+	movq %mm4, %mm0
+	punpcklwd %mm6, %mm0	# mm0 <- {d0 c0 b0 a0}
+	movq %mm4, %mm1
+	punpckhwd %mm6, %mm1	# mm1 <- {d1 c1 b1 a1}
+
+	movq %mm5, %mm2
+	punpcklwd %mm7, %mm2	# mm2 <- {d2 c2 b2 a2}
+	movq %mm5, %mm3
+	punpckhwd %mm7, %mm3	# mm3 <- {d3 c3 b3 a3}
+
+	ret
+
+.globl pixel_biquad_vertb_s16
+.type pixel_biquad_vertb_s16,@function
+
+# pixel_biquad_vertb_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
+
+pixel_biquad_vertb_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %ebx
+	push %esi
+	push %edi
+
+	movl 8(%ebp), %ebx	# pixel array offset
+	movl 12(%ebp), %ecx	# nb of 4x4 pixblocks
+	movl 16(%ebp), %edx	# line width
+
+	movl 20(%ebp), %esi	# coefs
+	movl 24(%ebp), %edi	# state
+
+	shll $1, %edx		# short int addressing
+	movl %edx, %eax
+	shll $1, %eax
+	addl %edx, %eax		# eax = 3 * edx
+
+	.align 16
+	.biquad_vertb_line_loop:
+	movq (%ebx), %mm0
+	movq (%ebx,%edx,1), %mm1
+	movq (%ebx,%edx,2), %mm2
+	movq (%ebx,%eax,1), %mm3
+	call .biquad_4x4_pixels
+	movq %mm0, (%ebx)
+	movq %mm1, (%ebx,%edx,1)
+	movq %mm2, (%ebx,%edx,2)
+	movq %mm3, (%ebx,%eax,1)
+	addl %edx, %ebx
+	addl %eax, %ebx
+	decl %ecx
+	jnz .biquad_vertb_line_loop
+
+	emms
+
+	pop %edi
+	pop %esi
+	pop %ebx
+	leave
+	ret
+
+.globl pixel_biquad_horlr_s16
+.type pixel_biquad_horlr_s16,@function
+
+# pixel_biquad_horlr_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
+
+pixel_biquad_horlr_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %ebx
+	push %esi
+	push %edi
+
+	movl 8(%ebp), %ebx	# pixel array offset
+	movl 12(%ebp), %ecx	# nb of 4x4 pixblocks
+	movl 16(%ebp), %edx	# line width
+
+	movl 20(%ebp), %esi	# coefs
+	movl 24(%ebp), %edi	# state
+
+	shll $1, %edx		# short int addressing
+	movl %edx, %eax
+	shll $1, %eax
+	addl %edx, %eax		# eax = 3 * edx
+
+	.align 16
+	.biquad_horlr_line_loop:
+	movq (%ebx), %mm0
+	movq (%ebx,%edx,1), %mm1
+	movq (%ebx,%edx,2), %mm2
+	movq (%ebx,%eax,1), %mm3
+	call .transpose_4x4
+	call .biquad_4x4_pixels
+	call .transpose_4x4
+	movq %mm0, (%ebx)
+	movq %mm1, (%ebx,%edx,1)
+	movq %mm2, (%ebx,%edx,2)
+	movq %mm3, (%ebx,%eax,1)
+	addl $8, %ebx
+	decl %ecx
+	jnz .biquad_horlr_line_loop
+
+	emms
+
+	pop %edi
+	pop %esi
+	pop %ebx
+	leave
+	ret
+
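The direct form II recurrence that the TODO above refers to (and that pixel_biquad_s16.s below implements with s1.14 coefficients) is easiest to read in scalar floating point; this sketch ignores the fixed point scaling and saturation:

    /* one biquad section and its two state words */
    typedef struct { float b0, b1, b2, a1, a2, u1, u2; } biquad;

    static float biquad_df2(biquad *s, float x)
    {
        float y  = s->b0 * x + s->u1;              /* y[k]  = b0*x[k] + u1[k-1] */
        float u1 = s->b1 * x + s->u2 - s->a1 * y;  /* u1[k] = b1*x[k] + u2[k-1] - a1*y[k] */
        s->u2    = s->b2 * x - s->a2 * y;          /* u2[k] = b2*x[k] - a2*y[k] */
        s->u1    = u1;
        return y;
    }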
diff --git a/system/mmx/pixel_biquad_s16.s b/system/mmx/pixel_biquad_s16.s
new file mode 100644
index 0000000..844b041
--- /dev/null
+++ b/system/mmx/pixel_biquad_s16.s
@@ -0,0 +1,451 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+
+	# DIRECT FORM II BIQUAD
+	#
+	# y[k]  = b0 * x[k] + u1[k-1]
+	# u1[k] = b1 * x[k] + u2[k-1] - a1 * y[k]
+	# u2[k] = b2 * x[k] - a2 * y[k]
+
+	# MACRO: df2 <reg>
+	#
+	# computes a direct form 2 biquad
+	# does not use {mm0-mm3}\<reg>
+	#
+	# input:  <reg>  == input
+	#         %mm4   == state 1
+	#         %mm5   == state 2
+	#         (%esi) == biquad coefs (-a1 -a2 b0 b1 b2) in s1.14
+	# output: <reg>  == output
+	#         %mm4   == state 1
+	#         %mm5   == state 2
+
+	.macro df2 reg
+	movq \reg, %mm6		# mm6 == x[k]
+	movq \reg, %mm7		# mm7 == x[k]
+	pmulhw 16(%esi), %mm6	# mm6 == x[k] * b0
+	pmulhw 24(%esi), %mm7	# mm7 == x[k] * b1
+	paddw %mm4, %mm6	# mm6 == x[k] * b0 + u1[k-1] == y[k]
+	paddw %mm5, %mm7	# mm7 == x[k] * b1 + u2[k-1]
+	paddsw %mm6, %mm6	# compensate for mul = x*y/4 (coefs are s1.14 fixed point)
+	paddsw %mm6, %mm6	# paddsw ensures saturation
+	movq \reg, %mm5		# mm5 == x[k]
+	movq %mm6, %mm4		# mm4 == y[k]
+	movq %mm6, \reg		# reg == y[k] --------------------
+	pmulhw 0(%esi), %mm4	# mm4 == y[k] * (-a1)
+	pmulhw 8(%esi), %mm6	# mm6 == y[k] * (-a2)
+	pmulhw 32(%esi), %mm5	# mm5 == x[k] * b2
+	paddw %mm7, %mm4	# mm4 == u1[k] --------------------
+	paddw %mm6, %mm5	# mm5 == u2[k] --------------------
+	.endm
+
+	# input in register:
+	# %mm0-%mm3: input 4x4 pixels {x0 x1 x2 x3}
+	# %esi: coef memory (-a1, -a2, b0, b1, b2) in s1.14
+	# %edi: state memory (u1, u2)
+
+	# return in register:
+	# %mm0-%mm3: 4x4 pixels result
+
+	.macro biquad_4x4_pixels
+	.align 16
+	movq 0(%edi), %mm4	# get state
+	movq 8(%edi), %mm5
+	df2 %mm0		# compute 4 biquads
+	df2 %mm1
+	df2 %mm2
+	df2 %mm3
+	movq %mm4, 0(%edi)	# store state
+	movq %mm5, 8(%edi)
+	.endm
+
+	# in order to use the 4 line parallel biquad routine on horizontal
+	# lines, we need to reorder (rotate or transpose) the matrix, since
+	# images are scanline encoded, and we want to work in parallel
+	# on 4 lines.
+	#
+	# since the 4 lines are independent, it doesn't matter in which order
+	# the vector elements are present.
+	#
+	# this allows us to use the same routine for left->right and right->left
+	# processing.
+	#
+	# some comments on the non-abelian group of square isometries consisting of
+	# (I)  identity
+	# (H)  horizontal axis mirror
+	# (V)  vertical axis mirror
+	# (T)  transpose (diagonal axis mirror)
+	# (A)  antitranspose (antidiagonal axis mirror)
+	# (R1) 90deg anticlockwise rotation
+	# (R2) 180deg rotation
+	# (R3) 90deg clockwise rotation
+	#
+	# we basically have two options: (R1,R3) or (T,A)
+	# we opt for T and A because they are self inverting, which improves locality
+	#
+	# use antitranspose for right to left and transpose
+	# for left to right (little endian)
+
+	# antitranspose 4x4
+
+	# input
+	# %mm3 == {d0 d1 d2 d3}
+	# %mm2 == {c0 c1 c2 c3}
+	# %mm1 == {b0 b1 b2 b3}
+	# %mm0 == {a0 a1 a2 a3}
+
+	# output
+	# %mm3 == {a3 b3 c3 d3}
+	# %mm2 == {a2 b2 c2 d2}
+	# %mm1 == {a1 b1 c1 d1}
+	# %mm0 == {a0 b0 c0 d0}
+
+	.macro antitranspose_4x4
+	movq %mm3, %mm4
+	punpcklwd %mm1, %mm4	# mm4 <- {b2 d2 b3 d3}
+	movq %mm3, %mm5
+	punpckhwd %mm1, %mm5	# mm5 <- {b0 d0 b1 d1}
+
+	movq %mm2, %mm6
+	punpcklwd %mm0, %mm6	# mm6 <- {a2 c2 a3 c3}
+	movq %mm2, %mm7
+	punpckhwd %mm0, %mm7	# mm7 <- {a0 c0 a1 c1}
+
+	movq %mm4, %mm3
+	punpcklwd %mm6, %mm3	# mm3 <- {a3 b3 c3 d3}
+	movq %mm4, %mm2
+	punpckhwd %mm6, %mm2	# mm2 <- {a2 b2 c2 d2}
+
+	movq %mm5, %mm1
+	punpcklwd %mm7, %mm1	# mm1 <- {a1 b1 c1 d1}
+	movq %mm5, %mm0
+	punpckhwd %mm7, %mm0	# mm0 <- {a0 b0 c0 d0}
+	.endm
+
+	# transpose 4x4
+
+	# input
+	# %mm3 == {d3 d2 d1 d0}
+	# %mm2 == {c3 c2 c1 c0}
+	# %mm1 == {b3 b2 b1 b0}
+	# %mm0 == {a3 a2 a1 a0}
+
+	# output
+	# %mm3 == {d3 c3 b3 a3}
+	# %mm2 == {d2 c2 b2 a2}
+	# %mm1 == {d1 c1 b1 a1}
+	# %mm0 == {d0 c0 b0 a0}
+
+	.macro transpose_4x4
+	movq %mm0, %mm4
+	punpcklwd %mm2, %mm4	# mm4 <- {c1 a1 c0 a0}
+	movq %mm0, %mm5
+	punpckhwd %mm2, %mm5	# mm5 <- {c3 a3 c2 a2}
+
+	movq %mm1, %mm6
+	punpcklwd %mm3, %mm6	# mm6 <- {d1 b1 d0 b0}
+	movq %mm1, %mm7
+	punpckhwd %mm3, %mm7	# mm7 <- {d3 b3 d2 b2}
+
+	movq %mm4, %mm0
+	punpcklwd %mm6, %mm0	# mm0 <- {d0 c0 b0 a0}
+	movq %mm4, %mm1
+	punpckhwd %mm6, %mm1	# mm1 <- {d1 c1 b1 a1}
+
+	movq %mm5, %mm2
+	punpcklwd %mm7, %mm2	# mm2 <- {d2 c2 b2 a2}
+	movq %mm5, %mm3
+	punpckhwd %mm7, %mm3	# mm3 <- {d3 c3 b3 a3}
+	.endm
+
+.globl pixel_biquad_vertb_s16
+.type pixel_biquad_vertb_s16,@function
+
+# pixel_biquad_vertb_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
+
+pixel_biquad_vertb_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %ebx
+	push %esi
+	push %edi
+
+	movl 8(%ebp), %ebx	# pixel array offset
+	movl 12(%ebp), %ecx	# nb of 4x4 pixblocks
+	movl 16(%ebp), %edx	# line width
+
+	movl 20(%ebp), %esi	# coefs
+	movl 24(%ebp), %edi	# state
+
+	shll $1, %edx		# short int addressing
+	movl %edx, %eax
+	shll $1, %eax
+	addl %edx, %eax		# eax = 3 * edx
+
+	.align 16
+	.biquad_vertb_line_loop:
+	movq (%ebx), %mm0
+	movq (%ebx,%edx,1), %mm1
+	movq (%ebx,%edx,2), %mm2
+	movq (%ebx,%eax,1), %mm3
+	biquad_4x4_pixels
+	movq %mm0, (%ebx)
+	movq %mm1, (%ebx,%edx,1)
+	movq %mm2, (%ebx,%edx,2)
+	movq %mm3, (%ebx,%eax,1)
+	addl %edx, %ebx
+	addl %eax, %ebx
+	decl %ecx
+	jnz .biquad_vertb_line_loop
+
+	emms
+
+	pop %edi
+	pop %esi
+	pop %ebx
+	leave
+	ret
+
+.globl pixel_biquad_verbt_s16
+.type pixel_biquad_verbt_s16,@function
+
+# pixel_biquad_verbt_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
+
+pixel_biquad_verbt_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %ebx
+	push %esi
+	push %edi
+
+	movl 8(%ebp), %ebx	# pixel array offset
+	movl 12(%ebp), %ecx	# nb of 4x4 pixblocks
+	movl 16(%ebp), %eax	# line width
+
+	shll $3, %eax		# 4 line byte spacing
+	decl %ecx
+	mul %ecx
+	incl %ecx
+	addl %eax, %ebx		# ebx points to last pixblock
+
+	movl 16(%ebp), %edx	# line width
+
+	movl 20(%ebp), %esi	# coefs
+	movl 24(%ebp), %edi	# state
+
+	shll $1, %edx		# short int addressing
+	movl %edx, %eax
+	shll $1, %eax
+	addl %edx, %eax		# eax = 3 * edx
+
+	.align 16
+	.biquad_verbt_line_loop:
+	movq (%ebx), %mm3
+	movq (%ebx,%edx,1), %mm2
+	movq (%ebx,%edx,2), %mm1
+	movq (%ebx,%eax,1), %mm0
+	biquad_4x4_pixels
+	movq %mm3, (%ebx)
+	movq %mm2, (%ebx,%edx,1)
+	movq %mm1, (%ebx,%edx,2)
+	movq %mm0, (%ebx,%eax,1)
+	subl %edx, %ebx
+	subl %eax, %ebx
+	decl %ecx
+	jnz .biquad_verbt_line_loop
+
+	emms
+
+	pop %edi
+	pop %esi
+	pop %ebx
+	leave
+	ret
+
+.globl pixel_biquad_horlr_s16
+.type pixel_biquad_horlr_s16,@function
+
+# pixel_biquad_horlr_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
+
+pixel_biquad_horlr_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %ebx
+	push %esi
+	push %edi
+
+	movl 8(%ebp), %ebx	# pixel array offset
+	movl 12(%ebp), %ecx	# nb of 4x4 pixblocks
+	movl 16(%ebp), %edx	# line width
+
+	movl 20(%ebp), %esi	# coefs
+	movl 24(%ebp), %edi	# state
+
+	shll $1, %edx		# short int addressing
+	movl %edx, %eax
+	shll $1, %eax
+	addl %edx, %eax		# eax = 3 * edx
+
+	.align 16
+	.biquad_horlr_line_loop:
+	movq (%ebx), %mm0
+	movq (%ebx,%edx,1), %mm1
+	movq (%ebx,%edx,2), %mm2
+	movq (%ebx,%eax,1), %mm3
+	transpose_4x4
+	biquad_4x4_pixels
+	transpose_4x4
+	movq %mm0, (%ebx)
+	movq %mm1, (%ebx,%edx,1)
+	movq %mm2, (%ebx,%edx,2)
+	movq %mm3, (%ebx,%eax,1)
+	addl $8, %ebx
+	decl %ecx
+	jnz .biquad_horlr_line_loop
+
+	emms
+
+	pop %edi
+	pop %esi
+	pop %ebx
+	leave
+	ret
+
+.globl pixel_biquad_horrl_s16
+.type pixel_biquad_horrl_s16,@function
+
+# pixel_biquad_horrl_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
+
+pixel_biquad_horrl_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %ebx
+	push %esi
+	push %edi
+
+	movl 8(%ebp), %ebx	# pixel array offset
+	movl 12(%ebp), %ecx	# nb of 4x4 pixblocks
+	movl 16(%ebp), %edx	# line width
+
+	movl %ecx, %eax
+	decl %eax
+	shll $3, %eax
+	addl %eax, %ebx		# ebx points to last pixblock
+
+	movl 20(%ebp), %esi	# coefs
+	movl 24(%ebp), %edi	# state
+
+	shll $1, %edx		# short int addressing
+	movl %edx, %eax
+	shll $1, %eax
+	addl %edx, %eax		# eax = 3 * edx
+
+	.align 16
+	.biquad_horrl_line_loop:
+	movq (%ebx), %mm0
+	movq (%ebx,%edx,1), %mm1
+	movq (%ebx,%edx,2), %mm2
+	movq (%ebx,%eax,1), %mm3
+	antitranspose_4x4
+	biquad_4x4_pixels
+	antitranspose_4x4
+	movq %mm0, (%ebx)
+	movq %mm1, (%ebx,%edx,1)
+	movq %mm2, (%ebx,%edx,2)
+	movq %mm3, (%ebx,%eax,1)
+	subl $8, %ebx
+	decl %ecx
+	jnz .biquad_horrl_line_loop
+
+	emms
+
+	pop %edi
+	pop %esi
+	pop %ebx
+	leave
+	ret
+
+.globl pixel_biquad_time_s16
+.type pixel_biquad_time_s16,@function
+
+# pixel_biquad_time_s16(short int *pixel_array, short int *s1, short int *s2, short int *coefs, int nb_4_pix_vectors)
+
+pixel_biquad_time_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %ebx
+	push %esi
+	push %edi
+
+	movl 8(%ebp), %ebx	# pixel array offset
+	movl 12(%ebp), %edx	# state 1 array
+	movl 16(%ebp), %edi	# state 2 array
+
+	movl 20(%ebp), %esi	# coefs
+	movl 24(%ebp), %ecx	# nb of 4 pixel vectors
+
+	.align 16
+	.biquad_time_loop:
+	movq (%ebx), %mm0	# get input
+	movq (%edx), %mm4	# get state 1
+	movq (%edi), %mm5	# get state 2
+	df2 %mm0		# compute direct form 2
+	movq %mm0, (%ebx)	# write output
+	movq %mm5, (%edi)	# write state 2
+	movq %mm4, (%edx)	# write state 1
+	addl $8, %ebx
+	addl $8, %edi
+	addl $8, %edx
+	decl %ecx
+	jnz .biquad_time_loop
+
+	emms
+
+	pop %edi
+	pop %esi
+	pop %ebx
+	leave
+	ret
+
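The transpose and antitranspose macros used by these filter files do their work with two rounds of punpcklwd/punpckhwd interleaves, but the net effect on a 4x4 block of 16 bit pixels is an ordinary (anti)diagonal mirror. A plain C equivalent of the transpose, for reference:

    #include <stdint.h>

    /* m[row][col]: 4 image lines of 4 s16 pixels each */
    static void transpose_4x4_ref(int16_t m[4][4])
    {
        for (int r = 0; r < 4; r++)
            for (int c = r + 1; c < 4; c++) {
                int16_t t = m[r][c];
                m[r][c] = m[c][r];
                m[c][r] = t;
            }
    }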
diff --git a/system/mmx/pixel_ca_s1.s b/system/mmx/pixel_ca_s1.s
new file mode 100644
index 0000000..d9c730f
--- /dev/null
+++ b/system/mmx/pixel_ca_s1.s
@@ -0,0 +1,189 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+
+	# this file contains assembler routines for 2D 1 bit cellular automata
+	# processing. it is organized around a feeder kernel and a
+	# stack based bit processor (virtual forth machine)
+	#
+	# the feeder kernel is responsible for loading/storing CA cells
+	# from/to memory. data in memory is organized as a scanline
+	# encoded toroidal bitplane (lsb = left). to simplify the kernel, the top
+	# left corner of the rectangular grid of pixels will shift down
+	# every processing step.
+	#
+	# the stack machine has the following architecture:
+	# CA stack:   %esi, TOS: %mm0 (32x2 pixels, lsw = top row)
+	# CA horizon: %mm4-%mm7 (64x4 pixels, %mm4 = top row)
+	#
+	# the stack size / organization is not known to the stack machine.
+	# it can be thought of as operating on a 3x3 cell neighbourhood.
+	# the only purpose of the forth program is to determine the CA local update rule.
+	#
+	# the machine is supposed to be very minimal. no looping control.
+	# no addressing modes. no conditional code (hey, this is an experiment!)
+	# so recursion is not allowed (no way to stop it)
+	# there are 9 words to load the cell neighbourhood on the stack.
+	# the rest is just logic and stack manips.
+
+	# this file contains pure asm macros. it is to be included before assembly,
+	# after scaforth.pl has processed the .scaf file
+
+	# *************************** CA CELL ACCESS MACROS *****************************
+	# fetchTL - fetchBR
+
+	# shift / load rectangle macros:
+
+	# shift rectangle horizontal
+	# result is in reg1
+	.macro shift reg1 reg2 count
+	psllq $(32+\count), \reg1
+	psrlq $(32-\count), \reg2
+	psrlq $32, \reg1
+	psllq $32, \reg2
+	por \reg2, \reg1
+	.endm
+
+	.macro ldtop reg1 reg2
+	movq %mm4, \reg1
+	movq %mm5, \reg2
+	.endm
+
+	.macro ldcenter reg1 reg2
+	movq %mm5, \reg1
+	movq %mm6, \reg2
+	.endm
+
+	.macro ldbottom reg1 reg2
+	movq %mm6, \reg1
+	movq %mm7, \reg2
+	.endm
+
+	# fetch from top row
+
+	# fetch the top left square
+	.macro fetchTL
+	ldtop %mm0, %mm1
+	shift %mm0, %mm1, -1
+	.endm
+
+	# fetch the top mid square
+	.macro fetchTM
+	ldtop %mm0, %mm1
+	shift %mm0, %mm1, 0
+	.endm
+
+	# fetch the top right square
+	.macro fetchTR
+	ldtop %mm0, %mm1
+	shift %mm0, %mm1, 1
+	.endm
+
+	# fetch from center row
+
+	# fetch the mid left square
+	.macro fetchML
+	ldcenter %mm0, %mm1
+	shift %mm0, %mm1, -1
+	.endm
+
+	# fetch the mid mid square
+	.macro fetchMM
+	ldcenter %mm0, %mm1
+	shift %mm0, %mm1, 0
+	.endm
+
+	# fetch the mid right square
+	.macro fetchMR
+	ldcenter %mm0, %mm1
+	shift %mm0, %mm1, 1
+	.endm
+
+	# fetch from bottom row
+
+	# fetch the bottom left square
+	.macro fetchBL
+	ldbottom %mm0, %mm1
+	shift %mm0, %mm1, -1
+	.endm
+
+	# fetch the bottom mid square
+	.macro fetchBM
+	ldbottom %mm0, %mm1
+	shift %mm0, %mm1, 0
+	.endm
+
+	# fetch the bottom right square
+	.macro fetchBR
+	ldbottom %mm0, %mm1
+	shift %mm0, %mm1, 1
+	.endm
+
+	# *************************** CA STACK MANIP MACROS *****************************
+	# dup drop dropdup swap nip dropover
+
+	.macro dup
+	lea -8(%esi), %esi
+	movq %mm0, (%esi)
+	.endm
+
+	.macro drop
+	movq (%esi), %mm0
+	lea 8(%esi), %esi
+	.endm
+
+	.macro dropdup
+	movq (%esi), %mm0
+	.endm
+
+	.macro swap
+	movq (%esi), %mm1
+	movq %mm0, (%esi)
+	movq %mm1, %mm0
+	.endm
+
+	.macro nip
+	lea 8(%esi), %esi
+	.endm
+
+	.macro dropover
+	movq 8(%esi), %mm0
+	.endm
+
+	# *************************** CA BOOLEAN LOGIC MACROS *****************************
+	# overxor
+
+	.macro overxor
+	pxor (%esi), %mm0
+	.endm
+
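Each register in the CA horizon packs two 32 pixel rows into one 64 bit word (low word = top row), and the shift macro above carves a horizontally shifted 32x2 window out of two adjacent registers. A uint64_t transcription of what the four shifts and the por compute (a sketch of the bit manipulation only; register allocation is as in the macro):

    #include <stdint.h>

    /* reg1, reg2: two 32x2 pixel words; count: -1, 0 or +1 */
    static uint64_t ca_shift(uint64_t reg1, uint64_t reg2, int count)
    {
        reg1 = (reg1 << (32 + count)) >> 32;  /* psllq $(32+count); psrlq $32 */
        reg2 = (reg2 >> (32 - count)) << 32;  /* psrlq $(32-count); psllq $32 */
        return reg1 | reg2;                   /* por */
    }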
diff --git a/system/mmx/pixel_cascade_s16.s b/system/mmx/pixel_cascade_s16.s
new file mode 100644
index 0000000..bf88d08
--- /dev/null
+++ b/system/mmx/pixel_cascade_s16.s
@@ -0,0 +1,330 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+
+	# TODO: COUPLED CASCADE SECOND ORDER SECTION
+	#
+	# s1[k] = ar * s1[k-1] + ai * s2[k-1] + x[k]
+	# s2[k] = ar * s2[k-1] - ai * s1[k-1]
+	# y[k]  = c0 * x[k] + c1 * s1[k-1] + c2 * s2[k-1]
+
+	# MACRO: coupled
+	#
+	# computes a coupled cascade
+	#
+	# input:  %mm0   == input
+	#         %mm1   == state 1
+	#         %mm2   == state 2
+	#         (%esi) == cascade coefs (ar ai c0 c1 c2) in s0.15
+	# output: %mm0   == output
+	#         %mm1   == state 1
+	#         %mm2   == state 2
+
+	.macro coupled
+	movq %mm1, %mm3		# mm3 == s1[k-1]
+	movq %mm1, %mm4		# mm4 == s1[k-1]
+	movq %mm2, %mm5		# mm5 == s2[k-1]
+	movq %mm2, %mm6		# mm6 == s2[k-1]
+	pmulhw (%esi), %mm1	# mm1 == s1[k-1] * ar
+	pmulhw 8(%esi), %mm3	# mm3 == s1[k-1] * ai
+	pmulhw 24(%esi), %mm4	# mm4 == s1[k-1] * c1
+	pmulhw (%esi), %mm2	# mm2 == s2[k-1] * ar
+	pmulhw 8(%esi), %mm5	# mm5 == s2[k-1] * ai
+	pmulhw 32(%esi), %mm6	# mm6 == s2[k-1] * c2
+	paddw %mm5, %mm1	# mm1 == s1[k-1] * ar + s2[k-1] * ai
+	psubw %mm3, %mm2	# mm2 == s2[k-1] * ar - s1[k-1] * ai == s2[k]
+	paddw %mm0, %mm1	# mm1 == s1[k]
+	pmulhw 16(%esi), %mm0	# mm0 == x[k] * c0
+	paddw %mm6, %mm4	# mm4 == s1[k-1] * c1 + s2[k-1] * c2
+	paddw %mm4, %mm0	# mm0 == y[k]
+	.endm
+
+	# in order to use the 4 line parallel cascade routine on horizontal
+	# lines, we need to reorder (rotate or transpose) the matrix, since
+	# images are scanline encoded, and we want to work in parallel
+	# on 4 lines.
+	#
+	# since the 4 lines are independent, it doesn't matter in which order
+	# the vector elements are present.
+	#
+	# this allows us to use the same routine for left->right and right->left
+	# processing.
+	#
+	# some comments on the non-abelian group of square isometries consisting of
+	# (I)  identity
+	# (H)  horizontal axis mirror
+	# (V)  vertical axis mirror
+	# (T)  transpose (diagonal axis mirror)
+	# (A)  antitranspose (antidiagonal axis mirror)
+	# (R1) 90deg anticlockwise rotation
+	# (R2) 180deg rotation
+	# (R3) 90deg clockwise rotation
+	#
+	# we basically have two options: (R1,R3) or (T,A)
+	# we opt for T and A because they are self inverting, which improves locality
+	#
+	# use antitranspose for right to left and transpose
+	# for left to right (little endian)
+
+	# antitranspose 4x4
+
+	# input
+	# %mm3 == {d0 d1 d2 d3}
+	# %mm2 == {c0 c1 c2 c3}
+	# %mm1 == {b0 b1 b2 b3}
+	# %mm0 == {a0 a1 a2 a3}
+
+	# output
+	# %mm3 == {a3 b3 c3 d3}
+	# %mm2 == {a2 b2 c2 d2}
+	# %mm1 == {a1 b1 c1 d1}
+	# %mm0 == {a0 b0 c0 d0}
+
+	.macro antitranspose_4x4
+	movq %mm3, %mm4
+	punpcklwd %mm1, %mm4	# mm4 <- {b2 d2 b3 d3}
+	movq %mm3, %mm5
+	punpckhwd %mm1, %mm5	# mm5 <- {b0 d0 b1 d1}
+
+	movq %mm2, %mm6
+	punpcklwd %mm0, %mm6	# mm6 <- {a2 c2 a3 c3}
+	movq %mm2, %mm7
+	punpckhwd %mm0, %mm7	# mm7 <- {a0 c0 a1 c1}
+
+	movq %mm4, %mm3
+	punpcklwd %mm6, %mm3	# mm3 <- {a3 b3 c3 d3}
+	movq %mm4, %mm2
+	punpckhwd %mm6, %mm2	# mm2 <- {a2 b2 c2 d2}
+
+	movq %mm5, %mm1
+	punpcklwd %mm7, %mm1	# mm1 <- {a1 b1 c1 d1}
+	movq %mm5, %mm0
+	punpckhwd %mm7, %mm0	# mm0 <- {a0 b0 c0 d0}
+	.endm
+
+	# transpose 4x4
+
+	# input
+	# %mm3 == {d3 d2 d1 d0}
+	# %mm2 == {c3 c2 c1 c0}
+	# %mm1 == {b3 b2 b1 b0}
+	# %mm0 == {a3 a2 a1 a0}
+
+	# output
+	# %mm3 == {d3 c3 b3 a3}
+	# %mm2 == {d2 c2 b2 a2}
+	# %mm1 == {d1 c1 b1 a1}
+	# %mm0 == {d0 c0 b0 a0}
+
+	.macro transpose_4x4
+	movq %mm0, %mm4
+	punpcklwd %mm2, %mm4	# mm4 <- {c1 a1 c0 a0}
+	movq %mm0, %mm5
+	punpckhwd %mm2, %mm5	# mm5 <- {c3 a3 c2 a2}
+
+	movq %mm1, %mm6
+	punpcklwd %mm3, %mm6	# mm6 <- {d1 b1 d0 b0}
+	movq %mm1, %mm7
+	punpckhwd %mm3, %mm7	# mm7 <- {d3 b3 d2 b2}
+
+	movq %mm4, %mm0
+	punpcklwd %mm6, %mm0	# mm0 <- {d0 c0 b0 a0}
+	movq %mm4, %mm1
+	punpckhwd %mm6, %mm1	# mm1 <- {d1 c1 b1 a1}
+
+	movq %mm5, %mm2
+	punpcklwd %mm7, %mm2	# mm2 <- {d2 c2 b2 a2}
+	movq %mm5, %mm3
+	punpckhwd %mm7, %mm3	# mm3 <- {d3 c3 b3 a3}
+	.endm
+
+.globl pixel_cascade_vertb_s16
+.type pixel_cascade_vertb_s16,@function
+
+# pixel_cascade_vertb_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
+
+pixel_cascade_vertb_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %ebx
+	push %esi
+	push %edi
+
+	movl 8(%ebp), %ebx	# pixel array offset
+	movl 12(%ebp), %ecx	# nb of 4x4 pixblocks
+	movl 16(%ebp), %edx	# line width
+
+	movl 20(%ebp), %esi	# coefs
+	movl 24(%ebp), %edi	# state
+
+	shll $1, %edx		# short int addressing
+	subl %edx, %ebx
+
+	movq 0(%edi), %mm1	# s1[k-1]
+	movq 8(%edi), %mm2	# s2[k-1]
+	.align 16
+	.cascade_vertb_line_loop:
+
+	movq (%ebx,%edx,1), %mm3
+	movq %mm3, %mm0
+	addl %edx, %ebx
+	coupled
+	movq %mm0, (%ebx)
+
+	movq (%ebx,%edx,1), %mm3
+	movq %mm3, %mm0
+	addl %edx, %ebx
+	coupled
+	movq %mm0, (%ebx)
+
+	movq (%ebx,%edx,1), %mm3
+	movq %mm3, %mm0
+	addl %edx, %ebx
+	coupled
+	movq %mm0, (%ebx)
+
+	movq (%ebx,%edx,1), %mm3
+	movq %mm3, %mm0
+	addl %edx, %ebx
+	coupled
+	movq %mm0, (%ebx)
+
+	decl %ecx
+	jnz .cascade_vertb_line_loop
+
+	movq %mm1, 0(%edi)	# s1[k-1]
+	movq %mm2, 8(%edi)	# s2[k-1]
+
+	emms
+
+	pop %edi
+	pop %esi
+	pop %ebx
+	leave
+	ret
+
+.globl pixel_cascade_horlr_s16
+.type pixel_cascade_horlr_s16,@function
+
+# pixel_cascade_horlr_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
+
+pixel_cascade_horlr_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %ebx
+	push %esi
+	push %edi
+
+	movl 8(%ebp), %ebx	# pixel array offset
+	movl 12(%ebp), %ecx	# nb of 4x4 pixblocks
+	movl 16(%ebp), %edx	# line width
+
+	movl 20(%ebp), %esi	# coefs
+	movl 24(%ebp), %edi	# state
+
+	shll $1, %edx		# short int addressing
+	movl %edx, %eax
+	shll $1, %eax
+	addl %edx, %eax		# eax = 3 * edx
+
+	.align 16
+	.cascade_horlr_line_loop:
+	movq (%edi), %mm1
+	movq 8(%edi), %mm2
+
+	movq (%ebx), %mm0
+	movq (%ebx,%edx,1), %mm1
+	movq (%ebx,%edx,2), %mm2
+	movq (%ebx,%eax,1), %mm3
+
+	transpose_4x4
+
+	movq %mm1, (%ebx,%edx,1)
+	movq %mm2, (%ebx,%edx,2)
+	movq %mm3, (%ebx,%eax,1)
+
+	coupled
+
+	movq %mm0, (%ebx)
+	movq (%ebx,%edx,1), %mm3
+	movq %mm3, %mm0
+
+	coupled
+
+	movq %mm0, (%ebx,%edx,1)
+	movq (%ebx,%edx,2), %mm3
+	movq %mm3, %mm0
+
+	coupled
+
+	movq %mm0, (%ebx,%edx,2)
+	movq (%ebx,%eax,1), %mm3
+	movq %mm3, %mm0
+
+	coupled
+
+	movq %mm1, 0(%edi)	# s1[k-1]
+	movq %mm2, 8(%edi)	# s2[k-1]
+
+	movq %mm0, %mm3
+	movq (%ebx), %mm0
+	movq (%ebx,%edx,1), %mm1
+	movq (%ebx,%edx,2), %mm2
+
+	transpose_4x4
+
+	movq %mm0, (%ebx)
+	movq %mm1, (%ebx,%edx,1)
+	movq %mm2, (%ebx,%edx,2)
+	movq %mm3, (%ebx,%eax,1)
+
+	addl $8, %ebx
+	decl %ecx
+	jnz .cascade_horlr_line_loop
+
+	emms
+
+	pop %edi
+	pop %esi
+	pop %ebx
+	leave
+	ret
+
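The coupled form above rotates the two state words by the complex pole (ar + i*ai) each sample. A scalar floating point reference of the recurrence in the header comment (a sketch, ignoring the s0.15 scaling):

    /* coupled second order section: pole = ar + i*ai, output taps c0..c2 */
    typedef struct { float ar, ai, c0, c1, c2, s1, s2; } cascade;

    static float cascade_step(cascade *s, float x)
    {
        float s1 = s->ar * s->s1 + s->ai * s->s2 + x;          /* s1[k] */
        float s2 = s->ar * s->s2 - s->ai * s->s1;              /* s2[k] */
        float y  = s->c0 * x + s->c1 * s->s1 + s->c2 * s->s2;  /* y[k]  */
        s->s1 = s1;
        s->s2 = s2;
        return y;
    }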
diff --git a/system/mmx/pixel_conv_hor_s16.s b/system/mmx/pixel_conv_hor_s16.s
new file mode 100644
index 0000000..e90a692
--- /dev/null
+++ b/system/mmx/pixel_conv_hor_s16.s
@@ -0,0 +1,134 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+	# intermediate function
+
+	# input in register:
+	# %mm0: left 4 pixels
+	# %mm1: middle 4 pixels
+	# %mm2: right 4 pixels
+
+	# %mm5: left 4 pixel masks
+	# %mm6: middle 4 pixel masks
+	# %mm7: right 4 pixel masks
+
+	# return in register:
+	# %mm0: middle 4 pixels result
+
+	.conv_hor_4_pixels:
+	.align 16
+
+	# compute quadruplet
+
+	# get left pixels
+	psrlq $48, %mm0		# shift word 3 to word 0
+	movq %mm1, %mm4
+	psllq $16, %mm4		# shift words 0,1,2 to 1,2,3
+	por %mm4, %mm0		# combine
+	pmulhw %mm5, %mm0
+	psllw $1, %mm0
+
+	# get middle pixels
+	movq %mm1, %mm4
+	pmulhw %mm6, %mm4
+	psllw $1, %mm4
+	paddsw %mm4, %mm0
+
+	# get right pixels
+	movq %mm2, %mm3
+	psllq $48, %mm3		# shift word 0 to word 3
+	movq %mm1, %mm4
+	psrlq $16, %mm4		# shift words 1,2,3 to 0,1,2
+	por %mm4, %mm3		# combine
+	pmulhw %mm7, %mm3
+	psllw $1, %mm3
+	paddsw %mm3, %mm0	# accumulate
+
+	ret
+
+.globl pixel_conv_hor_s16
+.type pixel_conv_hor_s16,@function
+
+# pixel_conv_hor_s16(short int *pixel_array, int nb_4_pixel_vectors, short int border[4], short int mask[12])
+# horizontal pixel conv (e.g. 1/4 1/2 1/4)
+# NOT TESTED
+
+pixel_conv_hor_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %esi
+	push %edi
+
+	movl 8(%ebp), %esi	# pixel array offset
+	movl 12(%ebp), %ecx	# nb of 4 pixel vectors in a row (at least 3)
+
+	movl 20(%ebp), %edi	# mask vector
+	movq (%edi), %mm5
+	movq 8(%edi), %mm6
+	movq 16(%edi), %mm7
+
+	movl 16(%ebp), %edi	# boundary pixel vector
+
+	movq (%edi), %mm0	# init regs with left border vector
+	movq (%esi), %mm1
+	movq 8(%esi), %mm2
+
+	decl %ecx		# loop has 2 terminator stubs
+	decl %ecx		# todo: handle if ecx < 3
+
+	jmp .conv_line_loop
+
+	.align 16
+	.conv_line_loop:
+	call .conv_hor_4_pixels	# compute conv
+	movq %mm0, (%esi)	# store result
+	movq %mm1, %mm0		# mm0 <- prev (%esi)
+	movq %mm2, %mm1		# mm1 <- 8(%esi)
+	movq 16(%esi), %mm2	# mm2 <- 16(%esi)
+
+	addl $8, %esi		# increase pointer
+	decl %ecx
+	jnz .conv_line_loop
+
+	call .conv_hor_4_pixels	# compute conv
+	movq %mm0, (%esi)	# store result
+	movq %mm1, %mm0		# mm0 <- prev (%esi)
+	movq %mm2, %mm1		# mm1 <- 8(%esi)
+	movq (%edi), %mm2	# mm2 <- border
+
+	call .conv_hor_4_pixels	# compute last vector
+	movq %mm0, 8(%esi)	# store it
+
+	emms
+
+	pop %edi
+	pop %esi
+	leave
+	ret
+
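Per pixel the kernel above computes a 3 tap convolution, with left/middle/right weights taken from the three mask vectors and the border vector used beyond the row ends. A scalar model with float weights instead of the s.15 masks (a sketch, assuming all four words of each mask vector carry the same weight):

    /* dst[i] = wl*src[i-1] + wm*src[i] + wr*src[i+1] */
    static void conv_hor_ref(short *dst, const short *src, int n,
                             float wl, float wm, float wr, short border)
    {
        for (int i = 0; i < n; i++) {
            short l = (i > 0)     ? src[i - 1] : border;
            short r = (i < n - 1) ? src[i + 1] : border;
            dst[i] = (short)(wl * l + wm * src[i] + wr * r);
        }
    }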
diff --git a/system/mmx/pixel_conv_ver_s16.s b/system/mmx/pixel_conv_ver_s16.s
new file mode 100644
index 0000000..ae2456f
--- /dev/null
+++ b/system/mmx/pixel_conv_ver_s16.s
@@ -0,0 +1,128 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+# TODO: fix out of bounds access in conv_ver and conv_hor
+
+	# intermediate function
+
+	# input in register:
+	# %mm0: top 4 pixels
+	# %mm1: middle 4 pixels
+	# %mm2: bottom 4 pixels
+
+	# %mm5: top 4 pixel mask
+	# %mm6: middle 4 pixel mask
+	# %mm7: bottom 4 pixel mask
+
+	# return in register:
+	# %mm0: middle 4 pixels result
+
+	.conv_ver_4_pixels:
+	.align 16
+
+	# compute quadruplet
+
+	# get top pixel
+	pmulhw %mm5, %mm0
+	psllw $1, %mm0
+
+	# get middle pixel
+	movq %mm1, %mm4
+	pmulhw %mm6, %mm4
+	psllw $1, %mm4
+	paddsw %mm4, %mm0
+
+	# get bottom pixel
+	movq %mm2, %mm3
+	pmulhw %mm7, %mm3
+	psllw $1, %mm3
+	paddsw %mm3, %mm0
+
+	ret
+
+.globl pixel_conv_ver_s16
+.type pixel_conv_ver_s16,@function
+
+# pixel_conv_ver_s16(short int *pixel_array, int nb_4_pixel_vectors, int row_byte_size, short int border[4], short int mask[12])
+# vertical pixel conv (e.g. 1/4 1/2 1/4)
+# NOT TESTED
+
+pixel_conv_ver_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %esi
+	push %edi
+
+	movl 8(%ebp), %esi	# pixel array offset
+	movl 12(%ebp), %ecx	# nb of 4 pixel vectors in a column (at least 3)
+	movl 16(%ebp), %edx	# rowsize in bytes
+
+	movl 24(%ebp), %edi	# mask vector
+	movq (%edi), %mm5
+	movq 8(%edi), %mm6
+	movq 16(%edi), %mm7
+
+	movl 20(%ebp), %edi	# edge vector
+
+	shll $1, %edx
+	decl %ecx		# loop has a terminator stub
+	decl %ecx		# loop has another terminator stub
+
+	movq (%edi), %mm0	# init regs with top border vector
+	movq (%esi), %mm1
+	movq (%esi,%edx,1), %mm2
+	jmp .conv_line_loop
+
+	.align 16
+	.conv_line_loop:
+	call .conv_ver_4_pixels	# compute conv
+	movq %mm0, (%esi)	# store result
+	movq %mm1, %mm0		# mm0 <- prev (%esi)
+	movq %mm2, %mm1		# mm1 <- (%esi,%edx,1)
+	movq (%esi,%edx,2), %mm2	# mm2 <- (%esi,%edx,2)
+
+	addl %edx, %esi		# increase pointer
+	decl %ecx
+	jnz .conv_line_loop
+
+	call .conv_ver_4_pixels	# compute conv
+	movq %mm0, (%esi)	# store result
+	movq %mm1, %mm0		# mm0 <- prev (%esi)
+	movq %mm2, %mm1		# mm1 <- (%esi,%edx,1)
+	movq (%edi), %mm2	# clear invalid edge vector
+
+	addl %edx, %esi		# increase pointer
+	call .conv_ver_4_pixels	# compute last vector
+	movq %mm0, (%esi)	# store it
+
+	emms
+
+	pop %edi
+	pop %esi
+	leave
+	ret
+
diff --git a/system/mmx/pixel_crot_s16.s b/system/mmx/pixel_crot_s16.s
new file mode 100644
index 0000000..2427869
--- /dev/null
+++ b/system/mmx/pixel_crot_s16.s
@@ -0,0 +1,153 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_crot3d_s16
+.type pixel_crot3d_s16,@function
+
+# 3 dimensional colour space rotation
+# 3x3 matrix is column encoded, each coefficient is a 4x16 bit fixed point vector
+
+# void pixel_crot3d_s16(int *buf, int nb_4pixel_vectors_per_plane, short int *matrix)
+
+pixel_crot3d_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %esi
+	push %edi
+
+	movl 8(%ebp), %esi	# input array
+	movl 12(%ebp), %ecx	# pixel count
+	movl 16(%ebp), %edi	# rotation matrix
+	movl %ecx, %edx
+	shll $3, %edx		# %edx = plane spacing
+
+	.align 16
+	.loop_crot3d:
+
+	movq (%esi), %mm0	# get 1st component
+	movq (%esi,%edx,1), %mm6	# get 2nd component
+	movq (%esi,%edx,2), %mm7	# get 3rd component
+
+	movq %mm0, %mm1		# copy 1st component
+	movq %mm0, %mm2
+
+	pmulhw (%edi), %mm0	# mul first column
+	pmulhw 8(%edi), %mm1
+	pmulhw 16(%edi), %mm2
+
+	movq %mm6, %mm5		# copy 2nd component
+	movq %mm6, %mm3
+
+	pmulhw 24(%edi), %mm6	# mul second column
+	pmulhw 32(%edi), %mm5
+	pmulhw 40(%edi), %mm3
+
+	paddsw %mm6, %mm0	# accumulate
+	paddsw %mm5, %mm1
+	paddsw %mm3, %mm2
+
+	movq %mm7, %mm4		# copy 3rd component
+	movq %mm7, %mm6
+
+	pmulhw 48(%edi), %mm4	# mul third column
+	pmulhw 56(%edi), %mm6
+	pmulhw 64(%edi), %mm7
+
+	paddsw %mm4, %mm0	# accumulate
+	paddsw %mm6, %mm1
+	paddsw %mm7, %mm2
+
+	paddsw %mm0, %mm0	# double (fixed point normalization)
+	paddsw %mm1, %mm1
+	paddsw %mm2, %mm2
+
+	movq %mm0, (%esi)	# store
+	movq %mm1, (%esi,%edx,1)
+	movq %mm2, (%esi,%edx,2)
+
+	addl $8, %esi		# increment source pointer
+	decl %ecx
+	jnz .loop_crot3d	# loop
+
+	emms
+
+	pop %edi
+	pop %esi
+	leave
+	ret
+
+.globl pixel_crot2d_s16
+.type pixel_crot2d_s16,@function
+
+# 2 dimensional colour space rotation
+# 2x2 matrix is column encoded, each coefficient is a 4x16 bit fixed point vector
+
+# void pixel_crot2d_s16(int *buf, int nb_4pixel_vectors_per_plane, short int *matrix)
+
+pixel_crot2d_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %esi
+	push %edi
+
+	movl 8(%ebp), %esi	# input array
+	movl 12(%ebp), %ecx	# pixel count
+	movl 16(%ebp), %edi	# rotation matrix
+	movl %ecx, %edx
+	shll $3, %edx		# %edx = plane spacing
+
+	.align 16
+	.loop_crot2d:
+
+	movq (%esi), %mm0	# get 1st component
+	movq (%esi,%edx,1), %mm2	# get 2nd component
+
+	movq %mm0, %mm1		# copy 1st component
+	movq %mm2, %mm3		# copy 2nd component
+
+	pmulhw (%edi), %mm0	# mul first column
+	pmulhw 8(%edi), %mm1
+
+	pmulhw 16(%edi), %mm2	# mul second column
+	pmulhw 24(%edi), %mm3
+
+	paddsw %mm2, %mm0	# accumulate
+	paddsw %mm3, %mm1
+
+	paddsw %mm0, %mm0	# fixed point gain correction
+	paddsw %mm1, %mm1
+
+	movq %mm0, (%esi)	# store
+	movq %mm1, (%esi,%edx,1)
+
+	addl $8, %esi		# increment source pointer
+	decl %ecx
+	jnz .loop_crot2d	# loop
+
+	emms
+
+	pop %edi
+	pop %esi
+	leave
+	ret
+
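pixel_crot3d_s16 treats the three colour planes as a vector per pixel and multiplies by a 3x3 matrix stored column by column (each coefficient splatted across a 4x16 bit vector), doubling the result to undo the s.15 halving. Scalar float reference (a sketch):

    /* column-major m[9]: out = M * in, applied pixel by pixel on 3 planes */
    static void crot3d_ref(float *p0, float *p1, float *p2, int n, const float m[9])
    {
        for (int i = 0; i < n; i++) {
            float a = p0[i], b = p1[i], c = p2[i];
            p0[i] = m[0] * a + m[3] * b + m[6] * c;
            p1[i] = m[1] * a + m[4] * b + m[7] * c;
            p2[i] = m[2] * a + m[5] * b + m[8] * c;
        }
    }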
diff --git a/system/mmx/pixel_gain.s b/system/mmx/pixel_gain.s
new file mode 100644
index 0000000..5cd5057
--- /dev/null
+++ b/system/mmx/pixel_gain.s
@@ -0,0 +1,83 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_gain
+.type pixel_gain,@function
+
+# mmx rgba pixel gain
+# void pixel_gain(unsigned char *pixel_array, int nb_pixels, short int rgba_gain[4])
+# gains are 7.9 fixed point for rgba
+
+pixel_gain:
+	pushl %ebp
+	movl %esp, %ebp
+	push %esi
+	push %edi
+
+	movl 8(%ebp), %esi	# pixel array offset
+	movl 12(%ebp), %ecx	# nb of elements
+	movl 16(%ebp), %edi	# int16[4] array of gains
+
+	prefetch (%esi)
+
+	emms
+	sarl $2, %ecx		# process 4 pixels per loop iteration
+	jz .exit
+	movq (%edi), %mm7	# read gain array from memory
+	jmp .loop_gain
+
+	.align 16
+	.loop_gain:
+
+	prefetch 128(%esi)
+	movq (%esi), %mm5	# load pixel 1-2 from memory
+	movq 8(%esi), %mm6	# load pixel 3-4 from memory
+	pxor %mm0, %mm0		# zero mm0 - mm3
+	pxor %mm1, %mm1
+	pxor %mm2, %mm2
+	pxor %mm3, %mm3
+	punpcklbw %mm5, %mm0	# unpack 1st pixel into 8.8 bit ints
+	punpckhbw %mm5, %mm1	# unpack 2nd
+	punpcklbw %mm6, %mm2	# unpack 3rd
+	punpckhbw %mm6, %mm3	# unpack 4th
+	psrlw $0x1, %mm0	# shift right to clear sign bit: 9.7
+	psrlw $0x1, %mm1
+	psrlw $0x1, %mm2
+	psrlw $0x1, %mm3
+
+	pmulhw %mm7, %mm0	# multiply 1st pixel 9.7 * 7.9 -> 16.0
+	pmulhw %mm7, %mm1	# multiply 2nd
+	pmulhw %mm7, %mm2	# multiply 3rd
+	pmulhw %mm7, %mm3	# multiply 4th
+
+	packuswb %mm1, %mm0	# pack & saturate to 8bit vector
+	movq %mm0, (%esi)	# store result in memory
+	packuswb %mm3, %mm2	# pack & saturate to 8bit vector
+	movq %mm2, 8(%esi)	# store result in memory
+
+	addl $16, %esi		# increment source pointer
+	decl %ecx
+	jnz .loop_gain		# loop
+
+	.exit:
+	emms
+
+	pop %edi
+	pop %esi
+	leave
+	ret
+
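With the 7.9 gain convention (512 == unity) the byte pipeline above reduces to out = clamp(v * g >> 9): unpacking against zero yields v<<8, the sign clearing shift makes it v<<7 (the 9.7 format), and pmulhw's implicit >>16 leaves v*g>>9 before packuswb saturates back to bytes. One byte lane in C (a sketch):

    #include <stdint.h>

    /* g is 7.9 fixed point, so g == 512 means gain 1.0 */
    static uint8_t gain_u8(uint8_t v, int16_t g)
    {
        int32_t out = ((int32_t)v * g) >> 9;  /* ((v<<7) * g) >> 16 */
        return out > 255 ? 255 : (out < 0 ? 0 : (uint8_t)out);
    }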
diff --git a/system/mmx/pixel_gain_s16.s b/system/mmx/pixel_gain_s16.s
new file mode 100644
index 0000000..adcfdf5
--- /dev/null
+++ b/system/mmx/pixel_gain_s16.s
@@ -0,0 +1,71 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_gain_s16
+.type pixel_gain_s16,@function
+
+# gain is an integer, shift is a right (down) shift count
+# void pixel_gain_s16(int *buf, int nb_8pixel_vectors, short int gain[4], unsigned long long *shift)
+
+pixel_gain_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %esi
+	push %edi
+
+	movl 20(%ebp), %edi
+	movq (%edi), %mm6	# get shift vector
+
+	movl 16(%ebp), %edi
+	movq (%edi), %mm7	# get gain vector
+
+	movl 8(%ebp), %esi	# input array
+	movl 12(%ebp), %ecx	# pixel count
+
+	.align 16
+	.loop_gain:
+
+	movq (%esi), %mm0	# load 4 pixels from memory
+	movq %mm0, %mm1
+	pmulhw %mm7, %mm1	# apply gain (s15.0) fixed point, high word
+	pmullw %mm7, %mm0	# low word
+
+	movq %mm0, %mm2		# copy
+	movq %mm1, %mm3
+
+	punpcklwd %mm1, %mm0	# unpack lsw components
+	punpckhwd %mm3, %mm2	# unpack msw components
+
+	psrad %mm6, %mm0	# apply signed shift
+	psrad %mm6, %mm2
+
+	packssdw %mm2, %mm0	# pack result & saturate
+	movq %mm0, (%esi)	# store result
+
+	addl $8, %esi		# increment source pointer
+	decl %ecx
+	jnz .loop_gain		# loop
+
+	emms
+
+	pop %edi
+	pop %esi
+	leave
+	ret
+
diff --git a/system/mmx/pixel_mix_s16.s b/system/mmx/pixel_mix_s16.s
new file mode 100644
index 0000000..9bf41eb
--- /dev/null
+++ b/system/mmx/pixel_mix_s16.s
@@ -0,0 +1,68 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_mix_s16
+.type pixel_mix_s16,@function
+
+# mmx pixel mix (crossfade)
+# void pixel_mix_s16(int *left, int *right, int nb_4pixel_vectors,
+#                    short int gain_left[4], short int gain_right[4])
+
+pixel_mix_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %esi
+	push %edi
+
+	movl 20(%ebp), %edi	# int16[4] array of gains
+	movq (%edi), %mm6	# get left gain array
+
+	movl 24(%ebp), %edi	# int16[4] array of gains
+	movq (%edi), %mm7	# get right gain array
+
+	movl 8(%ebp), %edi	# left array
+	movl 12(%ebp), %esi	# right array
+	movl 16(%ebp), %ecx	# pixel count
+
+	.align 16
+	.loop_mix:
+
+#	prefetch 128(%esi)
+	movq (%esi), %mm1	# load right 4 pixels from memory
+	pmulhw %mm7, %mm1	# apply right gain
+	movq (%edi), %mm0	# load 4 left pixels from memory
+	pmulhw %mm6, %mm0	# apply left gain
+#	pslaw $1, %mm1		# shift left ((s).15 x (s).15 -> (s0).14)
+#	pslaw $1, %mm0
+	paddsw %mm0, %mm0	# no arithmetic shift left, so use add instead
+	paddsw %mm1, %mm1
+	paddsw %mm1, %mm0	# mix
+	movq %mm0, (%edi)
+	addl $8, %esi
+	addl $8, %edi
+	decl %ecx
+	jnz .loop_mix		# loop
+
+	emms
+
+	pop %edi
+	pop %esi
+	leave
+	ret
+
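pixel_mix_s16 is a plain crossfade; since MMX has no saturating arithmetic left shift, each s.15 product is doubled with paddsw before the final saturating add. Per element (a sketch, folding the three saturations into one):

    /* gl, gr in s.15: y = sat(2*(l*gl >> 16) + 2*(r*gr >> 16)) */
    static short mix_one(short l, short r, short gl, short gr)
    {
        int y = 2 * (((int)l * gl) >> 16) + 2 * (((int)r * gr) >> 16);
        return y > 32767 ? 32767 : (y < -32768 ? -32768 : (short)y);
    }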
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +# +.globl pixel_mul_s16 +.type pixel_mul_s16,@function + +# simple add +# void pixel_mul_s16(int *left, int *right, int nb_4pixel_vectors) + +pixel_mul_s16: + pushl %ebp + movl %esp, %ebp + push %esi + push %edi + + movl 8(%ebp), %edi # left array + movl 12(%ebp), %esi # right array + movl 16(%ebp), %ecx # pixel count + + + .align 16 + .loop_mix: + +# prefetch 128(%esi) + movq (%esi), %mm1 # load right 4 pixels from memory + movq (%edi), %mm0 # load 4 left pixels from memory + pmulhw %mm1, %mm0 # mul + psllw $1, %mm0 # fixed point shift correction + movq %mm0, (%edi) + addl $8, %esi + addl $8, %edi + decl %ecx + jnz .loop_mix # loop + + emms + + + pop %edi + pop %esi + leave + ret + diff --git a/system/mmx/pixel_pack_s16u8.s b/system/mmx/pixel_pack_s16u8.s new file mode 100644 index 0000000..57df702 --- /dev/null +++ b/system/mmx/pixel_pack_s16u8.s @@ -0,0 +1,126 @@ +# Pure Data Packet mmx routine. +# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +# +.globl pixel_pack_s16u8_y +.type pixel_pack_s16u8_y,@function + +# mmx rgba pixel gain +# void pixel_pack_s16u8_y(int *input, int *output, int nb_8pixel_vectors) + +pixel_pack_s16u8_y: + pushl %ebp + movl %esp, %ebp + push %esi + push %edi + +# movl 20(%ebp), %edi # int16[4] array of gains +# movq (%edi), %mm7 # get gain array +# psllw $1, %mm7 # adjust for shifted sign bit + + movl 8(%ebp), %esi # input array + movl 12(%ebp), %edi # output array + movl 16(%ebp), %ecx # pixel count + + pxor %mm6, %mm6 + + .align 16 + .loop_pack_y: + +# prefetch 128(%esi) + movq (%esi), %mm0 # load 4 pixels from memory +# pmulhw %mm7, %mm0 # apply gain + movq 8(%esi), %mm1 # load 4 pixels from memory +# pmulhw %mm7, %mm1 # apply gain + +# movq %mm0, %mm2 +# pcmpgtw %mm6, %mm2 # mm2 > 0 ? 0xffff : 0 +# pand %mm2, %mm0 + +# movq %mm1, %mm3 +# pcmpgtw %mm6, %mm3 # mm3 > 0 ? 
diff --git a/system/mmx/pixel_pack_s16u8.s b/system/mmx/pixel_pack_s16u8.s
new file mode 100644
index 0000000..57df702
--- /dev/null
+++ b/system/mmx/pixel_pack_s16u8.s
@@ -0,0 +1,126 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_pack_s16u8_y
+.type pixel_pack_s16u8_y,@function
+
+# pack signed 16 bit to unsigned 8 bit (y channel)
+# void pixel_pack_s16u8_y(int *input, int *output, int nb_8pixel_vectors)
+
+pixel_pack_s16u8_y:
+        pushl %ebp
+        movl %esp, %ebp
+        push %esi
+        push %edi
+
+#       movl 20(%ebp), %edi      # int16[4] array of gains
+#       movq (%edi), %mm7        # get gain array
+#       psllw $1, %mm7           # adjust for shifted sign bit
+
+        movl 8(%ebp), %esi       # input array
+        movl 12(%ebp), %edi      # output array
+        movl 16(%ebp), %ecx      # vector count
+
+        pxor %mm6, %mm6
+
+        .align 16
+        .loop_pack_y:
+
+#       prefetch 128(%esi)
+        movq (%esi), %mm0        # load 4 pixels from memory
+#       pmulhw %mm7, %mm0        # apply gain
+        movq 8(%esi), %mm1       # load 4 pixels from memory
+#       pmulhw %mm7, %mm1        # apply gain
+
+#       movq %mm0, %mm2
+#       pcmpgtw %mm6, %mm2       # mm2 > 0 ? 0xffff : 0
+#       pand %mm2, %mm0
+
+#       movq %mm1, %mm3
+#       pcmpgtw %mm6, %mm3       # mm3 > 0 ? 0xffff : 0
+#       pand %mm3, %mm1
+
+#       psllw $1, %mm0           # shift out sign bit
+#       psllw $1, %mm1           # shift out sign bit
+
+        psraw $7, %mm0           # shift 9.7 fixed point down to 8 bit integer
+        psraw $7, %mm1
+
+        packuswb %mm1, %mm0      # pack & saturate to 8 bit vector
+        movq %mm0, (%edi)        # store result in memory
+
+        addl $16, %esi           # increment source pointer
+        addl $8, %edi            # increment dest pointer
+        decl %ecx
+        jnz .loop_pack_y         # loop
+
+        emms
+
+        pop %edi
+        pop %esi
+        leave
+        ret
+
+.globl pixel_pack_s16u8_uv
+.type pixel_pack_s16u8_uv,@function
+
+# pack signed 16 bit to unsigned 8 bit, adding a 128 offset (u/v channels)
+# void pixel_pack_s16u8_uv(int *input, int *output, int nb_8pixel_vectors)
+
+pixel_pack_s16u8_uv:
+        pushl %ebp
+        movl %esp, %ebp
+        push %esi
+        push %edi
+
+#       movl 20(%ebp), %edi      # int16[4] array of gains
+#       movq (%edi), %mm7        # get gain array
+        movl 8(%ebp), %esi       # input array
+        movl 12(%ebp), %edi      # output array
+        movl 16(%ebp), %ecx      # vector count
+
+        pcmpeqw %mm6, %mm6
+        psllw $15, %mm6
+        movq %mm6, %mm5
+        psrlw $8, %mm5
+        por %mm5, %mm6           # mm6 <- 8 times 0x80
+
+        .align 16
+        .loop_pack_uv:
+
+#       prefetch 128(%esi)
+        movq (%esi), %mm0        # load 4 pixels from memory
+#       pmulhw %mm7, %mm0        # apply gain
+        movq 8(%esi), %mm1       # load 4 pixels from memory
+#       pmulhw %mm7, %mm1        # apply gain
+
+        psraw $8, %mm0           # shift high byte down to low byte
+        psraw $8, %mm1
+
+        packsswb %mm1, %mm0      # pack & saturate to 8 bit vector
+        pxor %mm6, %mm0          # flip sign bits (add 128 offset)
+        movq %mm0, (%edi)        # store result in memory
+
+        addl $16, %esi           # increment source pointer
+        addl $8, %edi            # increment dest pointer
+        decl %ecx
+        jnz .loop_pack_uv        # loop
+
+        emms
+
+        pop %edi
+        pop %esi
+        leave
+        ret
+
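In C terms the two pack conversions amount to the following sketch (helper names hypothetical). Luma is stored as 9.7 fixed point (pixel value times 128), chroma as a zero-centered value that gets its 128 offset back on output:

    #include <stdint.h>

    static uint8_t pack_y(int16_t v)      /* psraw $7 + packuswb */
    {
        int32_t b = v >> 7;
        return (uint8_t)(b < 0 ? 0 : b > 255 ? 255 : b);
    }

    static uint8_t pack_uv(int16_t v)     /* psraw $8 + packsswb + pxor */
    {
        int32_t b = v >> 8;               /* already -128..127, so the
                                             packsswb saturation never fires */
        return (uint8_t)((b & 0xff) ^ 0x80);  /* flip sign bit = add 128 */
    }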
diff --git a/system/mmx/pixel_rand_s16.s b/system/mmx/pixel_rand_s16.s
new file mode 100644
index 0000000..649400b
--- /dev/null
+++ b/system/mmx/pixel_rand_s16.s
@@ -0,0 +1,76 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_rand_s16
+.type pixel_rand_s16,@function
+
+# 16 bit lfsr noise generator, 4 lanes in parallel
+# void pixel_rand_s16(int *dst, int nb_4pixel_vectors, short int random_seed[4])
+
+pixel_rand_s16:
+        pushl %ebp
+        movl %esp, %ebp
+        push %esi
+        push %edi
+
+        movl 16(%ebp), %esi      # int16[4] array of random seeds
+        movl 8(%ebp), %edi       # dst array
+        movl 12(%ebp), %ecx      # vector count
+
+        movq (%esi), %mm6
+
+
+        pcmpeqw %mm3, %mm3
+        psrlw $15, %mm3          # get bit mask, 4 times 0x0001
+
+        .align 16
+        .loop_rand:
+
+#       prefetch 128(%esi)
+
+
+        movq %mm6, %mm4          # get random vector
+        psrlw $15, %mm4          # get first tap (bit 15)
+        movq %mm6, %mm5
+        psrlw $14, %mm5          # get second tap (bit 14)
+        pxor %mm5, %mm4
+        movq %mm6, %mm5
+        psrlw $12, %mm5          # get third tap (bit 12)
+        pxor %mm5, %mm4
+        movq %mm6, %mm5
+        psrlw $3, %mm5           # get fourth tap (bit 3)
+        pxor %mm5, %mm4
+
+        psllw $1, %mm6           # shift left original random vector
+        pand %mm3, %mm4          # isolate new bit
+        por %mm4, %mm6           # combine into new random vector
+
+        movq %mm6, (%edi)
+        addl $8, %edi
+        decl %ecx
+        jnz .loop_rand           # loop
+
+
+        movq %mm6, (%esi)        # store random seeds
+
+        emms
+
+        pop %edi
+        pop %esi
+        leave
+        ret
+
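Each of the four 16 bit lanes runs an independent Fibonacci LFSR; the shift/xor sequence above taps bits 15, 14, 12 and 3. A one-lane sketch in C (hypothetical name):

    #include <stdint.h>

    /* one step of the 16 bit lfsr used above: the new bit is the xor
       of bits 15, 14, 12 and 3, shifted in from the right */
    static uint16_t lfsr_step(uint16_t s)
    {
        uint16_t bit = ((s >> 15) ^ (s >> 14) ^ (s >> 12) ^ (s >> 3)) & 1;
        return (uint16_t)((s << 1) | bit);
    }

Note that an all-zero seed is a fixed point of the recurrence, so the seed array must be nonzero to produce noise.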
diff --git a/system/mmx/pixel_randmix_s16.s b/system/mmx/pixel_randmix_s16.s
new file mode 100644
index 0000000..44e1702
--- /dev/null
+++ b/system/mmx/pixel_randmix_s16.s
@@ -0,0 +1,91 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_randmix_s16
+.type pixel_randmix_s16,@function
+
+# random mix: per component, select the left or right pixel by comparing a random vector against a threshold
+# void pixel_randmix_s16(int *left, int *right, int nb_4pixel_vectors, short int random_seed[4], short int threshold[4])
+
+pixel_randmix_s16:
+        pushl %ebp
+        movl %esp, %ebp
+        push %esi
+        push %edi
+
+        movl 20(%ebp), %edi      # int16[4] array of random seeds
+        movq (%edi), %mm6
+
+        movl 24(%ebp), %edi      # int16[4] array of thresholds
+        movq (%edi), %mm7
+
+        movl 8(%ebp), %edi       # left array
+        movl 12(%ebp), %esi      # right array
+        movl 16(%ebp), %ecx      # vector count
+
+        pcmpeqw %mm3, %mm3
+        psrlw $15, %mm3          # get bit mask, 4 times 0x0001
+
+        .align 16
+        .loop_randmix:
+
+#       prefetch 128(%esi)
+        movq (%esi), %mm1        # load right 4 pixels from memory
+        movq (%edi), %mm0        # load 4 left pixels from memory
+
+        movq %mm6, %mm2          # get random vector
+        pcmpgtw %mm7, %mm2       # compare random vector with threshold
+        movq %mm2, %mm5
+
+        pand %mm0, %mm2          # get left array's components
+        pandn %mm1, %mm5         # get right array's components
+        por %mm2, %mm5
+
+        movq %mm5, (%edi)        # store pixels
+
+        movq %mm6, %mm4          # get random vector
+        psrlw $15, %mm4          # get first tap (bit 15)
+        movq %mm6, %mm5
+        psrlw $14, %mm5          # get second tap (bit 14)
+        pxor %mm5, %mm4
+        movq %mm6, %mm5
+        psrlw $12, %mm5          # get third tap (bit 12)
+        pxor %mm5, %mm4
+        movq %mm6, %mm5
+        psrlw $3, %mm5           # get fourth tap (bit 3)
+        pxor %mm5, %mm4
+
+        psllw $1, %mm6           # shift left original random vector
+        pand %mm3, %mm4          # isolate new bit
+        por %mm4, %mm6           # combine into new random vector
+
+        addl $8, %esi
+        addl $8, %edi
+        decl %ecx
+        jnz .loop_randmix        # loop
+
+
+        movl 20(%ebp), %edi      # int16[4] array of random seeds
+        movq %mm6, (%edi)        # store random seeds
+
+        emms
+
+        pop %edi
+        pop %esi
+        leave
+        ret
+
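Per 16 bit component the random mix reduces to a compare-and-select; the pcmpgtw mask drives the pand/pandn/por sequence because MMX has no conditional move. Sketch (hypothetical name):

    #include <stdint.h>

    static int16_t randmix_component(int16_t left, int16_t right,
                                     int16_t rnd, int16_t threshold)
    {
        return (rnd > threshold) ? left : right;  /* pcmpgtw/pand/pandn/por */
    }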
diff --git a/system/mmx/pixel_s1.s b/system/mmx/pixel_s1.s
new file mode 100644
index 0000000..d6bc5ca
--- /dev/null
+++ b/system/mmx/pixel_s1.s
@@ -0,0 +1,201 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+
+        # this file contains ops for binary image processing
+        # images are encoded as 8x8 bit tiles
+        # low byte = bottom row
+        # low bit = right column
+        # %mm7 = scratch reg for all macros
+
+
+        # ************ load mask *******************
+        # compute bit masks for rows and columns
+        # %mm7: scratch reg
+
+        # load mask top
+        .macro ldmt count reg
+        pcmpeqb \reg, \reg
+        psllq $(64-(\count<<3)), \reg
+        .endm
+
+        # load mask bottom
+        .macro ldmb count reg
+        pcmpeqb \reg, \reg
+        psrlq $(64-(\count<<3)), \reg
+        .endm
+
+        # load mask top and bottom
+        .macro ldmtb count regt regb
+        ldmb \count, \regb
+        ldmt \count, \regt
+        .endm
+
+        # load mask right
+        .macro ldmr count reg
+        pcmpeqb %mm7, %mm7
+        psrlw $(16-\count), %mm7
+        movq %mm7, \reg
+        psllq $8, %mm7
+        por %mm7, \reg
+        .endm
+
+        # load mask left
+        .macro ldml count reg
+        pcmpeqb %mm7, %mm7
+        psllw $(16-\count), %mm7
+        movq %mm7, \reg
+        psrlq $8, %mm7
+        por %mm7, \reg
+        .endm
+
+        # load mask left and right
+        .macro ldmlr count regl regr
+        pcmpeqb %mm7, %mm7
+        psllw $(16-\count), %mm7
+        movq %mm7, \regl
+        psrlq $8, %mm7
+        por %mm7, \regl
+        movq \regl, \regr
+        psrlq $(8-\count), \regr
+        .endm
+
+        # ************* shift square **********
+        # shifts a square in reg, fills with zeros
+
+        # shift square top
+        .macro sst count reg
+        psllq $(\count<<3), \reg
+        .endm
+
+        # shift square bottom
+        .macro ssb count reg
+        psrlq $(\count<<3), \reg
+        .endm
+
+        # not tested
+        # shift square left
+        .macro ssl count reg
+        movq \reg, %mm7
+        pcmpeqb \reg, \reg
+        psllw $(16-\count), \reg
+        psrlw $8, \reg
+        pandn %mm7, \reg
+        psllw $(\count), \reg
+        .endm
+
+        # shift square right
+        .macro ssr count reg
+        movq \reg, %mm7
+        pcmpeqb \reg, \reg
+        psrlw $(16-\count), \reg
+        psllw $8, \reg
+        pandn %mm7, \reg
+        psrlw $(\count), \reg
+        .endm
+
+
+        # ********** combine square *************
+        # combines 2 squares
+
+        # combine right
+        .macro csr count regr reg
+        ssl \count, \reg
+        ssr (8-\count), \regr
+        por \regr, \reg
+        .endm
+
+        # combine left
+        .macro csl count regl reg
+        ssr \count, \reg
+        ssl (8-\count), \regl
+        por \regl, \reg
+        .endm
+
+        # combine top
+        .macro cst count regt reg
+        ssb \count, \reg
+        sst (8-\count), \regt
+        por \regt, \reg
+        .endm
+
+
+        # combine bottom
+        .macro csb count regb reg
+        sst \count, \reg
+        ssb (8-\count), \regb
+        por \regb, \reg
+        .endm
+
+
+        # ********** load combine square *************
+        # loads combined square using mask
+
+        # load combined square left
+        # mask should be count bits set right (i.e. 0x01)
+        .macro lcsml count mask source sourcel dstreg
+        movq \mask, \dstreg
+        movq \mask, %mm7
+        pandn \source, \dstreg
+        pand \sourcel, %mm7
+        psrlq $(\count), \dstreg
+        psllq $(8-\count), %mm7
+        por %mm7, \dstreg
+        .endm
+
+
+
+.globl pixel_test_s1
+.type pixel_test_s1,@function
+
+# test routine for the 8x8 bit square macros
+# void pixel_test_s1(void *dest, void *source, int nb_squares, int spacing)
+
+pixel_test_s1:
+        pushl %ebp
+        movl %esp, %ebp
+        push %esi
+        push %edi
+
+        movl 8(%ebp), %edi       # dest
+        movl 12(%ebp), %esi      # source
+        movl 16(%ebp), %ecx      # count
+        movl 20(%ebp), %edx      # row spacing (currently unused)
+
+        ldmr 1, %mm6
+        lcsml 1, %mm6, (%esi), 8(%esi), %mm0
+        movq %mm0, (%edi)
+
+
+#       movq (%esi), %mm0
+#       movq 8(%esi), %mm1
+#       csl 4, %mm1, %mm0
+#       movq %mm0, (%edi)
+
+        emms
+
+
+        pop %edi
+        pop %esi
+        leave
+        ret
+
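Assuming the tile encoding described at the top of this file (byte j = row j counted from the bottom, bit i = column i counted from the right), bit addressing and the row shifts can be modelled in C as follows (hypothetical names; a sketch only, since the column macros are marked not tested):

    #include <stdint.h>

    /* read one pixel from an 8x8 bit tile */
    static int tile_get(uint64_t tile, int row_from_bottom, int col_from_right)
    {
        return (int)((tile >> (8 * row_from_bottom + col_from_right)) & 1u);
    }

    /* sst/ssb with count = n: shift the square n rows toward the top
       or bottom, filling with zeros, via a whole-byte shift */
    static uint64_t tile_shift_up(uint64_t tile, int n)   { return tile << (8 * n); }
    static uint64_t tile_shift_down(uint64_t tile, int n) { return tile >> (8 * n); }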
diff --git a/system/mmx/pixel_unpack_u8s16.s b/system/mmx/pixel_unpack_u8s16.s
new file mode 100644
index 0000000..0fc14c2
--- /dev/null
+++ b/system/mmx/pixel_unpack_u8s16.s
@@ -0,0 +1,113 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_unpack_u8s16_y
+.type pixel_unpack_u8s16_y,@function
+
+# unpack unsigned 8 bit to signed 16 bit (y channel)
+# void pixel_unpack_u8s16_y(char *input, char *output, int nb_pixels_div8)
+
+pixel_unpack_u8s16_y:
+        pushl %ebp
+        movl %esp, %ebp
+        push %esi
+        push %edi
+
+#       movl 20(%ebp), %edi      # int16[4] array of gains
+#       movq (%edi), %mm7        # get gain array
+
+        movl 8(%ebp), %esi       # input uint8 pixel array
+        movl 12(%ebp), %edi      # output sint16 pixel array
+        movl 16(%ebp), %ecx      # nb of elements div 8
+
+
+        .align 16
+        .loop_unpack_y:
+
+        movq (%esi), %mm5        # load 8 pixels from memory
+        pxor %mm0, %mm0          # zero mm0 and mm1
+        pxor %mm1, %mm1
+        punpcklbw %mm5, %mm0     # unpack 1st 4 pixels into high bytes
+        punpckhbw %mm5, %mm1     # unpack 2nd 4 pixels
+        psrlw $0x1, %mm0         # shift right to clear sign bit (9.7 fixed point)
+        psrlw $0x1, %mm1
+#       pmulhw %mm7, %mm0        # apply gain
+#       pmulhw %mm7, %mm1
+#       paddsw %mm0, %mm0        # correct factor 2
+#       paddsw %mm1, %mm1
+        movq %mm0, (%edi)        # store
+        movq %mm1, 8(%edi)
+
+        addl $8, %esi            # increment source pointer
+        addl $16, %edi           # increment dest pointer
+        decl %ecx
+        jnz .loop_unpack_y       # loop
+
+        emms
+
+        pop %edi
+        pop %esi
+        leave
+        ret
+
+.globl pixel_unpack_u8s16_uv
+.type pixel_unpack_u8s16_uv,@function
+
+# unpack unsigned 8 bit to signed 16 bit, removing the 128 offset (u/v channels)
+# void pixel_unpack_u8s16_uv(char *input, char *output, int nb_pixels_div8)
+
+pixel_unpack_u8s16_uv:
+        pushl %ebp
+        movl %esp, %ebp
+        push %esi
+        push %edi
+
+#       movl 20(%ebp), %edi      # int16[4] array of gains
+#       movq (%edi), %mm7        # get gain array
+
+        movl 8(%ebp), %esi       # input uint8 pixel array
+        movl 12(%ebp), %edi      # output sint16 pixel array
+        movl 16(%ebp), %ecx      # nb of elements div 8
+
+        pcmpeqw %mm6, %mm6
+        psllw $15, %mm6
+
+        .align 16
+        .loop_unpack_uv:
+
+        movq (%esi), %mm5        # load 8 pixels from memory
+        pxor %mm0, %mm0          # zero mm0 and mm1
+        pxor %mm1, %mm1
+        punpcklbw %mm5, %mm0     # unpack 1st 4 pixels into high bytes
+        punpckhbw %mm5, %mm1     # unpack 2nd 4 pixels
+        pxor %mm6, %mm0          # flip sign bit (Cr and Cb are offset by 128)
+        pxor %mm6, %mm1
+#       pmulhw %mm7, %mm0        # apply gain
+#       pmulhw %mm7, %mm1
+#       paddsw %mm0, %mm0        # correct factor 2
+#       paddsw %mm1, %mm1
+        movq %mm0, (%edi)        # store
+        movq %mm1, 8(%edi)
+
+        addl $8, %esi            # increment source pointer
+        addl $16, %edi           # increment dest pointer
+        decl %ecx
+        jnz .loop_unpack_uv      # loop
+
+        emms
+
+        pop %edi
+        pop %esi
+        leave
+        ret
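The unpack conversions are the inverses of the pack routines above; per pixel they amount to the following sketch (hypothetical names). Both place the 8 bit value in the high byte via punpck*bw against a zeroed register; luma then clears the sign bit, chroma flips it to remove the 128 offset:

    #include <stdint.h>

    static int16_t unpack_y(uint8_t v)   /* punpck + psrlw $1 */
    {
        return (int16_t)(((uint16_t)v << 8) >> 1);      /* 9.7: 0..32640 */
    }

    static int16_t unpack_uv(uint8_t v)  /* punpck + pxor sign flip */
    {
        return (int16_t)(((uint16_t)v << 8) ^ 0x8000);  /* -32768..32512 */
    }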