removing PDP source (except debian files) before import of PDP 0.12.4

svn path=/trunk/externals/pdp/; revision=4217
author: Hans-Christoph Steiner <eighthave@users.sourceforge.net> 2005-12-15 07:26:47 +0000
committer: Hans-Christoph Steiner <eighthave@users.sourceforge.net> 2005-12-15 07:26:47 +0000
commit: 37b6643df2df7d784a31ca73f7bb90dc109c2401 (patch)
tree: a8664e5adcfcb60cae136063d627549ecb76619b /system/mmx
parent: c50ce0e0217ea07e2d450add2ab29cecea66fa96 (diff)
22 files changed, 0 insertions, 3243 deletions
diff --git a/system/mmx/Makefile b/system/mmx/Makefile
deleted file mode 100644
index 51e5052..0000000
--- a/system/mmx/Makefile
+++ /dev/null
@@ -1,32 +0,0 @@
-include ../../Makefile.config
-
-OBJ = \
-pixel_pack_s16u8.o \
-pixel_unpack_u8s16.o \
-pixel_add_s16.o \
-pixel_mul_s16.o \
-pixel_mix_s16.o \
-pixel_randmix_s16.o \
-pixel_conv_hor_s16.o \
-pixel_conv_ver_s16.o \
-pixel_affine_s16.o \
-pixel_biquad_s16.o \
-pixel_ca_s1.o \
-pixel_rand_s16.o \
-pixel_crot_s16.o \
-pixel_gain_s16.o \
-pixel_resample_s16.o \
-pixel_cheby_s16.o
-
-all:	$(OBJ)
-
-test:	pdp_mmx_test.o $(OBJ)
-	gcc -o pdp_mmx_test pdp_mmx_test.o $(OBJ) -g
-
-clean:
-	rm -f *.o
-	rm -f *~
-	rm -f pdp_mmx.a
-	rm -f pdp_mmx_test
-
-# all, test and clean name actions, not files: keep them runnable even if such a file exists
-.PHONY: all test clean
diff --git a/system/mmx/pdp_mmx_test.c b/system/mmx/pdp_mmx_test.c
deleted file mode 100644
index e93539f..0000000
--- a/system/mmx/pdp_mmx_test.c
+++ /dev/null
@@ -1,62 +0,0 @@
-#include "pdp_mmx.h"
-
-#define FP(x) ((short int)(((float)(x) * 2 * 256.0f)))
-/* FP(x) above: scales x by 2*256 as a float, then converts to short int */
-#define nbp 256
-/* 4 x 16 bit vectors, every lane 0x0100; none of them is referenced by the functions below */
-    short int a1[4] = {0x0100,0x0100,0x0100,0x0100};
-    short int a2[4] = {0x0100,0x0100,0x0100,0x0100};
-    short int b0[4] = {0x0100,0x0100,0x0100,0x0100};
-    short int b1[4] = {0x0100,0x0100,0x0100,0x0100};
-    short int b2[4] = {0x0100,0x0100,0x0100,0x0100};
-/* NOTE(review): names suggest biquad coefs (a1 a2 b0 b1 b2), state (u1 u2), inputs (x0-x3) -- confirm */
-    short int u1[4] = {0x0100,0x0100,0x0100,0x0100};
-    short int u2[4] = {0x0100,0x0100,0x0100,0x0100};
-
-    short int x0[4] = {0x0100,0x0100,0x0100,0x0100};
-    short int x1[4] = {0x0100,0x0100,0x0100,0x0100};
-    short int x2[4] = {0x0100,0x0100,0x0100,0x0100};
-    short int x3[4] = {0x0100,0x0100,0x0100,0x0100};
-
-void print_pixel(unsigned int i)
-{
-    /* any non-zero value is drawn as a set cell, zero as a clear one */
-    printf("%s", i ? "x " : ". ");
-}
-
-void print_line(void)
-{
-    printf("%c", '\n');	/* emit a single newline */
-}
-
-void print_square(unsigned char *c)
-{
-    int row, mask;
-
-    /* print 8 bytes as 8 rows of 8 cells: byte 7 first, msb of each byte first */
-    for (row = 7; row >= 0; row--) {
-	for (mask = 0x80; mask != 0; mask >>= 1) print_pixel(c[row] & mask);
-	printf("\n");
-    }
-}
-
-int main(void)
-{
-    /* 16 bytes, printed below as two 8x8 one-bit squares (bytes 0-7 and 8-15) */
-    unsigned char src[16]={1,2,3,4,5,6,7,8,-1,-2,-3,-4,-5,-6,-7,-8};
-    unsigned char dst[8];
-
-    /* show both input squares */
-    print_square(src);
-    print_line();
-    print_square(src+8);
-    print_line();
-
-    pixel_test_s1(dst,src,1,1);
-
-    /* show dst; presumably filled in by pixel_test_s1 -- TODO confirm */
-    print_square(dst);
-    print_line();
-
-    return 0;
-}
diff --git a/system/mmx/pixel_add_s16.s b/system/mmx/pixel_add_s16.s
deleted file mode 100644
index 8d4c7df..0000000
--- a/system/mmx/pixel_add_s16.s
+++ /dev/null
@@ -1,55 +0,0 @@
-#    Pure Data Packet mmx routine.
-#    Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
-# 
-#    This program is free software; you can redistribute it and/or modify
-#    it under the terms of the GNU General Public License as published by
-#    the Free Software Foundation; either version 2 of the License, or
-#    (at your option) any later version.
-# 
-#    This program is distributed in the hope that it will be useful,
-#    but WITHOUT ANY WARRANTY; without even the implied warranty of
-#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-#    GNU General Public License for more details.
-# 
-#    You should have received a copy of the GNU General Public License
-#    along with this program; if not, write to the Free Software
-#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-.globl pixel_add_s16
-.type  pixel_add_s16,@function
-
-# in place signed saturating add of 16 bit pixels: left[i] += right[i]
-# void pixel_add_s16(int *left, int *right, int nb_4pixel_vectors)
-# arguments are read from the stack; a count <= 0 is a no-op
-pixel_add_s16:
-	pushl %ebp
-	movl %esp, %ebp
-	push %esi
-	push %edi
-
-	movl 8(%ebp),  %edi	# left array
-	movl 12(%ebp), %esi	# right array
-	movl 16(%ebp), %ecx	# number of 4 pixel (8 byte) vectors
-	testl %ecx, %ecx
-	jle .add_done		# nothing to do: do not let decl wrap the counter
-	.align 16
-	.loop_mix:
-
-#	prefetch 128(%esi)
-	movq (%esi), %mm1	# load right 4 pixels from memory
-	movq (%edi), %mm0	# load 4 left pixels from memory
-	paddsw %mm1, %mm0	# signed saturating add
-	movq %mm0, (%edi)
-	addl $8, %esi
-	addl $8, %edi
-	decl %ecx
-	jnz .loop_mix		# loop
-	.add_done:
-	emms
-
-
-	pop %edi
-	pop %esi
-	leave
-	ret
-	
diff --git a/system/mmx/pixel_affine_s16.s b/system/mmx/pixel_affine_s16.s
deleted file mode 100644
index b357de3..0000000
--- a/system/mmx/pixel_affine_s16.s
+++ /dev/null
@@ -1,59 +0,0 @@
-#    Pure Data Packet mmx routine.
-#    Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
-# 
-#    This program is free software; you can redistribute it and/or modify
-#    it under the terms of the GNU General Public License as published by
-#    the Free Software Foundation; either version 2 of the License, or
-#    (at your option) any later version.
-# 
-#    This program is distributed in the hope that it will be useful,
-#    but WITHOUT ANY WARRANTY; without even the implied warranty of
-#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-#    GNU General Public License for more details.
-# 
-#    You should have received a copy of the GNU General Public License
-#    along with this program; if not, write to the Free Software
-#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-.globl pixel_affine_s16
-.type  pixel_affine_s16,@function
-# in place gain + offset on 16 bit pixels: buf[i] = (mulhi(buf[i], gain) << 1) + offset (saturating add)
-# void pixel_affine_s16(int *buf, int nb_4pixel_vectors, short int gain[4], short int offset[4])
-# arguments are read from the stack; a count <= 0 is a no-op
-pixel_affine_s16:
-	pushl %ebp
-	movl %esp, %ebp
-	push %esi
-	push %edi
-
-	movl 20(%ebp), %edi
-	movq (%edi), %mm6	# get offset vector
-
-	movl 16(%ebp), %edi
-	movq (%edi), %mm7	# get gain vector
-
-	movl 8(%ebp),  %esi	# input array
-	movl 12(%ebp), %ecx	# number of 4 pixel (8 byte) vectors
-	testl %ecx, %ecx
-	jle .affine_done	# nothing to do: do not let decl wrap the counter
-	.align 16
-	.loop_affine:
-
-#	prefetch 128(%esi)
-	movq (%esi), %mm0	# load 4 pixels from memory
-	pmulhw %mm7, %mm0	# apply gain (s).15 fixed point
-	psllw $1, %mm0		# apply correction shift
-	paddsw %mm6, %mm0	# add offset
-	movq %mm0, (%esi)	# store result in memory
-
-	addl $8, %esi		# increment source pointer
-	decl %ecx
-	jnz .loop_affine	# loop
-	.affine_done:		# mm6/mm7 were loaded above, so emms is needed on both paths
-	emms
-
-	pop %edi
-	pop %esi
-	leave
-	ret
-	
diff --git a/system/mmx/pixel_biquad_dirI_s16.s b/system/mmx/pixel_biquad_dirI_s16.s
deleted file mode 100644
index 1729502..0000000
--- a/system/mmx/pixel_biquad_dirI_s16.s
+++ /dev/null
@@ -1,361 +0,0 @@
-#    Pure Data Packet mmx routine.
-#    Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
-# 
-#    This program is free software; you can redistribute it and/or modify
-#    it under the terms of the GNU General Public License as published by
-#    the Free Software Foundation; either version 2 of the License, or
-#    (at your option) any later version.
-# 
-#    This program is distributed in the hope that it will be useful,
-#    but WITHOUT ANY WARRANTY; without even the implied warranty of
-#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-#    GNU General Public License for more details.
-# 
-#    You should have received a copy of the GNU General Public License
-#    along with this program; if not, write to the Free Software
-#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-
-
-	# TODO MOVE TO DIRECT FORM II
-	# y[k]  = b0 * x[k] + u1[k-1]
-	# u1[k] = b1 * x[k] + u2[k-1] - a1 * y[k]
-	# u2[k] = b2 * x[k]           - a2 * y[k]
-	
-	# input in register:	
-	# %mm0-mm3:	input 4x4 pixels {x0 x1 x2 x3}
-	# %esi:		coef memory  (a1, a2, b0, b1, b2)
-	# %edi:		state memory (u1, u2)
-
-	
-	# return in register:	 
-	# %mm0-mm3:	4x4 pixels result
-
-	
-	.biquad_4x4_pixels:
-	.align 16
-	# prescale: multiply all 4 inputs by the vector stored at -8(%esi), then double
-	movq -8(%esi), %mm4
-	pmulhw %mm4, %mm0
-	pmulhw %mm4, %mm1
-	pmulhw %mm4, %mm2
-	pmulhw %mm4, %mm3
-	psllw $1, %mm0
-	psllw $1, %mm1
-	psllw $1, %mm2
-	psllw $1, %mm3
-
-	# each step: u[k] = 2*(x[k] + a1*u[k-1] + a2*u[k-2]); y[k] = b0*u[k] + b1*u[k-1] + b2*u[k-2]
-	# first vector (all products are pmulhw high words)
-	movq 0(%edi), %mm4		# mm4 <- u[-1]
-	movq 8(%edi), %mm5		# mm5 <- u[-2]
-	movq %mm4, %mm6
-	movq %mm5, %mm7
-
-	pmulhw 0(%esi), %mm6		# multiply by a1
-	pmulhw 8(%esi), %mm7		# multiply by a2
-
-	paddsw %mm6, %mm0		# accumulate
-	paddsw %mm7, %mm0		# accumulate
-	paddsw %mm0, %mm0		# scale by 2 (since all fixed point muls are x*y/2)
-
-	movq %mm0, %mm6			# mm6 <- u[0]
-	movq %mm4, %mm7			# mm7 <- u[-1]
-	pmulhw 16(%esi), %mm0		# multiply by b0
-	pmulhw 24(%esi), %mm4		# multiply by b1
-	pmulhw 32(%esi), %mm5		# multiply by b2
-
-	paddsw %mm4, %mm0		# accumulate
-	paddsw %mm5, %mm0		# accumulate
-
-					# mm0 is result 0
-
-	# second vector
-	movq %mm6, %mm4			# mm4 <- u[0]
-	movq %mm7, %mm5			# mm5 <- u[-1]
-
-	pmulhw 0(%esi), %mm6		# multiply by a1
-	pmulhw 8(%esi), %mm7		# multiply by a2
-
-	paddsw %mm6, %mm1		# accumulate
-	paddsw %mm7, %mm1		# accumulate
-	paddsw %mm1, %mm1		# scale by 2
-
-
-	movq %mm1, %mm6			# mm6 <- u[1]
-	movq %mm4, %mm7			# mm7 <- u[0]
-	pmulhw 16(%esi), %mm1		# multiply by b0
-	pmulhw 24(%esi), %mm4		# multiply by b1
-	pmulhw 32(%esi), %mm5		# multiply by b2
-
-	paddsw %mm4, %mm1		# accumulate
-	paddsw %mm5, %mm1		# accumulate
-
-					# mm1 is result 1
-
-	# third vector
-	movq %mm6, %mm4			# mm4 <- u[1]
-	movq %mm7, %mm5			# mm5 <- u[0]
-
-	pmulhw 0(%esi), %mm6		# multiply by a1
-	pmulhw 8(%esi), %mm7		# multiply by a2
-
-	paddsw %mm6, %mm2		# accumulate
-	paddsw %mm7, %mm2		# accumulate
-	paddsw %mm2, %mm2		# scale by 2
-
-
-	movq %mm2, %mm6			# mm6 <- u[2]
-	movq %mm4, %mm7			# mm7 <- u[1]
-	pmulhw 16(%esi), %mm2		# multiply by b0
-	pmulhw 24(%esi), %mm4		# multiply by b1
-	pmulhw 32(%esi), %mm5		# multiply by b2
-
-	paddsw %mm4, %mm2		# accumulate
-	paddsw %mm5, %mm2		# accumulate
-
-					# mm2 is result 2
-
-	# fourth vector
-	movq %mm6, %mm4			# mm4 <- u[2]
-	movq %mm7, %mm5			# mm5 <- u[1]
-
-	pmulhw 0(%esi), %mm6		# multiply by a1
-	pmulhw 8(%esi), %mm7		# multiply by a2
-
-	paddsw %mm6, %mm3		# accumulate
-	paddsw %mm7, %mm3		# accumulate
-	paddsw %mm3, %mm3		# scale by 2
-
-	# write the two newest states back for the next call
-	movq %mm3, 0(%edi)		# store  u[3]
-	movq %mm4, 8(%edi)		# store  u[2]
-	pmulhw 16(%esi), %mm3		# multiply by b0
-	pmulhw 24(%esi), %mm4		# multiply by b1
-	pmulhw 32(%esi), %mm5		# multiply by b2
-
-	paddsw %mm4, %mm3		# accumulate
-	paddsw %mm5, %mm3		# accumulate
-
-					# mm3 is result 3
-
-	ret
-	
-
-	# in order to use the 4 line parallel biquad routine on horizontal
-	# lines, we need to reorder (rotate or transpose) the matrix, since
-	# images are scanline encoded, and we want to work in parallel
-	# on 4 lines.
-	#
-	# since the 4 lines are independent, it doesn't matter in which order
-	# the vector elements are present.
-	#
-	# this allows us to use the same routine for left->right and right->left
-	# processing.
-	#
-	# some comments on the non-abelian group of square isometries consisting of
-	# (I) identity
-	# (H) horizontal axis mirror
-	# (V) vertical axis mirror
-	# (T) transpose (diagonal axis mirror)
-	# (A) antitranspose (antidiagonal axis mirror)
-	# (R1) 90deg anticlockwise rotation
-	# (R2) 180deg rotation
-	# (R3) 90deg clockwise rotation
-	#
-	#
-	# we basically have two options: (R1,R3) or (T,A)
-	# we opt for T and A because they are self inverting, which improves locality
-	#
-	# use antitranspose for right to left and transpose
-	# for left to right (little endian)
-
-
-	# antitranspose 4x4
-
-	# input
-	# %mm3 == {d0 d1 d2 d3}
-	# %mm2 == {c0 c1 c2 c3}	
-	# %mm1 == {b0 b1 b2 b3}	
-	# %mm0 == {a0 a1 a2 a3}
-
-	# output
-	# %mm3 == {a3 b3 c3 d3}
-	# %mm2 == {a2 b2 c2 d2}
-	# %mm1 == {a1 b1 c1 d1}
-	# %mm0 == {a0 b0 c0 d0}
-
-	
-	.antitranspose_4x4:
-	.align 16
-	# stage 1: copy rows d and c, then interleave them with rows b and a
-	movq %mm3, %mm4
-	movq %mm3, %mm5
-	movq %mm2, %mm6
-	movq %mm2, %mm7
-	punpcklwd %mm1, %mm4	# mm4 <- {b2 d2 b3 d3}
-	punpckhwd %mm1, %mm5	# mm5 <- {b0 d0 b1 d1}
-	punpcklwd %mm0, %mm6	# mm6 <- {a2 c2 a3 c3}
-	punpckhwd %mm0, %mm7	# mm7 <- {a0 c0 a1 c1}
-
-	# stage 2: interleave the pairs into the four output rows
-	movq %mm4, %mm3
-	movq %mm4, %mm2
-	movq %mm5, %mm1
-	movq %mm5, %mm0
-	punpcklwd %mm6, %mm3	# mm3 <- {a3 b3 c3 d3}
-	punpckhwd %mm6, %mm2	# mm2 <- {a2 b2 c2 d2}
-	punpcklwd %mm7, %mm1	# mm1 <- {a1 b1 c1 d1}
-	punpckhwd %mm7, %mm0	# mm0 <- {a0 b0 c0 d0}
-
-	ret
-
-	
-
-	# transpose 4x4
-
-	# input
-	# %mm3 == {d3 d2 d1 d0}
-	# %mm2 == {c3 c2 c1 c0}	
-	# %mm1 == {b3 b2 b1 b0}	
-	# %mm0 == {a3 a2 a1 a0}
-
-	# output
-	# %mm3 == {d3 c3 b3 a3}
-	# %mm2 == {d2 c2 b2 a2}
-	# %mm1 == {d1 c1 b1 a1}
-	# %mm0 == {d0 c0 b0 a0}
-
-	
-	.transpose_4x4:
-	.align 16
-	# stage 1: copy rows a and b, then interleave them with rows c and d
-	movq %mm0, %mm4
-	movq %mm0, %mm5
-	movq %mm1, %mm6
-	movq %mm1, %mm7
-	punpcklwd %mm2, %mm4	# mm4 <- {c1 a1 c0 a0}
-	punpckhwd %mm2, %mm5	# mm5 <- {c3 a3 c2 a2}
-	punpcklwd %mm3, %mm6	# mm6 <- {d1 b1 d0 b0}
-	punpckhwd %mm3, %mm7	# mm7 <- {d3 b3 d2 b2}
-
-	# stage 2: interleave the pairs into the four output rows
-	movq %mm4, %mm0
-	movq %mm4, %mm1
-	movq %mm5, %mm2
-	movq %mm5, %mm3
-	punpcklwd %mm6, %mm0	# mm0 <- {d0 c0 b0 a0}
-	punpckhwd %mm6, %mm1	# mm1 <- {d1 c1 b1 a1}
-	punpcklwd %mm7, %mm2	# mm2 <- {d2 c2 b2 a2}
-	punpckhwd %mm7, %mm3	# mm3 <- {d3 c3 b3 a3}
-
-	ret
-
-	
-.globl pixel_biquad_vertb_s16
-.type  pixel_biquad_vertb_s16,@function
-
-# biquad run down a 4 pixel wide column strip, top to bottom, 4 rows per iteration
-# pixel_biquad_vertb_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
-# NOTE(review): arg 2 is used as the number of 4 row blocks and must be > 0 (decl/jnz loop) -- confirm with callers
-
-pixel_biquad_vertb_s16:
-
-
-	pushl %ebp
-	movl %esp, %ebp
-	push %ebx
-	push %esi
-	push %edi
-
-	movl 8(%ebp),  %ebx	# pixel array offset
-	movl 12(%ebp), %ecx	# nb of 4x4 pixblocks
-	movl 16(%ebp), %edx	# line width
-
-	movl 20(%ebp), %esi	# coefs
-	movl 24(%ebp), %edi	# state
-
-	shll $1, %edx		# short int addressing: edx = bytes per line
-	movl %edx, %eax
-	shll $1, %eax
-	addl %edx, %eax		# eax = 3 * edx
-
-	.align 16
-	.biquad_vertb_line_loop:
-	movq (%ebx), %mm0		# row 0
-	movq (%ebx,%edx,1), %mm1	# row 1
-	movq (%ebx,%edx,2), %mm2	# row 2
-	movq (%ebx,%eax,1), %mm3	# row 3
-	call .biquad_4x4_pixels
-	movq %mm0, (%ebx)
-	movq %mm1, (%ebx,%edx,1)
-	movq %mm2, (%ebx,%edx,2)
-	movq %mm3, (%ebx,%eax,1)
-	addl %edx, %ebx		# ebx += edx + 3 * edx: advance 4 rows
-	addl %eax, %ebx
-	decl %ecx
-	jnz .biquad_vertb_line_loop
-
-	emms
-
-	pop %edi
-	pop %esi
-	pop %ebx
-	leave
-	ret
-
-.globl pixel_biquad_horlr_s16
-.type  pixel_biquad_horlr_s16,@function
-
-# biquad run along 4 adjacent lines, left to right, one 4x4 block per iteration
-# pixel_biquad_horlr_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
-# NOTE(review): arg 2 is used as the number of 4x4 blocks and must be > 0 (decl/jnz loop) -- confirm with callers
-
-pixel_biquad_horlr_s16:
-
-
-	pushl %ebp
-	movl %esp, %ebp
-	push %ebx
-	push %esi
-	push %edi
-
-	movl 8(%ebp),  %ebx	# pixel array offset
-	movl 12(%ebp), %ecx	# nb of 4x4 pixblocks
-	movl 16(%ebp), %edx	# line width
-
-	movl 20(%ebp), %esi	# coefs
-	movl 24(%ebp), %edi	# state
-
-	shll $1, %edx		# short int addressing: edx = bytes per line
-	movl %edx, %eax
-	shll $1, %eax
-	addl %edx, %eax		# eax = 3 * edx
-
-	.align 16
-	.biquad_horlr_line_loop:
-	movq (%ebx), %mm0		# row 0
-	movq (%ebx,%edx,1), %mm1	# row 1
-	movq (%ebx,%edx,2), %mm2	# row 2
-	movq (%ebx,%eax,1), %mm3	# row 3
-	call .transpose_4x4		# turn the 4 rows into 4 column vectors
-	call .biquad_4x4_pixels
-	call .transpose_4x4		# transpose is self inverting: back to row order
-	movq %mm0, (%ebx)
-	movq %mm1, (%ebx,%edx,1)
-	movq %mm2, (%ebx,%edx,2)
-	movq %mm3, (%ebx,%eax,1)
-	addl $8, %ebx			# next block: 4 pixels (8 bytes) to the right
-	decl %ecx
-	jnz .biquad_horlr_line_loop
-
-	emms
-
-	pop %edi
-	pop %esi
-	pop %ebx
-	leave
-	ret
-
-
-
diff --git a/system/mmx/pixel_biquad_s16.s b/system/mmx/pixel_biquad_s16.s
deleted file mode 100644
index 844b041..0000000
--- a/system/mmx/pixel_biquad_s16.s
+++ /dev/null
@@ -1,451 +0,0 @@
-#    Pure Data Packet mmx routine.
-#    Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
-# 
-#    This program is free software; you can redistribute it and/or modify
-#    it under the terms of the GNU General Public License as published by
-#    the Free Software Foundation; either version 2 of the License, or
-#    (at your option) any later version.
-# 
-#    This program is distributed in the hope that it will be useful,
-#    but WITHOUT ANY WARRANTY; without even the implied warranty of
-#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-#    GNU General Public License for more details.
-# 
-#    You should have received a copy of the GNU General Public License
-#    along with this program; if not, write to the Free Software
-#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-
-	
-	# DIRECT FORM II BIQUAD
-	#
-	# y[k]  = b0 * x[k] + u1[k-1]
-	# u1[k] = b1 * x[k] + u2[k-1] - a1 * y[k]
-	# u2[k] = b2 * x[k]           - a2 * y[k]
-	# MACRO:	df2 <reg>
-	#
-	# computes a direct form 2 biquad
-	# does not use {mm0-mm3}\<inreg>
-	#
-	# input:	<reg>   == input
-	#		%mm4    == state 1
-	#		%mm5    == state 2
-	#		(%esi)  == biquad coefs (-a1 -a2 b0 b1 b2) in s1.14
-	# output:	<reg>   == output
-	#		%mm4    == state 1
-	#		%mm5    == state 2
-
-	.macro df2 reg 
-	movq \reg, %mm6			# mm6 == x[k]
-	movq \reg, %mm7			# mm7 == x[k]
-	pmulhw 16(%esi), %mm6		# mm6 == x[k] * b0
-	pmulhw 24(%esi), %mm7		# mm7 == x[k] * b1
-	paddw %mm4, %mm6		# mm6 == x[k] * b0 + u1[k-1] == y[k] (wrapping add, not yet rescaled)
-	paddw %mm5, %mm7		# mm7 == x[k] * b1 + u2[k-1]
-	paddsw %mm6, %mm6		# compensate for mul = x*y/4 (coefs are s1.14 fixed point)
-	paddsw %mm6, %mm6		# paddsw ensures saturation
-	movq \reg, %mm5			# mm5 == x[k] (old state 2 was consumed above)
-	movq %mm6, %mm4			# mm4 == y[k]
-	movq %mm6, \reg			# reg == y[k]	--------------------
-	pmulhw 0(%esi), %mm4		# mm4 == y[k] * (-a1)
-	pmulhw 8(%esi), %mm6		# mm6 == y[k] * (-a2)
-	pmulhw 32(%esi), %mm5		# mm5 == x[k] * b2
-	paddw %mm7, %mm4		# mm4 == u1[k]	--------------------
-	paddw %mm6, %mm5		# mm5 == u2[k]	--------------------
-	.endm				# mm6 and mm7 are left clobbered
-
-		
-	# input in register:	
-	# %mm0-mm3:	input 4x4 pixels {x0 x1 x2 x3}
-	# %esi:		coef memory  (-a1, -a2, b0, b1, b2) in s1.14
-	# %edi:		state memory (u1, u2)
-	
-	# return in register:	 
-	# %mm0-mm3:	4x4 pixels result
-
-
-
-	
-	.macro biquad_4x4_pixels	
-	.align 16			# NOTE(review): pads inline at every expansion site
-	movq 0(%edi), %mm4		# get state
-	movq 8(%edi), %mm5
-	df2 %mm0			# compute 4 biquads
-	df2 %mm1			# each df2 feeds its state (mm4, mm5) to the next one
-	df2 %mm2
-	df2 %mm3
-	movq %mm4, 0(%edi)		# store state
-	movq %mm5, 8(%edi)
-	.endm
-
-	
-
-	# in order to use the 4 line parallel biquad routine on horizontal
-	# lines, we need to reorder (rotate or transpose) the matrix, since
-	# images are scanline encoded, and we want to work in parallel
-	# on 4 lines.
-	#
-	# since the 4 lines are independent, it doesn't matter in which order
-	# the vector elements are present.
-	#
-	# this allows us to use the same routine for left->right and right->left
-	# processing.
-	#
-	# some comments on the non-abelian group of square isometries consisting of
-	# (I) identity
-	# (H) horizontal axis mirror
-	# (V) vertical axis mirror
-	# (T) transpose (diagonal axis mirror)
-	# (A) antitranspose (antidiagonal axis mirror)
-	# (R1) 90deg anticlockwise rotation
-	# (R2) 180deg rotation
-	# (R3) 90deg clockwise rotation
-	#
-	#
-	# we basically have two options: (R1,R3) or (T,A)
-	# we opt for T and A because they are self inverting, which improves locality
-	#
-	# use antitranspose for right to left and transpose
-	# for left to right (little endian)
-
-
-	# antitranspose 4x4
-
-	# input
-	# %mm3 == {d0 d1 d2 d3}
-	# %mm2 == {c0 c1 c2 c3}	
-	# %mm1 == {b0 b1 b2 b3}	
-	# %mm0 == {a0 a1 a2 a3}
-
-	# output
-	# %mm3 == {a3 b3 c3 d3}
-	# %mm2 == {a2 b2 c2 d2}
-	# %mm1 == {a1 b1 c1 d1}
-	# %mm0 == {a0 b0 c0 d0}
-
-	
-	.macro antitranspose_4x4
-	movq %mm3, %mm4
-	punpcklwd %mm1, %mm4	# mm4 <- {b2 d2 b3 d3}
-	movq %mm3, %mm5	
-	punpckhwd %mm1, %mm5	# mm5 <- {b0 d0 b1 d1}
-
-	movq %mm2, %mm6
-	punpcklwd %mm0, %mm6	# mm6 <- {a2 c2 a3 c3}
-	movq %mm2, %mm7	
-	punpckhwd %mm0, %mm7	# mm7 <- {a0 c0 a1 c1}
-	# second stage: interleave the pairs into the four output rows
-	movq %mm4, %mm3
-	punpcklwd %mm6, %mm3	# mm3 <- {a3 b3 c3 d3}
-	movq %mm4, %mm2
-	punpckhwd %mm6, %mm2	# mm2 <- {a2 b2 c2 d2}
-
-	movq %mm5, %mm1
-	punpcklwd %mm7, %mm1	# mm1 <- {a1 b1 c1 d1}
-	movq %mm5, %mm0
-	punpckhwd %mm7, %mm0	# mm0 <- {a0 b0 c0 d0}
-	# result is in mm0-mm3; mm4-mm7 are clobbered
-	.endm
-	
-
-	# transpose 4x4
-
-	# input
-	# %mm3 == {d3 d2 d1 d0}
-	# %mm2 == {c3 c2 c1 c0}	
-	# %mm1 == {b3 b2 b1 b0}	
-	# %mm0 == {a3 a2 a1 a0}
-
-	# output
-	# %mm3 == {d3 c3 b3 a3}
-	# %mm2 == {d2 c2 b2 a2}
-	# %mm1 == {d1 c1 b1 a1}
-	# %mm0 == {d0 c0 b0 a0}
-
-	
-	.macro transpose_4x4
-	movq %mm0, %mm4
-	punpcklwd %mm2, %mm4	# mm4 <- {c1 a1 c0 a0}
-	movq %mm0, %mm5	
-	punpckhwd %mm2, %mm5	# mm5 <- {c3 a3 c2 a2}
-
-	movq %mm1, %mm6
-	punpcklwd %mm3, %mm6	# mm6 <- {d1 b1 d0 b0}
-	movq %mm1, %mm7	
-	punpckhwd %mm3, %mm7	# mm7 <- {d3 b3 d2 b2}
-	# second stage: interleave the pairs into the four output rows
-	movq %mm4, %mm0
-	punpcklwd %mm6, %mm0	# mm0 <- {d0 c0 b0 a0}
-	movq %mm4, %mm1
-	punpckhwd %mm6, %mm1	# mm1 <- {d1 c1 b1 a1}
-
-	movq %mm5, %mm2
-	punpcklwd %mm7, %mm2	# mm2 <- {d2 c2 b2 a2}
-	movq %mm5, %mm3
-	punpckhwd %mm7, %mm3	# mm3 <- {d3 c3 b3 a3}
-	# result is in mm0-mm3; mm4-mm7 are clobbered
-	.endm
-	
-.globl pixel_biquad_vertb_s16
-.type  pixel_biquad_vertb_s16,@function
-
-# biquad run down a 4 pixel wide column strip, top to bottom, 4 rows per iteration
-# pixel_biquad_vertb_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
-# NOTE(review): arg 2 is used as the number of 4 row blocks and must be > 0 (decl/jnz loop) -- confirm with callers
-
-pixel_biquad_vertb_s16:
-
-
-	pushl %ebp
-	movl %esp, %ebp
-	push %ebx
-	push %esi
-	push %edi
-
-	movl 8(%ebp),  %ebx	# pixel array offset
-	movl 12(%ebp), %ecx	# nb of 4x4 pixblocks
-	movl 16(%ebp), %edx	# line width
-
-	movl 20(%ebp), %esi	# coefs
-	movl 24(%ebp), %edi	# state
-
-	shll $1, %edx		# short int addressing: edx = bytes per line
-	movl %edx, %eax
-	shll $1, %eax
-	addl %edx, %eax		# eax = 3 * edx
-
-	.align 16
-	.biquad_vertb_line_loop:
-	movq (%ebx), %mm0		# row 0
-	movq (%ebx,%edx,1), %mm1	# row 1
-	movq (%ebx,%edx,2), %mm2	# row 2
-	movq (%ebx,%eax,1), %mm3	# row 3
-	biquad_4x4_pixels
-	movq %mm0, (%ebx)
-	movq %mm1, (%ebx,%edx,1)
-	movq %mm2, (%ebx,%edx,2)
-	movq %mm3, (%ebx,%eax,1)
-	addl %edx, %ebx		# ebx += edx + 3 * edx: advance 4 rows
-	addl %eax, %ebx
-	decl %ecx
-	jnz .biquad_vertb_line_loop
-
-	emms
-
-	pop %edi
-	pop %esi
-	pop %ebx
-	leave
-	ret
-.globl pixel_biquad_verbt_s16
-.type  pixel_biquad_verbt_s16,@function
-
-# biquad run up a 4 pixel wide column strip, bottom to top, 4 rows per iteration
-# pixel_biquad_verbt_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
-# NOTE(review): arg 2 is used as the number of 4 row blocks and must be > 0 (decl/jnz loop) -- confirm with callers
-
-pixel_biquad_verbt_s16:
-
-
-	pushl %ebp
-	movl %esp, %ebp
-	push %ebx
-	push %esi
-	push %edi
-
-	movl 8(%ebp),  %ebx	# pixel array offset
-	movl 12(%ebp), %ecx	# nb of 4x4 pixblocks
-	movl 16(%ebp), %eax	# line width
-
-	shll $3, %eax		# 4 line byte spacing
-	decl %ecx
-	mul %ecx		# eax = (nb blocks - 1) * 4 line spacing; overwrites edx, reloaded below
-	incl %ecx
-	addl %eax, %ebx		# ebx points to last pixblock
-
-	movl 16(%ebp), %edx	# line width
-
-	movl 20(%ebp), %esi	# coefs
-	movl 24(%ebp), %edi	# state
-
-	shll $1, %edx		# short int addressing: edx = bytes per line
-	movl %edx, %eax
-	shll $1, %eax
-	addl %edx, %eax		# eax = 3 * edx
-
-	.align 16
-	.biquad_verbt_line_loop:
-	movq (%ebx), %mm3		# rows are loaded in reverse: the lowest row goes to mm0
-	movq (%ebx,%edx,1), %mm2
-	movq (%ebx,%edx,2), %mm1
-	movq (%ebx,%eax,1), %mm0
-	biquad_4x4_pixels
-	movq %mm3, (%ebx)
-	movq %mm2, (%ebx,%edx,1)
-	movq %mm1, (%ebx,%edx,2)
-	movq %mm0, (%ebx,%eax,1)
-	subl %edx, %ebx		# ebx -= edx + 3 * edx: step back 4 rows
-	subl %eax, %ebx
-	decl %ecx
-	jnz .biquad_verbt_line_loop
-
-	emms
-
-	pop %edi
-	pop %esi
-	pop %ebx
-	leave
-	ret
-
-.globl pixel_biquad_horlr_s16
-.type  pixel_biquad_horlr_s16,@function
-# pixel_biquad_horlr_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
-# biquad run along 4 adjacent lines, left to right, one 4x4 block per iteration
-pixel_biquad_horlr_s16:
-
-	# NOTE(review): arg 2 is used as the number of 4x4 blocks and must be > 0 (decl/jnz loop) -- confirm with callers
-	pushl %ebp
-	movl %esp, %ebp
-	push %ebx
-	push %esi
-	push %edi
-
-	movl 8(%ebp),  %ebx	# pixel array offset
-	movl 12(%ebp), %ecx	# nb of 4x4 pixblocks
-	movl 16(%ebp), %edx	# line width
-
-	movl 20(%ebp), %esi	# coefs
-	movl 24(%ebp), %edi	# state
-
-	shll $1, %edx		# short int addressing: edx = bytes per line
-	movl %edx, %eax
-	shll $1, %eax
-	addl %edx, %eax		# eax = 3 * edx
-
-	.align 16
-	.biquad_horlr_line_loop:
-	movq (%ebx), %mm0		# row 0
-	movq (%ebx,%edx,1), %mm1	# row 1
-	movq (%ebx,%edx,2), %mm2	# row 2
-	movq (%ebx,%eax,1), %mm3	# row 3
-	transpose_4x4			# turn the 4 rows into 4 column vectors
-	biquad_4x4_pixels
-	transpose_4x4			# transpose is self inverting: back to row order
-	movq %mm0, (%ebx)
-	movq %mm1, (%ebx,%edx,1)
-	movq %mm2, (%ebx,%edx,2)
-	movq %mm3, (%ebx,%eax,1)
-	addl $8, %ebx			# next block: 4 pixels (8 bytes) to the right
-	decl %ecx
-	jnz .biquad_horlr_line_loop
-
-	emms
-
-	pop %edi
-	pop %esi
-	pop %ebx
-	leave
-	ret
-
-
-.globl pixel_biquad_horrl_s16
-.type  pixel_biquad_horrl_s16,@function
-# pixel_biquad_horrl_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
-# biquad run along 4 adjacent lines, right to left, one 4x4 block per iteration
-pixel_biquad_horrl_s16:
-
-	pushl %ebp
-	movl %esp, %ebp
-	push %ebx
-	push %esi
-	push %edi
-
-	movl 8(%ebp),  %ebx	# pixel array offset
-	movl 12(%ebp), %ecx	# nb of 4x4 pixblocks
-	movl 16(%ebp), %edx	# line width
-
-	# NOTE(review): the block count must be > 0 (decl/jnz loop) -- confirm with callers
-	movl %ecx, %eax
-	decl %eax
-	shll $3, %eax		# (nb blocks - 1) * 8 bytes per block
-	addl %eax, %ebx		# ebx points to last pixblock
-
-
-	movl 20(%ebp), %esi	# coefs
-	movl 24(%ebp), %edi	# state
-
-	shll $1, %edx		# short int addressing: edx = bytes per line
-	movl %edx, %eax
-	shll $1, %eax
-	addl %edx, %eax		# eax = 3 * edx
-
-	.align 16
-	.biquad_horrl_line_loop:
-	movq (%ebx), %mm0		# row 0
-	movq (%ebx,%edx,1), %mm1	# row 1
-	movq (%ebx,%edx,2), %mm2	# row 2
-	movq (%ebx,%eax,1), %mm3	# row 3
-	antitranspose_4x4		# columns become vectors, rightmost column in mm0
-	biquad_4x4_pixels
-	antitranspose_4x4		# antitranspose is self inverting: back to row order
-	movq %mm0, (%ebx)
-	movq %mm1, (%ebx,%edx,1)
-	movq %mm2, (%ebx,%edx,2)
-	movq %mm3, (%ebx,%eax,1)
-	subl $8, %ebx			# next block: 4 pixels (8 bytes) to the left
-	decl %ecx
-	jnz .biquad_horrl_line_loop
-
-	emms
-
-	pop %edi
-	pop %esi
-	pop %ebx
-	leave
-	ret
-
-
-.globl pixel_biquad_time_s16
-.type  pixel_biquad_time_s16,@function
-# pixel_biquad_time_s16(short int *pixel_array, short int *s1, short int *s2, short int *coefs, int nb_4_pix_vectors)
-# one df2 step per 4 pixel vector; each vector has its own state in the s1 and s2 arrays
-pixel_biquad_time_s16: 
-
-	pushl %ebp
-	movl %esp, %ebp
-	push %ebx
-	push %esi
-	push %edi
-
-	movl 8(%ebp),  %ebx	# pixel array offset
-	movl 12(%ebp), %edx	# state 1 array
-	movl 16(%ebp), %edi	# state 2 array
-
-	movl 20(%ebp), %esi	# coefs
-	movl 24(%ebp), %ecx	# nb of 4 pixel vectors
-
-	# NOTE(review): the vector count must be > 0 (decl/jnz loop) -- confirm with callers
-	.align 16
-	.biquad_time_loop:
-	movq (%ebx), %mm0	# get input
-	movq (%edx), %mm4	# get state 1
-	movq (%edi), %mm5	# get state 2
-	df2 %mm0		# compute direct form 2
-	movq %mm0, (%ebx)	# write output
-	movq %mm5, (%edi)	# write state 2
-	movq %mm4, (%edx)	# write state 1
-	addl $8, %ebx		# advance all three arrays by one 8 byte vector
-	addl $8, %edi
-	addl $8, %edx
-	decl %ecx
-	jnz .biquad_time_loop
-
-	emms
-
-	pop %edi
-	pop %esi
-	pop %ebx
-	leave
-	ret
-
-
diff --git a/system/mmx/pixel_ca_s1.s b/system/mmx/pixel_ca_s1.s
deleted file mode 100644
index d9c730f..0000000
--- a/system/mmx/pixel_ca_s1.s
+++ /dev/null
@@ -1,189 +0,0 @@
-#    Pure Data Packet mmx routine.
-#    Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
-# 
-#    This program is free software; you can redistribute it and/or modify
-#    it under the terms of the GNU General Public License as published by
-#    the Free Software Foundation; either version 2 of the License, or
-#    (at your option) any later version.
-# 
-#    This program is distributed in the hope that it will be useful,
-#    but WITHOUT ANY WARRANTY; without even the implied warranty of
-#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-#    GNU General Public License for more details.
-# 
-#    You should have received a copy of the GNU General Public License
-#    along with this program; if not, write to the Free Software
-#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-
-	# this file contains assembler routines for 2D 1 bit cellular automata
-	# processing. it is organized around a feeder kernel and a
-	# stack based bit processor (virtual forth machine)
-	#
-	# the feeder kernel is responsible for loading/storing CA cells
-	# from/to memory. data in memory is organized as a scanline
-	# encoded toroidal bitplane (lsb = left). to simplify the kernel, the top
-	# left corner of the rectangular grid of pixels will shift down
-	# every processing step.
-	#
-	# the stack machine has the following architecture:
-	# CA stack:	%esi, TOS: %mm0 (32x2 pixels. lsw = top row)
-	# CA horizon:	%mm4-%mm7 (64x4 pixels. %mm4 = top row)
-	#
-	# the stack size / organization is not known to the stack machine.
-	# it can be thought of as operating on a 3x3 cell neighbourhood.
-	# the only purpose of the forth program is to determine the CA local update rule.
-	#
-	# the machine is supposed to be very minimal. no looping control.
-	# no addressing modes. no conditional code (hey, this is an experiment!)
-	# so recursion is not allowed (no way to stop it)
-	# there are 9 words to load the cell neighbourhood on the stack.
-	# the rest is just logic and stack manips.
-
-
-	# this file contains pure asm macros. it is to be included before assembly
-	# after scaforth.pl has processed the .scaf file
-	
-
-	# *************************** CA CELL ACCESS MACROS *****************************
-	# fetchTL - fetchBR
-
-	# shift / load rectangle macros:
-
-	# shift rectangle horizontal	
-	# result is in reg1
-	.macro shift reg1 reg2 count
-	psllq $(32+\count), \reg1	# reg1: low dword moved to the high half, shifted by count
-	psrlq $(32-\count), \reg2	# reg2: high dword moved to the low half, shifted by count
-	psrlq $32, \reg1		# reg1: result bits back in the low dword, high dword cleared
-	psllq $32, \reg2		# reg2: result bits back in the high dword, low dword cleared
-	por \reg2, \reg1		# combine both halves in reg1; reg2 is clobbered
-	.endm
-
-	.macro ldtop reg1 reg2
-	movq %mm4, \reg1	# horizon rows mm4 and mm5
-	movq %mm5, \reg2
-	.endm
-
-	.macro ldcenter reg1 reg2
-	movq %mm5, \reg1	# horizon rows mm5 and mm6
-	movq %mm6, \reg2
-	.endm
-
-	.macro ldbottom reg1 reg2
-	movq %mm6, \reg1	# horizon rows mm6 and mm7
-	movq %mm7, \reg2
-	.endm
-	
-
-	# fetch from top row
-
-	# fetch the top left square
-	.macro fetchTL
-	ldtop %mm0, %mm1
-	shift %mm0, %mm1, -1	# result in mm0 (TOS); mm1 is clobbered
-	.endm
-
-	# fetch the top mid square
-	.macro fetchTM
-	ldtop %mm0, %mm1
-	shift %mm0, %mm1, 0	# result in mm0 (TOS); mm1 is clobbered
-	.endm
-
-	# fetch the top right square
-	.macro fetchTR
-	ldtop %mm0, %mm1
-	shift %mm0, %mm1, 1	# result in mm0 (TOS); mm1 is clobbered
-	.endm
-
-
-	
-	# fetch from center row
-
-	# fetch the mid left square
-	.macro fetchML
-	ldcenter %mm0, %mm1
-	shift %mm0, %mm1, -1	# result in mm0 (TOS); mm1 is clobbered
-	.endm
-
-	# fetch the mid mid square
-	.macro fetchMM
-	ldcenter %mm0, %mm1
-	shift %mm0, %mm1, 0	# result in mm0 (TOS); mm1 is clobbered
-	.endm
-
-	# fetch the mid right square
-	.macro fetchMR
-	ldcenter %mm0, %mm1
-	shift %mm0, %mm1, 1	# result in mm0 (TOS); mm1 is clobbered
-	.endm
-
-
-	
-
-			
-	# fetch from bottom row
-
-	# fetch the bottom left square
-	.macro fetchBL
-	ldbottom %mm0, %mm1
-	shift %mm0, %mm1, -1	# result in mm0 (TOS); mm1 is clobbered
-	.endm
-
-	# fetch the bottom mid square
-	.macro fetchBM
-	ldbottom %mm0, %mm1
-	shift %mm0, %mm1, 0	# result in mm0 (TOS); mm1 is clobbered
-	.endm
-
-	# fetch the bottom right square
-	.macro fetchBR
-	ldbottom %mm0, %mm1
-	shift %mm0, %mm1, 1	# result in mm0 (TOS); mm1 is clobbered
-	.endm
-
-
-
-	# *************************** CA STACK MANIP MACROS *****************************
-	# dup drop dropdup swap nip dropover
-
-	.macro dup
-	lea -8(%esi), %esi	# grow the memory stack by one 8 byte slot
-	movq %mm0, (%esi)	# and copy TOS into it
-	.endm
-
-	.macro drop
-	movq (%esi), %mm0	# second item becomes TOS
-	lea 8(%esi), %esi	# and its memory slot is released
-	.endm
-
-	.macro dropdup
-	movq (%esi), %mm0	# TOS <- second item; the stack pointer is unchanged
-	.endm
-
-	.macro swap
-	movq (%esi), %mm1	# exchange TOS and the second item; mm1 is clobbered
-	movq %mm0, (%esi)
-	movq %mm1, %mm0
-	.endm
-
-	.macro nip
-	lea 8(%esi), %esi	# discard the second item, TOS is kept
-	.endm
-
-	.macro dropover
-	movq 8(%esi), %mm0	# TOS <- third item; the stack pointer is unchanged
-	.endm
-
-
-	# *************************** CA BOOLEAN LOGIC MACROS *****************************
-	# overxor 
-	
-	.macro overxor
-	pxor (%esi), %mm0	# TOS <- TOS xor second item; the second item stays on the stack
-	.endm	
-	
-	
-	
-	
-
diff --git a/system/mmx/pixel_cascade_s16.s b/system/mmx/pixel_cascade_s16.s
deleted file mode 100644
index bf88d08..0000000
--- a/system/mmx/pixel_cascade_s16.s
+++ /dev/null
@@ -1,330 +0,0 @@
-#    Pure Data Packet mmx routine.
-#    Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
-# 
-#    This program is free software; you can redistribute it and/or modify
-#    it under the terms of the GNU General Public License as published by
-#    the Free Software Foundation; either version 2 of the License, or
-#    (at your option) any later version.
-# 
-#    This program is distributed in the hope that it will be useful,
-#    but WITHOUT ANY WARRANTY; without even the implied warranty of
-#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-#    GNU General Public License for more details.
-# 
-#    You should have received a copy of the GNU General Public License
-#    along with this program; if not, write to the Free Software
-#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-
-
-	# TODO:	 COUPLED CASCADE SECOND ORDER SECTION
-	#
-	# s1[k] = ar * s1[k-1] + ai * s2[k-1] + x[k]
-	# s2[k] = ar * s2[k-1] - ai * s1[k-1]
-	# y[k]  = c0 * x[k] + c1 * s1[k-1] + c2 * s2[k-1]
-
-
-	# MACRO:	coupled
-	#
-	# computes one step of a coupled cascade section; clobbers %mm3-%mm6
-	#
-	# input:	%mm0    == input
-	#		%mm1    == state 1
-	#		%mm2    == state 2
-	#		(%esi)  == cascade coefs (ar ai c0 c1 c2) in s0.15
-	# output:	%mm0    == output
-	#		%mm1    == state 1
-	#		%mm2    == state 2
-	# NOTE(review): pmulhw keeps only the high 16 bits of each product and no
-	# compensating shift is applied here -- confirm the intended coef scaling.
-	.macro coupled
-	movq %mm1, %mm3			# mm3 == s1[k-1]
-	movq %mm1, %mm4			# mm4 == s1[k-1]
-	movq %mm2, %mm5			# mm5 == s2[k-1]
-	movq %mm2, %mm6			# mm6 == s2[k-1]
-	pmulhw (%esi), %mm1		# mm1 == s1[k-1] * ar
-	pmulhw 8(%esi), %mm3		# mm3 == s1[k-1] * ai
-	pmulhw 24(%esi), %mm4		# mm4 == s1[k-1] * c1
-	pmulhw (%esi), %mm2		# mm2 == s2[k-1] * ar
-	pmulhw 8(%esi), %mm5		# mm5 == s2[k-1] * ai
-	pmulhw 32(%esi), %mm6		# mm6 == s2[k-1] * c2
-	paddw %mm5, %mm1		# mm1 == s1[k-1] * ar + s2[k-1] * ai
-	psubw %mm3, %mm2		# mm2 == s2[k-1] * ar - s1[k-1] * ai == s2[k]
-	paddw %mm0, %mm1		# mm1 == s1[k]
-	pmulhw 16(%esi), %mm0		# mm0 == x[k] * c0
-	paddw %mm6, %mm4		# mm4 == s1[k-1] * c1 + s2[k-1] * c2
-	paddw %mm4, %mm0		# mm0 == y[k]
-	.endm
-	
-
-	
-
-	# in order to use the 4 line parallel cascade routine on horizontal
-	# lines, we need to reorder (rotate or transpose) the matrix, since
-	# images are scanline encoded, and we want to work in parallel
-	# on 4 lines.
-	#
-	# since the 4 lines are independent, it doesn't matter in which order
-	# the vector elements are present.
-	#
-	# this allows us to use the same routine for left->right and right->left
-	# processing.
-	#	
-	# some comments on the non-abelian group of square isometries consisting of
-	# (I) identity
-	# (H) horizontal axis mirror 
-	# (V) vertical axis mirror
-	# (T) transpose (diagonal axis mirror)
-	# (A) antitranspose (antidiagonal axis mirror)
-	# (R1) 90deg anticlockwise rotation
-	# (R2) 180deg rotation
-	# (R3) 90deg clockwise rotation
-	#
-	#	
-	# we basically have two options: (R1,R3) or (T,A)
-	# we opt for T and A because they are self inverting, which improves locality
-	#
-	# use antitranspose for right to left and transpose
-	# for left to right (little endian)
-
-
-	# antitranspose 4x4
-
-	# input
-	# %mm3 == {d0 d1 d2 d3}
-	# %mm2 == {c0 c1 c2 c3}	
-	# %mm1 == {b0 b1 b2 b3}	
-	# %mm0 == {a0 a1 a2 a3}
-
-	# output
-	# %mm3 == {a3 b3 c3 d3}
-	# %mm2 == {a2 b2 c2 d2}
-	# %mm1 == {a1 b1 c1 d1}
-	# %mm0 == {a0 b0 c0 d0}
-	# works in place on %mm0-%mm3; %mm4-%mm7 are used as scratch and clobbered
-	# (the macro name carries no trailing colon: .macro takes a bare name)
-	.macro antitranspose_4x4
-	movq %mm3, %mm4
-	punpcklwd %mm1, %mm4	# mm4 <- {b2 d2 b3 d3}
-	movq %mm3, %mm5	
-	punpckhwd %mm1, %mm5	# mm5 <- {b0 d0 b1 d1}
-			
-	movq %mm2, %mm6
-	punpcklwd %mm0, %mm6	# mm6 <- {a2 c2 a3 c3}
-	movq %mm2, %mm7	
-	punpckhwd %mm0, %mm7	# mm7 <- {a0 c0 a1 c1}
-
-	movq %mm4, %mm3
-	punpcklwd %mm6, %mm3	# mm3 <- {a3 b3 c3 d3}
-	movq %mm4, %mm2
-	punpckhwd %mm6, %mm2	# mm2 <- {a2 b2 c2 d2}
-		
-	movq %mm5, %mm1
-	punpcklwd %mm7, %mm1	# mm1 <- {a1 b1 c1 d1}
-	movq %mm5, %mm0
-	punpckhwd %mm7, %mm0	# mm0 <- {a0 b0 c0 d0}
-	
-	.endm
-	
-
-	# transpose 4x4
-
-	# input
-	# %mm3 == {d3 d2 d1 d0}
-	# %mm2 == {c3 c2 c1 c0}	
-	# %mm1 == {b3 b2 b1 b0}	
-	# %mm0 == {a3 a2 a1 a0}
-
-	# output
-	# %mm3 == {d3 c3 b3 a3}
-	# %mm2 == {d2 c2 b2 a2}
-	# %mm1 == {d1 c1 b1 a1}
-	# %mm0 == {d0 c0 b0 a0}
-	# works in place on %mm0-%mm3; %mm4-%mm7 are used as scratch and clobbered
-	# (the macro name carries no trailing colon: .macro takes a bare name)
-	.macro transpose_4x4
-	movq %mm0, %mm4
-	punpcklwd %mm2, %mm4	# mm4 <- {c1 a1 c0 a0}
-	movq %mm0, %mm5	
-	punpckhwd %mm2, %mm5	# mm5 <- {c3 a3 c2 a2}
-		
-	movq %mm1, %mm6
-	punpcklwd %mm3, %mm6	# mm6 <- {d1 b1 d0 b0}
-	movq %mm1, %mm7	
-	punpckhwd %mm3, %mm7	# mm7 <- {d3 b3 d2 b2}
-
-	movq %mm4, %mm0
-	punpcklwd %mm6, %mm0	# mm0 <- {d0 c0 b0 a0}
-	movq %mm4, %mm1
-	punpckhwd %mm6, %mm1	# mm1 <- {d1 c1 b1 a1}
-		
-	movq %mm5, %mm2
-	punpcklwd %mm7, %mm2	# mm2 <- {d2 c2 b2 a2}
-	movq %mm5, %mm3
-	punpckhwd %mm7, %mm3	# mm3 <- {d3 c3 b3 a3}
-
-	.endm
-	
-.globl pixel_cascade_vertb_s16
-.type  pixel_cascade_vertb_s16,@function
-
-
-# pixel_cascade_vertb_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
-# runs the coupled macro down one 4-pixel-wide column, 4 rows per loop pass; state[] is read on entry and written back on exit
-	
-pixel_cascade_vertb_s16: 
-
-		
-	pushl %ebp
-	movl %esp, %ebp
-	push %ebx
-	push %esi
-	push %edi
-
-	movl 8(%ebp),  %ebx	# pixel array offset
-	movl 12(%ebp), %ecx	# loop count; each pass filters 4 consecutive rows
-	movl 16(%ebp), %edx	# line width
-
-	movl 20(%ebp), %esi	# coefs
-	movl 24(%ebp), %edi	# state
-
-	shll $1, %edx		# short int addressing
-	subl %edx, %ebx		# start one row early: every step below pre-increments %ebx by one row
-
-	movq 0(%edi), %mm1	# s1[k-1]
-	movq 8(%edi), %mm2	# s2[k-1]
-	.align 16
-	.cascade_vertb_line_loop:
-	
-	movq (%ebx,%edx,1), %mm3	# fetch the next row (4 pixels)
-	movq %mm3, %mm0			# %mm0 is the input of coupled
-	addl %edx, %ebx			# advance to the row just fetched
-	coupled
-	movq %mm0, (%ebx)		# overwrite that row with the filter output
-	
-	movq (%ebx,%edx,1), %mm3
-	movq %mm3, %mm0
-	addl %edx, %ebx
-	coupled
-	movq %mm0, (%ebx)
-	
-	movq (%ebx,%edx,1), %mm3
-	movq %mm3, %mm0
-	addl %edx, %ebx
-	coupled
-	movq %mm0, (%ebx)
-	
-	movq (%ebx,%edx,1), %mm3
-	movq %mm3, %mm0
-	addl %edx, %ebx
-	coupled
-	movq %mm0, (%ebx)
-	
-	decl %ecx
-	jnz .cascade_vertb_line_loop
-		
-	movq %mm1, 0(%edi)	# save s1 for the next call
-	movq %mm2, 8(%edi)	# save s2 for the next call
-
-	emms
-	
-	pop %edi
-	pop %esi
-	pop %ebx
-	leave
-	ret
-
-.globl pixel_cascade_horlr_s16
-.type  pixel_cascade_horlr_s16,@function
-
-
-# pixel_cascade_horlr_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
-# filters 4 rows left to right, one 4x4 block per loop pass; state[] is reloaded and saved around every block
-	
-pixel_cascade_horlr_s16: 
-
-		
-	pushl %ebp
-	movl %esp, %ebp
-	push %ebx
-	push %esi
-	push %edi
-
-	movl 8(%ebp),  %ebx	# pixel array offset
-	movl 12(%ebp), %ecx	# nb of 4x4 pixblocks
-	movl 16(%ebp), %edx	# line width
-
-	movl 20(%ebp), %esi	# coefs
-	movl 24(%ebp), %edi	# state
-
-	shll $1, %edx		# short int addressing
-	movl %edx, %eax
-	shll $1, %eax
-	addl %edx, %eax		# eax = 3 * edx
-
-	
-	.align 16
-	.cascade_horlr_line_loop:
-	movq (%ebx), %mm0		# load the 4x4 block, one row per register
-	movq (%ebx,%edx,1), %mm1	
-	movq (%ebx,%edx,2), %mm2	
-	movq (%ebx,%eax,1), %mm3
-	
-	transpose_4x4			# each register now holds one column of the block
-	
-	movq %mm1, (%ebx,%edx,1)	# park columns 1-3 in the block until they are filtered
-	movq %mm2, (%ebx,%edx,2)	
-	movq %mm3, (%ebx,%eax,1)
-
-	# the state must be loaded only now: %mm1/%mm2 were needed for the pixel rows above
-	movq (%edi), %mm1		# s1[k-1]
-	movq 8(%edi), %mm2		# s2[k-1]
-	coupled
-
-	movq %mm0, (%ebx)		# filtered column 0
-	movq (%ebx,%edx,1), %mm3
-	movq %mm3, %mm0
-
-	coupled
-
-	movq %mm0, (%ebx, %edx,1)	# filtered column 1
-	movq (%ebx,%edx,2), %mm3
-	movq %mm3, %mm0
-
-	coupled
-
-	movq %mm0, (%ebx, %edx,2)	# filtered column 2
-	movq (%ebx,%eax,1), %mm3
-	movq %mm3, %mm0
-
-	coupled
-	
-	movq %mm1, 0(%edi)	# s1[k-1]
-	movq %mm2, 8(%edi)	# s2[k-1]
-
-	movq %mm0, %mm3		# filtered column 3 stays in a register
-	movq (%ebx), %mm0
-	movq (%ebx,%edx,1), %mm1	
-	movq (%ebx,%edx,2), %mm2	
-
-	transpose_4x4		# back to row order
-	
-	movq %mm0, (%ebx)
-	movq %mm1, (%ebx,%edx,1)
-	movq %mm2, (%ebx,%edx,2)	
-	movq %mm3, (%ebx,%eax,1)		
-
-	addl $8, %ebx		# next block: 4 pixels to the right
-	decl %ecx
-	jnz .cascade_horlr_line_loop
-		
-	emms
-	
-	pop %edi
-	pop %esi
-	pop %ebx
-	leave
-	ret
-
-
-
diff --git a/system/mmx/pixel_cheby_s16.s b/system/mmx/pixel_cheby_s16.s
deleted file mode 100644
index 2afe9e2..0000000
--- a/system/mmx/pixel_cheby_s16.s
+++ /dev/null
@@ -1,90 +0,0 @@
-#    Pure Data Packet mmx routine.
-#    Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
-# 
-#    This program is free software; you can redistribute it and/or modify
-#    it under the terms of the GNU General Public License as published by
-#    the Free Software Foundation; either version 2 of the License, or
-#    (at your option) any later version.
-# 
-#    This program is distributed in the hope that it will be useful,
-#    but WITHOUT ANY WARRANTY; without even the implied warranty of
-#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-#    GNU General Public License for more details.
-# 
-#    You should have received a copy of the GNU General Public License
-#    along with this program; if not, write to the Free Software
-#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-.globl pixel_cheby_s16_3plus
-.type  pixel_cheby_s16_3plus,@function
-
-# void pixel_cheby_s16_3plus(int *buf, int nb_4pixel_vectors, int order+1, short int *coefs)
-# order+1 must be at least 3: the inner loop tests its end condition only after consuming a coefficient
-
-# coefs are s2.13 fixed point (-4->4)
-pixel_cheby_s16_3plus:
-	pushl %ebp
-	movl %esp, %ebp
-	push %esi
-	push %edi
-	push %edx
-
-	movl 8(%ebp),  %esi	# input array
-	movl 12(%ebp), %ecx	# vector count
-	movl 16(%ebp), %eax	# get order+1
-
-	shll $3, %eax		# 8 bytes per coefficient vector
-	movl 20(%ebp), %edx
-	addl %eax, %edx		# edx = coef end address
-	
-#	jmp skip
-	
-	.align 16
-	.loop_cheby:	
-
-	movl 20(%ebp), %edi	# get coefs
-	movq (%esi), %mm0	# load 4 pixels from memory (mm0 = x)
-	pcmpeqw %mm2, %mm2
-	movq %mm0, %mm1		# mm1 (T_n-1) <- x
-	psrlw $1, %mm2		# mm2 (T_n-2) <- 1
-	
-
-	movq (%edi), %mm4	# mm4 (acc) == a0
-	psraw $1, %mm4		# mm4 == a0/2
-	movq %mm0, %mm5		# mm5 (intermediate)
-	pmulhw 8(%edi), %mm5	# mm5 == (x * a1)/2
-	paddsw %mm5, %mm4	# acc = c0 + c1 x
-	addl $16, %edi		# the first two coefficients are consumed
-
-	.loop_cheby_inner:	
-	movq %mm1, %mm3		# mm3 == T_n-1
-	psraw $2, %mm2		# mm2 == T_n-2 / 4
-	pmulhw %mm0, %mm3	# mm3 == (2 x T_n-1) / 4
-	psubsw %mm2, %mm3	# mm3 == (2 x T_n-1 - T_n-2) / 4
-	paddsw %mm3, %mm3
-	paddsw %mm3, %mm3	# mm3 == T_n
-	movq %mm1, %mm2		# mm2 == new T_n-2
-	movq %mm3, %mm1		# mm1 == new T_n-1
-	pmulhw (%edi), %mm3	# mm3 = a_n * T_n / 2
-	paddsw %mm3, %mm4	# accumulate
-	addl $8, %edi
-	cmpl %edx, %edi
-	jne .loop_cheby_inner
-	
-	paddsw %mm4, %mm4	# compensate for 0.125 factor
-	paddsw %mm4, %mm4
-	paddsw %mm4, %mm4
-	movq %mm4, (%esi)	# store result in memory
-	addl $8, %esi		# increment source/dest pointer
-	decl %ecx
-	jnz .loop_cheby		# loop
-
-skip:	
-	emms
-
-	pop %edx
-	pop %edi
-	pop %esi
-	leave
-	ret
-	
diff --git a/system/mmx/pixel_conv_hor_s16.s b/system/mmx/pixel_conv_hor_s16.s
deleted file mode 100644
index e90a692..0000000
--- a/system/mmx/pixel_conv_hor_s16.s
+++ /dev/null
@@ -1,134 +0,0 @@
-#    Pure Data Packet mmx routine.
-#    Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
-# 
-#    This program is free software; you can redistribute it and/or modify
-#    it under the terms of the GNU General Public License as published by
-#    the Free Software Foundation; either version 2 of the License, or
-#    (at your option) any later version.
-# 
-#    This program is distributed in the hope that it will be useful,
-#    but WITHOUT ANY WARRANTY; without even the implied warranty of
-#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-#    GNU General Public License for more details.
-# 
-#    You should have received a copy of the GNU General Public License
-#    along with this program; if not, write to the Free Software
-#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-	# intermediate function
-	
-	# input in register:	
-	# %mm0:	left 4 pixels
-	# %mm1:	middle 4 pixels
-	# %mm2:	right 4 pixels
-	
-	# %mm5:	left 4 pixel masks
-	# %mm6:	middle 4 pixel masks
-	# %mm7:	right 4 pixel masks
-	
-	# return in register:	 
-	# %mm0:	middle 4 pixels result
-	# clobbers %mm3 and %mm4; %mm1, %mm2 and the masks are preserved
-	
-	.conv_hor_4_pixels:	
-	.align 16
-	
-	# compute quadruplet
-
-	# get left pixels
-	psrlq $48, %mm0			# shift word 3 to word 0
-	movq %mm1, %mm4
-	psllq $16, %mm4			# shift word 0,1,2 to 1,2,3
-	por %mm4, %mm0			# combine
-	pmulhw %mm5, %mm0
-	psllw $1, %mm0			# undo the halving of the s0.15 product
-
-	
-	# get middle pixels
-	movq %mm1, %mm4
-	pmulhw %mm6, %mm4
-	psllw $1, %mm4
-	paddsw %mm4, %mm0	
-
-
-	# get right pixels
-	movq %mm2, %mm3
-	psllq $48, %mm3			# shift word 0 to word 3
-	movq %mm1, %mm4
-	psrlq $16, %mm4			# shift word 1,2,3 to 0,1,2
-	por %mm4, %mm3			# combine
-	pmulhw %mm7, %mm3
-	psllw $1, %mm3
-	paddsw %mm3, %mm0		# accumulate
-	
-	ret
-	
-.globl pixel_conv_hor_s16
-.type  pixel_conv_hor_s16,@function
-
-
-# pixel_conv_hor_s16(short int *pixel_array, int nb_4_pixel_vectors, short int border[4], short int mask[12])
-# horizontal 3-tap pixel conv, weights taken from mask[] (left, middle, right vectors)
-# NOT TESTED
-# nb_4_pixel_vectors must be at least 2; with exactly 2 the main loop is skipped
-	
-pixel_conv_hor_s16: 
-
-		
-	pushl %ebp
-	movl %esp, %ebp
-	push %esi
-	push %edi
-
-	movl 8(%ebp),  %esi	# pixel array offset
-	movl 12(%ebp), %ecx	# nb of 4 pixel vectors in a row (at least 2)
-
-	movl 20(%ebp), %edi	# mask vector
-	movq (%edi), %mm5
-	movq 8(%edi), %mm6
-	movq 16(%edi), %mm7
-	
-	movl 16(%ebp), %edi	# boundary pixel vector
-	
-	
-
-	movq (%edi), %mm0	# init regs (left edge: mm0 is the border vector)
-	movq (%esi), %mm1
-	movq 8(%esi), %mm2
-
-	decl %ecx		# loop has 2 terminator stubs
-	decl %ecx		# ecx = number of interior loop passes
-	jle .conv_hor_tail	# fewer than 3 vectors: no interior pass, avoid wrapping ecx
-	jmp .conv_line_loop
-
-
-	.align 16
-	.conv_line_loop:	
-	call .conv_hor_4_pixels	# compute conv 
-	movq %mm0, (%esi)	# store result
-	movq %mm1, %mm0		# mm0 <- prev (%esi)
-	movq %mm2, %mm1		# mm1 <- 8(%esi)
-	movq 16(%esi), %mm2	# mm2 <- 16(%esi)
-	
-	addl $8, %esi		# increase pointer
-	decl %ecx
-	jnz .conv_line_loop
-	.conv_hor_tail:
-	call .conv_hor_4_pixels	# compute conv 
-	movq %mm0, (%esi)	# store result
-	movq %mm1, %mm0		# mm0 <- prev (%esi)
-	movq %mm2, %mm1		# mm1 <- 8(%esi)
-	movq (%edi), %mm2	# mm2 <- border
-
-	call .conv_hor_4_pixels	# compute last vector
-	movq %mm0, 8(%esi)	# store it
-	
-	emms
-	
-	pop %edi
-	pop %esi
-	leave
-	ret
-
-
-
diff --git a/system/mmx/pixel_conv_ver_s16.s b/system/mmx/pixel_conv_ver_s16.s
deleted file mode 100644
index ae2456f..0000000
--- a/system/mmx/pixel_conv_ver_s16.s
+++ /dev/null
@@ -1,128 +0,0 @@
-#    Pure Data Packet mmx routine.
-#    Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
-# 
-#    This program is free software; you can redistribute it and/or modify
-#    it under the terms of the GNU General Public License as published by
-#    the Free Software Foundation; either version 2 of the License, or
-#    (at your option) any later version.
-# 
-#    This program is distributed in the hope that it will be useful,
-#    but WITHOUT ANY WARRANTY; without even the implied warranty of
-#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-#    GNU General Public License for more details.
-# 
-#    You should have received a copy of the GNU General Public License
-#    along with this program; if not, write to the Free Software
-#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-#TODO:	 fix out of bound acces in conv_ver and conv_hor
-	
-	# intermediate function
-	
-	# input in register:	
-	# %mm0:	top 4 pixels
-	# %mm1:	middle 4 pixels
-	# %mm2:	bottom 4 pixels
-
-	# %mm5:	top 4 pixel mask
-	# %mm6:	middle 4 pixel mask
-	# %mm7:	bottom 4 pixel mask
-	
-	# return in register:	 
-	# %mm0:	middle 4 pixels result
-	# clobbers %mm3 and %mm4; %mm1, %mm2 and the masks are preserved
-	
-	.conv_ver_4_pixels:	
-	.align 16
-	
-	# compute quadruplet
-
-	# get top pixel
-	pmulhw %mm5, %mm0
-	psllw $1, %mm0			# undo the halving of the s0.15 product
-	
-	# get middle pixel
-	movq %mm1, %mm4
-	pmulhw %mm6, %mm4
-	psllw $1, %mm4
-	paddsw %mm4, %mm0
-
-	# get bottom pixel
-	movq %mm2, %mm3
-	pmulhw %mm7, %mm3
-	psllw $1, %mm3			# mm3 <- 2 * mm3 (same correction as above)
-	paddsw %mm3, %mm0
-
-	ret
-	
-.globl pixel_conv_ver_s16
-.type  pixel_conv_ver_s16,@function
-
-
-# pixel_conv_ver_s16(short int *pixel_array, int nb_4_pixel_vectors, int row_byte_size, short int border[4], short int mask[12])
-# vertical 3-tap pixel conv on one 4-pixel-wide column, weights taken from mask[] (top, middle, bottom vectors)
-# NOT TESTED
-# nb_4_pixel_vectors must be at least 2; with exactly 2 the main loop is skipped
-	
-pixel_conv_ver_s16: 
-
-		
-	pushl %ebp
-	movl %esp, %ebp
-	push %esi
-	push %edi
-
-	movl 8(%ebp),  %esi		# pixel array offset
-	movl 12(%ebp), %ecx		# nb of 4 pixel vectors in the column (at least 2)
-	movl 16(%ebp), %edx		# row size (NOTE(review): doubled below, so presumably in pixels, not bytes -- confirm)
-
-	movl 24(%ebp), %edi		# mask vector
-	movq (%edi), %mm5
-	movq 8(%edi), %mm6
-	movq 16(%edi), %mm7
-	
-	movl 20(%ebp), %edi		# edge vector
-
-
-	shll $1, %edx
-	decl %ecx			# loop has a terminator stub
-	decl %ecx			# loop has another terminator stub
-	
-
-	movq (%edi), %mm0		# init regs (top edge: mm0 is the border vector)
-	movq (%esi), %mm1
-	movq (%esi,%edx,1), %mm2
-	jle .conv_ver_tail		# flags still from the last decl (movq leaves them alone): fewer than 3 rows
-	jmp .conv_line_loop
-
-	.align 16
-	.conv_line_loop:	
-	call .conv_ver_4_pixels		# compute conv 
-	movq %mm0, (%esi)		# store result
-	movq %mm1, %mm0			# mm0 <- prev (%esi)
-	movq %mm2, %mm1			# mm1 <- (%esi,%edx,1)
-	movq (%esi,%edx,2), %mm2	# mm2 <- (%esi,%edx,2)
-	
-	addl %edx, %esi			# increase pointer
-	decl %ecx
-	jnz .conv_line_loop
-	.conv_ver_tail:
-	call .conv_ver_4_pixels		# compute conv 
-	movq %mm0, (%esi)		# store result
-	movq %mm1, %mm0			# mm0 <- prev (%esi)
-	movq %mm2, %mm1			# mm1 <- (%esi,%edx,1)
-	movq (%edi), %mm2		# bottom edge: use the border vector
-
-	addl %edx, %esi			# increase pointer
-	call .conv_ver_4_pixels		# compute last vector
-	movq %mm0, (%esi)		# store it
-	
-	emms
-	
-	pop %edi
-	pop %esi
-	leave
-	ret
-
-
-
diff --git a/system/mmx/pixel_crot_s16.s b/system/mmx/pixel_crot_s16.s
deleted file mode 100644
index 2427869..0000000
--- a/system/mmx/pixel_crot_s16.s
+++ /dev/null
@@ -1,153 +0,0 @@
-#    Pure Data Packet mmx routine.
-#    Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
-# 
-#    This program is free software; you can redistribute it and/or modify
-#    it under the terms of the GNU General Public License as published by
-#    the Free Software Foundation; either version 2 of the License, or
-#    (at your option) any later version.
-# 
-#    This program is distributed in the hope that it will be useful,
-#    but WITHOUT ANY WARRANTY; without even the implied warranty of
-#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-#    GNU General Public License for more details.
-# 
-#    You should have received a copy of the GNU General Public License
-#    along with this program; if not, write to the Free Software
-#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-.globl pixel_crot3d_s16
-.type  pixel_crot3d_s16,@function
-
-
-# 3 dimensional colour space rotation
-# 3x3 matrix is column encoded, each coefficient is a 4x16 bit fixed point vector
-# the three planes are stored back to back and are transformed in place
-# void pixel_crot3d_s16(int *buf, int nb_4pixel_vectors_per_plane, short int *matrix)
-
-pixel_crot3d_s16:
-	pushl %ebp
-	movl %esp, %ebp
-	push %esi
-	push %edi
-
-	
-	movl 8(%ebp),  %esi	# input array
-	movl 12(%ebp), %ecx	# number of 4 pixel vectors per plane
-	movl 16(%ebp), %edi	# rotation matrix
-	movl %ecx, %edx
-	shll $3, %edx		# %edx = plane spacing
-
-	
-	.align 16
-	.loop_crot3d:	
-
-	movq (%esi), %mm0		# get 1st component
-	movq (%esi,%edx,1), %mm6	# get 2nd component
-	movq (%esi,%edx,2), %mm7	# get 3rd component
-
-	movq %mm0, %mm1			# copy 1st component
-	movq %mm0, %mm2
-
-	pmulhw (%edi), %mm0		# mul first column
-	pmulhw 8(%edi), %mm1
-	pmulhw 16(%edi), %mm2
-
-	movq %mm6, %mm5			# copy 2nd component
-	movq %mm6, %mm3
-
-	pmulhw 24(%edi), %mm6		# mul second column
-	pmulhw 32(%edi), %mm5
-	pmulhw 40(%edi), %mm3
-
-	paddsw %mm6, %mm0		# accumulate
-	paddsw %mm5, %mm1
-	paddsw %mm3, %mm2
-
-	movq %mm7, %mm4			# copy 3rd component
-	movq %mm7, %mm6
-
-	pmulhw 48(%edi), %mm4		# mul third column
-	pmulhw 56(%edi), %mm6
-	pmulhw 64(%edi), %mm7
-
-	paddsw %mm4, %mm0		# accumulate
-	paddsw %mm6, %mm1
-	paddsw %mm7, %mm2
-
-	paddsw %mm0, %mm0		# double (fixed point normalization)
-	paddsw %mm1, %mm1
-	paddsw %mm2, %mm2
-
-	movq %mm0, (%esi)		# store
-	movq %mm1, (%esi, %edx, 1)
-	movq %mm2, (%esi, %edx, 2)
-
-	addl $8, %esi			# increment source pointer
-	decl %ecx
-	jnz .loop_crot3d		# loop
-
-	emms
-	
-	pop %edi
-	pop %esi
-	leave
-	ret
-	
-
-.globl pixel_crot2d_s16
-.type  pixel_crot2d_s16,@function
-	
-# 2 dimensional colour space rotation
-# 2x2 matrix is column encoded, each coefficient is a 4x16 bit fixed point vector
-# the two planes are stored back to back and are transformed in place
-# void pixel_crot2d_s16(int *buf, int nb_4pixel_vectors_per_plane, short int *matrix)
-
-pixel_crot2d_s16:
-	pushl %ebp
-	movl %esp, %ebp
-	push %esi
-	push %edi
-
-	
-	movl 8(%ebp),  %esi	# input array
-	movl 12(%ebp), %ecx	# number of 4 pixel vectors per plane
-	movl 16(%ebp), %edi	# rotation matrix
-	movl %ecx, %edx
-	shll $3, %edx		# %edx = plane spacing
-
-	
-	.align 16
-	.loop_crot2d:	
-
-	movq (%esi), %mm0		# get 1st component
-	movq (%esi,%edx,1), %mm2	# get 2nd component
-
-	movq %mm0, %mm1			# copy 1st component
-	movq %mm2, %mm3			# copy 2nd component
-
-	pmulhw (%edi), %mm0		# mul first column
-	pmulhw 8(%edi), %mm1
-
-	pmulhw 16(%edi), %mm2		# mul second column
-	pmulhw 24(%edi), %mm3
-
-	paddsw %mm2, %mm0		# accumulate
-	paddsw %mm3, %mm1
-
-	paddsw %mm0, %mm0		# fixed point gain correction
-	paddsw %mm1, %mm1
-
-	movq %mm0, (%esi)		# store
-	movq %mm1, (%esi, %edx, 1)
-
-	addl $8, %esi			# increment source pointer
-	decl %ecx
-	jnz .loop_crot2d		# loop
-
-	emms
-	
-	pop %edi
-	pop %esi
-	leave
-	ret
-	
diff --git a/system/mmx/pixel_gain.s b/system/mmx/pixel_gain.s
deleted file mode 100644
index 5cd5057..0000000
--- a/system/mmx/pixel_gain.s
+++ /dev/null
@@ -1,83 +0,0 @@
-#    Pure Data Packet mmx routine.
-#    Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
-# 
-#    This program is free software; you can redistribute it and/or modify
-#    it under the terms of the GNU General Public License as published by
-#    the Free Software Foundation; either version 2 of the License, or
-#    (at your option) any later version.
-# 
-#    This program is distributed in the hope that it will be useful,
-#    but WITHOUT ANY WARRANTY; without even the implied warranty of
-#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-#    GNU General Public License for more details.
-# 
-#    You should have received a copy of the GNU General Public License
-#    along with this program; if not, write to the Free Software
-#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-.globl pixel_gain
-.type  pixel_gain,@function
-
-# mmx rgba pixel gain
-# void pixel_gain(char *pixelarray, int32 nbpixels, int *rgba_gain)
-# gains are 7.9 fixed point for rgba; the code reads them as one 8-byte vector of four 16 bit words
-
-pixel_gain:
-	pushl %ebp
-	movl %esp, %ebp
-	push %esi
-	push %edi
-
-	movl 8(%ebp),  %esi	# pixel array offset
-	movl 12(%ebp), %ecx	# nb of elements
-	movl 16(%ebp), %edi	# int16[4] array of gains
-
-	prefetch (%esi)
-
-	emms
-	sarl $2, %ecx		# process 4 pixels per loop iteration
-	jz .exit		# fewer than 4 pixels: nothing is processed
-	movq (%edi), %mm7	# read gain array from memory
-	jmp .loop_gain
-
-	.align 16
-	.loop_gain:	
-
-	prefetch 128(%esi)	
-	movq (%esi), %mm5	# load pixel 1-2  from memory
-	movq 8(%esi), %mm6	# load pixel 3-4  from memory
-	pxor %mm0, %mm0		# zero mm0 - mm3
-	pxor %mm1, %mm1
-	pxor %mm2, %mm2
-	pxor %mm3, %mm3
-	punpcklbw %mm5, %mm0	# unpack 1st pixel into 8.8 bit ints
-	punpckhbw %mm5, %mm1	# unpack 2nd
-	punpcklbw %mm6, %mm2	# unpack 3rd
-	punpckhbw %mm6, %mm3	# unpack 4th
-	psrlw $0x1, %mm0	# shift right to clear sign bit 9.7
-	psrlw $0x1, %mm1
-	psrlw $0x1, %mm2
-	psrlw $0x1, %mm3
-	
-	pmulhw %mm7, %mm0	# multiply 1st pixel 9.7 * 7.9 -> 16.0
-	pmulhw %mm7, %mm1	# multiply 2nd  
-	pmulhw %mm7, %mm2	# multiply 3rd
-	pmulhw %mm7, %mm3	# multiply 4th 
-
-	packuswb %mm1, %mm0	# pack & saturate to 8bit vector
-	movq %mm0, (%esi)	# store result in memory
-	packuswb %mm3, %mm2	# pack & saturate to 8bit vector
-	movq %mm2, 8(%esi)	# store result in memory
-
-	addl $16, %esi		# increment source pointer
-	decl %ecx
-	jnz .loop_gain		# loop
-
-	.exit:
-	emms
-	
-	pop %edi
-	pop %esi
-	leave
-	ret
-	
diff --git a/system/mmx/pixel_gain_s16.s b/system/mmx/pixel_gain_s16.s
deleted file mode 100644
index adcfdf5..0000000
--- a/system/mmx/pixel_gain_s16.s
+++ /dev/null
@@ -1,71 +0,0 @@
-#    Pure Data Packet mmx routine.
-#    Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
-# 
-#    This program is free software; you can redistribute it and/or modify
-#    it under the terms of the GNU General Public License as published by
-#    the Free Software Foundation; either version 2 of the License, or
-#    (at your option) any later version.
-# 
-#    This program is distributed in the hope that it will be useful,
-#    but WITHOUT ANY WARRANTY; without even the implied warranty of
-#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-#    GNU General Public License for more details.
-# 
-#    You should have received a copy of the GNU General Public License
-#    along with this program; if not, write to the Free Software
-#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-.globl pixel_gain_s16
-.type  pixel_gain_s16,@function
-
-# gain is integer, shift count is down	
-# void pixel_gain_s16(int *buf, int nb_4pixel_vectors, short int gain[4], unsigned long long *shift)
-
-pixel_gain_s16:
-	pushl %ebp
-	movl %esp, %ebp
-	push %esi
-	push %edi
-
-	movl 20(%ebp), %edi
-	movq (%edi), %mm6	# get shift vector
-
-	movl 16(%ebp), %edi
-	movq (%edi), %mm7	# get gain vector
-	
-	movl 8(%ebp),  %esi	# input array
-	movl 12(%ebp), %ecx	# number of 4 pixel vectors
-
-	
-	.align 16
-	.loop_gain:	
-
-	movq (%esi), %mm0	# load 4 pixels from memory
-	movq %mm0, %mm1		
-	pmulhw %mm7, %mm1	# apply gain (s15.0) fixed point, high word
-	pmullw %mm7, %mm0	# low word
-
-	movq %mm0, %mm2		# copy
-	movq %mm1, %mm3
-
-	punpcklwd %mm1, %mm0	# rebuild the 32 bit products of pixels 0 and 1
-	punpckhwd %mm3, %mm2	# rebuild the 32 bit products of pixels 2 and 3
-
-	psrad %mm6, %mm0	# apply signed shift
-	psrad %mm6, %mm2
-
-	packssdw %mm2, %mm0	# pack result & saturate
-	movq %mm0, (%esi)	# store result
-	
-
-	addl $8, %esi		# increment source pointer
-	decl %ecx
-	jnz .loop_gain		# loop
-
-	emms
-	
-	pop %edi
-	pop %esi
-	leave
-	ret
-	
diff --git a/system/mmx/pixel_mix_s16.s b/system/mmx/pixel_mix_s16.s
deleted file mode 100644
index 9bf41eb..0000000
--- a/system/mmx/pixel_mix_s16.s
+++ /dev/null
@@ -1,68 +0,0 @@
-#    Pure Data Packet mmx routine.
-#    Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
-# 
-#    This program is free software; you can redistribute it and/or modify
-#    it under the terms of the GNU General Public License as published by
-#    the Free Software Foundation; either version 2 of the License, or
-#    (at your option) any later version.
-# 
-#    This program is distributed in the hope that it will be useful,
-#    but WITHOUT ANY WARRANTY; without even the implied warranty of
-#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-#    GNU General Public License for more details.
-# 
-#    You should have received a copy of the GNU General Public License
-#    along with this program; if not, write to the Free Software
-#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-.globl pixel_mix_s16
-.type  pixel_mix_s16,@function
-
-# mmx s16 pixel mix: left <- saturate(2*(left*gain_left >> 16) + 2*(right*gain_right >> 16)), written to the left array
-# void pixel_mix_s16(int *left, int *right, int nb_4pixel_vectors, 
-#	short int gain_left[4], short int gain_right[4])
-
-pixel_mix_s16:
-	pushl %ebp
-	movl %esp, %ebp
-	push %esi
-	push %edi
-
-	movl 20(%ebp), %edi	# int16[4] array of gains
-	movq (%edi), %mm6	# get left gain array
-
-	movl 24(%ebp), %edi	# int16[4] array of gains
-	movq (%edi), %mm7	# get right gain array
-	
-	movl 8(%ebp),  %edi	# left array
-	movl 12(%ebp), %esi	# right array
-	movl 16(%ebp), %ecx	# number of 4 pixel vectors
-
-	
-	.align 16
-	.loop_mix:	
-
-#	prefetch 128(%esi)	
-	movq (%esi), %mm1	# load right 4 pixels from memory
-	pmulhw %mm7, %mm1	# apply right gain
-	movq (%edi), %mm0	# load 4 left pixels from memory
-	pmulhw %mm6, %mm0	# apply left gain
-#	pslaw $1, %mm1		# shift left ((s).15 x (s).15 -> (s0).14))
-#	pslaw $1, %mm0
-	paddsw %mm0, %mm0	# no arithmetic shift left, so use a saturating add instead
-	paddsw %mm1, %mm1
-	paddsw %mm1, %mm0	# mix
-	movq %mm0, (%edi)
-	addl $8, %esi
-	addl $8, %edi
-	decl %ecx
-	jnz .loop_mix		# loop
-
-	emms
-
-	
-	pop %edi
-	pop %esi
-	leave
-	ret
-	
diff --git a/system/mmx/pixel_mul_s16.s b/system/mmx/pixel_mul_s16.s
deleted file mode 100644
index 240a024..0000000
--- a/system/mmx/pixel_mul_s16.s
+++ /dev/null
@@ -1,56 +0,0 @@
-#    Pure Data Packet mmx routine.
-#    Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
-# 
-#    This program is free software; you can redistribute it and/or modify
-#    it under the terms of the GNU General Public License as published by
-#    the Free Software Foundation; either version 2 of the License, or
-#    (at your option) any later version.
-# 
-#    This program is distributed in the hope that it will be useful,
-#    but WITHOUT ANY WARRANTY; without even the implied warranty of
-#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-#    GNU General Public License for more details.
-# 
-#    You should have received a copy of the GNU General Public License
-#    along with this program; if not, write to the Free Software
-#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-.globl pixel_mul_s16
-.type  pixel_mul_s16,@function
-
-# simple multiply: left <- (left * right >> 16) << 1, written to the left array
-# void pixel_mul_s16(int *left, int *right, int nb_4pixel_vectors)
-
-pixel_mul_s16:
-	pushl %ebp
-	movl %esp, %ebp
-	push %esi
-	push %edi
-
-	movl 8(%ebp),  %edi	# left array
-	movl 12(%ebp), %esi	# right array
-	movl 16(%ebp), %ecx	# number of 4 pixel vectors
-
-	
-	.align 16
-	.loop_mix:	
-
-#	prefetch 128(%esi)	
-	movq (%esi), %mm1	# load right 4 pixels from memory
-	movq (%edi), %mm0	# load 4 left pixels from memory
-	pmulhw %mm1, %mm0	# mul
-	psllw $1, %mm0		# fixed point shift correction (plain shift, no saturation)
-	movq %mm0, (%edi)
-	addl $8, %esi
-	addl $8, %edi
-	decl %ecx
-	jnz .loop_mix		# loop
-
-	emms
-
-	
-	pop %edi
-	pop %esi
-	leave
-	ret
-	
diff --git a/system/mmx/pixel_pack_s16u8.s b/system/mmx/pixel_pack_s16u8.s
deleted file mode 100644
index 57df702..0000000
--- a/system/mmx/pixel_pack_s16u8.s
+++ /dev/null
@@ -1,126 +0,0 @@
-#    Pure Data Packet mmx routine.
-#    Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
-# 
-#    This program is free software; you can redistribute it and/or modify
-#    it under the terms of the GNU General Public License as published by
-#    the Free Software Foundation; either version 2 of the License, or
-#    (at your option) any later version.
-# 
-#    This program is distributed in the hope that it will be useful,
-#    but WITHOUT ANY WARRANTY; without even the implied warranty of
-#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-#    GNU General Public License for more details.
-# 
-#    You should have received a copy of the GNU General Public License
-#    along with this program; if not, write to the Free Software
-#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-.globl pixel_pack_s16u8_y
-.type  pixel_pack_s16u8_y,@function
-
-# pack signed 16 bit pixels to unsigned 8 bit: each word is shifted right by 7, then saturated to 0..255
-# void pixel_pack_s16u8_y(int *input, int *output, int nb_8pixel_vectors)
-
-pixel_pack_s16u8_y:
-	pushl %ebp
-	movl %esp, %ebp
-	push %esi
-	push %edi
-
-#	movl 20(%ebp), %edi	# int16[4] array of gains
-#	movq (%edi), %mm7	# get gain array
-#	psllw $1, %mm7		# adjust for shifted sign bit
-	
-	movl 8(%ebp),  %esi	# input array
-	movl 12(%ebp), %edi	# output array
-	movl 16(%ebp), %ecx	# number of 8 pixel vectors
-
-	pxor %mm6, %mm6		# zero vector; only the commented-out code below reads it
-	
-	.align 16
-	.loop_pack_y:	
-
-#	prefetch 128(%esi)	
-	movq (%esi), %mm0	# load 4 pixels from memory
-#	pmulhw %mm7, %mm0	# apply gain
-	movq 8(%esi), %mm1	# load 4 pixels from memory
-#	pmulhw %mm7, %mm1	# apply gain
-
-#	movq %mm0, %mm2
-#	pcmpgtw %mm6, %mm2	# mm2 > 0 ?  0xffff :	0
-#	pand %mm2, %mm0 
-
-#	movq %mm1, %mm3
-#	pcmpgtw %mm6, %mm3	# mm3 > 0 ?  0xffff :	0
-#	pand %mm3, %mm1 
-
-#	psllw $1, %mm0		# shift out sign bit
-#	psllw $1, %mm1		# shift out sign bit
-
-	psraw $7, %mm0		# shift to lsb
-	psraw $7, %mm1		# shift to lsb
-	
-	packuswb %mm1, %mm0	# pack & saturate to 8bit vector
-	movq %mm0, (%edi)	# store result in memory
-
-	addl $16, %esi		# increment source pointer
-	addl $8, %edi		# increment dest pointer
-	decl %ecx
-	jnz .loop_pack_y	# loop
-
-	emms
-	
-	pop %edi
-	pop %esi
-	leave
-	ret
-	
-.globl pixel_pack_s16u8_uv
-.type  pixel_pack_s16u8_uv,@function
-
-pixel_pack_s16u8_uv:
-	pushl %ebp
-	movl %esp, %ebp
-	push %esi
-	push %edi
-
-#	movl 20(%ebp), %edi	# int16[4] array of gains
-#	movq (%edi), %mm7	# get gain array
-	movl 8(%ebp),  %esi	# input array (16 bit pixels)
-	movl 12(%ebp), %edi	# output array (8 bit pixels)
-	movl 16(%ebp), %ecx	# number of 8 pixel vectors
-
-	pcmpeqw %mm6, %mm6
-	psllw $15, %mm6
-	movq %mm6, %mm5
-	psrlw $8, %mm5
-	por %mm5, %mm6		# mm6 <- 8 times 0x80
-	
-	.align 16
-	.loop_pack_uv:	
-
-#	prefetch 128(%esi)	
-	movq (%esi), %mm0	# load 4 pixels from memory
-#	pmulhw %mm7, %mm0	# apply gain
-	movq 8(%esi), %mm1	# load 4 pixels from memory
-#	pmulhw %mm7, %mm1	# apply gain
-
-	psraw $8, %mm0		# move the high byte down to the low byte, keeping the sign
-	psraw $8, %mm1
-	
-	packsswb %mm1, %mm0	# pack & saturate to 8bit vector
-	pxor %mm6, %mm0		# flip sign bits
-	movq %mm0, (%edi)	# store result in memory
-
-	addl $16, %esi		# increment source pointer
-	addl $8, %edi		# increment dest pointer
-	decl %ecx
-	jnz .loop_pack_uv	# loop
-
-	emms
-	
-	pop %edi
-	pop %esi
-	leave
-	ret
-	
diff --git a/system/mmx/pixel_rand_s16.s b/system/mmx/pixel_rand_s16.s
deleted file mode 100644
index 649400b..0000000
--- a/system/mmx/pixel_rand_s16.s
+++ /dev/null
@@ -1,76 +0,0 @@
-#    Pure Data Packet mmx routine.
-#    Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
-# 
-#    This program is free software; you can redistribute it and/or modify
-#    it under the terms of the GNU General Public License as published by
-#    the Free Software Foundation; either version 2 of the License, or
-#    (at your option) any later version.
-# 
-#    This program is distributed in the hope that it will be useful,
-#    but WITHOUT ANY WARRANTY; without even the implied warranty of
-#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-#    GNU General Public License for more details.
-# 
-#    You should have received a copy of the GNU General Public License
-#    along with this program; if not, write to the Free Software
-#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-.globl pixel_rand_s16
-.type  pixel_rand_s16,@function
-
-# fill dst with pseudo random 16 bit words (4 independent shift registers with xor feedback)
-# void pixel_rand_s16(int *dst, nb_4pixel_vectors, short int random_seed[4])
-
-pixel_rand_s16:
-	pushl %ebp
-	movl %esp, %ebp
-	push %esi
-	push %edi
-
-	movl 16(%ebp), %esi	# int16[4] array of random seeds
-	movl 8(%ebp),  %edi	# dst array
-	movl 12(%ebp), %ecx	# nb of 4 word vectors to write
-
-	movq (%esi), %mm6	# mm6 <- current state of the 4 generators
-
-
-	pcmpeqw %mm3, %mm3
-	psrlw $15, %mm3		# get bit mask 4 times 0x0001
-	
-	.align 16
-	.loop_rand:	
-
-#	prefetch 128(%esi)	
-
-
-	movq %mm6, %mm4		# get random vector
-	psrlw $15, %mm4		# bit 15 -> bit 0
-	movq %mm6, %mm5
-	psrlw $14, %mm5		# bit 14 -> bit 0
-	pxor %mm5, %mm4
-	movq %mm6, %mm5
-	psrlw $12, %mm5		# bit 12 -> bit 0
-	pxor %mm5, %mm4
-	movq %mm6, %mm5
-	psrlw $3, %mm5		# bit 3 -> bit 0
-	pxor %mm5, %mm4		# bit 0 of mm4 <- b15 ^ b14 ^ b12 ^ b3
-
-	psllw $1, %mm6		# shift left original random vector
-	pand %mm3, %mm4		# isolate new bit
-	por %mm4, %mm6		# combine into new random vector
-
-	movq %mm6, (%edi)	# write the 4 new words to dst
-	addl $8, %edi
-	decl %ecx		# count is tested after the decrement (0 does not stop at once)
-	jnz .loop_rand	# loop
-
-
-	movq %mm6, (%esi)	# store random seeds
-
-	emms
-	
-	pop %edi
-	pop %esi
-	leave
-	ret
-	
diff --git a/system/mmx/pixel_randmix_s16.s b/system/mmx/pixel_randmix_s16.s
deleted file mode 100644
index 44e1702..0000000
--- a/system/mmx/pixel_randmix_s16.s
+++ /dev/null
@@ -1,91 +0,0 @@
-#    Pure Data Packet mmx routine.
-#    Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
-# 
-#    This program is free software; you can redistribute it and/or modify
-#    it under the terms of the GNU General Public License as published by
-#    the Free Software Foundation; either version 2 of the License, or
-#    (at your option) any later version.
-# 
-#    This program is distributed in the hope that it will be useful,
-#    but WITHOUT ANY WARRANTY; without even the implied warranty of
-#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-#    GNU General Public License for more details.
-# 
-#    You should have received a copy of the GNU General Public License
-#    along with this program; if not, write to the Free Software
-#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-.globl pixel_randmix_s16
-.type  pixel_randmix_s16,@function
-
-# per 16 bit word: keep left where random > threshold (signed compare), else take right; result is written to left
-# void pixel_randmix_s16(int *left, int *right, int nb_4pixel_vectors, short int random_seed[4], short int threshold[4])
-
-pixel_randmix_s16:
-	pushl %ebp
-	movl %esp, %ebp
-	push %esi
-	push %edi
-
-	movl 20(%ebp), %edi	# int16[4] array of random seeds
-	movq (%edi), %mm6	# mm6 <- current state of the 4 generators
-
-	movl 24(%ebp), %edi	# int16[4] array of thresholds
-	movq (%edi), %mm7	# mm7 <- thresholds (constant during the loop)
-	
-	movl 8(%ebp),  %edi	# left array
-	movl 12(%ebp), %esi	# right array
-	movl 16(%ebp), %ecx	# nb of 4 word vectors to process
-
-	pcmpeqw %mm3, %mm3
-	psrlw $15, %mm3		# get bit mask 4 times 0x0001
-	
-	.align 16
-	.loop_randmix:	
-
-#	prefetch 128(%esi)	
-	movq (%esi), %mm1	# load right 4 pixels from memory
-	movq (%edi), %mm0	# load 4 left pixels from memory
-
-	movq %mm6, %mm2		# get random vector
-	pcmpgtw %mm7, %mm2	# compare random vector with threshold
-	movq %mm2, %mm5		# mm2 == mm5 == select mask (0xffff where random > threshold)
-	
-	pand %mm0, %mm2		# get left array's components
-	pandn %mm1, %mm5	# get right array's components
-	por %mm2, %mm5
-	
-	movq %mm5, (%edi)	# store pixels
-
-	movq %mm6, %mm4		# get random vector
-	psrlw $15, %mm4		# bit 15 -> bit 0
-	movq %mm6, %mm5
-	psrlw $14, %mm5		# bit 14 -> bit 0
-	pxor %mm5, %mm4
-	movq %mm6, %mm5
-	psrlw $12, %mm5		# bit 12 -> bit 0
-	pxor %mm5, %mm4
-	movq %mm6, %mm5
-	psrlw $3, %mm5		# bit 3 -> bit 0
-	pxor %mm5, %mm4		# bit 0 of mm4 <- b15 ^ b14 ^ b12 ^ b3
-
-	psllw $1, %mm6		# shift left original random vector
-	pand %mm3, %mm4		# isolate new bit
-	por %mm4, %mm6		# combine into new random vector
-	
-	addl $8, %esi
-	addl $8, %edi
-	decl %ecx		# count is tested after the decrement (0 does not stop at once)
-	jnz .loop_randmix	# loop
-
-
-	movl 20(%ebp), %edi	# int16[4] array of random seeds
-	movq %mm6, (%edi)	# store random seeds
-
-	emms
-	
-	pop %edi
-	pop %esi
-	leave
-	ret
-	
diff --git a/system/mmx/pixel_resample_s16.s b/system/mmx/pixel_resample_s16.s
deleted file mode 100644
index 3959f9c..0000000
--- a/system/mmx/pixel_resample_s16.s
+++ /dev/null
@@ -1,314 +0,0 @@
-	
-
-#interpolation data:
-#* 4 vectors: neighbourhood for samples (TL, TR, BL, BR)
-#* 2 vectors: fractional part (unsigned)
-#* 2 vectors: addresses of pixel blocks
-
-#coord conversion data:
-#1 vector: 32bit splatted address	
-#1 vector: 16bit splatted w-1
-#1 vector: 16bit splatted h-1
-#1 vector: 16bit splatted w (reuse w-1 with add?)
-#1 dword:  32 bit line offset
-
-#coord generation data:	several vectors for parameter update stuff..
-
-#coordinate systems: 16 bit virtual coordinates (signed, center relative)
-#* 2 vectors: virtual coordinates
-#(possibly an intermediate step + conversion to 16 bit virtual)
-
-
-#step 1:	generate virtual coords
-
-		
-#step 2:	virtual coords -> block addresses + fractional addresses
-#* mulhigh: real coords (x,y) (center relative)
-#* add center -> unsigned (top left relative)
-#* mullow: fractional part (x_frac, y_frac)
-#* mulhigh, mullow, pack 32bit: y_offset
-#* pack 32bit: x_offset
-#* add, shift, add start address: real addresses
-	
-
-#step3:		data fetch using generated addresses: 
-#		this step would be much simpler in 4x16bit rgba. life's a bitch..
-
-#step4:		bilinear interpolation
-
-#step5:		store
-
-
-
-		# this can be simplified by doing 32 bit unaligned moves
-		# and vector unpacking on the data
-
-	
-
-		# cooked image data structure (byte offsets, used relative to %esi)
-		# pixel environment temp storage: 4 vectors (TL, TR, BL, BR) of 4 words each
-		TL1 = 0x00
-		TL2 = 0x02
-		TL3 = 0x04
-		TL4 = 0x06
-		TR1 = 0x08
-		TR2 = 0x0A
-		TR3 = 0x0C
-		TR4 = 0x0E
-		BL1 = 0x10
-		BL2 = 0x12
-		BL3 = 0x14
-		BL4 = 0x16
-		BR1 = 0x18
-		BR2 = 0x1A
-		BR3 = 0x1C
-		BR4 = 0x1E
-		# addresses of pixel blocks (4 dwords, one per interpolated pixel)
-		ADDRESS1  = 0x20
-		ADDRESS2  = 0x24
-		ADDRESS3  = 0x28
-		ADDRESS4  = 0x2C
-
-		# second env + address buffer (testing:	 not used)
-		SECONDBUFFER = 0x30
-	
-		# 32bit splatted bitmap address
-		V2PLANEADDRESS = 0x60
-		# 16bit splatted image constants (one 8 byte vector each)
-		V4TWOWIDTHM1 = 0x68
-		V4TWOHEIGHTM1 = 0x70
-		V4LINEOFFSET = 0x78
-		# data struct size
-		RESAMPLEDATASIZE = 0x80
-	
-	
-
-		# interpolation routine
-		# input:	%mm0, %mm1 4 x 16bit unsigned top left relative virtual x and y coordinates
-		#		%esi: temp & algo data structure
-		# output:	%mm0: 4 interpolated pixels. overwrites %ax, %bx, %ecx, %edx, %edi, %mm1-%mm7
-getpixelsbilin:	psrlw $1, %mm0			# convert to range 0->0x7fff [0,0.5[
-		psrlw $1, %mm1
-		movq %mm0, %mm2
-		movq %mm1, %mm3
-		movq V4TWOWIDTHM1(%esi), %mm4	# 2 * (width - 1)
-		movq V4TWOHEIGHTM1(%esi), %mm5	# 2 * (height - 1)
-		pmulhw %mm5, %mm3		# mm3 == y coord (topleft relative)
-		pmulhw %mm4, %mm2		# mm2 == x coord (topleft relative)
-		pmullw %mm5, %mm1		# mm1 == y frac (unsigned)
-		pmullw %mm4, %mm0		# mm0 == x frac (unsigned)
-
-		movq %mm3, %mm5			# copy y coord 
-		pmullw V4LINEOFFSET(%esi), %mm3	# low part of line offset
-		pmulhw V4LINEOFFSET(%esi), %mm5	# high part of line offset
-
-		movq %mm2, %mm7			# copy x coord vector
-		pxor %mm4, %mm4
-		punpcklwd %mm4, %mm2		# low part in %mm2
-		punpckhwd %mm4, %mm7		# high part in %mm7
-	
-		movq %mm3, %mm6			# copy
-		punpcklwd %mm5, %mm3		# unpack low part in %mm3
-		punpckhwd %mm5, %mm6		# high part in %mm6
-
-		paddd %mm2, %mm3		# 32 bit pixel index = y * line offset + x
-		paddd %mm7, %mm6
-		pslld $1, %mm3			# convert to word addresses
-		pslld $1, %mm6
-
-		paddd V2PLANEADDRESS(%esi), %mm3	# add pixel plane address
-		paddd V2PLANEADDRESS(%esi), %mm6
-
-		movq %mm3, ADDRESS1(%esi)	# store addresses 1 and 2
-		movq %mm6, ADDRESS3(%esi)	# store addresses 3 and 4
-
-		pcmpeqw %mm2, %mm2		# all ones
-		movq %mm0, %mm4			# copy x frac
-		movq %mm1, %mm5			# copy y frac
-		pxor %mm2, %mm4			# compute complement (approx negative)
-		pxor %mm2, %mm5
-
-		psrlw $1, %mm0			# shift right (0.5 * frac x)
-		psrlw $1, %mm1			# shift right (0.5 * frac y)
-		psrlw $1, %mm4			# shift right (0.5 * (1 - frac x))
-		psrlw $1, %mm5			# shift right (0.5 * (1 - frac y))
-
-		movq %mm0, %mm2			# copy of frac x
-		movq %mm4, %mm3			# copy of (1-frac x)
-						# fetch data
-
-		#jmp skipfetch			# seems the fetch is the real killer. try to optimize this
-						# using 32 bit accesses & shifts
-
-						# the src image data struct is padded to the cooked data struct
-		movl RESAMPLEDATASIZE(%esi), %edi	# first dword after the cooked struct (source line offset)
-		shll $1, %edi			# times 2 (2 bytes per pixel)
-
-		movl ADDRESS1(%esi), %ecx 
-		movl ADDRESS2(%esi), %edx
-	
-		movw (%ecx), %ax		# top left neighbours of pixels 1 and 2
-		movw (%edx), %bx
-		movw %ax, TL1(%esi)
-		movw %bx, TL2(%esi)
-		movw 2(%ecx), %ax		# top right neighbours
-		movw 2(%edx), %bx
-		movw %ax, TR1(%esi)
-		movw %bx, TR2(%esi)
-
-		addl %edi, %ecx			# one line down
-		addl %edi, %edx
-
-		movw (%ecx), %ax		# bottom left neighbours
-		movw (%edx), %bx
-		movw %ax, BL1(%esi)
-		movw %bx, BL2(%esi)
-		movw 2(%ecx), %ax		# bottom right neighbours
-		movw 2(%edx), %bx
-		movw %ax, BR1(%esi)
-		movw %bx, BR2(%esi)
-
-		
-		movl ADDRESS3(%esi), %ecx 
-		movl ADDRESS4(%esi), %edx
-
-
-		movw (%ecx), %ax		# same fetch for pixels 3 and 4
-		movw (%edx), %bx
-		movw %ax, TL3(%esi)
-		movw %bx, TL4(%esi)
-		movw 2(%ecx), %ax
-		movw 2(%edx), %bx
-		movw %ax, TR3(%esi)
-		movw %bx, TR4(%esi)
-	
-		addl %edi, %ecx			# one line down
-		addl %edi, %edx
-
-		movw (%ecx), %ax
-		movw (%edx), %bx
-		movw %ax, BL3(%esi)
-		movw %bx, BL4(%esi)
-		movw 2(%ecx), %ax
-		movw 2(%edx), %bx
-		movw %ax, BR3(%esi)
-		movw %bx, BR4(%esi)
-
-	
-skipfetch:	
-		pmulhw TL1(%esi), %mm4		# bilin interpolation
-		pmulhw TR1(%esi), %mm0
-		pmulhw BL1(%esi), %mm3
-		pmulhw BR1(%esi), %mm2
-
-
-		paddw %mm4, %mm0		# mm0 == top row mixed along x
-		paddw %mm3, %mm2		# mm2 == bottom row mixed along x
-
-		pmulhw %mm5, %mm0		# top * 0.5 * (1 - frac y)
-		pmulhw %mm1, %mm2		# bottom * 0.5 * frac y
-
-		paddw %mm2, %mm0
-		psllw $2, %mm0			# compensate for gain reduction
-
-		ret
-
-
-		// linear mapping data struct (byte offsets, one 8 byte vector per field)
-		ROWSTATEX = 0x0
-		ROWSTATEY = 0x8
-		COLSTATEX = 0x10
-		COLSTATEY = 0x18
-		ROWINCX = 0x20		
-		ROWINCY = 0x28
-		COLINCX = 0x30		
-		COLINCY = 0x38
-
-		// image data struct (byte offsets, one dword per field)
-		LINEOFFSET = 0x0
-		IMAGEADDRESS = 0x4
-		WIDTH = 0x8
-		HEIGHT = 0xC
-		IMAGEDATASIZE = 0x10
-		
-
-
-# pixel_resample_linmap_s16(void *x): x = cooked data, then source image, dest image and linmap data structs
-.globl pixel_resample_linmap_s16
-.type  pixel_resample_linmap_s16,@function
-
-		SOURCEIMAGE = RESAMPLEDATASIZE
-		DESTIMAGE = SOURCEIMAGE + IMAGEDATASIZE
-		LINMAPDATA = DESTIMAGE + IMAGEDATASIZE
-	
-pixel_resample_linmap_s16:	
-		pushl %ebp
-		movl %esp, %ebp
-		pushl %esi
-		pushl %edi
-		pushl %ebx				# %bx is overwritten by getpixelsbilin
-
-
-		movl 8(%ebp),  %esi			# get data struct
-		movl DESTIMAGE+HEIGHT(%esi), %edx	# image height
-		movl DESTIMAGE+IMAGEADDRESS(%esi), %edi # dest image address
-		movl DESTIMAGE+WIDTH(%esi), %ecx	# image width
-		shrl $2, %ecx				# vector count
-		.align 16
-	
-linmap_looprow:
-		movq LINMAPDATA+ROWSTATEX(%esi), %mm0	# get current coordinates
-		movq LINMAPDATA+ROWSTATEY(%esi), %mm1
-
-linmap_loopcol:		
-		movq %mm0, %mm4				# copy
-		movq %mm1, %mm5
-		paddd LINMAPDATA+ROWINCX(%esi), %mm4	# increment
-		paddd LINMAPDATA+ROWINCY(%esi), %mm5
-		movq %mm4, %mm6				# copy
-		movq %mm5, %mm7	
-		paddd LINMAPDATA+ROWINCX(%esi), %mm6	# increment
-		paddd LINMAPDATA+ROWINCY(%esi), %mm7
-		movq %mm6, LINMAPDATA+ROWSTATEX(%esi)	# store next state
-		movq %mm7, LINMAPDATA+ROWSTATEY(%esi) 
-
-		psrad $16, %mm0				# keep the upper 16 bits of each 32 bit coordinate
-		psrad $16, %mm1
-		psrad $16, %mm4
-		psrad $16, %mm5
-		packssdw %mm4, %mm0			# pack new coordinates
-		packssdw %mm5, %mm1
-	
-		push %ecx				# getpixelsbilin overwrites these three
-		push %edx
-		push %edi
-	
-		call getpixelsbilin			# do interpolation
-
-		pop %edi
-		pop %edx
-		pop %ecx
-		movq %mm0, (%edi)			# store 4 pixels
-		addl $0x8, %edi				# point to next 4 pixels
-		decl %ecx				# one vector of the current row done
-		jnz linmap_looprow
-
-		movq LINMAPDATA+COLSTATEX(%esi), %mm0	# get column state vector
-		movq LINMAPDATA+COLSTATEY(%esi), %mm1
-		movl DESTIMAGE+WIDTH(%esi), %ecx	# image width
-		shrl $2, %ecx				# vector count
-		paddd LINMAPDATA+COLINCX(%esi), %mm0	# increment
-		paddd LINMAPDATA+COLINCY(%esi), %mm1
-		movq %mm0, LINMAPDATA+COLSTATEX(%esi)	# store
-		movq %mm1, LINMAPDATA+COLSTATEY(%esi)
-		decl %edx				# one row done (%edx was loaded with the height)
-		jnz linmap_loopcol			# next row starts from the column state in %mm0, %mm1
-		
-		emms
-		popl %ebx
-		popl %edi
-		popl %esi
-		leave
-		ret
-
-
diff --git a/system/mmx/pixel_s1.s b/system/mmx/pixel_s1.s
deleted file mode 100644
index d6bc5ca..0000000
--- a/system/mmx/pixel_s1.s
+++ /dev/null
@@ -1,201 +0,0 @@
-#    Pure Data Packet mmx routine.
-#    Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
-# 
-#    This program is free software; you can redistribute it and/or modify
-#    it under the terms of the GNU General Public License as published by
-#    the Free Software Foundation; either version 2 of the License, or
-#    (at your option) any later version.
-# 
-#    This program is distributed in the hope that it will be useful,
-#    but WITHOUT ANY WARRANTY; without even the implied warranty of
-#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-#    GNU General Public License for more details.
-# 
-#    You should have received a copy of the GNU General Public License
-#    along with this program; if not, write to the Free Software
-#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-
-	# this file contains ops for binary image processing
-	# 8x8 bit tile encoded
-	# low byte = bottom row
-	# low bit = right column
-	# %mm7 = scratch reg for all macros
-
-
-	# ************ load mask *******************
-	# compute bit masks for rows and columns
-	# %mm7:	 scratch reg (overwritten by ldmr, ldml and ldmlr)
-
-	# load mask top: sets the count high bytes of reg, clears the rest
-	.macro ldmt count reg
-	pcmpeqb \reg, \reg
-	psllq $(64-(\count<<3)), \reg
-	.endm
-
-	# load mask bottom: sets the count low bytes of reg, clears the rest
-	.macro ldmb count reg
-	pcmpeqb \reg, \reg
-	psrlq $(64-(\count<<3)), \reg
-	.endm
-
-	# load mask top and bottom: ldmb into regb, then ldmt into regt
-	.macro ldmtb count regt regb
-	ldmb \count, \regb
-	ldmt \count, \regt
-	.endm
-
-	# load mask right: sets the count low bits of every byte of reg
-	.macro ldmr count reg
-	pcmpeqb %mm7, %mm7
-	psrlw $(16-\count), %mm7
-	movq %mm7, \reg
-	psllq $8, %mm7
-	por %mm7, \reg
-	.endm
-
-	# load mask left: sets the count high bits of every byte of reg
-	.macro ldml count reg
-	pcmpeqb %mm7, %mm7
-	psllw $(16-\count), %mm7
-	movq %mm7, \reg
-	psrlq $8, %mm7
-	por %mm7, \reg
-	.endm
-
-	# load mask left and right: left mask in regl, the same mask shifted right by (8 - count) in regr
-	.macro ldmlr count regl regr
-	pcmpeqb %mm7, %mm7
-	psllw $(16-\count), %mm7
-	movq %mm7, \regl
-	psrlq $8, %mm7
-	por %mm7, \regl
-	movq \regl, \regr
-	psrlq $(8-\count), \regr
-	.endm
-
-	# ************* shift square **********
-	# shifts a square in reg, fills with zeros
-
-	# shift square top: moves reg count bytes towards the high end
-	.macro sst count reg
-	psllq $(\count<<3), \reg
-	.endm
-
-	# shift square bottom: moves reg count bytes towards the low end
-	.macro ssb count reg
-	psrlq $(\count<<3), \reg
-	.endm
-
-	# not tested
-	# shift square left: shifts every byte of reg left by count bits (overwrites %mm7)
-	.macro ssl count reg
-	movq \reg, %mm7
-	pcmpeqb \reg, \reg
-	psllw $(16-\count), \reg
-	psrlw $8, \reg
-	pandn %mm7, \reg
-	psllw $(\count), \reg
-	.endm
-
-	# shift square right: shifts every byte of reg right by count bits (overwrites %mm7)
-	.macro ssr count reg
-	movq \reg, %mm7
-	pcmpeqb \reg, \reg
-	psrlw $(16-\count), \reg
-	psllw $8, \reg
-	pandn %mm7, \reg
-	psrlw $(\count), \reg
-	.endm
-
-
-	# ********** combine square *************
-	# combines 2 squares: result in reg, the other operand is shifted in place, %mm7 is overwritten by ssl/ssr
-
-	# combine right: reg shifted left by count, or-ed with regr shifted right by (8 - count)
-	.macro csr count regr reg
-	ssl \count, \reg
-	ssr (8-\count), \regr
-	por \regr, \reg
-	.endm
-
-	# combine left: reg shifted right by count, or-ed with regl shifted left by (8 - count)
-	.macro csl count regl reg
-	ssr \count, \reg
-	ssl (8-\count), \regl
-	por \regl, \reg
-	.endm
-
-	# combine top: reg shifted down by count rows, or-ed with regt shifted up by (8 - count) rows
-	.macro cst count regt reg
-	ssb \count, \reg
-	sst (8-\count), \regt
-	por \regt, \reg
-	.endm
-
-	
-	# combine bottom: reg shifted up by count rows, or-ed with regb shifted down by (8 - count) rows
-	.macro csb count regb reg
-	sst \count, \reg
-	ssb (8-\count), \regb
-	por \regb, \reg
-	.endm
-
-
-	# ********** load combine square *************
-	# loads combined square using mask
-
-	# load combined square left: dstreg = ((source & ~mask) >> count) | ((sourcel & mask) << (8 - count)); overwrites %mm7
-	# mask should be count bits set right (i.e. 0x01)
-	.macro lcsml count mask source sourcel dstreg
-	movq \mask, \dstreg
-	movq \mask, %mm7
-	pandn \source, \dstreg
-	pand \sourcel, %mm7
-	psrlq $(\count), \dstreg
-	psllq $(8-\count), %mm7
-	por %mm7, \dstreg
-	.endm
-	
-	
-			
-.globl pixel_test_s1
-.type  pixel_test_s1,@function
-
-# test routine for the macros above: writes one combined 8 byte tile to dest
-# void pixel_test_s1(void *dest, void *source, int nb_squares, int spacing)
-
-
-
-	#
-	
-
-pixel_test_s1:
-	pushl %ebp
-	movl %esp, %ebp
-	push %esi
-	push %edi
-
-	movl 8(%ebp),  %edi	# dest
-	movl 12(%ebp), %esi	# source
-	movl 16(%ebp), %ecx	# count (loaded but not used below)
-	movl 20(%ebp), %edx	# row distance (loaded but not used below)
-
-	ldmr 1, %mm6
-	lcsml 1, %mm6, (%esi), 8(%esi), %mm0
-	movq %mm0, (%edi)
-
-
-#	movq (%esi), %mm0
-#	movq 8(%esi), %mm1
-#	csl 4, %mm1, %mm0
-#	movq %mm0, (%edi)
-
-	emms
-
-	
-	pop %edi
-	pop %esi
-	leave
-	ret
-	
diff --git a/system/mmx/pixel_unpack_u8s16.s b/system/mmx/pixel_unpack_u8s16.s
deleted file mode 100644
index 0fc14c2..0000000
--- a/system/mmx/pixel_unpack_u8s16.s
+++ /dev/null
@@ -1,113 +0,0 @@
-#    Pure Data Packet mmx routine.
-#    Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
-# 
-#    This program is free software; you can redistribute it and/or modify
-#    it under the terms of the GNU General Public License as published by
-#    the Free Software Foundation; either version 2 of the License, or
-#    (at your option) any later version.
-# 
-#    This program is distributed in the hope that it will be useful,
-#    but WITHOUT ANY WARRANTY; without even the implied warranty of
-#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-#    GNU General Public License for more details.
-# 
-#    You should have received a copy of the GNU General Public License
-#    along with this program; if not, write to the Free Software
-#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-.globl pixel_unpack_u8s16_y
-.type  pixel_unpack_u8s16_y,@function
-
-# unpack unsigned bytes to 16 bit words: word = byte << 7 (8 bytes in, 16 bytes out per pass)
-# void pixel_unpack_u8s16_y(char *input, char *output, int32 nb_pixels_div8)
-
-pixel_unpack_u8s16_y:
-	pushl %ebp
-	movl %esp, %ebp
-	push %esi
-	push %edi
-
-#	movl 20(%ebp), %edi	# int16[4] array of gains
-#	movq (%edi), %mm7	# get gain array
-	
-	movl 8(%ebp),  %esi	# input uint8 pixel array
-	movl 12(%ebp), %edi	# output sint16 pixel array
-	movl 16(%ebp), %ecx	# nb of elements div 8
-
-
-	.align 16
-	.loop_unpack_y:	
-
-	movq (%esi), %mm5	# load 8 pixels from memory
-	pxor %mm0, %mm0		# zero mm0 and mm1
-	pxor %mm1, %mm1
-	punpcklbw %mm5, %mm0	# unpack 1st 4 pixels (pixel lands in the high byte)
-	punpckhbw %mm5, %mm1	# unpack 2nd 4 pixels
-	psrlw $0x1, %mm0	# shift right to clear sign bit 9.7
-	psrlw $0x1, %mm1
-#	pmulhw %mm7, %mm0	# apply gain
-#	pmulhw %mm7, %mm1
-#	paddsw %mm0, %mm0	# correct factor 2
-#	paddsw %mm1, %mm1
-	movq %mm0, (%edi)	# store
-	movq %mm1, 8(%edi)
-	
-	addl $8, %esi		# increment source pointer
-	addl $16, %edi		# increment dest pointer
-	decl %ecx		# count is tested after the decrement (0 does not stop at once)
-	jnz .loop_unpack_y	# loop
-
-	emms
-	
-	pop %edi
-	pop %esi
-	leave
-	ret
-	
-.globl pixel_unpack_u8s16_uv
-.type  pixel_unpack_u8s16_uv,@function
-pixel_unpack_u8s16_uv:		# unpack bytes to words: word = (byte << 8) ^ 0x8000; same stack args as the _y variant
-	pushl %ebp
-	movl %esp, %ebp
-	push %esi
-	push %edi
-
-#	movl 20(%ebp), %edi	# int16[4] array of gains
-#	movq (%edi), %mm7	# get gain array
-
-	movl 8(%ebp),  %esi	# input uint8 pixel array
-	movl 12(%ebp), %edi	# output sint16 pixel array
-	movl 16(%ebp), %ecx	# nb of elements div 8
-
-	pcmpeqw %mm6, %mm6
-	psllw $15, %mm6		# mm6 <- 4 times 0x8000
-	
-	.align 16
-	.loop_unpack_uv:	
-
-	movq (%esi), %mm5	# load 8 pixels from memory
-	pxor %mm0, %mm0		# zero mm0 and mm1
-	pxor %mm1, %mm1
-	punpcklbw %mm5, %mm0	# unpack 1st 4 pixels (pixel lands in the high byte)
-	punpckhbw %mm5, %mm1	# unpack 2nd 4 pixels
-	pxor %mm6, %mm0		# flip sign bit (Cr and Cb are offset by 128)
-	pxor %mm6, %mm1
-#	pmulhw %mm7, %mm0	# apply gain
-#	pmulhw %mm7, %mm1
-#	paddsw %mm0, %mm0	# correct factor 2
-#	paddsw %mm1, %mm1
-	movq %mm0, (%edi)	# store
-	movq %mm1, 8(%edi)
-	
-	addl $8, %esi		# increment source pointer
-	addl $16, %edi		# increment dest pointer
-	decl %ecx		# count is tested after the decrement (0 does not stop at once)
-	jnz .loop_unpack_uv	# loop
-
-	emms
-	
-	pop %edi
-	pop %esi
-	leave
-	ret
-
author	Hans-Christoph Steiner <eighthave@users.sourceforge.net>	2005-12-15 07:26:47 +0000
committer	Hans-Christoph Steiner <eighthave@users.sourceforge.net>	2005-12-15 07:26:47 +0000
commit	37b6643df2df7d784a31ca73f7bb90dc109c2401 (patch)
tree	a8664e5adcfcb60cae136063d627549ecb76619b /system/mmx
parent	c50ce0e0217ea07e2d450add2ab29cecea66fa96 (diff)