36 files changed, 5547 insertions, 0 deletions
diff --git a/system/Makefile b/system/Makefile
new file mode 100644
index 0000000..acdb944
--- /dev/null
+++ b/system/Makefile
@@ -0,0 +1,20 @@
+target:	all_objects
+.PHONY: target all_objects pdp_main_clean platform_targets clean
+include ../Makefile.config
+include Makefile.$(PDP_TARGET)
+
+# Objects built for every platform. The included Makefile.$(PDP_TARGET) is
+# expected to define platform_targets (PDP_TARGET presumably comes from ../Makefile.config).
+OBJECTS = pdp.o pdp_ut.o pdp_packet.o pdp_type.o pdp_queue.o pdp_comm.o \
+	pdp_control.o pdp_llconv.o pdp_resample.o
+# Delete pdp.o so the $(OBJECTS) prerequisite of all_objects recompiles it on every run.
+pdp_main_clean:
+	rm -f pdp.o
+# NOTE(review): nothing orders pdp_main_clean before pdp.o when running with -j.
+all_objects: pdp_main_clean $(OBJECTS) platform_targets
+
+clean:
+	rm -f *~
+	rm -f *.o
+	$(MAKE) -C mmx clean
+
diff --git a/system/Makefile.linux b/system/Makefile.linux
new file mode 100644
index 0000000..96660f7
--- /dev/null
+++ b/system/Makefile.linux
@@ -0,0 +1,2 @@
+platform_targets: pdp_imageproc_portable.o pdp_llconv_portable.o
+.PHONY: platform_targets	# grouping target for the portable C objects; not a file
diff --git a/system/Makefile.linux_mmx b/system/Makefile.linux_mmx
new file mode 100644
index 0000000..a646e5e
--- /dev/null
+++ b/system/Makefile.linux_mmx
@@ -0,0 +1,4 @@
+platform_subtree:
+	$(MAKE) -C mmx
+.PHONY: platform_subtree platform_targets	# neither names a file; $(MAKE) passes -j/-n to the mmx sub-make
+platform_targets: pdp_imageproc_mmx.o pdp_llconv_mmx.o platform_subtree
diff --git a/system/mmx/Makefile b/system/mmx/Makefile
new file mode 100644
index 0000000..0f8f836
--- /dev/null
+++ b/system/mmx/Makefile
@@ -0,0 +1,29 @@
+include ../../Makefile.config
+
+OBJ = \
+pixel_pack_s16u8.o \
+pixel_unpack_u8s16.o \
+pixel_add_s16.o \
+pixel_mul_s16.o \
+pixel_mix_s16.o \
+pixel_randmix_s16.o \
+pixel_conv_hor_s16.o \
+pixel_conv_ver_s16.o \
+pixel_affine_s16.o \
+pixel_biquad_s16.o \
+pixel_ca_s1.o \
+pixel_rand_s16.o \
+pixel_crot_s16.o \
+pixel_gain_s16.o
+.PHONY: all test clean
+all:	$(OBJ)
+# Links the stand-alone test driver; the file produced is pdp_mmx_test, not "test".
+test:	pdp_mmx_test.o $(OBJ)
+	$(CC) -o pdp_mmx_test pdp_mmx_test.o $(OBJ) -g
+
+clean:
+	rm -f *.o
+	rm -f *~
+	rm -f pdp_mmx.a
+	rm -f pdp_mmx_test
+
diff --git a/system/mmx/pdp_mmx_test.c b/system/mmx/pdp_mmx_test.c
new file mode 100644
index 0000000..e93539f
--- /dev/null
+++ b/system/mmx/pdp_mmx_test.c
@@ -0,0 +1,62 @@
+#include "pdp_mmx.h"
+
+#define FP(x) ((short int)(((float)(x) * 2 * 256.0f)))	/* x * 512 converted to short; not used in this file */
+
+#define nbp 256	/* not used in this file */
+/* NOTE(review): the names suggest biquad coefficients; nothing in this file reads these arrays */
+    short int a1[4] = {0x0100,0x0100,0x0100,0x0100};
+    short int a2[4] = {0x0100,0x0100,0x0100,0x0100};
+    short int b0[4] = {0x0100,0x0100,0x0100,0x0100};
+    short int b1[4] = {0x0100,0x0100,0x0100,0x0100};
+    short int b2[4] = {0x0100,0x0100,0x0100,0x0100};
+/* presumably filter state vectors (unused here) */
+    short int u1[4] = {0x0100,0x0100,0x0100,0x0100};
+    short int u2[4] = {0x0100,0x0100,0x0100,0x0100};
+/* presumably input vectors (unused here) */
+    short int x0[4] = {0x0100,0x0100,0x0100,0x0100};
+    short int x1[4] = {0x0100,0x0100,0x0100,0x0100};
+    short int x2[4] = {0x0100,0x0100,0x0100,0x0100};
+    short int x3[4] = {0x0100,0x0100,0x0100,0x0100};
+
+void print_pixel(unsigned int i)
+{
+    /* one cell: "x " when any bit of i is set, ". " otherwise */
+    printf("%s", i ? "x " : ". ");
+}
+
+void print_line(void)
+{
+    printf("%c", '\n');	/* a bare newline, used as a separator between squares */
+}
+
+void print_square(unsigned char *c)
+{
+    int row = 8;			/* rows are printed from c[7] down to c[0] */
+
+    while (row-- > 0) {
+	unsigned int mask;		/* walks from bit 7 (printed first) to bit 0 */
+	for (mask = 0x80; mask != 0; mask >>= 1) print_pixel(c[row] & mask);
+	printf("\n");
+    }
+}
+
+int main(void)
+{
+    /* two 8-byte squares, one byte per row; -1..-8 convert to 0xff..0xf8 */
+    unsigned char src[16]={1,2,3,4,5,6,7,8,-1,-2,-3,-4,-5,-6,-7,-8};
+    unsigned char dst[8];
+
+    /* show both input squares */
+    print_square(src);
+    print_line();
+    print_square(src+8);
+    print_line();
+
+    pixel_test_s1(dst,src,1,1);	/* routine under test; not defined in this file */
+
+    print_square(dst);
+    print_line();
+
+    return 0;	/* explicit status; the old implicit-int main() fell off the end */
+
+}
diff --git a/system/mmx/pixel_add_s16.s b/system/mmx/pixel_add_s16.s
new file mode 100644
index 0000000..8d4c7df
--- /dev/null
+++ b/system/mmx/pixel_add_s16.s
@@ -0,0 +1,55 @@
+#    Pure Data Packet mmx routine.
+#    Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+# 
+#    This program is free software; you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation; either version 2 of the License, or
+#    (at your option) any later version.
+# 
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+# 
+#    You should have received a copy of the GNU General Public License
+#    along with this program; if not, write to the Free Software
+#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_add_s16
+.type  pixel_add_s16,@function
+
+# in-place signed saturating add of 16 bit words: left[i] += right[i]
+# void pixel_add_s16(int *left, int *right, int nb_4pixel_vectors)
+# one vector is 8 bytes (4 words); a count <= 0 leaves both arrays untouched
+pixel_add_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %esi
+	push %edi
+
+	movl 8(%ebp),  %edi	# left array (read and written)
+	movl 12(%ebp), %esi	# right array (read only)
+	movl 16(%ebp), %ecx	# number of 4 pixel vectors
+	testl %ecx, %ecx	# the loop decrements before it tests, so a
+	jle .add_done		# count <= 0 must never enter it
+	.align 16
+	.loop_mix:	
+
+#	prefetch 128(%esi)	
+	movq (%esi), %mm1	# load right 4 pixels from memory
+	movq (%edi), %mm0	# load 4 left pixels from memory
+	paddsw %mm1, %mm0	# mix (signed saturating add of 4 words)
+	movq %mm0, (%edi)	# store the sum back into the left array
+	addl $8, %esi
+	addl $8, %edi
+	decl %ecx
+	jnz .loop_mix		# loop
+	.add_done:
+	emms
+
+	
+	pop %edi
+	pop %esi
+	leave
+	ret
+	
diff --git a/system/mmx/pixel_affine_s16.s b/system/mmx/pixel_affine_s16.s
new file mode 100644
index 0000000..b357de3
--- /dev/null
+++ b/system/mmx/pixel_affine_s16.s
@@ -0,0 +1,59 @@
+#    Pure Data Packet mmx routine.
+#    Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+# 
+#    This program is free software; you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation; either version 2 of the License, or
+#    (at your option) any later version.
+# 
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+# 
+#    You should have received a copy of the GNU General Public License
+#    along with this program; if not, write to the Free Software
+#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_affine_s16
+.type  pixel_affine_s16,@function
+# in-place gain and offset on 16 bit words: buf[i] = sat(((buf[i] * gain) >> 16) * 2 + offset)
+# void pixel_affine_s16(int *buf, int nb_8pixel_vectors, short int gain[4], short int offset[4])
+# each iteration handles 8 bytes = 4 words (despite the "8pixel" name); a count <= 0 is a no-op
+pixel_affine_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %esi
+	push %edi
+
+	movl 20(%ebp), %edi
+	movq (%edi), %mm6	# get offset vector
+
+	movl 16(%ebp), %edi
+	movq (%edi), %mm7	# get gain vector
+	
+	movl 8(%ebp),  %esi	# input array
+	movl 12(%ebp), %ecx	# vector count
+	testl %ecx, %ecx	# the loop decrements before it tests, so a
+	jle .affine_done	# count <= 0 must never enter it
+	.align 16
+	.loop_affine:	
+
+#	prefetch 128(%esi)	
+	movq (%esi), %mm0	# load 4 pixels from memory
+	pmulhw %mm7, %mm0	# apply gain (s).15 fixed point
+	psllw $1, %mm0		# apply correction shift
+	paddsw %mm6, %mm0	# add offset
+	movq %mm0, (%esi)	# store result in memory
+
+	addl $8, %esi		# increment source pointer
+	decl %ecx
+	jnz .loop_affine	# loop
+	.affine_done:		# emms still runs: %mm6/%mm7 were loaded above
+	emms
+	
+	pop %edi
+	pop %esi
+	leave
+	ret
+	
diff --git a/system/mmx/pixel_biquad_dirI_s16.s b/system/mmx/pixel_biquad_dirI_s16.s
new file mode 100644
index 0000000..1729502
--- /dev/null
+++ b/system/mmx/pixel_biquad_dirI_s16.s
@@ -0,0 +1,361 @@
+#    Pure Data Packet mmx routine.
+#    Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+# 
+#    This program is free software; you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation; either version 2 of the License, or
+#    (at your option) any later version.
+# 
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+# 
+#    You should have received a copy of the GNU General Public License
+#    along with this program; if not, write to the Free Software
+#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+
+
+	# TODO MOVE TO DIRECT FORM II
+	# y[k]  = b0 * x[k] + u1[k-1]
+	# u1[k] = b1 * x[k] + u2[k-1] - a1 * y[k]
+	# u2[k] = b2 * x[k]           - a2 * y[k]
+	# NOTE(review): the code below computes u[0] from x, u[-1], u[-2] and the result from b0,b1,b2 times u[0], u[-1], u[-2]; the equations above look like the TODO target - confirm
+	# input in register:	
+	# %mm0-mm3:	input 4x4 pixels {x0 x1 x2 x3}
+	# %esi:		coef memory  (a1, a2, b0, b1, b2)
+	# %edi:		state memory (u1, u2)
+
+	
+	# return in register:	 
+	# %mm0-mm3:	4x4 pixels result
+	# overwrites %mm4-%mm7 and stores the new state at 0(%edi) and 8(%edi)
+	
+	.biquad_4x4_pixels:	
+	.align 16			# the label precedes this directive, so the entry point itself is not aligned
+	# prescale
+	movq -8(%esi), %mm4		# scale vector read from 8 bytes before the coef block
+	pmulhw %mm4, %mm0
+	pmulhw %mm4, %mm1
+	pmulhw %mm4, %mm2
+	pmulhw %mm4, %mm3
+	psllw $1, %mm0
+	psllw $1, %mm1
+	psllw $1, %mm2
+	psllw $1, %mm3
+
+	
+	# first vector
+	movq 0(%edi), %mm4		# mm4 <- u[-1]
+	movq 8(%edi), %mm5		# mm5 <- u[-2]
+	movq %mm4, %mm6
+	movq %mm5, %mm7
+
+	pmulhw 0(%esi), %mm6		# multiply by a1
+	pmulhw 8(%esi), %mm7		# multiply by a2
+
+	paddsw %mm6, %mm0		# accumulate
+	paddsw %mm7, %mm0		# accumulate
+	paddsw %mm0, %mm0		# scale by 2 (since all fixed point muls are x*y/2)
+
+	movq %mm0, %mm6			# mm6 <- u[0]
+	movq %mm4, %mm7			# mm7 <- u[-1]
+	pmulhw 16(%esi), %mm0		# multiply by b0
+	pmulhw 24(%esi), %mm4		# multiply by b1
+	pmulhw 32(%esi), %mm5		# multiply by b2
+
+	paddsw %mm4, %mm0		# accumulate
+	paddsw %mm5, %mm0		# accumulate
+
+					# mm0 is result 0
+
+	# second vector
+	movq %mm6, %mm4			# mm4 <- u[0]
+	movq %mm7, %mm5			# mm5 <- u[-1]
+
+	pmulhw 0(%esi), %mm6		# multiply by a1
+	pmulhw 8(%esi), %mm7		# multiply by a2
+
+	paddsw %mm6, %mm1		# accumulate
+	paddsw %mm7, %mm1		# accumulate
+	paddsw %mm1, %mm1		# scale by 2
+
+	
+	movq %mm1, %mm6			# mm6 <- u[1]
+	movq %mm4, %mm7			# mm7 <- u[0]
+	pmulhw 16(%esi), %mm1		# multiply by b0
+	pmulhw 24(%esi), %mm4		# multiply by b1
+	pmulhw 32(%esi), %mm5		# multiply by b2
+
+	paddsw %mm4, %mm1		# accumulate
+	paddsw %mm5, %mm1		# accumulate
+
+					# mm1 is result 1
+
+	# third vector
+	movq %mm6, %mm4			# mm4 <- u[1]
+	movq %mm7, %mm5			# mm5 <- u[0]
+
+	pmulhw 0(%esi), %mm6		# multiply by a1
+	pmulhw 8(%esi), %mm7		# multiply by a2
+
+	paddsw %mm6, %mm2		# accumulate
+	paddsw %mm7, %mm2		# accumulate
+	paddsw %mm2, %mm2		# scale by 2
+
+	
+	movq %mm2, %mm6			# mm6 <- u[2]
+	movq %mm4, %mm7			# mm7 <- u[1]
+	pmulhw 16(%esi), %mm2		# multiply by b0
+	pmulhw 24(%esi), %mm4		# multiply by b1
+	pmulhw 32(%esi), %mm5		# multiply by b2
+
+	paddsw %mm4, %mm2		# accumulate
+	paddsw %mm5, %mm2		# accumulate
+
+					# mm2 is result 2
+
+	# fourth vector
+	movq %mm6, %mm4			# mm4 <- u[2]
+	movq %mm7, %mm5			# mm5 <- u[1]
+
+	pmulhw 0(%esi), %mm6		# multiply by a1
+	pmulhw 8(%esi), %mm7		# multiply by a2
+
+	paddsw %mm6, %mm3		# accumulate
+	paddsw %mm7, %mm3		# accumulate
+	paddsw %mm3, %mm3		# scale by 2
+
+	
+	movq %mm3, 0(%edi)		# store  u[3]
+	movq %mm4, 8(%edi)		# store  u[2]
+	pmulhw 16(%esi), %mm3		# multiply by b0
+	pmulhw 24(%esi), %mm4		# multiply by b1
+	pmulhw 32(%esi), %mm5		# multiply by b2
+
+	paddsw %mm4, %mm3		# accumulate
+	paddsw %mm5, %mm3		# accumulate
+
+					# mm3 is result 3
+
+	ret
+	
+
+	# in order to use the 4 line parallel biquad routine on horizontal
+	# lines, we need to reorder (rotate or transpose) the matrix, since
+	# images are scanline encoded, and we want to work in parallel
+	# on 4 lines.
+	#
+	# since the 4 lines are independent, it doesn't matter in which order
+	# the vector elements are present.
+	#
+	# this allows us to use the same routine for left->right and right->left
+	# processing.
+	#	
+	# some comments on the non-abelian group of square isometries consisting of
+	# (I) identity
+	# (H) horizontal axis mirror 
+	# (V) vertical axis mirror
+	# (T) transpose (diagonal axis mirror)
+	# (A) antitranspose (antidiagonal axis mirror)
+	# (R1) 90deg anticlockwise rotation
+	# (R2) 180deg rotation
+	# (R3) 90deg clockwise rotation
+	#
+	#	
+	# we basically have two options: (R1,R3) or (T,A)
+	# we opt for T and A because they are self-inverting, which improves locality
+	#
+	# use antitranspose for right to left and transpose
+	# for left to right (little endian)
+
+
+	# antitranspose 4x4
+	# subroutine (ends in ret); rewrites %mm0-%mm3 in place and overwrites %mm4-%mm7
+	# input
+	# %mm3 == {d0 d1 d2 d3}
+	# %mm2 == {c0 c1 c2 c3}	
+	# %mm1 == {b0 b1 b2 b3}	
+	# %mm0 == {a0 a1 a2 a3}
+
+	# output
+	# %mm3 == {a3 b3 c3 d3}
+	# %mm2 == {a2 b2 c2 d2}
+	# %mm1 == {a1 b1 c1 d1}
+	# %mm0 == {a0 b0 c0 d0}
+	# NOTE: no call to this label appears in this file
+	
+	.antitranspose_4x4:	
+	.align 16		# the label precedes this directive, so the entry point itself is not aligned
+	movq %mm3, %mm4
+	punpcklwd %mm1, %mm4	# mm4 <- {b2 d2 b3 d3}
+	movq %mm3, %mm5	
+	punpckhwd %mm1, %mm5	# mm5 <- {b0 d0 b1 d1}
+			
+	movq %mm2, %mm6
+	punpcklwd %mm0, %mm6	# mm6 <- {a2 c2 a3 c3}
+	movq %mm2, %mm7	
+	punpckhwd %mm0, %mm7	# mm7 <- {a0 c0 a1 c1}
+
+	movq %mm4, %mm3
+	punpcklwd %mm6, %mm3	# mm3 <- {a3 b3 c3 d3}
+	movq %mm4, %mm2
+	punpckhwd %mm6, %mm2	# mm2 <- {a2 b2 c2 d2}
+		
+	movq %mm5, %mm1
+	punpcklwd %mm7, %mm1	# mm1 <- {a1 b1 c1 d1}
+	movq %mm5, %mm0
+	punpckhwd %mm7, %mm0	# mm0 <- {a0 b0 c0 d0}
+
+	ret
+
+	
+
+	# transpose 4x4
+	# subroutine (ends in ret); rewrites %mm0-%mm3 in place and overwrites %mm4-%mm7
+	# input
+	# %mm3 == {d3 d2 d1 d0}
+	# %mm2 == {c3 c2 c1 c0}	
+	# %mm1 == {b3 b2 b1 b0}	
+	# %mm0 == {a3 a2 a1 a0}
+
+	# output
+	# %mm3 == {d3 c3 b3 a3}
+	# %mm2 == {d2 c2 b2 a2}
+	# %mm1 == {d1 c1 b1 a1}
+	# %mm0 == {d0 c0 b0 a0}
+
+	
+	.transpose_4x4:	
+	.align 16		# the label precedes this directive, so the entry point itself is not aligned
+	movq %mm0, %mm4
+	punpcklwd %mm2, %mm4	# mm4 <- {c1 a1 c0 a0}
+	movq %mm0, %mm5	
+	punpckhwd %mm2, %mm5	# mm5 <- {c3 a3 c2 a2}
+		
+	movq %mm1, %mm6
+	punpcklwd %mm3, %mm6	# mm6 <- {d1 b1 d0 b0}
+	movq %mm1, %mm7	
+	punpckhwd %mm3, %mm7	# mm7 <- {d3 b3 d2 b2}
+
+	movq %mm4, %mm0
+	punpcklwd %mm6, %mm0	# mm0 <- {d0 c0 b0 a0}
+	movq %mm4, %mm1
+	punpckhwd %mm6, %mm1	# mm1 <- {d1 c1 b1 a1}
+		
+	movq %mm5, %mm2
+	punpcklwd %mm7, %mm2	# mm2 <- {d2 c2 b2 a2}
+	movq %mm5, %mm3
+	punpckhwd %mm7, %mm3	# mm3 <- {d3 c3 b3 a3}
+
+	ret
+
+	
+.globl pixel_biquad_vertb_s16
+.type  pixel_biquad_vertb_s16,@function
+# filters a strip 4 pixels wide, 4 lines per iteration, moving to higher addresses
+
+# pixel_biquad_vertb_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
+# nb_rows is used as the number of 4 line blocks; it must be >= 1 (the loop tests after decrementing)
+	
+pixel_biquad_vertb_s16: 
+
+		
+	pushl %ebp
+	movl %esp, %ebp
+	push %ebx
+	push %esi
+	push %edi
+
+	movl 8(%ebp),  %ebx	# pixel array offset
+	movl 12(%ebp), %ecx	# nb of 4x4 pixblocks
+	movl 16(%ebp), %edx	# line width
+
+	movl 20(%ebp), %esi	# coefs
+	movl 24(%ebp), %edi	# state
+
+	shll $1, %edx		# short int addressing	
+	movl %edx, %eax
+	shll $1, %eax
+	addl %edx, %eax		# eax = 3 * edx
+	
+	.align 16
+	.biquad_vertb_line_loop:
+	movq (%ebx), %mm0		# line 0 of the block
+	movq (%ebx,%edx,1), %mm1	# line 1
+	movq (%ebx,%edx,2), %mm2	# line 2
+	movq (%ebx,%eax,1), %mm3	# line 3
+	call .biquad_4x4_pixels
+	movq %mm0, (%ebx)		# write the 4 filtered lines back in place
+	movq %mm1, (%ebx,%edx,1)	
+	movq %mm2, (%ebx,%edx,2)	
+	movq %mm3, (%ebx,%eax,1)
+	addl %edx, %ebx			# ebx += 1 line + 3 lines,
+	addl %eax, %ebx			# i.e. advance to the next 4 line block
+	decl %ecx
+	jnz .biquad_vertb_line_loop
+		
+	emms
+	
+	pop %edi
+	pop %esi
+	pop %ebx
+	leave
+	ret
+
+.globl pixel_biquad_horlr_s16
+.type  pixel_biquad_horlr_s16,@function
+# filters 4 lines at once, one 4x4 block per iteration, moving to higher addresses
+
+# pixel_biquad_horlr_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
+# nb_rows is used as the number of 4x4 blocks; it must be >= 1 (the loop tests after decrementing)
+	
+pixel_biquad_horlr_s16: 
+
+		
+	pushl %ebp
+	movl %esp, %ebp
+	push %ebx
+	push %esi
+	push %edi
+
+	movl 8(%ebp),  %ebx	# pixel array offset
+	movl 12(%ebp), %ecx	# nb of 4x4 pixblocks
+	movl 16(%ebp), %edx	# line width
+
+	movl 20(%ebp), %esi	# coefs
+	movl 24(%ebp), %edi	# state
+
+	shll $1, %edx		# short int addressing
+	movl %edx, %eax
+	shll $1, %eax
+	addl %edx, %eax		# eax = 3 * edx
+	
+	.align 16
+	.biquad_horlr_line_loop:
+	movq (%ebx), %mm0		# line 0 of the block
+	movq (%ebx,%edx,1), %mm1	# line 1
+	movq (%ebx,%edx,2), %mm2	# line 2
+	movq (%ebx,%eax,1), %mm3	# line 3
+	call .transpose_4x4		# reorder the block before filtering
+	call .biquad_4x4_pixels
+	call .transpose_4x4		# undo the reordering
+	movq %mm0, (%ebx)		# write the block back in place
+	movq %mm1, (%ebx,%edx,1)	
+	movq %mm2, (%ebx,%edx,2)	
+	movq %mm3, (%ebx,%eax,1)
+	addl $8, %ebx			# next block: 8 bytes further along the lines
+	decl %ecx
+	jnz .biquad_horlr_line_loop
+		
+	emms
+	
+	pop %edi
+	pop %esi
+	pop %ebx
+	leave
+	ret
+
+
+
diff --git a/system/mmx/pixel_biquad_s16.s b/system/mmx/pixel_biquad_s16.s
new file mode 100644
index 0000000..844b041
--- /dev/null
+++ b/system/mmx/pixel_biquad_s16.s
@@ -0,0 +1,451 @@
+#    Pure Data Packet mmx routine.
+#    Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+# 
+#    This program is free software; you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation; either version 2 of the License, or
+#    (at your option) any later version.
+# 
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+# 
+#    You should have received a copy of the GNU General Public License
+#    along with this program; if not, write to the Free Software
+#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+
+	
+	# DIRECT FORM II BIQUAD
+	#
+	# y[k]  = b0 * x[k] + u1[k-1]
+	# u1[k] = b1 * x[k] + u2[k-1] - a1 * y[k]
+	# u2[k] = b2 * x[k]           - a2 * y[k]
+	# MACRO:	df2 <reg>
+	# the coefs are read as five 8 byte vectors at 0, 8, 16, 24 and 32(%esi)
+	# computes a direct form 2 biquad
+	# does not use {mm0-mm3}\<inreg>
+	# the output is saturated (paddsw); the state updates use wrapping adds (paddw)
+	# input:	<reg>   == input
+	#		%mm4    == state 1
+	#		%mm5    == state 2
+	#		(%esi)  == biquad coefs (-a1 -a2 b0 b1 b2) in s1.14
+	# output:	<reg>   == output
+	#		%mm4    == state 1
+	#		%mm5    == state 2
+	# scratch:	%mm6, %mm7 (overwritten)
+	.macro df2 reg 
+	movq \reg, %mm6			# mm6 == x[k]
+	movq \reg, %mm7			# mm7 == x[k]
+	pmulhw 16(%esi), %mm6		# mm6 == x[k] * b0
+	pmulhw 24(%esi), %mm7		# mm7 == x[k] * b1
+	paddw %mm4, %mm6		# mm6 == x[k] * b0 + u1[k-1] == y[k]
+	paddw %mm5, %mm7		# mm7 == x[k] * b1 + u2[k-1]
+	paddsw %mm6, %mm6		# compensate for mul = x*y/4 (coefs are s1.14 fixed point)
+	paddsw %mm6, %mm6		# paddsw ensures saturation
+	movq \reg, %mm5			# mm5 == x[k]
+	movq %mm6, %mm4			# mm4 == y[k]
+	movq %mm6, \reg			# reg == y[k]	--------------------
+	pmulhw 0(%esi), %mm4		# mm4 == y[k] * (-a1)
+	pmulhw 8(%esi), %mm6		# mm6 == y[k] * (-a2)
+	pmulhw 32(%esi), %mm5		# mm5 == x[k] * b2
+	paddw %mm7, %mm4		# mm4 == u1[k]	--------------------
+	paddw %mm6, %mm5		# mm5 == u2[k]	--------------------
+	.endm
+
+		
+	# input in register:	
+	# %mm0-mm3:	input 4x4 pixels {x0 x1 x2 x3}
+	# %esi:		coef memory  (-a1, -a2, b0, b1, b2) in s1.14
+	# %edi:		state memory (u1, u2)
+	
+	# return in register:	 
+	# %mm0-mm3:	4x4 pixels result
+	# %mm4-%mm7 are overwritten; the state at 0(%edi) and 8(%edi) is
+	# loaded once, run through the four df2 steps, and stored back
+
+	
+	.macro biquad_4x4_pixels	
+	.align 16			# expanded (with its padding) at every place the macro is used
+	movq 0(%edi), %mm4		# get state
+	movq 8(%edi), %mm5
+	df2 %mm0			# compute 4 biquads
+	df2 %mm1
+	df2 %mm2
+	df2 %mm3
+	movq %mm4, 0(%edi)		# store state
+	movq %mm5, 8(%edi)
+	.endm
+
+	
+
+	# in order to use the 4 line parallel biquad routine on horizontal
+	# lines, we need to reorder (rotate or transpose) the matrix, since
+	# images are scanline encoded, and we want to work in parallel
+	# on 4 lines.
+	#
+	# since the 4 lines are independent, it doesn't matter in which order
+	# the vector elements are present.
+	#
+	# this allows us to use the same routine for left->right and right->left
+	# processing.
+	#	
+	# some comments on the non-abelian group of square isometries consisting of
+	# (I) identity
+	# (H) horizontal axis mirror 
+	# (V) vertical axis mirror
+	# (T) transpose (diagonal axis mirror)
+	# (A) antitranspose (antidiagonal axis mirror)
+	# (R1) 90deg anticlockwise rotation
+	# (R2) 180deg rotation
+	# (R3) 90deg clockwise rotation
+	#
+	#	
+	# we basically have two options: (R1,R3) or (T,A)
+	# we opt for T and A because they are self-inverting, which improves locality
+	#
+	# use antitranspose for right to left and transpose
+	# for left to right (little endian)
+
+
+	# antitranspose 4x4
+	# macro, expanded inline; rewrites %mm0-%mm3 in place and overwrites %mm4-%mm7
+	# input
+	# %mm3 == {d0 d1 d2 d3}
+	# %mm2 == {c0 c1 c2 c3}	
+	# %mm1 == {b0 b1 b2 b3}	
+	# %mm0 == {a0 a1 a2 a3}
+
+	# output
+	# %mm3 == {a3 b3 c3 d3}
+	# %mm2 == {a2 b2 c2 d2}
+	# %mm1 == {a1 b1 c1 d1}
+	# %mm0 == {a0 b0 c0 d0}
+	# no ':' after the macro name: GNU as parses whatever follows the name
+	# as the parameter list, and the macro is invoked without a colon
+	.macro antitranspose_4x4
+	movq %mm3, %mm4
+	punpcklwd %mm1, %mm4	# mm4 <- {b2 d2 b3 d3}
+	movq %mm3, %mm5	
+	punpckhwd %mm1, %mm5	# mm5 <- {b0 d0 b1 d1}
+			
+	movq %mm2, %mm6
+	punpcklwd %mm0, %mm6	# mm6 <- {a2 c2 a3 c3}
+	movq %mm2, %mm7	
+	punpckhwd %mm0, %mm7	# mm7 <- {a0 c0 a1 c1}
+
+	movq %mm4, %mm3
+	punpcklwd %mm6, %mm3	# mm3 <- {a3 b3 c3 d3}
+	movq %mm4, %mm2
+	punpckhwd %mm6, %mm2	# mm2 <- {a2 b2 c2 d2}
+		
+	movq %mm5, %mm1
+	punpcklwd %mm7, %mm1	# mm1 <- {a1 b1 c1 d1}
+	movq %mm5, %mm0
+	punpckhwd %mm7, %mm0	# mm0 <- {a0 b0 c0 d0}
+	
+	.endm
+	
+
+	# transpose 4x4
+	# macro, expanded inline; rewrites %mm0-%mm3 in place and overwrites %mm4-%mm7
+	# input
+	# %mm3 == {d3 d2 d1 d0}
+	# %mm2 == {c3 c2 c1 c0}	
+	# %mm1 == {b3 b2 b1 b0}	
+	# %mm0 == {a3 a2 a1 a0}
+
+	# output
+	# %mm3 == {d3 c3 b3 a3}
+	# %mm2 == {d2 c2 b2 a2}
+	# %mm1 == {d1 c1 b1 a1}
+	# %mm0 == {d0 c0 b0 a0}
+	# no ':' after the macro name: GNU as parses whatever follows the name
+	# as the parameter list, and the macro is invoked without a colon
+	.macro transpose_4x4
+	movq %mm0, %mm4
+	punpcklwd %mm2, %mm4	# mm4 <- {c1 a1 c0 a0}
+	movq %mm0, %mm5	
+	punpckhwd %mm2, %mm5	# mm5 <- {c3 a3 c2 a2}
+		
+	movq %mm1, %mm6
+	punpcklwd %mm3, %mm6	# mm6 <- {d1 b1 d0 b0}
+	movq %mm1, %mm7	
+	punpckhwd %mm3, %mm7	# mm7 <- {d3 b3 d2 b2}
+
+	movq %mm4, %mm0
+	punpcklwd %mm6, %mm0	# mm0 <- {d0 c0 b0 a0}
+	movq %mm4, %mm1
+	punpckhwd %mm6, %mm1	# mm1 <- {d1 c1 b1 a1}
+		
+	movq %mm5, %mm2
+	punpcklwd %mm7, %mm2	# mm2 <- {d2 c2 b2 a2}
+	movq %mm5, %mm3
+	punpckhwd %mm7, %mm3	# mm3 <- {d3 c3 b3 a3}
+
+	.endm
+	
+.globl pixel_biquad_vertb_s16
+.type  pixel_biquad_vertb_s16,@function
+# filters a strip 4 pixels wide, 4 lines per iteration, moving to higher addresses
+
+# pixel_biquad_vertb_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
+# nb_rows is used as the number of 4 line blocks; it must be >= 1 (the loop tests after decrementing)
+	
+pixel_biquad_vertb_s16: 
+
+		
+	pushl %ebp
+	movl %esp, %ebp
+	push %ebx
+	push %esi
+	push %edi
+
+	movl 8(%ebp),  %ebx	# pixel array offset
+	movl 12(%ebp), %ecx	# nb of 4x4 pixblocks
+	movl 16(%ebp), %edx	# line width
+
+	movl 20(%ebp), %esi	# coefs
+	movl 24(%ebp), %edi	# state
+
+	shll $1, %edx		# short int addressing	
+	movl %edx, %eax
+	shll $1, %eax
+	addl %edx, %eax		# eax = 3 * edx
+	
+	.align 16
+	.biquad_vertb_line_loop:
+	movq (%ebx), %mm0		# line 0 of the block
+	movq (%ebx,%edx,1), %mm1	# line 1
+	movq (%ebx,%edx,2), %mm2	# line 2
+	movq (%ebx,%eax,1), %mm3	# line 3
+	biquad_4x4_pixels
+	movq %mm0, (%ebx)		# write the 4 filtered lines back in place
+	movq %mm1, (%ebx,%edx,1)	
+	movq %mm2, (%ebx,%edx,2)	
+	movq %mm3, (%ebx,%eax,1)
+	addl %edx, %ebx			# ebx += 1 line + 3 lines,
+	addl %eax, %ebx			# i.e. advance to the next 4 line block
+	decl %ecx
+	jnz .biquad_vertb_line_loop
+		
+	emms
+	
+	pop %edi
+	pop %esi
+	pop %ebx
+	leave
+	ret
+.globl pixel_biquad_verbt_s16
+.type  pixel_biquad_verbt_s16,@function
+# like the vertb routine, but starts at the last 4 line block and moves to lower addresses
+# NOTE: the exported symbol is spelled "verbt"
+# pixel_biquad_verbt_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
+# nb_rows is used as the number of 4 line blocks; it must be >= 1 (the loop tests after decrementing)
+	
+pixel_biquad_verbt_s16: 
+
+		
+	pushl %ebp
+	movl %esp, %ebp
+	push %ebx
+	push %esi
+	push %edi
+
+	movl 8(%ebp),  %ebx	# pixel array offset
+	movl 12(%ebp), %ecx	# nb of 4x4 pixblocks
+	movl 16(%ebp), %eax	# line width
+
+	shll $3, %eax		# 4 line byte spacing
+	decl %ecx
+	mul %ecx		# eax = (blocks - 1) * 4 line spacing; mul also overwrites %edx
+	incl %ecx
+	addl %eax, %ebx		# ebx points to last pixblock
+
+	movl 16(%ebp), %edx	# line width
+
+	movl 20(%ebp), %esi	# coefs
+	movl 24(%ebp), %edi	# state
+
+	shll $1, %edx		# short int addressing	
+	movl %edx, %eax
+	shll $1, %eax
+	addl %edx, %eax		# eax = 3 * edx
+	
+	.align 16
+	.biquad_verbt_line_loop:
+	movq (%ebx), %mm3		# registers are filled in reverse:
+	movq (%ebx,%edx,1), %mm2	# the highest line of the block goes to %mm0,
+	movq (%ebx,%edx,2), %mm1	# which the biquad macro processes first
+	movq (%ebx,%eax,1), %mm0
+	biquad_4x4_pixels
+	movq %mm3, (%ebx)		# write back with the same reversed mapping
+	movq %mm2, (%ebx,%edx,1)	
+	movq %mm1, (%ebx,%edx,2)	
+	movq %mm0, (%ebx,%eax,1)
+	subl %edx, %ebx			# ebx -= 1 line + 3 lines,
+	subl %eax, %ebx			# i.e. step back to the previous 4 line block
+	decl %ecx
+	jnz .biquad_verbt_line_loop
+		
+	emms
+	
+	pop %edi
+	pop %esi
+	pop %ebx
+	leave
+	ret
+
+.globl pixel_biquad_horlr_s16
+.type  pixel_biquad_horlr_s16,@function
+# pixel_biquad_horlr_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
+# filters 4 lines at once, one 4x4 block per iteration, moving to higher addresses
+pixel_biquad_horlr_s16: 
+# nb_rows is used as the number of 4x4 blocks; it must be >= 1 (the loop tests after decrementing)
+		
+	pushl %ebp
+	movl %esp, %ebp
+	push %ebx
+	push %esi
+	push %edi
+
+	movl 8(%ebp),  %ebx	# pixel array offset
+	movl 12(%ebp), %ecx	# nb of 4x4 pixblocks
+	movl 16(%ebp), %edx	# line width
+
+	movl 20(%ebp), %esi	# coefs
+	movl 24(%ebp), %edi	# state
+
+	shll $1, %edx		# short int addressing
+	movl %edx, %eax
+	shll $1, %eax
+	addl %edx, %eax		# eax = 3 * edx
+	
+	.align 16
+	.biquad_horlr_line_loop:
+	movq (%ebx), %mm0		# line 0 of the block
+	movq (%ebx,%edx,1), %mm1	# line 1
+	movq (%ebx,%edx,2), %mm2	# line 2
+	movq (%ebx,%eax,1), %mm3	# line 3
+	transpose_4x4			# reorder the block before filtering
+	biquad_4x4_pixels
+	transpose_4x4			# undo the reordering
+	movq %mm0, (%ebx)		# write the block back in place
+	movq %mm1, (%ebx,%edx,1)	
+	movq %mm2, (%ebx,%edx,2)	
+	movq %mm3, (%ebx,%eax,1)
+	addl $8, %ebx			# next block: 8 bytes further along the lines
+	decl %ecx
+	jnz .biquad_horlr_line_loop
+		
+	emms
+	
+	pop %edi
+	pop %esi
+	pop %ebx
+	leave
+	ret
+
+
+.globl pixel_biquad_horrl_s16
+.type  pixel_biquad_horrl_s16,@function
+# pixel_biquad_horrl_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
+# filters 4 lines at once, starting at the last 4x4 block and moving to lower addresses
+pixel_biquad_horrl_s16: 
+# nb_rows is used as the number of 4x4 blocks; it must be >= 1 (the loop tests after decrementing)
+	pushl %ebp
+	movl %esp, %ebp
+	push %ebx
+	push %esi
+	push %edi
+
+	movl 8(%ebp),  %ebx	# pixel array offset
+	movl 12(%ebp), %ecx	# nb of 4x4 pixblocks
+	movl 16(%ebp), %edx	# line width
+
+
+	movl %ecx, %eax
+	decl %eax
+	shll $3, %eax		# (blocks - 1) * 8 bytes
+	addl %eax, %ebx		# ebx points to last pixblock
+
+	
+	movl 20(%ebp), %esi	# coefs
+	movl 24(%ebp), %edi	# state
+
+	shll $1, %edx		# short int addressing
+	movl %edx, %eax
+	shll $1, %eax
+	addl %edx, %eax		# eax = 3 * edx
+	
+	.align 16
+	.biquad_horrl_line_loop:
+	movq (%ebx), %mm0		# line 0 of the block
+	movq (%ebx,%edx,1), %mm1	# line 1
+	movq (%ebx,%edx,2), %mm2	# line 2
+	movq (%ebx,%eax,1), %mm3	# line 3
+	antitranspose_4x4		# reorder the block before filtering
+	biquad_4x4_pixels
+	antitranspose_4x4		# undo the reordering
+	movq %mm0, (%ebx)		# write the block back in place
+	movq %mm1, (%ebx,%edx,1)	
+	movq %mm2, (%ebx,%edx,2)	
+	movq %mm3, (%ebx,%eax,1)
+	subl $8, %ebx			# previous block: 8 bytes back along the lines
+	decl %ecx
+	jnz .biquad_horrl_line_loop
+		
+	emms
+	
+	pop %edi
+	pop %esi
+	pop %ebx
+	leave
+	ret
+
+
+.globl pixel_biquad_time_s16
+.type  pixel_biquad_time_s16,@function
+# pixel_biquad_time_s16(short int *pixel_array, short int *s1, short int *s2, short int *coefs, int nb_4_pix_vectors)
+# one df2 step per 4 pixel vector; every vector has its own state in s1[] and s2[], all updated in place
+pixel_biquad_time_s16: 
+# nb_4_pix_vectors must be >= 1 (the loop tests after decrementing)
+	pushl %ebp
+	movl %esp, %ebp
+	push %ebx
+	push %esi
+	push %edi
+
+	movl 8(%ebp),  %ebx	# pixel array offset
+	movl 12(%ebp), %edx	# state 1 array
+	movl 16(%ebp), %edi	# state 2 array
+
+	movl 20(%ebp), %esi	# coefs
+	movl 24(%ebp), %ecx	# nb of 4 pixel vectors
+
+
+	.align 16
+	.biquad_time_loop:
+	movq (%ebx), %mm0	# get input
+	movq (%edx), %mm4	# get state 1
+	movq (%edi), %mm5	# get state 2
+	df2 %mm0		# compute direct form 2
+	movq %mm0, (%ebx)	# write output
+	movq %mm5, (%edi)	# write state 2
+	movq %mm4, (%edx)	# write state 1
+	addl $8, %ebx		# all three arrays advance by one 8 byte vector
+	addl $8, %edi
+	addl $8, %edx
+	decl %ecx
+	jnz .biquad_time_loop
+		
+	emms
+	
+	pop %edi
+	pop %esi
+	pop %ebx
+	leave
+	ret
+
+
diff --git a/system/mmx/pixel_ca_s1.s b/system/mmx/pixel_ca_s1.s
new file mode 100644
index 0000000..d9c730f
--- /dev/null
+++ b/system/mmx/pixel_ca_s1.s
@@ -0,0 +1,189 @@
+#    Pure Data Packet mmx routine.
+#    Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+# 
+#    This program is free software; you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation; either version 2 of the License, or
+#    (at your option) any later version.
+# 
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+# 
+#    You should have received a copy of the GNU General Public License
+#    along with this program; if not, write to the Free Software
+#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+
+	# this file contains assembler routines for 2D 1 bit cellular automata
+	# processing. it is organized around a feeder kernel and a
+	# stack based bit processor (virtual forth machine)
+	#
+	# the feeder kernel is responsible for loading/storing CA cells
+	# from/to memory. data in memory is organized as a scanline
+	# encoded toroidal bitplane (lsb = left). to simplify the kernel, the top
+	# left corner of the rectangular grid of pixels will shift down
+	# every processing step.
+	#
+	# the stack machine has the following architecture:
+	# CA stack:	%esi, TOS: %mm0 (32x2 pixels. lsw = top row)
+	# CA horizon:	%mm4-%mm7 (64x4 pixels. %mm4 = top row)
+	#
+	# the stack size / organization is not known to the stack machine. 
+	# it can be thought of as operating on a 3x3 cell neighbourhood.
+	# the only purpose of the forth program is to determine the CA local update rule.
+	#
+	# the machine is supposed to be very minimal. no looping control.
+	# no addressing modes. no conditional code (hey, this is an experiment!)
+	# so recursion is not allowed (no way to stop it)
+	# there are 9 words to load the cell neighbourhood on the stack.
+	# the rest is just logic and stack manips.
+
+
+	# this file contains pure asm macros. it is to be included before assembly
+	# after scaforth.pl has processed the .scaf file
+	
+
+	# *************************** CA CELL ACCESS MACROS *****************************
+	# fetchTL - fetchBR
+
+	# shift / load rectangle macros:
+	# count is an assemble-time constant; the fetch macros in this file pass -1, 0 or 1
+	# shift rectangle horizontal	
+	# result is in reg1
+	.macro shift reg1 reg2 count
+	psllq $(32+\count), \reg1	# move reg1 up so that everything above its window falls off the top
+	psrlq $(32-\count), \reg2	# move reg2 down so that everything below its window falls off the bottom
+	psrlq $32, \reg1		# reg1 window now sits in the low 32 bits
+	psllq $32, \reg2		# reg2 window now sits in the high 32 bits
+	por \reg2, \reg1		# merge; reg2 is left modified
+	.endm
+
+	.macro ldtop reg1 reg2
+	movq %mm4, \reg1	# copy horizon rows %mm4 and %mm5
+	movq %mm5, \reg2
+	.endm
+	# the three ld macros copy two adjacent horizon registers; the copies run in order, so the arguments must not alias the sources
+	.macro ldcenter reg1 reg2
+	movq %mm5, \reg1	# copy horizon rows %mm5 and %mm6
+	movq %mm6, \reg2
+	.endm
+
+	.macro ldbottom reg1 reg2
+	movq %mm6, \reg1	# copy horizon rows %mm6 and %mm7
+	movq %mm7, \reg2
+	.endm
+	
+
+	# fetch from top row
+	# each macro leaves its result in %mm0 and overwrites %mm1; nothing is pushed on the (%esi) stack
+	# fetch the top left square
+	.macro fetchTL
+	ldtop %mm0, %mm1
+	shift %mm0, %mm1, -1
+	.endm
+
+	# fetch the top mid square
+	.macro fetchTM
+	ldtop %mm0, %mm1
+	shift %mm0, %mm1, 0
+	.endm
+
+	# fetch the top right square
+	.macro fetchTR
+	ldtop %mm0, %mm1
+	shift %mm0, %mm1, 1
+	.endm
+
+
+	
+	# fetch from center row
+	# each macro leaves its result in %mm0 and overwrites %mm1; nothing is pushed on the (%esi) stack
+	# fetch the mid left square
+	.macro fetchML
+	ldcenter %mm0, %mm1
+	shift %mm0, %mm1, -1
+	.endm
+
+	# fetch the mid mid square
+	.macro fetchMM
+	ldcenter %mm0, %mm1
+	shift %mm0, %mm1, 0
+	.endm
+
+	# fetch the mid right square
+	.macro fetchMR
+	ldcenter %mm0, %mm1
+	shift %mm0, %mm1, 1
+	.endm
+
+
+	
+
+			
+	# fetch from bottom row
+	# each macro leaves its result in %mm0 and overwrites %mm1; nothing is pushed on the (%esi) stack
+	# fetch the bottom left square
+	.macro fetchBL
+	ldbottom %mm0, %mm1
+	shift %mm0, %mm1, -1
+	.endm
+
+	# fetch the bottom mid square
+	.macro fetchBM
+	ldbottom %mm0, %mm1
+	shift %mm0, %mm1, 0
+	.endm
+
+	# fetch the bottom right square
+	.macro fetchBR
+	ldbottom %mm0, %mm1
+	shift %mm0, %mm1, 1
+	.endm
+
+
+
+	# *************************** CA STACK MANIP MACROS *****************************
+	# dup drop dropdup swap nip dropover
+	# top of stack is %mm0; the rest lives in memory at (%esi), 8 bytes per cell, and grows to lower addresses
+	.macro dup
+	lea -8(%esi), %esi	# make room for one cell
+	movq %mm0, (%esi)	# copy the top of stack into it
+	.endm
+
+	.macro drop
+	movq (%esi), %mm0	# second cell becomes the top of stack
+	lea 8(%esi), %esi	# and is released from memory
+	.endm
+
+	.macro dropdup
+	movq (%esi), %mm0	# top of stack := second cell; stack pointer unchanged
+	.endm
+
+	.macro swap
+	movq (%esi), %mm1	# %mm1 is scratch
+	movq %mm0, (%esi)
+	movq %mm1, %mm0
+	.endm
+
+	.macro nip
+	lea 8(%esi), %esi	# discard the second cell, keep the top of stack
+	.endm
+
+	.macro dropover
+	movq 8(%esi), %mm0	# top of stack := third cell; stack pointer unchanged
+	.endm
+
+
+	# *************************** CA BOOLEAN LOGIC MACROS *****************************
+	# overxor 
+	# xors the second stack cell into the top of stack (%mm0); the stack pointer is unchanged
+	.macro overxor
+	pxor (%esi), %mm0
+	.endm	
+	
+	
+	
+	
+
diff --git a/system/mmx/pixel_cascade_s16.s b/system/mmx/pixel_cascade_s16.s
new file mode 100644
index 0000000..bf88d08
--- /dev/null
+++ b/system/mmx/pixel_cascade_s16.s
@@ -0,0 +1,330 @@
+#    Pure Data Packet mmx routine.
+#    Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+# 
+#    This program is free software; you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation; either version 2 of the License, or
+#    (at your option) any later version.
+# 
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+# 
+#    You should have received a copy of the GNU General Public License
+#    along with this program; if not, write to the Free Software
+#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+
+
+	# TODO:	 COUPLED CASCADE SECOND ORDER SECTION
+	#
+	# s1[k] = ar * s1[k-1] + ai * s2[k-1] + x[k]
+	# s2[k] = ar * s2[k-1] - ai * s1[k-1]
+	# y[k]  = c0 * x[k] + c1 * s1[k-1] + c2 * s2[k-1]
+
+
+	# MACRO:	coupled
+	#
+	# computes a coupled cascade
+	#
+	# input:	%mm0    == input
+	#		%mm1    == state 1
+	#		%mm2    == state 2
+	#		(%esi)  == cascade coefs (ar ai c0 c1 c2) in s0.15
+	# output:	%mm0    == output
+	#		%mm1    == state 1
+	#		%mm2    == state 2
+
+
+	# coupled: one step of the coupled-form second order section
+	#
+	#   s1[k] = ar * s1[k-1] + ai * s2[k-1] + x[k]
+	#   s2[k] = ar * s2[k-1] - ai * s1[k-1]
+	#   y[k]  = c0 * x[k] + c1 * s1[k-1] + c2 * s2[k-1]
+	#
+	# in:       %mm0 = x[k], %mm1 = s1[k-1], %mm2 = s2[k-1]
+	#           (%esi) = coefficient vectors at byte offsets
+	#                    0 (ar), 8 (ai), 16 (c0), 24 (c1), 32 (c2)
+	# out:      %mm0 = y[k], %mm1 = s1[k], %mm2 = s2[k]
+	# clobbers: %mm3, %mm4, %mm5, %mm6
+	#
+	# The register copies use movq; the previous `pmovq` mnemonic is not an
+	# MMX instruction and could not be assembled.
+	#
+	# NOTE(review): pmulhw keeps only the high 16 bits of each 32-bit
+	# product and no doubling step follows, so s0.15 coefficients are
+	# effectively applied at half scale, and the adds are wrapping
+	# (paddw/psubw), not saturating -- confirm this is intended.
+	.macro coupled
+	movq %mm1, %mm3			# mm3 == s1[k-1]
+	movq %mm1, %mm4			# mm4 == s1[k-1]
+	movq %mm2, %mm5			# mm5 == s2[k-1]
+	movq %mm2, %mm6			# mm6 == s2[k-1]
+	pmulhw (%esi), %mm1		# mm1 == s1[k-1] * ar
+	pmulhw 8(%esi), %mm3		# mm3 == s1[k-1] * ai
+	pmulhw 24(%esi), %mm4		# mm4 == s1[k-1] * c1
+	pmulhw (%esi), %mm2		# mm2 == s2[k-1] * ar
+	pmulhw 8(%esi), %mm5		# mm5 == s2[k-1] * ai
+	pmulhw 32(%esi), %mm6		# mm6 == s2[k-1] * c2
+	paddw %mm5, %mm1		# mm1 == s1[k-1] * ar + s2[k-1] * ai
+	psubw %mm3, %mm2		# mm2 == s2[k-1] * ar - s1[k-1] * ai == s2[k]
+	paddw %mm0, %mm1		# mm1 == s1[k]
+	pmulhw 16(%esi), %mm0		# mm0 == x[k] * c0
+	paddw %mm6, %mm4		# mm4 == s1[k-1] * c1 + s2[k-1] * c2
+	paddw %mm4, %mm0		# mm0 == y[k]
+	.endm
+	
+
+	
+
+	# in order to use the 4 line parallel cascade routine on horizontal
+	# lines, we need to reorder (rotate or transpose) the matrix, since
+	# images are scanline encoded, and we want to work in parallel
+	# on 4 lines.
+	#
+	# since the 4 lines are independent, it doesn't matter in which order
+	# the vector elements are present.
+	#
+	# this allows us to use the same routine for left->right and right->left
+	# processing.
+	#	
+	# some comments on the non-abelian group of square isometries consisting of
+	# (I) identity
+	# (H) horizontal axis mirror 
+	# (V) vertical axis mirror
+	# (T) transpose (diagonal axis mirror)
+	# (A) antitranspose (antidiagonal axis mirror)
+	# (R1) 90deg anticlockwise rotation
+	# (R2) 180deg rotation
+	# (R3) 90deg clockwise rotation
+	#
+	#	
+	# we basically have two options: (R1,R3) or (T,A)
+	# we opt for T and A because they are self-inverting, which improves locality
+	#
+	# use antitranspose for right to left and transpose
+	# for left to right (little endian)
+
+
+	# antitranspose 4x4
+
+	# input
+	# %mm3 == {d0 d1 d2 d3}
+	# %mm2 == {c0 c1 c2 c3}	
+	# %mm1 == {b0 b1 b2 b3}	
+	# %mm0 == {a0 a1 a2 a3}
+
+	# output
+	# %mm3 == {a3 b3 c3 d3}
+	# %mm2 == {a2 b2 c2 d2}
+	# %mm1 == {a1 b1 c1 d1}
+	# %mm0 == {a0 b0 c0 d0}
+
+	
+	# antitranspose_4x4: mirror the 4x4 word matrix held in %mm0-%mm3 about
+	# its antidiagonal, in place.  Takes no arguments.
+	# clobbers: %mm4, %mm5, %mm6, %mm7
+	#
+	# The macro name is written without a trailing ':' -- a colon after a
+	# name in a .macro line introduces a parameter qualifier, it is not
+	# label syntax, and the macro is invoked as plain `antitranspose_4x4`.
+	.macro antitranspose_4x4
+	movq %mm3, %mm4
+	punpcklwd %mm1, %mm4	# mm4 <- {b2 d2 b3 d3}
+	movq %mm3, %mm5	
+	punpckhwd %mm1, %mm5	# mm5 <- {b0 d0 b1 d1}
+			
+	movq %mm2, %mm6
+	punpcklwd %mm0, %mm6	# mm6 <- {a2 c2 a3 c3}
+	movq %mm2, %mm7	
+	punpckhwd %mm0, %mm7	# mm7 <- {a0 c0 a1 c1}
+
+	movq %mm4, %mm3
+	punpcklwd %mm6, %mm3	# mm3 <- {a3 b3 c3 d3}
+	movq %mm4, %mm2
+	punpckhwd %mm6, %mm2	# mm2 <- {a2 b2 c2 d2}
+		
+	movq %mm5, %mm1
+	punpcklwd %mm7, %mm1	# mm1 <- {a1 b1 c1 d1}
+	movq %mm5, %mm0
+	punpckhwd %mm7, %mm0	# mm0 <- {a0 b0 c0 d0}
+	
+	.endm
+	
+
+	# transpose 4x4
+
+	# input
+	# %mm3 == {d3 d2 d1 d0}
+	# %mm2 == {c3 c2 c1 c0}	
+	# %mm1 == {b3 b2 b1 b0}	
+	# %mm0 == {a3 a2 a1 a0}
+
+	# output
+	# %mm3 == {d3 c3 b3 a3}
+	# %mm2 == {d2 c2 b2 a2}
+	# %mm1 == {d1 c1 b1 a1}
+	# %mm0 == {d0 c0 b0 a0}
+
+	
+	# transpose_4x4: transpose the 4x4 word matrix held in %mm0-%mm3 (one
+	# row per register), in place.  Takes no arguments.
+	# clobbers: %mm4, %mm5, %mm6, %mm7
+	#
+	# The macro name is written without a trailing ':' -- a colon after a
+	# name in a .macro line introduces a parameter qualifier, it is not
+	# label syntax, and the macro is invoked as plain `transpose_4x4`.
+	.macro transpose_4x4
+	movq %mm0, %mm4
+	punpcklwd %mm2, %mm4	# mm4 <- {c1 a1 c0 a0}
+	movq %mm0, %mm5	
+	punpckhwd %mm2, %mm5	# mm5 <- {c3 a3 c2 a2}
+		
+	movq %mm1, %mm6
+	punpcklwd %mm3, %mm6	# mm6 <- {d1 b1 d0 b0}
+	movq %mm1, %mm7	
+	punpckhwd %mm3, %mm7	# mm7 <- {d3 b3 d2 b2}
+
+	movq %mm4, %mm0
+	punpcklwd %mm6, %mm0	# mm0 <- {d0 c0 b0 a0}
+	movq %mm4, %mm1
+	punpckhwd %mm6, %mm1	# mm1 <- {d1 c1 b1 a1}
+		
+	movq %mm5, %mm2
+	punpcklwd %mm7, %mm2	# mm2 <- {d2 c2 b2 a2}
+	movq %mm5, %mm3
+	punpckhwd %mm7, %mm3	# mm3 <- {d3 c3 b3 a3}
+
+	.endm
+	
+.globl pixel_cascade_vertb_s16
+.type  pixel_cascade_vertb_s16,@function
+
+
+# pixel_cascade_vertb_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
+#
+# Runs the `coupled` macro down a column that is 4 pixels (one quadword)
+# wide, in place: each step loads the quadword one line below the current
+# position, filters it and writes the result back to where it was read.
+# The loop body is unrolled four times, so the count argument is the
+# number of groups of 4 lines.  The two state vectors are read from
+# state[0..3] / state[4..7] on entry and written back on exit.
+# The loop is bottom-tested: a count of 0 is not handled.
+
+	
+pixel_cascade_vertb_s16: 
+
+		
+	pushl %ebp
+	movl %esp, %ebp
+	push %ebx
+	push %esi
+	push %edi
+
+	movl 8(%ebp),  %ebx	# pixel array offset
+	movl 12(%ebp), %ecx	# nb of 4x4 pixblocks
+	movl 16(%ebp), %edx	# line width (in pixels)
+
+	movl 20(%ebp), %esi	# coefs
+	movl 24(%ebp), %edi	# state
+
+	shll $1, %edx		# short int addressing
+	subl %edx, %ebx		# start one line early: each step reads (ebx+edx) and then advances ebx
+
+	movq 0(%edi), %mm1	# s1[k-1]
+	movq 8(%edi), %mm2	# s2[k-1]
+	.align 16
+	.cascade_vertb_line_loop:
+	
+	movq (%ebx,%edx,1), %mm3	# line 1 of 4: load next line
+	movq %mm3, %mm0			# input goes in mm0 (mm3 is clobbered by coupled)
+	addl %edx, %ebx			# ebx now points at the line just loaded
+	coupled
+	movq %mm0, (%ebx)		# overwrite it with the filter output
+	
+	movq (%ebx,%edx,1), %mm3	# line 2 of 4
+	movq %mm3, %mm0
+	addl %edx, %ebx
+	coupled
+	movq %mm0, (%ebx)
+	
+	movq (%ebx,%edx,1), %mm3	# line 3 of 4
+	movq %mm3, %mm0
+	addl %edx, %ebx
+	coupled
+	movq %mm0, (%ebx)
+	
+	movq (%ebx,%edx,1), %mm3	# line 4 of 4
+	movq %mm3, %mm0
+	addl %edx, %ebx
+	coupled
+	movq %mm0, (%ebx)
+	
+	decl %ecx
+	jnz .cascade_vertb_line_loop
+		
+	movq %mm1, 0(%edi)	# save s1 for the next call
+	movq %mm2, 8(%edi)	# save s2 for the next call
+
+	emms
+	
+	pop %edi
+	pop %esi
+	pop %ebx
+	leave
+	ret
+
+.globl pixel_cascade_horlr_s16
+.type  pixel_cascade_horlr_s16,@function
+
+
+# pixel_cascade_horlr_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
+#
+# Runs the `coupled` macro left to right over 4 scanlines in parallel,
+# in place.  Each loop iteration handles one 4x4 block of pixels: the
+# block is transposed so that every register holds one pixel column of the
+# 4 lines, the 4 columns are filtered in sequence, and the block is
+# transposed back.  The count argument is the number of 4x4 blocks; the
+# loop is bottom-tested, so a count of 0 is not handled.
+#
+# The two state vectors live in state[0..3] / state[4..7]; they are
+# reloaded and saved on every iteration because the transposes need all of
+# %mm0-%mm7.  %eax is used as scratch (3 * line size in bytes).
+
+	
+pixel_cascade_horlr_s16: 
+
+		
+	pushl %ebp
+	movl %esp, %ebp
+	push %ebx
+	push %esi
+	push %edi
+
+	movl 8(%ebp),  %ebx	# pixel array offset
+	movl 12(%ebp), %ecx	# nb of 4x4 pixblocks
+	movl 16(%ebp), %edx	# line width (in pixels)
+
+	movl 20(%ebp), %esi	# coefs
+	movl 24(%ebp), %edi	# state
+
+	shll $1, %edx		# short int addressing
+	movl %edx, %eax
+	shll $1, %eax
+	addl %edx, %eax		# eax = 3 * edx
+
+	
+	.align 16
+	.cascade_horlr_line_loop:
+	movq (%ebx), %mm0		# load the 4 rows of the block
+	movq (%ebx,%edx,1), %mm1	
+	movq (%ebx,%edx,2), %mm2	
+	movq (%ebx,%eax,1), %mm3
+	
+	transpose_4x4			# mm0..mm3 <- columns 0..3
+	
+	movq %mm1, (%ebx,%edx,1)	# park columns 1..3 in the block's own storage
+	movq %mm2, (%ebx,%edx,2)	
+	movq %mm3, (%ebx,%eax,1)
+
+	# load the filter state only now: mm1/mm2 were needed for the pixel
+	# rows above, so loading the state any earlier would have it
+	# overwritten before the first `coupled`
+	movq (%edi), %mm1		# s1[k-1]
+	movq 8(%edi), %mm2		# s2[k-1]
+
+	coupled				# column 0
+
+	movq %mm0, (%ebx)
+	movq (%ebx,%edx,1), %mm3
+	movq %mm3, %mm0
+
+	coupled				# column 1
+
+	movq %mm0, (%ebx, %edx,1)
+	movq (%ebx,%edx,2), %mm3
+	movq %mm3, %mm0
+
+	coupled				# column 2
+
+	movq %mm0, (%ebx, %edx,2)
+	movq (%ebx,%eax,1), %mm3
+	movq %mm3, %mm0
+
+	coupled				# column 3 (result stays in mm0)
+	
+	movq %mm1, 0(%edi)	# s1[k-1]
+	movq %mm2, 8(%edi)	# s2[k-1]
+
+	movq %mm0, %mm3			# gather filtered columns 0..3 in mm0..mm3
+	movq (%ebx), %mm0
+	movq (%ebx,%edx,1), %mm1	
+	movq (%ebx,%edx,2), %mm2	
+
+	transpose_4x4			# back to row order
+	
+	movq %mm0, (%ebx)
+	movq %mm1, (%ebx,%edx,1)
+	movq %mm2, (%ebx,%edx,2)	
+	movq %mm3, (%ebx,%eax,1)		
+
+	addl $8, %ebx			# next 4x4 block to the right
+	decl %ecx
+	jnz .cascade_horlr_line_loop
+		
+	emms
+	
+	pop %edi
+	pop %esi
+	pop %ebx
+	leave
+	ret
+
+
+
diff --git a/system/mmx/pixel_conv_hor_s16.s b/system/mmx/pixel_conv_hor_s16.s
new file mode 100644
index 0000000..e90a692
--- /dev/null
+++ b/system/mmx/pixel_conv_hor_s16.s
@@ -0,0 +1,134 @@
+#    Pure Data Packet mmx routine.
+#    Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+# 
+#    This program is free software; you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation; either version 2 of the License, or
+#    (at your option) any later version.
+# 
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+# 
+#    You should have received a copy of the GNU General Public License
+#    along with this program; if not, write to the Free Software
+#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+	# intermediate function
+	
+	# input in register:	
+	# %mm0:	left 4 pixels
+	# %mm1:	middle 4 pixels
+	# %mm2:	right 4 pixels
+	
+	# %mm5:	left 4 pixel masks
+	# %mm6:	middle 4 pixel masks
+	# %mm7:	right 4 pixel masks
+	
+	# return in register:	 
+	# %mm0:	middle 4 pixels result
+
+	
+	# .conv_hor_4_pixels: 3-tap horizontal convolution of one 4-pixel
+	# vector.  Register calling convention (see the list above): pixels in
+	# %mm0/%mm1/%mm2, per-tap weights in %mm5/%mm6/%mm7, result in %mm0.
+	# %mm1 and %mm2 are only read; %mm3 and %mm4 are used as scratch.
+	# Every product is formed with pmulhw (high 16 bits of the product)
+	# and then shifted left by one bit; the three terms are summed with
+	# signed saturation.
+	.conv_hor_4_pixels:	
+	.align 16
+	
+	# compute quadruplet
+
+	# get left pixels
+	psrlq $48, %mm0			# shift word 3 of the left vector to word 0
+	movq %mm1, %mm4
+	psllq $16, %mm4			# shift word 0,1,2 to 1,2,3
+	por %mm4, %mm0			# combine: the left neighbour of each middle pixel
+	pmulhw %mm5, %mm0
+	psllw $1, %mm0
+
+	
+	# get middle pixels
+	movq %mm1, %mm4
+	pmulhw %mm6, %mm4
+	psllw $1, %mm4
+	paddsw %mm4, %mm0		# accumulate
+
+
+	# get right pixels
+	movq %mm2, %mm3
+	psllq $48, %mm3			# shift word 0 to word 3
+	movq %mm1, %mm4
+	psrlq $16, %mm4			# shift word 1,2,3 to 0,1,2
+	por %mm4, %mm3			# combine: the right neighbour of each middle pixel
+	pmulhw %mm7, %mm3
+	psllw $1, %mm3
+	paddsw %mm3, %mm0		# accumulate
+	
+	ret
+	
+.globl pixel_conv_hor_s16
+.type  pixel_conv_hor_s16,@function
+
+
+# pixel_conv_hor_s16(short int *pixel_array, int nb_4_pixel_vectors, short int border[4], short int mask[12])
+#
+# In-place horizontal 3-tap convolution of one row of 16 bit pixels.
+# mask holds three 4-word weight vectors (left, middle, right tap);
+# border is the 4-pixel vector used as the neighbour beyond both ends of
+# the row.
+#
+# nb_4_pixel_vectors must be at least 2.  The first two vectors are always
+# read before the count is looked at, so smaller counts are not supported.
+# A count of exactly 2 skips the main loop and only runs the two tail
+# stubs (previously the counter wrapped around in that case).
+# NOT TESTED
+
+	
+pixel_conv_hor_s16: 
+
+		
+	pushl %ebp
+	movl %esp, %ebp
+	push %esi
+	push %edi
+
+	movl 8(%ebp),  %esi	# pixel array offset
+	movl 12(%ebp), %ecx	# nb of 4 pixel vectors in a row (at least 2)
+
+	movl 20(%ebp), %edi	# mask vector
+	movq (%edi), %mm5	# left tap weights
+	movq 8(%edi), %mm6	# middle tap weights
+	movq 16(%edi), %mm7	# right tap weights
+	
+	movl 16(%ebp), %edi	# boundary pixel vector
+	
+	
+
+	movq (%edi), %mm0	# init regs: left neighbour of the first vector is the border
+	movq (%esi), %mm1
+	movq 8(%esi), %mm2
+
+	decl %ecx		# loop has 2 terminator stubs
+	decl %ecx		# ecx <- nb of vectors left for the main loop
+	jz .conv_hor_tail	# exactly 2 vectors: the stubs do all the work
+	
+	jmp .conv_line_loop	# jump over the alignment padding
+
+
+	.align 16
+	.conv_line_loop:	
+	call .conv_hor_4_pixels	# compute conv 
+	movq %mm0, (%esi)	# store result
+	movq %mm1, %mm0		# mm0 <- prev (%esi)
+	movq %mm2, %mm1		# mm1 <- 8(%esi)
+	movq 16(%esi), %mm2	# mm2 <- 16(%esi)
+	
+	addl $8, %esi		# increase pointer
+	decl %ecx
+	jnz .conv_line_loop
+
+	.conv_hor_tail:
+	call .conv_hor_4_pixels	# compute conv for the second to last vector
+	movq %mm0, (%esi)	# store result
+	movq %mm1, %mm0		# mm0 <- prev (%esi)
+	movq %mm2, %mm1		# mm1 <- 8(%esi)
+	movq (%edi), %mm2	# mm2 <- border
+
+	call .conv_hor_4_pixels	# compute last vector
+	movq %mm0, 8(%esi)	# store it
+	
+	emms
+	
+	pop %edi
+	pop %esi
+	leave
+	ret
+
+
+
diff --git a/system/mmx/pixel_conv_ver_s16.s b/system/mmx/pixel_conv_ver_s16.s
new file mode 100644
index 0000000..ae2456f
--- /dev/null
+++ b/system/mmx/pixel_conv_ver_s16.s
@@ -0,0 +1,128 @@
+#    Pure Data Packet mmx routine.
+#    Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+# 
+#    This program is free software; you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation; either version 2 of the License, or
+#    (at your option) any later version.
+# 
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+# 
+#    You should have received a copy of the GNU General Public License
+#    along with this program; if not, write to the Free Software
+#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+#TODO:	 fix out of bound acces in conv_ver and conv_hor
+	
+	# intermediate function
+	
+	# input in register:	
+	# %mm0:	top 4 pixels
+	# %mm1:	middle 4 pixels
+	# %mm2:	bottom 4 pixels
+
+	# %mm5:	top 4 pixel mask
+	# %mm6:	middle 4 pixel mask
+	# %mm7:	bottom 4 pixel mask
+	
+	# return in register:	 
+	# %mm0:	middle 4 pixels result
+
+	
+	# .conv_ver_4_pixels: 3-tap vertical convolution of one 4-pixel
+	# vector.  Register calling convention (see the list above): pixels in
+	# %mm0/%mm1/%mm2, per-tap weights in %mm5/%mm6/%mm7, result in %mm0.
+	# %mm1 and %mm2 are only read; %mm3 and %mm4 are used as scratch.
+	# Every product is formed with pmulhw (high 16 bits of the product)
+	# and then shifted left by one bit; the three terms are summed with
+	# signed saturation.
+	.conv_ver_4_pixels:	
+	.align 16
+	
+	# compute quadruplet
+
+	# get top pixel
+	pmulhw %mm5, %mm0
+	psllw $1, %mm0
+	
+	# get middle pixel
+	movq %mm1, %mm4
+	pmulhw %mm6, %mm4
+	psllw $1, %mm4
+	paddsw %mm4, %mm0		# accumulate
+
+	# get bottom pixel
+	movq %mm2, %mm3
+	pmulhw %mm7, %mm3
+	psllw $1, %mm3			# same one-bit correction as the other two taps
+	paddsw %mm3, %mm0		# accumulate
+
+	ret
+	
+.globl pixel_conv_ver_s16
+.type  pixel_conv_ver_s16,@function
+
+
+# pixel_conv_ver_s16(short int *pixel_array, int nb_4_pixel_vectors, int row_size, short int border[4], short int mask[12])
+#
+# In-place vertical 3-tap convolution of one column that is 4 pixels (one
+# quadword) wide.  mask holds three 4-word weight vectors (top, middle,
+# bottom tap); border is the 4-pixel vector used as the neighbour above
+# the first and below the last row.
+#
+# nb_4_pixel_vectors is the number of rows and must be at least 2.  The
+# first two rows are always read before the count is looked at, so smaller
+# counts are not supported.  A count of exactly 2 skips the main loop and
+# only runs the two tail stubs (previously the counter wrapped around in
+# that case).
+# NOT TESTED
+
+	
+pixel_conv_ver_s16: 
+
+		
+	pushl %ebp
+	movl %esp, %ebp
+	push %esi
+	push %edi
+
+	movl 8(%ebp),  %esi		# pixel array offset
+	movl 12(%ebp), %ecx		# nb of 4 pixel vectors in a column (at least 2)
+	movl 16(%ebp), %edx		# row size
+
+	movl 24(%ebp), %edi		# mask vector
+	movq (%edi), %mm5		# top tap weights
+	movq 8(%edi), %mm6		# middle tap weights
+	movq 16(%edi), %mm7		# bottom tap weights
+	
+	movl 20(%ebp), %edi		# edge vector
+
+
+	# NOTE(review): the row size argument is doubled before it is used
+	# as a byte stride, which suggests it is passed in 16 bit pixels
+	# rather than in bytes -- confirm against the caller
+	shll $1, %edx
+
+	movq (%edi), %mm0		# init regs: neighbour above the first row is the edge vector
+	movq (%esi), %mm1
+	movq (%esi,%edx,1), %mm2
+
+	decl %ecx			# loop has a terminator stub
+	decl %ecx			# loop has another terminator stub
+	jz .conv_ver_tail		# exactly 2 rows: the stubs do all the work
+	jmp .conv_line_loop		# jump over the alignment padding
+
+
+	.align 16
+	.conv_line_loop:	
+	call .conv_ver_4_pixels		# compute conv 
+	movq %mm0, (%esi)		# store result
+	movq %mm1, %mm0			# mm0 <- prev (%esi)
+	movq %mm2, %mm1			# mm1 <- (%esi,%edx,1)
+	movq (%esi,%edx,2), %mm2	# mm2 <- (%esi,%edx,2)
+	
+	addl %edx, %esi			# increase pointer
+	decl %ecx
+	jnz .conv_line_loop
+
+	.conv_ver_tail:
+	call .conv_ver_4_pixels		# compute conv for the second to last row
+	movq %mm0, (%esi)		# store result
+	movq %mm1, %mm0			# mm0 <- prev (%esi)
+	movq %mm2, %mm1			# mm1 <- (%esi,%edx,1)
+	movq (%edi), %mm2		# neighbour below the last row is the edge vector
+
+	addl %edx, %esi			# increase pointer
+	call .conv_ver_4_pixels		# compute last vector
+	movq %mm0, (%esi)		# store it
+	
+	emms
+	
+	pop %edi
+	pop %esi
+	leave
+	ret
+
+
+
diff --git a/system/mmx/pixel_crot_s16.s b/system/mmx/pixel_crot_s16.s
new file mode 100644
index 0000000..2427869
--- /dev/null
+++ b/system/mmx/pixel_crot_s16.s
@@ -0,0 +1,153 @@
+#    Pure Data Packet mmx routine.
+#    Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+# 
+#    This program is free software; you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation; either version 2 of the License, or
+#    (at your option) any later version.
+# 
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+# 
+#    You should have received a copy of the GNU General Public License
+#    along with this program; if not, write to the Free Software
+#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_crot3d_s16
+.type  pixel_crot3d_s16,@function
+
+
+# 3 dimensional colour space rotation
+# 3x3 matrix is column encoded, each coefficient is a 4x16 bit fixed point vector
+	
+# void pixel_crot3d_s16(int *buf, int nb_4pixel_vectors_per_plane, short int *matrix)
+#
+# buf holds three consecutive planes of nb_4pixel_vectors_per_plane
+# quadwords each; they are transformed in place.  The 9 coefficient vectors
+# are read at byte offsets 0..64 from matrix, three per input component.
+# Products are taken with pmulhw, summed with signed saturation and then
+# doubled.  The loop is bottom-tested: a count of 0 is not handled.
+
+pixel_crot3d_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %esi
+	push %edi
+
+	
+	movl 8(%ebp),  %esi	# input array
+	movl 12(%ebp), %ecx	# nb of 4 pixel vectors per plane
+	movl 16(%ebp), %edi	# rotation matrix
+	movl %ecx, %edx
+	shll $3, %edx		# %edx = plane spacing in bytes (8 bytes per vector)
+
+	
+	.align 16
+	.loop_crot3d:	
+
+	movq (%esi), %mm0		# get 1st component
+	movq (%esi,%edx,1), %mm6	# get 2nd component
+	movq (%esi,%edx,2), %mm7	# get 3rd component
+
+	movq %mm0, %mm1			# copy 1st component
+	movq %mm0, %mm2
+
+	pmulhw (%edi), %mm0		# mul first column
+	pmulhw 8(%edi), %mm1
+	pmulhw 16(%edi), %mm2
+
+	movq %mm6, %mm5			# copy 2nd component
+	movq %mm6, %mm3
+
+	pmulhw 24(%edi), %mm6		# mul second column
+	pmulhw 32(%edi), %mm5
+	pmulhw 40(%edi), %mm3
+
+	paddsw %mm6, %mm0		# accumulate
+	paddsw %mm5, %mm1
+	paddsw %mm3, %mm2
+
+	movq %mm7, %mm4			# copy 3rd component
+	movq %mm7, %mm6
+
+	pmulhw 48(%edi), %mm4		# mul third column
+	pmulhw 56(%edi), %mm6
+	pmulhw 64(%edi), %mm7
+
+	paddsw %mm4, %mm0		# accumulate
+	paddsw %mm6, %mm1
+	paddsw %mm7, %mm2
+
+	paddsw %mm0, %mm0		# double (fixed point normalization)
+	paddsw %mm1, %mm1
+	paddsw %mm2, %mm2
+
+	movq %mm0, (%esi)		# store
+	movq %mm1, (%esi, %edx, 1)
+	movq %mm2, (%esi, %edx, 2)
+
+	addl $8, %esi			# increment source pointer
+	decl %ecx
+	jnz .loop_crot3d		# loop
+
+	emms
+	
+	pop %edi
+	pop %esi
+	leave
+	ret
+	
+
+.globl pixel_crot2d_s16
+.type  pixel_crot2d_s16,@function
+	
+# 2 dimensional colour space rotation
+# 2x2 matrix is column encoded, each coefficient is a 4x16 bit fixed point vector
+	
+# void pixel_crot2d_s16(int *buf, int nb_4pixel_vectors_per_plane, short int *matrix)
+#
+# buf holds two consecutive planes of nb_4pixel_vectors_per_plane
+# quadwords each; they are transformed in place.  The 4 coefficient vectors
+# are read at byte offsets 0, 8, 16 and 24 from matrix.  Products are taken
+# with pmulhw, summed with signed saturation and then doubled.
+# The loop is bottom-tested: a count of 0 is not handled.
+
+pixel_crot2d_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %esi
+	push %edi
+
+	
+	movl 8(%ebp),  %esi	# input array
+	movl 12(%ebp), %ecx	# nb of 4 pixel vectors per plane
+	movl 16(%ebp), %edi	# rotation matrix
+	movl %ecx, %edx
+	shll $3, %edx		# %edx = plane spacing in bytes (8 bytes per vector)
+
+	
+	.align 16
+	.loop_crot2d:	
+
+	movq (%esi), %mm0		# get 1st component
+	movq (%esi,%edx,1), %mm2	# get 2nd component
+
+	movq %mm0, %mm1			# copy 1st component
+	movq %mm2, %mm3			# copy 2nd component
+
+	pmulhw (%edi), %mm0		# mul first column
+	pmulhw 8(%edi), %mm1
+
+	pmulhw 16(%edi), %mm2		# mul second column
+	pmulhw 24(%edi), %mm3
+
+	paddsw %mm2, %mm0		# accumulate
+	paddsw %mm3, %mm1
+
+	paddsw %mm0, %mm0		# fixed point gain correction
+	paddsw %mm1, %mm1
+
+	movq %mm0, (%esi)		# store
+	movq %mm1, (%esi, %edx, 1)
+
+	addl $8, %esi			# increment source pointer
+	decl %ecx
+	jnz .loop_crot2d		# loop
+
+	emms
+	
+	pop %edi
+	pop %esi
+	leave
+	ret
+	
diff --git a/system/mmx/pixel_gain.s b/system/mmx/pixel_gain.s
new file mode 100644
index 0000000..5cd5057
--- /dev/null
+++ b/system/mmx/pixel_gain.s
@@ -0,0 +1,83 @@
+#    Pure Data Packet mmx routine.
+#    Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+# 
+#    This program is free software; you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation; either version 2 of the License, or
+#    (at your option) any later version.
+# 
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+# 
+#    You should have received a copy of the GNU General Public License
+#    along with this program; if not, write to the Free Software
+#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_gain
+.type  pixel_gain,@function
+
+# mmx rgba pixel gain
+# void pixel_gain(char *pixelarray, int32 nbpixels, int *rgba_gain)
+# gains are 7.9 fixed point for rgba
+#
+# Scales 8 bit pixel components in place.  Each loop iteration handles 16
+# bytes; nbpixels is divided by 4 to get the iteration count and nothing
+# is done when that quotient is 0.  The 4 gain words read from rgba_gain
+# are applied to every group of 4 consecutive bytes, and results are
+# packed back with unsigned saturation.
+# NOTE(review): `prefetch` is a 3DNow! mnemonic rather than baseline MMX --
+# confirm the intended target CPUs.
+
+pixel_gain:
+	pushl %ebp
+	movl %esp, %ebp
+	push %esi
+	push %edi
+
+	movl 8(%ebp),  %esi	# pixel array offset
+	movl 12(%ebp), %ecx	# nb of elements
+	movl 16(%ebp), %edi	# int16[4] array of gains
+
+	prefetch (%esi)
+
+	emms
+	sarl $2, %ecx		# process 4 pixels per loop iteration
+	jz .exit		# fewer than 4 pixels: nothing to do
+	movq (%edi), %mm7	# read gain array from memory
+	jmp .loop_gain		# jump over the alignment padding
+
+	.align 16
+	.loop_gain:	
+
+	prefetch 128(%esi)	
+	movq (%esi), %mm5	# load pixel 1-2  from memory
+	movq 8(%esi), %mm6	# load pixel 3-4  from memory
+	pxor %mm0, %mm0		# zero mm0 - mm3
+	pxor %mm1, %mm1
+	pxor %mm2, %mm2
+	pxor %mm3, %mm3
+	punpcklbw %mm5, %mm0	# unpack 1st pixel into 8.8 bit ints (byte goes to the high half of each word)
+	punpckhbw %mm5, %mm1	# unpack 2nd
+	punpcklbw %mm6, %mm2	# unpack 3rd
+	punpckhbw %mm6, %mm3	# unpack 4th
+	psrlw $0x1, %mm0	# shift right to clear sign bit 9.7
+	psrlw $0x1, %mm1
+	psrlw $0x1, %mm2
+	psrlw $0x1, %mm3
+	
+	pmulhw %mm7, %mm0	# multiply 1st pixel 9.7 * 7.9 -> 16.0
+	pmulhw %mm7, %mm1	# multiply 2nd  
+	pmulhw %mm7, %mm2	# multiply 3rd
+	pmulhw %mm7, %mm3	# multiply 4th 
+
+	packuswb %mm1, %mm0	# pack & saturate to 8bit vector
+	movq %mm0, (%esi)	# store result in memory
+	packuswb %mm3, %mm2	# pack & saturate to 8bit vector
+	movq %mm2, 8(%esi)	# store result in memory
+
+	addl $16, %esi		# increment source pointer
+	decl %ecx
+	jnz .loop_gain		# loop
+
+	.exit:
+	emms
+	
+	pop %edi
+	pop %esi
+	leave
+	ret
+	
diff --git a/system/mmx/pixel_gain_s16.s b/system/mmx/pixel_gain_s16.s
new file mode 100644
index 0000000..adcfdf5
--- /dev/null
+++ b/system/mmx/pixel_gain_s16.s
@@ -0,0 +1,71 @@
+#    Pure Data Packet mmx routine.
+#    Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+# 
+#    This program is free software; you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation; either version 2 of the License, or
+#    (at your option) any later version.
+# 
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+# 
+#    You should have received a copy of the GNU General Public License
+#    along with this program; if not, write to the Free Software
+#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_gain_s16
+.type  pixel_gain_s16,@function
+
+# gain is integer, shift count is down	
+# void pixel_gain_s16(int *buf, int nb_8pixel_vectors, short int gain[4], unsigned long long *shift)
+#
+# In place: every 16 bit pixel is multiplied by the matching gain word to
+# a full 32 bit product, shifted right arithmetically by *shift bits and
+# packed back to 16 bit with signed saturation.
+# NOTE(review): each loop iteration consumes 8 bytes (4 pixels), so the
+# count looks like a number of 4-pixel vectors despite the parameter name
+# -- confirm against the caller.
+# The loop is bottom-tested: a count of 0 is not handled.
+
+pixel_gain_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %esi
+	push %edi
+
+	movl 20(%ebp), %edi
+	movq (%edi), %mm6	# get shift vector
+
+	movl 16(%ebp), %edi
+	movq (%edi), %mm7	# get gain vector
+	
+	movl 8(%ebp),  %esi	# input array
+	movl 12(%ebp), %ecx	# vector count (one vector per iteration)
+
+	
+	.align 16
+	.loop_gain:	
+
+	movq (%esi), %mm0	# load 4 pixels from memory
+	movq %mm0, %mm1		
+	pmulhw %mm7, %mm1	# apply gain (s15.0) fixed point, high word
+	pmullw %mm7, %mm0	# low word
+
+	movq %mm0, %mm2		# copy
+	movq %mm1, %mm3
+
+	punpcklwd %mm1, %mm0	# interleave low/high words: 32 bit products of pixels 0 and 1
+	punpckhwd %mm3, %mm2	# 32 bit products of pixels 2 and 3
+
+	psrad %mm6, %mm0	# apply signed shift
+	psrad %mm6, %mm2
+
+	packssdw %mm2, %mm0	# pack result & saturate
+	movq %mm0, (%esi)	# store result
+	
+
+	addl $8, %esi		# increment source pointer
+	decl %ecx
+	jnz .loop_gain		# loop
+
+	emms
+	
+	pop %edi
+	pop %esi
+	leave
+	ret
+	
diff --git a/system/mmx/pixel_mix_s16.s b/system/mmx/pixel_mix_s16.s
new file mode 100644
index 0000000..9bf41eb
--- /dev/null
+++ b/system/mmx/pixel_mix_s16.s
@@ -0,0 +1,68 @@
+#    Pure Data Packet mmx routine.
+#    Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+# 
+#    This program is free software; you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation; either version 2 of the License, or
+#    (at your option) any later version.
+# 
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+# 
+#    You should have received a copy of the GNU General Public License
+#    along with this program; if not, write to the Free Software
+#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_mix_s16
+.type  pixel_mix_s16,@function
+
+# mmx 16 bit pixel mix: left <- left * gain_left + right * gain_right
+# void pixel_mix_s16(int *left, int *right, int nb_4pixel_vectors, 
+#	short int gain_left[4], short int gain_right[4])
+#
+# The result overwrites the left array; the right array is only read.
+# Each product is taken with pmulhw and doubled with a saturating add
+# before the two terms are summed with signed saturation.
+# The loop is bottom-tested: a count of 0 is not handled.
+
+pixel_mix_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %esi
+	push %edi
+
+	movl 20(%ebp), %edi	# int16[4] array of gains
+	movq (%edi), %mm6	# get left gain array
+
+	movl 24(%ebp), %edi	# int16[4] array of gains
+	movq (%edi), %mm7	# get right gain array
+	
+	movl 8(%ebp),  %edi	# left array
+	movl 12(%ebp), %esi	# right array
+	movl 16(%ebp), %ecx	# nb of 4 pixel vectors
+
+	
+	.align 16
+	.loop_mix:	
+
+#	prefetch 128(%esi)	
+	movq (%esi), %mm1	# load right 4 pixels from memory
+	pmulhw %mm7, %mm1	# apply right gain
+	movq (%edi), %mm0	# load 4 left pixels from memory
+	pmulhw %mm6, %mm0	# apply left gain
+#	pslaw $1, %mm1		# shift left ((s).15 x (s).15 -> (s0).14))
+#	pslaw $1, %mm0
+	paddsw %mm0, %mm0	# no arithmetic shift left, so use a saturating add instead
+	paddsw %mm1, %mm1
+	paddsw %mm1, %mm0	# mix
+	movq %mm0, (%edi)	# store into the left array
+	addl $8, %esi
+	addl $8, %edi
+	decl %ecx
+	jnz .loop_mix		# loop
+
+	emms
+
+	
+	pop %edi
+	pop %esi
+	leave
+	ret
+	
diff --git a/system/mmx/pixel_mul_s16.s b/system/mmx/pixel_mul_s16.s
new file mode 100644
index 0000000..240a024
--- /dev/null
+++ b/system/mmx/pixel_mul_s16.s
@@ -0,0 +1,56 @@
+#    Pure Data Packet mmx routine.
+#    Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+# 
+#    This program is free software; you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation; either version 2 of the License, or
+#    (at your option) any later version.
+# 
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+# 
+#    You should have received a copy of the GNU General Public License
+#    along with this program; if not, write to the Free Software
+#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_mul_s16
+.type  pixel_mul_s16,@function
+
+# simple multiply: left <- left * right
+# void pixel_mul_s16(int *left, int *right, int nb_4pixel_vectors)
+#
+# The result overwrites the left array; the right array is only read.
+# The product is taken with pmulhw and then shifted left by one bit; the
+# shift does not saturate.
+# The loop is bottom-tested: a count of 0 is not handled.
+
+pixel_mul_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %esi
+	push %edi
+
+	movl 8(%ebp),  %edi	# left array
+	movl 12(%ebp), %esi	# right array
+	movl 16(%ebp), %ecx	# nb of 4 pixel vectors
+
+	
+	.align 16
+	.loop_mix:	
+
+#	prefetch 128(%esi)	
+	movq (%esi), %mm1	# load right 4 pixels from memory
+	movq (%edi), %mm0	# load 4 left pixels from memory
+	pmulhw %mm1, %mm0	# mul
+	psllw $1, %mm0		# fixed point shift correction
+	movq %mm0, (%edi)	# store into the left array
+	addl $8, %esi
+	addl $8, %edi
+	decl %ecx
+	jnz .loop_mix		# loop
+
+	emms
+
+	
+	pop %edi
+	pop %esi
+	leave
+	ret
+	
diff --git a/system/mmx/pixel_pack_s16u8.s b/system/mmx/pixel_pack_s16u8.s
new file mode 100644
index 0000000..57df702
--- /dev/null
+++ b/system/mmx/pixel_pack_s16u8.s
@@ -0,0 +1,126 @@
+#    Pure Data Packet mmx routine.
+#    Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+# 
+#    This program is free software; you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation; either version 2 of the License, or
+#    (at your option) any later version.
+# 
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+# 
+#    You should have received a copy of the GNU General Public License
+#    along with this program; if not, write to the Free Software
+#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_pack_s16u8_y
+.type  pixel_pack_s16u8_y,@function
+
+# pack signed 16 bit pixels into unsigned 8 bit pixels
+# void pixel_pack_s16u8_y(int *input, int *output, int nb_8pixel_vectors)
+#
+# Each iteration reads 8 words from input, shifts them right
+# arithmetically by 7 bits and packs them with unsigned saturation into 8
+# bytes of output, so negative inputs become 0.
+# The loop is bottom-tested: a count of 0 is not handled.
+
+pixel_pack_s16u8_y:
+	pushl %ebp
+	movl %esp, %ebp
+	push %esi
+	push %edi
+
+#	movl 20(%ebp), %edi	# int16[4] array of gains
+#	movq (%edi), %mm7	# get gain array
+#	psllw $1, %mm7		# adjust for shifted sign bit
+	
+	movl 8(%ebp),  %esi	# input array
+	movl 12(%ebp), %edi	# output array
+	movl 16(%ebp), %ecx	# nb of 8 pixel vectors
+
+	pxor %mm6, %mm6		# zero; only referenced by the disabled code below
+	
+	.align 16
+	.loop_pack_y:	
+
+#	prefetch 128(%esi)	
+	movq (%esi), %mm0	# load 4 pixels from memory
+#	pmulhw %mm7, %mm0	# apply gain
+	movq 8(%esi), %mm1	# load 4 pixels from memory
+#	pmulhw %mm7, %mm1	# apply gain
+
+#	movq %mm0, %mm2
+#	pcmpgtw %mm6, %mm2	# mm2 > 0 ?  0xffff :	0
+#	pand %mm2, %mm0 
+
+#	movq %mm1, %mm3
+#	pcmpgtw %mm6, %mm3	# mm3 > 0 ?  0xffff :	0
+#	pand %mm3, %mm1 
+
+#	psllw $1, %mm0		# shift out sign bit
+#	psllw $1, %mm1		# shift out sign bit
+
+	psraw $7, %mm0		# shift to lsb
+	psraw $7, %mm1		# shift to lsb
+	
+	packuswb %mm1, %mm0	# pack & saturate to 8bit vector
+	movq %mm0, (%edi)	# store result in memory
+
+	addl $16, %esi		# increment source pointer
+	addl $8, %edi		# increment dest pointer
+	decl %ecx
+	jnz .loop_pack_y	# loop
+
+	emms
+	
+	pop %edi
+	pop %esi
+	leave
+	ret
+	
+.globl pixel_pack_s16u8_uv
+.type  pixel_pack_s16u8_uv,@function
+
+# pack signed 16 bit pixels into offset 8 bit pixels
+# The three stack arguments are read in the same order as in
+# pixel_pack_s16u8_y: source pointer, destination pointer, count.
+#
+# Each iteration reads 8 words, shifts them right arithmetically by 8
+# bits, packs them with signed saturation into 8 bytes and flips the top
+# bit of every byte (xor with 0x80) before storing.
+# The loop is bottom-tested: a count of 0 is not handled.
+
+pixel_pack_s16u8_uv:
+	pushl %ebp
+	movl %esp, %ebp
+	push %esi
+	push %edi
+
+#	movl 20(%ebp), %edi	# int16[4] array of gains
+#	movq (%edi), %mm7	# get gain array
+	movl 8(%ebp),  %esi	# input array
+	movl 12(%ebp), %edi	# output array
+	movl 16(%ebp), %ecx	# nb of 8 pixel vectors
+
+	pcmpeqw %mm6, %mm6	# mm6 <- all ones
+	psllw $15, %mm6		# mm6 <- 4 times 0x8000
+	movq %mm6, %mm5
+	psrlw $8, %mm5		# mm5 <- 4 times 0x0080
+	por %mm5, %mm6		# mm6 <- 8 times 0x80
+	
+	.align 16
+	.loop_pack_uv:	
+
+#	prefetch 128(%esi)	
+	movq (%esi), %mm0	# load 4 pixels from memory
+#	pmulhw %mm7, %mm0	# apply gain
+	movq 8(%esi), %mm1	# load 4 pixels from memory
+#	pmulhw %mm7, %mm1	# apply gain
+
+	psraw $8, %mm0		# move the high byte of each word down to the low byte
+	psraw $8, %mm1
+	
+	packsswb %mm1, %mm0	# pack & saturate to 8bit vector
+	pxor %mm6, %mm0		# flip sign bits
+	movq %mm0, (%edi)	# store result in memory
+
+	addl $16, %esi		# increment source pointer
+	addl $8, %edi		# increment dest pointer
+	decl %ecx
+	jnz .loop_pack_uv	# loop
+
+	emms
+	
+	pop %edi
+	pop %esi
+	leave
+	ret
+	
diff --git a/system/mmx/pixel_rand_s16.s b/system/mmx/pixel_rand_s16.s
new file mode 100644
index 0000000..649400b
--- /dev/null
+++ b/system/mmx/pixel_rand_s16.s
@@ -0,0 +1,76 @@
+#    Pure Data Packet mmx routine.
+#    Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+# 
+#    This program is free software; you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation; either version 2 of the License, or
+#    (at your option) any later version.
+# 
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+# 
+#    You should have received a copy of the GNU General Public License
+#    along with this program; if not, write to the Free Software
+#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_rand_s16
+.type  pixel_rand_s16,@function
+
+# mmx pseudo random pixel generator
+# void pixel_rand_s16(int *dst, nb_4pixel_vectors, short int random_seed[4])
+#
+# Fills dst with pseudo random words.  The 4 words of random_seed are 4
+# independent 16 bit shift registers: on every step each word is shifted
+# left by one and its new bit 0 is the xor of its former bits 15, 14, 12
+# and 3.  The updated registers are written to dst on every step and
+# stored back into random_seed when the loop ends.
+# The loop is bottom-tested: a count of 0 is not handled.
+
+pixel_rand_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %esi
+	push %edi
+
+	movl 16(%ebp), %esi	# int16[4] array of random seeds
+	movl 8(%ebp),  %edi	# dst array
+	movl 12(%ebp), %ecx	# nb of 4 pixel vectors
+
+	movq (%esi), %mm6	# mm6 <- current state of the 4 shift registers
+
+
+	pcmpeqw %mm3, %mm3
+	psrlw $15, %mm3		# get bit mask 4 times 0x0001
+	
+	.align 16
+	.loop_rand:	
+
+#	prefetch 128(%esi)	
+
+
+	movq %mm6, %mm4		# get random vector
+	psrlw $15, %mm4		# get first component (bit 15 -> bit 0)
+	movq %mm6, %mm5
+	psrlw $14, %mm5		# get second component (bit 14 -> bit 0)
+	pxor %mm5, %mm4
+	movq %mm6, %mm5
+	psrlw $12, %mm5		# get third component (bit 12 -> bit 0)
+	pxor %mm5, %mm4
+	movq %mm6, %mm5
+	psrlw $3, %mm5		# get fourth component (bit 3 -> bit 0)
+	pxor %mm5, %mm4
+
+	psllw $1, %mm6		# shift left original random vector
+	pand %mm3, %mm4		# isolate new bit
+	por %mm4, %mm6		# combine into new random vector
+
+	movq %mm6, (%edi)	# emit the new state as 4 output pixels
+	addl $8, %edi
+	decl %ecx
+	jnz .loop_rand	# loop
+
+
+	movq %mm6, (%esi)	# store random seeds
+
+	emms
+	
+	pop %edi
+	pop %esi
+	leave
+	ret
+	
diff --git a/system/mmx/pixel_randmix_s16.s b/system/mmx/pixel_randmix_s16.s
new file mode 100644
index 0000000..44e1702
--- /dev/null
+++ b/system/mmx/pixel_randmix_s16.s
@@ -0,0 +1,91 @@
+#    Pure Data Packet mmx routine.
+#    Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+# 
+#    This program is free software; you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation; either version 2 of the License, or
+#    (at your option) any later version.
+# 
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+# 
+#    You should have received a copy of the GNU General Public License
+#    along with this program; if not, write to the Free Software
+#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_randmix_s16
+.type  pixel_randmix_s16,@function
+
+# mmx random mix: every word of left is kept or replaced by the word of right, selected by a random vector
+# void pixel_randmix_s16(int *left, int *right, int nb_4pixel_vectors, short int random_seed[4], short int threshold[4])
+
+pixel_randmix_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %esi			# esi and edi are restored before returning
+	push %edi
+
+	movl 20(%ebp), %edi	# int16[4] array of random seeds
+	movq (%edi), %mm6	# mm6 = the 4 random words (generator state)
+
+	movl 24(%ebp), %edi	# int16[4] array of thresholds
+	movq (%edi), %mm7	# mm7 = the 4 thresholds
+	
+	movl 8(%ebp),  %edi	# left array
+	movl 12(%ebp), %esi	# right array
+	movl 16(%ebp), %ecx	# loop count: every iteration handles one vector of 4 int16
+
+	pcmpeqw %mm3, %mm3	# all bits set
+	psrlw $15, %mm3		# get bit mask 4 times 0x0001
+	
+	.align 16
+	.loop_randmix:	
+	# NOTE: the count is only tested at the end of the loop body, so it has to be nonzero
+#	prefetch 128(%esi)	
+	movq (%esi), %mm1	# load right 4 pixels from memory
+	movq (%edi), %mm0	# load 4 left pixels from memory
+
+	movq %mm6, %mm2		# get random vector
+	pcmpgtw %mm7, %mm2	# compare random vector with threshold (signed): all ones where random > threshold
+	movq %mm2, %mm5		# second copy of the selection mask
+	
+	pand %mm0, %mm2		# get left array's components (where random > threshold)
+	pandn %mm1, %mm5	# get right array's components (everywhere else)
+	por %mm2, %mm5		# merge both selections
+	
+	movq %mm5, (%edi)	# store pixels (the result replaces the left vector)
+	# next random vector: new bit 0 of each word = bit15 ^ bit14 ^ bit12 ^ bit3 of the old word
+	movq %mm6, %mm4		# get random vector
+	psrlw $15, %mm4		# get first component
+	movq %mm6, %mm5
+	psrlw $14, %mm5		# get second component
+	pxor %mm5, %mm4
+	movq %mm6, %mm5
+	psrlw $12, %mm5		# get third component
+	pxor %mm5, %mm4
+	movq %mm6, %mm5
+	psrlw $3, %mm5		# get fourth component
+	pxor %mm5, %mm4
+
+	psllw $1, %mm6		# shift left original random vector
+	pand %mm3, %mm4		# isolate new bit
+	por %mm4, %mm6		# combine into new random vector
+	
+	addl $8, %esi		# advance both arrays by one vector (8 bytes)
+	addl $8, %edi
+	decl %ecx
+	jnz .loop_randmix	# loop
+
+
+	movl 20(%ebp), %edi	# int16[4] array of random seeds
+	movq %mm6, (%edi)	# store random seeds
+
+	emms
+	
+	pop %edi
+	pop %esi
+	leave
+	ret
+	
diff --git a/system/mmx/pixel_s1.s b/system/mmx/pixel_s1.s
new file mode 100644
index 0000000..d6bc5ca
--- /dev/null
+++ b/system/mmx/pixel_s1.s
@@ -0,0 +1,201 @@
+#    Pure Data Packet mmx routine.
+#    Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+# 
+#    This program is free software; you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation; either version 2 of the License, or
+#    (at your option) any later version.
+# 
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+# 
+#    You should have received a copy of the GNU General Public License
+#    along with this program; if not, write to the Free Software
+#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+
+	# this file contains ops for binary image processing
+	# 8x8 bit tile encoded
+	# low byte = bottom row
+	# low bit = right column
+	# %mm7 = scratch reg for all macros
+
+
+	# ************ load mask *******************
+	# compute bit masks for rows and columns
+	# %mm7:	 scratch reg
+
+	# load mask top: reg = all ones in the count top rows (highest bytes), zero elsewhere
+	.macro ldmt count reg
+	pcmpeqb \reg, \reg		# all bits set
+	psllq $(64-(\count<<3)), \reg	# shift in 8 zero bits for every row that is not selected
+	.endm
+
+	# load mask bottom: reg = all ones in the count bottom rows (lowest bytes), zero elsewhere
+	.macro ldmb count reg
+	pcmpeqb \reg, \reg		# all bits set
+	psrlq $(64-(\count<<3)), \reg	# shift in 8 zero bits for every row that is not selected
+	.endm
+
+	# load mask top and bottom: regt = ldmt mask, regb = ldmb mask, same count for both
+	.macro ldmtb count regt regb
+	ldmb \count, \regb
+	ldmt \count, \regt
+	.endm
+
+	# load mask right: reg = the count low bits set in every byte (clobbers %mm7)
+	.macro ldmr count reg
+	pcmpeqb %mm7, %mm7		# all bits set
+	psrlw $(16-\count), %mm7	# count low bits of every word
+	movq %mm7, \reg
+	psllq $8, %mm7			# the same bits, moved to the high byte of every word
+	por %mm7, \reg
+	.endm
+
+	# load mask left: reg = the count high bits set in every byte (clobbers %mm7)
+	.macro ldml count reg
+	pcmpeqb %mm7, %mm7		# all bits set
+	psllw $(16-\count), %mm7	# count high bits of every word
+	movq %mm7, \reg
+	psrlq $8, %mm7			# the same bits, moved to the low byte of every word
+	por %mm7, \reg
+	.endm
+
+	# load mask left and right: regl = ldml mask, regr = ldmr mask (clobbers %mm7)
+	.macro ldmlr count regl regr
+	pcmpeqb %mm7, %mm7		# all bits set
+	psllw $(16-\count), %mm7	# count high bits of every word
+	movq %mm7, \regl
+	psrlq $8, %mm7
+	por %mm7, \regl			# regl = left mask
+	movq \regl, \regr
+	psrlq $(8-\count), \regr	# move the left mask to the low bits of every byte
+	.endm
+
+	# ************* shift square **********
+	# shifts a square in reg, fills with zeros
+
+	# shift square top: move the rows count positions towards the top (high bytes)
+	.macro sst count reg
+	psllq $(\count<<3), \reg	# one byte per row
+	.endm
+
+	# shift square bottom: move the rows count positions towards the bottom (low bytes)
+	.macro ssb count reg
+	psrlq $(\count<<3), \reg	# one byte per row
+	.endm
+
+	# not tested
+	# shift square left: shift every byte left by count bits (clobbers %mm7)
+	.macro ssl count reg
+	movq \reg, %mm7			# save the square
+	pcmpeqb \reg, \reg		# all bits set
+	psllw $(16-\count), \reg	# count high bits of every word
+	psrlw $8, \reg			# now the count high bits of every low byte
+	pandn %mm7, \reg		# square with those bits cleared, so they can not enter the high byte
+	psllw $(\count), \reg
+	.endm
+
+	# shift square right: shift every byte right by count bits (clobbers %mm7)
+	.macro ssr count reg
+	movq \reg, %mm7			# save the square
+	pcmpeqb \reg, \reg		# all bits set
+	psrlw $(16-\count), \reg	# count low bits of every word
+	psllw $8, \reg			# now the count low bits of every high byte
+	pandn %mm7, \reg		# square with those bits cleared, so they can not enter the low byte
+	psrlw $(\count), \reg
+	.endm
+
+
+	# ********** combine square *************
+	# combines 2 squares
+
+	# combine right: reg = (reg shifted left by count) or (regr shifted right by 8-count); regr and %mm7 are clobbered
+	.macro csr count regr reg
+	ssl \count, \reg
+	ssr (8-\count), \regr
+	por \regr, \reg
+	.endm
+
+	# combine left: reg = (reg shifted right by count) or (regl shifted left by 8-count); regl and %mm7 are clobbered
+	.macro csl count regl reg
+	ssr \count, \reg
+	ssl (8-\count), \regl
+	por \regl, \reg
+	.endm
+
+	# combine top: reg = (reg shifted to the bottom by count rows) or (regt shifted to the top by 8-count rows); regt is clobbered
+	.macro cst count regt reg
+	ssb \count, \reg
+	sst (8-\count), \regt
+	por \regt, \reg
+	.endm
+
+	
+	# combine bottom: reg = (reg shifted to the top by count rows) or (regb shifted to the bottom by 8-count rows); regb is clobbered
+	.macro csb count regb reg
+	sst \count, \reg
+	ssb (8-\count), \regb
+	por \regb, \reg
+	.endm
+
+
+	# ********** load combine square *************
+	# loads combined square using mask
+
+	# load combined square left: dstreg = (source without the mask bits, shifted right by count) or (mask bits of sourcel, shifted left by 8-count)
+	# mask should be count bits set right (i.e. 0x01); clobbers %mm7
+	.macro lcsml count mask source sourcel dstreg
+	movq \mask, \dstreg
+	movq \mask, %mm7
+	pandn \source, \dstreg		# source with the mask bits cleared
+	pand \sourcel, %mm7		# only the mask bits of sourcel
+	psrlq $(\count), \dstreg	# the cleared bits keep neighbouring bytes from mixing
+	psllq $(8-\count), %mm7		# mask bits go to the high end of every byte
+	por %mm7, \dstreg
+	.endm
+	
+	
+			
+.globl pixel_test_s1
+.type  pixel_test_s1,@function
+
+# test routine for the macros above
+# void pixel_test_s1(void *dest, void *source, int nb_squares, int spacing)
+
+
+
+	#
+	
+
+pixel_test_s1:
+	pushl %ebp
+	movl %esp, %ebp
+	push %esi			# esi and edi are restored before returning
+	push %edi
+
+	movl 8(%ebp),  %edi	# dest
+	movl 12(%ebp), %esi	# source
+	movl 16(%ebp), %ecx	# count (loaded, but not used below)
+	movl 20(%ebp), %edx	# row distance (loaded, but not used below)
+
+	ldmr 1, %mm6		# mask with the lowest bit of every byte set
+	lcsml 1, %mm6, (%esi), 8(%esi), %mm0	# combine the squares at source and source + 8
+	movq %mm0, (%edi)	# store the single result square
+
+	# alternative test, disabled:
+#	movq (%esi), %mm0
+#	movq 8(%esi), %mm1
+#	csl 4, %mm1, %mm0
+#	movq %mm0, (%edi)
+
+	emms
+
+	
+	pop %edi
+	pop %esi
+	leave
+	ret
+	
diff --git a/system/mmx/pixel_unpack_u8s16.s b/system/mmx/pixel_unpack_u8s16.s
new file mode 100644
index 0000000..0fc14c2
--- /dev/null
+++ b/system/mmx/pixel_unpack_u8s16.s
@@ -0,0 +1,113 @@
+#    Pure Data Packet mmx routine.
+#    Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+# 
+#    This program is free software; you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation; either version 2 of the License, or
+#    (at your option) any later version.
+# 
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+# 
+#    You should have received a copy of the GNU General Public License
+#    along with this program; if not, write to the Free Software
+#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_unpack_u8s16_y
+.type  pixel_unpack_u8s16_y,@function
+
+# mmx unpack of unsigned 8 bit pixels to signed 16 bit words
+# void pixel_unpack_u8s16_y(char *input, char *output, int32 nb_pixels_div8)
+
+pixel_unpack_u8s16_y:
+	pushl %ebp
+	movl %esp, %ebp
+	push %esi			# esi and edi are restored before returning
+	push %edi
+
+#	movl 20(%ebp), %edi	# int16[4] array of gains
+#	movq (%edi), %mm7	# get gain array
+	
+	movl 8(%ebp),  %esi	# input uint8 pixel array
+	movl 12(%ebp), %edi	# output sint16 pixel array
+	movl 16(%ebp), %ecx	# nb of elements div 8
+
+	# NOTE: the count is only tested at the end of the loop body, so it has to be nonzero
+	.align 16
+	.loop_unpack_y:	
+
+	movq (%esi), %mm5	# load 8 pixels from memory
+	pxor %mm0, %mm0		# zero mm0 and mm1
+	pxor %mm1, %mm1
+	punpcklbw %mm5, %mm0	# unpack 1st 4 pixels: each pixel becomes the high byte of a word
+	punpckhbw %mm5, %mm1	# unpack 2nd 4 pixels
+	psrlw $0x1, %mm0	# shift right to clear sign bit 9.7
+	psrlw $0x1, %mm1
+#	pmulhw %mm7, %mm0	# apply gain
+#	pmulhw %mm7, %mm1
+#	paddsw %mm0, %mm0	# correct factor 2
+#	paddsw %mm1, %mm1
+	movq %mm0, (%edi)	# store
+	movq %mm1, 8(%edi)
+	
+	addl $8, %esi		# increment source pointer
+	addl $16, %edi		# increment dest pointer
+	decl %ecx
+	jnz .loop_unpack_y	# loop
+
+	emms
+	
+	pop %edi
+	pop %esi
+	leave
+	ret
+	
+.globl pixel_unpack_u8s16_uv
+.type  pixel_unpack_u8s16_uv,@function
+pixel_unpack_u8s16_uv:
+	pushl %ebp
+	movl %esp, %ebp
+	push %esi			# esi and edi are restored before returning
+	push %edi
+
+#	movl 20(%ebp), %edi	# int16[4] array of gains
+#	movq (%edi), %mm7	# get gain array
+
+	movl 8(%ebp),  %esi	# input uint8 pixel array
+	movl 12(%ebp), %edi	# output sint16 pixel array
+	movl 16(%ebp), %ecx	# nb of elements div 8
+
+	pcmpeqw %mm6, %mm6	# all bits set
+	psllw $15, %mm6		# mm6 = 4 times 0x8000 (the sign bit of every word)
+	
+	.align 16
+	.loop_unpack_uv:	
+	# NOTE: the count is only tested at the end of the loop body, so it has to be nonzero
+	movq (%esi), %mm5	# load 8 pixels from memory
+	pxor %mm0, %mm0		# zero mm0 and mm1
+	pxor %mm1, %mm1
+	punpcklbw %mm5, %mm0	# unpack 1st 4 pixels: each pixel becomes the high byte of a word
+	punpckhbw %mm5, %mm1	# unpack 2nd 4 pixels
+	pxor %mm6, %mm0		# flip sign bit (Cr and Cb are offset by 128)
+	pxor %mm6, %mm1
+#	pmulhw %mm7, %mm0	# apply gain
+#	pmulhw %mm7, %mm1
+#	paddsw %mm0, %mm0	# correct factor 2
+#	paddsw %mm1, %mm1
+	movq %mm0, (%edi)	# store
+	movq %mm1, 8(%edi)
+	
+	addl $8, %esi		# increment source pointer
+	addl $16, %edi		# increment dest pointer
+	decl %ecx
+	jnz .loop_unpack_uv	# loop
+
+	emms
+	
+	pop %edi
+	pop %esi
+	leave
+	ret
+	
diff --git a/system/pdp.c b/system/pdp.c
new file mode 100644
index 0000000..8651971
--- /dev/null
+++ b/system/pdp.c
@@ -0,0 +1,115 @@
+/*
+ *   Pure Data Packet system implementation: setup code
+ *   Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+
+#include "pdp.h"
+#include <stdio.h>
+
+/* all symbols are C style */
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+
+
+/* module setup declarations (all C-style) */
+
+/* pdp system / internal stuff */
+void pdp_packet_setup(void);
+void pdp_ut_setup(void);
+void pdp_queue_setup(void);
+void pdp_control_setup(void);
+
+/* pdp modules */
+void pdp_xv_setup(void);
+void pdp_add_setup(void);
+void pdp_mul_setup(void);
+void pdp_mix_setup(void);
+void pdp_randmix_setup(void);
+void pdp_qt_setup(void);
+void pdp_v4l_setup(void);
+void pdp_reg_setup(void);
+void pdp_conv_setup(void);
+void pdp_bq_setup(void);
+void pdp_affine_setup(void);
+void pdp_del_setup(void);
+void pdp_snap_setup(void);
+void pdp_trigger_setup(void);
+void pdp_route_setup(void);
+void pdp_noise_setup(void);
+void pdp_gradient_setup(void);
+void pdp_gain_setup(void);
+void pdp_grey_setup(void);
+void pdp_chrot_setup(void);
+void pdp_scope_setup(void);
+void pdp_scale_setup(void);
+void pdp_zoom_setup(void);
+
+
+/* library setup routine */
+void pdp_setup(void){
+    /* all setup routines, listed in the order they are called:
+       pdp system, utility toolkit, pdp modules */
+    static void (*const setup_table[])(void) = {
+	pdp_packet_setup,
+	pdp_queue_setup,
+	pdp_control_setup,
+	pdp_ut_setup,
+	pdp_add_setup,
+	pdp_mul_setup,
+	pdp_mix_setup,
+	pdp_randmix_setup,
+	pdp_xv_setup,
+	pdp_qt_setup,
+	pdp_v4l_setup,
+	pdp_reg_setup,
+	pdp_conv_setup,
+	pdp_bq_setup,
+	pdp_affine_setup,
+	pdp_del_setup,
+	pdp_snap_setup,
+	pdp_trigger_setup,
+	pdp_route_setup,
+	pdp_noise_setup,
+	pdp_gradient_setup,
+	pdp_gain_setup,
+	pdp_grey_setup,
+	pdp_chrot_setup,
+	pdp_scope_setup,
+	pdp_scale_setup,
+	pdp_zoom_setup
+    };
+    const unsigned int nb_entries = sizeof(setup_table) / sizeof(setup_table[0]);
+    unsigned int k;
+
+    /* announce the library before anything is set up */
+    post ("PDP: pure data packet");
+
+#ifdef PDP_VERSION
+    fprintf(stderr, "PDP: version " PDP_VERSION "\n");
+#endif
+
+    for (k = 0; k < nb_entries; k++) setup_table[k]();
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/system/pdp_comm.c b/system/pdp_comm.c
new file mode 100644
index 0000000..80cbaaa
--- /dev/null
+++ b/system/pdp_comm.c
@@ -0,0 +1,119 @@
+/*
+ *   Pure Data Packet system implementation.
+ *   Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+/* this file contains misc communication methods */
+
+
+#include "pdp.h"
+#include "pdp_internals.h"
+#include <stdio.h>
+
+/* all symbols are C style */
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+
+/************** packet management and communication convenience functions ************/
+
+/* send a packet to an outlet */
+void outlet_pdp(t_outlet *out, int packetid)
+{
+    t_atom args[2];
+    t_symbol *sel_pdp = gensym("pdp");
+    t_symbol *sel_register_ro = gensym("register_ro");
+    t_symbol *sel_register_rw = gensym("register_rw");
+    t_symbol *sel_process = gensym("process");
+
+    /* args[1] carries the packet id in both register messages */
+    SETFLOAT(&args[1], (float)packetid);
+
+    /* first message: pdp register_ro <id> */
+    SETSYMBOL(&args[0], sel_register_ro);
+    outlet_anything(out, sel_pdp, 2, args);
+    /* second message: pdp register_rw <id> */
+    SETSYMBOL(&args[0], sel_register_rw);
+    outlet_anything(out, sel_pdp, 2, args);
+
+    /* third message: pdp process (not really necessary, it can be triggered by the rw message) */
+    SETSYMBOL(&args[0], sel_process);
+    outlet_anything(out, sel_pdp, 1, args);
+}
+
+
+/* unregister a packet and send it to an outlet */
+void
+pdp_pass_if_valid(t_outlet *outlet, int *packet)
+{
+    /* -1 means there is no valid packet: nothing is sent */
+    if (*packet == -1) return;
+    pdp_packet_mark_unused(*packet);
+    outlet_pdp(outlet, *packet);
+    *packet = -1;	/* the slot no longer holds a packet */
+}
+
+void
+pdp_replace_if_valid(int *dpacket, int *spacket)
+{
+    /* without a valid source packet (-1) the destination is left alone */
+    if (*spacket == -1) return;
+
+    pdp_packet_mark_unused(*dpacket);	/* give up the packet that is replaced */
+    *dpacket = *spacket;
+    *spacket = -1;			/* the source slot is empty now */
+}
+
+
+int
+pdp_packet_copy_ro_or_drop(int *dpacket, int spacket)
+{
+    /* the destination still holds a packet: report the drop and return 1 */
+    if (*dpacket != -1){
+	pdp_control_notify_drop(spacket);
+	return 1;
+    }
+    /* the destination is free: fill it using pdp_packet_copy_ro, return 0 */
+    *dpacket = pdp_packet_copy_ro(spacket);
+    return 0;
+}
+
+
+int
+pdp_packet_copy_rw_or_drop(int *dpacket, int spacket)
+{
+    /* the destination still holds a packet: report the drop and return 1 */
+    if (*dpacket != -1){
+	pdp_control_notify_drop(spacket);
+	return 1;
+    }
+    /* the destination is free: fill it using pdp_packet_copy_rw, return 0 */
+    *dpacket = pdp_packet_copy_rw(spacket);
+    return 0;
+}
+
+
+
+
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/system/pdp_control.c b/system/pdp_control.c
new file mode 100644
index 0000000..a7ee0c7
--- /dev/null
+++ b/system/pdp_control.c
@@ -0,0 +1,162 @@
+/*
+ *   Pure Data Packet system implementation: control object
+ *   Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+
+/* this is an actual pd class that is used for communication with the
+   pdp framework */
+
+#include "pdp.h"
+#include "pdp_internals.h"
+#include <stdio.h>
+
+/* all symbols are C style */
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+
+
+static long dropped_packets;
+
+static t_class* pdp_control_class;
+
+
+/* pdp control instance data */
+
+struct pdp_control_struct;
+typedef struct pdp_control_struct
+{
+    t_object x_obj;
+    t_outlet *x_outlet0;		/* created in pdp_control_new, used for the pdp_drop messages */
+    struct pdp_control_struct *x_next;	/* next instance in pdp_control_list, 0 at the end */
+
+} t_pdp_control;
+
+typedef void (t_pdp_control_method_notify)(t_pdp_control *x);
+
+
+static t_pdp_control *pdp_control_list;
+
+static void pdp_control_info(t_pdp_control *x)
+{	/* registered for the "info" message in pdp_control_setup; the body is empty */
+}
+
+static void pdp_control_thread(t_pdp_control *x, t_floatarg f)
+{
+    /* the float argument is truncated to an int, nonzero means on */
+    int enable = ((int)f != 0) ? 1 : 0;
+
+    if (enable)
+	post("pdp_control: switching on processing in thread");
+    else
+	post("pdp_control: switching off processing in thread");
+
+    /* x itself is not used, the setting is handed to the pdp queue */
+    pdp_queue_use_thread(enable);
+}
+
+
+static void pdp_control_send_drop_message(t_pdp_control *x)
+{
+    /* outputs: pdp_drop <current value of dropped_packets> */
+    t_symbol *selector = gensym("pdp_drop");
+    t_atom count;
+    SETFLOAT(&count, (float)dropped_packets);
+    outlet_anything(x->x_outlet0, selector, 1, &count);
+}
+
+
+static void pdp_control_free(t_pdp_control *x)
+{
+    /* walk the links of the instance list (starting with the list head)
+       until the one pointing at x is found, then make it skip x */
+    t_pdp_control **link = &pdp_control_list;
+
+    while (*link){
+	if (*link == x){
+	    *link = x->x_next;
+	    return;
+	}
+	link = &(*link)->x_next;
+    }
+
+    /* x was not in the list: nothing to unlink */
+}
+
+
+static void *pdp_control_new(void)
+{
+    t_pdp_control *obj = (t_pdp_control *)pd_new(pdp_control_class);
+    /* the outlet the pdp_drop messages are sent to */
+    obj->x_outlet0 = outlet_new(&obj->x_obj, &s_anything);
+    /* the new instance becomes the head of the instance list */
+    obj->x_next = pdp_control_list;
+    pdp_control_list = obj;
+    return obj;
+}
+
+/************************* class methods ***************************************/
+
+
+void pdp_control_setup(void)
+{
+    /* start with an empty instance list and a zero drop count */
+    pdp_control_list = 0;
+    dropped_packets = 0;
+
+    /* setup pd class data */
+    pdp_control_class = class_new(gensym("pdp_control"), (t_newmethod)pdp_control_new,
+    	(t_method)pdp_control_free, sizeof(t_pdp_control), 0, A_NULL);
+
+    /* message methods: "info" is registered without arguments, "thread" with one A_DEFFLOAT argument */
+    class_addmethod(pdp_control_class, (t_method)pdp_control_info, gensym("info"), A_NULL);   
+    class_addmethod(pdp_control_class, (t_method)pdp_control_thread, gensym("thread"),  A_DEFFLOAT, A_NULL);   
+}
+
+
+
+void pdp_control_notify_broadcast(t_pdp_control_method_notify *notify)
+{
+    t_pdp_control *inst;
+    /* call notify once for every instance in pdp_control_list */
+    for (inst = pdp_control_list; inst; inst = inst->x_next){
+	notify(inst);
+    }
+}
+
+
+
+/************************* notify class methods  *************************/
+
+void pdp_control_notify_drop(int packet)
+{
+    /* the packet argument is not used, drops are only counted */
+    dropped_packets += 1;
+
+    /* every pdp_control instance sends out the new count */
+    pdp_control_notify_broadcast(&pdp_control_send_drop_message);
+}
+
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/system/pdp_imageproc_mmx.c b/system/pdp_imageproc_mmx.c
new file mode 100644
index 0000000..2f32c3f
--- /dev/null
+++ b/system/pdp_imageproc_mmx.c
@@ -0,0 +1,319 @@
+/*
+ *   Pure Data Packet. c wrapper for mmx image processing routines.
+ *   Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+
+/* this is a c wrapper around platform specific (mmx) code */
+#include <stdlib.h>
+#include "pdp_mmx.h"
+#include "pdp_imageproc.h"
+
+// utility stuff
+inline static s16 float2fixed(float f)
+{
+    /* limit f to [-1,1], then scale: 1 becomes 0x7fff */
+    float limited = (f > 1) ? 1 : f;
+    float scaled = ((limited < -1) ? -1 : limited) * 0x7fff;
+    return (s16)scaled;
+}
+
+inline static void setvec(s16 *v, float f)
+{
+    /* fill one vector of 4 s16 with the fixed point version of f */
+    s16 fixed = float2fixed(f);
+    int k;
+
+    for (k = 0; k < 4; k++) v[k] = fixed;
+}
+
+
+
+// add two images
+void pdp_imageproc_add_process(s16 *image, s16 *image2,  u32 width, u32 height)
+{
+    unsigned int nb_vectors = (unsigned int)(width * height) >> 2;	/* pixel count divided by 4 */
+    pixel_add_s16(image, image2, nb_vectors);
+}
+
+// mul two images
+void pdp_imageproc_mul_process(s16 *image, s16 *image2,  u32 width, u32 height)
+{
+    unsigned int nb_vectors = (unsigned int)(width * height) >> 2;	/* pixel count divided by 4 */
+    pixel_mul_s16(image, image2, nb_vectors);
+}
+
+// mix 2 images
+void *pdp_imageproc_mix_new(void){return malloc(2 * 4 * sizeof(s16));}  /* left + right gain vector */
+void pdp_imageproc_mix_delete(void *x) {free(x);}
+void pdp_imageproc_mix_setleftgain(void *x, float gain){s16 *left = (s16 *)x; setvec(left, gain);}
+void pdp_imageproc_mix_setrightgain(void *x, float gain){s16 *right = (s16 *)x + 4; setvec(right, gain);}
+void pdp_imageproc_mix_process(void *x, s16 *image, s16 *image2, u32 width, u32 height)
+{
+    s16 *leftgain = (s16 *)x;		/* s16 0..3 of the buffer */
+    s16 *rightgain = leftgain + 4;	/* s16 4..7 of the buffer */
+    pixel_mix_s16(image, image2, (unsigned int)(width * height) >> 2, leftgain, rightgain);
+}
+
+
+// random mix 2 images
+void *pdp_imageproc_randmix_new(void){return malloc(2 * 4 * sizeof(s16));}  /* threshold + seed vector */
+void pdp_imageproc_randmix_delete(void *x) {free(x);}
+void pdp_imageproc_randmix_setthreshold(void *x, float threshold)
+{
+    s16 *threshold_vec = (s16 *)x;		/* s16 0..3 of the buffer */
+    setvec(threshold_vec, 2*threshold-1);	/* 0 is converted as -1, 1 as 1 */
+}
+void pdp_imageproc_randmix_setseed(void *x, float seed)
+{
+    s16 *seed_vec = (s16 *)x + 4;		/* s16 4..7 of the buffer */
+    int k;
+    srandom((u32)seed);
+    for (k = 0; k < 4; k++) seed_vec[k] = (s16)random();
+}
+void pdp_imageproc_randmix_process(void *x, s16 *image, s16 *image2, u32 width, u32 height)
+{
+    s16 *threshold_vec = (s16 *)x;
+    pixel_randmix_s16(image, image2, (unsigned int)(width * height) >> 2, threshold_vec + 4, threshold_vec);
+}
+
+// affine transformation (applies gain + adds offset)
+void *pdp_imageproc_affine_new(void){return malloc(2 * 4 * sizeof(s16));}  /* gain + offset vector */
+void pdp_imageproc_affine_delete(void *x){free(x);}
+void pdp_imageproc_affine_setgain(void *x, float gain){s16 *gain_vec = (s16 *)x; setvec(gain_vec, gain);}
+void pdp_imageproc_affine_setoffset(void *x, float offset){s16 *offset_vec = (s16 *)x + 4; setvec(offset_vec, offset);}
+void pdp_imageproc_affine_process(void *x, s16 *image, u32 width, u32 height)
+{
+    s16 *gain_vec = (s16 *)x;	/* the offset vector follows at gain_vec[4] */
+    pixel_affine_s16(image, (width * height) >> 2, gain_vec, &gain_vec[4]);
+}
+
+// 3x1 or 1x3 in place convolution
+// orientation
+void *pdp_imageproc_conv_new(void){return malloc(4 * 4 * sizeof(s16));}  /* 3 coefficient vectors + border color vector */
+void pdp_imageproc_conv_delete(void *x){free(x);}
+void pdp_imageproc_conv_setmin1(void *x, float val){s16 *d = (s16 *)x; setvec(&d[0], val);}
+void pdp_imageproc_conv_setzero(void *x, float val){s16 *d = (s16 *)x; setvec(&d[4], val);}
+void pdp_imageproc_conv_setplus1(void *x, float val){s16 *d = (s16 *)x; setvec(&d[8], val);}
+void pdp_imageproc_conv_setbordercolor(void *x, float val){s16 *d = (s16 *)x; setvec(&d[12], val);}
+void pdp_imageproc_conv_process(void *x, s16 *image, u32 width, u32 height, u32 orientation, u32 nbp)
+{
+    s16 *coef = (s16 *)x;	/* the 3 coefficient vectors */
+    s16 *border = coef + 12;	/* the border color vector */
+    u32 offset, pass;
+
+    if (orientation != PDP_IMAGEPROC_CONV_HORIZONTAL){
+	/* not horizontal: every pass visits all groups of 4 columns */
+	for (pass = 0; pass < nbp; pass++){
+	    for (offset = 0; offset < width; offset += 4)
+		pixel_conv_ver_s16(image + offset, height, width, border, coef);
+	}
+	return;
+    }
+
+    /* horizontal: every row gets all of its passes before the next row */
+    for (offset = 0; offset < width*height; offset += width){
+	for (pass = 0; pass < nbp; pass++)
+	    pixel_conv_hor_s16(image + offset, width>>2, border, coef);
+    }
+}
+
+// apply a gain to an image
+void *pdp_imageproc_gain_new(void){return(malloc(8*sizeof(s16)));}  /* gain vector (d[0..3]) + shift count (d[4..7]) */
+void pdp_imageproc_gain_delete(void *x){free(x);}
+void pdp_imageproc_gain_setgain(void *x, float gain)
+{
+    /* convert float to s16 + shift */
+    s16 *d = (s16 *)x;
+    s16 g;
+    int i;
+    float sign;
+    int shift = 0;
+    /* work on the magnitude, the sign is put back after scaling */
+    sign = (gain < 0) ? -1 : 1;
+    gain *= sign;
+    /* double the gain until it reaches 0x4000, counting the doublings in shift */
+    /* NOTE(review): this comment used to say max shift = 16, but the loop body can run 17 times (i = 0..16) - confirm */
+    for(i=0; i<=16; i++){
+	if (gain < 0x4000){
+	    gain *= 2;
+	    shift++;
+	}
+	else break;
+    }
+    /* NOTE(review): a gain of 0x8000 or more is not scaled here and does not fit in the s16 cast below - confirm callers stay below */
+    gain *= sign;
+    g = (s16) gain;
+
+    //g = 0x4000;
+    //shift = 14;
+    /* d[0..3] = scaled gain, d[4..7] = shift followed by zeros (handed over as a u64 pointer in pdp_imageproc_gain_process) */
+    d[0]=g;
+    d[1]=g;
+    d[2]=g;
+    d[3]=g;
+    d[4]=(s16)shift;
+    d[5]=0;
+    d[6]=0;
+    d[7]=0;
+}
+void pdp_imageproc_gain_process(void *x, s16 *image, u32 width, u32 height)
+{
+    s16 *d = (s16 *)x;
+    pixel_gain_s16(image, (width*height)>>2, d, (u64 *)(d+4));
+}
+
+// colour rotation for 2 colour planes
+void *pdp_imageproc_crot2d_new(void){return malloc(4 * 4 * sizeof(s16));}  /* one vector per matrix element */
+void pdp_imageproc_crot2d_delete(void *x){free(x);}
+void pdp_imageproc_crot2d_setmatrix(void *x, float *matrix)
+{
+    s16 *vec = (s16 *)x;
+    int k;
+
+    /* matrix[k] is replicated into the 4 words of vector k */
+    for (k = 0; k < 4; k++) setvec(vec + 4*k, matrix[k]);
+}
+void pdp_imageproc_crot2d_process(void *x, s16 *image, u32 width, u32 height)
+{
+    u32 nb_vectors = (width * height) >> 2;	/* pixel count divided by 4 */
+    pixel_crot2d_s16(image, nb_vectors, (s16 *)x);
+}
+
+// biquad and biquad time
+void *pdp_imageproc_bq_new(void){return malloc((5+2+2)*4*sizeof(s16));}//5xcoef, 2xstate, 2xsavestate
+void pdp_imageproc_bq_delete(void *x){free(x);}
+void pdp_imageproc_bq_setcoef(void *x, float *coef) // a0,-a1,-a2,b0,b1,b2,u0,u1
+{
+    s16 *vec = (s16 *)x;
+    float scale = 1.0f / coef[0];
+    int k;
+
+    /* the coefficient vectors are s1.14 fixed point, representing */
+    /* values -2 < x < 2, while setvec converts to s0.15: halve the */
+    /* normalization factor 1/coef[0] to make up for the difference */
+    scale *= 0.5f;
+
+    /* vectors 0..4: coef[1..5], each multiplied by 0.5/coef[0] */
+    for (k = 0; k < 5; k++){
+	setvec(vec + 4*k, scale*coef[k+1]);
+    }
+
+    /* vectors 5 and 6 are not written here */
+
+    /* vectors 7 and 8: the state to reset to, coef[6] and coef[7] */
+    setvec(vec + 28, coef[6]);
+    setvec(vec + 32, coef[7]);
+
+}
+void pdp_imageproc_bqt_process(void *x, s16 *image, s16 *state0, s16 *state1, u32 width, u32 height)
+{
+    u32 nb_vectors = (width * height) >> 2;	/* pixel count divided by 4 */
+    pixel_biquad_time_s16(image, state0, state1, (s16 *)x, nb_vectors);
+}
+
+/* helper: the vertical part of pdp_imageproc_bq_process
+   d:        coefficient vectors, directly followed by the state vectors
+   down, up: nonzero to run the top to bottom / bottom to top filter
+   every group of 4 columns gets nbp passes; when both filters are
+   requested, each pass runs top to bottom first, then bottom to top */
+static void pdp_imageproc_bq_vertical(s16 *d, s16 *image, u32 width, u32 height,
+				      u32 nbp, u32 down, u32 up)
+{
+    s16 *state = d + (5*4);	/* the state comes after the 5 coefficient vectors */
+    unsigned int col, pass;
+
+    for (col = 0; col < width; col += 4){
+	for (pass = 0; pass < nbp; pass++){
+	    if (down){
+		pixel_biquad_vertb_s16(image+col, height>>2, width, d, state);
+	    }
+	    if (up){
+		pixel_biquad_verbt_s16(image+col, height>>2, width, d, state);
+	    }
+	}
+    }
+}
+
+/* helper: the horizontal part of pdp_imageproc_bq_process
+   d:           coefficient vectors, directly followed by the state vectors
+   right, left: nonzero to run the left to right / right to left filter
+   every group of 4 rows gets nbp passes; when both filters are
+   requested, each pass runs left to right first, then right to left */
+static void pdp_imageproc_bq_horizontal(s16 *d, s16 *image, u32 width, u32 height,
+					u32 nbp, u32 right, u32 left)
+{
+    s16 *state = d + (5*4);	/* the state comes after the 5 coefficient vectors */
+    unsigned int row, pass;
+
+    for (row = 0; row < (width*height); row += (width<<2)){
+	for (pass = 0; pass < nbp; pass++){
+	    if (right){
+		pixel_biquad_horlr_s16(image+row, width>>2, width, d, state);
+	    }
+	    if (left){
+		pixel_biquad_horrl_s16(image+row, width>>2, width, d, state);
+	    }
+	}
+    }
+}
+
+/* biquad filter an image
+   x:         parameter buffer, see pdp_imageproc_bq_new and pdp_imageproc_bq_setcoef
+   direction: combination of the PDP_IMAGEPROC_BIQUAD_ direction flags
+   nbp:       number of passes
+   the vertical directions are handled before the horizontal ones */
+void pdp_imageproc_bq_process(void *x, s16 *image, u32 width, u32 height, u32 direction, u32 nbp)
+{
+    s16 *d = (s16 *)x;
+    u32 down  = direction & PDP_IMAGEPROC_BIQUAD_TOP2BOTTOM;
+    u32 up    = direction & PDP_IMAGEPROC_BIQUAD_BOTTOM2TOP;
+    u32 right = direction & PDP_IMAGEPROC_BIQUAD_LEFT2RIGHT;
+    u32 left  = direction & PDP_IMAGEPROC_BIQUAD_RIGHT2LEFT;
+
+    /* an orientation is skipped completely when none of its two flags is set */
+    /* VERTICAL */
+    if (down || up) pdp_imageproc_bq_vertical(d, image, width, height, nbp, down, up);
+
+    /* HORIZONTAL */
+    if (right || left) pdp_imageproc_bq_horizontal(d, image, width, height, nbp, right, left);
+}
+
+// produce a random image
+// note: random number generator can be platform specific
+// however, it should be seeded. (same seed produces the same result)
+void *pdp_imageproc_random_new(void){return malloc(4 * sizeof(s16));}  /* one vector of 4 seeds */
+void pdp_imageproc_random_delete(void *x){free(x);}
+void pdp_imageproc_random_setseed(void *x, float seed)
+{
+    s16 *seed_vec = (s16 *)x;
+    int k;
+
+    /* seed the libc generator, then draw the 4 seed words from it;
+       each random() result is cut down to an s16 by the cast */
+    srandom((u32)seed);
+    for (k = 0; k < 4; k++){
+	seed_vec[k] = (s16)random();
+    }
+}
+void pdp_imageproc_random_process(void *x, s16 *image, u32 width, u32 height)
+{
+    pixel_rand_s16(image, (unsigned int)(width * height) >> 2, (s16 *)x);
+}
+
+
diff --git a/system/pdp_imageproc_portable.c b/system/pdp_imageproc_portable.c
new file mode 100644
index 0000000..60062d6
--- /dev/null
+++ b/system/pdp_imageproc_portable.c
@@ -0,0 +1,492 @@
+/*
+ *   Pure Data Packet. portable image processing routines.
+ *   Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+
+
+#include <stdlib.h>
+#include "pdp_imageproc.h"
+
+// utility stuff: float -> fixed point (saturate to [-1,1], scale by 0x7fff, truncate)
+inline static s32 float2fixed(float f)
+{
+    float scaled = (f > 1) ? 1 : f;
+    if (scaled < -1) scaled = -1;
+    scaled *= 0x7fff;
+    return (s32)scaled;
+}
+
+
+// saturate x to the symmetric range [-0x7fff, 0x7fff]; note that x is evaluated more than once
+#define CLAMP16(x) (((x) > 0x7fff) ? 0x7fff : (((x) < -0x7fff) ? -0x7fff : (x)))
+
+// add two images
+// in place: image[k] becomes the saturated sum image[k] + image2[k]
+void pdp_imageproc_add_process(s16 *image, s16 *image2,  u32 width, u32 height)
+{
+    unsigned int k;
+
+    for (k = 0; k < width*height; k++){
+	/* sum in int so the s16 operands cannot wrap before clamping */
+	int sum = (int)image[k] + (int)image2[k];
+	image[k] = (s16)(CLAMP16(sum));
+    }
+}
+
+// mul two images
+// in place: image[k] becomes (image[k] * image2[k]) >> 15, saturated at 0x7fff
+void pdp_imageproc_mul_process(s16 *image, s16 *image2,  u32 width, u32 height)
+{
+    unsigned int k;
+    for (k = 0; k < width*height; k++){
+	int prod = ((int)image[k] * (int)image2[k]) >> 15;
+	/* -0x8000 * -0x8000 gives +0x8000, which does not fit in s16 and used to wrap to -0x8000 */
+	if (prod > 0x7fff) prod = 0x7fff;
+	image[k] = (s16)prod;
+    }
+}
+
+// mix 2 images: image <- leftgain * image + rightgain * image2
+void *pdp_imageproc_mix_new(void){return malloc(2*sizeof(s32));} // d[0] = left gain, d[1] = right gain; not initialised here
+void pdp_imageproc_mix_delete(void *x) {free (x);}
+void pdp_imageproc_mix_setleftgain(void *x, float gain)
+{
+    s32 *d = (s32 *)x;
+    d[0] = float2fixed(gain); // float2fixed clamps to [-1,1], so a gain above 1 is not possible
+}
+void pdp_imageproc_mix_setrightgain(void *x, float gain)
+{
+    s32 *d = (s32 *)x;
+    d[1] = float2fixed(gain); // same range restriction as the left gain
+}
+void pdp_imageproc_mix_process(void *x, s16 *image, s16 *image2, u32 width, u32 height)
+{
+    /* gains as stored by the setleftgain / setrightgain setters */
+    const s32 *gains = (s32 *)x;
+    const s32 left = gains[0];
+    const s32 right = gains[1];
+    u32 k;
+
+    for (k = 0; k < width*height; k++){
+	/* weighted sum in 32 bit, scaled back by 15 bits and saturated */
+	s32 mixed = ((s32)image[k] * left + (s32)image2[k] * right) >> 15;
+	image[k] = (s16)CLAMP16(mixed);
+    }
+}
+
+
+// random mix 2 images
+void *pdp_imageproc_randmix_new(void){return malloc(2*sizeof(s32));} // [0] threshold, [1] seed
+void pdp_imageproc_randmix_delete(void *x) {free(x);}
+void pdp_imageproc_randmix_setthreshold(void *x, float threshold)
+{
+    /* restrict to [0,1] before converting to fixed point */
+    float t = threshold;
+    if (t > 1.0f) t = 1.0f;
+    if (t < 0.0f) t = 0.0f;
+    ((s32 *)x)[0] = float2fixed(t);
+}
+void pdp_imageproc_randmix_setseed(void *x, float seed)
+{
+    ((s32 *)x)[1] = float2fixed(seed); /* note: float2fixed clamps the seed to [-1,1] first */
+}
+void pdp_imageproc_randmix_process(void *x, s16 *image, s16 *image2, u32 width, u32 height)
+{
+    s32 *state = (s32 *)x;
+    u32 k;
+
+    /* reseed on every call so the same seed always selects the same pixels */
+    srandom((u32)state[1]);
+
+    for (k = 0; k < width*height; k++){
+	/* draw a value in 0..0x7fff; below the threshold the pixel is taken from image2 */
+	s16 draw = (s16)(random() & 0x7fff);
+	if (draw < state[0]) image[k] = image2[k];
+    }
+}
+
+// affine transformation (applies gain + adds offset)
+void *pdp_imageproc_affine_new(void){return malloc(2*sizeof(s32));} // d[0] = gain, d[1] = offset; not initialised here
+void pdp_imageproc_affine_delete(void *x){free(x);}
+void pdp_imageproc_affine_setgain(void *x, float gain)
+{
+    s32 *d = (s32 *)x;
+    d[0] = float2fixed(gain); // clamped to [-1,1] by float2fixed
+}
+// the offset goes through the same [-1,1] -> +-0x7fff conversion as the gain
+void pdp_imageproc_affine_setoffset(void *x, float offset)
+{
+    s32 *d = (s32 *)x;
+    d[1] = float2fixed(offset);
+}
+/* apply gain and offset in place to all width*height samples; */
+/* x holds the two fixed point values written by the setgain / setoffset setters */
+void pdp_imageproc_affine_process(void *x, s16 *image, u32 width, u32 height)
+{
+    s32 *coef = (s32 *)x;   /* coef[0]: gain, coef[1]: offset */
+    u32 k;
+
+    for (k = 0; k < width*height; k++){
+	/* scale first (15 bit shift), then add the offset and saturate */
+	s32 scaled = ((s32)image[k] * coef[0]) >> 15;
+	image[k] = (s16)CLAMP16(scaled + coef[1]);
+    }
+}
+
+// 3x1 or 1x3 in place convolution
+// state: d[0], d[1], d[2] weight the previous, current and next sample; d[3] is the border value
+void *pdp_imageproc_conv_new(void){return(malloc(4*sizeof(s32)));}
+void pdp_imageproc_conv_delete(void *x){free(x);}
+void pdp_imageproc_conv_setmin1(void *x, float val)
+{
+    s32 *d = (s32 *)x;
+    d[0] = float2fixed(val); // weight of the previous sample
+}
+void pdp_imageproc_conv_setzero(void *x, float val)
+{
+    s32 *d = (s32 *)x;
+    d[1] = float2fixed(val); // weight of the current sample
+}
+void pdp_imageproc_conv_setplus1(void *x, float val)
+{
+    s32 *d = (s32 *)x;
+    d[2] = float2fixed(val); // weight of the next sample
+}
+void pdp_imageproc_conv_setbordercolor(void *x, float val)
+{
+    s32 *d = (s32 *)x;
+    d[3] = float2fixed(val); // value used in place of the neighbour missing at either end of a line
+}
+
+static inline void pdp_imageproc_conv_scanline(void *x, s16 *data, u32 count, s32 stride)
+{
+    /* in place 3 tap filter over count samples spaced stride apart; d[3] replaces the missing neighbour at both ends */
+    s32 *d = (s32 *)x;
+    long long a,b,c,r; /* 64 bit: the sum of three coefficient*sample products can exceed the s32 range */
+    u32 i;
+
+    /* count is unsigned: count-2 used to wrap for lines shorter than 2 samples and run off the buffer */
+    if (count == 0) return;
+    a = d[3]; //border
+    b = data[0];
+    c = (count > 1) ? data[stride] : d[3];
+    for(i = 0; i+2 < count; i++){
+	r = a*d[0] + b*d[1] + c*d[2];
+	a = data[0];
+	b = data[stride];
+	c = data[stride<<1];
+	data[0] = (s16)CLAMP16(r>>15);
+	data += stride;
+    }
+    r = a*d[0] + b*d[1] + c*d[2];
+    a = data[0]; /* keep the unfiltered value: it is the left neighbour of the last sample */
+    data[0] = (s16)CLAMP16(r>>15);
+    if (count == 1) return; /* single sample: both neighbours were the border */
+    r = a*d[0] + (long long)data[stride]*d[1] + (long long)d[3]*d[2]; /* last sample: right neighbour is the border */
+    data[stride] = (s16)CLAMP16(r>>15);
+}
+
+/* run nbp passes of the 3 tap filter over every row (horizontal) or every column (vertical) */
+void pdp_imageproc_conv_process(void *x, s16 *image, u32 width, u32 height, u32 orientation, u32 nbp)
+{
+    u32 start, pass;
+
+    if (orientation == PDP_IMAGEPROC_CONV_HORIZONTAL){
+	/* rows: one line starts every width samples, neighbours are adjacent */
+	for (start = 0; start < width*height; start += width){
+	    for (pass = 0; pass < nbp; pass++){
+		pdp_imageproc_conv_scanline(x, image+start, width, 1);
+	    }
+	}
+    }
+
+    if (orientation == PDP_IMAGEPROC_CONV_VERTICAL){
+	/* columns: one line starts at every x, neighbours are width samples apart */
+	for (start = 0; start < width; start++){
+	    for (pass = 0; pass < nbp; pass++){
+		pdp_imageproc_conv_scanline(x, image+start, height, width);
+	    }
+	}
+    }
+}
+
+// apply a gain to an image
+void *pdp_imageproc_gain_new(void){return(malloc(2*sizeof(s32)));} // d[0] = mantissa, d[1] = right shift
+void pdp_imageproc_gain_delete(void *x){free(x);}
+void pdp_imageproc_gain_setgain(void *x, float gain)
+{
+    /* convert the float gain to an integer mantissa g and a right shift, so that gain ~= g / 2^shift */
+    s32 *d = (s32 *)x;
+    s32 g;
+    int i;
+    float sign;
+    s32 shift = 0;
+    /* normalise the magnitude only: the '< 0x4000' test below would always hold for a negative gain */
+    sign = (gain < 0) ? -1 : 1;
+    gain *= sign;
+
+    /* double the gain until it reaches 0x4000; the loop runs for i = 0..16, so shift can reach 17 */
+    for(i=0; i<=16; i++){
+	if (gain < 0x4000){
+	    gain *= 2;
+	    shift++;
+	}
+	else break;
+    }
+
+    gain *= sign;
+    g = (s32) gain;
+
+    //g = 0x4000;
+    //shift = 14;
+
+    d[0]=g;
+    d[1]=shift;
+}
+void pdp_imageproc_gain_process(void *x, s16 *image, u32 width, u32 height)
+{
+    /* mantissa and right shift as computed by pdp_imageproc_gain_setgain */
+    s32 *state = (s32 *)x;
+    u32 k;
+    for (k = 0; k < width*height; k++){
+	s32 scaled = ((s32)image[k] * state[0]) >> state[1];
+	image[k] = (s16)(CLAMP16(scaled));
+    }
+}
+
+// colour rotation for 2 colour planes
+void *pdp_imageproc_crot2d_new(void){return malloc(4*sizeof(s32));} // the four matrix entries, fixed point
+void pdp_imageproc_crot2d_delete(void *x){free(x);}
+/* store the four entries of matrix; each one is clamped to [-1,1] by float2fixed */
+void pdp_imageproc_crot2d_setmatrix(void *x, float *matrix)
+{
+    s32 *fixed = (s32 *)x;
+    int k;
+
+    /* same order as the input: fixed[k] <- matrix[k] */
+    for (k = 0; k < 4; k++) fixed[k] = float2fixed(matrix[k]);
+}
+/* rotate the two consecutive width*height planes found at image: */
+/* (c1,c2) -> (m0*c1 + m2*c2, m1*c1 + m3*c2), scaled back by 15 bits and saturated */
+void pdp_imageproc_crot2d_process(void *x, s16 *image, u32 width, u32 height)
+{
+    s32 *m = (s32 *)x;
+    s16 *plane1 = image;
+    s16 *plane2 = image + width*height;
+    u32 k;
+
+    for (k = 0; k < width*height; k++){
+	s32 c1 = (s32)plane1[k];
+	s32 c2 = (s32)plane2[k];
+
+	/* matrix product, scaled back by 15 bits */
+	s32 r1 = (m[0] * c1 + m[2] * c2) >> 15;
+	s32 r2 = (m[1] * c1 + m[3] * c2) >> 15;
+
+	plane1[k] = (s16)CLAMP16(r1);
+	plane2[k] = (s16)CLAMP16(r2);
+    }
+}
+
+
+// biquad and biquad time
+void *pdp_imageproc_bq_new(void){return calloc(5+2+2, sizeof(s32));}//5xcoef, 2xstate, 2xsavestate; zeroed because the scanline filter reads slots 7 and 8 before anything writes them
+void pdp_imageproc_bq_delete(void *x){free(x);}
+void pdp_imageproc_bq_setcoef(void *x, float *coef) // a0,-a1,-a2,b0,b1,b2,u0,u1
+{
+    s32 *d = (s32 *)x;
+    float ia0 = 1.0f / coef[0];
+    /* NOTE(review): coef[0] (a0) is not checked against zero before the division */
+    /* all coefs are s1.14 fixed point */
+    /* representing values -2 < x < 2  */
+    /* so scale down before using the ordinary s0.15 float->fixed routine */
+
+    ia0 *= 0.5f;
+
+    // coef, each normalised by a0 and halved; stored in the order the A1..B2 macros index them
+    d[0] = float2fixed(ia0*coef[1]); // -a1
+    d[1] = float2fixed(ia0*coef[2]); // -a2
+    d[2] = float2fixed(ia0*coef[3]); // b0
+    d[3] = float2fixed(ia0*coef[4]); // b1
+    d[4] = float2fixed(ia0*coef[5]); // b2
+
+
+    // state to reset to (u0, u1); not scaled by a0
+    d[5] = float2fixed(coef[6]);
+    d[6] = float2fixed(coef[7]);
+
+}
+
+#define A1 d[0]
+#define A2 d[1]
+#define B0 d[2]
+#define B1 d[3]
+#define B2 d[4]
+/*
+ 	# DIRECT FORM II BIQUAD (from pixel_biquad_s16.s)
+ 	#
+ 	# y[k]  = b0 * x[k] + u1[k-1]
+ 	# u1[k] = b1 * x[k] + u2[k-1] - a1 * y[k]
+	# u2[k] = b2 * x[k]           - a2 * y[k]
+*/
+
+/* remark: A1 and A2 are already negated */
+
+
+static inline void pdp_imageproc_bq_scanline(void *x, s16 *data, u32 count, s32 stride)
+{
+    /* filter count samples in place, stepping stride samples (which may be negative) between them */
+    s32 *d = (s32 *)x;
+    s32 u1,u2, xx, yy;
+
+    u32 i;
+    /* resume from the state the previous call left in slots 7 and 8 */
+    u1 = d[7];
+    u2 = d[8];
+
+    for(i = 0; i < count; i++){
+
+	xx = (s32)data[0];
+	/* coefficients are s1.14, hence the shift by 14; yy must be computed before u1 and u2 */
+	yy = ((B0 * xx)>>14) + u1;
+	u1 = ((B1 * xx)>>14) + u2 + ((A1 * yy)>>14);
+	u2 = ((B2 * xx)>>14)      + ((A2 * yy)>>14);
+
+	data[0] = (s16)CLAMP16(yy);
+
+	data += stride;
+
+    }
+    /* NOTE(review): d[5] and d[6] (the reset state written by setcoef) are not read in this function */
+    d[7] = u1;
+    d[8] = u2;
+
+}
+
+/* biquad over time: every pixel is filtered with its own state, kept in the state1/state2 images */
+void pdp_imageproc_bqt_process(void *x, s16 *image, s16 *state1, s16 *state2, u32 width, u32 height)
+{
+    s32 *d = (s32 *)x;   /* must be called d: the A1..B2 macros expand to d[n] */
+    u32 k;
+
+    for (k = 0; k < width*height; k++){
+	s32 in = (s32)image[k];
+	s32 s1 = (s32)state1[k];
+	s32 s2 = (s32)state2[k];
+	s32 out;
+
+	/* the order of the three updates matters: out first, then s1 (uses old s2), then s2 */
+	out = ((B0 * in)>>14) + s1;
+	s1  = ((B1 * in)>>14) + s2 + ((A1 * out)>>14);
+	s2  = ((B2 * in)>>14)      + ((A2 * out)>>14);
+
+	/* output and both state words are saturated back to 16 bit */
+	image[k] = (s16)CLAMP16(out);
+	state1[k] = (s16)CLAMP16(s1);
+	state2[k] = (s16)CLAMP16(s2);
+    }
+}
+
+/*
+ * run the biquad over one image plane, in place.
+ *
+ *   x         filter state as allocated by pdp_imageproc_bq_new()
+ *   data      first sample of the plane; rows follow each other, width samples apart
+ *   width     samples per row
+ *   height    number of rows
+ *   direction bit mask of PDP_IMAGEPROC_BIQUAD_* flags selecting the sweeps
+ *   nbp       number of passes applied to every line
+ *
+ * all vertical sweeps are done before any horizontal sweep. when both flags
+ * of an axis are set, every pass sweeps a line forward and then backward.
+ *
+ * NOTE(review): the unsigned width is negated and then passed as the signed
+ * stride; this relies on the conversion producing -width - confirm.
+ * NOTE(review): the filter state inside x is carried over from one scanline
+ * call to the next and is not reset here - confirm this is intended.
+ */
+void pdp_imageproc_bq_process(void *x, s16 *data, u32 width, u32 height, u32 direction, u32 nbp)
+{
+    unsigned int line, pass, last;
+    unsigned int forward, backward;
+
+    /* VERTICAL */
+
+    /* index of the first sample of the bottom row: where a B->T sweep starts */
+    last = (height-1)*width;
+    forward  = direction & PDP_IMAGEPROC_BIQUAD_TOP2BOTTOM;
+    backward = direction & PDP_IMAGEPROC_BIQUAD_BOTTOM2TOP;
+
+    /* the loops are skipped entirely when neither flag of the axis is set */
+    if (forward || backward){
+	/* one column at a time, all passes on it before moving on */
+	for (line = 0; line < width; line++){
+	    for (pass = 0; pass < nbp; pass++){
+		if (forward){
+		    pdp_imageproc_bq_scanline(x, data+line, height, width); //T->B
+		}
+		if (backward){
+		    pdp_imageproc_bq_scanline(x, data+last+line, height, -width); //B->T
+		}
+	    }
+	}
+    }
+
+    /* HORIZONTAL */
+
+    /* offset of the last sample within a row: where a R->L sweep starts */
+    last = width-1;
+    forward  = direction & PDP_IMAGEPROC_BIQUAD_LEFT2RIGHT;
+    backward = direction & PDP_IMAGEPROC_BIQUAD_RIGHT2LEFT;
+
+    /* the loops are skipped entirely when neither flag of the axis is set */
+    if (forward || backward){
+	/* one row at a time, all passes on it before moving on */
+	for (line = 0; line < (width*height); line += width){
+	    for (pass = 0; pass < nbp; pass++){
+		if (forward){
+		    pdp_imageproc_bq_scanline(x, data+line, width, 1); //L->R
+		}
+		if (backward){
+		    pdp_imageproc_bq_scanline(x, data+last+line, width, -1); //R->L
+		}
+	    }
+	}
+    }
+}
+
+// produce a random image
+// note: the random number generator can be platform specific;
+// however, it should be seeded (the same seed produces the same result).
+void *pdp_imageproc_random_new(void){return malloc(sizeof(s32));} // state: one s32 holding the seed
+void pdp_imageproc_random_delete(void *x){free(x);}
+void pdp_imageproc_random_setseed(void *x, float seed)
+{
+    s32 *d = (s32 *)x;
+    d[0] = float2fixed(seed); // float2fixed clamps to [-1,1]: every seed outside that range ends up as +-0x7fff
+}
+void pdp_imageproc_random_process(void *x, s16 *image, u32 width, u32 height)
+{
+    s32 *state = (s32 *)x;
+    s16 *end = image + width*height;
+    /* reseed first so equal seeds give equal images, then keep the low 16 bits of every draw */
+    srandom((u32)state[0]);
+    while (image != end) *image++ = (s16)(random() & 0xffff);
+}
+
diff --git a/system/pdp_llconv.c b/system/pdp_llconv.c
new file mode 100644
index 0000000..93d2934
--- /dev/null
+++ b/system/pdp_llconv.c
@@ -0,0 +1,293 @@
+/*
+ *   Pure Data Packet system implementation. : low level format conversion code
+ *   Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+/* this file contains low level image conversion code 
+   nominated as "the ugliest part of pdp"
+   some code is mmx, most is not. */
+
+#include "pdp_llconv.h"
+#include "pdp_mmx.h"
+
+
+/* all symbols are C style */
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/* helper macros; the argument is parenthesised at every use so that any expression can be passed */
+#define CLAMP(x) (((x)<0) ? 0 : (((x)>255)? 255 : (x)))   /* saturate to 0..255 */
+#define CLAMP16(x) (((x)<-0x7fff) ? -0x7fff : (((x)>0x7fff) ? 0x7fff : (x)))   /* saturate to +-0x7fff */
+#define FP(x) ((int)(((float)(x)) * 256.0f))   /* constant scaled by 256 (8 fraction bits) */
+
+
+/* some prototypes for functions defined elsewhere */
+void llconv_yvu_planar_s16u8(short int *src, unsigned char *dst, unsigned int nbpixels);
+void llconv_yuv_planar_u8s16(unsigned char* source, short int *dest, int nbpixels);
+void llconv_grey_s16u8(short int *src, unsigned char *dst, unsigned int nbpixels);
+void llconv_yvu_planar_u8s16(unsigned char* source, short int *dest, int nbpixels);
+
+/* rgb -> y, v, u with 8 fraction bits (FP scales every constant by 256); the caller below shifts the result right by 8 */
+static inline int rgb2y(int r, int g, int b){return (FP(0.257) * r) + (FP(0.504) * g) + (FP(0.098) * b) + FP(16);}
+static inline int rgb2v(int r, int g, int b){return (FP(0.439) * r) - (FP(0.368) * g) - (FP(0.071) * b) + FP(128);}
+static inline int rgb2u(int r, int g, int b){return -(FP(0.148) * r) - (FP(0.291) * g) + (FP(0.439) * b) + FP(128);}
+
+
+/* "standard" 8 bit conversion routine */
+/* packed r,g,b triplets in, packed y,v,u triplets out, one triplet per pixel */
+static void llconv_rgb2yvu(unsigned char* src, unsigned char* dst, int nbpixels)
+{
+    unsigned char *in = src, *out = dst;
+    int n;
+
+    for (n = nbpixels; n > 0; n--){
+	int y = rgb2y(in[0], in[1], in[2]);
+	int v = rgb2v(in[0], in[1], in[2]);
+	int u = rgb2u(in[0], in[1], in[2]);
+
+	/* the helpers return 8 fraction bits: drop them, then saturate to a byte */
+	out[0] = CLAMP(y>>8);
+	out[1] = CLAMP(v>>8);
+	out[2] = CLAMP(u>>8);
+
+	in += 3;
+	out += 3;
+    }
+}
+
+
+
+/* 8 bit rgb to 16 bit planar subsampled yvu */
+static void llconv_rgb2yvu_planar16sub(unsigned char* src, short int* dst, int w, int h)
+{
+    int r,g,b,y,v,u,i,j,k;
+    int size = w*h;
+    /* dst layout: w*h luma samples, then size/4 v samples, then size/4 u samples */
+    int voffset = size;
+    int uoffset = size + (size>>2);
+    /* NOTE(review): every step reads a complete 2x2 pixel block, so w and h are presumably even */
+    /* byte distance between two source rows (3 bytes per pixel) */
+    int loffset = w * 3;
+
+    k=0;
+    for (j=0; j<w*h; j+=(w<<1)){
+	k = 3 * j;
+	for (i=0; i<w; i+=2){
+	    /* chroma is summed over the 4 pixels of the block and divided by 4 at the end */
+	    /* upper left pixel: starts the chroma sums */
+	    // well, this seems to work... strange though
+	    b = src[k];
+	    g = src[k+1];
+	    r = src[k+2];
+	    
+	    y =  (FP(0.257) * r) + (FP(0.504) * g) + (FP(0.098) * b) + FP(16);
+	    v =  (FP(0.439) * r) - (FP(0.368) * g) - (FP(0.071) * b);
+	    u = -(FP(0.148) * r) - (FP(0.291) * g) + (FP(0.439) * b);
+
+	    dst[i+j] = CLAMP16(y >> 1);
+	    /* upper right pixel */
+	    b = src[k+3];
+	    g = src[k+4];
+	    r = src[k+5];
+	    
+	    y =  (FP(0.257) * r) + (FP(0.504) * g) + (FP(0.098) * b) + FP(16);
+	    v +=  (FP(0.439) * r) - (FP(0.368) * g) - (FP(0.071) * b);
+	    u += -(FP(0.148) * r) - (FP(0.291) * g) + (FP(0.439) * b);
+
+	    dst[i+j+1] = CLAMP16(y >> 1);
+
+	    /* lower left pixel (one source row down) */
+	    /* fix: '+=' instead of '=', which used to throw away the chroma of the upper row */
+	    b = src[loffset + k];
+	    g = src[loffset + k+1];
+	    r = src[loffset + k+2];
+	    
+	    y =  (FP(0.257) * r) + (FP(0.504) * g) + (FP(0.098) * b) + FP(16);
+	    v +=  (FP(0.439) * r) - (FP(0.368) * g) - (FP(0.071) * b);
+	    u += -(FP(0.148) * r) - (FP(0.291) * g) + (FP(0.439) * b);
+
+	    dst[w+i+j] = CLAMP16(y >> 1);
+	    /* lower right pixel */
+	    b = src[loffset + k+3];
+	    g = src[loffset + k+4];
+	    r = src[loffset + k+5];
+	    
+	    k += 6;
+
+	    y =  (FP(0.257) * r) + (FP(0.504) * g) + (FP(0.098) * b) + FP(16);
+	    v +=  (FP(0.439) * r) - (FP(0.368) * g) - (FP(0.071) * b);
+	    u += -(FP(0.148) * r) - (FP(0.291) * g) + (FP(0.439) * b);
+
+	    dst[w+i+j+1] = CLAMP16(y >> 1);
+	    /* mean of the four chroma values: sum >> 2 (the old two pixel sum used >> 1, so the level is unchanged) */
+	    dst[uoffset+ (i>>1) + (j>>2)] = (CLAMP16(u >> 2));
+	    dst[voffset+ (i>>1) + (j>>2)] = (CLAMP16(v >> 2));
+	}
+    }
+}
+
+/* these seem to be pretty slow */
+/* packed y,v,u triplets in, packed r,g,b triplets out, one triplet per pixel */
+static void llconv_yvu2rgb(unsigned char* src, unsigned char* dst, int nbpixels)
+{
+    int n;
+    for (n = 0; n < nbpixels; n++){
+	const unsigned char *in = src + 3*n;
+	unsigned char *out = dst + 3*n;
+	/* centre the components; the luma term is shared by all three outputs */
+	int luma = FP(1.164) * (in[0] - 16);
+	int cv = in[1] - 128;
+	int cu = in[2] - 128;
+
+	/* results carry 8 fraction bits */
+	int r = luma + FP(1.596) * cv;
+	int g = luma - FP(0.813) * cv - FP(0.391) * cu;
+	int b = luma                  + FP(2.018) * cu;
+
+	out[0] = CLAMP(r>>8);
+	out[1] = CLAMP(g>>8);
+	out[2] = CLAMP(b>>8);
+    }
+}
+
+
+
+/* convert yvu to yuyv */
+/* two packed y,v,u pixels (6 bytes) give one y1,u,y2,v group (4 bytes) */
+static void llconv_yvu2yuyv(unsigned char *src, unsigned char *dst, unsigned int nbpixels)
+{
+    unsigned int pairs = nbpixels/2;   /* an odd last pixel is not converted */
+    unsigned int n;
+    for (n = 0; n < pairs; n++){
+	const unsigned char *in = src + 6*n;
+	unsigned char *out = dst + 4*n;
+
+	/* read everything first; chroma is the mean of both pixels */
+	unsigned int y1 = in[0];
+	unsigned int y2 = in[3];
+	unsigned int v = (in[1] + in[4]) >> 1;
+	unsigned int u = (in[2] + in[5]) >> 1;
+
+	out[0] = y1;
+	out[1] = u;
+	out[2] = y2;
+	out[3] = v;
+    }
+}
+
+
+
+/* convert yuyv packed 8 bit unsigned to yv12 planar 16bit signed */
+static void llconv_yuyv_packed_u8s16(unsigned char* ucsource, short int *sidest, unsigned int w, unsigned int h)
+{
+    unsigned int i, j;
+    unsigned int *source = (unsigned int *)ucsource;
+    /* both buffers are accessed one unsigned int at a time; the masks below assume 32 bit words - NOTE(review): alignment and byte order are not checked */
+    unsigned int *dest = (unsigned int *)sidest;
+    unsigned int uoffset = (w*h)>>1;
+    unsigned int voffset = (w*h + ((w*h) >> 2)) >> 1;
+    /* j advances by w words per step, i.e. two rows of w/2 words each; i advances two words at a time */
+    for(j=0; j < (h*w)>>1; j +=(w)){
+	for(i=0; i< (w>>1); i+=2){
+	    unsigned int y,u,v;
+	    unsigned int v00, v01, v10, v11;
+	    v00 = source[i+j];
+	    v01 = source[i+j+1];
+	    v10 = source[i+j+(w>>1)];
+	    v11 = source[i+j+(w>>1)+1];
+	    /* v00/v01: two words of the upper row, v10/v11: the words below them (y is declared but never used) */
+	    // save luma
+	    dest[i+j]          = ((v00 & 0x00ff00ff) << 7);
+	    dest[i+j+1]        = ((v01 & 0x00ff00ff) << 7);
+	    dest[i+j+(w>>1)]   = ((v10 & 0x00ff00ff) << 7);
+	    dest[i+j+(w>>1)+1] = ((v11 & 0x00ff00ff) << 7);
+
+	    // compute chroma
+
+	    // mask out luma & shift right
+	    v00 = (v00 & 0xff00ff00)>>1;
+	    v01 = (v01 & 0xff00ff00)>>1;
+	    v10 = (v10 & 0xff00ff00)>>1;
+	    v11 = (v11 & 0xff00ff00)>>1;
+	    /* each 16 bit half now holds (byte << 7), so the sum of two halves cannot carry into the other half */
+	    // average 2 scan lines
+	    v00 += v10;
+	    v01 += v11;
+
+	    // combine: v collects the low halves of both words, u the high halves
+	    v = (v01 << 16) | (v00 & 0x0000ffff);
+	    u = (v01 & 0xffff0000) | (v00 >> 16);
+
+	    // flip sign bits for u,v
+	    u ^= 0x80008000;
+	    v ^= 0x80008000;
+
+	    // save chroma
+	    dest[uoffset + (i>>1) + (j>>2)] = u;
+	    dest[voffset + (i>>1) + (j>>2)] = v;
+	}
+    }
+
+
+}
+
+#define CONVERT(x,y) ((x) + ((y)<<16))
+
+/* dispatch on the (source type, destination type) pair; an unsupported pair only prints a warning */
+void pdp_llconv(void *src, int stype, void *dst, int dtype, int w, int h)
+{
+    const int conversion = CONVERT(stype, dtype);
+    unsigned char *src8 = (unsigned char *)src, *dst8 = (unsigned char *)dst;
+    short int *src16 = (short int *)src, *dst16 = (short int *)dst;
+
+    switch(conversion){
+
+    case CONVERT( RIF_YVU__P411_U8, RIF_YVU__P411_S16 ):
+	llconv_yvu_planar_u8s16(src8, dst16, w*h);
+	break;
+
+    case CONVERT( RIF_YUV__P411_U8, RIF_YVU__P411_S16 ):
+	llconv_yuv_planar_u8s16(src8, dst16, w*h);
+	break;
+
+    case CONVERT( RIF_YUYV_P____U8, RIF_YVU__P411_S16 ):
+	llconv_yuyv_packed_u8s16(src8, dst16, w, h);
+	break;
+
+    case CONVERT( RIF_RGB__P____U8, RIF_YVU__P411_S16 ):
+	llconv_rgb2yvu_planar16sub(src8, dst16, w, h);
+	break;
+
+    case CONVERT( RIF_YVU__P411_S16, RIF_YVU__P411_U8 ):
+	llconv_yvu_planar_s16u8(src16, dst8, w*h);
+	break;
+
+    case CONVERT( RIF_GREY______S16, RIF_GREY______U8 ):
+	llconv_grey_s16u8(src16, dst8, w*h);
+	break;
+    default:
+	post("pdp_llconv: WARNING: no conversion routine defined for (%d)->(%d)", stype, dtype);
+    }
+}
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/system/pdp_llconv_mmx.c b/system/pdp_llconv_mmx.c
new file mode 100644
index 0000000..8070bac
--- /dev/null
+++ b/system/pdp_llconv_mmx.c
@@ -0,0 +1,55 @@
+
+/*
+ *   Pure Data Packet system implementation. : wrapper for mmx low level format conversion code
+ *   Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+
+#include "pdp_mmx.h"
+
+
+
+/* convert greyscale 16 bit signed to 8 bit unsigned */
+void llconv_grey_s16u8(short int *src, unsigned char *dst, unsigned int nbpixels)
+{
+    pixel_pack_s16u8_y(src, dst, nbpixels>>3); // count is nbpixels/8; presumably the mmx routine packs 8 pixels per step - TODO confirm
+}
+
+/* convert yvu planar 411 16 bit signed to 8 bit unsigned */
+void llconv_yvu_planar_s16u8(short int *src, unsigned char *dst, unsigned int nbpixels)
+{
+    pixel_pack_s16u8_y(src, dst, nbpixels>>3); // luma plane: the first nbpixels samples
+    pixel_pack_s16u8_uv(src + nbpixels, dst + nbpixels, nbpixels>>4); // chroma follows the luma plane; half the luma count is passed
+}
+
+
+/* convert yvu planar 411 8 bit unsigned to yv12 planar 16bit signed */
+void llconv_yvu_planar_u8s16(unsigned char* source, short int *dest, int nbpixels)
+{
+    pixel_unpack_u8s16_y(source, dest, nbpixels>>3); // luma plane: the first nbpixels samples
+    pixel_unpack_u8s16_uv(&source[nbpixels], &dest[nbpixels], nbpixels>>4); // chroma follows the luma plane; plane order is kept
+}
+
+/* convert yuv planar 411 8 bit unsigned to yv12 planar 16bit signed */
+void llconv_yuv_planar_u8s16(unsigned char* source, short int *dest, int nbpixels)
+{
+    pixel_unpack_u8s16_y(source, dest, nbpixels>>3);
+    pixel_unpack_u8s16_uv(&source[nbpixels], &dest[nbpixels + (nbpixels>>2)], nbpixels>>5); // first source chroma plane goes to the second destination plane
+    pixel_unpack_u8s16_uv(&source[nbpixels + (nbpixels>>2)], &dest[nbpixels], nbpixels>>5); // second source chroma plane goes to the first: the planes swap places
+}
+
diff --git a/system/pdp_llconv_portable.c b/system/pdp_llconv_portable.c
new file mode 100644
index 0000000..de65ef5
--- /dev/null
+++ b/system/pdp_llconv_portable.c
@@ -0,0 +1,81 @@
+
+/*
+ *   Pure Data Packet system implementation. : portable low level format conversion code
+ *   Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+/* helper macros; the argument is parenthesised at every use so that any expression can be passed */
+#define CLAMP(x) (((x)<0) ? 0 : (((x)>255)? 255 : (x)))
+#define FP(x) ((int)(((float)(x)) * 256.0f))
+
+void pixel_unpack_portable_u8s16_y(unsigned char *src ,short int *dst, unsigned int nbpixels)
+{
+    /* the byte moves up 7 bits: 0..255 becomes 0..0x7f80, so the sign bit stays clear */
+    unsigned int n = nbpixels;
+    while (n--) *dst++ = ((short int)(*src++)) << 7;
+}
+
+void pixel_unpack_portable_u8s16_uv(unsigned char *src ,short int *dst, unsigned int nbpixels)
+{
+    /* move the byte to the high half and flip the top bit: 128 maps to 0 */
+    unsigned int n;
+    for (n = 0; n != nbpixels; n++) dst[n] = (((short int)(src[n])) << 8) ^ 0x8000;
+}
+
+
+void pixel_pack_portable_s16u8_y(short int *src, unsigned char *dst, unsigned int nbpixels)
+{
+    /* drop the low 7 bits; negative input saturates to 0 */
+    unsigned int n;
+    for (n = 0; n != nbpixels; n++) dst[n] = (unsigned char)(CLAMP(src[n]>>7));
+}
+
+void pixel_pack_portable_s16u8_uv(short int *src, unsigned char *dst, unsigned int nbpixels)
+{
+    /* flip the sign bit to get an offset value, then keep its high byte: 0 maps to 128 */
+    unsigned int n = nbpixels;
+    while (n--) { *dst++ = (unsigned char)((*src^0x8000)>>8); src++; }
+}
+
+
+/* convert greyscale 16 bit signed to 8 bit unsigned */
+void llconv_grey_s16u8(short int *src, unsigned char *dst, unsigned int nbpixels)
+{
+    pixel_pack_portable_s16u8_y(src, dst, nbpixels); // one output byte per input sample
+}
+
+/* convert yvu planar 411 16 bit signed to 8 bit unsigned */
+void llconv_yvu_planar_s16u8(short int *src, unsigned char *dst, unsigned int nbpixels)
+{
+    pixel_pack_portable_s16u8_y(src, dst, nbpixels); // luma plane: the first nbpixels samples
+    pixel_pack_portable_s16u8_uv(src + nbpixels, dst + nbpixels, nbpixels>>1); // both chroma planes in one go: nbpixels/2 samples
+
+}
+
+
+/* convert yvu planar 411 8 bit unsigned to yv12 planar 16bit signed */
+void llconv_yvu_planar_u8s16(unsigned char* source, short int *dest, int nbpixels)
+{
+    pixel_unpack_portable_u8s16_y(source, dest, nbpixels); // luma plane: the first nbpixels samples
+    pixel_unpack_portable_u8s16_uv(&source[nbpixels], &dest[nbpixels], nbpixels>>1); // both chroma planes in one go, plane order kept
+}
+
+/* convert yuv planar 411 8 bit unsigned to yv12 planar 16bit signed */
+void llconv_yuv_planar_u8s16(unsigned char* source, short int *dest, int nbpixels)
+{
+    pixel_unpack_portable_u8s16_y(source, dest, nbpixels);
+    pixel_unpack_portable_u8s16_uv(&source[nbpixels], &dest[nbpixels + (nbpixels>>2)], nbpixels>>2); // first source chroma plane goes to the second destination plane
+    pixel_unpack_portable_u8s16_uv(&source[nbpixels + (nbpixels>>2)], &dest[nbpixels], nbpixels>>2); // second source chroma plane goes to the first: the planes swap places
+}
+
+
diff --git a/system/pdp_packet.c b/system/pdp_packet.c
new file mode 100644
index 0000000..0c0b2c2
--- /dev/null
+++ b/system/pdp_packet.c
@@ -0,0 +1,239 @@
+/*
+ *   Pure Data Packet system implementation: 
+ *   code for allocation/deallocation/copying/...
+ *   Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+#include "pdp.h"
+#include <stdio.h>
+
+/* all symbols are C style */
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/* this needs to be able to grow dynamically, think about it later */
+#define PDP_OBJECT_ARRAY_SIZE 1024
+static t_pdp* pdp_stack[PDP_OBJECT_ARRAY_SIZE];
+
+
+/* some global vars */
+static t_symbol* pdp_sym_register_rw;
+static t_symbol* pdp_sym_register_ro;
+static t_symbol* pdp_sym_process;
+
+
+/* setup methods */
+/* clears the packet table and looks up the three symbols kept in this file's globals */
+void 
+pdp_packet_setup(void)
+{
+    bzero(pdp_stack, PDP_OBJECT_ARRAY_SIZE * sizeof(t_pdp *));
+    pdp_sym_register_rw = gensym("register_rw");
+    pdp_sym_register_ro = gensym("register_ro");
+    pdp_sym_process = gensym("process");
+}
+
+void 
+pdp_packet_destroy(void)
+{
+    int i;
+    /* dealloc all the data in object stack; pdp_packet_new fills slots from index 0 up, so the first empty slot ends the walk. */
+    /* each slot is cleared after the free so that no stale pointer can be handed out or freed a second time */
+    for (i = 0; (i < PDP_OBJECT_ARRAY_SIZE) && (pdp_stack[i]); i++) { free(pdp_stack[i]); pdp_stack[i] = 0; }
+}
+
+
+/* private pdp_mem methods */
+
+
+/* public object manips */
+
+
+/* alloc method: alloc time is linear in the number of used packets */
+/* think about a better (tree) method when this number grows large */
+/* returns the handle of a packet with users == 1, or -1 if the table is full or malloc fails */
+int 
+pdp_packet_new(unsigned int datatype, unsigned int datasize /*without header*/)
+{
+    unsigned int totalsize = datasize + PDP_HEADER_SIZE;
+    int i = 0;
+    unsigned int align;
+    t_pdp* p;
+    for (i=0; i < PDP_OBJECT_ARRAY_SIZE; i++){
+	p = pdp_stack[i];
+	/* check if we can reuse this one if it is already allocated */
+	if (p) {
+	    /* remark: if p->size >= totalsize we can give away the packet */
+	    /* but that would lead to inefficient use if we have a lot of packets */
+	    /* of different sizes */
+	    if ((p->users == 0) && (p->size == totalsize) && (p->type == datatype)){
+	      p->users = 1;
+	      return i;
+	    }
+	}
+	/* allocate new one */
+	else {
+	    p = (t_pdp *)malloc(totalsize);
+	    /* malloc can fail: the header fields below must not be written through a null pointer */
+	    if (!p) {
+		post("pdp_new_object: WARNING: out of memory");
+		return -1;
+	    }
+	    align = ((unsigned int)p) & (PDP_ALIGN - 1);
+	    if (align) post("pdp_new_object: warning data misaligned by %x", align);
+	    pdp_stack[i] = p;
+	    p->type = datatype;
+	    p->size = totalsize;
+	    p->users = 1;
+	    return i;
+	}
+    }
+    /* every slot holds a packet that is in use or has a different size or type */
+    post("pdp_new_object: WARNING: out of memory");
+
+    return -1;
+}
+
+
+t_pdp*
+pdp_packet_header(int handle)
+{
+    int in_range = (handle >= 0) && (handle < PDP_OBJECT_ARRAY_SIZE); /* handles index pdp_stack */
+    return in_range ? pdp_stack[handle] : 0; /* null for a handle outside the table */
+}
+
+void*
+pdp_packet_data(int handle)
+{
+    /* the data follows the header; an empty slot used to yield the bogus pointer 0 + PDP_HEADER_SIZE instead of null */
+    if ((handle < 0) || (handle >= PDP_OBJECT_ARRAY_SIZE) || !pdp_stack[handle]) return 0;
+    return (char *)(pdp_stack[handle]) + PDP_HEADER_SIZE;
+}
+
+
+
+int
+pdp_packet_copy_ro(int handle)
+{
+    t_pdp* p;
+
+    /* a read-only copy is the packet itself: only its user count changes. */
+    /* returns the same handle, or -1 when the handle does not name a packet */
+
+    /* handles outside the table are rejected */
+    if ((handle < 0) || (handle >= PDP_OBJECT_ARRAY_SIZE)) return -1;
+
+    /* so are handles of slots that hold no packet */
+    p = pdp_stack[handle];
+    if (!p) return -1;
+
+    /* increase the number of users and return */
+    p->users++;
+    return handle;
+}
+
+/* returns a handle the caller may write to: the packet itself when nobody uses it, else a private copy; -1 on failure */
+int
+pdp_packet_copy_rw(int handle)
+{
+    int out_handle;
+    t_pdp* p;
+    if ((handle >= 0) 
+	&& (handle < PDP_OBJECT_ARRAY_SIZE) 
+	&& (p = pdp_stack[handle])){
+	/* if there are other users, copy the object otherwise return the same handle */
+	if (p->users){
+	    int new_handle = pdp_packet_new(p->type, p->size - PDP_HEADER_SIZE);
+	    t_pdp* new_p = pdp_packet_header(new_handle);
+	    /* pdp_packet_new returns -1 when no packet is available: never copy into a null header */
+	    if (new_p){
+		memcpy(new_p, p, p->size);
+		new_p->users = 1;
+	    }
+	    out_handle = new_handle;
+	}
+	else {
+	    p->users++;
+	    out_handle = handle;
+	}
+    }
+    else out_handle = -1;
+    return out_handle;
+}
+
+/* new packet with the same type, size and header contents as handle; the data is not copied. -1 on failure */
+int
+pdp_packet_clone_rw(int handle)
+{
+    int out_handle;
+    t_pdp* p;
+    if ((handle >= 0) 
+	&& (handle < PDP_OBJECT_ARRAY_SIZE) 
+	&& (p = pdp_stack[handle])){
+
+	/* clone the packet header, don't copy the data */
+	int new_handle = pdp_packet_new(p->type, p->size - PDP_HEADER_SIZE);
+	t_pdp* new_p = pdp_packet_header(new_handle);
+	/* new_p is null when pdp_packet_new failed; it can be p itself when p had no users and its slot was reused */
+	if (new_p && (new_p != p)) memcpy(new_p, p, PDP_HEADER_SIZE);
+	if (new_p) new_p->users = 1;
+	out_handle = new_handle;
+    }
+    else out_handle = -1;
+
+    return out_handle;
+}
+
+/* drop one user of the packet; pdp_packet_new may hand out a packet again once its count is zero */
+void
+pdp_packet_mark_unused(int handle)
+{
+    t_pdp* p;
+
+    if ((handle < 0) || (handle >= PDP_OBJECT_ARRAY_SIZE)){
+	/* -1 is the only invalid handle that doesn't trigger a warning */
+	if (handle != -1) post("pdp_mark_unused: WARNING: invalid handle %d: out of bound", handle);
+	return;
+    }
+
+    p = pdp_stack[handle];
+    if (!p){
+	post("pdp_mark_unused: WARNING: invalid handle %d: no associated object", handle);
+	return;
+    }
+
+    /* nothing left to release: warn instead of decrementing */
+    if (!p->users){
+	post("pdp_mark_unused: WARNING: handle %d has zero users (duplicate pdp_mark_unused ?)", handle);
+	return;
+    }
+
+    p->users--;
+}
+
+/* remark. if an owner frees a rw copy, he can still pass it along to clients.
+the first copy instruction revives the object. maybe free should not be called free but unregister.
+as long as no new_object method is called, or no copy on another object is performed,
+the "undead" copy can be revived. this smells a bit, i know...*/
+
+
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/system/pdp_queue.c b/system/pdp_queue.c
new file mode 100644
index 0000000..2932728
--- /dev/null
+++ b/system/pdp_queue.c
@@ -0,0 +1,337 @@
+/*
+ *   Pure Data Packet - processor queue module.
+ *   Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+
+
+/* 
+   this is the processor queue pdp system module.
+   it receives tasks from objects that are scheduled to
+   be computed in another thread. the object is signalled back
+   when the task is completed.
+
+   this is not a standard pd class. it is a singleton class
+   using a standard pd clock to poll for completed methods on
+   every scheduler run. this is a hack to do thread synchronization
+   in a thread unsafe pd.
+
+ */
+
+#include "pdp.h"
+#include <pthread.h>
+#include <unistd.h>
+#include <stdio.h>
+
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#define PDP_QUEUE_SIZE 1024       /* nb of queue slots; a power of 2, as mask is size - 1 */
+#define PDP_QUEUE_DELTIME 1.0f    /* polling delay handed to clock_delay (no ';' in the macro) */
+
+
+
+
+/********************* pdp process queue data *********************/
+
+typedef void (*t_pdpmethod)(void *client);   /* signature of process and callback methods */
+
+/* the process queue data record: one slot of the queue q */
+typedef struct process_queue_struct
+{
+  void *x_owner;             /* the object we are dealing with, passed to both methods */
+  t_pdpmethod x_process;     /* the process method, called by the processing thread */
+  t_pdpmethod x_callback;    /* called when finished; 0 once cleared by pdp_queue_finish */
+  int *x_queue_id;           /* place to store the queue id for task, set to -1 when done; may be 0 */
+} t_process_queue;
+
+
+
+/* clock members */
+static t_clock *pdp_clock;    /* polling clock, created in pdp_queue_setup */
+static double deltime;        /* delay handed to clock_delay between two polls */
+
+/* some bookkeeping vars */
+static long long ticks;       /* nb of polls done by pdp_queue_tick */
+static long long packets;     /* nb of tasks put in the queue by pdp_queue_add */
+
+/* queue members */
+static t_process_queue *q;    /* queue */
+static int mask;              /* PDP_QUEUE_SIZE - 1, wraps an index into q */
+static int head;              /* last entry in queue + 1 */
+static int tail;              /* first entry in queue: next one to get its callback */
+static int curr;              /* the object currently processed in other thread */
+
+/* pthread vars */
+static pthread_mutex_t mut;                 /* used with both condition variables */
+static pthread_cond_t cond_dataready;       /* signalled after a task was added */
+static pthread_cond_t cond_processingdone;  /* signalled when the thread runs out of tasks */
+static pthread_t thread_id;                 /* the processing thread */
+    
+/* synchro pipes */
+static int pipe_fd[2];        /* NOTE(review): not referenced in the code shown here */
+
+/* toggle for thread usage */
+static int use_thread;        /* 0: pdp_queue_add calls process and callback directly */
+
+
+
+/* the methods */
+/* block the caller until the processing thread has caught up with the feeder,
+   i.e. until curr equals head modulo the queue size */
+void pdp_queue_wait()
+{
+    pthread_mutex_lock(&mut);
+
+    /* the processing thread signals cond_processingdone when it runs out of
+       tasks; the indices are tested again after every wakeup */
+    while ((curr - head) & mask) pthread_cond_wait(&cond_processingdone, &mut);
+
+    pthread_mutex_unlock(&mut);
+}
+/* detach a queued task from its owner: wait until the processing thread has
+   drained the queue, then clear the callback and queue id pointer of the slot
+   so pdp_queue_callback skips them. an index of -1 is a no-op. */
+void pdp_queue_finish(int index)
+{
+    t_process_queue *slot;
+
+    if (index == -1) return;
+    /* wait for processing thread to finish */
+    pdp_queue_wait();
+
+    slot = q + (index & mask);
+    slot->x_callback = 0;
+    slot->x_queue_id = 0;
+}
+
+/* wake up the processing thread: signal cond_dataready while holding mut */
+static void pdp_queue_signal_processor(void)
+{
+    pthread_mutex_lock(&mut);
+    //post("signalling process thread");
+    pthread_cond_signal(&cond_dataready);
+    pthread_mutex_unlock(&mut);
+    //post("signalling process thread done");
+
+}
+
+/* called by the processing thread: when curr has caught up with head, signal
+   cond_processingdone and sleep on cond_dataready until the feeder adds a task */
+static void pdp_queue_wait_for_feeder(void)
+{
+    /* only use locking when there is no data (this first test is done without holding mut) */
+    if(((curr - head) & mask) == 0){
+	pthread_mutex_lock(&mut);
+
+	/* signal processing done: this is what pdp_queue_wait sleeps on */
+	//post("pdp_queue_thread: signalling processing is done");
+	pthread_cond_signal(&cond_processingdone);
+
+	/* wait until there is an item in the queue; tested again after every wakeup */
+	while(((curr - head) & mask) == 0){
+	    //post("waiting for feeder");
+	    pthread_cond_wait(&cond_dataready, &mut);
+	    //post("waiting for feeder done");
+	}
+
+	pthread_mutex_unlock(&mut);
+
+    }
+}
+
+/* schedule a task for owner: process(owner) is run by the processing thread,
+   callback(owner) is called afterwards by pdp_queue_callback.
+   process and callback are passed as void* but are called as t_pdpmethod.
+   *queue_id receives the queue slot used, or -1 when nothing was queued:
+   - threading disabled: process and callback are both called right here.
+   - queue full: process is skipped and only callback is called. */
+void pdp_queue_add(void *owner, void *process, void *callback, int *queue_id)
+{
+    int i;
+
+    /* if processing is not in thread, just call the funcs */
+    if (!use_thread){
+	*queue_id = -1;
+	((t_pdpmethod) process)(owner);
+	((t_pdpmethod) callback)(owner);
+	return;
+    }
+
+    /* the queue is full when head is one slot behind tail */
+    if (1 == ((tail - head) & mask)) {
+	post("pdp_queue_add: WARNING: processing queue is full.\n");
+	post("pdp_queue_add: WARNING: skipping process method, calling callback directly.\n");
+	*queue_id = -1;
+	((t_pdpmethod) callback)(owner);
+	/* don't fall through: the task would be queued after all, so the
+	   callback would run a second time and *queue_id would be overwritten */
+	return;
+    }
+
+    /* fill in the slot at head */
+    i = head & mask;
+    q[i].x_owner = owner;
+    q[i].x_process = process;
+    q[i].x_callback = callback;
+    q[i].x_queue_id = queue_id;
+    *queue_id = i;
+    /* increase the packet count */
+    packets++;
+
+    /* move head forward, then wake up the processing thread */
+    head++;
+    pdp_queue_signal_processor();
+}
+
+
+/* processing thread */
+/* entry point of the processing thread: wait for a queued task, call its
+   process method on its owner, advance curr, and start over. the loop never
+   ends; the dummy argument is not used. */
+static void *pdp_queue_thread(void *dummy)
+{
+    t_process_queue *task;
+
+    for (;;) {
+
+	/* wait until there is data available */
+	pdp_queue_wait_for_feeder();
+
+	/* call the process routine of the slot at curr */
+	task = q + (curr & mask);
+	task->x_process(task->x_owner);
+
+	/* advance */
+	curr++;
+
+    }
+}
+
+
+/* call back all the callbacks */
+/* deliver the callbacks of every task the processing thread has finished,
+   i.e. the slots from tail up to curr. the queue id of a task is reset to
+   -1 before its callback runs; null pointers in a slot are skipped. */
+static void pdp_queue_callback (void)
+{
+    t_process_queue *done;
+
+    for (; (curr - tail) & mask; tail++) {
+	done = q + (tail & mask);
+
+	/* invalidate queue id */
+	if (done->x_queue_id) *done->x_queue_id = -1;
+	/* call callback */
+	if (done->x_callback) done->x_callback(done->x_owner);
+    }
+}
+
+/* the clock method */
+/* clock method: while threading is enabled, deliver the callbacks of the
+   finished tasks, count the tick and call clock_delay for the next poll.
+   when threading is disabled it returns without touching the clock. */
+static void pdp_queue_tick (void)
+{
+    if (use_thread) {
+	/* call callbacks */
+	pdp_queue_callback();
+
+	/* increase counter */
+	ticks++;
+
+	/* set clock for next update */
+	clock_delay(pdp_clock, deltime);
+    }
+}
+
+
+/* switch threaded processing on (t != 0) or off (t == 0).
+   switching off first waits for the processing thread to finish, then
+   delivers the pending callbacks and unsets the polling clock. */
+void pdp_queue_use_thread(int t)
+{
+    if (t) {
+	/* restart the polling clock, then enable the thread */
+	clock_unset(pdp_clock);
+	clock_delay(pdp_clock, deltime);
+	use_thread = 1;
+	return;
+    }
+    pdp_queue_wait();
+    use_thread = 0;
+    pdp_queue_callback();
+    clock_unset(pdp_clock);
+}
+
+/* one time setup of the queue module: reset the state, allocate the queue,
+   create the sync objects, start the polling clock and the processing thread.
+   if the thread can't be created, threading is disabled (tasks run directly). */
+void pdp_queue_setup(void)
+{
+  pthread_attr_t attr;
+
+  /* setup pdp queue processor object */
+  ticks = 0;
+  deltime = PDP_QUEUE_DELTIME;
+
+  /* setup queue data */
+  mask = PDP_QUEUE_SIZE - 1;
+  head = 0;
+  tail = 0;
+  curr = 0;
+  q = getbytes(PDP_QUEUE_SIZE * sizeof(*q));
+
+  /* use threads by default */
+  use_thread = 1;
+
+  /* setup synchro stuff */
+  pthread_mutex_init(&mut, NULL);
+  pthread_cond_init(&cond_dataready, NULL);
+  pthread_cond_init(&cond_processingdone, NULL);
+
+  /* allocate the clock and set it */
+  pdp_clock = clock_new(0, (t_method)pdp_queue_tick);
+  clock_delay(pdp_clock, 0);
+
+  /* start processing thread.
+     glibc doc says SCHED_OTHER is default,
+     but it seems not to be when initiated from a RT thread
+     so we explicitly set it here */
+  pthread_attr_init (&attr);
+  pthread_attr_setschedpolicy(&attr, SCHED_OTHER);
+  if (pthread_create(&thread_id, &attr, pdp_queue_thread, (void *)0)) {
+    post("pdp_queue_setup: WARNING: can't create processing thread, threading disabled");
+    use_thread = 0;
+  }
+  /* the attribute object is no longer needed once pthread_create returned */
+  pthread_attr_destroy(&attr);
+}
+
+
+
+
+
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/system/pdp_resample.c b/system/pdp_resample.c
new file mode 100644
index 0000000..2b5a9de
--- /dev/null
+++ b/system/pdp_resample.c
@@ -0,0 +1,135 @@
+/*
+ *   Pure Data Packet system file. - image resampling routines
+ *   Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+
+#include "pdp_resample.h"
+#include "pdp.h"
+
+/*
+
+efficient bilinear resampling ??
+performance: how to eliminate divides? -> virtual coordinates 2^k x 2^k (conf. opengl)
+
+i.e. 16 bit virtual coordinates: easy modular addressing
+
+*/
+
+/* bilinear sample of image (width x height) at 16 bit virtual coordinates:
+   a coordinate c selects the pixel position c * (size - 1) / 0x10000.
+   the result is interpolated from a 2x2 pixel neighbourhood. */
+s32 pdp_resample_bilin(s16 *image, s32 width, s32 height, s32 virt_x, s32 virt_y)
+{
+    s32 fp_x, fp_y, frac_x, frac_y, f, offset, r_1, r_2;
+    /* only the low 16 bits are used, so the coordinates wrap around */
+    virt_x &= 0xffff;
+    virt_y &= 0xffff;
+
+    /* pixel position with 16 fractional bits */
+    fp_x = virt_x * (width - 1);
+    fp_y = virt_y * (height - 1);
+
+    /* the fractional parts are the interpolation weights */
+    frac_x = fp_x & (0xffff);
+    frac_y = fp_y & (0xffff);
+
+    /* step to the upper left pixel of the 2x2 neighbourhood */
+    offset = (fp_x >> 16) + (fp_y >> 16) * width;
+    image += offset;
+    /* interpolate horizontally on this row (r_1), then on the row below (r_2) */
+    f = 0x10000 - frac_x;
+    r_1 = ((f * (s32)(image[0])  +  frac_x * (s32)(image[1])))>>16;
+    image += width;
+    r_2 = ((f * (s32)(image[0])  +  frac_x * (s32)(image[1])))>>16;
+    /* blend the two rows vertically */
+    f = 0x10000 - frac_y;
+    return ((f * r_1 + frac_y * r_2)>>16);
+}
+
+
+/* bilinear scale of src_image (src_w x src_h) into dst_image (dst_w x dst_h).
+   returns without writing anything if dst_w or dst_h is not positive. */
+void pdp_resample_scale_bilin(s16 *src_image, s16 *dst_image, s32 src_w, s32 src_h, s32 dst_w, s32 dst_h)
+{
+    s32 i,j;
+    s32 virt_x, virt_y = 0;   /* virtual coordinates in 30 bit, >>14 gives 16 bit */
+    s32 scale_x, scale_y;     /* 30 bit increment per destination pixel */
+
+    /* the scale factors below would divide by zero */
+    if ((dst_w <= 0) || (dst_h <= 0)) return;
+    scale_x = 0x40000000 / dst_w;
+    scale_y = 0x40000000 / dst_h;
+    for (j=0; j<dst_h; j++, virt_y += scale_y){
+	for (i=0, virt_x=0; i<dst_w; i++, virt_x += scale_x){
+	    *dst_image++ = pdp_resample_bilin(src_image, src_w, src_h, virt_x>>14, virt_y>>14);
+	}
+    }
+}
+
+/* nearest neighbour scale of src_image (src_w x src_h) into dst_image
+   (dst_w x dst_h). the source position has 20 fractional bits and is kept in
+   64 bit, as src_w << 20 no longer fits 32 bit from 2048 pixels on.
+   returns without writing anything if dst_w or dst_h is not positive. */
+void pdp_resample_scale_nn(s16 *src_image, s16 *dst_image, s32 src_w, s32 src_h, s32 dst_w, s32 dst_h)
+{
+    s32 i,j;
+    s32 row = 0;                   /* offset of the current source row */
+    long long frac_x, frac_y = 0;  /* source position, 20 fractional bits */
+    long long scale_x, scale_y;    /* source increment per destination pixel */
+
+    if ((dst_w <= 0) || (dst_h <= 0)) return;
+    scale_x = ((long long)src_w << 20) / dst_w;
+    scale_y = ((long long)src_h << 20) / dst_h;
+    for (j=0; j<dst_h; j++){
+	for (i=0, frac_x=0; i<dst_w; i++){
+	    *dst_image++ = src_image[row + (s32)(frac_x >> 20)];
+	    frac_x += scale_x;
+	}
+	frac_y += scale_y;
+	row = (s32)(frac_y >> 20) * src_w;
+    }
+}
+
+/* zoom a w x h image by (zoom_x, zoom_y) around a relative center point.
+   coordinates carry 20 bits here and are passed on as 16 bit values, which
+   pdp_resample_bilin wraps around: the source image is tiled. */
+void pdp_resample_zoom_tiled_bilin(s16 *src_image, s16 *dst_image, s32 w, s32 h, 
+				   float zoom_x, float zoom_y, float center_x_relative, float center_y_relative)
+{
+    /* inverse zoom factors */
+    float izx = 1.0f / zoom_x;
+    float izy = 1.0f / zoom_y;
+
+    /* increment per destination pixel */
+    s32 step_x = (s32)((float)0x100000 * izx / (float)w);
+    s32 step_y = (s32)((float)0x100000 * izy / (float)h);
+
+    /* coordinates of the first destination pixel */
+    s32 start_x = (s32)((1.0f - izx) * (float)0x100000 * center_x_relative);
+    s32 vy = (s32)((1.0f - izy) * (float)0x100000 * center_y_relative);
+
+    s32 vx, col, row;
+
+    for (row=0; row<h; row++, vy += step_y){
+	for (col=0, vx=start_x; col<w; col++, vx += step_x){
+	    *dst_image++ = pdp_resample_bilin(src_image, w, h, vx>>4, vy>>4);
+	}
+    }
+}
+
diff --git a/system/pdp_type.c b/system/pdp_type.c
new file mode 100644
index 0000000..b23b9cd
--- /dev/null
+++ b/system/pdp_type.c
@@ -0,0 +1,143 @@
+/*
+ *   Pure Data Packet system implementation. : code for handling different packet types
+ *   Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+#include "pdp.h"
+#include <stdio.h>
+
+/* all symbols are C style */
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+
+/****************** packet type checking methods ********************/
+
+
+/* check if two packets are allocated and of the same type.
+
+   packet0, packet1: packet handles.
+   returns 1 when both handles resolve to a packet header and the type
+   fields of the two headers are equal.
+   returns 0 otherwise. */
+int pdp_type_compat(int packet0, int packet1)
+{
+
+    t_pdp *first = pdp_packet_header(packet0);
+    t_pdp *second = pdp_packet_header(packet1);
+    int compatible = 0;
+
+    /* a null header is treated as an invalid packet: such a pair is
+       never compatible */
+    if (first && second){
+	/* both exist: compare the type fields */
+	compatible = (first->type == second->type);
+    }
+
+    return compatible;
+}
+
+/* check if two image packets are allocated and of the same type.
+
+   packet0, packet1: packet handles.
+   returns 1 when both packets exist, both are PDP_IMAGE packets, and their
+   encoding, width and height are equal; returns 0 otherwise. */
+int pdp_type_compat_image(int packet0, int packet1)
+{
+    t_pdp *a = pdp_packet_header(packet0);
+    t_pdp *b = pdp_packet_header(packet1);
+
+    /* generic test first: both allocated and of the same packet type.
+       once it passes, a and b are non null */
+    if (!pdp_type_compat(packet0, packet1)) return 0;
+
+    /* the types are equal, so looking at one of them is enough */
+    if (a->type != PDP_IMAGE) return 0;
+
+    /* same pixel encoding */
+    if (a->info.image.encoding != b->info.image.encoding) return 0;
+
+    /* same dimensions */
+    if (a->info.image.width != b->info.image.width) return 0;
+    if (a->info.image.height != b->info.image.height) return 0;
+
+    return 1;
+}
+
+/* check if packet is a valid image packet: it must have a header, be of type
+   PDP_IMAGE, and be encoded as PDP_IMAGE_YV12 or PDP_IMAGE_GREY */
+int pdp_type_isvalid_image(int packet)
+{
+    t_pdp *header = pdp_packet_header(packet);
+
+    if (!header || (PDP_IMAGE != header->type)) return 0;
+
+    /* only these two encodings are accepted */
+    return (PDP_IMAGE_YV12 == header->info.image.encoding)
+	|| (PDP_IMAGE_GREY == header->info.image.encoding);
+}
+
+
+
+/* create a new PDP_IMAGE packet for a w x h image in PDP_IMAGE_YV12 encoding.
+   returns the packet handle, or -1 if no packet header could be obtained. */
+int pdp_packet_new_image_yv12(u32 w, u32 h)
+{
+    t_pdp *header;
+    int packet;
+    /* w*h pixels plus half as many again; the payload size is twice that */
+    u32 size = w*h;
+    u32 totalnbpixels = size + (size >> 1);
+    u32 packet_size = totalnbpixels << 1;
+
+    packet = pdp_packet_new(PDP_IMAGE, packet_size);
+    header = pdp_packet_header(packet);
+    if (!header) return -1; /* creation failed: nothing to fill in */
+    header->info.image.encoding = PDP_IMAGE_YV12;
+    header->info.image.width = w;
+    header->info.image.height = h;
+    return packet;
+}
+
+/* create a new PDP_IMAGE packet for a w x h image in PDP_IMAGE_GREY encoding.
+   returns the packet handle, or -1 if no packet header could be obtained. */
+int pdp_packet_new_image_grey(u32 w, u32 h)
+{
+    t_pdp *header;
+    int packet;
+    /* w*h pixels; the payload size is twice that */
+    u32 size = w*h;
+    u32 totalnbpixels = size;
+    u32 packet_size = totalnbpixels << 1;
+
+    packet = pdp_packet_new(PDP_IMAGE, packet_size);
+    header = pdp_packet_header(packet);
+    if (!header) return -1; /* creation failed: nothing to fill in */
+    header->info.image.encoding = PDP_IMAGE_GREY;
+    header->info.image.width = w;
+    header->info.image.height = h;
+    return packet;
+}
+
+
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/system/pdp_ut.c b/system/pdp_ut.c
new file mode 100644
index 0000000..83b4cb0
--- /dev/null
+++ b/system/pdp_ut.c
@@ -0,0 +1,195 @@
+/*
+ *   Pure Data Packet - Utility toolkit objects.
+ *   Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+
+/* This file contains some small utility pd objects that make working with 
+   pdp objects a lot easier. Mainly as glue to be used in the abstractions 
+   in the distro. */
+
+#include "pdp.h"
+#include <math.h>
+
+/* this object does an add, scale, clip operation */
+
+t_class *pdp_ut_addscaleclip_class; /* class pointer, set in pdp_ut_addscaleclip_setup */
+
+typedef struct pdp_ut_addscaleclip_struct
+{
+    t_object x_obj;
+    t_outlet *x_outlet0;  /* float outlet for the result */
+    t_float x_min;        /* lower clipping bound */
+    t_float x_max;        /* upper clipping bound */
+    t_float x_offset;     /* added to the input first */
+    t_float x_scale;      /* factor applied after the offset */
+} t_pdp_ut_addscaleclip;
+
+/* float method: add the offset, apply the scale, clip to [x_min, x_max], output */
+static void pdp_ut_addscaleclip_float(t_pdp_ut_addscaleclip *x, t_floatarg f)
+{
+    f = f + x->x_offset;
+    f = f * x->x_scale;
+    if (f < x->x_min) f = x->x_min;
+    if (f > x->x_max) f = x->x_max;
+    outlet_float(x->x_outlet0, f);
+}
+
+static void pdp_ut_addscaleclip_free(t_pdp_ut_addscaleclip *x){} /* free method: empty, does nothing */
+/* constructor: the creation arguments are offset, scale, min, max (in that order) */
+void *pdp_ut_addscaleclip_new(t_floatarg offset, t_floatarg scale, t_floatarg min, t_floatarg max)
+{
+    t_pdp_ut_addscaleclip *self = (t_pdp_ut_addscaleclip *)pd_new(pdp_ut_addscaleclip_class);
+    self->x_offset = offset;
+    self->x_scale = scale;
+    self->x_min = min;
+    self->x_max = max;
+    self->x_outlet0 = outlet_new(&self->x_obj, &s_float);
+    return (void *)self;
+}
+/* register the pdp_ut_addscaleclip class: 4 float creation arguments and a float method */
+void pdp_ut_addscaleclip_setup(void)
+{
+    pdp_ut_addscaleclip_class = class_new(gensym("pdp_ut_addscaleclip"), (t_newmethod)pdp_ut_addscaleclip_new,
+			      (t_method)pdp_ut_addscaleclip_free, sizeof(t_pdp_ut_addscaleclip), 0, 
+					 A_FLOAT, A_FLOAT, A_FLOAT, A_FLOAT, A_NULL);
+    class_addfloat(pdp_ut_addscaleclip_class,  pdp_ut_addscaleclip_float);
+}
+
+
+/* pdp_ut_logmap does a logarithmic parameter mapping of x in [0,1]: x -> min * (max/min)^x */
+/* pdp_ut_logmap_comp does x -> min * (max/min)^(1-x) */
+/* pdp_ut_linmap does x -> min + (max - min) * x */
+
+t_class *pdp_ut_linmap_class;       /* class pointers, each set in its own setup function */
+t_class *pdp_ut_logmap_class;
+t_class *pdp_ut_logmap_comp_class;
+
+typedef struct pdp_ut_map_struct    /* object data shared by the three map classes */
+{
+    t_object x_obj;
+    t_outlet *x_outlet0;  /* float outlet for the mapped value */
+    t_float x_min;        /* "min" of the mapping formulas */
+    t_float x_max;        /* "max" of the mapping formulas */
+} t_pdp_ut_map;
+
+
+/* float method: clip f to [0,1] and output x_min * (x_max / x_min)^f */
+static void pdp_ut_logmap_float(t_pdp_ut_map *x, t_floatarg f)
+{
+    if (f < 0.0f) f = 0.0f;
+    if (f > 1.0f) f = 1.0f;
+
+    f = x->x_min * pow((x->x_max / x->x_min), f);
+    outlet_float(x->x_outlet0, f);
+}
+
+/* float method: clip f to [0,1] and output x_min + (x_max - x_min) * f */
+static void pdp_ut_linmap_float(t_pdp_ut_map *x, t_floatarg f)
+{
+    if (f < 0.0f) f = 0.0f;
+    if (f > 1.0f) f = 1.0f;
+
+    f = x->x_min + ((x->x_max - x->x_min) * f);
+    outlet_float(x->x_outlet0, f);
+}
+
+/* float method: clip f to [0,1] and output x_min * (x_max / x_min)^(1 - f) */
+static void pdp_ut_logmap_comp_float(t_pdp_ut_map *x, t_floatarg f)
+{
+    if (f < 0.0f) f = 0.0f;
+    if (f > 1.0f) f = 1.0f;
+
+    f = x->x_min * pow((x->x_max / x->x_min), (1.0f - f));
+    outlet_float(x->x_outlet0, f);
+}
+
+static void pdp_ut_map_free(t_pdp_ut_map *x){} /* free method of the map classes: empty, does nothing */
+
+/* shared part of the map constructors: store the range and create the float outlet */
+void pdp_ut_map_init(t_pdp_ut_map *x, t_floatarg min, t_floatarg max)
+{
+    x->x_min = min;
+    x->x_max = max;
+    x->x_outlet0 = outlet_new(&x->x_obj, &s_float);
+}
+/* constructor of pdp_ut_logmap: the creation arguments are min and max */
+void *pdp_ut_logmap_new(t_floatarg min, t_floatarg max)
+{
+    t_pdp_ut_map *map = (t_pdp_ut_map *)pd_new(pdp_ut_logmap_class);
+    pdp_ut_map_init(map, min, max);
+    return (void *)map;
+}
+/* constructor of pdp_ut_linmap: the creation arguments are min and max */
+void *pdp_ut_linmap_new(t_floatarg min, t_floatarg max)
+{
+    t_pdp_ut_map *map = (t_pdp_ut_map *)pd_new(pdp_ut_linmap_class);
+    pdp_ut_map_init(map, min, max);
+    return (void *)map;
+}
+/* constructor of pdp_ut_logmap_comp: the creation arguments are min and max */
+void *pdp_ut_logmap_comp_new(t_floatarg min, t_floatarg max)
+{
+    t_pdp_ut_map *map = (t_pdp_ut_map *)pd_new(pdp_ut_logmap_comp_class);
+    pdp_ut_map_init(map, min, max);
+    return (void *)map;
+}
+/* register the pdp_ut_logmap class: 2 float creation arguments and a float method */
+void pdp_ut_logmap_setup(void)
+{
+    pdp_ut_logmap_class = class_new(gensym("pdp_ut_logmap"), (t_newmethod)pdp_ut_logmap_new,
+			      (t_method)pdp_ut_map_free, sizeof(t_pdp_ut_map), 0, 
+					 A_FLOAT, A_FLOAT, A_NULL);
+    class_addfloat(pdp_ut_logmap_class,  pdp_ut_logmap_float);
+}
+/* register the pdp_ut_logmap_comp class: 2 float creation arguments and a float method */
+void pdp_ut_logmap_comp_setup(void)
+{
+    pdp_ut_logmap_comp_class = class_new(gensym("pdp_ut_logmap_comp"), (t_newmethod)pdp_ut_logmap_comp_new,
+			      (t_method)pdp_ut_map_free, sizeof(t_pdp_ut_map), 0, 
+					 A_FLOAT, A_FLOAT, A_NULL);
+    class_addfloat(pdp_ut_logmap_comp_class,  pdp_ut_logmap_comp_float);
+}
+/* register the pdp_ut_linmap class: 2 float creation arguments and a float method */
+void pdp_ut_linmap_setup(void)
+{
+    pdp_ut_linmap_class = class_new(gensym("pdp_ut_linmap"), (t_newmethod)pdp_ut_linmap_new,
+			      (t_method)pdp_ut_map_free, sizeof(t_pdp_ut_map), 0, 
+					 A_FLOAT, A_FLOAT, A_NULL);
+    class_addfloat(pdp_ut_linmap_class,  pdp_ut_linmap_float);
+}
+
+
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+/* entry point of this file: calls the setup function of each of the four utility classes */
+void pdp_ut_setup(void)
+{
+    pdp_ut_addscaleclip_setup();
+    pdp_ut_logmap_setup();
+    pdp_ut_logmap_comp_setup();
+    pdp_ut_linmap_setup();
+}
+
+
+#ifdef __cplusplus
+}
+#endif