aboutsummaryrefslogtreecommitdiff
path: root/system
diff options
context:
space:
mode:
authorTom Schouten <doelie@users.sourceforge.net>2003-01-21 10:27:33 +0000
committerTom Schouten <doelie@users.sourceforge.net>2003-01-21 10:27:33 +0000
commit9b8745d5250c9d0b60c9aa5a77f58a3fcddf1076 (patch)
tree8372b6a414a7124cec57efc9c80845e2bc1b157d /system
This commit was generated by cvs2svn to compensate for changes in r352,svn2git-root
which included commits to RCS files with non-trunk default branches. svn path=/trunk/externals/pdp/; revision=353
Diffstat (limited to 'system')
-rw-r--r--system/Makefile20
-rw-r--r--system/Makefile.linux2
-rw-r--r--system/Makefile.linux_mmx4
-rw-r--r--system/mmx/Makefile29
-rw-r--r--system/mmx/pdp_mmx_test.c62
-rw-r--r--system/mmx/pixel_add_s16.s55
-rw-r--r--system/mmx/pixel_affine_s16.s59
-rw-r--r--system/mmx/pixel_biquad_dirI_s16.s361
-rw-r--r--system/mmx/pixel_biquad_s16.s451
-rw-r--r--system/mmx/pixel_ca_s1.s189
-rw-r--r--system/mmx/pixel_cascade_s16.s330
-rw-r--r--system/mmx/pixel_conv_hor_s16.s134
-rw-r--r--system/mmx/pixel_conv_ver_s16.s128
-rw-r--r--system/mmx/pixel_crot_s16.s153
-rw-r--r--system/mmx/pixel_gain.s83
-rw-r--r--system/mmx/pixel_gain_s16.s71
-rw-r--r--system/mmx/pixel_mix_s16.s68
-rw-r--r--system/mmx/pixel_mul_s16.s56
-rw-r--r--system/mmx/pixel_pack_s16u8.s126
-rw-r--r--system/mmx/pixel_rand_s16.s76
-rw-r--r--system/mmx/pixel_randmix_s16.s91
-rw-r--r--system/mmx/pixel_s1.s201
-rw-r--r--system/mmx/pixel_unpack_u8s16.s113
-rw-r--r--system/pdp.c115
-rw-r--r--system/pdp_comm.c119
-rw-r--r--system/pdp_control.c162
-rw-r--r--system/pdp_imageproc_mmx.c319
-rw-r--r--system/pdp_imageproc_portable.c492
-rw-r--r--system/pdp_llconv.c293
-rw-r--r--system/pdp_llconv_mmx.c55
-rw-r--r--system/pdp_llconv_portable.c81
-rw-r--r--system/pdp_packet.c239
-rw-r--r--system/pdp_queue.c337
-rw-r--r--system/pdp_resample.c135
-rw-r--r--system/pdp_type.c143
-rw-r--r--system/pdp_ut.c195
36 files changed, 5547 insertions, 0 deletions
diff --git a/system/Makefile b/system/Makefile
new file mode 100644
index 0000000..acdb944
--- /dev/null
+++ b/system/Makefile
@@ -0,0 +1,20 @@
+target: all_objects
+
+include ../Makefile.config
+include Makefile.$(PDP_TARGET)
+
+
+
+OBJECTS = pdp.o pdp_ut.o pdp_packet.o pdp_type.o pdp_queue.o pdp_comm.o \
+ pdp_control.o pdp_llconv.o pdp_resample.o
+
+pdp_main_clean:
+ rm -f pdp.o
+
+all_objects: pdp_main_clean $(OBJECTS) platform_targets
+
+clean:
+ rm -f *~
+ rm -f *.o
+ make -C mmx clean
+
diff --git a/system/Makefile.linux b/system/Makefile.linux
new file mode 100644
index 0000000..96660f7
--- /dev/null
+++ b/system/Makefile.linux
@@ -0,0 +1,2 @@
+platform_targets: pdp_imageproc_portable.o pdp_llconv_portable.o
+
diff --git a/system/Makefile.linux_mmx b/system/Makefile.linux_mmx
new file mode 100644
index 0000000..a646e5e
--- /dev/null
+++ b/system/Makefile.linux_mmx
@@ -0,0 +1,4 @@
+platform_subtree:
+ make -C mmx
+
+platform_targets: pdp_imageproc_mmx.o pdp_llconv_mmx.o platform_subtree
diff --git a/system/mmx/Makefile b/system/mmx/Makefile
new file mode 100644
index 0000000..0f8f836
--- /dev/null
+++ b/system/mmx/Makefile
@@ -0,0 +1,29 @@
+include ../../Makefile.config
+
+OBJ = \
+pixel_pack_s16u8.o \
+pixel_unpack_u8s16.o \
+pixel_add_s16.o \
+pixel_mul_s16.o \
+pixel_mix_s16.o \
+pixel_randmix_s16.o \
+pixel_conv_hor_s16.o \
+pixel_conv_ver_s16.o \
+pixel_affine_s16.o \
+pixel_biquad_s16.o \
+pixel_ca_s1.o \
+pixel_rand_s16.o \
+pixel_crot_s16.o \
+pixel_gain_s16.o
+
+all: $(OBJ)
+
+test: pdp_mmx_test.o $(OBJ)
+ gcc -o pdp_mmx_test pdp_mmx_test.o $(OBJ) -g
+
+clean:
+ rm -f *.o
+ rm -f *~
+ rm -f pdp_mmx.a
+ rm -f pdp_mmx_test
+
diff --git a/system/mmx/pdp_mmx_test.c b/system/mmx/pdp_mmx_test.c
new file mode 100644
index 0000000..e93539f
--- /dev/null
+++ b/system/mmx/pdp_mmx_test.c
@@ -0,0 +1,62 @@
+#include "pdp_mmx.h"
+
+#define FP(x) ((short int)(((float)(x) * 2 * 256.0f)))
+
+#define nbp 256
+
+ short int a1[4] = {0x0100,0x0100,0x0100,0x0100};
+ short int a2[4] = {0x0100,0x0100,0x0100,0x0100};
+ short int b0[4] = {0x0100,0x0100,0x0100,0x0100};
+ short int b1[4] = {0x0100,0x0100,0x0100,0x0100};
+ short int b2[4] = {0x0100,0x0100,0x0100,0x0100};
+
+ short int u1[4] = {0x0100,0x0100,0x0100,0x0100};
+ short int u2[4] = {0x0100,0x0100,0x0100,0x0100};
+
+ short int x0[4] = {0x0100,0x0100,0x0100,0x0100};
+ short int x1[4] = {0x0100,0x0100,0x0100,0x0100};
+ short int x2[4] = {0x0100,0x0100,0x0100,0x0100};
+ short int x3[4] = {0x0100,0x0100,0x0100,0x0100};
+
+void print_pixel(unsigned int i)
+{
+ if (i) printf("x ");
+ else printf(". ");
+}
+
+void print_line(void)
+{
+ printf("\n");
+}
+
+void print_square(unsigned char *c)
+{
+ int i,j;
+
+ for(j=7; j>=0; j--){
+ for(i=0; i<8; i++) print_pixel(c[j] & (1<<(7-i)));
+ printf("\n");
+ }
+
+}
+
+main()
+{
+
+ unsigned char src[16]={1,2,3,4,5,6,7,8,-1,-2,-3,-4,-5,-6,-7,-8};
+ unsigned char dst[8];
+
+
+ print_square(src);
+ print_line();
+ print_square(src+8);
+ print_line();
+
+ pixel_test_s1(dst,src,1,1);
+
+ print_square(dst);
+ print_line();
+
+
+
+}
diff --git a/system/mmx/pixel_add_s16.s b/system/mmx/pixel_add_s16.s
new file mode 100644
index 0000000..8d4c7df
--- /dev/null
+++ b/system/mmx/pixel_add_s16.s
@@ -0,0 +1,55 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_add_s16
+.type pixel_add_s16,@function
+
+# simple add
+# void pixel_add_s16(int *left, int *right, int nb_4pixel_vectors)
+
+pixel_add_s16:
+ pushl %ebp
+ movl %esp, %ebp
+ push %esi
+ push %edi
+
+ movl 8(%ebp), %edi # left array
+ movl 12(%ebp), %esi # right array
+ movl 16(%ebp), %ecx # pixel count
+
+
+ .align 16
+ .loop_mix:
+
+# prefetch 128(%esi)
+ movq (%esi), %mm1 # load right 4 pixels from memory
+ movq (%edi), %mm0 # load 4 left pixels from memory
+ paddsw %mm1, %mm0 # mix
+ movq %mm0, (%edi)
+ addl $8, %esi
+ addl $8, %edi
+ decl %ecx
+ jnz .loop_mix # loop
+
+ emms
+
+
+ pop %edi
+ pop %esi
+ leave
+ ret
+
diff --git a/system/mmx/pixel_affine_s16.s b/system/mmx/pixel_affine_s16.s
new file mode 100644
index 0000000..b357de3
--- /dev/null
+++ b/system/mmx/pixel_affine_s16.s
@@ -0,0 +1,59 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_affine_s16
+.type pixel_affine_s16,@function
+
+# void pixel_affine_s16(int *buf, int nb_8pixel_vectors, short int gain[4], short int offset[4])
+
+pixel_affine_s16:
+ pushl %ebp
+ movl %esp, %ebp
+ push %esi
+ push %edi
+
+ movl 20(%ebp), %edi
+ movq (%edi), %mm6 # get offset vector
+
+ movl 16(%ebp), %edi
+ movq (%edi), %mm7 # get gain vector
+
+ movl 8(%ebp), %esi # input array
+ movl 12(%ebp), %ecx # pixel count
+
+
+ .align 16
+ .loop_affine:
+
+# prefetch 128(%esi)
+ movq (%esi), %mm0 # load 4 pixels from memory
+ pmulhw %mm7, %mm0 # apply gain (s).15 fixed point
+ psllw $1, %mm0 # apply correction shift
+ paddsw %mm6, %mm0 # add offset
+ movq %mm0, (%esi) # store result in memory
+
+ addl $8, %esi # increment source pointer
+ decl %ecx
+ jnz .loop_affine # loop
+
+ emms
+
+ pop %edi
+ pop %esi
+ leave
+ ret
+
diff --git a/system/mmx/pixel_biquad_dirI_s16.s b/system/mmx/pixel_biquad_dirI_s16.s
new file mode 100644
index 0000000..1729502
--- /dev/null
+++ b/system/mmx/pixel_biquad_dirI_s16.s
@@ -0,0 +1,361 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+
+
+ # TODO MOVE TO DIRECT FORM II
+ # y[k] = b0 * x[k] + u1[k-1]
+ # u1[k] = b1 * x[k] + u2[k-1] - a1 * y[k]
+ # u2[k] = b2 * x[k] - a2 * y[k]
+
+ # input in register:
+ # %mm0-mm3: input 4x4 pixels {x0 x1 x2 x3}
+ # %esi: coef memory (a1, a2, b0, b1, b2)
+ # %edi: state memory (u1, u2)
+
+
+ # return in register:
+ # %mm0-mm4: 4x4 pixels result
+
+
+ .biquad_4x4_pixels:
+ .align 16
+ # prescale
+ movq -8(%esi), %mm4
+ pmulhw %mm4, %mm0
+ pmulhw %mm4, %mm1
+ pmulhw %mm4, %mm2
+ pmulhw %mm4, %mm3
+ psllw $1, %mm0
+ psllw $1, %mm1
+ psllw $1, %mm2
+ psllw $1, %mm3
+
+
+ # first vector
+ movq 0(%edi), %mm4 # mm4 <- u[-1]
+ movq 8(%edi), %mm5 # mm5 <- u[-2]
+ movq %mm4, %mm6
+ movq %mm5, %mm7
+
+ pmulhw 0(%esi), %mm6 # multiply by a1
+ pmulhw 8(%esi), %mm7 # multiply by a2
+
+ paddsw %mm6, %mm0 # accumulate
+ paddsw %mm7, %mm0 # accumulate
+ paddsw %mm0, %mm0 # scale by 2 (since all fixed point muls are x*y/2)
+
+ movq %mm0, %mm6 # mm6 <- u[0]
+ movq %mm4, %mm7 # mm7 <- u[-1]
+ pmulhw 16(%esi), %mm0 # multiply by b0
+ pmulhw 24(%esi), %mm4 # multiply by b1
+ pmulhw 32(%esi), %mm5 # multiply by b2
+
+ paddsw %mm4, %mm0 # accumulate
+ paddsw %mm5, %mm0 # accumulate
+
+ # mm0 is result 0
+
+ # second vector
+ movq %mm6, %mm4 # mm4 <- u[0]
+ movq %mm7, %mm5 # mm5 <- u[-1]
+
+ pmulhw 0(%esi), %mm6 # multiply by a1
+ pmulhw 8(%esi), %mm7 # multiply by a2
+
+ paddsw %mm6, %mm1 # accumulate
+ paddsw %mm7, %mm1 # accumulate
+ paddsw %mm1, %mm1 # scale by 2
+
+
+ movq %mm1, %mm6 # mm6 <- u[1]
+ movq %mm4, %mm7 # mm7 <- u[0]
+ pmulhw 16(%esi), %mm1 # multiply by b0
+ pmulhw 24(%esi), %mm4 # multiply by b1
+ pmulhw 32(%esi), %mm5 # multiply by b2
+
+ paddsw %mm4, %mm1 # accumulate
+ paddsw %mm5, %mm1 # accumulate
+
+ # mm1 is result 1
+
+ # third vector
+ movq %mm6, %mm4 # mm4 <- u[1]
+ movq %mm7, %mm5 # mm5 <- u[0]
+
+ pmulhw 0(%esi), %mm6 # multiply by a1
+ pmulhw 8(%esi), %mm7 # multiply by a2
+
+ paddsw %mm6, %mm2 # accumulate
+ paddsw %mm7, %mm2 # accumulate
+ paddsw %mm2, %mm2 # scale by 2
+
+
+ movq %mm2, %mm6 # mm6 <- u[2]
+ movq %mm4, %mm7 # mm7 <- u[1]
+ pmulhw 16(%esi), %mm2 # multiply by b0
+ pmulhw 24(%esi), %mm4 # multiply by b1
+ pmulhw 32(%esi), %mm5 # multiply by b2
+
+ paddsw %mm4, %mm2 # accumulate
+ paddsw %mm5, %mm2 # accumulate
+
+ # mm2 is result 2
+
+ # fourth vector
+ movq %mm6, %mm4 # mm4 <- u[2]
+ movq %mm7, %mm5 # mm5 <- u[1]
+
+ pmulhw 0(%esi), %mm6 # multiply by a1
+ pmulhw 8(%esi), %mm7 # multiply by a2
+
+ paddsw %mm6, %mm3 # accumulate
+ paddsw %mm7, %mm3 # accumulate
+ paddsw %mm3, %mm3 # scale by 2
+
+
+ movq %mm3, 0(%edi) # store u[3]
+ movq %mm4, 8(%edi) # store u[2]
+ pmulhw 16(%esi), %mm3 # multiply by b0
+ pmulhw 24(%esi), %mm4 # multiply by b1
+ pmulhw 32(%esi), %mm5 # multiply by b2
+
+ paddsw %mm4, %mm3 # accumulate
+ paddsw %mm5, %mm3 # accumulate
+
+ # mm3 is result 3
+
+ ret
+
+
+ # in order to use the 4 line parallel biquad routine on horizontal
+ # lines, we need to reorder (rotate or transpose) the matrix, since
+ # images are scanline encoded, and we want to work in parallell
+ # on 4 lines.
+ #
+ # since the 4 lines are independent, it doesnt matter in which order
+ # the the vector elements are present.
+ #
+ # this allows us to use the same routine for left->right and right->left
+ # processing.
+ #
+ # some comments on the non-abelean group of square isometries consisting of
+ # (I) identity
+ # (H) horizontal axis mirror
+ # (V) vertical axis mirror
+ # (T) transpose (diagonal axis mirror)
+ # (A) antitranspose (antidiagonal axis mirror)
+ # (R1) 90deg anticlockwize rotation
+ # (R2) 180deg rotation
+ # (R3) 90deg clockwize rotation
+ #
+ #
+ # we basicly have two options: (R1,R3) or (T,A)
+ # we opt for T and A because they are self inverting, which improves locality
+ #
+ # use antitranspose for right to left an transpose
+ # for left to right (little endian)
+
+
+ # antitranspose 4x4
+
+ # input
+ # %mm3 == {d0 d1 d2 d3}
+ # %mm2 == {c0 c1 c2 c3}
+ # %mm1 == {b0 b1 b2 b3}
+ # %mm0 == {a0 a1 a2 a3}
+
+ # output
+ # %mm3 == {a3 b3 c3 d3}
+ # %mm2 == {a2 b2 c2 d2}
+ # %mm1 == {a1 b1 c1 d1}
+ # %mm0 == {a0 b0 c0 d0}
+
+
+ .antitranspose_4x4:
+ .align 16
+ movq %mm3, %mm4
+ punpcklwd %mm1, %mm4 # mm4 <- {b2 d2 b3 d3}
+ movq %mm3, %mm5
+ punpckhwd %mm1, %mm5 # mm5 <- {b0 d0 b1 d1}
+
+ movq %mm2, %mm6
+ punpcklwd %mm0, %mm6 # mm6 <- {a2 c2 a3 c3}
+ movq %mm2, %mm7
+ punpckhwd %mm0, %mm7 # mm7 <- {a0 c0 a1 c1}
+
+ movq %mm4, %mm3
+ punpcklwd %mm6, %mm3 # mm3 <- {a3 b3 c3 d3}
+ movq %mm4, %mm2
+ punpckhwd %mm6, %mm2 # mm2 <- {a2 b2 c2 d2}
+
+ movq %mm5, %mm1
+ punpcklwd %mm7, %mm1 # mm1 <- {a1 b1 c1 d1}
+ movq %mm5, %mm0
+ punpckhwd %mm7, %mm0 # mm0 <- {a0 b0 c0 d0}
+
+ ret
+
+
+
+ # transpose 4x4
+
+ # input
+ # %mm3 == {d3 d2 d1 d0}
+ # %mm2 == {c3 c2 c1 c0}
+ # %mm1 == {b3 b2 b1 b0}
+ # %mm0 == {a3 a2 a1 a0}
+
+ # output
+ # %mm3 == {d3 c3 b3 a3}
+ # %mm2 == {d2 c2 b2 a2}
+ # %mm1 == {d1 c1 b1 a1}
+ # %mm0 == {d0 c0 b0 a0}
+
+
+ .transpose_4x4:
+ .align 16
+ movq %mm0, %mm4
+ punpcklwd %mm2, %mm4 # mm4 <- {c1 a1 c0 a0}
+ movq %mm0, %mm5
+ punpckhwd %mm2, %mm5 # mm5 <- {c3 a3 c2 a2}
+
+ movq %mm1, %mm6
+ punpcklwd %mm3, %mm6 # mm6 <- {d1 b1 d0 b0}
+ movq %mm1, %mm7
+ punpckhwd %mm3, %mm7 # mm7 <- {d3 b3 d2 b2}
+
+ movq %mm4, %mm0
+ punpcklwd %mm6, %mm0 # mm0 <- {d0 c0 b0 a0}
+ movq %mm4, %mm1
+ punpckhwd %mm6, %mm1 # mm1 <- {d1 c1 b1 a1}
+
+ movq %mm5, %mm2
+ punpcklwd %mm7, %mm2 # mm2 <- {d2 c2 b2 a2}
+ movq %mm5, %mm3
+ punpckhwd %mm7, %mm3 # mm3 <- {d3 c3 b3 a3}
+
+ ret
+
+
+.globl pixel_biquad_vertb_s16
+.type pixel_biquad_vertb_s16,@function
+
+
+# pixel_biquad_vertbr_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
+
+
+pixel_biquad_vertb_s16:
+
+
+ pushl %ebp
+ movl %esp, %ebp
+ push %ebx
+ push %esi
+ push %edi
+
+ movl 8(%ebp), %ebx # pixel array offset
+ movl 12(%ebp), %ecx # nb of 4x4 pixblocks
+ movl 16(%ebp), %edx # line with
+
+ movl 20(%ebp), %esi # coefs
+ movl 24(%ebp), %edi # state
+
+ shll $1, %edx # short int addressing
+ movl %edx, %eax
+ shll $1, %eax
+ addl %edx, %eax # eax = 3 * edx
+
+ .align 16
+ .biquad_vertb_line_loop:
+ movq (%ebx), %mm0
+ movq (%ebx,%edx,1), %mm1
+ movq (%ebx,%edx,2), %mm2
+ movq (%ebx,%eax,1), %mm3
+ call .biquad_4x4_pixels
+ movq %mm0, (%ebx)
+ movq %mm1, (%ebx,%edx,1)
+ movq %mm2, (%ebx,%edx,2)
+ movq %mm3, (%ebx,%eax,1)
+ addl %edx, %ebx
+ addl %eax, %ebx
+ decl %ecx
+ jnz .biquad_vertb_line_loop
+
+ emms
+
+ pop %edi
+ pop %esi
+ pop %ebx
+ leave
+ ret
+
+.globl pixel_biquad_horlr_s16
+.type pixel_biquad_horlr_s16,@function
+
+
+# pixel_biquad_hor_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
+
+
+pixel_biquad_horlr_s16:
+
+
+ pushl %ebp
+ movl %esp, %ebp
+ push %ebx
+ push %esi
+ push %edi
+
+ movl 8(%ebp), %ebx # pixel array offset
+ movl 12(%ebp), %ecx # nb of 4x4 pixblocks
+ movl 16(%ebp), %edx # line with
+
+ movl 20(%ebp), %esi # coefs
+ movl 24(%ebp), %edi # state
+
+ shll $1, %edx # short int addressing
+ movl %edx, %eax
+ shll $1, %eax
+ addl %edx, %eax # eax = 3 * edx
+
+ .align 16
+ .biquad_horlr_line_loop:
+ movq (%ebx), %mm0
+ movq (%ebx,%edx,1), %mm1
+ movq (%ebx,%edx,2), %mm2
+ movq (%ebx,%eax,1), %mm3
+ call .transpose_4x4
+ call .biquad_4x4_pixels
+ call .transpose_4x4
+ movq %mm0, (%ebx)
+ movq %mm1, (%ebx,%edx,1)
+ movq %mm2, (%ebx,%edx,2)
+ movq %mm3, (%ebx,%eax,1)
+ addl $8, %ebx
+ decl %ecx
+ jnz .biquad_horlr_line_loop
+
+ emms
+
+ pop %edi
+ pop %esi
+ pop %ebx
+ leave
+ ret
+
+
+
diff --git a/system/mmx/pixel_biquad_s16.s b/system/mmx/pixel_biquad_s16.s
new file mode 100644
index 0000000..844b041
--- /dev/null
+++ b/system/mmx/pixel_biquad_s16.s
@@ -0,0 +1,451 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+
+
+ # DIRECT FORM II BIQUAD
+ #
+ # y[k] = b0 * x[k] + u1[k-1]
+ # u1[k] = b1 * x[k] + u2[k-1] - a1 * y[k]
+ # u2[k] = b2 * x[k] - a2 * y[k]
+ # MACRO: df2 <reg>
+ #
+ # computes a direct form 2 biquad
+ # does not use {mm0-mm3}\<inreg>
+ #
+ # input: <reg> == input
+ # %mm4 == state 1
+ # %mm5 == state 2
+ # (%esi) == biquad coefs (-a1 -a2 b0 b1 b2) in s1.14
+ # output: <reg> == output
+ # %mm4 == state 1
+ # %mm5 == state 2
+
+ .macro df2 reg
+ movq \reg, %mm6 # mm6 == x[k]
+ movq \reg, %mm7 # mm7 == x[k]
+ pmulhw 16(%esi), %mm6 # mm6 == x[k] * b0
+ pmulhw 24(%esi), %mm7 # mm7 == x[k] * b1
+ paddw %mm4, %mm6 # mm6 == x[k] * b0 + u1[k-1] == y[k]
+ paddw %mm5, %mm7 # mm7 == x[k] * b1 + u2[k-1]
+ paddsw %mm6, %mm6 # compensate for mul = x*y/4 (coefs are s1.14 fixed point)
+ paddsw %mm6, %mm6 # paddsw ensures saturation
+ movq \reg, %mm5 # mm5 == x[k]
+ movq %mm6, %mm4 # mm4 == y[k]
+ movq %mm6, \reg # reg == y[k] --------------------
+ pmulhw 0(%esi), %mm4 # mm4 == y[k] * (-a1)
+ pmulhw 8(%esi), %mm6 # mm6 == y[k] * (-a2)
+ pmulhw 32(%esi), %mm5 # mm5 == x[k] * b2
+ paddw %mm7, %mm4 # mm4 == u1[k] --------------------
+ paddw %mm6, %mm5 # mm5 == u2[k] --------------------
+ .endm
+
+
+ # input in register:
+ # %mm0-mm3: input 4x4 pixels {x0 x1 x2 x3}
+ # %esi: coef memory (-a1, -a2, b0, b1, b2) in s1.14
+ # %edi: state memory (u1, u2)
+
+ # return in register:
+ # %mm0-mm4: 4x4 pixels result
+
+
+
+
+ .macro biquad_4x4_pixels
+ .align 16
+ movq 0(%edi), %mm4 # get state
+ movq 8(%edi), %mm5
+ df2 %mm0 # compute 4 biquads
+ df2 %mm1
+ df2 %mm2
+ df2 %mm3
+ movq %mm4, 0(%edi) # store state
+ movq %mm5, 8(%edi)
+ .endm
+
+
+
+ # in order to use the 4 line parallel biquad routine on horizontal
+ # lines, we need to reorder (rotate or transpose) the matrix, since
+ # images are scanline encoded, and we want to work in parallell
+ # on 4 lines.
+ #
+ # since the 4 lines are independent, it doesnt matter in which order
+ # the the vector elements are present.
+ #
+ # this allows us to use the same routine for left->right and right->left
+ # processing.
+ #
+ # some comments on the non-abelean group of square isometries consisting of
+ # (I) identity
+ # (H) horizontal axis mirror
+ # (V) vertical axis mirror
+ # (T) transpose (diagonal axis mirror)
+ # (A) antitranspose (antidiagonal axis mirror)
+ # (R1) 90deg anticlockwize rotation
+ # (R2) 180deg rotation
+ # (R3) 90deg clockwize rotation
+ #
+ #
+ # we basicly have two options: (R1,R3) or (T,A)
+ # we opt for T and A because they are self inverting, which improves locality
+ #
+ # use antitranspose for right to left an transpose
+ # for left to right (little endian)
+
+
+ # antitranspose 4x4
+
+ # input
+ # %mm3 == {d0 d1 d2 d3}
+ # %mm2 == {c0 c1 c2 c3}
+ # %mm1 == {b0 b1 b2 b3}
+ # %mm0 == {a0 a1 a2 a3}
+
+ # output
+ # %mm3 == {a3 b3 c3 d3}
+ # %mm2 == {a2 b2 c2 d2}
+ # %mm1 == {a1 b1 c1 d1}
+ # %mm0 == {a0 b0 c0 d0}
+
+
+ .macro antitranspose_4x4:
+ movq %mm3, %mm4
+ punpcklwd %mm1, %mm4 # mm4 <- {b2 d2 b3 d3}
+ movq %mm3, %mm5
+ punpckhwd %mm1, %mm5 # mm5 <- {b0 d0 b1 d1}
+
+ movq %mm2, %mm6
+ punpcklwd %mm0, %mm6 # mm6 <- {a2 c2 a3 c3}
+ movq %mm2, %mm7
+ punpckhwd %mm0, %mm7 # mm7 <- {a0 c0 a1 c1}
+
+ movq %mm4, %mm3
+ punpcklwd %mm6, %mm3 # mm3 <- {a3 b3 c3 d3}
+ movq %mm4, %mm2
+ punpckhwd %mm6, %mm2 # mm2 <- {a2 b2 c2 d2}
+
+ movq %mm5, %mm1
+ punpcklwd %mm7, %mm1 # mm1 <- {a1 b1 c1 d1}
+ movq %mm5, %mm0
+ punpckhwd %mm7, %mm0 # mm0 <- {a0 b0 c0 d0}
+
+ .endm
+
+
+ # transpose 4x4
+
+ # input
+ # %mm3 == {d3 d2 d1 d0}
+ # %mm2 == {c3 c2 c1 c0}
+ # %mm1 == {b3 b2 b1 b0}
+ # %mm0 == {a3 a2 a1 a0}
+
+ # output
+ # %mm3 == {d3 c3 b3 a3}
+ # %mm2 == {d2 c2 b2 a2}
+ # %mm1 == {d1 c1 b1 a1}
+ # %mm0 == {d0 c0 b0 a0}
+
+
+ .macro transpose_4x4:
+ movq %mm0, %mm4
+ punpcklwd %mm2, %mm4 # mm4 <- {c1 a1 c0 a0}
+ movq %mm0, %mm5
+ punpckhwd %mm2, %mm5 # mm5 <- {c3 a3 c2 a2}
+
+ movq %mm1, %mm6
+ punpcklwd %mm3, %mm6 # mm6 <- {d1 b1 d0 b0}
+ movq %mm1, %mm7
+ punpckhwd %mm3, %mm7 # mm7 <- {d3 b3 d2 b2}
+
+ movq %mm4, %mm0
+ punpcklwd %mm6, %mm0 # mm0 <- {d0 c0 b0 a0}
+ movq %mm4, %mm1
+ punpckhwd %mm6, %mm1 # mm1 <- {d1 c1 b1 a1}
+
+ movq %mm5, %mm2
+ punpcklwd %mm7, %mm2 # mm2 <- {d2 c2 b2 a2}
+ movq %mm5, %mm3
+ punpckhwd %mm7, %mm3 # mm3 <- {d3 c3 b3 a3}
+
+ .endm
+
+.globl pixel_biquad_vertb_s16
+.type pixel_biquad_vertb_s16,@function
+
+
+# pixel_biquad_vertbr_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
+
+
+pixel_biquad_vertb_s16:
+
+
+ pushl %ebp
+ movl %esp, %ebp
+ push %ebx
+ push %esi
+ push %edi
+
+ movl 8(%ebp), %ebx # pixel array offset
+ movl 12(%ebp), %ecx # nb of 4x4 pixblocks
+ movl 16(%ebp), %edx # line with
+
+ movl 20(%ebp), %esi # coefs
+ movl 24(%ebp), %edi # state
+
+ shll $1, %edx # short int addressing
+ movl %edx, %eax
+ shll $1, %eax
+ addl %edx, %eax # eax = 3 * edx
+
+ .align 16
+ .biquad_vertb_line_loop:
+ movq (%ebx), %mm0
+ movq (%ebx,%edx,1), %mm1
+ movq (%ebx,%edx,2), %mm2
+ movq (%ebx,%eax,1), %mm3
+ biquad_4x4_pixels
+ movq %mm0, (%ebx)
+ movq %mm1, (%ebx,%edx,1)
+ movq %mm2, (%ebx,%edx,2)
+ movq %mm3, (%ebx,%eax,1)
+ addl %edx, %ebx
+ addl %eax, %ebx
+ decl %ecx
+ jnz .biquad_vertb_line_loop
+
+ emms
+
+ pop %edi
+ pop %esi
+ pop %ebx
+ leave
+ ret
+.globl pixel_biquad_verbt_s16
+.type pixel_biquad_verbt_s16,@function
+
+
+# pixel_biquad_vertbt_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
+
+
+pixel_biquad_verbt_s16:
+
+
+ pushl %ebp
+ movl %esp, %ebp
+ push %ebx
+ push %esi
+ push %edi
+
+ movl 8(%ebp), %ebx # pixel array offset
+ movl 12(%ebp), %ecx # nb of 4x4 pixblocks
+ movl 16(%ebp), %eax # line with
+
+ shll $3, %eax # 4 line byte spacing
+ decl %ecx
+ mul %ecx
+ incl %ecx
+ addl %eax, %ebx # ebx points to last pixblock
+
+ movl 16(%ebp), %edx # line with
+
+ movl 20(%ebp), %esi # coefs
+ movl 24(%ebp), %edi # state
+
+ shll $1, %edx # short int addressing
+ movl %edx, %eax
+ shll $1, %eax
+ addl %edx, %eax # eax = 3 * edx
+
+ .align 16
+ .biquad_verbt_line_loop:
+ movq (%ebx), %mm3
+ movq (%ebx,%edx,1), %mm2
+ movq (%ebx,%edx,2), %mm1
+ movq (%ebx,%eax,1), %mm0
+ biquad_4x4_pixels
+ movq %mm3, (%ebx)
+ movq %mm2, (%ebx,%edx,1)
+ movq %mm1, (%ebx,%edx,2)
+ movq %mm0, (%ebx,%eax,1)
+ subl %edx, %ebx
+ subl %eax, %ebx
+ decl %ecx
+ jnz .biquad_verbt_line_loop
+
+ emms
+
+ pop %edi
+ pop %esi
+ pop %ebx
+ leave
+ ret
+
+.globl pixel_biquad_horlr_s16
+.type pixel_biquad_horlr_s16,@function
+# pixel_biquad_hor_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
+
+pixel_biquad_horlr_s16:
+
+
+ pushl %ebp
+ movl %esp, %ebp
+ push %ebx
+ push %esi
+ push %edi
+
+ movl 8(%ebp), %ebx # pixel array offset
+ movl 12(%ebp), %ecx # nb of 4x4 pixblocks
+ movl 16(%ebp), %edx # line with
+
+ movl 20(%ebp), %esi # coefs
+ movl 24(%ebp), %edi # state
+
+ shll $1, %edx # short int addressing
+ movl %edx, %eax
+ shll $1, %eax
+ addl %edx, %eax # eax = 3 * edx
+
+ .align 16
+ .biquad_horlr_line_loop:
+ movq (%ebx), %mm0
+ movq (%ebx,%edx,1), %mm1
+ movq (%ebx,%edx,2), %mm2
+ movq (%ebx,%eax,1), %mm3
+ transpose_4x4
+ biquad_4x4_pixels
+ transpose_4x4
+ movq %mm0, (%ebx)
+ movq %mm1, (%ebx,%edx,1)
+ movq %mm2, (%ebx,%edx,2)
+ movq %mm3, (%ebx,%eax,1)
+ addl $8, %ebx
+ decl %ecx
+ jnz .biquad_horlr_line_loop
+
+ emms
+
+ pop %edi
+ pop %esi
+ pop %ebx
+ leave
+ ret
+
+
+.globl pixel_biquad_horrl_s16
+.type pixel_biquad_horrl_s16,@function
+# pixel_biquad_horrl_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
+
+pixel_biquad_horrl_s16:
+
+ pushl %ebp
+ movl %esp, %ebp
+ push %ebx
+ push %esi
+ push %edi
+
+ movl 8(%ebp), %ebx # pixel array offset
+ movl 12(%ebp), %ecx # nb of 4x4 pixblocks
+ movl 16(%ebp), %edx # line with
+
+
+ movl %ecx, %eax
+ decl %eax
+ shll $3, %eax
+ addl %eax, %ebx # ebx points to last pixblock
+
+
+ movl 20(%ebp), %esi # coefs
+ movl 24(%ebp), %edi # state
+
+ shll $1, %edx # short int addressing
+ movl %edx, %eax
+ shll $1, %eax
+ addl %edx, %eax # eax = 3 * edx
+
+ .align 16
+ .biquad_horrl_line_loop:
+ movq (%ebx), %mm0
+ movq (%ebx,%edx,1), %mm1
+ movq (%ebx,%edx,2), %mm2
+ movq (%ebx,%eax,1), %mm3
+ antitranspose_4x4
+ biquad_4x4_pixels
+ antitranspose_4x4
+ movq %mm0, (%ebx)
+ movq %mm1, (%ebx,%edx,1)
+ movq %mm2, (%ebx,%edx,2)
+ movq %mm3, (%ebx,%eax,1)
+ subl $8, %ebx
+ decl %ecx
+ jnz .biquad_horrl_line_loop
+
+ emms
+
+ pop %edi
+ pop %esi
+ pop %ebx
+ leave
+ ret
+
+
+.globl pixel_biquad_time_s16
+.type pixel_biquad_time_s16,@function
+# pixel_biquad_time_s16(short int *pixel_array, short int *s1, short int *s2, short int *coefs, int nb_4_pix_vectors)
+
+pixel_biquad_time_s16:
+
+ pushl %ebp
+ movl %esp, %ebp
+ push %ebx
+ push %esi
+ push %edi
+
+ movl 8(%ebp), %ebx # pixel array offset
+ movl 12(%ebp), %edx # state 1 array
+ movl 16(%ebp), %edi # state 2 array
+
+ movl 20(%ebp), %esi # coefs
+ movl 24(%ebp), %ecx # nb of 4 pixel vectors
+
+
+ .align 16
+ .biquad_time_loop:
+ movq (%ebx), %mm0 # get input
+ movq (%edx), %mm4 # get state 1
+ movq (%edi), %mm5 # get state 2
+ df2 %mm0 # compute direct form 2
+ movq %mm0, (%ebx) # write output
+ movq %mm5, (%edi) # write state 2
+ movq %mm4, (%edx) # write state 1
+ addl $8, %ebx
+ addl $8, %edi
+ addl $8, %edx
+ decl %ecx
+ jnz .biquad_time_loop
+
+ emms
+
+ pop %edi
+ pop %esi
+ pop %ebx
+ leave
+ ret
+
+
diff --git a/system/mmx/pixel_ca_s1.s b/system/mmx/pixel_ca_s1.s
new file mode 100644
index 0000000..d9c730f
--- /dev/null
+++ b/system/mmx/pixel_ca_s1.s
@@ -0,0 +1,189 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+
+ # this file contains assembler routines for 2D 1 bit cellular automata
+ # processing. it is organized around a feeder kernel and a
+ # stack based bit processor (virtual forth machine)
+ #
+ # the feeder kernel is responsable for loading/storing CA cells
+ # from/to memory. data in memory is organized as a scanline
+ # encoded toroidial bitplane (lsb = left). to simplify the kernel, the top
+ # left corner of the rectangular grid of pixels will shift down
+ # every processing step.
+ #
+ # the stack machine has the following architecture:
+ # CA stack: %esi, TOS: %mm0 (32x2 pixels. lsw = top row)
+ # CA horizon: %mm4-%mm7 (64x4 pixels. %mm4 = top row)
+ #
+ # the stack size / organization is not known to the stack machine.
+ # it can be thought of as operating on a 3x3 cell neightbourhood.
+ # the only purpose of forth program is to determine the CA local update rule.
+ #
+ # the machine is supposed to be very minimal. no looping control.
+ # no adressing modes. no conditional code (hey, this is an experiment!)
+ # so recursion is not allowed (no way to stop it)
+ # there are 9 words to load the cell neigbourhood on the stack.
+ # the rest is just logic and stack manips.
+
+
+ # this file contains pure asm macros. it is to be included before assembly
+ # after scaforth.pl has processed the .scaf file
+
+
+ # *************************** CA CELL ACCESS MACROS *****************************
+ # fetchTL - fetchBR
+
+ # shift / load rectangle macros:
+
+ # shift rectangle horizontal
+ # result is in reg1
+ .macro shift reg1 reg2 count
+ psllq $(32+\count), \reg1
+ psrlq $(32-\count), \reg2
+ psrlq $32, \reg1
+ psllq $32, \reg2
+ por \reg2, \reg1
+ .endm
+
+ .macro ldtop reg1 reg2
+ movq %mm4, \reg1
+ movq %mm5, \reg2
+ .endm
+
+ .macro ldcenter reg1 reg2
+ movq %mm5, \reg1
+ movq %mm6, \reg2
+ .endm
+
+ .macro ldbottom reg1 reg2
+ movq %mm6, \reg1
+ movq %mm7, \reg2
+ .endm
+
+
+ # fetch from top row
+
+ # fetch the top left square
+ .macro fetchTL
+ ldtop %mm0, %mm1
+ shift %mm0, %mm1, -1
+ .endm
+
+ # fetch the top mid square
+ .macro fetchTM
+ ldtop %mm0, %mm1
+ shift %mm0, %mm1, 0
+ .endm
+
+ # fetch the top right square
+ .macro fetchTR
+ ldtop %mm0, %mm1
+ shift %mm0, %mm1, 1
+ .endm
+
+
+
+ # fetch from center row
+
+ # fetch the mid left square
+ .macro fetchML
+ ldcenter %mm0, %mm1
+ shift %mm0, %mm1, -1
+ .endm
+
+ # fetch the mid mid square
+ .macro fetchMM
+ ldcenter %mm0, %mm1
+ shift %mm0, %mm1, 0
+ .endm
+
+ # fetch the mid right square
+ .macro fetchMR
+ ldcenter %mm0, %mm1
+ shift %mm0, %mm1, 1
+ .endm
+
+
+
+
+
+ # fetch from bottom row
+
+ # fetch the bottom left square
+ .macro fetchBL
+ ldbottom %mm0, %mm1
+ shift %mm0, %mm1, -1
+ .endm
+
+ # fetch the bottom mid square
+ .macro fetchBM
+ ldbottom %mm0, %mm1
+ shift %mm0, %mm1, 0
+ .endm
+
+ # fetch the bottom right square
+ .macro fetchBR
+ ldbottom %mm0, %mm1
+ shift %mm0, %mm1, 1
+ .endm
+
+
+
+ # *************************** CA STACK MANIP MACROS *****************************
+ # dup drop dropdup swap nip dropover
+
+ .macro dup
+ lea -8(%esi), %esi
+ movq %mm0, (%esi)
+ .endm
+
+ .macro drop
+ movq (%esi), %mm0
+ lea 8(%esi), %esi
+ .endm
+
+ .macro dropdup
+ movq (%esi), %mm0
+ .endm
+
+ .macro swap
+ movq (%esi), %mm1
+ movq %mm0, (%esi)
+ movq %mm1, %mm0
+ .endm
+
+ .macro nip
+ lea 8(%esi), %esi
+ .endm
+
+ .macro dropover
+ movq 8(%esi), %mm0
+ .endm
+
+
+ # *************************** CA BOOLEAN LOGIC MACROS *****************************
+ # overxor
+
+ .macro overxor
+ pxor (%esi), %mm0
+ .endm
+
+
+
+
+
diff --git a/system/mmx/pixel_cascade_s16.s b/system/mmx/pixel_cascade_s16.s
new file mode 100644
index 0000000..bf88d08
--- /dev/null
+++ b/system/mmx/pixel_cascade_s16.s
@@ -0,0 +1,330 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+
+
+ # TODO: COUPLED CASCADE SECOND ORDER SECTION
+ #
+ # s1[k] = ar * s1[k-1] + ai * s2[k-1] + x[k]
+ # s2[k] = ar * s2[k-1] - ai * s1[k-1]
+ # y[k] = c0 * x[k] + c1 * s1[k-1] + c2 * s2[k-1]
+
+
+ # MACRO: df2
+ #
+ # computes a coupled cascade
+ #
+ # input: %mm0 == input
+ # %mm1 == state 1
+ # %mm2 == state 2
+ # (%esi) == cascade coefs (ar ai c0 c1 c2) in s0.15
+ # output: %mm0 == output
+ # %mm1 == state 1
+ # %mm2 == state 2
+
+
+ .macro coupled
+ pmovq %mm1, %mm3 # mm3 == s1[k-1]
+ pmovq %mm1, %mm4 # mm4 == s1[k-1]
+ pmovq %mm2, %mm5 # mm5 == s2[k-1]
+ pmovq %mm2, %mm6 # mm5 == s2[k-1]
+ pmulhw (%esi), %mm1 # mm1 == s1[k-1] * ar
+ pmulhw 8(%esi), %mm3 # mm3 == s1[k-1] * ai
+ pmulhw 24(%esi), %mm4 # mm4 == s1[k-1] * c1
+ pmulhw (%esi), %mm2 # mm2 == s2[k-1] * ar
+ pmulhw 8(%esi), %mm5 # mm5 == s2[k-1] * ai
+ pmulhw 32(%esi), %mm6 # mm6 == s2[k-1] * c2
+ paddw %mm5, %mm1 # mm1 == s1[k-1] * ar + s2[k-1] * ai
+ psubw %mm3, %mm2 # mm2 == s2[k-1] * ar - s1[k-1] * ai == s2[k]
+ paddw %mm0, %mm1 # mm1 == s1[k]
+ pmulhw 16(%esi), %mm0 # mm0 == x[k] * c0
+ paddw %mm6, %mm4 # mm4 == s1[k-1] * c1 + s2[k-1] * c2
+ paddw %mm4, %mm0 # mm0 == y[k]
+ .endm
+
+
+
+
+ # in order to use the 4 line parallel cascade routine on horizontal
+ # lines, we need to reorder (rotate or transpose) the matrix, since
+ # images are scanline encoded, and we want to work in parallell
+ # on 4 lines.
+ #
+ # since the 4 lines are independent, it doesnt matter in which order
+ # the the vector elements are present.
+ #
+ # this allows us to use the same routine for left->right and right->left
+ # processing.
+ #
+ # some comments on the non-abelean group of square isometries consisting of
+ # (I) identity
+ # (H) horizontal axis mirror
+ # (V) vertical axis mirror
+ # (T) transpose (diagonal axis mirror)
+ # (A) antitranspose (antidiagonal axis mirror)
+ # (R1) 90deg anticlockwize rotation
+ # (R2) 180deg rotation
+ # (R3) 90deg clockwize rotation
+ #
+ #
+ # we basicly have two options: (R1,R3) or (T,A)
+ # we opt for T and A because they are self inverting, which improves locality
+ #
+ # use antitranspose for right to left an transpose
+ # for left to right (little endian)
+
+
+ # antitranspose 4x4
+
+ # input
+ # %mm3 == {d0 d1 d2 d3}
+ # %mm2 == {c0 c1 c2 c3}
+ # %mm1 == {b0 b1 b2 b3}
+ # %mm0 == {a0 a1 a2 a3}
+
+ # output
+ # %mm3 == {a3 b3 c3 d3}
+ # %mm2 == {a2 b2 c2 d2}
+ # %mm1 == {a1 b1 c1 d1}
+ # %mm0 == {a0 b0 c0 d0}
+
+
+ .macro antitranspose_4x4:
+ movq %mm3, %mm4
+ punpcklwd %mm1, %mm4 # mm4 <- {b2 d2 b3 d3}
+ movq %mm3, %mm5
+ punpckhwd %mm1, %mm5 # mm5 <- {b0 d0 b1 d1}
+
+ movq %mm2, %mm6
+ punpcklwd %mm0, %mm6 # mm6 <- {a2 c2 a3 c3}
+ movq %mm2, %mm7
+ punpckhwd %mm0, %mm7 # mm7 <- {a0 c0 a1 c1}
+
+ movq %mm4, %mm3
+ punpcklwd %mm6, %mm3 # mm3 <- {a3 b3 c3 d3}
+ movq %mm4, %mm2
+ punpckhwd %mm6, %mm2 # mm2 <- {a2 b2 c2 d2}
+
+ movq %mm5, %mm1
+ punpcklwd %mm7, %mm1 # mm1 <- {a1 b1 c1 d1}
+ movq %mm5, %mm0
+ punpckhwd %mm7, %mm0 # mm0 <- {a0 b0 c0 d0}
+
+ .endm
+
+
+ # transpose 4x4
+
+ # input
+ # %mm3 == {d3 d2 d1 d0}
+ # %mm2 == {c3 c2 c1 c0}
+ # %mm1 == {b3 b2 b1 b0}
+ # %mm0 == {a3 a2 a1 a0}
+
+ # output
+ # %mm3 == {d3 c3 b3 a3}
+ # %mm2 == {d2 c2 b2 a2}
+ # %mm1 == {d1 c1 b1 a1}
+ # %mm0 == {d0 c0 b0 a0}
+
+
+ .macro transpose_4x4:
+ movq %mm0, %mm4
+ punpcklwd %mm2, %mm4 # mm4 <- {c1 a1 c0 a0}
+ movq %mm0, %mm5
+ punpckhwd %mm2, %mm5 # mm5 <- {c3 a3 c2 a2}
+
+ movq %mm1, %mm6
+ punpcklwd %mm3, %mm6 # mm6 <- {d1 b1 d0 b0}
+ movq %mm1, %mm7
+ punpckhwd %mm3, %mm7 # mm7 <- {d3 b3 d2 b2}
+
+ movq %mm4, %mm0
+ punpcklwd %mm6, %mm0 # mm0 <- {d0 c0 b0 a0}
+ movq %mm4, %mm1
+ punpckhwd %mm6, %mm1 # mm1 <- {d1 c1 b1 a1}
+
+ movq %mm5, %mm2
+ punpcklwd %mm7, %mm2 # mm2 <- {d2 c2 b2 a2}
+ movq %mm5, %mm3
+ punpckhwd %mm7, %mm3 # mm3 <- {d3 c3 b3 a3}
+
+ .endm
+
+.globl pixel_cascade_vertb_s16
+.type pixel_cascade_vertb_s16,@function
+
+
+# pixel_cascade_vertbr_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
+
+
+pixel_cascade_vertb_s16:
+
+
+ pushl %ebp
+ movl %esp, %ebp
+ push %ebx
+ push %esi
+ push %edi
+
+ movl 8(%ebp), %ebx # pixel array offset
+ movl 12(%ebp), %ecx # nb of 4x4 pixblocks
+ movl 16(%ebp), %edx # line with
+
+ movl 20(%ebp), %esi # coefs
+ movl 24(%ebp), %edi # state
+
+ shll $1, %edx # short int addressing
+ subl %edx, %ebx
+
+ movq 0(%edi), %mm1 # s1[k-1]
+ movq 8(%edi), %mm2 # s2[k-1]
+ .align 16
+ .cascade_vertb_line_loop:
+
+ movq (%ebx,%edx,1), %mm3
+ movq %mm3, %mm0
+ addl %edx, %ebx
+ coupled
+ movq %mm0, (%ebx)
+
+ movq (%ebx,%edx,1), %mm3
+ movq %mm3, %mm0
+ addl %edx, %ebx
+ coupled
+ movq %mm0, (%ebx)
+
+ movq (%ebx,%edx,1), %mm3
+ movq %mm3, %mm0
+ addl %edx, %ebx
+ coupled
+ movq %mm0, (%ebx)
+
+ movq (%ebx,%edx,1), %mm3
+ movq %mm3, %mm0
+ addl %edx, %ebx
+ coupled
+ movq %mm0, (%ebx)
+
+ decl %ecx
+ jnz .cascade_vertb_line_loop
+
+ movq %mm1, 0(%edi) # s1[k-1]
+ movq %mm2, 8(%edi) # s2[k-1]
+
+ emms
+
+ pop %edi
+ pop %esi
+ pop %ebx
+ leave
+ ret
+
+.globl pixel_cascade_horlr_s16
+.type pixel_cascade_horlr_s16,@function
+
+
+# pixel_cascade_hor_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
+
+
+pixel_cascade_horlr_s16:
+
+
+ pushl %ebp
+ movl %esp, %ebp
+ push %ebx
+ push %esi
+ push %edi
+
+ movl 8(%ebp), %ebx # pixel array offset
+ movl 12(%ebp), %ecx # nb of 4x4 pixblocks
+ movl 16(%ebp), %edx # line with
+
+ movl 20(%ebp), %esi # coefs
+ movl 24(%ebp), %edi # state
+
+ shll $1, %edx # short int addressing
+ movl %edx, %eax
+ shll $1, %eax
+ addl %edx, %eax # eax = 3 * edx
+
+
+ .align 16
+ .cascade_horlr_line_loop:
+ movq (%edi), %mm1
+ movq 8(%edi), %mm2
+
+ movq (%ebx), %mm0
+ movq (%ebx,%edx,1), %mm1
+ movq (%ebx,%edx,2), %mm2
+ movq (%ebx,%eax,1), %mm3
+
+ transpose_4x4
+
+ movq %mm1, (%ebx,%edx,1)
+ movq %mm2, (%ebx,%edx,2)
+ movq %mm3, (%ebx,%eax,1)
+
+ coupled
+
+ movq %mm0, (%ebx)
+ movq (%ebx,%edx,1), %mm3
+ movq %mm3, %mm0
+
+ coupled
+
+ movq %mm0, (%ebx, %edx,1)
+ movq (%ebx,%edx,2), %mm3
+ movq %mm3, %mm0
+
+ coupled
+
+ movq %mm0, (%ebx, %edx,2)
+ movq (%ebx,%eax,1), %mm3
+ movq %mm3, %mm0
+
+ coupled
+
+ movq %mm1, 0(%edi) # s1[k-1]
+ movq %mm2, 8(%edi) # s2[k-1]
+
+ movq %mm0, %mm3
+ movq (%ebx), %mm0
+ movq (%ebx,%edx,1), %mm1
+ movq (%ebx,%edx,2), %mm2
+
+ transpose_4x4
+
+ movq %mm0, (%ebx)
+ movq %mm1, (%ebx,%edx,1)
+ movq %mm2, (%ebx,%edx,2)
+ movq %mm3, (%ebx,%eax,1)
+
+ addl $8, %ebx
+ decl %ecx
+ jnz .cascade_horlr_line_loop
+
+ emms
+
+ pop %edi
+ pop %esi
+ pop %ebx
+ leave
+ ret
+
+
+
diff --git a/system/mmx/pixel_conv_hor_s16.s b/system/mmx/pixel_conv_hor_s16.s
new file mode 100644
index 0000000..e90a692
--- /dev/null
+++ b/system/mmx/pixel_conv_hor_s16.s
@@ -0,0 +1,134 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+ # intermediate function
+
+ # input in register:
+ # %mm0: left 4 pixels
+ # %mm1: middle 4 pixels
+ # %mm2: right 4 pixels
+
+ # %mm5: left 4 pixel masks
+ # %mm6: middle 4 pixel masks
+ # %mm7: right 4 pixel masks
+
+ # return in register:
+ # %mm0: middle 4 pixels result
+
+
+ .conv_hor_4_pixels:
+ .align 16
+
+ # compute quadruplet
+
+ # get left pixels
+ psrlq $48, %mm0 # shift word 3 to byte 0
+ movq %mm1, %mm4
+ psllq $16, %mm4 # shift word 0,1,2 to 1,2,3
+ por %mm4, %mm0 # combine
+ pmulhw %mm5, %mm0
+ psllw $1, %mm0
+
+
+ # get middle pixels
+ movq %mm1, %mm4
+ pmulhw %mm6, %mm4
+ psllw $1, %mm4
+ paddsw %mm4, %mm0
+
+
+ # get right pixels
+ movq %mm2, %mm3
+ psllq $48, %mm3 # shift word 0 to word 3
+ movq %mm1, %mm4
+ psrlq $16, %mm4 # shift word 1,2,3 to 0,1,2
+ por %mm4, %mm3 # combine
+ pmulhw %mm7, %mm3
+ psllw $1, %mm3
+ paddsw %mm3, %mm0 # accumulate
+
+ ret
+
+.globl pixel_conv_hor_s16
+.type pixel_conv_hor_s16,@function
+
+
+# pixel_conv_hor_s16(short int *pixel_array, int nb_4_pixel_vectors, short int border[4], short int mask[12])
+# horizontal unsigned pixel conv (1/4 1/2 1/4) not tested
+# NOT TESTED
+
+
+pixel_conv_hor_s16:
+
+
+ pushl %ebp
+ movl %esp, %ebp
+ push %esi
+ push %edi
+
+ movl 8(%ebp), %esi # pixel array offset
+ movl 12(%ebp), %ecx # nb of 8 pixel vectors in a row (at least 2)
+
+ movl 20(%ebp), %edi # mask vector
+ movq (%edi), %mm5
+ movq 8(%edi), %mm6
+ movq 16(%edi), %mm7
+
+ movl 16(%ebp), %edi # boundary pixel vector
+
+
+
+ movq (%edi), %mm0 # init regs (left edge, so mm0 is zero)
+ movq (%esi), %mm1
+ movq 8(%esi), %mm2
+
+ decl %ecx # loop has 2 terminator stubs
+ decl %ecx # todo: handle if ecx < 3
+
+ jmp .conv_line_loop
+
+
+ .align 16
+ .conv_line_loop:
+ call .conv_hor_4_pixels # compute conv
+ movq %mm0, (%esi) # store result
+ movq %mm1, %mm0 # mm0 <- prev (%esi)
+ movq %mm2, %mm1 # mm1 <- 8(%esi)
+ movq 16(%esi), %mm2 # mm2 <- 16(%esi)
+
+ addl $8, %esi # increase pointer
+ decl %ecx
+ jnz .conv_line_loop
+
+ call .conv_hor_4_pixels # compute conv
+ movq %mm0, (%esi) # store result
+ movq %mm1, %mm0 # mm0 <- prev (%esi)
+ movq %mm2, %mm1 # mm1 <- 8(%esi)
+ movq (%edi), %mm2 # mm2 <- border
+
+ call .conv_hor_4_pixels # compute last vector
+ movq %mm0, 8(%esi) # store it
+
+ emms
+
+ pop %edi
+ pop %esi
+ leave
+ ret
+
+
+
diff --git a/system/mmx/pixel_conv_ver_s16.s b/system/mmx/pixel_conv_ver_s16.s
new file mode 100644
index 0000000..ae2456f
--- /dev/null
+++ b/system/mmx/pixel_conv_ver_s16.s
@@ -0,0 +1,128 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+#TODO: fix out of bound acces in conv_ver and conv_hor
+
+ # intermediate function
+
+ # input in register:
+ # %mm0: top 4 pixels
+ # %mm1: middle 4 pixels
+ # %mm2: bottom 4 pixels
+
+ # %mm5: top 4 pixel mask
+ # %mm6: middle 4 pixel mask
+ # %mm7: bottom 4 pixel mask
+
+ # return in register:
+ # %mm0: middle 4 pixels result
+
+
+ .conv_ver_4_pixels:
+ .align 16
+
+ # compute quadruplet
+
+ # get top pixel
+ pmulhw %mm5, %mm0
+ psllw $1, %mm0
+
+ # get middle pixel
+ movq %mm1, %mm4
+ pmulhw %mm6, %mm4
+ psllw $1, %mm4
+ paddsw %mm4, %mm0
+
+ # get bottom pixel
+ movq %mm2, %mm3
+ pmulhw %mm7, %mm3
+ psllw $1, %mm3 # mm3 <- mm3/4
+ paddsw %mm3, %mm0
+
+ ret
+
+.globl pixel_conv_ver_s16
+.type pixel_conv_ver_s16,@function
+
+
+# pixel_conv_ver_s16(short int *pixel_array, int nb_4_pixel_vectors, int row_byte_size, short int border[4])
+# horizontal unsigned pixel conv (1/4 1/2 1/4) not tested
+# NOT TESTED
+
+
+pixel_conv_ver_s16:
+
+
+ pushl %ebp
+ movl %esp, %ebp
+ push %esi
+ push %edi
+
+ movl 8(%ebp), %esi # pixel array offset
+ movl 12(%ebp), %ecx # nb of 4 pixel vectors in a row (at least 2)
+ movl 16(%ebp), %edx # rowsize in bytes
+
+ movl 24(%ebp), %edi # mask vector
+ movq (%edi), %mm5
+ movq 8(%edi), %mm6
+ movq 16(%edi), %mm7
+
+ movl 20(%ebp), %edi # edge vector
+
+
+ shll $1, %edx
+ decl %ecx # loop has a terminator stub
+ decl %ecx # loop has another terminator stub
+
+
+ movq (%edi), %mm0 # init regs (left edge, so mm0 is zero)
+ movq (%esi), %mm1
+ movq (%esi,%edx,1), %mm2
+ jmp .conv_line_loop
+
+
+ .align 16
+ .conv_line_loop:
+ call .conv_ver_4_pixels # compute conv
+ movq %mm0, (%esi) # store result
+ movq %mm1, %mm0 # mm0 <- prev (%esi)
+ movq %mm2, %mm1 # mm1 <- (%esi,%edx,1)
+ movq (%esi,%edx,2), %mm2 # mm2 <- (%esi,%edx,2)
+
+ addl %edx, %esi # increase pointer
+ decl %ecx
+ jnz .conv_line_loop
+
+ call .conv_ver_4_pixels # compute conv
+ movq %mm0, (%esi) # store result
+ movq %mm1, %mm0 # mm0 <- prev (%esi)
+ movq %mm2, %mm1 # mm1 <- (%esi,%edx,1)
+ movq (%edi), %mm2 # clear invalid edge vector
+
+ addl %edx, %esi # increase pointer
+ call .conv_ver_4_pixels # compute last vector
+ movq %mm0, (%esi) # store it
+
+ emms
+
+ pop %edi
+ pop %esi
+ leave
+ ret
+
+
+
diff --git a/system/mmx/pixel_crot_s16.s b/system/mmx/pixel_crot_s16.s
new file mode 100644
index 0000000..2427869
--- /dev/null
+++ b/system/mmx/pixel_crot_s16.s
@@ -0,0 +1,153 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_crot3d_s16
+.type pixel_crot3d_s16,@function
+
+
+# 3 dimensional colour space rotation
+# 3x3 matrix is column encoded, each coefficient is a 4x16 bit fixed point vector
+
+# void pixel_crot3d_s16(int *buf, int nb_4pixel_vectors_per_plane, short int *matrix)
+
+pixel_crot3d_s16:
+ pushl %ebp
+ movl %esp, %ebp
+ push %esi
+ push %edi
+
+
+ movl 8(%ebp), %esi # input array
+ movl 12(%ebp), %ecx # pixel count
+ movl 16(%ebp), %edi # rotation matrix
+ movl %ecx, %edx
+ shll $3, %edx # %edx = plane spacing
+
+
+ .align 16
+ .loop_crot3d:
+
+ movq (%esi), %mm0 # get 1st component
+ movq (%esi,%edx,1), %mm6 # get 2nd component
+ movq (%esi,%edx,2), %mm7 # get 3rd component
+
+ movq %mm0, %mm1 # copy 1st component
+ movq %mm0, %mm2
+
+ pmulhw (%edi), %mm0 # mul first column
+ pmulhw 8(%edi), %mm1
+ pmulhw 16(%edi), %mm2
+
+ movq %mm6, %mm5 # copy 2nd component
+ movq %mm6, %mm3
+
+ pmulhw 24(%edi), %mm6 # mul second column
+ pmulhw 32(%edi), %mm5
+ pmulhw 40(%edi), %mm3
+
+ paddsw %mm6, %mm0 # accumulate
+ paddsw %mm5, %mm1
+ paddsw %mm3, %mm2
+
+ movq %mm7, %mm4 # copy 3rd component
+ movq %mm7, %mm6
+
+ pmulhw 48(%edi), %mm4 # mul third column
+ pmulhw 56(%edi), %mm6
+ pmulhw 64(%edi), %mm7
+
+ paddsw %mm4, %mm0 # accumulate
+ paddsw %mm6, %mm1
+ paddsw %mm7, %mm2
+
+ paddsw %mm0, %mm0 # double (fixed point normalization)
+ paddsw %mm1, %mm1
+ paddsw %mm2, %mm2
+
+ movq %mm0, (%esi) # store
+ movq %mm1, (%esi, %edx, 1)
+ movq %mm2, (%esi, %edx, 2)
+
+ addl $8, %esi # increment source pointer
+ decl %ecx
+ jnz .loop_crot3d # loop
+
+ emms
+
+ pop %edi
+ pop %esi
+ leave
+ ret
+
+
+.globl pixel_crot2d_s16
+.type pixel_crot2d_s16,@function
+
+# 2 dimensional colour space rotation
+# 2x2 matrix is column encoded, each coefficient is a 4x16 bit fixed point vector
+
+# void pixel_crot2d_s16(int *buf, int nb_4pixel_vectors_per_plane, short int *matrix)
+
+pixel_crot2d_s16:
+ pushl %ebp
+ movl %esp, %ebp
+ push %esi
+ push %edi
+
+
+ movl 8(%ebp), %esi # input array
+ movl 12(%ebp), %ecx # pixel count
+ movl 16(%ebp), %edi # rotation matrix
+ movl %ecx, %edx
+ shll $3, %edx # %edx = plane spacing
+
+
+ .align 16
+ .loop_crot2d:
+
+ movq (%esi), %mm0 # get 1st component
+ movq (%esi,%edx,1), %mm2 # get 2nd component
+
+ movq %mm0, %mm1 # copy 1st component
+ movq %mm2, %mm3 # copy 2nd component
+
+ pmulhw (%edi), %mm0 # mul first column
+ pmulhw 8(%edi), %mm1
+
+ pmulhw 16(%edi), %mm2 # mul second column
+ pmulhw 24(%edi), %mm3
+
+ paddsw %mm2, %mm0 # accumulate
+ paddsw %mm3, %mm1
+
+ paddsw %mm0, %mm0 # fixed point gain correction
+ paddsw %mm1, %mm1
+
+ movq %mm0, (%esi) # store
+ movq %mm1, (%esi, %edx, 1)
+
+ addl $8, %esi # increment source pointer
+ decl %ecx
+ jnz .loop_crot2d # loop
+
+ emms
+
+ pop %edi
+ pop %esi
+ leave
+ ret
+
diff --git a/system/mmx/pixel_gain.s b/system/mmx/pixel_gain.s
new file mode 100644
index 0000000..5cd5057
--- /dev/null
+++ b/system/mmx/pixel_gain.s
@@ -0,0 +1,83 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_gain
+.type pixel_gain,@function
+
+# mmx rgba pixel gain
+# void asmtest(char *pixelarray, int32 nbpixels, int *rgba_gain)
+# gains are 7.9 fixed point for rgba
+
+pixel_gain:
+ pushl %ebp
+ movl %esp, %ebp
+ push %esi
+ push %edi
+
+ movl 8(%ebp), %esi # pixel array offset
+ movl 12(%ebp), %ecx # nb of elements
+ movl 16(%ebp), %edi # int16[4] array of gains
+
+ prefetch (%esi)
+
+ emms
+ sarl $2, %ecx # process 4 pixels per loop iteration
+ jz .exit
+ movq (%edi), %mm7 # read gain array from memory
+ jmp .loop_gain
+
+ .align 16
+ .loop_gain:
+
+ prefetch 128(%esi)
+ movq (%esi), %mm5 # load pixel 1-2 from memory
+ movq 8(%esi), %mm6 # load pixel 3-4 from memory
+ pxor %mm0, %mm0 # zero mm0 - mm3
+ pxor %mm1, %mm1
+ pxor %mm2, %mm2
+ pxor %mm3, %mm3
+ punpcklbw %mm5, %mm0 # unpack 1st pixel into 8.8 bit ints
+ punpckhbw %mm5, %mm1 # unpack 2nd
+ punpcklbw %mm6, %mm2 # unpack 3rd
+ punpckhbw %mm6, %mm3 # unpack 4th
+ psrlw $0x1, %mm0 # shift right to clear sign bit 9.7
+ psrlw $0x1, %mm1
+ psrlw $0x1, %mm2
+ psrlw $0x1, %mm3
+
+ pmulhw %mm7, %mm0 # multiply 1st pixel 9.7 * 7.9 -> 16.0
+ pmulhw %mm7, %mm1 # multiply 2nd
+ pmulhw %mm7, %mm2 # multiply 3rd
+ pmulhw %mm7, %mm3 # multiply 4th
+
+ packuswb %mm1, %mm0 # pack & saturate to 8bit vector
+ movq %mm0, (%esi) # store result in memory
+ packuswb %mm3, %mm2 # pack & saturate to 8bit vector
+ movq %mm2, 8(%esi) # store result in memory
+
+ addl $16, %esi # increment source pointer
+ decl %ecx
+ jnz .loop_gain # loop
+
+ .exit:
+ emms
+
+ pop %edi
+ pop %esi
+ leave
+ ret
+
diff --git a/system/mmx/pixel_gain_s16.s b/system/mmx/pixel_gain_s16.s
new file mode 100644
index 0000000..adcfdf5
--- /dev/null
+++ b/system/mmx/pixel_gain_s16.s
@@ -0,0 +1,71 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_gain_s16
+.type pixel_gain_s16,@function
+
+# gain is integer, shift count is down
+# void pixel_gain_s16(int *buf, int nb_8pixel_vectors, short int gain[4], unsigned long long *shift)
+
+pixel_gain_s16:
+ pushl %ebp
+ movl %esp, %ebp
+ push %esi
+ push %edi
+
+ movl 20(%ebp), %edi
+ movq (%edi), %mm6 # get shift vector
+
+ movl 16(%ebp), %edi
+ movq (%edi), %mm7 # get gain vector
+
+ movl 8(%ebp), %esi # input array
+ movl 12(%ebp), %ecx # pixel count
+
+
+ .align 16
+ .loop_gain:
+
+ movq (%esi), %mm0 # load 4 pixels from memory
+ movq %mm0, %mm1
+ pmulhw %mm7, %mm1 # apply gain (s15.0) fixed point, high word
+ pmullw %mm7, %mm0 # low word
+
+ movq %mm0, %mm2 # copy
+ movq %mm1, %mm3
+
+ punpcklwd %mm1, %mm0 # unpack lsw components
+ punpckhwd %mm3, %mm2 # unpack msw components
+
+ psrad %mm6, %mm0 # apply signed shift
+ psrad %mm6, %mm2
+
+ packssdw %mm2, %mm0 # pack result & saturate
+ movq %mm0, (%esi) # store result
+
+
+ addl $8, %esi # increment source pointer
+ decl %ecx
+ jnz .loop_gain # loop
+
+ emms
+
+ pop %edi
+ pop %esi
+ leave
+ ret
+
diff --git a/system/mmx/pixel_mix_s16.s b/system/mmx/pixel_mix_s16.s
new file mode 100644
index 0000000..9bf41eb
--- /dev/null
+++ b/system/mmx/pixel_mix_s16.s
@@ -0,0 +1,68 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_mix_s16
+.type pixel_mix_s16,@function
+
+# mmx rgba pixel gain
+# void pixel_mix_s16(int *left, int *right, int nb_4pixel_vectors,
+# short int gain_left[4], short int gain_right[4])
+
+pixel_mix_s16:
+ pushl %ebp
+ movl %esp, %ebp
+ push %esi
+ push %edi
+
+ movl 20(%ebp), %edi # int16[4] array of gains
+ movq (%edi), %mm6 # get left gain array
+
+ movl 24(%ebp), %edi # int16[4] array of gains
+ movq (%edi), %mm7 # get right gain array
+
+ movl 8(%ebp), %edi # left array
+ movl 12(%ebp), %esi # right array
+ movl 16(%ebp), %ecx # pixel count
+
+
+ .align 16
+ .loop_mix:
+
+# prefetch 128(%esi)
+ movq (%esi), %mm1 # load right 4 pixels from memory
+ pmulhw %mm7, %mm1 # apply right gain
+ movq (%edi), %mm0 # load 4 left pixels from memory
+ pmulhw %mm6, %mm0 # apply left gain
+# pslaw $1, %mm1 # shift left ((s).15 x (s).15 -> (s0).14))
+# pslaw $1, %mm0
+ paddsw %mm0, %mm0 # no shift left arithmic, so use add instead
+ paddsw %mm1, %mm1
+ paddsw %mm1, %mm0 # mix
+ movq %mm0, (%edi)
+ addl $8, %esi
+ addl $8, %edi
+ decl %ecx
+ jnz .loop_mix # loop
+
+ emms
+
+
+ pop %edi
+ pop %esi
+ leave
+ ret
+
diff --git a/system/mmx/pixel_mul_s16.s b/system/mmx/pixel_mul_s16.s
new file mode 100644
index 0000000..240a024
--- /dev/null
+++ b/system/mmx/pixel_mul_s16.s
@@ -0,0 +1,56 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_mul_s16
+.type pixel_mul_s16,@function
+
+# simple add
+# void pixel_mul_s16(int *left, int *right, int nb_4pixel_vectors)
+
+pixel_mul_s16:
+ pushl %ebp
+ movl %esp, %ebp
+ push %esi
+ push %edi
+
+ movl 8(%ebp), %edi # left array
+ movl 12(%ebp), %esi # right array
+ movl 16(%ebp), %ecx # pixel count
+
+
+ .align 16
+ .loop_mix:
+
+# prefetch 128(%esi)
+ movq (%esi), %mm1 # load right 4 pixels from memory
+ movq (%edi), %mm0 # load 4 left pixels from memory
+ pmulhw %mm1, %mm0 # mul
+ psllw $1, %mm0 # fixed point shift correction
+ movq %mm0, (%edi)
+ addl $8, %esi
+ addl $8, %edi
+ decl %ecx
+ jnz .loop_mix # loop
+
+ emms
+
+
+ pop %edi
+ pop %esi
+ leave
+ ret
+
diff --git a/system/mmx/pixel_pack_s16u8.s b/system/mmx/pixel_pack_s16u8.s
new file mode 100644
index 0000000..57df702
--- /dev/null
+++ b/system/mmx/pixel_pack_s16u8.s
@@ -0,0 +1,126 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_pack_s16u8_y
+.type pixel_pack_s16u8_y,@function
+
+# mmx rgba pixel gain
+# void pixel_pack_s16u8_y(int *input, int *output, int nb_8pixel_vectors)
+
+pixel_pack_s16u8_y:
+ pushl %ebp
+ movl %esp, %ebp
+ push %esi
+ push %edi
+
+# movl 20(%ebp), %edi # int16[4] array of gains
+# movq (%edi), %mm7 # get gain array
+# psllw $1, %mm7 # adjust for shifted sign bit
+
+ movl 8(%ebp), %esi # input array
+ movl 12(%ebp), %edi # output array
+ movl 16(%ebp), %ecx # pixel count
+
+ pxor %mm6, %mm6
+
+ .align 16
+ .loop_pack_y:
+
+# prefetch 128(%esi)
+ movq (%esi), %mm0 # load 4 pixels from memory
+# pmulhw %mm7, %mm0 # apply gain
+ movq 8(%esi), %mm1 # load 4 pixels from memory
+# pmulhw %mm7, %mm1 # apply gain
+
+# movq %mm0, %mm2
+# pcmpgtw %mm6, %mm2 # mm2 > 0 ? 0xffff : 0
+# pand %mm2, %mm0
+
+# movq %mm1, %mm3
+# pcmpgtw %mm6, %mm3 # mm3 > 0 ? 0xffff : 0
+# pand %mm3, %mm1
+
+# psllw $1, %mm0 # shift out sign bit
+# psllw $1, %mm1 # shift out sign bit
+
+ psraw $7, %mm0 # shift to lsb
+ psraw $7, %mm1 # shift to lsb
+
+ packuswb %mm1, %mm0 # pack & saturate to 8bit vector
+ movq %mm0, (%edi) # store result in memory
+
+ addl $16, %esi # increment source pointer
+ addl $8, %edi # increment dest pointer
+ decl %ecx
+ jnz .loop_pack_y # loop
+
+ emms
+
+ pop %edi
+ pop %esi
+ leave
+ ret
+
+.globl pixel_pack_s16u8_uv
+.type pixel_pack_s16u8_uv,@function
+
+pixel_pack_s16u8_uv:
+ pushl %ebp
+ movl %esp, %ebp
+ push %esi
+ push %edi
+
+# movl 20(%ebp), %edi # int16[4] array of gains
+# movq (%edi), %mm7 # get gain array
+ movl 8(%ebp), %esi # pixel array offset
+ movl 12(%ebp), %edi # nb of elements
+ movl 16(%ebp), %ecx # pixel count
+
+ pcmpeqw %mm6, %mm6
+ psllw $15, %mm6
+ movq %mm6, %mm5
+ psrlw $8, %mm5
+ por %mm5, %mm6 # mm6 <- 8 times 0x80
+
+ .align 16
+ .loop_pack_uv:
+
+# prefetch 128(%esi)
+ movq (%esi), %mm0 # load 4 pixels from memory
+# pmulhw %mm7, %mm0 # apply gain
+ movq 8(%esi), %mm1 # load 4 pixels from memory
+# pmulhw %mm7, %mm1 # apply gain
+
+ psraw $8, %mm0 # shift to msb
+ psraw $8, %mm1
+
+ packsswb %mm1, %mm0 # pack & saturate to 8bit vector
+ pxor %mm6, %mm0 # flip sign bits
+ movq %mm0, (%edi) # store result in memory
+
+ addl $16, %esi # increment source pointer
+ addl $8, %edi # increment dest pointer
+ decl %ecx
+ jnz .loop_pack_uv # loop
+
+ emms
+
+ pop %edi
+ pop %esi
+ leave
+ ret
+
diff --git a/system/mmx/pixel_rand_s16.s b/system/mmx/pixel_rand_s16.s
new file mode 100644
index 0000000..649400b
--- /dev/null
+++ b/system/mmx/pixel_rand_s16.s
@@ -0,0 +1,76 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_rand_s16
+.type pixel_rand_s16,@function
+
+# mmx rgba pixel gain
+# void pixel_rand_s16(int *dst, nb_4pixel_vectors, short int random_seed[4])
+
+pixel_rand_s16:
+ pushl %ebp
+ movl %esp, %ebp
+ push %esi
+ push %edi
+
+ movl 16(%ebp), %esi # int16[4] array of random seeds
+ movl 8(%ebp), %edi # dst array
+ movl 12(%ebp), %ecx # pixel count
+
+ movq (%esi), %mm6
+
+
+ pcmpeqw %mm3, %mm3
+ psrlw $15, %mm3 # get bit mask 4 times 0x0001
+
+ .align 16
+ .loop_rand:
+
+# prefetch 128(%esi)
+
+
+ movq %mm6, %mm4 # get random vector
+ psrlw $15, %mm4 # get first component
+ movq %mm6, %mm5
+ psrlw $14, %mm5 # get second component
+ pxor %mm5, %mm4
+ movq %mm6, %mm5
+ psrlw $12, %mm5 # get third component
+ pxor %mm5, %mm4
+ movq %mm6, %mm5
+ psrlw $3, %mm5 # get forth component
+ pxor %mm5, %mm4
+
+ psllw $1, %mm6 # shift left original random vector
+ pand %mm3, %mm4 # isolate new bit
+ por %mm4, %mm6 # combine into new random vector
+
+ movq %mm6, (%edi)
+ addl $8, %edi
+ decl %ecx
+ jnz .loop_rand # loop
+
+
+ movq %mm6, (%esi) # store random seeds
+
+ emms
+
+ pop %edi
+ pop %esi
+ leave
+ ret
+
diff --git a/system/mmx/pixel_randmix_s16.s b/system/mmx/pixel_randmix_s16.s
new file mode 100644
index 0000000..44e1702
--- /dev/null
+++ b/system/mmx/pixel_randmix_s16.s
@@ -0,0 +1,91 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_randmix_s16
+.type pixel_randmix_s16,@function
+
+# mmx rgba pixel gain
+# void pixel_randmix_s16(int *left, int *right, int nb_4pixel_vectors, short int random_seed[4], short int threshold[4])
+
+pixel_randmix_s16:
+ pushl %ebp
+ movl %esp, %ebp
+ push %esi
+ push %edi
+
+ movl 20(%ebp), %edi # int16[4] array of random seeds
+ movq (%edi), %mm6
+
+ movl 24(%ebp), %edi # int16[4] array of thresholds
+ movq (%edi), %mm7
+
+ movl 8(%ebp), %edi # left array
+ movl 12(%ebp), %esi # right array
+ movl 16(%ebp), %ecx # pixel count
+
+ pcmpeqw %mm3, %mm3
+ psrlw $15, %mm3 # get bit mask 4 times 0x0001
+
+ .align 16
+ .loop_randmix:
+
+# prefetch 128(%esi)
+ movq (%esi), %mm1 # load right 4 pixels from memory
+ movq (%edi), %mm0 # load 4 left pixels from memory
+
+ movq %mm6, %mm2 # get random vector
+ pcmpgtw %mm7, %mm2 # compare random vector with threshold
+ movq %mm2, %mm5
+
+ pand %mm0, %mm2 # get left array's components
+ pandn %mm1, %mm5 # get right array's components
+ por %mm2, %mm5
+
+ movq %mm5, (%edi) # store pixels
+
+ movq %mm6, %mm4 # get random vector
+ psrlw $15, %mm4 # get first component
+ movq %mm6, %mm5
+ psrlw $14, %mm5 # get second component
+ pxor %mm5, %mm4
+ movq %mm6, %mm5
+ psrlw $12, %mm5 # get third component
+ pxor %mm5, %mm4
+ movq %mm6, %mm5
+ psrlw $3, %mm5 # get forth component
+ pxor %mm5, %mm4
+
+ psllw $1, %mm6 # shift left original random vector
+ pand %mm3, %mm4 # isolate new bit
+ por %mm4, %mm6 # combine into new random vector
+
+ addl $8, %esi
+ addl $8, %edi
+ decl %ecx
+ jnz .loop_randmix # loop
+
+
+ movl 20(%ebp), %edi # int16[4] array of random seeds
+ movq %mm6, (%edi) # store random seeds
+
+ emms
+
+ pop %edi
+ pop %esi
+ leave
+ ret
+
diff --git a/system/mmx/pixel_s1.s b/system/mmx/pixel_s1.s
new file mode 100644
index 0000000..d6bc5ca
--- /dev/null
+++ b/system/mmx/pixel_s1.s
@@ -0,0 +1,201 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+
+ # this file contains ops for binary image processing
+ # 8x8 bit tile encoded
+ # low byte = bottom row
+ # low bit = right column
+ # %mm7 = scratch reg for all macros
+
+
+ # ************ load mask *******************
+ # compute bit masks for rows and columns
+ # %mm7: scratch reg
+
+ # load mask top
+ .macro ldmt count reg
+ pcmpeqb \reg, \reg
+ psllq $(64-(\count<<3)), \reg
+ .endm
+
+ # load mask bottom
+ .macro ldmb count reg
+ pcmpeqb \reg, \reg
+ psrlq $(64-(\count<<3)), \reg
+ .endm
+
+ # load mask top and bottom
+ .macro ldmtb count regt regb
+ ldmb \count, \regb
+ ldmt \count, \regt
+ .endm
+
+ # load mask right
+ .macro ldmr count reg
+ pcmpeqb %mm7, %mm7
+ psrlw $(16-\count), %mm7
+ movq %mm7, \reg
+ psllq $8, %mm7
+ por %mm7, \reg
+ .endm
+
+ # load mask left
+ .macro ldml count reg
+ pcmpeqb %mm7, %mm7
+ psllw $(16-\count), %mm7
+ movq %mm7, \reg
+ psrlq $8, %mm7
+ por %mm7, \reg
+ .endm
+
+ # load mask left and right
+ .macro ldmlr count regl regr
+ pcmpeqb %mm7, %mm7
+ psllw $(16-\count), %mm7
+ movq %mm7, \regl
+ psrlq $8, %mm7
+ por %mm7, \regl
+ movq \regl, \regr
+ psrlq $(8-\count), \regr
+ .endm
+
+ # ************* shift square **********
+ # shifts a square in reg, fills with zeros
+
+ # shift square top
+ .macro sst count reg
+ psllq $(\count<<3), \reg
+ .endm
+
+ # shift square bottom
+ .macro ssb count reg
+ psrlq $(\count<<3), \reg
+ .endm
+
+ # not tested
+ # shift square left
+ .macro ssl count reg
+ movq \reg, %mm7
+ pcmpeqb \reg, \reg
+ psllw $(16-\count), \reg
+ psrlw $8, \reg
+ pandn %mm7, \reg
+ psllw $(\count), \reg
+ .endm
+
+ # shift square right
+ .macro ssr count reg
+ movq \reg, %mm7
+ pcmpeqb \reg, \reg
+ psrlw $(16-\count), \reg
+ psllw $8, \reg
+ pandn %mm7, \reg
+ psrlw $(\count), \reg
+ .endm
+
+
+ # ********** combine square *************
+ # combines 2 squares
+
+ # combine right
+ .macro csr count regr reg
+ ssl \count, \reg
+ ssr (8-\count), \regr
+ por \regr, \reg
+ .endm
+
+ # combine left
+ .macro csl count regl reg
+ ssr \count, \reg
+ ssl (8-\count), \regl
+ por \regl, \reg
+ .endm
+
+ # combine top
+ .macro cst count regt reg
+ ssb \count, \reg
+ sst (8-\count), \regt
+ por \regt, \reg
+ .endm
+
+
+ # combine bottom
+ .macro csb count regb reg
+ sst \count, \reg
+ ssb (8-\count), \regb
+ por \regb, \reg
+ .endm
+
+
+ # ********** load combine square *************
+ # loads combined square using mask
+
+ # load combined square left
+ # mask should be count bits set right (i.e. 0x01)
+ .macro lcsml count mask source sourcel dstreg
+ movq \mask, \dstreg
+ movq \mask, %mm7
+ pandn \source, \dstreg
+ pand \sourcel, %mm7
+ psrlq $(\count), \dstreg
+ psllq $(8-\count), %mm7
+ por %mm7, \dstreg
+ .endm
+
+
+
+.globl pixel_test_s1
+.type pixel_test_s1,@function
+
+# simple add
+# void pixel_add_s16(void *dest, void *source, int nb_squares, int spacing)
+
+
+
+ #
+
+
+pixel_test_s1:
+ pushl %ebp
+ movl %esp, %ebp
+ push %esi
+ push %edi
+
+ movl 8(%ebp), %edi # dest
+ movl 12(%ebp), %esi # source
+ movl 16(%ebp), %ecx # count
+ movl 20(%ebp), %edx # row distance
+
+ ldmr 1, %mm6
+ lcsml 1, %mm6, (%esi), 8(%esi), %mm0
+ movq %mm0, (%edi)
+
+
+# movq (%esi), %mm0
+# movq 8(%esi), %mm1
+# csl 4, %mm1, %mm0
+# movq %mm0, (%edi)
+
+ emms
+
+
+ pop %edi
+ pop %esi
+ leave
+ ret
+
diff --git a/system/mmx/pixel_unpack_u8s16.s b/system/mmx/pixel_unpack_u8s16.s
new file mode 100644
index 0000000..0fc14c2
--- /dev/null
+++ b/system/mmx/pixel_unpack_u8s16.s
@@ -0,0 +1,113 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_unpack_u8s16_y
+.type pixel_unpack_u8s16_y,@function
+
+# mmx rgba pixel gain
+# void pixel_unpack_u8s16_y(char *input, char *output, int32 nb_pixels_div8)
+
+pixel_unpack_u8s16_y:
+ pushl %ebp
+ movl %esp, %ebp
+ push %esi
+ push %edi
+
+# movl 20(%ebp), %edi # int16[4] array of gains
+# movq (%edi), %mm7 # get gain array
+
+ movl 8(%ebp), %esi # input uint8 pixel array
+ movl 12(%ebp), %edi # output sint16 pixel array
+ movl 16(%ebp), %ecx # nb of elements div 8
+
+
+ .align 16
+ .loop_unpack_y:
+
+ movq (%esi), %mm5 # load 8 pixels from memory
+ pxor %mm0, %mm0 # zero mm0 - mm3
+ pxor %mm1, %mm1
+ punpcklbw %mm5, %mm0 # unpack 1st 4 pixels
+ punpckhbw %mm5, %mm1 # unpack 2nd 4 pixles
+ psrlw $0x1, %mm0 # shift right to clear sign bit 9.7
+ psrlw $0x1, %mm1
+# pmulhw %mm7, %mm0 # apply gain
+# pmulhw %mm7, %mm1
+# paddsw %mm0, %mm0 # correct factor 2
+# paddsw %mm1, %mm1
+ movq %mm0, (%edi) # store
+ movq %mm1, 8(%edi)
+
+ addl $8, %esi # increment source pointer
+ addl $16, %edi # increment dest pointer
+ decl %ecx
+ jnz .loop_unpack_y # loop
+
+ emms
+
+ pop %edi
+ pop %esi
+ leave
+ ret
+
+.globl pixel_unpack_u8s16_uv
+.type pixel_unpack_u8s16_uv,@function
+pixel_unpack_u8s16_uv:
+ pushl %ebp
+ movl %esp, %ebp
+ push %esi
+ push %edi
+
+# movl 20(%ebp), %edi # int16[4] array of gains
+# movq (%edi), %mm7 # get gain array
+
+ movl 8(%ebp), %esi # input uint8 pixel array
+ movl 12(%ebp), %edi # output sint16 pixel array
+ movl 16(%ebp), %ecx # nb of elements div 8
+
+ pcmpeqw %mm6, %mm6
+ psllw $15, %mm6
+
+ .align 16
+ .loop_unpack_uv:
+
+ movq (%esi), %mm5 # load 8 pixels from memory
+ pxor %mm0, %mm0 # zero mm0 - mm3
+ pxor %mm1, %mm1
+ punpcklbw %mm5, %mm0 # unpack 1st 4 pixels
+ punpckhbw %mm5, %mm1 # unpack 2nd 4 pixles
+ pxor %mm6, %mm0 # flip sign bit (Cr and Cb are ofset by 128)
+ pxor %mm6, %mm1
+# pmulhw %mm7, %mm0 # apply gain
+# pmulhw %mm7, %mm1
+# paddsw %mm0, %mm0 # correct factor 2
+# paddsw %mm1, %mm1
+ movq %mm0, (%edi) # store
+ movq %mm1, 8(%edi)
+
+ addl $8, %esi # increment source pointer
+ addl $16, %edi # increment dest pointer
+ decl %ecx
+ jnz .loop_unpack_uv # loop
+
+ emms
+
+ pop %edi
+ pop %esi
+ leave
+ ret
+
diff --git a/system/pdp.c b/system/pdp.c
new file mode 100644
index 0000000..8651971
--- /dev/null
+++ b/system/pdp.c
@@ -0,0 +1,115 @@
+/*
+ * Pure Data Packet system implementation: setup code
+ * Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+
+#include "pdp.h"
+#include <stdio.h>
+
+/* all symbols are C style */
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+
+
+/* module setup declarations (all C-style) */
+
+/* pdp system / internal stuff */
+void pdp_packet_setup(void);
+void pdp_ut_setup(void);
+void pdp_queue_setup(void);
+void pdp_control_setup(void);
+
+/* pdp modules */
+void pdp_xv_setup(void);
+void pdp_add_setup(void);
+void pdp_mul_setup(void);
+void pdp_mix_setup(void);
+void pdp_randmix_setup(void);
+void pdp_qt_setup(void);
+void pdp_v4l_setup(void);
+void pdp_reg_setup(void);
+void pdp_conv_setup(void);
+void pdp_bq_setup(void);
+void pdp_affine_setup(void);
+void pdp_del_setup(void);
+void pdp_snap_setup(void);
+void pdp_trigger_setup(void);
+void pdp_route_setup(void);
+void pdp_noise_setup(void);
+void pdp_gradient_setup(void);
+void pdp_gain_setup(void);
+void pdp_grey_setup(void);
+void pdp_chrot_setup(void);
+void pdp_scope_setup(void);
+void pdp_scale_setup(void);
+void pdp_zoom_setup(void);
+
+
+/* library setup routine */
+void pdp_setup(void){
+
+ /* babble */
+ post ("PDP: pure data packet");
+
+#ifdef PDP_VERSION
+ fprintf(stderr, "PDP: version " PDP_VERSION "\n");
+#endif
+
+
+ /* setup pdp system */
+ pdp_packet_setup();
+ pdp_queue_setup();
+ pdp_control_setup();
+
+ /* setup utility toolkit */
+ pdp_ut_setup();
+
+ /* setup pdp modules*/
+ pdp_add_setup();
+ pdp_mul_setup();
+ pdp_mix_setup();
+ pdp_randmix_setup();
+ pdp_xv_setup();
+ pdp_qt_setup();
+ pdp_v4l_setup();
+ pdp_reg_setup();
+ pdp_conv_setup();
+ pdp_bq_setup();
+ pdp_affine_setup();
+ pdp_del_setup();
+ pdp_snap_setup();
+ pdp_trigger_setup();
+ pdp_route_setup();
+ pdp_noise_setup();
+ pdp_gradient_setup();
+ pdp_gain_setup();
+ pdp_grey_setup();
+ pdp_chrot_setup();
+ pdp_scope_setup();
+ pdp_scale_setup();
+ pdp_zoom_setup();
+
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/system/pdp_comm.c b/system/pdp_comm.c
new file mode 100644
index 0000000..80cbaaa
--- /dev/null
+++ b/system/pdp_comm.c
@@ -0,0 +1,119 @@
+/*
+ * Pure Data Packet system implementation.
+ * Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+/* this file contains misc communication methods */
+
+
+#include "pdp.h"
+#include "pdp_internals.h"
+#include <stdio.h>
+
+/* all symbols are C style */
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+
+/************** packet management and communication convenience functions ************/
+
+/* send a packet to an outlet */
+void outlet_pdp(t_outlet *out, int packetid)
+{
+ t_atom atom[2];
+ t_symbol *s = gensym("pdp");
+ t_symbol *rro = gensym("register_ro");
+ t_symbol *rrw = gensym("register_rw");
+ t_symbol *proc = gensym("process");
+
+
+ SETFLOAT(atom+1, (float)packetid);
+
+ SETSYMBOL(atom+0, rro);
+ outlet_anything(out, s, 2, atom);
+
+ SETSYMBOL(atom+0, rrw);
+ outlet_anything(out, s, 2, atom);
+
+ /* this is not really necessary, it can be triggered by the rw message */
+ SETSYMBOL(atom+0, proc);
+ outlet_anything(out, s, 1, atom);
+
+}
+
+
+/* unregister a packet and send it to an outlet */
+void
+pdp_pass_if_valid(t_outlet *outlet, int *packet)
+{
+ if (-1 != *packet){
+ pdp_packet_mark_unused(*packet);
+ outlet_pdp(outlet, *packet);
+ *packet = -1;
+ }
+}
+
+void
+pdp_replace_if_valid(int *dpacket, int *spacket)
+{
+ if (-1 != *spacket){
+ pdp_packet_mark_unused(*dpacket);
+ *dpacket = *spacket;
+ *spacket = -1;
+ }
+
+}
+
+
+int
+pdp_packet_copy_ro_or_drop(int *dpacket, int spacket)
+{
+ int drop = 0;
+ if (*dpacket == -1) *dpacket = pdp_packet_copy_ro(spacket);
+ else {
+ /* send a notification there is a dropped packet */
+ pdp_control_notify_drop(spacket);
+ drop = 1;
+ }
+ return drop;
+}
+
+
+int
+pdp_packet_copy_rw_or_drop(int *dpacket, int spacket)
+{
+ int drop = 0;
+ if (*dpacket == -1) *dpacket = pdp_packet_copy_rw(spacket);
+ else {
+ /* send a notification there is a dropped packet */
+ pdp_control_notify_drop(spacket);
+ drop = 1;
+ }
+ return drop;
+}
+
+
+
+
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/system/pdp_control.c b/system/pdp_control.c
new file mode 100644
index 0000000..a7ee0c7
--- /dev/null
+++ b/system/pdp_control.c
@@ -0,0 +1,162 @@
+/*
+ * Pure Data Packet system implementation: control object
+ * Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+
+/* this is an actual pd class that is used for communication with the
+ pdp framework */
+
+#include "pdp.h"
+#include "pdp_internals.h"
+#include <stdio.h>
+
+/* all symbols are C style */
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+
+
+static long dropped_packets;
+
+static t_class* pdp_control_class;
+
+
+/* pdp control instance data */
+
+struct pdp_control_struct;
+typedef struct pdp_control_struct
+{
+ t_object x_obj;
+ t_outlet *x_outlet0;
+ struct pdp_control_struct *x_next;
+
+} t_pdp_control;
+
+typedef void (t_pdp_control_method_notify)(t_pdp_control *x);
+
+
+static t_pdp_control *pdp_control_list;
+
+static void pdp_control_info(t_pdp_control *x)
+{
+}
+
+static void pdp_control_thread(t_pdp_control *x, t_floatarg f)
+{
+ int t = (int)f;
+
+ if (t){
+ post("pdp_control: switching on processing in thread");
+ pdp_queue_use_thread(1);
+ }
+ else {
+ post("pdp_control: switching off processing in thread");
+ pdp_queue_use_thread(0);
+ }
+}
+
+
+static void pdp_control_send_drop_message(t_pdp_control *x)
+{
+ t_atom atom[1];
+ t_symbol *s = gensym("pdp_drop");
+
+ SETFLOAT(atom+0, (float)dropped_packets);
+ outlet_anything(x->x_outlet0, s, 1, atom);
+}
+
+
+static void pdp_control_free(t_pdp_control *x)
+{
+ /* remove from linked list */
+ t_pdp_control *curr = pdp_control_list;
+ if (pdp_control_list == x) pdp_control_list = x->x_next;
+ else while (curr){
+ if (curr->x_next == x) {
+ curr->x_next = x->x_next;
+ break;
+ }
+ else {
+ curr = curr->x_next;
+ }
+
+ }
+}
+
+
+static void *pdp_control_new(void)
+{
+ t_pdp_control *x = (t_pdp_control *)pd_new(pdp_control_class);
+ x->x_outlet0 = outlet_new(&x->x_obj, &s_anything);
+
+ /* add to list */
+ x->x_next = pdp_control_list;
+ pdp_control_list = x;
+ return x;
+}
+
+/************************* class methods ***************************************/
+
+
+void pdp_control_setup(void)
+{
+
+ pdp_control_list = 0;
+ dropped_packets = 0;
+
+ /* setup pd class data */
+ pdp_control_class = class_new(gensym("pdp_control"), (t_newmethod)pdp_control_new,
+ (t_method)pdp_control_free, sizeof(t_pdp_control), 0, A_NULL);
+
+
+ class_addmethod(pdp_control_class, (t_method)pdp_control_info, gensym("info"), A_NULL);
+ class_addmethod(pdp_control_class, (t_method)pdp_control_thread, gensym("thread"), A_DEFFLOAT, A_NULL);
+}
+
+
+
+void pdp_control_notify_broadcast(t_pdp_control_method_notify *notify)
+{
+ t_pdp_control *curr = pdp_control_list;
+ while (curr){
+ (*notify)(curr);
+ curr = curr->x_next;
+ }
+}
+
+
+
+/************************* notify class methods *************************/
+
+void pdp_control_notify_drop(int packet)
+{
+ dropped_packets++;
+
+ /* send drop notify to controller class instances */
+ pdp_control_notify_broadcast(pdp_control_send_drop_message);
+ //post("dropped packet");
+}
+
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/system/pdp_imageproc_mmx.c b/system/pdp_imageproc_mmx.c
new file mode 100644
index 0000000..2f32c3f
--- /dev/null
+++ b/system/pdp_imageproc_mmx.c
@@ -0,0 +1,319 @@
+/*
+ * Pure Data Packet. c wrapper for mmx image processing routines.
+ * Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+
+/* this is a c wrapper around platform specific (mmx) code */
+#include <stdlib.h>
+#include "pdp_mmx.h"
+#include "pdp_imageproc.h"
+
+// utility stuff
+inline static s16 float2fixed(float f)
+{
+ if (f > 1) f = 1;
+ if (f < -1) f = -1;
+ f *= 0x7fff;
+ return (s16)f;
+}
+
+inline static void setvec(s16 *v, float f)
+{
+ s16 a = float2fixed(f);
+ v[0] = a;
+ v[1] = a;
+ v[2] = a;
+ v[3] = a;
+}
+
+
+
+// add two images
+void pdp_imageproc_add_process(s16 *image, s16 *image2, u32 width, u32 height)
+{
+ unsigned int totalnbpixels = width * height;
+ pixel_add_s16(image, image2, totalnbpixels>>2);
+}
+
+// mul two images
+void pdp_imageproc_mul_process(s16 *image, s16 *image2, u32 width, u32 height)
+{
+ unsigned int totalnbpixels = width * height;
+ pixel_mul_s16(image, image2, totalnbpixels>>2);
+}
+
+// mix 2 images
+void *pdp_imageproc_mix_new(void){return malloc(8*sizeof(s16));}
+void pdp_imageproc_mix_delete(void *x) {free (x);}
+void pdp_imageproc_mix_setleftgain(void *x, float gain){setvec((s16 *)x, gain);}
+void pdp_imageproc_mix_setrightgain(void *x, float gain){setvec((s16 *)x + 4, gain);}
+void pdp_imageproc_mix_process(void *x, s16 *image, s16 *image2, u32 width, u32 height)
+{
+ s16 *d = (s16 *)x;
+ unsigned int totalnbpixels = width * height;
+ pixel_mix_s16(image, image2, totalnbpixels>>2, d, d+4);
+}
+
+
+// random mix 2 images
+void *pdp_imageproc_randmix_new(void){return malloc(8*sizeof(s16));}
+void pdp_imageproc_randmix_delete(void *x) {free (x);}
+void pdp_imageproc_randmix_setthreshold(void *x, float threshold){setvec((s16 *)x, 2*threshold-1);}
+void pdp_imageproc_randmix_setseed(void *x, float seed)
+{
+ s16 *d = (s16 *)x;
+ srandom((u32)seed);
+ d[4] = (s16)random();
+ d[5] = (s16)random();
+ d[6] = (s16)random();
+ d[7] = (s16)random();
+
+}
+void pdp_imageproc_randmix_process(void *x, s16 *image, s16 *image2, u32 width, u32 height)
+{
+ s16 *d = (s16 *)x;
+ unsigned int totalnbpixels = width * height;
+ pixel_randmix_s16(image, image2, totalnbpixels>>2, d+4, d);
+}
+
+// affine transformation (applies gain + adds offset)
+void *pdp_imageproc_affine_new(void){return malloc(8*sizeof(s16));}
+void pdp_imageproc_affine_delete(void *x){free(x);}
+void pdp_imageproc_affine_setgain(void *x, float gain){setvec((s16 *)x, gain);}
+void pdp_imageproc_affine_setoffset(void *x, float offset){setvec((s16 *)x+4, offset);}
+void pdp_imageproc_affine_process(void *x, s16 *image, u32 width, u32 height)
+{
+ s16 *d = (s16 *)x;
+ pixel_affine_s16(image, (width*height)>>2, d, d+4);
+}
+
+// 3x1 or 1x3 in place convolution
+// orientation
+void *pdp_imageproc_conv_new(void){return(malloc(16*sizeof(s16)));}
+void pdp_imageproc_conv_delete(void *x){free(x);}
+void pdp_imageproc_conv_setmin1(void *x, float val){setvec((s16 *)x, val);}
+void pdp_imageproc_conv_setzero(void *x, float val){setvec((s16 *)x+4, val);}
+void pdp_imageproc_conv_setplus1(void *x, float val){setvec((s16 *)x+8, val);}
+void pdp_imageproc_conv_setbordercolor(void *x, float val){setvec((s16 *)x+12, val);}
+void pdp_imageproc_conv_process(void *x, s16 *image, u32 width, u32 height, u32 orientation, u32 nbp)
+{
+ s16 *d = (s16 *)x;
+ u32 i,j;
+
+ if (orientation == PDP_IMAGEPROC_CONV_HORIZONTAL)
+ {
+ for(i=0; i<width*height; i+=width)
+ for (j=0; j<nbp; j++)
+ pixel_conv_hor_s16(image+i, width>>2, d+12, d);
+ }
+
+ else
+ {
+ for (j=0; j<nbp; j++)
+ for(i=0; i<width; i +=4) pixel_conv_ver_s16(image+i, height, width, d+12, d);
+ }
+
+
+
+}
+
+// apply a gain to an image
+void *pdp_imageproc_gain_new(void){return(malloc(8*sizeof(s16)));}
+void pdp_imageproc_gain_delete(void *x){free(x);}
+void pdp_imageproc_gain_setgain(void *x, float gain)
+{
+ /* convert float to s16 + shift */
+ s16 *d = (s16 *)x;
+ s16 g;
+ int i;
+ float sign;
+ int shift = 0;
+
+ sign = (gain < 0) ? -1 : 1;
+ gain *= sign;
+
+ /* max shift = 16 */
+ for(i=0; i<=16; i++){
+ if (gain < 0x4000){
+ gain *= 2;
+ shift++;
+ }
+ else break;
+ }
+
+ gain *= sign;
+ g = (s16) gain;
+
+ //g = 0x4000;
+ //shift = 14;
+
+ d[0]=g;
+ d[1]=g;
+ d[2]=g;
+ d[3]=g;
+ d[4]=(s16)shift;
+ d[5]=0;
+ d[6]=0;
+ d[7]=0;
+}
+void pdp_imageproc_gain_process(void *x, s16 *image, u32 width, u32 height)
+{
+ s16 *d = (s16 *)x;
+ pixel_gain_s16(image, (width*height)>>2, d, (u64 *)(d+4));
+}
+
+// colour rotation for 2 colour planes
+void *pdp_imageproc_crot2d_new(void){return malloc(16*sizeof(s16));}
+void pdp_imageproc_crot2d_delete(void *x){free(x);}
+void pdp_imageproc_crot2d_setmatrix(void *x, float *matrix)
+{
+ s16 *d = (s16 *)x;
+ setvec(d, matrix[0]);
+ setvec(d+4, matrix[1]);
+ setvec(d+8, matrix[2]);
+ setvec(d+12, matrix[3]);
+}
+void pdp_imageproc_crot2d_process(void *x, s16 *image, u32 width, u32 height)
+{
+ s16 *d = (s16 *)x;
+ pixel_crot2d_s16(image, width*height >> 2, d);
+}
+
+// biquad and biquad time
+void *pdp_imageproc_bq_new(void){return malloc((5+2+2)*4*sizeof(s16));}//5xcoef, 2xstate, 2xsavestate
+void pdp_imageproc_bq_delete(void *x){free(x);}
+void pdp_imageproc_bq_setcoef(void *x, float *coef) // a0,-a1,-a2,b0,b1,b2,u0,u1
+{
+ s16 *d = (s16 *)x;
+ float ia0 = 1.0f / coef[0];
+
+ /* all coefs are s1.14 fixed point */
+ /* representing values -2 < x < 2 */
+ /* so scale down before using the ordinary s0.15 float->fixed routine */
+
+ ia0 *= 0.5f;
+
+ // coef
+ setvec(d, ia0*coef[1]);
+ setvec(d+4, ia0*coef[2]);
+ setvec(d+8, ia0*coef[3]);
+ setvec(d+12, ia0*coef[4]);
+ setvec(d+16, ia0*coef[5]);
+
+ // state to reset too
+ setvec(d+28, coef[6]);
+ setvec(d+32, coef[7]);
+
+}
+void pdp_imageproc_bqt_process(void *x, s16 *image, s16 *state0, s16 *state1, u32 width, u32 height)
+{
+ s16 *d = (s16 *)x;
+ pixel_biquad_time_s16(image, state0, state1, d, (width*height)>>2);
+}
+
+void pdp_imageproc_bq_process(void *x, s16 *image, u32 width, u32 height, u32 direction, u32 nbp)
+{
+ s16 *d = (s16 *)x;
+ unsigned int i,j;
+
+
+
+ /* VERTICAL */
+
+ if ((direction & PDP_IMAGEPROC_BIQUAD_TOP2BOTTOM)
+ && (direction & PDP_IMAGEPROC_BIQUAD_BOTTOM2TOP)){
+
+ for(i=0; i<width; i +=4){
+ for (j=0; j<nbp; j++){
+ pixel_biquad_vertb_s16(image+i, height>>2, width, d, d + (5*4));
+ pixel_biquad_verbt_s16(image+i, height>>2, width, d, d + (5*4));
+ }
+ }
+ }
+
+ else if (direction & PDP_IMAGEPROC_BIQUAD_TOP2BOTTOM){
+ for(i=0; i<width; i +=4){
+ for (j=0; j<nbp; j++){
+ pixel_biquad_vertb_s16(image+i, height>>2, width, d, d + (5*4));
+ }
+ }
+ }
+
+ else if (direction & PDP_IMAGEPROC_BIQUAD_BOTTOM2TOP){
+ for(i=0; i<width; i +=4){
+ for (j=0; j<nbp; j++){
+ pixel_biquad_verbt_s16(image+i, height>>2, width, d, d + (5*4));
+ }
+ }
+ }
+
+ /* HORIZONTAL */
+
+ if ((direction & PDP_IMAGEPROC_BIQUAD_LEFT2RIGHT)
+ && (direction & PDP_IMAGEPROC_BIQUAD_RIGHT2LEFT)){
+
+ for(i=0; i<(width*height); i +=(width<<2)){
+ for (j=0; j<nbp; j++){
+ pixel_biquad_horlr_s16(image+i, width>>2, width, d, d + (5*4));
+ pixel_biquad_horrl_s16(image+i, width>>2, width, d, d + (5*4));
+ }
+ }
+ }
+
+ else if (direction & PDP_IMAGEPROC_BIQUAD_LEFT2RIGHT){
+ for(i=0; i<(width*height); i +=(width<<2)){
+ for (j=0; j<nbp; j++){
+ pixel_biquad_horlr_s16(image+i, width>>2, width, d, d + (5*4));
+ }
+ }
+ }
+
+ else if (direction & PDP_IMAGEPROC_BIQUAD_RIGHT2LEFT){
+ for(i=0; i<(width*height); i +=(width<<2)){
+ for (j=0; j<nbp; j++){
+ pixel_biquad_horrl_s16(image+i, width>>2, width, d, d + (5*4));
+ }
+ }
+ }
+
+}
+
+// produce a random image
+// note: random number generator can be platform specific
+// however, it should be seeded. (same seed produces the same result)
+void *pdp_imageproc_random_new(void){return malloc(4*sizeof(s16));}
+void pdp_imageproc_random_delete(void *x){free(x);}
+void pdp_imageproc_random_setseed(void *x, float seed)
+{
+ s16 *d = (s16 *)x;
+ srandom((u32)seed);
+ d[0] = (s16)random();
+ d[1] = (s16)random();
+ d[2] = (s16)random();
+ d[3] = (s16)random();
+
+}
+void pdp_imageproc_random_process(void *x, s16 *image, u32 width, u32 height)
+{
+ s16 *d = (s16 *)x;
+ unsigned int totalnbpixels = width * height;
+ pixel_rand_s16(image, totalnbpixels>>2, d);
+}
+
+
diff --git a/system/pdp_imageproc_portable.c b/system/pdp_imageproc_portable.c
new file mode 100644
index 0000000..60062d6
--- /dev/null
+++ b/system/pdp_imageproc_portable.c
@@ -0,0 +1,492 @@
+/*
+ * Pure Data Packet. portable image processing routines.
+ * Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+
+
+#include <stdlib.h>
+#include "pdp_imageproc.h"
+
+// utility stuff
+inline static s32 float2fixed(float f)
+{
+ if (f > 1) f = 1;
+ if (f < -1) f = -1;
+ f *= 0x7fff;
+ return (s32)f;
+}
+
+
+
+#define CLAMP16(x) (((x) > 0x7fff) ? 0x7fff : (((x) < -0x7fff) ? -0x7fff : (x)))
+
+// add two images
+void pdp_imageproc_add_process(s16 *image, s16 *image2, u32 width, u32 height)
+{
+ int a, b;
+ unsigned int i;
+ for (i=0; i<width*height; i++){
+ a = (int)image[i];
+ b = (int)image2[i];
+ image[i] = (s16)(CLAMP16(a+b));
+ }
+
+}
+
+// mul two images
+void pdp_imageproc_mul_process(s16 *image, s16 *image2, u32 width, u32 height)
+{
+ int a, b;
+ unsigned int i;
+ for (i=0; i<width*height; i++){
+ a = (int)image[i];
+ b = (int)image2[i];
+ image[i] = (s16)((a*b)>>15);
+ }
+
+}
+
+// mix 2 images
+void *pdp_imageproc_mix_new(void){return malloc(2*sizeof(s32));}
+void pdp_imageproc_mix_delete(void *x) {free (x);}
+void pdp_imageproc_mix_setleftgain(void *x, float gain)
+{
+ s32 *d = (s32 *)x;
+ d[0] = float2fixed(gain);
+}
+void pdp_imageproc_mix_setrightgain(void *x, float gain)
+{
+ s32 *d = (s32 *)x;
+ d[1] = float2fixed(gain);
+}
+void pdp_imageproc_mix_process(void *x, s16 *image, s16 *image2, u32 width, u32 height)
+{
+ s32 *d = (s32 *)x;
+ u32 i;
+ s32 a,b;
+
+ for(i=0; i<width*height; i++){
+ a = (s32)image[i];
+ b = (s32)image2[i];
+ a = (a*d[0] + b*d[1]) >> 15;
+ image[i] = (s16)CLAMP16(a);
+ }
+
+}
+
+
+// random mix 2 images
+void *pdp_imageproc_randmix_new(void){return malloc(2*sizeof(s32));;}
+void pdp_imageproc_randmix_delete(void *x) {free(x);}
+void pdp_imageproc_randmix_setthreshold(void *x, float threshold)
+{
+ s32 *d = (s32 *)x;
+ if (threshold > 1.0f) threshold = 1.0f;
+ if (threshold < 0.0f) threshold = 0.0f;
+ d[0] = float2fixed(threshold);
+}
+void pdp_imageproc_randmix_setseed(void *x, float seed)
+{
+ s32 *d = (s32 *)x;
+ d[1] = float2fixed(seed);
+}
+void pdp_imageproc_randmix_process(void *x, s16 *image, s16 *image2, u32 width, u32 height)
+{
+ s32 *d = (s32 *)x;
+ u32 i;
+ s16 r;
+ srandom((u32)d[1]);
+
+
+ for(i=0; i<width*height; i++){
+ // get a random val between 0 and 0x7fff
+ r = (s16)(random() & 0x7fff);
+ if (r < d[0]) image[i] = image2[i];
+ }
+}
+
+// affine transformation (applies gain + adds offset)
+void *pdp_imageproc_affine_new(void){return malloc(2*sizeof(s32));}
+void pdp_imageproc_affine_delete(void *x){free(x);}
+void pdp_imageproc_affine_setgain(void *x, float gain)
+{
+ s32 *d = (s32 *)x;
+ d[0] = float2fixed(gain);
+}
+
+void pdp_imageproc_affine_setoffset(void *x, float offset)
+{
+ s32 *d = (s32 *)x;
+ d[1] = float2fixed(offset);
+}
+void pdp_imageproc_affine_process(void *x, s16 *image, u32 width, u32 height)
+{
+ s32 *d = (s32 *)x;
+ u32 i;
+ s32 a;
+
+ for(i=0; i<width*height; i++){
+ a = (s32)image[i];
+ a = (a*d[0]) >> 15;
+ a += d[1];
+ image[i] = (s16)CLAMP16(a);
+ }
+}
+
+// 3x1 or 1x3 in place convolution
+// orientation
+void *pdp_imageproc_conv_new(void){return(malloc(4*sizeof(s32)));}
+void pdp_imageproc_conv_delete(void *x){free(x);}
+void pdp_imageproc_conv_setmin1(void *x, float val)
+{
+ s32 *d = (s32 *)x;
+ d[0] = float2fixed(val);
+}
+void pdp_imageproc_conv_setzero(void *x, float val)
+{
+ s32 *d = (s32 *)x;
+ d[1] = float2fixed(val);
+}
+void pdp_imageproc_conv_setplus1(void *x, float val)
+{
+ s32 *d = (s32 *)x;
+ d[2] = float2fixed(val);
+}
+void pdp_imageproc_conv_setbordercolor(void *x, float val)
+{
+ s32 *d = (s32 *)x;
+ d[3] = float2fixed(val);
+}
+
+static inline void pdp_imageproc_conv_scanline(void *x, s16 *data, u32 count, s32 stride)
+{
+ s32 *d = (s32 *)x;
+ s32 a,b,c,r;
+ u32 i;
+
+ a = d[3]; //border
+ b = data[0];
+ c = data[stride];
+
+ for(i = 0; i < count-2; i++){
+ r = a*d[0] + b*d[1] + c*d[2];
+ a = data[0];
+ b = data[stride];
+ c = data[stride<<1];
+ data[0] = (s16)CLAMP16(r>>15);
+ data += stride;
+ }
+ r = a*d[0] + b*d[1] + c*d[2];
+ a = data[0];
+ b = data[stride];
+ c = d[3]; //border
+ data[0] = (s16)CLAMP16(r>>15);
+ r = a*d[0] + b*d[1] + c*d[2];
+ data[stride] = (s16)CLAMP16(r>>15);
+
+}
+
+void pdp_imageproc_conv_process(void *x, s16 *image, u32 width, u32 height, u32 orientation, u32 nbp)
+{
+ s32 *d = (s32 *)x;
+ u32 i, j;
+
+ if (orientation == PDP_IMAGEPROC_CONV_HORIZONTAL){
+ for(i=0; i<width*height; i+=width)
+ for(j=0; j<nbp; j++)
+ pdp_imageproc_conv_scanline(x, image+i, width, 1);
+
+ }
+
+ if (orientation == PDP_IMAGEPROC_CONV_VERTICAL){
+ for(i=0; i<width; i++)
+ for(j=0; j<nbp; j++)
+ pdp_imageproc_conv_scanline(x, image+i, height, width);
+
+ }
+
+
+
+
+}
+
+// apply a gain to an image
+void *pdp_imageproc_gain_new(void){return(malloc(2*sizeof(s32)));}
+void pdp_imageproc_gain_delete(void *x){free(x);}
+void pdp_imageproc_gain_setgain(void *x, float gain)
+{
+ /* convert float to s16 + shift */
+ s32 *d = (s32 *)x;
+ s32 g;
+ int i;
+ float sign;
+ s32 shift = 0;
+
+ sign = (gain < 0) ? -1 : 1;
+ gain *= sign;
+
+ /* max shift = 16 */
+ for(i=0; i<=16; i++){
+ if (gain < 0x4000){
+ gain *= 2;
+ shift++;
+ }
+ else break;
+ }
+
+ gain *= sign;
+ g = (s32) gain;
+
+ //g = 0x4000;
+ //shift = 14;
+
+ d[0]=g;
+ d[1]=shift;
+}
+void pdp_imageproc_gain_process(void *x, s16 *image, u32 width, u32 height)
+{
+ s32 *d = (s32 *)x;
+ s32 a;
+ u32 i;
+ for (i=0; i<width*height; i++){
+ a = (s32)image[i];
+ image[i] = (s16)(CLAMP16((a * d[0]) >> d[1]));
+ }
+}
+
+// colour rotation for 2 colour planes
+void *pdp_imageproc_crot2d_new(void){return malloc(4*sizeof(s32));}
+void pdp_imageproc_crot2d_delete(void *x){free(x);}
+void pdp_imageproc_crot2d_setmatrix(void *x, float *matrix)
+{
+ s32 *d = (s32 *)x;
+ d[0] = float2fixed(matrix[0]);
+ d[1] = float2fixed(matrix[1]);
+ d[2] = float2fixed(matrix[2]);
+ d[3] = float2fixed(matrix[3]);
+
+}
+void pdp_imageproc_crot2d_process(void *x, s16 *image, u32 width, u32 height)
+{
+ s32 *d = (s32 *)x;
+ u32 i,j;
+ s32 a1,a2,c1,c2;
+
+ for(i=0, j=width*height; i<width*height; i++, j++){
+ c1 = (s32)image[i];
+ c2 = (s32)image[j];
+
+ a1 = d[0] * c1;
+ a2 = d[1] * c1;
+ a1+= d[2] * c2;
+ a2+= d[3] * c2;
+
+ a1 >>= 15;
+ a2 >>= 15;
+
+ image[i] = (s16)CLAMP16(a1);
+ image[j] = (s16)CLAMP16(a2);
+ }
+}
+
+// biquad and biquad time
+void *pdp_imageproc_bq_new(void){return malloc((5+2+2)*sizeof(s32));}//5xcoef, 2xstate, 2xsavestate
+void pdp_imageproc_bq_delete(void *x){free(x);}
+void pdp_imageproc_bq_setcoef(void *x, float *coef) // a0,-a1,-a2,b0,b1,b2,u0,u1
+{
+ s32 *d = (s32 *)x;
+ float ia0 = 1.0f / coef[0];
+
+ /* all coefs are s1.14 fixed point */
+ /* representing values -2 < x < 2 */
+ /* so scale down before using the ordinary s0.15 float->fixed routine */
+
+ ia0 *= 0.5f;
+
+ // coef
+ d[0] = float2fixed(ia0*coef[1]); // -a1
+ d[1] = float2fixed(ia0*coef[2]); // -a2
+ d[2] = float2fixed(ia0*coef[3]); // b0
+ d[3] = float2fixed(ia0*coef[4]); // b1
+ d[4] = float2fixed(ia0*coef[5]); // b2
+
+
+ // state to reset too
+ d[5] = float2fixed(coef[6]);
+ d[6] = float2fixed(coef[7]);
+
+}
+
+#define A1 d[0]
+#define A2 d[1]
+#define B0 d[2]
+#define B1 d[3]
+#define B2 d[4]
+/*
+ # DIRECT FORM II BIQUAD (from pixel_biquad_s16.s)
+ #
+ # y[k] = b0 * x[k] + u1[k-1]
+ # u1[k] = b1 * x[k] + u2[k-1] - a1 * y[k]
+ # u2[k] = b2 * x[k] - a2 * y[k]
+*/
+
+/* remark A1 and A2 are already negated) */
+
+
+static inline void pdp_imageproc_bq_scanline(void *x, s16 *data, u32 count, s32 stride)
+{
+
+ s32 *d = (s32 *)x;
+ s32 u1,u2, xx, yy;
+
+ u32 i;
+
+ u1 = d[7];
+ u2 = d[8];
+
+ for(i = 0; i < count; i++){
+
+ xx = (s32)data[0];
+
+ yy = ((B0 * xx)>>14) + u1;
+ u1 = ((B1 * xx)>>14) + u2 + ((A1 * yy)>>14);
+ u2 = ((B2 * xx)>>14) + ((A2 * yy)>>14);
+
+ data[0] = (s16)CLAMP16(yy);
+
+ data += stride;
+
+ }
+
+ d[7] = u1;
+ d[8] = u2;
+
+}
+
+void pdp_imageproc_bqt_process(void *x, s16 *image, s16 *state1, s16 *state2, u32 width, u32 height)
+{
+ s32 *d = (s32 *)x;
+ u32 i;
+ s32 u1, u2, xx, yy;
+
+ for (i=0; i<width*height; i++){
+
+ xx = (s32)image[i];
+ u1 = (s32)state1[i];
+ u2 = (s32)state2[i];
+
+ yy = ((B0 * xx)>>14) + u1;
+ u1 = ((B1 * xx)>>14) + u2 + ((A1 * yy)>>14);
+ u2 = ((B2 * xx)>>14) + ((A2 * yy)>>14);
+
+ image[i] = (s16)CLAMP16(yy);
+ state1[i] = (s16)CLAMP16(u1);
+ state2[i] = (s16)CLAMP16(u2);
+ }
+
+
+}
+
+void pdp_imageproc_bq_process(void *x, s16 *data, u32 width, u32 height, u32 direction, u32 nbp)
+{
+ s32 *d = (s32 *)x;
+ unsigned int i,j, offset;
+
+ /* VERTICAL */
+ offset = (height-1)*width;
+
+ if ((direction & PDP_IMAGEPROC_BIQUAD_TOP2BOTTOM)
+ && (direction & PDP_IMAGEPROC_BIQUAD_BOTTOM2TOP)){
+
+ for(i=0; i<width; i++){
+ for (j=0; j<nbp; j++){
+ pdp_imageproc_bq_scanline(x, data+i, height, width); //T->B
+ pdp_imageproc_bq_scanline(x, data+offset+i, height, -width); //B->T
+ }
+ }
+ }
+
+ else if (direction & PDP_IMAGEPROC_BIQUAD_TOP2BOTTOM){
+ for(i=0; i<width; i++){
+ for (j=0; j<nbp; j++){
+ pdp_imageproc_bq_scanline(x, data+i, height, width); //T->B
+ }
+ }
+ }
+
+ else if (direction & PDP_IMAGEPROC_BIQUAD_BOTTOM2TOP){
+ for(i=0; i<width; i++){
+ for (j=0; j<nbp; j++){
+ pdp_imageproc_bq_scanline(x, data+offset+i, height, -width); //B->T
+ }
+ }
+ }
+
+ /* HORIZONTAL */
+
+ offset = width-1;
+ if ((direction & PDP_IMAGEPROC_BIQUAD_LEFT2RIGHT)
+ && (direction & PDP_IMAGEPROC_BIQUAD_RIGHT2LEFT)){
+
+ for(i=0; i<(width*height); i += width){
+ for (j=0; j<nbp; j++){
+ pdp_imageproc_bq_scanline(x, data+i, width, 1); //L->R
+ pdp_imageproc_bq_scanline(x, data+offset+i, width, -1); //R->L
+ }
+ }
+ }
+
+ else if (direction & PDP_IMAGEPROC_BIQUAD_LEFT2RIGHT){
+ for(i=0; i<(width*height); i += width){
+ for (j=0; j<nbp; j++){
+ pdp_imageproc_bq_scanline(x, data+i, width, 1); //L->R
+ }
+ }
+ }
+
+ else if (direction & PDP_IMAGEPROC_BIQUAD_RIGHT2LEFT){
+ for(i=0; i<(width*height); i += width){
+ for (j=0; j<nbp; j++){
+ pdp_imageproc_bq_scanline(x, data+offset+i, width, -1); //R->L
+
+ }
+ }
+ }
+
+}
+
+// produce a random image
+// note: random number generator can be platform specific
+// however, it should be seeded. (same seed produces the same result)
+void *pdp_imageproc_random_new(void){return malloc(sizeof(s32));}
+void pdp_imageproc_random_delete(void *x){free(x);}
+void pdp_imageproc_random_setseed(void *x, float seed)
+{
+ s32 *d = (s32 *)x;
+ d[0] = float2fixed(seed);
+}
+void pdp_imageproc_random_process(void *x, s16 *image, u32 width, u32 height)
+{
+ s32 *d = (s32 *)x;
+ u32 i;
+ srandom((u32)d[0]);
+ for (i=0; i<width*height; i++) image[i] = (s16)(random() & 0xffff);
+
+}
+
diff --git a/system/pdp_llconv.c b/system/pdp_llconv.c
new file mode 100644
index 0000000..93d2934
--- /dev/null
+++ b/system/pdp_llconv.c
@@ -0,0 +1,293 @@
+/*
+ * Pure Data Packet system implementation. : low level format conversion code
+ * Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+/* this file contains low level image conversion code
+ nominated as "the ugliest part of pdp"
+ some code is mmx, most is not. */
+
+#include "pdp_llconv.h"
+#include "pdp_mmx.h"
+
+
+/* all symbols are C style */
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+
+#define CLAMP(x) (((x)<0) ? 0 : ((x>255)? 255 : (x)))
+#define CLAMP16(x) (((x)<-0x7fff) ? -0x7fff : ((x>0x7fff) ? 0x7fff : (x)))
+#define FP(x) ((int)(((float)(x)) * 256.0f))
+
+
+/* some prototypes for functions defined elsewhere */
+void llconv_yvu_planar_s16u8(short int *src, unsigned char *dst, unsigned int nbpixels);
+void llconv_yuv_planar_u8s16(unsigned char* source, short int *dest, int nbpixels);
+void llconv_grey_s16u8(short int *src, unsigned char *dst, unsigned int nbpixels);
+void llconv_yvu_planar_u8s16(unsigned char* source, short int *dest, int nbpixels);
+
+
+static inline int rgb2y(int r, int g, int b){return (FP(0.257) * r) + (FP(0.504) * g) + (FP(0.098) * b) + FP(16);}
+static inline int rgb2v(int r, int g, int b){return (FP(0.439) * r) - (FP(0.368) * g) - (FP(0.071) * b) + FP(128);}
+static inline int rgb2u(int r, int g, int b){return -(FP(0.148) * r) - (FP(0.291) * g) + (FP(0.439) * b) + FP(128);}
+
+
+/* "standard" 8 bit conversion routine */
+static void llconv_rgb2yvu(unsigned char* src, unsigned char* dst, int nbpixels)
+{
+ int r,g,b,y,v,u,i;
+ for (i=0; i<nbpixels; i++){
+ r = src[0];
+ g = src[1];
+ b = src[2];
+
+ y = rgb2y(r,g,b);
+ v = rgb2v(r,g,b);
+ u = rgb2u(r,g,b);
+
+ dst[0] = CLAMP(y>>8);
+ dst[1] = CLAMP(v>>8);
+ dst[2] = CLAMP(u>>8);
+
+ src += 3;
+ dst += 3;
+ }
+}
+
+
+
+/* 8 bit rgb to 16 bit planar subsampled yvu */
+static void llconv_rgb2yvu_planar16sub(unsigned char* src, short int* dst, int w, int h)
+{
+ int r,g,b,y,v,u,i,j,k;
+ int size = w*h;
+
+ int voffset = size;
+ int uoffset = size + (size>>2);
+
+
+ int loffset = w * 3;
+
+ k=0;
+ for (j=0; j<w*h; j+=(w<<1)){
+ k = 3 * j;
+ for (i=0; i<w; i+=2){
+
+
+ // well, this seems to work... strange though
+ b = src[k];
+ g = src[k+1];
+ r = src[k+2];
+
+ y = (FP(0.257) * r) + (FP(0.504) * g) + (FP(0.098) * b) + FP(16);
+ v = (FP(0.439) * r) - (FP(0.368) * g) - (FP(0.071) * b);
+ u = -(FP(0.148) * r) - (FP(0.291) * g) + (FP(0.439) * b);
+
+ dst[i+j] = CLAMP16(y >> 1);
+
+ b = src[k+3];
+ g = src[k+4];
+ r = src[k+5];
+
+ y = (FP(0.257) * r) + (FP(0.504) * g) + (FP(0.098) * b) + FP(16);
+ v += (FP(0.439) * r) - (FP(0.368) * g) - (FP(0.071) * b);
+ u += -(FP(0.148) * r) - (FP(0.291) * g) + (FP(0.439) * b);
+
+ dst[i+j+1] = CLAMP16(y >> 1);
+
+
+
+ b = src[loffset + k];
+ g = src[loffset + k+1];
+ r = src[loffset + k+2];
+
+ y = (FP(0.257) * r) + (FP(0.504) * g) + (FP(0.098) * b) + FP(16);
+ v = (FP(0.439) * r) - (FP(0.368) * g) - (FP(0.071) * b);
+ u = -(FP(0.148) * r) - (FP(0.291) * g) + (FP(0.439) * b);
+
+ dst[w+i+j] = CLAMP16(y >> 1);
+
+ b = src[loffset + k+3];
+ g = src[loffset + k+4];
+ r = src[loffset + k+5];
+
+ k += 6;
+
+ y = (FP(0.257) * r) + (FP(0.504) * g) + (FP(0.098) * b) + FP(16);
+ v += (FP(0.439) * r) - (FP(0.368) * g) - (FP(0.071) * b);
+ u += -(FP(0.148) * r) - (FP(0.291) * g) + (FP(0.439) * b);
+
+ dst[w+i+j+1] = CLAMP16(y >> 1);
+
+ dst[uoffset+ (i>>1) + (j>>2)] = (CLAMP16(u >> 1));
+ dst[voffset+ (i>>1) + (j>>2)] = (CLAMP16(v >> 1));
+ }
+ }
+}
+
+/* these seem to be pretty slow */
+
+static void llconv_yvu2rgb(unsigned char* src, unsigned char* dst, int nbpixels)
+{
+ int r,g,b,y,v,u,i;
+ for (i=0; i<nbpixels; i++){
+ y = src[0];
+ v = src[1];
+ u = src[2];
+
+
+ b = FP(1.164) * (y - 16) + FP(2.018) * (u - 128);
+ g = FP(1.164) * (y - 16) - FP(0.813) * (v - 128) - FP(0.391) * (u - 128);
+ r = FP(1.164) * (y - 16) + FP(1.596) * (v - 128);
+
+ dst[0] = CLAMP(r>>8);
+ dst[1] = CLAMP(g>>8);
+ dst[2] = CLAMP(b>>8);
+
+ src += 3;
+ dst += 3;
+ }
+}
+
+
+
+/* convert yvu to yuyv */
+static void llconv_yvu2yuyv(unsigned char *src, unsigned char *dst, unsigned int nbpixels)
+{
+ unsigned int y1, y2, u, v, i;
+
+ for (i = 0; i < nbpixels/2; i++){
+
+ y1 = src[0];
+ y2 = src[3];
+ v = (src[1] + src[4]) >> 1;
+ u = (src[2] + src[5]) >> 1;
+ dst[0] = y1;
+ dst[1] = u;
+ dst[2] = y2;
+ dst[3] = v;
+
+ src += 6;
+ dst += 4;
+
+ }
+
+}
+
+
+
+/* convert yuvu packed 8 bit unsigned to yv12 planar 16bit signed */
+static void llconv_yuyv_packed_u8s16(unsigned char* ucsource, short int *sidest, unsigned int w, unsigned int h)
+{
+ unsigned int i, j;
+ unsigned int *source = (unsigned int *)ucsource;
+
+ unsigned int *dest = (unsigned int *)sidest;
+ unsigned int uoffset = (w*h)>>1;
+ unsigned int voffset = (w*h + ((w*h) >> 2)) >> 1;
+
+ for(j=0; j < (h*w)>>1; j +=(w)){
+ for(i=0; i< (w>>1); i+=2){
+ unsigned int y,u,v;
+ unsigned int v00, v01, v10, v11;
+ v00 = source[i+j];
+ v01 = source[i+j+1];
+ v10 = source[i+j+(w>>1)];
+ v11 = source[i+j+(w>>1)+1];
+
+ // save luma
+ dest[i+j] = ((v00 & 0x00ff00ff) << 7);
+ dest[i+j+1] = ((v01 & 0x00ff00ff) << 7);
+ dest[i+j+(w>>1)] = ((v10 & 0x00ff00ff) << 7);
+ dest[i+j+(w>>1)+1] = ((v11 & 0x00ff00ff) << 7);
+
+ // compute chroma
+
+ // mask out luma & shift right
+ v00 = (v00 & 0xff00ff00)>>1;
+ v01 = (v01 & 0xff00ff00)>>1;
+ v10 = (v10 & 0xff00ff00)>>1;
+ v11 = (v11 & 0xff00ff00)>>1;
+
+ // average 2 scan lines
+ v00 += v10;
+ v01 += v11;
+
+ // combine
+ v = (v01 << 16) | (v00 & 0x0000ffff);
+ u = (v01 & 0xffff0000) | (v00 >> 16);
+
+ // flip sign bits for u,v
+ u ^= 0x80008000;
+ v ^= 0x80008000;
+
+ // save chroma
+ dest[uoffset + (i>>1) + (j>>2)] = u;
+ dest[voffset + (i>>1) + (j>>2)] = v;
+ }
+ }
+
+
+}
+
+#define CONVERT(x,y) ((x) + ((y)<<16))
+
+void pdp_llconv(void *src, int stype, void *dst, int dtype, int w, int h)
+{
+ int conversion = CONVERT(stype, dtype);
+ void *tmpbuf;
+
+ switch(CONVERT(stype, dtype)){
+
+ case CONVERT( RIF_YVU__P411_U8, RIF_YVU__P411_S16 ):
+ llconv_yvu_planar_u8s16((unsigned char*)src, (short int *)dst, w*h);
+ break;
+
+ case CONVERT( RIF_YUV__P411_U8, RIF_YVU__P411_S16 ):
+ llconv_yuv_planar_u8s16((unsigned char*)src, (short int *)dst, w*h);
+ break;
+
+ case CONVERT( RIF_YUYV_P____U8, RIF_YVU__P411_S16 ):
+ llconv_yuyv_packed_u8s16((unsigned char*)src, (short int *)dst, w, h);
+ break;
+
+ case CONVERT( RIF_RGB__P____U8, RIF_YVU__P411_S16 ):
+ llconv_rgb2yvu_planar16sub((unsigned char*) src, (short int*) dst, w, h);
+ break;
+
+ case CONVERT( RIF_YVU__P411_S16, RIF_YVU__P411_U8 ):
+ llconv_yvu_planar_s16u8((short int*)src, (unsigned char*)dst, w*h);
+ break;
+
+ case CONVERT( RIF_GREY______S16, RIF_GREY______U8 ):
+ llconv_grey_s16u8((short int*)src, (unsigned char*)dst, w*h);
+ break;
+ default:
+ post("pdp_llconv: WARNING: no conversion routine defined for (%d)->(%d)", stype, dtype);
+
+ }
+
+}
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/system/pdp_llconv_mmx.c b/system/pdp_llconv_mmx.c
new file mode 100644
index 0000000..8070bac
--- /dev/null
+++ b/system/pdp_llconv_mmx.c
@@ -0,0 +1,55 @@
+
+/*
+ * Pure Data Packet system implementation. : wrapper for mmx low level format conversion code
+ * Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+
+#include "pdp_mmx.h"
+
+
+
+/* convert greyscale 8 bit unsigned to 16bit signed */
+void llconv_grey_s16u8(short int *src, unsigned char *dst, unsigned int nbpixels)
+{
+ pixel_pack_s16u8_y(src, dst, nbpixels>>3);
+}
+
+/* convert yvu planar 411 16 bit signed to 8 bit unsigned */
+void llconv_yvu_planar_s16u8(short int *src, unsigned char *dst, unsigned int nbpixels)
+{
+ pixel_pack_s16u8_y(src, dst, nbpixels>>3);
+ pixel_pack_s16u8_uv(src + nbpixels, dst + nbpixels, nbpixels>>4);
+}
+
+
+/* convert yvu planar 411 8 bit unsigned to yv12 planar 16bit signed */
+void llconv_yvu_planar_u8s16(unsigned char* source, short int *dest, int nbpixels)
+{
+ pixel_unpack_u8s16_y(source, dest, nbpixels>>3);
+ pixel_unpack_u8s16_uv(&source[nbpixels], &dest[nbpixels], nbpixels>>4);
+}
+
+/* convert yuv planar 411 8 bit unsigned to yv12 planar 16bit signed */
+void llconv_yuv_planar_u8s16(unsigned char* source, short int *dest, int nbpixels)
+{
+ pixel_unpack_u8s16_y(source, dest, nbpixels>>3);
+ pixel_unpack_u8s16_uv(&source[nbpixels], &dest[nbpixels + (nbpixels>>2)], nbpixels>>5);
+ pixel_unpack_u8s16_uv(&source[nbpixels + (nbpixels>>2)], &dest[nbpixels], nbpixels>>5);
+}
+
diff --git a/system/pdp_llconv_portable.c b/system/pdp_llconv_portable.c
new file mode 100644
index 0000000..de65ef5
--- /dev/null
+++ b/system/pdp_llconv_portable.c
@@ -0,0 +1,81 @@
+
+/*
+ * Pure Data Packet system implementation. : portable low level format conversion code
+ * Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define CLAMP(x) (((x)<0) ? 0 : ((x>255)? 255 : (x)))
+#define FP(x) ((int)(((float)(x)) * 256.0f))
+
+void pixel_unpack_portable_u8s16_y(unsigned char *src ,short int *dst, unsigned int nbpixels)
+{
+ unsigned int i;
+ for (i=0; i<nbpixels; i++) dst[i] = ((short int)(src[i])) << 7;
+}
+
+void pixel_unpack_portable_u8s16_uv(unsigned char *src ,short int *dst, unsigned int nbpixels)
+{
+ unsigned int i;
+ for (i=0; i<nbpixels; i++) dst[i] = (((short int)(src[i])) << 8) ^ 0x8000;
+}
+
+
+void pixel_pack_portable_s16u8_y(short int *src, unsigned char *dst, unsigned int nbpixels)
+{
+ unsigned int i;
+ for (i=0; i<nbpixels; i++) dst[i] = (unsigned char)(CLAMP(src[i]>>7));
+}
+
+void pixel_pack_portable_s16u8_uv(short int *src, unsigned char *dst, unsigned int nbpixels)
+{
+ unsigned int i;
+ for (i=0; i<nbpixels; i++) dst[i] = (unsigned char)((src[i]^0x8000)>>8);
+}
+
+
+/* convert greyscale 8 bit unsigned to 16bit signed */
+void llconv_grey_s16u8(short int *src, unsigned char *dst, unsigned int nbpixels)
+{
+ pixel_pack_portable_s16u8_y(src, dst, nbpixels);
+}
+
+/* convert yvu planar 411 16 bit signed to 8 bit unsigned */
+void llconv_yvu_planar_s16u8(short int *src, unsigned char *dst, unsigned int nbpixels)
+{
+ pixel_pack_portable_s16u8_y(src, dst, nbpixels);
+ pixel_pack_portable_s16u8_uv(src + nbpixels, dst + nbpixels, nbpixels>>1);
+
+}
+
+
+/* convert yvu planar 411 8 bit unsigned to yv12 planar 16bit signed */
+void llconv_yvu_planar_u8s16(unsigned char* source, short int *dest, int nbpixels)
+{
+ pixel_unpack_portable_u8s16_y(source, dest, nbpixels);
+ pixel_unpack_portable_u8s16_uv(&source[nbpixels], &dest[nbpixels], nbpixels>>1);
+}
+
+/* convert yuv planar 411 8 bit unsigned to yv12 planar 16bit signed */
+void llconv_yuv_planar_u8s16(unsigned char* source, short int *dest, int nbpixels)
+{
+ pixel_unpack_portable_u8s16_y(source, dest, nbpixels);
+ pixel_unpack_portable_u8s16_uv(&source[nbpixels], &dest[nbpixels + (nbpixels>>2)], nbpixels>>2);
+ pixel_unpack_portable_u8s16_uv(&source[nbpixels + (nbpixels>>2)], &dest[nbpixels], nbpixels>>2);
+}
+
+
diff --git a/system/pdp_packet.c b/system/pdp_packet.c
new file mode 100644
index 0000000..0c0b2c2
--- /dev/null
+++ b/system/pdp_packet.c
@@ -0,0 +1,239 @@
+/*
+ * Pure Data Packet system implementation:
+ * code for allocation/deallocation/copying/...
+ * Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+#include "pdp.h"
+#include <stdio.h>
+
+/* all symbols are C style */
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/* this needs to be able to grow dynamically, think about it later */
+#define PDP_OBJECT_ARRAY_SIZE 1024
+static t_pdp* pdp_stack[PDP_OBJECT_ARRAY_SIZE];
+
+
+/* some global vars */
+static t_symbol* pdp_sym_register_rw;
+static t_symbol* pdp_sym_register_ro;
+static t_symbol* pdp_sym_process;
+
+
+/* setup methods */
+
+void
+pdp_packet_setup(void)
+{
+ bzero(pdp_stack, PDP_OBJECT_ARRAY_SIZE * sizeof(t_pdp *));
+ pdp_sym_register_rw = gensym("register_rw");
+ pdp_sym_register_ro = gensym("register_ro");
+ pdp_sym_process = gensym("process");
+}
+
+void
+pdp_packet_destroy(void)
+{
+ int i = 0;
+ /* dealloc all the data in object stack */
+ while ((i < PDP_OBJECT_ARRAY_SIZE) && (pdp_stack[i])) free(pdp_stack[i++]);
+}
+
+
+/* private pdp_mem methods */
+
+
+/* public object manips */
+
+
+/* alloc method: alloc time is linear in the number of used packets */
+/* think about a better (tree) method when this number grows large */
+int
+pdp_packet_new(unsigned int datatype, unsigned int datasize /*without header*/)
+{
+ unsigned int totalsize = datasize + PDP_HEADER_SIZE;
+ int i = 0;
+ unsigned int align;
+ t_pdp* p;
+ for (i=0; i < PDP_OBJECT_ARRAY_SIZE; i++){
+ p = pdp_stack[i];
+ /* check if we can reuse this one if it is already allocated */
+ if (p) {
+ /* remark: if p->size >= totalsize we can give away the packet */
+ /* but that would lead to unefficient use if we have a lot of packets */
+ /* of different sizes */
+ if ((p->users == 0) && (p->size == totalsize) && (p->type == datatype)){
+ //post("pdp_new_object: can reuse %d", i);
+ p->users = 1;
+ return i;
+ }
+ else{
+ //post("pdp_new_object: can't reuse %d, (%d users)", i, p->users);
+ //post("size:%d, newsize:%d, type:%d, newtype:%d", p->size, totalsize, p->type, datatype);
+ }
+ }
+ /* allocate new one */
+ else {
+ p = (t_pdp *)malloc(totalsize);
+ align = ((unsigned int)p) & (PDP_ALIGN - 1);
+ if (align) post("pdp_new_object: warning data misaligned by %x", align);
+ pdp_stack[i] = p;
+ p->type = datatype;
+ p->size = totalsize;
+ p->users = 1;
+ //post("pdp_new_object: allocating new (%d)", i);
+ return i;
+ }
+ }
+ post("pdp_new_object: WARNING: out of memory");
+
+ return -1;
+
+}
+
+
+t_pdp*
+pdp_packet_header(int handle)
+{
+ if ((handle >= 0) && (handle < PDP_OBJECT_ARRAY_SIZE)) return pdp_stack[handle];
+ else return 0;
+}
+
+void*
+pdp_packet_data(int handle)
+{
+ if ((handle >= 0) && (handle < PDP_OBJECT_ARRAY_SIZE))
+ return (char *)(pdp_stack[handle]) + PDP_HEADER_SIZE;
+ else return 0;
+}
+
+
+
+int
+pdp_packet_copy_ro(int handle)
+{
+ int out_handle;
+
+ t_pdp* p;
+ if ((handle >= 0)
+ && (handle < PDP_OBJECT_ARRAY_SIZE)
+ && (p = pdp_stack[handle])){
+ /* increase the number of users and return */
+ p->users++;
+ out_handle = handle;
+ }
+ else out_handle = -1;
+
+ //post("pdp_copy_ro: outhandle:%d", out_handle);
+
+ return out_handle;
+}
+
+int
+pdp_packet_copy_rw(int handle)
+{
+ int out_handle;
+
+ t_pdp* p;
+ if ((handle >= 0)
+ && (handle < PDP_OBJECT_ARRAY_SIZE)
+ && (p = pdp_stack[handle])){
+ /* if there are other users, copy the object otherwize return the same handle */
+ if (p->users){
+ int new_handle = pdp_packet_new(p->type, p->size - PDP_HEADER_SIZE);
+ t_pdp* new_p = pdp_packet_header(new_handle);
+ memcpy(new_p, p, p->size);
+ new_p->users = 1;
+ out_handle = new_handle;
+ }
+ else {
+ p->users++;
+ out_handle = handle;
+ }
+ //post("pdp_copy_rw: inhandle:%d outhandle:%d", handle, out_handle);
+
+ }
+ else out_handle = -1;
+
+ return out_handle;
+}
+
+int
+pdp_packet_clone_rw(int handle)
+{
+ int out_handle;
+
+ t_pdp* p;
+ if ((handle >= 0)
+ && (handle < PDP_OBJECT_ARRAY_SIZE)
+ && (p = pdp_stack[handle])){
+
+ /* clone the packet header, don't copy the data */
+ int new_handle = pdp_packet_new(p->type, p->size - PDP_HEADER_SIZE);
+ t_pdp* new_p = pdp_packet_header(new_handle);
+ memcpy(new_p, p, PDP_HEADER_SIZE);
+ new_p->users = 1;
+ out_handle = new_handle;
+ }
+
+ else out_handle = -1;
+
+ return out_handle;
+}
+
+void
+pdp_packet_mark_unused(int handle)
+{
+ t_pdp* p;
+ if ((handle >= 0) && (handle < PDP_OBJECT_ARRAY_SIZE)){
+ if (p = pdp_stack[handle]) {
+ if (p->users) {
+ p->users--;
+ //post("pdp_mark_unused: handle %d, users left %d", handle, p->users);
+ }
+ else {
+ post("pdp_mark_unused: WARNING: handle %d has zero users (duplicate pdp_mark_unused ?)", handle);
+ }
+ }
+ else {
+ post("pdp_mark_unused: WARNING: invalid handle %d: no associated object", handle);
+ }
+ }
+
+ else {
+ /* -1 is the only invalid handle that doesn't trigger a warning */
+ if (handle != -1) post("pdp_mark_unused: WARNING: invalid handle %d: out of bound", handle);
+ }
+
+
+}
+
+/* remark. if an owner frees a rw copy, he can still pass it along to clients.
+the first copy instruction revives the object. maybe free should not be called free but unregister.
+as long as no new_object method is called, or no copy on another object is performed,
+the "undead" copy can be revived. this smells a bit, i know...*/
+
+
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/system/pdp_queue.c b/system/pdp_queue.c
new file mode 100644
index 0000000..2932728
--- /dev/null
+++ b/system/pdp_queue.c
@@ -0,0 +1,337 @@
+/*
+ * Pure Data Packet - processor queue module.
+ * Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+
+
+/*
+ this is a the processor queue pdp system module
+ it receives tasks from objects that are schedules to
+ be computed in another thread. the object is signalled back
+ when the task is completed.
+
+ this is not a standard pd class. it is a sigleton class
+ using a standard pd clock to poll for compleded methods on
+ every scheduler run. this is a hack to do thread synchronization
+ in a thread unsafe pd.
+
+ */
+
+#include "pdp.h"
+#include <pthread.h>
+#include <unistd.h>
+#include <stdio.h>
+
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#define PDP_QUEUE_SIZE 1024
+#define PDP_QUEUE_DELTIME 1.0f;
+
+
+
+
+/********************* pdp process queue data *********************/
+
+typedef void (*t_pdpmethod)(void *client);
+
+/* the process queue data record */
+typedef struct process_queue_struct
+{
+ void *x_owner; /* the object we are dealing with */
+ t_pdpmethod x_process; /* the process method */
+ t_pdpmethod x_callback; /* the function to be called when finished */
+ int *x_queue_id; /* place to store the queue id for task */
+} t_process_queue;
+
+
+
+/* clock members */
+static t_clock *pdp_clock;
+static double deltime;
+
+/* some bookkeeping vars */
+static long long ticks;
+static long long packets;
+
+/* queue members */
+static t_process_queue *q; /* queue */
+static int mask;
+static int head; /* last entry in queue + 1 */
+static int tail; /* first entry in queque */
+static int curr; /* the object currently processed in other thread */
+
+/* pthread vars */
+static pthread_mutex_t mut;
+static pthread_cond_t cond_dataready;
+static pthread_cond_t cond_processingdone;
+static pthread_t thread_id;
+
+/* synchro pipes */
+static int pipe_fd[2];
+
+/* toggle for thread usage */
+static int use_thread;
+
+
+
+/* the methods */
+void pdp_queue_wait()
+{
+ //post("pdp_pq_wait: waiting for pdp_queue_thread to finish processing");
+ pthread_mutex_lock(&mut);
+ while(((curr - head) & mask) != 0){
+
+ pthread_cond_wait(&cond_processingdone, &mut);
+ }
+ pthread_mutex_unlock(&mut);
+ //post("pdp_pq_wait: pdp_queue_thread has finished processing");
+
+}
+void pdp_queue_finish(int index)
+{
+
+ if (-1 == index) {
+ //post("pdp_pq_remove: index == -1");
+ return;
+ }
+ /* wait for processing thread to finish*/
+ pdp_queue_wait();
+
+ /* invalidate callback at index */
+ q[index & mask].x_callback = 0;
+ q[index & mask].x_queue_id = 0;
+
+}
+
+static void pdp_queue_signal_processor(void)
+{
+
+ pthread_mutex_lock(&mut);
+ //post("signalling process thread");
+ pthread_cond_signal(&cond_dataready);
+ pthread_mutex_unlock(&mut);
+ //post("signalling process thread done");
+
+}
+
+static void pdp_queue_wait_for_feeder(void)
+{
+
+
+ /* only use locking when there is no data */
+ if(((curr - head) & mask) == 0){
+ pthread_mutex_lock(&mut);
+
+ /* signal processing done */
+ //post("pdp_queue_thread: signalling processing is done");
+ pthread_cond_signal(&cond_processingdone);
+
+ /* wait until there is an item in the queue */
+ while(((curr - head) & mask) == 0){
+ //post("waiting for feeder");
+ pthread_cond_wait(&cond_dataready, &mut);
+ //post("waiting for feeder done");
+ }
+
+ pthread_mutex_unlock(&mut);
+
+ }
+}
+
+void pdp_queue_add(void *owner, void *process, void *callback, int *queue_id)
+{
+ int i;
+
+ /* if processing is in not in thread, just call the funcs */
+ if (!use_thread){
+ //post("pdp_queue_add: calling processing routine directly");
+ *queue_id = -1;
+ ((t_pdpmethod) process)(owner);
+ ((t_pdpmethod) callback)(owner);
+ return;
+ }
+
+
+
+ /* schedule method in thread queue */
+ if (1 == ((tail - head) & mask)) {
+ post("pdp_queue_add: WARNING: processing queue is full.\n");
+ post("pdp_queue_add: WARNING: skipping process method, calling callback directly.\n");
+ *queue_id = -1;
+ ((t_pdpmethod) callback)(owner);
+ }
+
+
+
+ i = head & mask;
+ q[i].x_owner = owner;
+ q[i].x_process = process;
+ q[i].x_callback = callback;
+ q[i].x_queue_id = queue_id;
+ *queue_id = i;
+ //post("pdp_queue_add: added method to queue, index %d", i);
+
+
+ // increase the packet count
+ packets++;
+
+ // move head forward
+ head++;
+
+ pdp_queue_signal_processor();
+
+}
+
+
+/* processing thread */
+static void *pdp_queue_thread(void *dummy)
+{
+ while(1){
+
+
+ /* wait until there is data available */
+ pdp_queue_wait_for_feeder();
+
+
+ //post("pdp_queue_thread: processing %d", curr);
+
+
+ /* call the process routine */
+ (q[curr & mask].x_process)(q[curr & mask].x_owner);
+
+ /* advance */
+ curr++;
+
+
+ }
+}
+
+
+/* call back all the callbacks */
+static void pdp_queue_callback (void)
+{
+
+ /* call callbacks for finished packets */
+ while(0 != ((curr - tail) & mask))
+ {
+ int i = tail & mask;
+ /* invalidate queue id */
+ if(q[i].x_queue_id) *q[i].x_queue_id = -1;
+ /* call callback */
+ if(q[i].x_callback) (q[i].x_callback)(q[i].x_owner);
+ //else post("pdp_pq_tick: callback %d is disabled",i );
+ tail++;
+ }
+
+}
+
+/* the clock method */
+static void pdp_queue_tick (void)
+{
+ /* do work */
+ //if (!(ticks % 1000)) post("pdp tick %d", ticks);
+
+ if (!use_thread) return;
+
+ /* call callbacks */
+ pdp_queue_callback();
+
+ /* increase counter */
+ ticks++;
+
+ /* set clock for next update */
+ clock_delay(pdp_clock, deltime);
+}
+
+
+void pdp_queue_use_thread(int t)
+{
+ /* if thread usage is being disabled,
+ wait for thread to finish processing first */
+ if (t == 0) {
+ pdp_queue_wait();
+ use_thread = 0;
+ pdp_queue_callback();
+ clock_unset(pdp_clock);
+ }
+ else {
+ clock_unset(pdp_clock);
+ clock_delay(pdp_clock, deltime);
+ use_thread = 1;
+ }
+
+}
+
+void pdp_queue_setup(void)
+{
+ pthread_attr_t attr;
+
+ /* setup pdp queue processor object */
+ ticks = 0;
+ deltime = PDP_QUEUE_DELTIME;
+
+ /* setup queue data */
+ mask = PDP_QUEUE_SIZE - 1;
+ head = 0;
+ tail = 0;
+ curr = 0;
+ q = getbytes(PDP_QUEUE_SIZE * sizeof(*q));
+
+ /* use threads by default */
+ use_thread = 1;
+
+ /* setup synchro stuff */
+ pthread_mutex_init(&mut, NULL);
+ pthread_cond_init(&cond_dataready, NULL);
+ pthread_cond_init(&cond_processingdone, NULL);
+
+
+ /* allocate the clock */
+ pdp_clock = clock_new(0, (t_method)pdp_queue_tick);
+
+ /* set the clock */
+ clock_delay(pdp_clock, 0);
+
+ /* start processing thread */
+
+ /* glibc doc says SCHED_OTHER is default,
+ but it seems not to be when initiated from a RT thread
+ so we explicitly set it here */
+ pthread_attr_init (&attr);
+ //pthread_attr_setschedpolicy(&attr, SCHED_FIFO);
+ pthread_attr_setschedpolicy(&attr, SCHED_OTHER);
+ pthread_create(&thread_id, &attr, pdp_queue_thread, (void *)0);
+
+
+
+}
+
+
+
+
+
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/system/pdp_resample.c b/system/pdp_resample.c
new file mode 100644
index 0000000..2b5a9de
--- /dev/null
+++ b/system/pdp_resample.c
@@ -0,0 +1,135 @@
+/*
+ * Pure Data Packet system file. - image resampling routines
+ * Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+
+#include "pdp_resample.h"
+#include "pdp.h"
+
+/*
+
+efficient bilinear resampling ??
+performance: how to eliminate divides? -> virtual coordinates 2^k x 2^k (conf. opengl)
+
+i.e. 16 bit virtual coordinates: easy modular addressing
+
+*/
+
+s32 pdp_resample_bilin(s16 *image, s32 width, s32 height, s32 virt_x, s32 virt_y)
+{
+
+ s32 fp_x, fp_y, frac_x, frac_y, f, offset, r_1, r_2;
+
+ virt_x &= 0xffff;
+ virt_y &= 0xffff;
+
+ fp_x = virt_x * (width - 1);
+ fp_y = virt_y * (height - 1);
+
+ frac_x = fp_x & (0xffff);
+ frac_y = fp_y & (0xffff);
+
+ offset = (fp_x >> 16) + (fp_y >> 16) * width;
+ image += offset;
+
+ f = 0x10000 - frac_x;
+
+ r_1 = ((f * (s32)(image[0]) + frac_x * (s32)(image[1])))>>16;
+
+ image += width;
+
+ r_2 = ((f * (s32)(image[0]) + frac_x * (s32)(image[1])))>>16;
+
+ f = 0x10000 - frac_y;
+
+ return ((f * r_1 + frac_y * r_2)>>16);
+
+}
+
+
+void pdp_resample_scale_bilin(s16 *src_image, s16 *dst_image, s32 src_w, s32 src_h, s32 dst_w, s32 dst_h)
+{
+ s32 i,j;
+ s32 virt_x=0;
+ s32 virt_y=0; /* virtual coordinates in 30 bit */
+ s32 scale_x = 0x40000000 / dst_w;
+ s32 scale_y = 0x40000000 / dst_h;
+
+ for (j=0; j<dst_h; j++){
+ for (i=0; i<dst_w; i++){
+ *dst_image++ = pdp_resample_bilin(src_image, src_w, src_h, virt_x>>14, virt_y>>14);
+ virt_x += scale_x;
+ }
+ virt_x = 0;
+ virt_y += scale_y;
+ }
+
+}
+
+void pdp_resample_scale_nn(s16 *src_image, s16 *dst_image, s32 src_w, s32 src_h, s32 dst_w, s32 dst_h)
+{
+ s32 i,j;
+ s32 x=0;
+ s32 y=0;
+ s32 frac_x=0;
+ s32 frac_y=0;
+ s32 scale_x = (src_w << 20 ) / dst_w;
+ s32 scale_y = (src_h << 20 ) / dst_h;
+
+ for (j=0; j<dst_h; j++){
+ for (i=0; i<dst_w; i++){
+ *dst_image++ = src_image[x+y];
+ frac_x += scale_x;
+ x = frac_x >> 20;
+ }
+ x = 0;
+ frac_x = 0;
+ frac_y += scale_y;
+ y = (frac_y >> 20) * src_w;
+ }
+
+}
+
+void pdp_resample_zoom_tiled_bilin(s16 *src_image, s16 *dst_image, s32 w, s32 h,
+ float zoom_x, float zoom_y, float center_x_relative, float center_y_relative)
+{
+ float izx = 1.0f / zoom_x;
+ float izy = 1.0f / zoom_y;
+ s32 scale_x = (s32)((float)0x100000 * izx / (float)w);
+ s32 scale_y = (s32)((float)0x100000 * izy / (float)h);
+
+ s32 top_virt_x = (s32)((1.0f - izx) * (float)0x100000 * center_x_relative);
+ s32 top_virt_y = (s32)((1.0f - izy) * (float)0x100000 * center_y_relative);
+
+ s32 virt_x = top_virt_x;
+ s32 virt_y = top_virt_y;
+
+ s32 i,j;
+
+ for (j=0; j<h; j++){
+ for (i=0; i<w; i++){
+ *dst_image++ = pdp_resample_bilin(src_image, w, h, virt_x>>4, virt_y>>4);
+ virt_x += scale_x;
+ }
+ virt_x = top_virt_x;
+ virt_y += scale_y;
+ }
+
+}
+
diff --git a/system/pdp_type.c b/system/pdp_type.c
new file mode 100644
index 0000000..b23b9cd
--- /dev/null
+++ b/system/pdp_type.c
@@ -0,0 +1,143 @@
+/*
+ * Pure Data Packet system implementation. : code for handling different packet types
+ * Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+#include "pdp.h"
+#include <stdio.h>
+
+/* all symbols are C style */
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+
+/****************** packet type checking methods ********************/
+
+
+/* check if two packets are allocated and of the same type */
+int pdp_type_compat(int packet0, int packet1)
+{
+
+ t_pdp *header0 = pdp_packet_header(packet0);
+ t_pdp *header1 = pdp_packet_header(packet1);
+
+ if (!(header1)){
+ //post("pdp_type_compat: invalid header packet1");
+ return 0;
+ }
+ if (!(header0)){
+ //post("pdp_type_compat: invalid header packet 0");
+ return 0;
+ }
+ if (header0->type != header1->type){
+ //post("pdp_type_compat: types do not match");
+ return 0;
+ }
+
+ return 1;
+}
+
+/* check if two image packets are allocated and of the same type */
+int pdp_type_compat_image(int packet0, int packet1)
+{
+ t_pdp *header0 = pdp_packet_header(packet0);
+ t_pdp *header1 = pdp_packet_header(packet1);
+
+
+ if (!(pdp_type_compat(packet0, packet1))) return 0;
+ if (header0->type != PDP_IMAGE){
+ //post("pdp_type_compat_image: not a PDP_IMAGE");
+ return 0;
+ }
+ if (header0->info.image.encoding != header1->info.image.encoding){
+ //post("pdp_type_compat_image: encodings differ");
+ return 0;
+ }
+ if (header0->info.image.width != header1->info.image.width){
+ //post("pdp_type_compat_image: image withs differ");
+ return 0;
+ }
+ if (header0->info.image.height != header1->info.image.height){
+ //post("pdp_type_compat_image: image heights differ");
+ return 0;
+ }
+ return 1;
+}
+
+/* check if packet is a valid image packet */
+int pdp_type_isvalid_image(int packet)
+{
+ t_pdp *header = pdp_packet_header(packet);
+ if (!header) return 0;
+ if (PDP_IMAGE != header->type) return 0;
+ if ((PDP_IMAGE_YV12 != header->info.image.encoding)
+ && (PDP_IMAGE_GREY != header->info.image.encoding)) return 0;
+
+ return 1;
+
+}
+
+
+
+int pdp_packet_new_image_yv12(u32 w, u32 h)
+{
+ t_pdp *header;
+ int packet;
+
+
+ u32 size = w*h;
+ u32 totalnbpixels = size + (size >> 1);
+ u32 packet_size = totalnbpixels << 1;
+
+ packet = pdp_packet_new(PDP_IMAGE, packet_size);
+ header = pdp_packet_header(packet);
+
+ header->info.image.encoding = PDP_IMAGE_YV12;
+ header->info.image.width = w;
+ header->info.image.height = h;
+
+ return packet;
+}
+
+int pdp_packet_new_image_grey(u32 w, u32 h)
+{
+ t_pdp *header;
+ int packet;
+
+
+ u32 size = w*h;
+ u32 totalnbpixels = size;
+ u32 packet_size = totalnbpixels << 1;
+
+ packet = pdp_packet_new(PDP_IMAGE, packet_size);
+ header = pdp_packet_header(packet);
+
+ header->info.image.encoding = PDP_IMAGE_GREY;
+ header->info.image.width = w;
+ header->info.image.height = h;
+
+ return packet;
+}
+
+
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/system/pdp_ut.c b/system/pdp_ut.c
new file mode 100644
index 0000000..83b4cb0
--- /dev/null
+++ b/system/pdp_ut.c
@@ -0,0 +1,195 @@
+/*
+ * Pure Data Packet - Utility toolkit objects.
+ * Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+
+/* This file contains some small utility pd objects that make working with
+ pdp objects a lot easier. Mainly as glue to be used in the abstractions
+ in the distro. */
+
+#include "pdp.h"
+#include <math.h>
+
+/* this object does an add, scale, clip operation */
+
+t_class *pdp_ut_addscaleclip_class;
+
+typedef struct pdp_ut_addscaleclip_struct
+{
+ t_object x_obj;
+ t_outlet *x_outlet0;
+ t_float x_min;
+ t_float x_max;
+ t_float x_offset;
+ t_float x_scale;
+} t_pdp_ut_addscaleclip;
+
+
+static void pdp_ut_addscaleclip_float(t_pdp_ut_addscaleclip *x, t_floatarg f)
+{
+ f += x->x_offset;
+ f *= x->x_scale;
+ f = (f < x->x_min) ? x->x_min : f;
+ f = (f > x->x_max) ? x->x_max : f;
+ outlet_float(x->x_outlet0, f);
+}
+
+static void pdp_ut_addscaleclip_free(t_pdp_ut_addscaleclip *x){}
+
+void *pdp_ut_addscaleclip_new(t_floatarg offset, t_floatarg scale, t_floatarg min, t_floatarg max)
+{
+ t_pdp_ut_addscaleclip *x = (t_pdp_ut_addscaleclip *)pd_new(pdp_ut_addscaleclip_class);
+ x->x_outlet0 = outlet_new(&x->x_obj, &s_float);
+ x->x_offset = offset;
+ x->x_scale = scale;
+ x->x_min = min;
+ x->x_max = max;
+ return (void *)x;
+}
+
+void pdp_ut_addscaleclip_setup(void)
+{
+ pdp_ut_addscaleclip_class = class_new(gensym("pdp_ut_addscaleclip"), (t_newmethod)pdp_ut_addscaleclip_new,
+ (t_method)pdp_ut_addscaleclip_free, sizeof(t_pdp_ut_addscaleclip), 0,
+ A_FLOAT, A_FLOAT, A_FLOAT, A_FLOAT, A_NULL);
+ class_addfloat(pdp_ut_addscaleclip_class, pdp_ut_addscaleclip_float);
+}
+
+
+/* pdp_ut_logmap does a logarithmic parameter mapping [0->1] x -> min(max/min)^x max an add, scale, clip operation */
+/* pdp_ut_logmap_comp does x -> min(max/min)^(1-x) */
+/* pdp_ut_linmap dos x -> min + (max - min * x */
+
+t_class *pdp_ut_linmap_class;
+t_class *pdp_ut_logmap_class;
+t_class *pdp_ut_logmap_comp_class;
+
+typedef struct pdp_ut_map_struct
+{
+ t_object x_obj;
+ t_outlet *x_outlet0;
+ t_float x_min;
+ t_float x_max;
+} t_pdp_ut_map;
+
+
+static void pdp_ut_logmap_float(t_pdp_ut_map *x, t_floatarg f)
+{
+ f = (f < 0.0f) ? 0.0f : f;
+ f = (f > 1.0f) ? 1.0f : f;
+
+ f = x->x_min * pow((x->x_max / x->x_min), f);
+
+ outlet_float(x->x_outlet0, f);
+}
+
+static void pdp_ut_linmap_float(t_pdp_ut_map *x, t_floatarg f)
+{
+ f = (f < 0.0f) ? 0.0f : f;
+ f = (f > 1.0f) ? 1.0f : f;
+
+ f = x->x_min + ((x->x_max - x->x_min) * f);
+
+ outlet_float(x->x_outlet0, f);
+}
+
+static void pdp_ut_logmap_comp_float(t_pdp_ut_map *x, t_floatarg f)
+{
+ f = (f < 0.0f) ? 0.0f : f;
+ f = (f > 1.0f) ? 1.0f : f;
+
+ f = x->x_min * pow((x->x_max / x->x_min), (1.0f - f));
+
+ outlet_float(x->x_outlet0, f);
+}
+
+static void pdp_ut_map_free(t_pdp_ut_map *x){}
+
+
+void pdp_ut_map_init(t_pdp_ut_map *x, t_floatarg min, t_floatarg max)
+{
+ x->x_outlet0 = outlet_new(&x->x_obj, &s_float);
+ x->x_min = min;
+ x->x_max = max;
+}
+
+void *pdp_ut_logmap_new(t_floatarg min, t_floatarg max)
+{
+ t_pdp_ut_map *x = (t_pdp_ut_map *)pd_new(pdp_ut_logmap_class);
+ pdp_ut_map_init(x, min, max);
+ return (void *)x;
+}
+
+void *pdp_ut_linmap_new(t_floatarg min, t_floatarg max)
+{
+ t_pdp_ut_map *x = (t_pdp_ut_map *)pd_new(pdp_ut_linmap_class);
+ pdp_ut_map_init(x, min, max);
+ return (void *)x;
+}
+
+void *pdp_ut_logmap_comp_new(t_floatarg min, t_floatarg max)
+{
+ t_pdp_ut_map *x = (t_pdp_ut_map *)pd_new(pdp_ut_logmap_comp_class);
+ pdp_ut_map_init(x, min, max);
+ return (void *)x;
+}
+
+void pdp_ut_logmap_setup(void)
+{
+ pdp_ut_logmap_class = class_new(gensym("pdp_ut_logmap"), (t_newmethod)pdp_ut_logmap_new,
+ (t_method)pdp_ut_map_free, sizeof(t_pdp_ut_map), 0,
+ A_FLOAT, A_FLOAT, A_NULL);
+ class_addfloat(pdp_ut_logmap_class, pdp_ut_logmap_float);
+}
+
+void pdp_ut_logmap_comp_setup(void)
+{
+ pdp_ut_logmap_comp_class = class_new(gensym("pdp_ut_logmap_comp"), (t_newmethod)pdp_ut_logmap_comp_new,
+ (t_method)pdp_ut_map_free, sizeof(t_pdp_ut_map), 0,
+ A_FLOAT, A_FLOAT, A_NULL);
+ class_addfloat(pdp_ut_logmap_comp_class, pdp_ut_logmap_comp_float);
+}
+
+void pdp_ut_linmap_setup(void)
+{
+ pdp_ut_linmap_class = class_new(gensym("pdp_ut_linmap"), (t_newmethod)pdp_ut_linmap_new,
+ (t_method)pdp_ut_map_free, sizeof(t_pdp_ut_map), 0,
+ A_FLOAT, A_FLOAT, A_NULL);
+ class_addfloat(pdp_ut_linmap_class, pdp_ut_linmap_float);
+}
+
+
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+void pdp_ut_setup(void)
+{
+ pdp_ut_addscaleclip_setup();
+ pdp_ut_logmap_setup();
+ pdp_ut_logmap_comp_setup();
+ pdp_ut_linmap_setup();
+}
+
+
+#ifdef __cplusplus
+}
+#endif