aboutsummaryrefslogtreecommitdiff
path: root/system/mmx
diff options
context:
space:
mode:
Diffstat (limited to 'system/mmx')
-rw-r--r--system/mmx/Makefile29
-rw-r--r--system/mmx/pdp_mmx_test.c62
-rw-r--r--system/mmx/pixel_add_s16.s55
-rw-r--r--system/mmx/pixel_affine_s16.s59
-rw-r--r--system/mmx/pixel_biquad_dirI_s16.s361
-rw-r--r--system/mmx/pixel_biquad_s16.s451
-rw-r--r--system/mmx/pixel_ca_s1.s189
-rw-r--r--system/mmx/pixel_cascade_s16.s330
-rw-r--r--system/mmx/pixel_conv_hor_s16.s134
-rw-r--r--system/mmx/pixel_conv_ver_s16.s128
-rw-r--r--system/mmx/pixel_crot_s16.s153
-rw-r--r--system/mmx/pixel_gain.s83
-rw-r--r--system/mmx/pixel_gain_s16.s71
-rw-r--r--system/mmx/pixel_mix_s16.s68
-rw-r--r--system/mmx/pixel_mul_s16.s56
-rw-r--r--system/mmx/pixel_pack_s16u8.s126
-rw-r--r--system/mmx/pixel_rand_s16.s76
-rw-r--r--system/mmx/pixel_randmix_s16.s91
-rw-r--r--system/mmx/pixel_s1.s201
-rw-r--r--system/mmx/pixel_unpack_u8s16.s113
20 files changed, 2836 insertions, 0 deletions
diff --git a/system/mmx/Makefile b/system/mmx/Makefile
new file mode 100644
index 0000000..0f8f836
--- /dev/null
+++ b/system/mmx/Makefile
@@ -0,0 +1,29 @@
+include ../../Makefile.config
+
+# Object files built by the default target.  This file contains no explicit rule
+# for turning the .s sources into objects (presumably Makefile.config or make's
+# built-in rules provide one -- TODO confirm).
+OBJ = \
+pixel_pack_s16u8.o \
+pixel_unpack_u8s16.o \
+pixel_add_s16.o \
+pixel_mul_s16.o \
+pixel_mix_s16.o \
+pixel_randmix_s16.o \
+pixel_conv_hor_s16.o \
+pixel_conv_ver_s16.o \
+pixel_affine_s16.o \
+pixel_biquad_s16.o \
+pixel_ca_s1.o \
+pixel_rand_s16.o \
+pixel_crot_s16.o \
+pixel_gain_s16.o
+
+# Default target: build every object listed in OBJ.
+all: $(OBJ)
+
+# Link the test program against all objects, with debug info (-g).
+# NOTE(review): pdp_mmx_test.c calls pixel_test_s1; confirm that one of the
+# objects in OBJ actually defines that symbol.
+test: pdp_mmx_test.o $(OBJ)
+ gcc -o pdp_mmx_test pdp_mmx_test.o $(OBJ) -g
+
+# Remove objects, editor backups, the test binary and pdp_mmx.a
+# (pdp_mmx.a is not created by any rule in this file).
+clean:
+ rm -f *.o
+ rm -f *~
+ rm -f pdp_mmx.a
+ rm -f pdp_mmx_test
+
diff --git a/system/mmx/pdp_mmx_test.c b/system/mmx/pdp_mmx_test.c
new file mode 100644
index 0000000..e93539f
--- /dev/null
+++ b/system/mmx/pdp_mmx_test.c
@@ -0,0 +1,62 @@
+#include "pdp_mmx.h"
+
+/* Scale x by 2 * 256 (= 512) in float and convert the result to short int. */
+#define FP(x) ((short int)(((float)(x) * 2 * 256.0f)))
+
+/* NOTE(review): nbp is not referenced by the code visible in this file. */
+#define nbp 256
+
+/*
+ * Four-lane short int vectors, every lane initialised to 0x0100.
+ * Presumably biquad coefficients (a1, a2, b0, b1, b2), filter state (u1, u2)
+ * and input vectors (x0..x3) for the mmx routines -- TODO confirm; none of
+ * them is referenced by the functions visible in this file.
+ */
+ short int a1[4] = {0x0100,0x0100,0x0100,0x0100};
+ short int a2[4] = {0x0100,0x0100,0x0100,0x0100};
+ short int b0[4] = {0x0100,0x0100,0x0100,0x0100};
+ short int b1[4] = {0x0100,0x0100,0x0100,0x0100};
+ short int b2[4] = {0x0100,0x0100,0x0100,0x0100};
+
+ short int u1[4] = {0x0100,0x0100,0x0100,0x0100};
+ short int u2[4] = {0x0100,0x0100,0x0100,0x0100};
+
+ short int x0[4] = {0x0100,0x0100,0x0100,0x0100};
+ short int x1[4] = {0x0100,0x0100,0x0100,0x0100};
+ short int x2[4] = {0x0100,0x0100,0x0100,0x0100};
+ short int x3[4] = {0x0100,0x0100,0x0100,0x0100};
+
+/* Print one grid cell: "x " when i is non-zero, ". " when it is zero. */
+void print_pixel(unsigned int i)
+{
+ if (i == 0) {
+  printf(". ");
+  return;
+ }
+ printf("x ");
+}
+
+/* Terminate the current output line. */
+void print_line(void)
+{
+ printf("\n");
+ return;
+}
+
+/*
+ * Print 8 bytes as an 8x8 grid of cells, one byte per text row.
+ * Rows are emitted from c[7] down to c[0]; within a row the most
+ * significant bit is printed first.
+ */
+void print_square(unsigned char *c)
+{
+ int row;
+ unsigned int mask;
+
+ for (row = 7; row >= 0; row--) {
+  /* walk the bit mask from 0x80 down to 0x01 */
+  for (mask = 0x80u; mask != 0; mask >>= 1)
+   print_pixel(c[row] & mask);
+  printf("\n");
+ }
+}
+
+/*
+ * Test driver: prints the two 8x8 bit grids stored in src, runs
+ * pixel_test_s1 on them and prints the 8x8 grid it leaves in dst.
+ * Returns 0.
+ */
+int main(void)
+{
+ /* two 8 byte grids; the negative initialisers wrap to 0xff..0xf8 */
+ unsigned char src[16]={1,2,3,4,5,6,7,8,-1,-2,-3,-4,-5,-6,-7,-8};
+ unsigned char dst[8];
+
+ print_square(src);
+ print_line();
+ print_square(src+8);
+ print_line();
+
+ /* NOTE(review): the meaning of the two trailing arguments is not visible here */
+ pixel_test_s1(dst,src,1,1);
+
+ print_square(dst);
+ print_line();
+
+ return 0;
+}
diff --git a/system/mmx/pixel_add_s16.s b/system/mmx/pixel_add_s16.s
new file mode 100644
index 0000000..8d4c7df
--- /dev/null
+++ b/system/mmx/pixel_add_s16.s
@@ -0,0 +1,55 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_add_s16
+.type pixel_add_s16,@function
+
+# simple add
+# void pixel_add_s16(int *left, int *right, int nb_4pixel_vectors)
+#
+# in place signed saturating add of two arrays of 16 bit pixels:
+# left[i] <- saturate(left[i] + right[i])
+#
+# both arrays are walked 8 bytes (one vector of 4 pixels) per iteration.
+# a vector count <= 0 leaves memory untouched.
+# arguments are read from the stack; %esi and %edi are saved and restored,
+# %ecx, %mm0 and %mm1 are overwritten.
+
+pixel_add_s16:
+ pushl %ebp
+ movl %esp, %ebp
+ push %esi
+ push %edi
+
+ movl 8(%ebp), %edi # left array (also the destination)
+ movl 12(%ebp), %esi # right array
+ movl 16(%ebp), %ecx # number of 4 pixel vectors
+
+ testl %ecx, %ecx # a count <= 0 would make the decl/jnz
+ jle .add_done # loop below wrap around
+
+ .align 16
+ .loop_mix:
+
+# prefetch 128(%esi)
+ movq (%esi), %mm1 # load right 4 pixels from memory
+ movq (%edi), %mm0 # load 4 left pixels from memory
+ paddsw %mm1, %mm0 # signed saturating add
+ movq %mm0, (%edi) # store result in left array
+ addl $8, %esi
+ addl $8, %edi
+ decl %ecx
+ jnz .loop_mix # loop
+
+ emms # leave mmx state
+
+ .add_done:
+ pop %edi
+ pop %esi
+ leave
+ ret
+
diff --git a/system/mmx/pixel_affine_s16.s b/system/mmx/pixel_affine_s16.s
new file mode 100644
index 0000000..b357de3
--- /dev/null
+++ b/system/mmx/pixel_affine_s16.s
@@ -0,0 +1,59 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_affine_s16
+.type pixel_affine_s16,@function
+
+# void pixel_affine_s16(int *buf, int nb_8pixel_vectors, short int gain[4], short int offset[4])
+#
+# in place affine map of signed 16 bit pixels:
+# buf[i] <- saturating_add(((buf[i] * gain) >> 16) << 1, offset)
+#
+# note: despite the parameter name, every loop iteration handles one 8 byte
+# vector, i.e. 4 pixels.
+# a vector count <= 0 leaves memory untouched.
+# arguments are read from the stack; %esi and %edi are saved and restored,
+# %ecx, %mm0, %mm6 and %mm7 are overwritten.
+
+pixel_affine_s16:
+ pushl %ebp
+ movl %esp, %ebp
+ push %esi
+ push %edi
+
+ movl 12(%ebp), %ecx # vector count
+ testl %ecx, %ecx # a count <= 0 would make the decl/jnz
+ jle .affine_done # loop below wrap around
+
+ movl 20(%ebp), %edi
+ movq (%edi), %mm6 # get offset vector
+
+ movl 16(%ebp), %edi
+ movq (%edi), %mm7 # get gain vector
+
+ movl 8(%ebp), %esi # input array (processed in place)
+
+ .align 16
+ .loop_affine:
+
+# prefetch 128(%esi)
+ movq (%esi), %mm0 # load 4 pixels from memory
+ pmulhw %mm7, %mm0 # apply gain (s).15 fixed point
+ psllw $1, %mm0 # apply correction shift (pmulhw keeps the high word)
+ paddsw %mm6, %mm0 # add offset, saturating
+ movq %mm0, (%esi) # store result in memory
+
+ addl $8, %esi # increment source pointer
+ decl %ecx
+ jnz .loop_affine # loop
+
+ emms # leave mmx state
+
+ .affine_done:
+ pop %edi
+ pop %esi
+ leave
+ ret
+
diff --git a/system/mmx/pixel_biquad_dirI_s16.s b/system/mmx/pixel_biquad_dirI_s16.s
new file mode 100644
index 0000000..1729502
--- /dev/null
+++ b/system/mmx/pixel_biquad_dirI_s16.s
@@ -0,0 +1,361 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+
+
+ # TODO MOVE TO DIRECT FORM II
+ # y[k] = b0 * x[k] + u1[k-1]
+ # u1[k] = b1 * x[k] + u2[k-1] - a1 * y[k]
+ # u2[k] = b2 * x[k] - a2 * y[k]
+
+ # input in register:
+ # %mm0-%mm3: input 4x4 pixels {x0 x1 x2 x3}
+ # %esi: coef memory (a1, a2, b0, b1, b2), 8 bytes each
+ # a prescale gain vector is also read from -8(%esi)
+ # %edi: state memory (u[-1] at offset 0, u[-2] at offset 8)
+ #
+ # return in register:
+ # %mm0-%mm3: 4x4 pixels result
+ #
+ # %mm4-%mm7 are used as scratch. the state memory is updated with the
+ # last two intermediate vectors (u[3], u[2]).
+ # NOTE(review): callers must make sure the 8 bytes in front of the
+ # coefficient block are readable and hold the prescale gain.
+
+
+ .align 16 # align the call target itself
+ .biquad_4x4_pixels:
+ # prescale
+ movq -8(%esi), %mm4
+ pmulhw %mm4, %mm0
+ pmulhw %mm4, %mm1
+ pmulhw %mm4, %mm2
+ pmulhw %mm4, %mm3
+ psllw $1, %mm0
+ psllw $1, %mm1
+ psllw $1, %mm2
+ psllw $1, %mm3
+
+
+ # first vector
+ movq 0(%edi), %mm4 # mm4 <- u[-1]
+ movq 8(%edi), %mm5 # mm5 <- u[-2]
+ movq %mm4, %mm6
+ movq %mm5, %mm7
+
+ pmulhw 0(%esi), %mm6 # multiply by a1
+ pmulhw 8(%esi), %mm7 # multiply by a2
+
+ paddsw %mm6, %mm0 # accumulate
+ paddsw %mm7, %mm0 # accumulate
+ paddsw %mm0, %mm0 # scale by 2 (since all fixed point muls are x*y/2)
+
+ movq %mm0, %mm6 # mm6 <- u[0]
+ movq %mm4, %mm7 # mm7 <- u[-1]
+ pmulhw 16(%esi), %mm0 # multiply by b0
+ pmulhw 24(%esi), %mm4 # multiply by b1
+ pmulhw 32(%esi), %mm5 # multiply by b2
+
+ paddsw %mm4, %mm0 # accumulate
+ paddsw %mm5, %mm0 # accumulate
+
+ # mm0 is result 0
+
+ # second vector
+ movq %mm6, %mm4 # mm4 <- u[0]
+ movq %mm7, %mm5 # mm5 <- u[-1]
+
+ pmulhw 0(%esi), %mm6 # multiply by a1
+ pmulhw 8(%esi), %mm7 # multiply by a2
+
+ paddsw %mm6, %mm1 # accumulate
+ paddsw %mm7, %mm1 # accumulate
+ paddsw %mm1, %mm1 # scale by 2
+
+
+ movq %mm1, %mm6 # mm6 <- u[1]
+ movq %mm4, %mm7 # mm7 <- u[0]
+ pmulhw 16(%esi), %mm1 # multiply by b0
+ pmulhw 24(%esi), %mm4 # multiply by b1
+ pmulhw 32(%esi), %mm5 # multiply by b2
+
+ paddsw %mm4, %mm1 # accumulate
+ paddsw %mm5, %mm1 # accumulate
+
+ # mm1 is result 1
+
+ # third vector
+ movq %mm6, %mm4 # mm4 <- u[1]
+ movq %mm7, %mm5 # mm5 <- u[0]
+
+ pmulhw 0(%esi), %mm6 # multiply by a1
+ pmulhw 8(%esi), %mm7 # multiply by a2
+
+ paddsw %mm6, %mm2 # accumulate
+ paddsw %mm7, %mm2 # accumulate
+ paddsw %mm2, %mm2 # scale by 2
+
+
+ movq %mm2, %mm6 # mm6 <- u[2]
+ movq %mm4, %mm7 # mm7 <- u[1]
+ pmulhw 16(%esi), %mm2 # multiply by b0
+ pmulhw 24(%esi), %mm4 # multiply by b1
+ pmulhw 32(%esi), %mm5 # multiply by b2
+
+ paddsw %mm4, %mm2 # accumulate
+ paddsw %mm5, %mm2 # accumulate
+
+ # mm2 is result 2
+
+ # fourth vector
+ movq %mm6, %mm4 # mm4 <- u[2]
+ movq %mm7, %mm5 # mm5 <- u[1]
+
+ pmulhw 0(%esi), %mm6 # multiply by a1
+ pmulhw 8(%esi), %mm7 # multiply by a2
+
+ paddsw %mm6, %mm3 # accumulate
+ paddsw %mm7, %mm3 # accumulate
+ paddsw %mm3, %mm3 # scale by 2
+
+
+ movq %mm3, 0(%edi) # store u[3]
+ movq %mm4, 8(%edi) # store u[2]
+ pmulhw 16(%esi), %mm3 # multiply by b0
+ pmulhw 24(%esi), %mm4 # multiply by b1
+ pmulhw 32(%esi), %mm5 # multiply by b2
+
+ paddsw %mm4, %mm3 # accumulate
+ paddsw %mm5, %mm3 # accumulate
+
+ # mm3 is result 3
+
+ ret
+
+
+ # in order to use the 4 line parallel biquad routine on horizontal
+ # lines, we need to reorder (rotate or transpose) the matrix, since
+ # images are scanline encoded, and we want to work in parallel
+ # on 4 lines.
+ #
+ # since the 4 lines are independent, it doesn't matter in which order
+ # the vector elements are present.
+ #
+ # this allows us to use the same routine for left->right and right->left
+ # processing.
+ #
+ # some comments on the non-abelian group of square isometries consisting of
+ # (I) identity
+ # (H) horizontal axis mirror
+ # (V) vertical axis mirror
+ # (T) transpose (diagonal axis mirror)
+ # (A) antitranspose (antidiagonal axis mirror)
+ # (R1) 90deg anticlockwise rotation
+ # (R2) 180deg rotation
+ # (R3) 90deg clockwise rotation
+ #
+ #
+ # we basically have two options: (R1,R3) or (T,A)
+ # we opt for T and A because they are self inverting, which improves locality
+ #
+ # use antitranspose for right to left and transpose
+ # for left to right (little endian)
+
+
+ # antitranspose 4x4
+
+ # input
+ # %mm3 == {d0 d1 d2 d3}
+ # %mm2 == {c0 c1 c2 c3}
+ # %mm1 == {b0 b1 b2 b3}
+ # %mm0 == {a0 a1 a2 a3}
+
+ # output
+ # %mm3 == {a3 b3 c3 d3}
+ # %mm2 == {a2 b2 c2 d2}
+ # %mm1 == {a1 b1 c1 d1}
+ # %mm0 == {a0 b0 c0 d0}
+
+
+ # subroutine: mirror the 4x4 word matrix held in %mm0-%mm3 along its
+ # antidiagonal, in place. %mm4-%mm7 are used as scratch.
+ .align 16 # align the call target itself
+ .antitranspose_4x4:
+ movq %mm3, %mm4
+ punpcklwd %mm1, %mm4 # mm4 <- {b2 d2 b3 d3}
+ movq %mm3, %mm5
+ punpckhwd %mm1, %mm5 # mm5 <- {b0 d0 b1 d1}
+
+ movq %mm2, %mm6
+ punpcklwd %mm0, %mm6 # mm6 <- {a2 c2 a3 c3}
+ movq %mm2, %mm7
+ punpckhwd %mm0, %mm7 # mm7 <- {a0 c0 a1 c1}
+
+ movq %mm4, %mm3
+ punpcklwd %mm6, %mm3 # mm3 <- {a3 b3 c3 d3}
+ movq %mm4, %mm2
+ punpckhwd %mm6, %mm2 # mm2 <- {a2 b2 c2 d2}
+
+ movq %mm5, %mm1
+ punpcklwd %mm7, %mm1 # mm1 <- {a1 b1 c1 d1}
+ movq %mm5, %mm0
+ punpckhwd %mm7, %mm0 # mm0 <- {a0 b0 c0 d0}
+
+ ret
+
+
+
+ # transpose 4x4
+
+ # input
+ # %mm3 == {d3 d2 d1 d0}
+ # %mm2 == {c3 c2 c1 c0}
+ # %mm1 == {b3 b2 b1 b0}
+ # %mm0 == {a3 a2 a1 a0}
+
+ # output
+ # %mm3 == {d3 c3 b3 a3}
+ # %mm2 == {d2 c2 b2 a2}
+ # %mm1 == {d1 c1 b1 a1}
+ # %mm0 == {d0 c0 b0 a0}
+
+
+ # subroutine: transpose the 4x4 word matrix held in %mm0-%mm3, in place.
+ # %mm4-%mm7 are used as scratch.
+ .align 16 # align the call target itself
+ .transpose_4x4:
+ movq %mm0, %mm4
+ punpcklwd %mm2, %mm4 # mm4 <- {c1 a1 c0 a0}
+ movq %mm0, %mm5
+ punpckhwd %mm2, %mm5 # mm5 <- {c3 a3 c2 a2}
+
+ movq %mm1, %mm6
+ punpcklwd %mm3, %mm6 # mm6 <- {d1 b1 d0 b0}
+ movq %mm1, %mm7
+ punpckhwd %mm3, %mm7 # mm7 <- {d3 b3 d2 b2}
+
+ movq %mm4, %mm0
+ punpcklwd %mm6, %mm0 # mm0 <- {d0 c0 b0 a0}
+ movq %mm4, %mm1
+ punpckhwd %mm6, %mm1 # mm1 <- {d1 c1 b1 a1}
+
+ movq %mm5, %mm2
+ punpcklwd %mm7, %mm2 # mm2 <- {d2 c2 b2 a2}
+ movq %mm5, %mm3
+ punpckhwd %mm7, %mm3 # mm3 <- {d3 c3 b3 a3}
+
+ ret
+
+
+.globl pixel_biquad_vertb_s16
+.type pixel_biquad_vertb_s16,@function
+
+
+# pixel_biquad_vertb_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
+#
+# top to bottom biquad on a strip of 4 adjacent pixel columns, in place.
+# the second argument is used as the number of 4 line blocks to process;
+# a count <= 0 leaves memory untouched.
+# %ebx, %esi and %edi are saved and restored; %eax, %ecx, %edx and the
+# mmx registers are overwritten.
+
+
+pixel_biquad_vertb_s16:
+
+
+ pushl %ebp
+ movl %esp, %ebp
+ push %ebx
+ push %esi
+ push %edi
+
+ movl 8(%ebp), %ebx # pixel array offset
+ movl 12(%ebp), %ecx # nb of 4x4 pixblocks
+ movl 16(%ebp), %edx # line width
+
+ testl %ecx, %ecx # a count <= 0 would make the decl/jnz
+ jle .biquad_vertb_done # loop below wrap around
+
+ movl 20(%ebp), %esi # coefs
+ movl 24(%ebp), %edi # state
+
+ shll $1, %edx # short int addressing
+ movl %edx, %eax
+ shll $1, %eax
+ addl %edx, %eax # eax = 3 * edx
+
+ .align 16
+ .biquad_vertb_line_loop:
+ movq (%ebx), %mm0 # 4 consecutive lines, 4 pixels each
+ movq (%ebx,%edx,1), %mm1
+ movq (%ebx,%edx,2), %mm2
+ movq (%ebx,%eax,1), %mm3
+ call .biquad_4x4_pixels
+ movq %mm0, (%ebx)
+ movq %mm1, (%ebx,%edx,1)
+ movq %mm2, (%ebx,%edx,2)
+ movq %mm3, (%ebx,%eax,1)
+ addl %edx, %ebx
+ addl %eax, %ebx # advance 4 lines
+ decl %ecx
+ jnz .biquad_vertb_line_loop
+
+ emms
+
+ .biquad_vertb_done:
+ pop %edi
+ pop %esi
+ pop %ebx
+ leave
+ ret
+
+.globl pixel_biquad_horlr_s16
+.type pixel_biquad_horlr_s16,@function
+
+
+# pixel_biquad_horlr_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
+#
+# left to right biquad on a strip of 4 adjacent pixel lines, in place.
+# the second argument is used as the number of 4 pixel wide blocks to
+# process; a count <= 0 leaves memory untouched.
+# %ebx, %esi and %edi are saved and restored; %eax, %ecx, %edx and the
+# mmx registers are overwritten.
+
+
+pixel_biquad_horlr_s16:
+
+
+ pushl %ebp
+ movl %esp, %ebp
+ push %ebx
+ push %esi
+ push %edi
+
+ movl 8(%ebp), %ebx # pixel array offset
+ movl 12(%ebp), %ecx # nb of 4x4 pixblocks
+ movl 16(%ebp), %edx # line width
+
+ testl %ecx, %ecx # a count <= 0 would make the decl/jnz
+ jle .biquad_horlr_done # loop below wrap around
+
+ movl 20(%ebp), %esi # coefs
+ movl 24(%ebp), %edi # state
+
+ shll $1, %edx # short int addressing
+ movl %edx, %eax
+ shll $1, %eax
+ addl %edx, %eax # eax = 3 * edx
+
+ .align 16
+ .biquad_horlr_line_loop:
+ movq (%ebx), %mm0 # 4 consecutive lines, 4 pixels each
+ movq (%ebx,%edx,1), %mm1
+ movq (%ebx,%edx,2), %mm2
+ movq (%ebx,%eax,1), %mm3
+ call .transpose_4x4 # one register per pixel column
+ call .biquad_4x4_pixels
+ call .transpose_4x4 # back to scanline order
+ movq %mm0, (%ebx)
+ movq %mm1, (%ebx,%edx,1)
+ movq %mm2, (%ebx,%edx,2)
+ movq %mm3, (%ebx,%eax,1)
+ addl $8, %ebx # advance 4 pixels to the right
+ decl %ecx
+ jnz .biquad_horlr_line_loop
+
+ emms
+
+ .biquad_horlr_done:
+ pop %edi
+ pop %esi
+ pop %ebx
+ leave
+ ret
+
+
+
diff --git a/system/mmx/pixel_biquad_s16.s b/system/mmx/pixel_biquad_s16.s
new file mode 100644
index 0000000..844b041
--- /dev/null
+++ b/system/mmx/pixel_biquad_s16.s
@@ -0,0 +1,451 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+
+
+ # DIRECT FORM II BIQUAD
+ #
+ # y[k] = b0 * x[k] + u1[k-1]
+ # u1[k] = b1 * x[k] + u2[k-1] - a1 * y[k]
+ # u2[k] = b2 * x[k] - a2 * y[k]
+ # MACRO: df2 <reg>
+ #
+ # computes a direct form 2 biquad
+ # does not use {mm0-mm3}\<inreg>
+ #
+ # input: <reg> == input
+ # %mm4 == state 1
+ # %mm5 == state 2
+ # (%esi) == biquad coefs (-a1 -a2 b0 b1 b2) in s1.14
+ # output: <reg> == output
+ # %mm4 == state 1
+ # %mm5 == state 2
+
+ # one biquad step on the 4 lanes of <reg>; state lives in %mm4/%mm5,
+ # %mm6 and %mm7 are scratch, coefficients are read from (%esi).
+ # the output is doubled twice with saturating adds; the state updates
+ # use wrapping adds (paddw).
+ .macro df2 reg
+ movq \reg, %mm6 # mm6 == x[k]
+ movq \reg, %mm7 # mm7 == x[k]
+ pmulhw 16(%esi), %mm6 # mm6 == x[k] * b0
+ pmulhw 24(%esi), %mm7 # mm7 == x[k] * b1
+ paddw %mm4, %mm6 # mm6 == x[k] * b0 + u1[k-1] == y[k]
+ paddw %mm5, %mm7 # mm7 == x[k] * b1 + u2[k-1]
+ paddsw %mm6, %mm6 # compensate for mul = x*y/4 (coefs are s1.14 fixed point)
+ paddsw %mm6, %mm6 # paddsw ensures saturation
+ movq \reg, %mm5 # mm5 == x[k]
+ movq %mm6, %mm4 # mm4 == y[k]
+ movq %mm6, \reg # reg == y[k] --------------------
+ pmulhw 0(%esi), %mm4 # mm4 == y[k] * (-a1)
+ pmulhw 8(%esi), %mm6 # mm6 == y[k] * (-a2)
+ pmulhw 32(%esi), %mm5 # mm5 == x[k] * b2
+ paddw %mm7, %mm4 # mm4 == u1[k] --------------------
+ paddw %mm6, %mm5 # mm5 == u2[k] --------------------
+ .endm
+
+
+ # input in register:
+ # %mm0-mm3: input 4x4 pixels {x0 x1 x2 x3}
+ # %esi: coef memory (-a1, -a2, b0, b1, b2) in s1.14
+ # %edi: state memory (u1, u2)
+
+ # return in register:
+ # %mm0-mm3: 4x4 pixels result
+
+
+
+
+ # run df2 on %mm0..%mm3 in that order, loading the two state vectors
+ # from (%edi) before and writing them back afterwards.
+ # NOTE(review): the .align below is emitted at every expansion site, so
+ # padding ends up inside the calling loop body -- confirm it is wanted.
+ .macro biquad_4x4_pixels
+ .align 16
+ movq 0(%edi), %mm4 # get state
+ movq 8(%edi), %mm5
+ df2 %mm0 # compute 4 biquads
+ df2 %mm1
+ df2 %mm2
+ df2 %mm3
+ movq %mm4, 0(%edi) # store state
+ movq %mm5, 8(%edi)
+ .endm
+
+
+
+ # in order to use the 4 line parallel biquad routine on horizontal
+ # lines, we need to reorder (rotate or transpose) the matrix, since
+ # images are scanline encoded, and we want to work in parallel
+ # on 4 lines.
+ #
+ # since the 4 lines are independent, it doesn't matter in which order
+ # the vector elements are present.
+ #
+ # this allows us to use the same routine for left->right and right->left
+ # processing.
+ #
+ # some comments on the non-abelian group of square isometries consisting of
+ # (I) identity
+ # (H) horizontal axis mirror
+ # (V) vertical axis mirror
+ # (T) transpose (diagonal axis mirror)
+ # (A) antitranspose (antidiagonal axis mirror)
+ # (R1) 90deg anticlockwise rotation
+ # (R2) 180deg rotation
+ # (R3) 90deg clockwise rotation
+ #
+ #
+ # we basically have two options: (R1,R3) or (T,A)
+ # we opt for T and A because they are self inverting, which improves locality
+ #
+ # use antitranspose for right to left and transpose
+ # for left to right (little endian)
+
+
+ # antitranspose 4x4
+
+ # input
+ # %mm3 == {d0 d1 d2 d3}
+ # %mm2 == {c0 c1 c2 c3}
+ # %mm1 == {b0 b1 b2 b3}
+ # %mm0 == {a0 a1 a2 a3}
+
+ # output
+ # %mm3 == {a3 b3 c3 d3}
+ # %mm2 == {a2 b2 c2 d2}
+ # %mm1 == {a1 b1 c1 d1}
+ # %mm0 == {a0 b0 c0 d0}
+
+
+ # mirror the 4x4 word matrix held in %mm0-%mm3 along its antidiagonal,
+ # in place. %mm4-%mm7 are used as scratch.
+ # (no colon after the macro name: .macro takes a plain name)
+ .macro antitranspose_4x4
+ movq %mm3, %mm4
+ punpcklwd %mm1, %mm4 # mm4 <- {b2 d2 b3 d3}
+ movq %mm3, %mm5
+ punpckhwd %mm1, %mm5 # mm5 <- {b0 d0 b1 d1}
+
+ movq %mm2, %mm6
+ punpcklwd %mm0, %mm6 # mm6 <- {a2 c2 a3 c3}
+ movq %mm2, %mm7
+ punpckhwd %mm0, %mm7 # mm7 <- {a0 c0 a1 c1}
+
+ movq %mm4, %mm3
+ punpcklwd %mm6, %mm3 # mm3 <- {a3 b3 c3 d3}
+ movq %mm4, %mm2
+ punpckhwd %mm6, %mm2 # mm2 <- {a2 b2 c2 d2}
+
+ movq %mm5, %mm1
+ punpcklwd %mm7, %mm1 # mm1 <- {a1 b1 c1 d1}
+ movq %mm5, %mm0
+ punpckhwd %mm7, %mm0 # mm0 <- {a0 b0 c0 d0}
+
+ .endm
+
+
+ # transpose 4x4
+
+ # input
+ # %mm3 == {d3 d2 d1 d0}
+ # %mm2 == {c3 c2 c1 c0}
+ # %mm1 == {b3 b2 b1 b0}
+ # %mm0 == {a3 a2 a1 a0}
+
+ # output
+ # %mm3 == {d3 c3 b3 a3}
+ # %mm2 == {d2 c2 b2 a2}
+ # %mm1 == {d1 c1 b1 a1}
+ # %mm0 == {d0 c0 b0 a0}
+
+
+ # transpose the 4x4 word matrix held in %mm0-%mm3, in place.
+ # %mm4-%mm7 are used as scratch.
+ # (no colon after the macro name: .macro takes a plain name)
+ .macro transpose_4x4
+ movq %mm0, %mm4
+ punpcklwd %mm2, %mm4 # mm4 <- {c1 a1 c0 a0}
+ movq %mm0, %mm5
+ punpckhwd %mm2, %mm5 # mm5 <- {c3 a3 c2 a2}
+
+ movq %mm1, %mm6
+ punpcklwd %mm3, %mm6 # mm6 <- {d1 b1 d0 b0}
+ movq %mm1, %mm7
+ punpckhwd %mm3, %mm7 # mm7 <- {d3 b3 d2 b2}
+
+ movq %mm4, %mm0
+ punpcklwd %mm6, %mm0 # mm0 <- {d0 c0 b0 a0}
+ movq %mm4, %mm1
+ punpckhwd %mm6, %mm1 # mm1 <- {d1 c1 b1 a1}
+
+ movq %mm5, %mm2
+ punpcklwd %mm7, %mm2 # mm2 <- {d2 c2 b2 a2}
+ movq %mm5, %mm3
+ punpckhwd %mm7, %mm3 # mm3 <- {d3 c3 b3 a3}
+
+ .endm
+
+.globl pixel_biquad_vertb_s16
+.type pixel_biquad_vertb_s16,@function
+
+
+# pixel_biquad_vertb_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
+#
+# top to bottom biquad on a strip of 4 adjacent pixel columns, in place.
+# the second argument is used as the number of 4 line blocks to process;
+# a count <= 0 leaves memory untouched.
+# %ebx, %esi and %edi are saved and restored; %eax, %ecx, %edx and the
+# mmx registers are overwritten.
+
+
+pixel_biquad_vertb_s16:
+
+
+ pushl %ebp
+ movl %esp, %ebp
+ push %ebx
+ push %esi
+ push %edi
+
+ movl 8(%ebp), %ebx # pixel array offset
+ movl 12(%ebp), %ecx # nb of 4x4 pixblocks
+ movl 16(%ebp), %edx # line width
+
+ testl %ecx, %ecx # a count <= 0 would make the decl/jnz
+ jle .biquad_vertb_done # loop below wrap around
+
+ movl 20(%ebp), %esi # coefs
+ movl 24(%ebp), %edi # state
+
+ shll $1, %edx # short int addressing
+ movl %edx, %eax
+ shll $1, %eax
+ addl %edx, %eax # eax = 3 * edx
+
+ .align 16
+ .biquad_vertb_line_loop:
+ movq (%ebx), %mm0 # 4 consecutive lines, 4 pixels each
+ movq (%ebx,%edx,1), %mm1
+ movq (%ebx,%edx,2), %mm2
+ movq (%ebx,%eax,1), %mm3
+ biquad_4x4_pixels
+ movq %mm0, (%ebx)
+ movq %mm1, (%ebx,%edx,1)
+ movq %mm2, (%ebx,%edx,2)
+ movq %mm3, (%ebx,%eax,1)
+ addl %edx, %ebx
+ addl %eax, %ebx # advance 4 lines
+ decl %ecx
+ jnz .biquad_vertb_line_loop
+
+ emms
+
+ .biquad_vertb_done:
+ pop %edi
+ pop %esi
+ pop %ebx
+ leave
+ ret
+.globl pixel_biquad_verbt_s16
+.type pixel_biquad_verbt_s16,@function
+
+
+# pixel_biquad_verbt_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
+#
+# bottom to top biquad on a strip of 4 adjacent pixel columns, in place.
+# the second argument is used as the number of 4 line blocks to process;
+# processing starts at the last block and walks back towards pixel_array.
+# a count <= 0 leaves memory untouched.
+# %ebx, %esi and %edi are saved and restored; %eax, %ecx, %edx and the
+# mmx registers are overwritten.
+
+
+pixel_biquad_verbt_s16:
+
+
+ pushl %ebp
+ movl %esp, %ebp
+ push %ebx
+ push %esi
+ push %edi
+
+ movl 8(%ebp), %ebx # pixel array offset
+ movl 12(%ebp), %ecx # nb of 4x4 pixblocks
+ movl 16(%ebp), %eax # line width
+
+ testl %ecx, %ecx # a count <= 0 would make the decl/jnz
+ jle .biquad_verbt_done # loop below wrap around
+
+ shll $3, %eax # 4 line byte spacing
+ decl %ecx
+ mul %ecx # eax = offset of last block (edx is clobbered)
+ incl %ecx
+ addl %eax, %ebx # ebx points to last pixblock
+
+ movl 16(%ebp), %edx # line width
+
+ movl 20(%ebp), %esi # coefs
+ movl 24(%ebp), %edi # state
+
+ shll $1, %edx # short int addressing
+ movl %edx, %eax
+ shll $1, %eax
+ addl %edx, %eax # eax = 3 * edx
+
+ .align 16
+ .biquad_verbt_line_loop:
+ movq (%ebx), %mm3 # lines are loaded in reverse order so
+ movq (%ebx,%edx,1), %mm2 # that mm0 holds the bottom line
+ movq (%ebx,%edx,2), %mm1
+ movq (%ebx,%eax,1), %mm0
+ biquad_4x4_pixels
+ movq %mm3, (%ebx)
+ movq %mm2, (%ebx,%edx,1)
+ movq %mm1, (%ebx,%edx,2)
+ movq %mm0, (%ebx,%eax,1)
+ subl %edx, %ebx
+ subl %eax, %ebx # step back 4 lines
+ decl %ecx
+ jnz .biquad_verbt_line_loop
+
+ emms
+
+ .biquad_verbt_done:
+ pop %edi
+ pop %esi
+ pop %ebx
+ leave
+ ret
+
+.globl pixel_biquad_horlr_s16
+.type pixel_biquad_horlr_s16,@function
+# pixel_biquad_horlr_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
+#
+# left to right biquad on a strip of 4 adjacent pixel lines, in place.
+# the second argument is used as the number of 4 pixel wide blocks to
+# process; a count <= 0 leaves memory untouched.
+# %ebx, %esi and %edi are saved and restored; %eax, %ecx, %edx and the
+# mmx registers are overwritten.
+
+pixel_biquad_horlr_s16:
+
+
+ pushl %ebp
+ movl %esp, %ebp
+ push %ebx
+ push %esi
+ push %edi
+
+ movl 8(%ebp), %ebx # pixel array offset
+ movl 12(%ebp), %ecx # nb of 4x4 pixblocks
+ movl 16(%ebp), %edx # line width
+
+ testl %ecx, %ecx # a count <= 0 would make the decl/jnz
+ jle .biquad_horlr_done # loop below wrap around
+
+ movl 20(%ebp), %esi # coefs
+ movl 24(%ebp), %edi # state
+
+ shll $1, %edx # short int addressing
+ movl %edx, %eax
+ shll $1, %eax
+ addl %edx, %eax # eax = 3 * edx
+
+ .align 16
+ .biquad_horlr_line_loop:
+ movq (%ebx), %mm0 # 4 consecutive lines, 4 pixels each
+ movq (%ebx,%edx,1), %mm1
+ movq (%ebx,%edx,2), %mm2
+ movq (%ebx,%eax,1), %mm3
+ transpose_4x4 # one register per pixel column
+ biquad_4x4_pixels
+ transpose_4x4 # back to scanline order
+ movq %mm0, (%ebx)
+ movq %mm1, (%ebx,%edx,1)
+ movq %mm2, (%ebx,%edx,2)
+ movq %mm3, (%ebx,%eax,1)
+ addl $8, %ebx # advance 4 pixels to the right
+ decl %ecx
+ jnz .biquad_horlr_line_loop
+
+ emms
+
+ .biquad_horlr_done:
+ pop %edi
+ pop %esi
+ pop %ebx
+ leave
+ ret
+
+
+.globl pixel_biquad_horrl_s16
+.type pixel_biquad_horrl_s16,@function
+# pixel_biquad_horrl_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
+#
+# right to left biquad on a strip of 4 adjacent pixel lines, in place.
+# the second argument is used as the number of 4 pixel wide blocks to
+# process; processing starts at the last block and walks back towards
+# pixel_array. a count <= 0 leaves memory untouched.
+# %ebx, %esi and %edi are saved and restored; %eax, %ecx, %edx and the
+# mmx registers are overwritten.
+
+pixel_biquad_horrl_s16:
+
+ pushl %ebp
+ movl %esp, %ebp
+ push %ebx
+ push %esi
+ push %edi
+
+ movl 8(%ebp), %ebx # pixel array offset
+ movl 12(%ebp), %ecx # nb of 4x4 pixblocks
+ movl 16(%ebp), %edx # line width
+
+ testl %ecx, %ecx # a count <= 0 would make the decl/jnz
+ jle .biquad_horrl_done # loop below wrap around
+
+ movl %ecx, %eax
+ decl %eax
+ shll $3, %eax # 8 bytes per block
+ addl %eax, %ebx # ebx points to last pixblock
+
+
+ movl 20(%ebp), %esi # coefs
+ movl 24(%ebp), %edi # state
+
+ shll $1, %edx # short int addressing
+ movl %edx, %eax
+ shll $1, %eax
+ addl %edx, %eax # eax = 3 * edx
+
+ .align 16
+ .biquad_horrl_line_loop:
+ movq (%ebx), %mm0 # 4 consecutive lines, 4 pixels each
+ movq (%ebx,%edx,1), %mm1
+ movq (%ebx,%edx,2), %mm2
+ movq (%ebx,%eax,1), %mm3
+ antitranspose_4x4 # one register per pixel column, rightmost first
+ biquad_4x4_pixels
+ antitranspose_4x4 # back to scanline order
+ movq %mm0, (%ebx)
+ movq %mm1, (%ebx,%edx,1)
+ movq %mm2, (%ebx,%edx,2)
+ movq %mm3, (%ebx,%eax,1)
+ subl $8, %ebx # step back 4 pixels
+ decl %ecx
+ jnz .biquad_horrl_line_loop
+
+ emms
+
+ .biquad_horrl_done:
+ pop %edi
+ pop %esi
+ pop %ebx
+ leave
+ ret
+
+
+.globl pixel_biquad_time_s16
+.type pixel_biquad_time_s16,@function
+# pixel_biquad_time_s16(short int *pixel_array, short int *s1, short int *s2, short int *coefs, int nb_4_pix_vectors)
+#
+# one df2 biquad step per pixel, with a separate state pair per pixel:
+# pixel_array, s1 and s2 are walked together, 8 bytes (4 pixels) at a
+# time, and all three are updated in place.
+# a vector count <= 0 leaves memory untouched.
+# %ebx, %esi and %edi are saved and restored; %ecx, %edx and the mmx
+# registers are overwritten.
+
+pixel_biquad_time_s16:
+
+ pushl %ebp
+ movl %esp, %ebp
+ push %ebx
+ push %esi
+ push %edi
+
+ movl 8(%ebp), %ebx # pixel array offset
+ movl 12(%ebp), %edx # state 1 array
+ movl 16(%ebp), %edi # state 2 array
+
+ movl 20(%ebp), %esi # coefs
+ movl 24(%ebp), %ecx # nb of 4 pixel vectors
+
+ testl %ecx, %ecx # a count <= 0 would make the decl/jnz
+ jle .biquad_time_done # loop below wrap around
+
+ .align 16
+ .biquad_time_loop:
+ movq (%ebx), %mm0 # get input
+ movq (%edx), %mm4 # get state 1
+ movq (%edi), %mm5 # get state 2
+ df2 %mm0 # compute direct form 2
+ movq %mm0, (%ebx) # write output
+ movq %mm5, (%edi) # write state 2
+ movq %mm4, (%edx) # write state 1
+ addl $8, %ebx
+ addl $8, %edi
+ addl $8, %edx
+ decl %ecx
+ jnz .biquad_time_loop
+
+ emms
+
+ .biquad_time_done:
+ pop %edi
+ pop %esi
+ pop %ebx
+ leave
+ ret
+
+
diff --git a/system/mmx/pixel_ca_s1.s b/system/mmx/pixel_ca_s1.s
new file mode 100644
index 0000000..d9c730f
--- /dev/null
+++ b/system/mmx/pixel_ca_s1.s
@@ -0,0 +1,189 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+
+ # this file contains assembler routines for 2D 1 bit cellular automata
+ # processing. it is organized around a feeder kernel and a
+ # stack based bit processor (virtual forth machine)
+ #
+ # the feeder kernel is responsible for loading/storing CA cells
+ # from/to memory. data in memory is organized as a scanline
+ # encoded toroidal bitplane (lsb = left). to simplify the kernel, the top
+ # left corner of the rectangular grid of pixels will shift down
+ # every processing step.
+ #
+ # the stack machine has the following architecture:
+ # CA stack: %esi, TOS: %mm0 (32x2 pixels. lsw = top row)
+ # CA horizon: %mm4-%mm7 (64x4 pixels. %mm4 = top row)
+ #
+ # the stack size / organization is not known to the stack machine.
+ # it can be thought of as operating on a 3x3 cell neighbourhood.
+ # the only purpose of the forth program is to determine the CA local update rule.
+ #
+ # the machine is supposed to be very minimal. no looping control.
+ # no addressing modes. no conditional code (hey, this is an experiment!)
+ # so recursion is not allowed (no way to stop it)
+ # there are 9 words to load the cell neighbourhood on the stack.
+ # the rest is just logic and stack manips.
+
+
+ # this file contains pure asm macros. it is to be included before assembly
+ # after scaforth.pl has processed the .scaf file
+
+
+ # *************************** CA CELL ACCESS MACROS *****************************
+ # fetchTL - fetchBR
+
+ # shift / load rectangle macros:
+
+ # shift rectangle horizontal
+ # result is in reg1
+ # after expansion:
+ # low dword of reg1 <- bits [-count, 32-count) of the original reg1
+ # high dword of reg1 <- bits [32-count, 64-count) of the original reg2
+ # bits outside the 64 bit source registers read as 0. reg2 is overwritten.
+ # count is an assembly time constant; this file uses -1, 0 and 1.
+ .macro shift reg1 reg2 count
+ psllq $(32+\count), \reg1
+ psrlq $(32-\count), \reg2
+ psrlq $32, \reg1
+ psllq $32, \reg2
+ por \reg2, \reg1
+ .endm
+
+ # copy horizon rows %mm4 and %mm5 into reg1 and reg2
+ .macro ldtop reg1 reg2
+ movq %mm4, \reg1
+ movq %mm5, \reg2
+ .endm
+
+ # copy horizon rows %mm5 and %mm6 into reg1 and reg2
+ .macro ldcenter reg1 reg2
+ movq %mm5, \reg1
+ movq %mm6, \reg2
+ .endm
+
+ # copy horizon rows %mm6 and %mm7 into reg1 and reg2
+ .macro ldbottom reg1 reg2
+ movq %mm6, \reg1
+ movq %mm7, \reg2
+ .endm
+
+
+ # fetch from top row
+
+ # fetch the top left square
+ # the three macros below overwrite %mm0 (TOS) with rows %mm4/%mm5 combined
+ # by the shift macro, and use %mm1 as scratch. they do not save the old
+ # TOS (presumably the caller pushes it first -- TODO confirm).
+ .macro fetchTL
+ ldtop %mm0, %mm1
+ shift %mm0, %mm1, -1
+ .endm
+
+ # fetch the top mid square
+ .macro fetchTM
+ ldtop %mm0, %mm1
+ shift %mm0, %mm1, 0
+ .endm
+
+ # fetch the top right square
+ .macro fetchTR
+ ldtop %mm0, %mm1
+ shift %mm0, %mm1, 1
+ .endm
+
+
+
+ # fetch from center row
+
+ # fetch the mid left square
+ # the three macros below overwrite %mm0 (TOS) with rows %mm5/%mm6 combined
+ # by the shift macro, and use %mm1 as scratch. they do not save the old
+ # TOS (presumably the caller pushes it first -- TODO confirm).
+ .macro fetchML
+ ldcenter %mm0, %mm1
+ shift %mm0, %mm1, -1
+ .endm
+
+ # fetch the mid mid square
+ .macro fetchMM
+ ldcenter %mm0, %mm1
+ shift %mm0, %mm1, 0
+ .endm
+
+ # fetch the mid right square
+ .macro fetchMR
+ ldcenter %mm0, %mm1
+ shift %mm0, %mm1, 1
+ .endm
+
+
+
+
+
+ # fetch from bottom row
+
+ # fetch the bottom left square
+ # the three macros below overwrite %mm0 (TOS) with rows %mm6/%mm7 combined
+ # by the shift macro, and use %mm1 as scratch. they do not save the old
+ # TOS (presumably the caller pushes it first -- TODO confirm).
+ .macro fetchBL
+ ldbottom %mm0, %mm1
+ shift %mm0, %mm1, -1
+ .endm
+
+ # fetch the bottom mid square
+ .macro fetchBM
+ ldbottom %mm0, %mm1
+ shift %mm0, %mm1, 0
+ .endm
+
+ # fetch the bottom right square
+ .macro fetchBR
+ ldbottom %mm0, %mm1
+ shift %mm0, %mm1, 1
+ .endm
+
+
+
+ # *************************** CA STACK MANIP MACROS *****************************
+ # dup drop dropdup swap nip dropover
+
+ # stack layout used below: %mm0 is the top of stack, (%esi) the second
+ # element, 8(%esi) the third; the stack grows towards lower addresses.
+
+ # push a copy of TOS: esi -= 8, then store %mm0 at (%esi)
+ .macro dup
+ lea -8(%esi), %esi
+ movq %mm0, (%esi)
+ .endm
+
+ # pop: the second element becomes TOS, esi += 8
+ .macro drop
+ movq (%esi), %mm0
+ lea 8(%esi), %esi
+ .endm
+
+ # drop followed by dup: TOS <- second element, stack pointer unchanged
+ .macro dropdup
+ movq (%esi), %mm0
+ .endm
+
+ # exchange TOS and the second element (uses %mm1 as scratch)
+ .macro swap
+ movq (%esi), %mm1
+ movq %mm0, (%esi)
+ movq %mm1, %mm0
+ .endm
+
+ # discard the second element: esi += 8, TOS unchanged
+ .macro nip
+ lea 8(%esi), %esi
+ .endm
+
+ # drop followed by over: TOS <- third element, stack pointer unchanged
+ .macro dropover
+ movq 8(%esi), %mm0
+ .endm
+
+
+ # *************************** CA BOOLEAN LOGIC MACROS *****************************
+ # overxor
+
+ # TOS <- TOS xor second element; the stack pointer and memory are unchanged
+ .macro overxor
+ pxor (%esi), %mm0
+ .endm
+
+
+
+
+
diff --git a/system/mmx/pixel_cascade_s16.s b/system/mmx/pixel_cascade_s16.s
new file mode 100644
index 0000000..bf88d08
--- /dev/null
+++ b/system/mmx/pixel_cascade_s16.s
@@ -0,0 +1,330 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+
+
+ # TODO: COUPLED CASCADE SECOND ORDER SECTION
+ #
+ # s1[k] = ar * s1[k-1] + ai * s2[k-1] + x[k]
+ # s2[k] = ar * s2[k-1] - ai * s1[k-1]
+ # y[k] = c0 * x[k] + c1 * s1[k-1] + c2 * s2[k-1]
+
+
+ # MACRO: coupled
+ #
+ # computes one time step of a coupled cascade second order section
+ # on 4 independent 16 bit lanes
+ #
+ # input: %mm0 == input x[k]
+ # %mm1 == state 1 s1[k-1]
+ # %mm2 == state 2 s2[k-1]
+ # (%esi) == cascade coefs (ar ai c0 c1 c2) in s0.15,
+ # one 4 x 16 bit vector each, at offsets 0 8 16 24 32
+ # output: %mm0 == output y[k]
+ # %mm1 == state 1 s1[k]
+ # %mm2 == state 2 s2[k]
+ # clobbers: %mm3 %mm4 %mm5 %mm6
+ #
+ # NOTE(review): pmulhw keeps the high word of the 32 bit product, so a
+ # s0.15 * s0.15 product comes out scaled by 1/2 and no correcting shift
+ # is applied here -- confirm the intended coefficient scaling.
+
+
+ .macro coupled
+ movq %mm1, %mm3 # mm3 == s1[k-1]
+ movq %mm1, %mm4 # mm4 == s1[k-1]
+ movq %mm2, %mm5 # mm5 == s2[k-1]
+ movq %mm2, %mm6 # mm6 == s2[k-1]
+ pmulhw (%esi), %mm1 # mm1 == s1[k-1] * ar
+ pmulhw 8(%esi), %mm3 # mm3 == s1[k-1] * ai
+ pmulhw 24(%esi), %mm4 # mm4 == s1[k-1] * c1
+ pmulhw (%esi), %mm2 # mm2 == s2[k-1] * ar
+ pmulhw 8(%esi), %mm5 # mm5 == s2[k-1] * ai
+ pmulhw 32(%esi), %mm6 # mm6 == s2[k-1] * c2
+ paddw %mm5, %mm1 # mm1 == s1[k-1] * ar + s2[k-1] * ai
+ psubw %mm3, %mm2 # mm2 == s2[k-1] * ar - s1[k-1] * ai == s2[k]
+ paddw %mm0, %mm1 # mm1 == s1[k]
+ pmulhw 16(%esi), %mm0 # mm0 == x[k] * c0
+ paddw %mm6, %mm4 # mm4 == s1[k-1] * c1 + s2[k-1] * c2
+ paddw %mm4, %mm0 # mm0 == y[k]
+ .endm
+
+
+
+
+ # in order to use the 4 line parallel cascade routine on horizontal
+ # lines, we need to reorder (rotate or transpose) the matrix, since
+ # images are scanline encoded, and we want to work in parallel
+ # on 4 lines.
+ #
+ # since the 4 lines are independent, it doesn't matter in which order
+ # the vector elements are present.
+ #
+ # this allows us to use the same routine for left->right and right->left
+ # processing.
+ #
+ # some comments on the non-abelian group of square isometries consisting of
+ # (I) identity
+ # (H) horizontal axis mirror
+ # (V) vertical axis mirror
+ # (T) transpose (diagonal axis mirror)
+ # (A) antitranspose (antidiagonal axis mirror)
+ # (R1) 90deg anticlockwise rotation
+ # (R2) 180deg rotation
+ # (R3) 90deg clockwise rotation
+ #
+ #
+ # we basically have two options: (R1,R3) or (T,A)
+ # we opt for T and A because they are self inverting, which improves locality
+ #
+ # use antitranspose for right to left and transpose
+ # for left to right (little endian)
+
+
+ # antitranspose 4x4
+
+ # input
+ # %mm3 == {d0 d1 d2 d3}
+ # %mm2 == {c0 c1 c2 c3}
+ # %mm1 == {b0 b1 b2 b3}
+ # %mm0 == {a0 a1 a2 a3}
+
+ # output
+ # %mm3 == {a3 b3 c3 d3}
+ # %mm2 == {a2 b2 c2 d2}
+ # %mm1 == {a1 b1 c1 d1}
+ # %mm0 == {a0 b0 c0 d0}
+
+
+ # antitranspose_4x4: mirror the 4x4 word matrix held in %mm0..%mm3
+ # about its antidiagonal, in place (register notation in the comments
+ # lists the most significant word first).
+ # clobbers: %mm4 %mm5 %mm6 %mm7
+ # (the macro name carries no trailing colon: it is not a label)
+ .macro antitranspose_4x4
+ movq %mm3, %mm4
+ punpcklwd %mm1, %mm4 # mm4 <- {b2 d2 b3 d3}
+ movq %mm3, %mm5
+ punpckhwd %mm1, %mm5 # mm5 <- {b0 d0 b1 d1}
+
+ movq %mm2, %mm6
+ punpcklwd %mm0, %mm6 # mm6 <- {a2 c2 a3 c3}
+ movq %mm2, %mm7
+ punpckhwd %mm0, %mm7 # mm7 <- {a0 c0 a1 c1}
+
+ movq %mm4, %mm3
+ punpcklwd %mm6, %mm3 # mm3 <- {a3 b3 c3 d3}
+ movq %mm4, %mm2
+ punpckhwd %mm6, %mm2 # mm2 <- {a2 b2 c2 d2}
+
+ movq %mm5, %mm1
+ punpcklwd %mm7, %mm1 # mm1 <- {a1 b1 c1 d1}
+ movq %mm5, %mm0
+ punpckhwd %mm7, %mm0 # mm0 <- {a0 b0 c0 d0}
+
+ .endm
+
+
+ # transpose 4x4
+
+ # input
+ # %mm3 == {d3 d2 d1 d0}
+ # %mm2 == {c3 c2 c1 c0}
+ # %mm1 == {b3 b2 b1 b0}
+ # %mm0 == {a3 a2 a1 a0}
+
+ # output
+ # %mm3 == {d3 c3 b3 a3}
+ # %mm2 == {d2 c2 b2 a2}
+ # %mm1 == {d1 c1 b1 a1}
+ # %mm0 == {d0 c0 b0 a0}
+
+
+ # transpose_4x4: transpose the 4x4 word matrix held in %mm0..%mm3,
+ # in place (register notation in the comments lists the most
+ # significant word first).
+ # clobbers: %mm4 %mm5 %mm6 %mm7
+ # (the macro name carries no trailing colon: it is not a label)
+ .macro transpose_4x4
+ movq %mm0, %mm4
+ punpcklwd %mm2, %mm4 # mm4 <- {c1 a1 c0 a0}
+ movq %mm0, %mm5
+ punpckhwd %mm2, %mm5 # mm5 <- {c3 a3 c2 a2}
+
+ movq %mm1, %mm6
+ punpcklwd %mm3, %mm6 # mm6 <- {d1 b1 d0 b0}
+ movq %mm1, %mm7
+ punpckhwd %mm3, %mm7 # mm7 <- {d3 b3 d2 b2}
+
+ movq %mm4, %mm0
+ punpcklwd %mm6, %mm0 # mm0 <- {d0 c0 b0 a0}
+ movq %mm4, %mm1
+ punpckhwd %mm6, %mm1 # mm1 <- {d1 c1 b1 a1}
+
+ movq %mm5, %mm2
+ punpcklwd %mm7, %mm2 # mm2 <- {d2 c2 b2 a2}
+ movq %mm5, %mm3
+ punpckhwd %mm7, %mm3 # mm3 <- {d3 c3 b3 a3}
+
+ .endm
+
+.globl pixel_cascade_vertb_s16
+.type pixel_cascade_vertb_s16,@function
+
+
+# pixel_cascade_vertb_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
+#
+# runs the `coupled` section downwards over one column that is one mmx
+# word (4 pixels) wide, in place. the second argument is used as the loop
+# count and every loop pass handles 4 consecutive rows. linewidth is given
+# in pixels. the two state vectors are read from state[] on entry and
+# written back on exit.
+
+
+pixel_cascade_vertb_s16:
+
+
+ pushl %ebp
+ movl %esp, %ebp
+ push %ebx
+ push %esi
+ push %edi
+
+ movl 8(%ebp), %ebx # first row of the column
+ movl 12(%ebp), %ecx # loop count (4 rows per pass)
+ movl 16(%ebp), %edx # line width in pixels
+
+ movl 20(%ebp), %esi # coefs, read by `coupled`
+ movl 24(%ebp), %edi # state
+
+ shll $1, %edx # pixels -> bytes (16 bit pixels)
+ subl %edx, %ebx # start one row early: the pointer is bumped before each store
+
+ movq 0(%edi), %mm1 # s1[k-1]
+ movq 8(%edi), %mm2 # s2[k-1]
+ .align 16
+ .cascade_vertb_row_loop:
+
+ .rept 4 # 4 rows per pass, unrolled by the assembler
+ movq (%ebx,%edx,1), %mm3 # fetch the next row ...
+ movq %mm3, %mm0 # ... as filter input
+ addl %edx, %ebx # step to that row
+ coupled
+ movq %mm0, (%ebx) # overwrite it with the filter output
+
+ .endr
+ decl %ecx
+ jnz .cascade_vertb_row_loop
+
+ movq %mm1, 0(%edi) # save s1 for the next call
+ movq %mm2, 8(%edi) # save s2 for the next call
+
+ emms
+
+ pop %edi
+ pop %esi
+ pop %ebx
+ leave
+ ret
+
+.globl pixel_cascade_horlr_s16
+.type pixel_cascade_horlr_s16,@function
+
+
+# pixel_cascade_horlr_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
+#
+# runs the `coupled` section left to right over 4 scanlines in parallel,
+# in place. the second argument is used as the loop count: every pass
+# handles one 4x4 pixel block and then moves 4 pixels to the right.
+# each block is transposed so that one mmx word holds the same column of
+# the 4 lines, filtered column by column, and transposed back.
+# linewidth is given in pixels. the state vectors are reloaded from and
+# saved to state[] for every block.
+
+
+pixel_cascade_horlr_s16:
+
+
+ pushl %ebp
+ movl %esp, %ebp
+ push %ebx
+ push %esi
+ push %edi
+
+ movl 8(%ebp), %ebx # pixel array offset
+ movl 12(%ebp), %ecx # nb of 4x4 pixblocks
+ movl 16(%ebp), %edx # line width
+
+ movl 20(%ebp), %esi # coefs
+ movl 24(%ebp), %edi # state
+
+ shll $1, %edx # short int addressing
+ movl %edx, %eax
+ shll $1, %eax
+ addl %edx, %eax # eax = 3 * edx
+
+
+ .align 16
+ .cascade_horlr_line_loop:
+
+ movq (%ebx), %mm0 # load the 4x4 block, one scanline per register
+ movq (%ebx,%edx,1), %mm1
+ movq (%ebx,%edx,2), %mm2
+ movq (%ebx,%eax,1), %mm3
+
+ transpose_4x4
+
+ movq %mm1, (%ebx,%edx,1) # park columns 1..3, column 0 stays in mm0
+ movq %mm2, (%ebx,%edx,2)
+ movq %mm3, (%ebx,%eax,1)
+
+ # the state must be loaded only now: mm1 and mm2 were needed for
+ # the block load and the transpose above
+ movq 0(%edi), %mm1 # s1[k-1]
+ movq 8(%edi), %mm2 # s2[k-1]
+
+ coupled
+
+ movq %mm0, (%ebx) # store output column 0
+ movq (%ebx,%edx,1), %mm3 # fetch input column 1
+ movq %mm3, %mm0
+
+ coupled
+
+ movq %mm0, (%ebx, %edx,1) # store output column 1
+ movq (%ebx,%edx,2), %mm3 # fetch input column 2
+ movq %mm3, %mm0
+
+ coupled
+
+ movq %mm0, (%ebx, %edx,2) # store output column 2
+ movq (%ebx,%eax,1), %mm3 # fetch input column 3
+ movq %mm3, %mm0
+
+ coupled
+
+ movq %mm1, 0(%edi) # save s1 for the next block
+ movq %mm2, 8(%edi) # save s2 for the next block
+
+ movq %mm0, %mm3 # output column 3
+ movq (%ebx), %mm0 # reload output columns 0..2
+ movq (%ebx,%edx,1), %mm1
+ movq (%ebx,%edx,2), %mm2
+
+ transpose_4x4 # back to scanline order
+
+ movq %mm0, (%ebx)
+ movq %mm1, (%ebx,%edx,1)
+ movq %mm2, (%ebx,%edx,2)
+ movq %mm3, (%ebx,%eax,1)
+
+ addl $8, %ebx # next block: 4 pixels to the right
+ decl %ecx
+ jnz .cascade_horlr_line_loop
+
+ emms
+
+ pop %edi
+ pop %esi
+ pop %ebx
+ leave
+ ret
+
+
+
diff --git a/system/mmx/pixel_conv_hor_s16.s b/system/mmx/pixel_conv_hor_s16.s
new file mode 100644
index 0000000..e90a692
--- /dev/null
+++ b/system/mmx/pixel_conv_hor_s16.s
@@ -0,0 +1,134 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+ # intermediate function
+
+ # input in register:
+ # %mm0: left 4 pixels
+ # %mm1: middle 4 pixels
+ # %mm2: right 4 pixels
+
+ # %mm5: left 4 pixel masks
+ # %mm6: middle 4 pixel masks
+ # %mm7: right 4 pixel masks
+
+ # return in register:
+ # %mm0: middle 4 pixels result
+
+
+ # local helper, called with `call`: 3 tap horizontal convolution of
+ # the 4 pixels in %mm1. clobbers %mm0 (result), %mm3 and %mm4;
+ # %mm1, %mm2 and the masks in %mm5..%mm7 are preserved.
+ # the alignment directive comes before the label so that the label
+ # addresses the first instruction and not the padding.
+ .align 16
+ .conv_hor_4_pixels:
+
+ # compute quadruplet
+
+ # get left pixels
+ psrlq $48, %mm0 # shift word 3 to word 0
+ movq %mm1, %mm4
+ psllq $16, %mm4 # shift word 0,1,2 to 1,2,3
+ por %mm4, %mm0 # combine
+ pmulhw %mm5, %mm0
+ paddsw %mm0, %mm0 # fixed point correction: saturating double
+
+
+ # get middle pixels
+ movq %mm1, %mm4
+ pmulhw %mm6, %mm4
+ paddsw %mm4, %mm4 # saturating double (psllw would wrap -1 * -1)
+ paddsw %mm4, %mm0
+
+
+ # get right pixels
+ movq %mm2, %mm3
+ psllq $48, %mm3 # shift word 0 to word 3
+ movq %mm1, %mm4
+ psrlq $16, %mm4 # shift word 1,2,3 to 0,1,2
+ por %mm4, %mm3 # combine
+ pmulhw %mm7, %mm3
+ paddsw %mm3, %mm3 # saturating double
+ paddsw %mm3, %mm0 # accumulate
+
+ ret
+
+.globl pixel_conv_hor_s16
+.type pixel_conv_hor_s16,@function
+
+
+# pixel_conv_hor_s16(short int *pixel_array, int nb_4_pixel_vectors, short int border[4], short int mask[12])
+# in place horizontal 3 tap convolution of one row of signed 16 bit pixels.
+# mask holds three 4 x 16 bit vectors: left, middle and right tap weights.
+# border supplies the pixels just outside both ends of the row.
+# any vector count is accepted; a count <= 0 leaves the row untouched.
+# NOT TESTED
+
+
+pixel_conv_hor_s16:
+
+
+ pushl %ebp
+ movl %esp, %ebp
+ push %esi
+ push %edi
+
+ movl 8(%ebp), %esi # pixel array offset
+ movl 12(%ebp), %ecx # nb of 4 pixel vectors in the row
+
+ movl 20(%ebp), %edi # mask vector
+ movq (%edi), %mm5
+ movq 8(%edi), %mm6
+ movq 16(%edi), %mm7
+
+ movl 16(%ebp), %edi # boundary pixel vector
+
+ testl %ecx, %ecx
+ jle .conv_hor_done # empty row: nothing to do
+
+ movq (%edi), %mm0 # left neighbour of the first vector is the border
+ movq (%esi), %mm1
+
+ cmpl $1, %ecx
+ je .conv_hor_last # single vector: border on both sides
+
+ movq 8(%esi), %mm2
+
+ subl $2, %ecx # the last 2 vectors are handled after the loop
+ jz .conv_hor_tail # exactly 2 vectors: the loop body must not run
+
+ jmp .conv_line_loop
+
+
+ .align 16
+ .conv_line_loop:
+ call .conv_hor_4_pixels # compute conv
+ movq %mm0, (%esi) # store result
+ movq %mm1, %mm0 # mm0 <- prev (%esi)
+ movq %mm2, %mm1 # mm1 <- 8(%esi)
+ movq 16(%esi), %mm2 # mm2 <- 16(%esi)
+
+ addl $8, %esi # increase pointer
+ decl %ecx
+ jnz .conv_line_loop
+
+ .conv_hor_tail:
+ call .conv_hor_4_pixels # compute next to last vector
+ movq %mm0, (%esi) # store result
+ movq %mm1, %mm0 # mm0 <- prev (%esi)
+ movq %mm2, %mm1 # mm1 <- 8(%esi)
+ addl $8, %esi # increase pointer
+
+ .conv_hor_last:
+ movq (%edi), %mm2 # right neighbour of the last vector is the border
+ call .conv_hor_4_pixels # compute last vector
+ movq %mm0, (%esi) # store it
+
+ .conv_hor_done:
+ emms
+
+ pop %edi
+ pop %esi
+ leave
+ ret
+
+
+
diff --git a/system/mmx/pixel_conv_ver_s16.s b/system/mmx/pixel_conv_ver_s16.s
new file mode 100644
index 0000000..ae2456f
--- /dev/null
+++ b/system/mmx/pixel_conv_ver_s16.s
@@ -0,0 +1,128 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+#TODO: fix out of bound acces in conv_ver and conv_hor
+
+ # intermediate function
+
+ # input in register:
+ # %mm0: top 4 pixels
+ # %mm1: middle 4 pixels
+ # %mm2: bottom 4 pixels
+
+ # %mm5: top 4 pixel mask
+ # %mm6: middle 4 pixel mask
+ # %mm7: bottom 4 pixel mask
+
+ # return in register:
+ # %mm0: middle 4 pixels result
+
+
+ # local helper, called with `call`: 3 tap vertical convolution.
+ # clobbers %mm0 (result), %mm3 and %mm4; %mm1, %mm2 and the masks
+ # in %mm5..%mm7 are preserved.
+ # the alignment directive comes before the label so that the label
+ # addresses the first instruction and not the padding.
+ .align 16
+ .conv_ver_4_pixels:
+
+ # compute quadruplet
+
+ # get top pixel
+ pmulhw %mm5, %mm0
+ paddsw %mm0, %mm0 # fixed point correction: saturating double
+
+ # get middle pixel
+ movq %mm1, %mm4
+ pmulhw %mm6, %mm4
+ paddsw %mm4, %mm4 # saturating double (psllw would wrap -1 * -1)
+ paddsw %mm4, %mm0
+
+ # get bottom pixel
+ movq %mm2, %mm3
+ pmulhw %mm7, %mm3
+ paddsw %mm3, %mm3 # saturating double
+ paddsw %mm3, %mm0
+
+ ret
+
+.globl pixel_conv_ver_s16
+.type pixel_conv_ver_s16,@function
+
+
+# pixel_conv_ver_s16(short int *pixel_array, int nb_rows, int row_size, short int border[4], short int mask[12])
+# in place vertical 3 tap convolution of one column that is 4 pixels wide.
+# row_size is the distance between rows in 16 bit pixels (it is doubled
+# below to get bytes). mask holds three 4 x 16 bit vectors: top, middle
+# and bottom tap weights. border supplies the pixels just above the first
+# and below the last row.
+# any row count is accepted; a count <= 0 leaves the column untouched.
+# NOT TESTED
+
+
+pixel_conv_ver_s16:
+
+
+ pushl %ebp
+ movl %esp, %ebp
+ push %esi
+ push %edi
+
+ movl 8(%ebp), %esi # pixel array offset
+ movl 12(%ebp), %ecx # nb of rows to process
+ movl 16(%ebp), %edx # row size in pixels
+
+ movl 24(%ebp), %edi # mask vector
+ movq (%edi), %mm5
+ movq 8(%edi), %mm6
+ movq 16(%edi), %mm7
+
+ movl 20(%ebp), %edi # edge vector
+
+
+ shll $1, %edx # pixels -> bytes
+
+ testl %ecx, %ecx
+ jle .conv_ver_done # no rows: nothing to do
+
+ movq (%edi), %mm0 # neighbour above the first row is the border
+ movq (%esi), %mm1
+
+ cmpl $1, %ecx
+ je .conv_ver_last # single row: border above and below
+
+ movq (%esi,%edx,1), %mm2
+
+ subl $2, %ecx # the last 2 rows are handled after the loop
+ jz .conv_ver_tail # exactly 2 rows: the loop body must not run
+
+ jmp .conv_line_loop
+
+
+ .align 16
+ .conv_line_loop:
+ call .conv_ver_4_pixels # compute conv
+ movq %mm0, (%esi) # store result
+ movq %mm1, %mm0 # mm0 <- prev (%esi)
+ movq %mm2, %mm1 # mm1 <- (%esi,%edx,1)
+ movq (%esi,%edx,2), %mm2 # mm2 <- (%esi,%edx,2)
+
+ addl %edx, %esi # increase pointer
+ decl %ecx
+ jnz .conv_line_loop
+
+ .conv_ver_tail:
+ call .conv_ver_4_pixels # compute next to last row
+ movq %mm0, (%esi) # store result
+ movq %mm1, %mm0 # mm0 <- prev (%esi)
+ movq %mm2, %mm1 # mm1 <- (%esi,%edx,1)
+ addl %edx, %esi # increase pointer
+
+ .conv_ver_last:
+ movq (%edi), %mm2 # neighbour below the last row is the border
+ call .conv_ver_4_pixels # compute last vector
+ movq %mm0, (%esi) # store it
+
+ .conv_ver_done:
+ emms
+
+ pop %edi
+ pop %esi
+ leave
+ ret
+
+
+
diff --git a/system/mmx/pixel_crot_s16.s b/system/mmx/pixel_crot_s16.s
new file mode 100644
index 0000000..2427869
--- /dev/null
+++ b/system/mmx/pixel_crot_s16.s
@@ -0,0 +1,153 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_crot3d_s16
+.type pixel_crot3d_s16,@function
+
+
+# void pixel_crot3d_s16(int *buf, int nb_4pixel_vectors_per_plane, short int *matrix)
+#
+# in place 3 dimensional colour space rotation of three consecutive planes
+# of signed 16 bit pixels, 4 pixels per loop pass.
+# the matrix is stored column by column, every coefficient replicated into
+# a 4 x 16 bit vector (9 vectors, 8 bytes apart).
+
+pixel_crot3d_s16:
+ pushl %ebp
+ movl %esp, %ebp
+ push %esi
+ push %edi
+
+
+ movl 8(%ebp), %esi # first plane
+ movl 16(%ebp), %edi # rotation matrix
+ movl 12(%ebp), %ecx # vectors per plane == loop count
+ leal (,%ecx,8), %edx # plane spacing in bytes
+
+
+ .align 16
+ .crot3d_next_vector:
+
+ movq (%esi), %mm0 # 1st component
+ movq (%esi,%edx,1), %mm3 # 2nd component
+ movq (%esi,%edx,2), %mm6 # 3rd component
+
+ # mm0 mm1 mm2 collect the three output components
+
+ movq %mm0, %mm1 # 1st component times first column
+ movq %mm0, %mm2
+ pmulhw (%edi), %mm0
+ pmulhw 8(%edi), %mm1
+ pmulhw 16(%edi), %mm2
+
+ movq %mm3, %mm4 # 2nd component times second column
+ movq %mm3, %mm5
+ pmulhw 24(%edi), %mm3
+ pmulhw 32(%edi), %mm4
+ pmulhw 40(%edi), %mm5
+
+ paddsw %mm3, %mm0 # add to the outputs, saturating
+ paddsw %mm4, %mm1
+ paddsw %mm5, %mm2
+
+ movq %mm6, %mm7 # 3rd component times third column
+ movq %mm6, %mm3 # (mm3 is free again)
+ pmulhw 48(%edi), %mm6
+ pmulhw 56(%edi), %mm7
+ pmulhw 64(%edi), %mm3
+
+ paddsw %mm6, %mm0 # add to the outputs, saturating
+ paddsw %mm7, %mm1
+ paddsw %mm3, %mm2
+
+ paddsw %mm0, %mm0 # x2 with saturation: undo the pmulhw scaling
+ paddsw %mm1, %mm1
+ paddsw %mm2, %mm2
+
+ movq %mm0, (%esi) # write the three planes back
+ movq %mm1, (%esi, %edx, 1)
+ movq %mm2, (%esi, %edx, 2)
+
+ addl $8, %esi # next 4 pixels
+ decl %ecx
+ jnz .crot3d_next_vector
+
+ emms
+
+ pop %edi
+ pop %esi
+ leave
+ ret
+
+
+.globl pixel_crot2d_s16
+.type pixel_crot2d_s16,@function
+
+# void pixel_crot2d_s16(int *buf, int nb_4pixel_vectors_per_plane, short int *matrix)
+#
+# in place 2 dimensional colour space rotation of two consecutive planes
+# of signed 16 bit pixels, 4 pixels per loop pass.
+# the matrix is stored column by column, every coefficient replicated into
+# a 4 x 16 bit vector (4 vectors, 8 bytes apart).
+
+pixel_crot2d_s16:
+ pushl %ebp
+ movl %esp, %ebp
+ push %esi
+ push %edi
+
+
+ movl 8(%ebp), %esi # first plane
+ movl 16(%ebp), %edi # rotation matrix
+ movl 12(%ebp), %ecx # vectors per plane == loop count
+ leal (,%ecx,8), %edx # plane spacing in bytes
+
+
+ .align 16
+ .crot2d_next_vector:
+
+ movq (%esi), %mm0 # 1st component times first column
+ movq %mm0, %mm1
+ pmulhw (%edi), %mm0
+ pmulhw 8(%edi), %mm1
+
+ movq (%esi,%edx,1), %mm2 # 2nd component times second column
+ movq %mm2, %mm3
+ pmulhw 16(%edi), %mm2
+ pmulhw 24(%edi), %mm3
+
+ paddsw %mm2, %mm0 # sum per output component, saturating
+ paddsw %mm3, %mm1
+
+ paddsw %mm0, %mm0 # x2 with saturation: undo the pmulhw scaling
+ paddsw %mm1, %mm1
+
+ movq %mm0, (%esi) # write both planes back
+ movq %mm1, (%esi, %edx, 1)
+
+ addl $8, %esi # next 4 pixels
+ decl %ecx
+ jnz .crot2d_next_vector
+
+ emms
+
+ pop %edi
+ pop %esi
+ leave
+ ret
+
diff --git a/system/mmx/pixel_gain.s b/system/mmx/pixel_gain.s
new file mode 100644
index 0000000..5cd5057
--- /dev/null
+++ b/system/mmx/pixel_gain.s
@@ -0,0 +1,83 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_gain
+.type pixel_gain,@function
+
+# mmx rgba pixel gain
+# void pixel_gain(char *pixelarray, int32 nbpixels, short int rgba_gain[4])
+# gains are 7.9 fixed point for rgba
+#
+# scales 8 bit rgba pixels in place, 4 pixels (16 bytes) per loop pass.
+# nbpixels is divided by 4; a remainder of 1..3 pixels is not processed.
+# NOTE(review): `prefetch` looks like the 3DNow! prefetch mnemonic, so
+# this routine may not run on every mmx capable cpu -- confirm.
+
+pixel_gain:
+ pushl %ebp
+ movl %esp, %ebp
+ push %esi
+ push %edi
+
+ movl 8(%ebp), %esi # pixel array offset
+ movl 12(%ebp), %ecx # nb of elements
+ movl 16(%ebp), %edi # int16[4] array of gains
+
+ prefetch (%esi)
+
+ emms
+ sarl $2, %ecx # process 4 pixels per loop iteration
+ jz .exit # fewer than 4 pixels: nothing to do
+ movq (%edi), %mm7 # read gain array from memory
+ jmp .loop_gain
+
+ .align 16
+ .loop_gain:
+
+ prefetch 128(%esi)
+ movq (%esi), %mm5 # load pixel 1-2 from memory
+ movq 8(%esi), %mm6 # load pixel 3-4 from memory
+ pxor %mm0, %mm0 # zero mm0 - mm3
+ pxor %mm1, %mm1
+ pxor %mm2, %mm2
+ pxor %mm3, %mm3
+ punpcklbw %mm5, %mm0 # unpack 1st pixel into 8.8 bit ints
+ punpckhbw %mm5, %mm1 # unpack 2nd
+ punpcklbw %mm6, %mm2 # unpack 3rd
+ punpckhbw %mm6, %mm3 # unpack 4th
+ psrlw $0x1, %mm0 # shift right to clear sign bit 9.7
+ psrlw $0x1, %mm1
+ psrlw $0x1, %mm2
+ psrlw $0x1, %mm3
+
+ pmulhw %mm7, %mm0 # multiply 1st pixel 9.7 * 7.9 -> 16.0
+ pmulhw %mm7, %mm1 # multiply 2nd
+ pmulhw %mm7, %mm2 # multiply 3rd
+ pmulhw %mm7, %mm3 # multiply 4th
+
+ packuswb %mm1, %mm0 # pack & saturate to 8bit vector
+ movq %mm0, (%esi) # store result in memory
+ packuswb %mm3, %mm2 # pack & saturate to 8bit vector
+ movq %mm2, 8(%esi) # store result in memory
+
+ addl $16, %esi # increment source pointer
+ decl %ecx
+ jnz .loop_gain # loop
+
+ .exit:
+ emms
+
+ pop %edi
+ pop %esi
+ leave
+ ret
+
diff --git a/system/mmx/pixel_gain_s16.s b/system/mmx/pixel_gain_s16.s
new file mode 100644
index 0000000..adcfdf5
--- /dev/null
+++ b/system/mmx/pixel_gain_s16.s
@@ -0,0 +1,71 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_gain_s16
+.type pixel_gain_s16,@function
+
+# void pixel_gain_s16(int *buf, int nb_8pixel_vectors, short int gain[4], unsigned long long *shift)
+#
+# multiplies signed 16 bit pixels in place by an integer gain and shifts
+# the 32 bit product right (arithmetic) by *shift before saturating it
+# back to 16 bit. the second argument is the loop count; every pass
+# handles one 8 byte vector (4 pixels).
+
+pixel_gain_s16:
+ pushl %ebp
+ movl %esp, %ebp
+ push %esi
+ push %edi
+
+ movl 16(%ebp), %edi
+ movq (%edi), %mm7 # gain vector
+
+ movl 20(%ebp), %edi
+ movq (%edi), %mm6 # shift count
+
+ movl 8(%ebp), %esi # pixel buffer
+ movl 12(%ebp), %ecx # loop count
+
+
+ .align 16
+ .loop_gain_s16:
+
+ movq (%esi), %mm0 # 4 pixels
+ movq %mm0, %mm1
+ pmullw %mm7, %mm0 # low words of the 4 products
+ pmulhw %mm7, %mm1 # high words of the 4 products
+
+ movq %mm0, %mm2 # second copy of the low words
+
+ punpcklwd %mm1, %mm0 # 32 bit products of pixels 0 and 1
+ punpckhwd %mm1, %mm2 # 32 bit products of pixels 2 and 3
+
+ psrad %mm6, %mm0 # signed shift down
+ psrad %mm6, %mm2
+
+ packssdw %mm2, %mm0 # back to 4 x 16 bit, saturating
+ movq %mm0, (%esi) # write back in place
+
+
+ addl $8, %esi # next vector
+ decl %ecx
+ jnz .loop_gain_s16
+
+ emms
+
+ pop %edi
+ pop %esi
+ leave
+ ret
+
diff --git a/system/mmx/pixel_mix_s16.s b/system/mmx/pixel_mix_s16.s
new file mode 100644
index 0000000..9bf41eb
--- /dev/null
+++ b/system/mmx/pixel_mix_s16.s
@@ -0,0 +1,68 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_mix_s16
+.type pixel_mix_s16,@function
+
+# void pixel_mix_s16(int *left, int *right, int nb_4pixel_vectors,
+# short int gain_left[4], short int gain_right[4])
+#
+# left <- left * gain_left + right * gain_right, on signed 16 bit pixels,
+# 4 pixels per loop pass. the result overwrites the left array.
+
+pixel_mix_s16:
+ pushl %ebp
+ movl %esp, %ebp
+ push %esi
+ push %edi
+
+ movl 24(%ebp), %edi
+ movq (%edi), %mm7 # right gain vector
+
+ movl 20(%ebp), %edi
+ movq (%edi), %mm6 # left gain vector
+
+ movl 16(%ebp), %ecx # loop count
+ movl 12(%ebp), %esi # right array
+ movl 8(%ebp), %edi # left array (also the destination)
+
+
+ .align 16
+ .loop_mix:
+
+ movq (%edi), %mm0 # 4 left pixels
+ movq (%esi), %mm1 # 4 right pixels
+ pmulhw %mm6, %mm0 # left * gain_left, high word
+ pmulhw %mm7, %mm1 # right * gain_right, high word
+ # pmulhw leaves (s).15 x (s).15 as (s0).14. mmx has no arithmetic
+ # shift left, so each term is doubled with a saturating add.
+ paddsw %mm0, %mm0
+ paddsw %mm1, %mm1
+ paddsw %mm1, %mm0 # mix, saturating
+ movq %mm0, (%edi)
+ addl $8, %edi
+ addl $8, %esi
+ decl %ecx
+ jnz .loop_mix
+
+ emms
+
+
+ pop %edi
+ pop %esi
+ leave
+ ret
+
diff --git a/system/mmx/pixel_mul_s16.s b/system/mmx/pixel_mul_s16.s
new file mode 100644
index 0000000..240a024
--- /dev/null
+++ b/system/mmx/pixel_mul_s16.s
@@ -0,0 +1,56 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_mul_s16
+.type pixel_mul_s16,@function
+
+# simple multiply
+# void pixel_mul_s16(int *left, int *right, int nb_4pixel_vectors)
+#
+# left <- left * right on signed 16 bit fixed point pixels, 4 pixels per
+# loop pass. the result overwrites the left array and saturates.
+
+pixel_mul_s16:
+ pushl %ebp
+ movl %esp, %ebp
+ push %esi
+ push %edi
+
+ movl 8(%ebp), %edi # left array
+ movl 12(%ebp), %esi # right array
+ movl 16(%ebp), %ecx # pixel count
+
+
+ .align 16
+ .loop_mul:
+
+# prefetch 128(%esi)
+ movq (%esi), %mm1 # load right 4 pixels from memory
+ movq (%edi), %mm0 # load 4 left pixels from memory
+ pmulhw %mm1, %mm0 # mul
+ # fixed point shift correction. a saturating add is used instead of
+ # psllw: the product of 0x8000 and 0x8000 has high word 0x4000, which a
+ # plain shift would wrap to 0x8000 (most negative) instead of clamping
+ # to 0x7fff. every other product doubles identically.
+ paddsw %mm0, %mm0
+ movq %mm0, (%edi)
+ addl $8, %esi
+ addl $8, %edi
+ decl %ecx
+ jnz .loop_mul # loop
+
+ emms
+
+
+ pop %edi
+ pop %esi
+ leave
+ ret
+
diff --git a/system/mmx/pixel_pack_s16u8.s b/system/mmx/pixel_pack_s16u8.s
new file mode 100644
index 0000000..57df702
--- /dev/null
+++ b/system/mmx/pixel_pack_s16u8.s
@@ -0,0 +1,126 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_pack_s16u8_y
+.type pixel_pack_s16u8_y,@function
+
+# pack signed 16 bit pixels into unsigned 8 bit pixels
+# void pixel_pack_s16u8_y(int *input, int *output, int nb_8pixel_vectors)
+#
+# every loop pass reads 16 bytes (8 pixels) from input and writes 8 bytes
+# to output. each pixel is shifted right by 7 (arithmetic) and saturated
+# to 0..255, so negative inputs become 0.
+
+pixel_pack_s16u8_y:
+ pushl %ebp
+ movl %esp, %ebp
+ push %esi
+ push %edi
+
+# movl 20(%ebp), %edi # int16[4] array of gains
+# movq (%edi), %mm7 # get gain array
+# psllw $1, %mm7 # adjust for shifted sign bit
+
+ movl 8(%ebp), %esi # input array
+ movl 12(%ebp), %edi # output array
+ movl 16(%ebp), %ecx # pixel count
+
+ pxor %mm6, %mm6 # zero vector; only the disabled code below reads it
+
+ .align 16
+ .loop_pack_y:
+
+# prefetch 128(%esi)
+ movq (%esi), %mm0 # load 4 pixels from memory
+# pmulhw %mm7, %mm0 # apply gain
+ movq 8(%esi), %mm1 # load 4 pixels from memory
+# pmulhw %mm7, %mm1 # apply gain
+
+# movq %mm0, %mm2
+# pcmpgtw %mm6, %mm2 # mm2 > 0 ? 0xffff : 0
+# pand %mm2, %mm0
+
+# movq %mm1, %mm3
+# pcmpgtw %mm6, %mm3 # mm3 > 0 ? 0xffff : 0
+# pand %mm3, %mm1
+
+# psllw $1, %mm0 # shift out sign bit
+# psllw $1, %mm1 # shift out sign bit
+
+ psraw $7, %mm0 # shift to lsb
+ psraw $7, %mm1 # shift to lsb
+
+ packuswb %mm1, %mm0 # pack & saturate to 8bit vector
+ movq %mm0, (%edi) # store result in memory
+
+ addl $16, %esi # increment source pointer
+ addl $8, %edi # increment dest pointer
+ decl %ecx
+ jnz .loop_pack_y # loop
+
+ emms
+
+ pop %edi
+ pop %esi
+ leave
+ ret
+
+.globl pixel_pack_s16u8_uv
+.type pixel_pack_s16u8_uv,@function
+
+# pack signed 16 bit pixels into offset binary 8 bit pixels
+# the arguments are read from the same stack slots as in
+# pixel_pack_s16u8_y: input, output, loop count.
+#
+# every loop pass reads 16 bytes (8 pixels) and writes 8 bytes. each
+# pixel is shifted right by 8 (arithmetic), saturated to a signed byte
+# and then has its top bit flipped, which adds 128.
+
+pixel_pack_s16u8_uv:
+ pushl %ebp
+ movl %esp, %ebp
+ push %esi
+ push %edi
+
+# movl 20(%ebp), %edi # int16[4] array of gains
+# movq (%edi), %mm7 # get gain array
+ movl 8(%ebp), %esi # input array
+ movl 12(%ebp), %edi # output array
+ movl 16(%ebp), %ecx # loop count (8 pixels per pass)
+
+ pcmpeqw %mm6, %mm6 # all ones
+ psllw $15, %mm6 # 4 times 0x8000
+ movq %mm6, %mm5
+ psrlw $8, %mm5 # 4 times 0x0080
+ por %mm5, %mm6 # mm6 <- 8 times 0x80
+
+ .align 16
+ .loop_pack_uv:
+
+# prefetch 128(%esi)
+ movq (%esi), %mm0 # load 4 pixels from memory
+# pmulhw %mm7, %mm0 # apply gain
+ movq 8(%esi), %mm1 # load 4 pixels from memory
+# pmulhw %mm7, %mm1 # apply gain
+
+ psraw $8, %mm0 # move the high byte down to the low byte
+ psraw $8, %mm1
+
+ packsswb %mm1, %mm0 # pack & saturate to 8bit vector
+ pxor %mm6, %mm0 # flip sign bits
+ movq %mm0, (%edi) # store result in memory
+
+ addl $16, %esi # increment source pointer
+ addl $8, %edi # increment dest pointer
+ decl %ecx
+ jnz .loop_pack_uv # loop
+
+ emms
+
+ pop %edi
+ pop %esi
+ leave
+ ret
+
diff --git a/system/mmx/pixel_rand_s16.s b/system/mmx/pixel_rand_s16.s
new file mode 100644
index 0000000..649400b
--- /dev/null
+++ b/system/mmx/pixel_rand_s16.s
@@ -0,0 +1,76 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_rand_s16
+.type pixel_rand_s16,@function
+
+# fill a buffer with pseudo random 16 bit pixels
+# void pixel_rand_s16(int *dst, nb_4pixel_vectors, short int random_seed[4])
+#
+# runs 4 independent 16 bit shift registers in parallel, one per word of
+# the seed vector. every loop pass shifts each register left by one bit,
+# feeds the xor of its bits 15, 14, 12 and 3 back into bit 0 and writes
+# the 4 registers to dst. the final register contents are stored back
+# into random_seed.
+
+pixel_rand_s16:
+ pushl %ebp
+ movl %esp, %ebp
+ push %esi
+ push %edi
+
+ movl 16(%ebp), %esi # int16[4] array of random seeds
+ movl 8(%ebp), %edi # dst array
+ movl 12(%ebp), %ecx # pixel count
+
+ movq (%esi), %mm6
+
+
+ pcmpeqw %mm3, %mm3
+ psrlw $15, %mm3 # get bit mask 4 times 0x0001
+
+ .align 16
+ .loop_rand:
+
+# prefetch 128(%esi)
+
+
+ movq %mm6, %mm4 # get random vector
+ psrlw $15, %mm4 # bit 15 -> bit 0
+ movq %mm6, %mm5
+ psrlw $14, %mm5 # bit 14 -> bit 0
+ pxor %mm5, %mm4
+ movq %mm6, %mm5
+ psrlw $12, %mm5 # bit 12 -> bit 0
+ pxor %mm5, %mm4
+ movq %mm6, %mm5
+ psrlw $3, %mm5 # bit 3 -> bit 0
+ pxor %mm5, %mm4
+
+ psllw $1, %mm6 # shift left original random vector
+ pand %mm3, %mm4 # isolate new bit
+ por %mm4, %mm6 # combine into new random vector
+
+ movq %mm6, (%edi)
+ addl $8, %edi
+ decl %ecx
+ jnz .loop_rand # loop
+
+
+ movq %mm6, (%esi) # store random seeds
+
+ emms
+
+ pop %edi
+ pop %esi
+ leave
+ ret
+
diff --git a/system/mmx/pixel_randmix_s16.s b/system/mmx/pixel_randmix_s16.s
new file mode 100644
index 0000000..44e1702
--- /dev/null
+++ b/system/mmx/pixel_randmix_s16.s
@@ -0,0 +1,91 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_randmix_s16
+.type pixel_randmix_s16,@function
+# Per 16-bit word: keep the left value when random > threshold (signed), else take the right value.
+# The result overwrites the left array; random_seed[] is advanced once per vector and stored back.
+# void pixel_randmix_s16(int *left, int *right, int nb_4pixel_vectors, short int random_seed[4], short int threshold[4])
+# The loop tests its counter at the bottom, so nb_4pixel_vectors must be non-zero.
+pixel_randmix_s16:
+ pushl %ebp
+ movl %esp, %ebp
+ push %esi
+ push %edi
+
+ movl 20(%ebp), %edi # int16[4] array of random seeds
+ movq (%edi), %mm6 # mm6 = current state of the four generators
+
+ movl 24(%ebp), %edi # int16[4] array of thresholds
+ movq (%edi), %mm7 # mm7 = thresholds, constant for the whole call
+
+ movl 8(%ebp), %edi # left array (also the destination)
+ movl 12(%ebp), %esi # right array
+ movl 16(%ebp), %ecx # number of 8-byte vectors to process
+
+ pcmpeqw %mm3, %mm3 # all ones
+ psrlw $15, %mm3 # get bit mask 4 times 0x0001
+
+ .align 16
+ .loop_randmix:
+
+# prefetch 128(%esi)
+ movq (%esi), %mm1 # load right 4 pixels from memory
+ movq (%edi), %mm0 # load 4 left pixels from memory
+
+ movq %mm6, %mm2 # get random vector (state before this iteration's update)
+ pcmpgtw %mm7, %mm2 # word = 0xFFFF where random > threshold (signed), else 0
+ movq %mm2, %mm5
+
+ pand %mm0, %mm2 # left words where the mask is set
+ pandn %mm1, %mm5 # right words where the mask is clear
+ por %mm2, %mm5 # merged vector
+
+ movq %mm5, (%edi) # store pixels into the left array
+
+ movq %mm6, %mm4 # copy state
+ psrlw $15, %mm4 # bit 15 -> bit 0 (first tap)
+ movq %mm6, %mm5
+ psrlw $14, %mm5 # bit 14 -> bit 0 (second tap)
+ pxor %mm5, %mm4
+ movq %mm6, %mm5
+ psrlw $12, %mm5 # bit 12 -> bit 0 (third tap)
+ pxor %mm5, %mm4
+ movq %mm6, %mm5
+ psrlw $3, %mm5 # bit 3 -> bit 0 (fourth tap)
+ pxor %mm5, %mm4 # bit 0 of each word = xor of the four taps; higher bits are masked off below
+
+ psllw $1, %mm6 # shift left original random vector
+ pand %mm3, %mm4 # isolate new bit
+ por %mm4, %mm6 # combine into new random vector
+
+ addl $8, %esi
+ addl $8, %edi
+ decl %ecx
+ jnz .loop_randmix # loop until the vector count reaches zero
+
+
+ movl 20(%ebp), %edi # reload the seed pointer (edi was reused for the left array)
+ movq %mm6, (%edi) # store random seeds
+
+ emms
+
+ pop %edi
+ pop %esi
+ leave
+ ret
+
diff --git a/system/mmx/pixel_s1.s b/system/mmx/pixel_s1.s
new file mode 100644
index 0000000..d6bc5ca
--- /dev/null
+++ b/system/mmx/pixel_s1.s
@@ -0,0 +1,201 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+
+ # this file contains ops for binary image processing
+ # 8x8 bit tile encoded
+ # low byte = bottom row
+ # low bit = right column
+ # %mm7 = scratch reg for all macros
+
+
+ # ************ load mask *******************
+ # compute bit masks for rows and columns
+ # %mm7: scratch reg
+
+ # ldmt count reg: reg = mask with the top count rows set (the highest count bytes are 0xFF)
+ .macro ldmt count reg
+ pcmpeqb \reg, \reg # reg = all ones
+ psllq $(64-(\count<<3)), \reg # shift out everything but the upper count*8 bits
+ .endm
+
+ # ldmb count reg: reg = mask with the bottom count rows set (the lowest count bytes are 0xFF)
+ .macro ldmb count reg
+ pcmpeqb \reg, \reg # reg = all ones
+ psrlq $(64-(\count<<3)), \reg # shift out everything but the lower count*8 bits
+ .endm
+
+ # ldmtb count regt regb: regt = top-rows mask (as ldmt), regb = bottom-rows mask (as ldmb), same count for both
+ .macro ldmtb count regt regb
+ ldmb \count, \regb
+ ldmt \count, \regt
+ .endm
+
+ # ldmr count reg: reg = mask with the right count columns set (low count bits of every byte); clobbers %mm7; assumes count <= 8
+ .macro ldmr count reg
+ pcmpeqb %mm7, %mm7 # all ones
+ psrlw $(16-\count), %mm7 # each word: low count bits set
+ movq %mm7, \reg # covers the low byte of every word
+ psllq $8, %mm7 # same pattern moved into the high byte of every word
+ por %mm7, \reg # reg = low count bits of all eight bytes
+ .endm
+
+ # ldml count reg: reg = mask with the left count columns set (high count bits of every byte); clobbers %mm7; assumes count <= 8
+ .macro ldml count reg
+ pcmpeqb %mm7, %mm7 # all ones
+ psllw $(16-\count), %mm7 # each word: high count bits set
+ movq %mm7, \reg # covers the high byte of every word
+ psrlq $8, %mm7 # same pattern moved into the low byte of every word
+ por %mm7, \reg # reg = high count bits of all eight bytes
+ .endm
+
+ # ldmlr count regl regr: regl = left mask (high count bits of every byte), regr = right mask (low count bits); clobbers %mm7
+ .macro ldmlr count regl regr
+ pcmpeqb %mm7, %mm7 # all ones
+ psllw $(16-\count), %mm7 # each word: high count bits set
+ movq %mm7, \regl # covers the high byte of every word
+ psrlq $8, %mm7 # same pattern moved into the low byte of every word
+ por %mm7, \regl # regl = high count bits of all eight bytes
+ movq \regl, \regr
+ psrlq $(8-\count), \regr # slide the set bits down to the low end of each byte
+ .endm
+
+ # ************* shift square **********
+ # shifts a square in reg, fills with zeros
+
+ # sst count reg: move the tile count rows towards the top (towards the high bytes); vacated bottom rows become zero
+ .macro sst count reg
+ psllq $(\count<<3), \reg # one row is one byte, so shift by count*8 bits
+ .endm
+
+ # ssb count reg: move the tile count rows towards the bottom (towards the low bytes); vacated top rows become zero
+ .macro ssb count reg
+ psrlq $(\count<<3), \reg # one row is one byte, so shift by count*8 bits
+ .endm
+
+ # ssl count reg: shift every row (byte) of reg left by count bits with zero fill; clobbers %mm7
+ # marked "not tested" by the original author
+ .macro ssl count reg
+ movq \reg, %mm7 # keep the tile while reg is reused to build a mask
+ pcmpeqb \reg, \reg # all ones
+ psllw $(16-\count), \reg # each word: high count bits set
+ psrlw $8, \reg # move them to the high count bits of the low byte
+ pandn %mm7, \reg # reg = tile with those bits cleared, so nothing leaks into the high byte
+ psllw $(\count), \reg # word shift; the high byte's top bits fall off the word
+ .endm
+
+ # ssr count reg: shift every row (byte) of reg right by count bits with zero fill; clobbers %mm7
+ .macro ssr count reg
+ movq \reg, %mm7 # keep the tile while reg is reused to build a mask
+ pcmpeqb \reg, \reg # all ones
+ psrlw $(16-\count), \reg # each word: low count bits set
+ psllw $8, \reg # move them to the low count bits of the high byte
+ pandn %mm7, \reg # reg = tile with those bits cleared, so nothing leaks into the low byte
+ psrlw $(\count), \reg # word shift; the low byte's bottom bits fall off the word
+ .endm
+
+
+ # ********** combine square *************
+ # combines 2 squares
+
+ # csr count regr reg: reg = (reg shifted left count columns) | (regr shifted right 8-count columns); regr and %mm7 are overwritten
+ .macro csr count regr reg
+ ssl \count, \reg
+ ssr (8-\count), \regr
+ por \regr, \reg # merge the two shifted tiles into reg
+ .endm
+
+ # csl count regl reg: reg = (reg shifted right count columns) | (regl shifted left 8-count columns); regl and %mm7 are overwritten
+ .macro csl count regl reg
+ ssr \count, \reg
+ ssl (8-\count), \regl
+ por \regl, \reg # merge the two shifted tiles into reg
+ .endm
+
+ # cst count regt reg: reg = (reg moved count rows towards the bottom) | (regt moved 8-count rows towards the top); regt is overwritten
+ .macro cst count regt reg
+ ssb \count, \reg
+ sst (8-\count), \regt
+ por \regt, \reg # merge the two shifted tiles into reg
+ .endm
+
+
+ # csb count regb reg: reg = (reg moved count rows towards the top) | (regb moved 8-count rows towards the bottom); regb is overwritten
+ .macro csb count regb reg
+ sst \count, \reg
+ ssb (8-\count), \regb
+ por \regb, \reg # merge the two shifted tiles into reg
+ .endm
+
+
+ # ********** load combine square *************
+ # loads combined square using mask
+
+ # lcsml: dstreg = source moved right by count columns, with the right count columns of sourcel entering on the left; clobbers %mm7
+ # mask must have the low count bits of every byte set (e.g. 0x01 in each byte for count 1, as built by ldmr)
+ .macro lcsml count mask source sourcel dstreg
+ movq \mask, \dstreg
+ movq \mask, %mm7
+ pandn \source, \dstreg # dstreg = source without its right count columns
+ pand \sourcel, %mm7 # mm7 = only the right count columns of sourcel
+ psrlq $(\count), \dstreg # cleared low bits mean nothing crosses a byte boundary
+ psllq $(8-\count), %mm7 # move the kept columns to the left edge of each row
+ por %mm7, \dstreg
+ .endm
+
+
+
+.globl pixel_test_s1
+.type pixel_test_s1,@function
+
+# Test entry point for the tile macros in this file: combines one tile with the tile stored after it.
+# void pixel_test_s1(void *dest, void *source, int nb_squares, int spacing)
+# NOTE(review): the original comment named this prototype pixel_add_s16; the argument list is
+# kept from it and matches the four stack slots read below -- confirm against the C header.
+# Only one 8-byte tile is written; nb_squares and spacing are loaded but never used.
+ #
+
+
+pixel_test_s1:
+ pushl %ebp
+ movl %esp, %ebp
+ push %esi
+ push %edi
+
+ movl 8(%ebp), %edi # dest
+ movl 12(%ebp), %esi # source
+ movl 16(%ebp), %ecx # count (loaded, not used below)
+ movl 20(%ebp), %edx # row distance (loaded, not used below)
+# mm6 = one-column right mask; mm0 = tile at source moved right one column, left column filled from the tile at source+8
+ ldmr 1, %mm6
+ lcsml 1, %mm6, (%esi), 8(%esi), %mm0
+ movq %mm0, (%edi) # store the single combined tile
+
+
+# movq (%esi), %mm0
+# movq 8(%esi), %mm1
+# csl 4, %mm1, %mm0
+# movq %mm0, (%edi)
+
+ emms
+
+
+ pop %edi
+ pop %esi
+ leave
+ ret
+
diff --git a/system/mmx/pixel_unpack_u8s16.s b/system/mmx/pixel_unpack_u8s16.s
new file mode 100644
index 0000000..0fc14c2
--- /dev/null
+++ b/system/mmx/pixel_unpack_u8s16.s
@@ -0,0 +1,113 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_unpack_u8s16_y
+.type pixel_unpack_u8s16_y,@function
+# Widen unsigned 8-bit samples to 16-bit words: each output word = input byte << 7 (sign bit always clear).
+# Eight input bytes are consumed and sixteen output bytes produced per iteration.
+# void pixel_unpack_u8s16_y(char *input, char *output, int32 nb_pixels_div8)
+# The loop tests its counter at the bottom, so nb_pixels_div8 must be non-zero.
+pixel_unpack_u8s16_y:
+ pushl %ebp
+ movl %esp, %ebp
+ push %esi
+ push %edi
+
+# movl 20(%ebp), %edi # int16[4] array of gains
+# movq (%edi), %mm7 # get gain array
+
+ movl 8(%ebp), %esi # input uint8 pixel array
+ movl 12(%ebp), %edi # output sint16 pixel array
+ movl 16(%ebp), %ecx # nb of elements div 8
+
+
+ .align 16
+ .loop_unpack_y:
+
+ movq (%esi), %mm5 # load 8 pixels from memory
+ pxor %mm0, %mm0 # zero mm0 and mm1 (they supply the low byte of each word)
+ pxor %mm1, %mm1
+ punpcklbw %mm5, %mm0 # words 0-3 = input bytes 0-3 in the high byte, low byte zero
+ punpckhbw %mm5, %mm1 # words 0-3 = input bytes 4-7 in the high byte, low byte zero
+ psrlw $0x1, %mm0 # logical shift right by one so the sign bit is clear
+ psrlw $0x1, %mm1
+# pmulhw %mm7, %mm0 # apply gain
+# pmulhw %mm7, %mm1
+# paddsw %mm0, %mm0 # correct factor 2
+# paddsw %mm1, %mm1
+ movq %mm0, (%edi) # store first four words
+ movq %mm1, 8(%edi) # store second four words
+
+ addl $8, %esi # increment source pointer
+ addl $16, %edi # increment dest pointer
+ decl %ecx
+ jnz .loop_unpack_y # loop until the group count reaches zero
+
+ emms
+
+ pop %edi
+ pop %esi
+ leave
+ ret
+
+.globl pixel_unpack_u8s16_uv # widen 8-bit samples to 16-bit words: word = (byte << 8) ^ 0x8000
+.type pixel_unpack_u8s16_uv,@function # stack arguments are read from the same slots as in pixel_unpack_u8s16_y
+pixel_unpack_u8s16_uv:
+ pushl %ebp
+ movl %esp, %ebp
+ push %esi
+ push %edi
+# The loop below tests its counter at the bottom, so the group count must be non-zero.
+# movl 20(%ebp), %edi # int16[4] array of gains
+# movq (%edi), %mm7 # get gain array
+
+ movl 8(%ebp), %esi # input uint8 pixel array
+ movl 12(%ebp), %edi # output sint16 pixel array
+ movl 16(%ebp), %ecx # nb of elements div 8
+
+ pcmpeqw %mm6, %mm6 # all ones
+ psllw $15, %mm6 # mm6 = 4 times 0x8000 (sign-bit mask)
+
+ .align 16
+ .loop_unpack_uv:
+
+ movq (%esi), %mm5 # load 8 pixels from memory
+ pxor %mm0, %mm0 # zero mm0 and mm1 (they supply the low byte of each word)
+ pxor %mm1, %mm1
+ punpcklbw %mm5, %mm0 # words 0-3 = input bytes 0-3 in the high byte, low byte zero
+ punpckhbw %mm5, %mm1 # words 0-3 = input bytes 4-7 in the high byte, low byte zero
+ pxor %mm6, %mm0 # flip sign bit (Cr and Cb are offset by 128)
+ pxor %mm6, %mm1
+# pmulhw %mm7, %mm0 # apply gain
+# pmulhw %mm7, %mm1
+# paddsw %mm0, %mm0 # correct factor 2
+# paddsw %mm1, %mm1
+ movq %mm0, (%edi) # store first four words
+ movq %mm1, 8(%edi) # store second four words
+
+ addl $8, %esi # increment source pointer
+ addl $16, %edi # increment dest pointer
+ decl %ecx
+ jnz .loop_unpack_uv # loop until the group count reaches zero
+
+ emms
+
+ pop %edi
+ pop %esi
+ leave
+ ret
+