path: root/system/mmx
author	Hans-Christoph Steiner <eighthave@users.sourceforge.net>	2005-12-15 07:26:47 +0000
committer	Hans-Christoph Steiner <eighthave@users.sourceforge.net>	2005-12-15 07:26:47 +0000
commit	37b6643df2df7d784a31ca73f7bb90dc109c2401 (patch)
tree	a8664e5adcfcb60cae136063d627549ecb76619b /system/mmx
parent	c50ce0e0217ea07e2d450add2ab29cecea66fa96 (diff)
removing PDP source (except debian files) before import of PDP 0.12.4
svn path=/trunk/externals/pdp/; revision=4217
Diffstat (limited to 'system/mmx')
-rw-r--r--  system/mmx/Makefile                 |  32
-rw-r--r--  system/mmx/pdp_mmx_test.c           |  62
-rw-r--r--  system/mmx/pixel_add_s16.s          |  55
-rw-r--r--  system/mmx/pixel_affine_s16.s       |  59
-rw-r--r--  system/mmx/pixel_biquad_dirI_s16.s  | 361
-rw-r--r--  system/mmx/pixel_biquad_s16.s       | 451
-rw-r--r--  system/mmx/pixel_ca_s1.s            | 189
-rw-r--r--  system/mmx/pixel_cascade_s16.s      | 330
-rw-r--r--  system/mmx/pixel_cheby_s16.s        |  90
-rw-r--r--  system/mmx/pixel_conv_hor_s16.s     | 134
-rw-r--r--  system/mmx/pixel_conv_ver_s16.s     | 128
-rw-r--r--  system/mmx/pixel_crot_s16.s         | 153
-rw-r--r--  system/mmx/pixel_gain.s             |  83
-rw-r--r--  system/mmx/pixel_gain_s16.s         |  71
-rw-r--r--  system/mmx/pixel_mix_s16.s          |  68
-rw-r--r--  system/mmx/pixel_mul_s16.s          |  56
-rw-r--r--  system/mmx/pixel_pack_s16u8.s       | 126
-rw-r--r--  system/mmx/pixel_rand_s16.s         |  76
-rw-r--r--  system/mmx/pixel_randmix_s16.s      |  91
-rw-r--r--  system/mmx/pixel_resample_s16.s     | 314
-rw-r--r--  system/mmx/pixel_s1.s               | 201
-rw-r--r--  system/mmx/pixel_unpack_u8s16.s     | 113
22 files changed, 0 insertions, 3243 deletions
diff --git a/system/mmx/Makefile b/system/mmx/Makefile
deleted file mode 100644
index 51e5052..0000000
--- a/system/mmx/Makefile
+++ /dev/null
@@ -1,32 +0,0 @@
-include ../../Makefile.config
-
-OBJ = \
-pixel_pack_s16u8.o \
-pixel_unpack_u8s16.o \
-pixel_add_s16.o \
-pixel_mul_s16.o \
-pixel_mix_s16.o \
-pixel_randmix_s16.o \
-pixel_conv_hor_s16.o \
-pixel_conv_ver_s16.o \
-pixel_affine_s16.o \
-pixel_biquad_s16.o \
-pixel_ca_s1.o \
-pixel_rand_s16.o \
-pixel_crot_s16.o \
-pixel_gain_s16.o \
-pixel_resample_s16.o \
-pixel_cheby_s16.o
-
-all: $(OBJ)
-
-test: pdp_mmx_test.o $(OBJ)
- gcc -o pdp_mmx_test pdp_mmx_test.o $(OBJ) -g
-
-clean:
- rm -f *.o
- rm -f *~
- rm -f pdp_mmx.a
- rm -f pdp_mmx_test
-
-
diff --git a/system/mmx/pdp_mmx_test.c b/system/mmx/pdp_mmx_test.c
deleted file mode 100644
index e93539f..0000000
--- a/system/mmx/pdp_mmx_test.c
+++ /dev/null
@@ -1,62 +0,0 @@
-#include <stdio.h>
-#include "pdp_mmx.h"
-
-#define FP(x) ((short int)(((float)(x) * 2 * 256.0f)))
-
-#define nbp 256
-
- short int a1[4] = {0x0100,0x0100,0x0100,0x0100};
- short int a2[4] = {0x0100,0x0100,0x0100,0x0100};
- short int b0[4] = {0x0100,0x0100,0x0100,0x0100};
- short int b1[4] = {0x0100,0x0100,0x0100,0x0100};
- short int b2[4] = {0x0100,0x0100,0x0100,0x0100};
-
- short int u1[4] = {0x0100,0x0100,0x0100,0x0100};
- short int u2[4] = {0x0100,0x0100,0x0100,0x0100};
-
- short int x0[4] = {0x0100,0x0100,0x0100,0x0100};
- short int x1[4] = {0x0100,0x0100,0x0100,0x0100};
- short int x2[4] = {0x0100,0x0100,0x0100,0x0100};
- short int x3[4] = {0x0100,0x0100,0x0100,0x0100};
-
-void print_pixel(unsigned int i)
-{
- if (i) printf("x ");
- else printf(". ");
-}
-
-void print_line(void)
-{
- printf("\n");
-}
-
-void print_square(unsigned char *c)
-{
- int i,j;
-
- for(j=7; j>=0; j--){
- for(i=0; i<8; i++) print_pixel(c[j] & (1<<(7-i)));
- printf("\n");
- }
-
-}
-
-int main(void)
-{
-
- unsigned char src[16]={1,2,3,4,5,6,7,8,-1,-2,-3,-4,-5,-6,-7,-8};
- unsigned char dst[8];
-
-
- print_square(src);
- print_line();
- print_square(src+8);
- print_line();
-
- pixel_test_s1(dst,src,1,1);
-
- print_square(dst);
- print_line();
-
-
-
-}
diff --git a/system/mmx/pixel_add_s16.s b/system/mmx/pixel_add_s16.s
deleted file mode 100644
index 8d4c7df..0000000
--- a/system/mmx/pixel_add_s16.s
+++ /dev/null
@@ -1,55 +0,0 @@
-# Pure Data Packet mmx routine.
-# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-.globl pixel_add_s16
-.type pixel_add_s16,@function
-
-# simple add
-# void pixel_add_s16(int *left, int *right, int nb_4pixel_vectors)
-
-pixel_add_s16:
- pushl %ebp
- movl %esp, %ebp
- push %esi
- push %edi
-
- movl 8(%ebp), %edi # left array
- movl 12(%ebp), %esi # right array
- movl 16(%ebp), %ecx # pixel count
-
-
- .align 16
- .loop_mix:
-
-# prefetch 128(%esi)
- movq (%esi), %mm1 # load right 4 pixels from memory
- movq (%edi), %mm0 # load 4 left pixels from memory
- paddsw %mm1, %mm0 # mix
- movq %mm0, (%edi)
- addl $8, %esi
- addl $8, %edi
- decl %ecx
- jnz .loop_mix # loop
-
- emms
-
-
- pop %edi
- pop %esi
- leave
- ret
-
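For reference, the deleted pixel_add_s16 is a saturating signed 16-bit add of the right array into the left array, four pixels (one MMX quadword) per loop iteration. A scalar C sketch of the same behaviour (helper names are illustrative, not part of PDP):

    #include <stdint.h>

    /* saturate a 32-bit intermediate to the signed 16-bit range, like paddsw */
    static int16_t sat16(int32_t x)
    {
        if (x >  32767) return  32767;
        if (x < -32768) return -32768;
        return (int16_t)x;
    }

    /* left[i] += right[i] with saturation; one count = one 4-pixel quadword */
    static void pixel_add_s16_ref(int16_t *left, const int16_t *right,
                                  int nb_4pixel_vectors)
    {
        for (int i = 0; i < 4 * nb_4pixel_vectors; i++)
            left[i] = sat16((int32_t)left[i] + (int32_t)right[i]);
    }
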
diff --git a/system/mmx/pixel_affine_s16.s b/system/mmx/pixel_affine_s16.s
deleted file mode 100644
index b357de3..0000000
--- a/system/mmx/pixel_affine_s16.s
+++ /dev/null
@@ -1,59 +0,0 @@
-# Pure Data Packet mmx routine.
-# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-.globl pixel_affine_s16
-.type pixel_affine_s16,@function
-
-# void pixel_affine_s16(int *buf, int nb_8pixel_vectors, short int gain[4], short int offset[4])
-
-pixel_affine_s16:
- pushl %ebp
- movl %esp, %ebp
- push %esi
- push %edi
-
- movl 20(%ebp), %edi
- movq (%edi), %mm6 # get offset vector
-
- movl 16(%ebp), %edi
- movq (%edi), %mm7 # get gain vector
-
- movl 8(%ebp), %esi # input array
- movl 12(%ebp), %ecx # pixel count
-
-
- .align 16
- .loop_affine:
-
-# prefetch 128(%esi)
- movq (%esi), %mm0 # load 4 pixels from memory
- pmulhw %mm7, %mm0 # apply gain (s).15 fixed point
- psllw $1, %mm0 # apply correction shift
- paddsw %mm6, %mm0 # add offset
- movq %mm0, (%esi) # store result in memory
-
- addl $8, %esi # increment source pointer
- decl %ecx
- jnz .loop_affine # loop
-
- emms
-
- pop %edi
- pop %esi
- leave
- ret
-
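The affine routine treats the gain as an s.15 fixed-point factor: pmulhw keeps the upper 16 bits of the 32-bit product, the shift left by one restores the scale, and paddsw adds the offset with saturation. A scalar sketch of the same arithmetic, one MMX quadword (four 16-bit pixels) per count as in the loop above (names are the editor's):

    #include <stdint.h>

    static int16_t sat16(int32_t x)
    {
        return x > 32767 ? 32767 : (x < -32768 ? -32768 : (int16_t)x);
    }

    /* buf[i] = buf[i] * gain + offset, gain in s.15 fixed point */
    static void pixel_affine_s16_ref(int16_t *buf, int nb_vectors,
                                     const int16_t gain[4], const int16_t offset[4])
    {
        for (int i = 0; i < 4 * nb_vectors; i++) {
            int32_t p = ((int32_t)buf[i] * gain[i & 3]) >> 16; /* pmulhw */
            p = (int16_t)(p << 1);                             /* psllw $1 correction */
            buf[i] = sat16(p + offset[i & 3]);                 /* paddsw */
        }
    }
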
diff --git a/system/mmx/pixel_biquad_dirI_s16.s b/system/mmx/pixel_biquad_dirI_s16.s
deleted file mode 100644
index 1729502..0000000
--- a/system/mmx/pixel_biquad_dirI_s16.s
+++ /dev/null
@@ -1,361 +0,0 @@
-# Pure Data Packet mmx routine.
-# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-
-
- # TODO MOVE TO DIRECT FORM II
- # y[k] = b0 * x[k] + u1[k-1]
- # u1[k] = b1 * x[k] + u2[k-1] - a1 * y[k]
- # u2[k] = b2 * x[k] - a2 * y[k]
-
- # input in register:
- # %mm0-mm3: input 4x4 pixels {x0 x1 x2 x3}
- # %esi: coef memory (a1, a2, b0, b1, b2)
- # %edi: state memory (u1, u2)
-
-
- # return in register:
- # %mm0-mm4: 4x4 pixels result
-
-
- .biquad_4x4_pixels:
- .align 16
- # prescale
- movq -8(%esi), %mm4
- pmulhw %mm4, %mm0
- pmulhw %mm4, %mm1
- pmulhw %mm4, %mm2
- pmulhw %mm4, %mm3
- psllw $1, %mm0
- psllw $1, %mm1
- psllw $1, %mm2
- psllw $1, %mm3
-
-
- # first vector
- movq 0(%edi), %mm4 # mm4 <- u[-1]
- movq 8(%edi), %mm5 # mm5 <- u[-2]
- movq %mm4, %mm6
- movq %mm5, %mm7
-
- pmulhw 0(%esi), %mm6 # multiply by a1
- pmulhw 8(%esi), %mm7 # multiply by a2
-
- paddsw %mm6, %mm0 # accumulate
- paddsw %mm7, %mm0 # accumulate
- paddsw %mm0, %mm0 # scale by 2 (since all fixed point muls are x*y/2)
-
- movq %mm0, %mm6 # mm6 <- u[0]
- movq %mm4, %mm7 # mm7 <- u[-1]
- pmulhw 16(%esi), %mm0 # multiply by b0
- pmulhw 24(%esi), %mm4 # multiply by b1
- pmulhw 32(%esi), %mm5 # multiply by b2
-
- paddsw %mm4, %mm0 # accumulate
- paddsw %mm5, %mm0 # accumulate
-
- # mm0 is result 0
-
- # second vector
- movq %mm6, %mm4 # mm4 <- u[0]
- movq %mm7, %mm5 # mm5 <- u[-1]
-
- pmulhw 0(%esi), %mm6 # multiply by a1
- pmulhw 8(%esi), %mm7 # multiply by a2
-
- paddsw %mm6, %mm1 # accumulate
- paddsw %mm7, %mm1 # accumulate
- paddsw %mm1, %mm1 # scale by 2
-
-
- movq %mm1, %mm6 # mm6 <- u[1]
- movq %mm4, %mm7 # mm7 <- u[0]
- pmulhw 16(%esi), %mm1 # multiply by b0
- pmulhw 24(%esi), %mm4 # multiply by b1
- pmulhw 32(%esi), %mm5 # multiply by b2
-
- paddsw %mm4, %mm1 # accumulate
- paddsw %mm5, %mm1 # accumulate
-
- # mm1 is result 1
-
- # third vector
- movq %mm6, %mm4 # mm4 <- u[1]
- movq %mm7, %mm5 # mm5 <- u[0]
-
- pmulhw 0(%esi), %mm6 # multiply by a1
- pmulhw 8(%esi), %mm7 # multiply by a2
-
- paddsw %mm6, %mm2 # accumulate
- paddsw %mm7, %mm2 # accumulate
- paddsw %mm2, %mm2 # scale by 2
-
-
- movq %mm2, %mm6 # mm6 <- u[2]
- movq %mm4, %mm7 # mm7 <- u[1]
- pmulhw 16(%esi), %mm2 # multiply by b0
- pmulhw 24(%esi), %mm4 # multiply by b1
- pmulhw 32(%esi), %mm5 # multiply by b2
-
- paddsw %mm4, %mm2 # accumulate
- paddsw %mm5, %mm2 # accumulate
-
- # mm2 is result 2
-
- # fourth vector
- movq %mm6, %mm4 # mm4 <- u[2]
- movq %mm7, %mm5 # mm5 <- u[1]
-
- pmulhw 0(%esi), %mm6 # multiply by a1
- pmulhw 8(%esi), %mm7 # multiply by a2
-
- paddsw %mm6, %mm3 # accumulate
- paddsw %mm7, %mm3 # accumulate
- paddsw %mm3, %mm3 # scale by 2
-
-
- movq %mm3, 0(%edi) # store u[3]
- movq %mm4, 8(%edi) # store u[2]
- pmulhw 16(%esi), %mm3 # multiply by b0
- pmulhw 24(%esi), %mm4 # multiply by b1
- pmulhw 32(%esi), %mm5 # multiply by b2
-
- paddsw %mm4, %mm3 # accumulate
- paddsw %mm5, %mm3 # accumulate
-
- # mm3 is result 3
-
- ret
-
-
-	# in order to use the 4 line parallel biquad routine on horizontal
-	# lines, we need to reorder (rotate or transpose) the matrix, since
-	# images are scanline encoded, and we want to work in parallel
-	# on 4 lines.
-	#
-	# since the 4 lines are independent, it doesn't matter in which order
-	# the vector elements are present.
-	#
-	# this allows us to use the same routine for left->right and right->left
-	# processing.
-	#
-	# some comments on the non-abelian group of square isometries consisting of
-	# (I) identity
-	# (H) horizontal axis mirror
-	# (V) vertical axis mirror
-	# (T) transpose (diagonal axis mirror)
-	# (A) antitranspose (antidiagonal axis mirror)
-	# (R1) 90deg anticlockwise rotation
-	# (R2) 180deg rotation
-	# (R3) 90deg clockwise rotation
-	#
-	#
-	# we basically have two options: (R1,R3) or (T,A)
-	# we opt for T and A because they are self-inverting, which improves locality
-	#
-	# use antitranspose for right to left and transpose
-	# for left to right (little endian)
-
-
- # antitranspose 4x4
-
- # input
- # %mm3 == {d0 d1 d2 d3}
- # %mm2 == {c0 c1 c2 c3}
- # %mm1 == {b0 b1 b2 b3}
- # %mm0 == {a0 a1 a2 a3}
-
- # output
- # %mm3 == {a3 b3 c3 d3}
- # %mm2 == {a2 b2 c2 d2}
- # %mm1 == {a1 b1 c1 d1}
- # %mm0 == {a0 b0 c0 d0}
-
-
- .antitranspose_4x4:
- .align 16
- movq %mm3, %mm4
- punpcklwd %mm1, %mm4 # mm4 <- {b2 d2 b3 d3}
- movq %mm3, %mm5
- punpckhwd %mm1, %mm5 # mm5 <- {b0 d0 b1 d1}
-
- movq %mm2, %mm6
- punpcklwd %mm0, %mm6 # mm6 <- {a2 c2 a3 c3}
- movq %mm2, %mm7
- punpckhwd %mm0, %mm7 # mm7 <- {a0 c0 a1 c1}
-
- movq %mm4, %mm3
- punpcklwd %mm6, %mm3 # mm3 <- {a3 b3 c3 d3}
- movq %mm4, %mm2
- punpckhwd %mm6, %mm2 # mm2 <- {a2 b2 c2 d2}
-
- movq %mm5, %mm1
- punpcklwd %mm7, %mm1 # mm1 <- {a1 b1 c1 d1}
- movq %mm5, %mm0
- punpckhwd %mm7, %mm0 # mm0 <- {a0 b0 c0 d0}
-
- ret
-
-
-
- # transpose 4x4
-
- # input
- # %mm3 == {d3 d2 d1 d0}
- # %mm2 == {c3 c2 c1 c0}
- # %mm1 == {b3 b2 b1 b0}
- # %mm0 == {a3 a2 a1 a0}
-
- # output
- # %mm3 == {d3 c3 b3 a3}
- # %mm2 == {d2 c2 b2 a2}
- # %mm1 == {d1 c1 b1 a1}
- # %mm0 == {d0 c0 b0 a0}
-
-
- .transpose_4x4:
- .align 16
- movq %mm0, %mm4
- punpcklwd %mm2, %mm4 # mm4 <- {c1 a1 c0 a0}
- movq %mm0, %mm5
- punpckhwd %mm2, %mm5 # mm5 <- {c3 a3 c2 a2}
-
- movq %mm1, %mm6
- punpcklwd %mm3, %mm6 # mm6 <- {d1 b1 d0 b0}
- movq %mm1, %mm7
- punpckhwd %mm3, %mm7 # mm7 <- {d3 b3 d2 b2}
-
- movq %mm4, %mm0
- punpcklwd %mm6, %mm0 # mm0 <- {d0 c0 b0 a0}
- movq %mm4, %mm1
- punpckhwd %mm6, %mm1 # mm1 <- {d1 c1 b1 a1}
-
- movq %mm5, %mm2
- punpcklwd %mm7, %mm2 # mm2 <- {d2 c2 b2 a2}
- movq %mm5, %mm3
- punpckhwd %mm7, %mm3 # mm3 <- {d3 c3 b3 a3}
-
- ret
-
-
-.globl pixel_biquad_vertb_s16
-.type pixel_biquad_vertb_s16,@function
-
-
-# pixel_biquad_vertb_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
-
-
-pixel_biquad_vertb_s16:
-
-
- pushl %ebp
- movl %esp, %ebp
- push %ebx
- push %esi
- push %edi
-
- movl 8(%ebp), %ebx # pixel array offset
- movl 12(%ebp), %ecx # nb of 4x4 pixblocks
-	movl 16(%ebp), %edx		# line width
-
- movl 20(%ebp), %esi # coefs
- movl 24(%ebp), %edi # state
-
- shll $1, %edx # short int addressing
- movl %edx, %eax
- shll $1, %eax
- addl %edx, %eax # eax = 3 * edx
-
- .align 16
- .biquad_vertb_line_loop:
- movq (%ebx), %mm0
- movq (%ebx,%edx,1), %mm1
- movq (%ebx,%edx,2), %mm2
- movq (%ebx,%eax,1), %mm3
- call .biquad_4x4_pixels
- movq %mm0, (%ebx)
- movq %mm1, (%ebx,%edx,1)
- movq %mm2, (%ebx,%edx,2)
- movq %mm3, (%ebx,%eax,1)
- addl %edx, %ebx
- addl %eax, %ebx
- decl %ecx
- jnz .biquad_vertb_line_loop
-
- emms
-
- pop %edi
- pop %esi
- pop %ebx
- leave
- ret
-
-.globl pixel_biquad_horlr_s16
-.type pixel_biquad_horlr_s16,@function
-
-
-# pixel_biquad_horlr_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
-
-
-pixel_biquad_horlr_s16:
-
-
- pushl %ebp
- movl %esp, %ebp
- push %ebx
- push %esi
- push %edi
-
- movl 8(%ebp), %ebx # pixel array offset
- movl 12(%ebp), %ecx # nb of 4x4 pixblocks
-	movl 16(%ebp), %edx		# line width
-
- movl 20(%ebp), %esi # coefs
- movl 24(%ebp), %edi # state
-
- shll $1, %edx # short int addressing
- movl %edx, %eax
- shll $1, %eax
- addl %edx, %eax # eax = 3 * edx
-
- .align 16
- .biquad_horlr_line_loop:
- movq (%ebx), %mm0
- movq (%ebx,%edx,1), %mm1
- movq (%ebx,%edx,2), %mm2
- movq (%ebx,%eax,1), %mm3
- call .transpose_4x4
- call .biquad_4x4_pixels
- call .transpose_4x4
- movq %mm0, (%ebx)
- movq %mm1, (%ebx,%edx,1)
- movq %mm2, (%ebx,%edx,2)
- movq %mm3, (%ebx,%eax,1)
- addl $8, %ebx
- decl %ecx
- jnz .biquad_horlr_line_loop
-
- emms
-
- pop %edi
- pop %esi
- pop %ebx
- leave
- ret
-
-
-
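The recurrence in the header comment above is a transposed direct form II biquad, which is easier to read in plain C. A floating-point reference for a single sample stream (the MMX code runs four such filters in parallel, one per 16-bit lane; names are illustrative):

    /*  y[k]  = b0*x[k] + u1[k-1]
     *  u1[k] = b1*x[k] + u2[k-1] - a1*y[k]
     *  u2[k] = b2*x[k]           - a2*y[k]
     */
    typedef struct { float u1, u2; } biquad_state;

    static float biquad_df2t(biquad_state *s, float x,
                             float a1, float a2, float b0, float b1, float b2)
    {
        float y = b0 * x + s->u1;
        s->u1   = b1 * x + s->u2 - a1 * y;
        s->u2   = b2 * x         - a2 * y;
        return y;
    }
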
diff --git a/system/mmx/pixel_biquad_s16.s b/system/mmx/pixel_biquad_s16.s
deleted file mode 100644
index 844b041..0000000
--- a/system/mmx/pixel_biquad_s16.s
+++ /dev/null
@@ -1,451 +0,0 @@
-# Pure Data Packet mmx routine.
-# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-
-
- # DIRECT FORM II BIQUAD
- #
- # y[k] = b0 * x[k] + u1[k-1]
- # u1[k] = b1 * x[k] + u2[k-1] - a1 * y[k]
- # u2[k] = b2 * x[k] - a2 * y[k]
- # MACRO: df2 <reg>
- #
- # computes a direct form 2 biquad
- # does not use {mm0-mm3}\<inreg>
- #
- # input: <reg> == input
- # %mm4 == state 1
- # %mm5 == state 2
- # (%esi) == biquad coefs (-a1 -a2 b0 b1 b2) in s1.14
- # output: <reg> == output
- # %mm4 == state 1
- # %mm5 == state 2
-
- .macro df2 reg
- movq \reg, %mm6 # mm6 == x[k]
- movq \reg, %mm7 # mm7 == x[k]
- pmulhw 16(%esi), %mm6 # mm6 == x[k] * b0
- pmulhw 24(%esi), %mm7 # mm7 == x[k] * b1
- paddw %mm4, %mm6 # mm6 == x[k] * b0 + u1[k-1] == y[k]
- paddw %mm5, %mm7 # mm7 == x[k] * b1 + u2[k-1]
- paddsw %mm6, %mm6 # compensate for mul = x*y/4 (coefs are s1.14 fixed point)
- paddsw %mm6, %mm6 # paddsw ensures saturation
- movq \reg, %mm5 # mm5 == x[k]
- movq %mm6, %mm4 # mm4 == y[k]
- movq %mm6, \reg # reg == y[k] --------------------
- pmulhw 0(%esi), %mm4 # mm4 == y[k] * (-a1)
- pmulhw 8(%esi), %mm6 # mm6 == y[k] * (-a2)
- pmulhw 32(%esi), %mm5 # mm5 == x[k] * b2
- paddw %mm7, %mm4 # mm4 == u1[k] --------------------
- paddw %mm6, %mm5 # mm5 == u2[k] --------------------
- .endm
-
-
- # input in register:
- # %mm0-mm3: input 4x4 pixels {x0 x1 x2 x3}
- # %esi: coef memory (-a1, -a2, b0, b1, b2) in s1.14
- # %edi: state memory (u1, u2)
-
- # return in register:
- # %mm0-mm4: 4x4 pixels result
-
-
-
-
- .macro biquad_4x4_pixels
- .align 16
- movq 0(%edi), %mm4 # get state
- movq 8(%edi), %mm5
- df2 %mm0 # compute 4 biquads
- df2 %mm1
- df2 %mm2
- df2 %mm3
- movq %mm4, 0(%edi) # store state
- movq %mm5, 8(%edi)
- .endm
-
-
-
-	# in order to use the 4 line parallel biquad routine on horizontal
-	# lines, we need to reorder (rotate or transpose) the matrix, since
-	# images are scanline encoded, and we want to work in parallel
-	# on 4 lines.
-	#
-	# since the 4 lines are independent, it doesn't matter in which order
-	# the vector elements are present.
-	#
-	# this allows us to use the same routine for left->right and right->left
-	# processing.
-	#
-	# some comments on the non-abelian group of square isometries consisting of
-	# (I) identity
-	# (H) horizontal axis mirror
-	# (V) vertical axis mirror
-	# (T) transpose (diagonal axis mirror)
-	# (A) antitranspose (antidiagonal axis mirror)
-	# (R1) 90deg anticlockwise rotation
-	# (R2) 180deg rotation
-	# (R3) 90deg clockwise rotation
-	#
-	#
-	# we basically have two options: (R1,R3) or (T,A)
-	# we opt for T and A because they are self-inverting, which improves locality
-	#
-	# use antitranspose for right to left and transpose
-	# for left to right (little endian)
-
-
- # antitranspose 4x4
-
- # input
- # %mm3 == {d0 d1 d2 d3}
- # %mm2 == {c0 c1 c2 c3}
- # %mm1 == {b0 b1 b2 b3}
- # %mm0 == {a0 a1 a2 a3}
-
- # output
- # %mm3 == {a3 b3 c3 d3}
- # %mm2 == {a2 b2 c2 d2}
- # %mm1 == {a1 b1 c1 d1}
- # %mm0 == {a0 b0 c0 d0}
-
-
-	.macro antitranspose_4x4
- movq %mm3, %mm4
- punpcklwd %mm1, %mm4 # mm4 <- {b2 d2 b3 d3}
- movq %mm3, %mm5
- punpckhwd %mm1, %mm5 # mm5 <- {b0 d0 b1 d1}
-
- movq %mm2, %mm6
- punpcklwd %mm0, %mm6 # mm6 <- {a2 c2 a3 c3}
- movq %mm2, %mm7
- punpckhwd %mm0, %mm7 # mm7 <- {a0 c0 a1 c1}
-
- movq %mm4, %mm3
- punpcklwd %mm6, %mm3 # mm3 <- {a3 b3 c3 d3}
- movq %mm4, %mm2
- punpckhwd %mm6, %mm2 # mm2 <- {a2 b2 c2 d2}
-
- movq %mm5, %mm1
- punpcklwd %mm7, %mm1 # mm1 <- {a1 b1 c1 d1}
- movq %mm5, %mm0
- punpckhwd %mm7, %mm0 # mm0 <- {a0 b0 c0 d0}
-
- .endm
-
-
- # transpose 4x4
-
- # input
- # %mm3 == {d3 d2 d1 d0}
- # %mm2 == {c3 c2 c1 c0}
- # %mm1 == {b3 b2 b1 b0}
- # %mm0 == {a3 a2 a1 a0}
-
- # output
- # %mm3 == {d3 c3 b3 a3}
- # %mm2 == {d2 c2 b2 a2}
- # %mm1 == {d1 c1 b1 a1}
- # %mm0 == {d0 c0 b0 a0}
-
-
-	.macro transpose_4x4
- movq %mm0, %mm4
- punpcklwd %mm2, %mm4 # mm4 <- {c1 a1 c0 a0}
- movq %mm0, %mm5
- punpckhwd %mm2, %mm5 # mm5 <- {c3 a3 c2 a2}
-
- movq %mm1, %mm6
- punpcklwd %mm3, %mm6 # mm6 <- {d1 b1 d0 b0}
- movq %mm1, %mm7
- punpckhwd %mm3, %mm7 # mm7 <- {d3 b3 d2 b2}
-
- movq %mm4, %mm0
- punpcklwd %mm6, %mm0 # mm0 <- {d0 c0 b0 a0}
- movq %mm4, %mm1
- punpckhwd %mm6, %mm1 # mm1 <- {d1 c1 b1 a1}
-
- movq %mm5, %mm2
- punpcklwd %mm7, %mm2 # mm2 <- {d2 c2 b2 a2}
- movq %mm5, %mm3
- punpckhwd %mm7, %mm3 # mm3 <- {d3 c3 b3 a3}
-
- .endm
-
-.globl pixel_biquad_vertb_s16
-.type pixel_biquad_vertb_s16,@function
-
-
-# pixel_biquad_vertb_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
-
-
-pixel_biquad_vertb_s16:
-
-
- pushl %ebp
- movl %esp, %ebp
- push %ebx
- push %esi
- push %edi
-
- movl 8(%ebp), %ebx # pixel array offset
- movl 12(%ebp), %ecx # nb of 4x4 pixblocks
-	movl 16(%ebp), %edx		# line width
-
- movl 20(%ebp), %esi # coefs
- movl 24(%ebp), %edi # state
-
- shll $1, %edx # short int addressing
- movl %edx, %eax
- shll $1, %eax
- addl %edx, %eax # eax = 3 * edx
-
- .align 16
- .biquad_vertb_line_loop:
- movq (%ebx), %mm0
- movq (%ebx,%edx,1), %mm1
- movq (%ebx,%edx,2), %mm2
- movq (%ebx,%eax,1), %mm3
- biquad_4x4_pixels
- movq %mm0, (%ebx)
- movq %mm1, (%ebx,%edx,1)
- movq %mm2, (%ebx,%edx,2)
- movq %mm3, (%ebx,%eax,1)
- addl %edx, %ebx
- addl %eax, %ebx
- decl %ecx
- jnz .biquad_vertb_line_loop
-
- emms
-
- pop %edi
- pop %esi
- pop %ebx
- leave
- ret
-.globl pixel_biquad_verbt_s16
-.type pixel_biquad_verbt_s16,@function
-
-
-# pixel_biquad_verbt_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
-
-
-pixel_biquad_verbt_s16:
-
-
- pushl %ebp
- movl %esp, %ebp
- push %ebx
- push %esi
- push %edi
-
- movl 8(%ebp), %ebx # pixel array offset
- movl 12(%ebp), %ecx # nb of 4x4 pixblocks
-	movl 16(%ebp), %eax		# line width
-
- shll $3, %eax # 4 line byte spacing
- decl %ecx
- mul %ecx
- incl %ecx
- addl %eax, %ebx # ebx points to last pixblock
-
-	movl 16(%ebp), %edx		# line width
-
- movl 20(%ebp), %esi # coefs
- movl 24(%ebp), %edi # state
-
- shll $1, %edx # short int addressing
- movl %edx, %eax
- shll $1, %eax
- addl %edx, %eax # eax = 3 * edx
-
- .align 16
- .biquad_verbt_line_loop:
- movq (%ebx), %mm3
- movq (%ebx,%edx,1), %mm2
- movq (%ebx,%edx,2), %mm1
- movq (%ebx,%eax,1), %mm0
- biquad_4x4_pixels
- movq %mm3, (%ebx)
- movq %mm2, (%ebx,%edx,1)
- movq %mm1, (%ebx,%edx,2)
- movq %mm0, (%ebx,%eax,1)
- subl %edx, %ebx
- subl %eax, %ebx
- decl %ecx
- jnz .biquad_verbt_line_loop
-
- emms
-
- pop %edi
- pop %esi
- pop %ebx
- leave
- ret
-
-.globl pixel_biquad_horlr_s16
-.type pixel_biquad_horlr_s16,@function
-# pixel_biquad_horlr_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
-
-pixel_biquad_horlr_s16:
-
-
- pushl %ebp
- movl %esp, %ebp
- push %ebx
- push %esi
- push %edi
-
- movl 8(%ebp), %ebx # pixel array offset
- movl 12(%ebp), %ecx # nb of 4x4 pixblocks
-	movl 16(%ebp), %edx		# line width
-
- movl 20(%ebp), %esi # coefs
- movl 24(%ebp), %edi # state
-
- shll $1, %edx # short int addressing
- movl %edx, %eax
- shll $1, %eax
- addl %edx, %eax # eax = 3 * edx
-
- .align 16
- .biquad_horlr_line_loop:
- movq (%ebx), %mm0
- movq (%ebx,%edx,1), %mm1
- movq (%ebx,%edx,2), %mm2
- movq (%ebx,%eax,1), %mm3
- transpose_4x4
- biquad_4x4_pixels
- transpose_4x4
- movq %mm0, (%ebx)
- movq %mm1, (%ebx,%edx,1)
- movq %mm2, (%ebx,%edx,2)
- movq %mm3, (%ebx,%eax,1)
- addl $8, %ebx
- decl %ecx
- jnz .biquad_horlr_line_loop
-
- emms
-
- pop %edi
- pop %esi
- pop %ebx
- leave
- ret
-
-
-.globl pixel_biquad_horrl_s16
-.type pixel_biquad_horrl_s16,@function
-# pixel_biquad_horrl_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
-
-pixel_biquad_horrl_s16:
-
- pushl %ebp
- movl %esp, %ebp
- push %ebx
- push %esi
- push %edi
-
- movl 8(%ebp), %ebx # pixel array offset
- movl 12(%ebp), %ecx # nb of 4x4 pixblocks
-	movl 16(%ebp), %edx		# line width
-
-
- movl %ecx, %eax
- decl %eax
- shll $3, %eax
- addl %eax, %ebx # ebx points to last pixblock
-
-
- movl 20(%ebp), %esi # coefs
- movl 24(%ebp), %edi # state
-
- shll $1, %edx # short int addressing
- movl %edx, %eax
- shll $1, %eax
- addl %edx, %eax # eax = 3 * edx
-
- .align 16
- .biquad_horrl_line_loop:
- movq (%ebx), %mm0
- movq (%ebx,%edx,1), %mm1
- movq (%ebx,%edx,2), %mm2
- movq (%ebx,%eax,1), %mm3
- antitranspose_4x4
- biquad_4x4_pixels
- antitranspose_4x4
- movq %mm0, (%ebx)
- movq %mm1, (%ebx,%edx,1)
- movq %mm2, (%ebx,%edx,2)
- movq %mm3, (%ebx,%eax,1)
- subl $8, %ebx
- decl %ecx
- jnz .biquad_horrl_line_loop
-
- emms
-
- pop %edi
- pop %esi
- pop %ebx
- leave
- ret
-
-
-.globl pixel_biquad_time_s16
-.type pixel_biquad_time_s16,@function
-# pixel_biquad_time_s16(short int *pixel_array, short int *s1, short int *s2, short int *coefs, int nb_4_pix_vectors)
-
-pixel_biquad_time_s16:
-
- pushl %ebp
- movl %esp, %ebp
- push %ebx
- push %esi
- push %edi
-
- movl 8(%ebp), %ebx # pixel array offset
- movl 12(%ebp), %edx # state 1 array
- movl 16(%ebp), %edi # state 2 array
-
- movl 20(%ebp), %esi # coefs
- movl 24(%ebp), %ecx # nb of 4 pixel vectors
-
-
- .align 16
- .biquad_time_loop:
- movq (%ebx), %mm0 # get input
- movq (%edx), %mm4 # get state 1
- movq (%edi), %mm5 # get state 2
- df2 %mm0 # compute direct form 2
- movq %mm0, (%ebx) # write output
- movq %mm5, (%edi) # write state 2
- movq %mm4, (%edx) # write state 1
- addl $8, %ebx
- addl $8, %edi
- addl $8, %edx
- decl %ecx
- jnz .biquad_time_loop
-
- emms
-
- pop %edi
- pop %esi
- pop %ebx
- leave
- ret
-
-
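The double "paddsw %mm6, %mm6" in the df2 macro compensates for the s1.14 coefficient format: pmulhw returns (x*c)>>16, and with c stored as round(C * 2^14) that is x*C/4, so doubling twice (with saturation) restores full scale. A hypothetical helper showing the coefficient quantisation this implies (name is the editor's):

    #include <math.h>
    #include <stdint.h>

    /* quantise a coefficient in (-2, 2) to s1.14: value * 2^14, clipped to int16 range */
    static int16_t to_s1_14(double c)
    {
        double q = round(c * 16384.0);      /* 2^14 */
        if (q >  32767.0) q =  32767.0;
        if (q < -32768.0) q = -32768.0;
        return (int16_t)q;
    }
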
diff --git a/system/mmx/pixel_ca_s1.s b/system/mmx/pixel_ca_s1.s
deleted file mode 100644
index d9c730f..0000000
--- a/system/mmx/pixel_ca_s1.s
+++ /dev/null
@@ -1,189 +0,0 @@
-# Pure Data Packet mmx routine.
-# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-
- # this file contains assembler routines for 2D 1 bit cellular automata
- # processing. it is organized around a feeder kernel and a
- # stack based bit processor (virtual forth machine)
- #
-	# the feeder kernel is responsible for loading/storing CA cells
- # from/to memory. data in memory is organized as a scanline
-	# encoded toroidal bitplane (lsb = left). to simplify the kernel, the top
- # left corner of the rectangular grid of pixels will shift down
- # every processing step.
- #
- # the stack machine has the following architecture:
- # CA stack: %esi, TOS: %mm0 (32x2 pixels. lsw = top row)
- # CA horizon: %mm4-%mm7 (64x4 pixels. %mm4 = top row)
- #
- # the stack size / organization is not known to the stack machine.
-	# it can be thought of as operating on a 3x3 cell neighbourhood.
-	# the only purpose of the forth program is to determine the CA local update rule.
- #
- # the machine is supposed to be very minimal. no looping control.
-	# no addressing modes. no conditional code (hey, this is an experiment!)
- # so recursion is not allowed (no way to stop it)
-	# there are 9 words to load the cell neighbourhood on the stack.
- # the rest is just logic and stack manips.
-
-
- # this file contains pure asm macros. it is to be included before assembly
- # after scaforth.pl has processed the .scaf file
-
-
- # *************************** CA CELL ACCESS MACROS *****************************
- # fetchTL - fetchBR
-
- # shift / load rectangle macros:
-
- # shift rectangle horizontal
- # result is in reg1
- .macro shift reg1 reg2 count
- psllq $(32+\count), \reg1
- psrlq $(32-\count), \reg2
- psrlq $32, \reg1
- psllq $32, \reg2
- por \reg2, \reg1
- .endm
-
- .macro ldtop reg1 reg2
- movq %mm4, \reg1
- movq %mm5, \reg2
- .endm
-
- .macro ldcenter reg1 reg2
- movq %mm5, \reg1
- movq %mm6, \reg2
- .endm
-
- .macro ldbottom reg1 reg2
- movq %mm6, \reg1
- movq %mm7, \reg2
- .endm
-
-
- # fetch from top row
-
- # fetch the top left square
- .macro fetchTL
- ldtop %mm0, %mm1
- shift %mm0, %mm1, -1
- .endm
-
- # fetch the top mid square
- .macro fetchTM
- ldtop %mm0, %mm1
- shift %mm0, %mm1, 0
- .endm
-
- # fetch the top right square
- .macro fetchTR
- ldtop %mm0, %mm1
- shift %mm0, %mm1, 1
- .endm
-
-
-
- # fetch from center row
-
- # fetch the mid left square
- .macro fetchML
- ldcenter %mm0, %mm1
- shift %mm0, %mm1, -1
- .endm
-
- # fetch the mid mid square
- .macro fetchMM
- ldcenter %mm0, %mm1
- shift %mm0, %mm1, 0
- .endm
-
- # fetch the mid right square
- .macro fetchMR
- ldcenter %mm0, %mm1
- shift %mm0, %mm1, 1
- .endm
-
-
-
-
-
- # fetch from bottom row
-
- # fetch the bottom left square
- .macro fetchBL
- ldbottom %mm0, %mm1
- shift %mm0, %mm1, -1
- .endm
-
- # fetch the bottom mid square
- .macro fetchBM
- ldbottom %mm0, %mm1
- shift %mm0, %mm1, 0
- .endm
-
- # fetch the bottom right square
- .macro fetchBR
- ldbottom %mm0, %mm1
- shift %mm0, %mm1, 1
- .endm
-
-
-
- # *************************** CA STACK MANIP MACROS *****************************
- # dup drop dropdup swap nip dropover
-
- .macro dup
- lea -8(%esi), %esi
- movq %mm0, (%esi)
- .endm
-
- .macro drop
- movq (%esi), %mm0
- lea 8(%esi), %esi
- .endm
-
- .macro dropdup
- movq (%esi), %mm0
- .endm
-
- .macro swap
- movq (%esi), %mm1
- movq %mm0, (%esi)
- movq %mm1, %mm0
- .endm
-
- .macro nip
- lea 8(%esi), %esi
- .endm
-
- .macro dropover
- movq 8(%esi), %mm0
- .endm
-
-
- # *************************** CA BOOLEAN LOGIC MACROS *****************************
- # overxor
-
- .macro overxor
- pxor (%esi), %mm0
- .endm
-
-
-
-
-
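The stack words above map onto a conventional stack model: the top of stack lives in a register (%mm0) and the rest in memory behind %esi, which grows downward. A small C model of the same discipline, with 64-bit cell blocks (names are the editor's):

    #include <stdint.h>

    typedef struct { uint64_t tos; uint64_t *sp; } ca_stack;

    static void ca_dup(ca_stack *s)      { *--s->sp = s->tos; }          /* dup      */
    static void ca_drop(ca_stack *s)     { s->tos = *s->sp++; }          /* drop     */
    static void ca_dropdup(ca_stack *s)  { s->tos = *s->sp; }            /* dropdup  */
    static void ca_swap(ca_stack *s)     { uint64_t t = *s->sp;
                                           *s->sp = s->tos; s->tos = t; } /* swap    */
    static void ca_nip(ca_stack *s)      { s->sp++; }                    /* nip      */
    static void ca_dropover(ca_stack *s) { s->tos = s->sp[1]; }          /* dropover */
    static void ca_overxor(ca_stack *s)  { s->tos ^= *s->sp; }           /* overxor  */
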
diff --git a/system/mmx/pixel_cascade_s16.s b/system/mmx/pixel_cascade_s16.s
deleted file mode 100644
index bf88d08..0000000
--- a/system/mmx/pixel_cascade_s16.s
+++ /dev/null
@@ -1,330 +0,0 @@
-# Pure Data Packet mmx routine.
-# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-
-
- # TODO: COUPLED CASCADE SECOND ORDER SECTION
- #
- # s1[k] = ar * s1[k-1] + ai * s2[k-1] + x[k]
- # s2[k] = ar * s2[k-1] - ai * s1[k-1]
- # y[k] = c0 * x[k] + c1 * s1[k-1] + c2 * s2[k-1]
-
-
- # MACRO: df2
- #
- # computes a coupled cascade
- #
- # input: %mm0 == input
- # %mm1 == state 1
- # %mm2 == state 2
- # (%esi) == cascade coefs (ar ai c0 c1 c2) in s0.15
- # output: %mm0 == output
- # %mm1 == state 1
- # %mm2 == state 2
-
-
- .macro coupled
-	movq %mm1, %mm3		# mm3 == s1[k-1]
-	movq %mm1, %mm4		# mm4 == s1[k-1]
-	movq %mm2, %mm5		# mm5 == s2[k-1]
-	movq %mm2, %mm6		# mm6 == s2[k-1]
- pmulhw (%esi), %mm1 # mm1 == s1[k-1] * ar
- pmulhw 8(%esi), %mm3 # mm3 == s1[k-1] * ai
- pmulhw 24(%esi), %mm4 # mm4 == s1[k-1] * c1
- pmulhw (%esi), %mm2 # mm2 == s2[k-1] * ar
- pmulhw 8(%esi), %mm5 # mm5 == s2[k-1] * ai
- pmulhw 32(%esi), %mm6 # mm6 == s2[k-1] * c2
- paddw %mm5, %mm1 # mm1 == s1[k-1] * ar + s2[k-1] * ai
- psubw %mm3, %mm2 # mm2 == s2[k-1] * ar - s1[k-1] * ai == s2[k]
- paddw %mm0, %mm1 # mm1 == s1[k]
- pmulhw 16(%esi), %mm0 # mm0 == x[k] * c0
- paddw %mm6, %mm4 # mm4 == s1[k-1] * c1 + s2[k-1] * c2
- paddw %mm4, %mm0 # mm0 == y[k]
- .endm
-
-
-
-
-	# in order to use the 4 line parallel cascade routine on horizontal
-	# lines, we need to reorder (rotate or transpose) the matrix, since
-	# images are scanline encoded, and we want to work in parallel
-	# on 4 lines.
-	#
-	# since the 4 lines are independent, it doesn't matter in which order
-	# the vector elements are present.
-	#
-	# this allows us to use the same routine for left->right and right->left
-	# processing.
-	#
-	# some comments on the non-abelian group of square isometries consisting of
-	# (I) identity
-	# (H) horizontal axis mirror
-	# (V) vertical axis mirror
-	# (T) transpose (diagonal axis mirror)
-	# (A) antitranspose (antidiagonal axis mirror)
-	# (R1) 90deg anticlockwise rotation
-	# (R2) 180deg rotation
-	# (R3) 90deg clockwise rotation
-	#
-	#
-	# we basically have two options: (R1,R3) or (T,A)
-	# we opt for T and A because they are self-inverting, which improves locality
-	#
-	# use antitranspose for right to left and transpose
-	# for left to right (little endian)
-
-
- # antitranspose 4x4
-
- # input
- # %mm3 == {d0 d1 d2 d3}
- # %mm2 == {c0 c1 c2 c3}
- # %mm1 == {b0 b1 b2 b3}
- # %mm0 == {a0 a1 a2 a3}
-
- # output
- # %mm3 == {a3 b3 c3 d3}
- # %mm2 == {a2 b2 c2 d2}
- # %mm1 == {a1 b1 c1 d1}
- # %mm0 == {a0 b0 c0 d0}
-
-
-	.macro antitranspose_4x4
- movq %mm3, %mm4
- punpcklwd %mm1, %mm4 # mm4 <- {b2 d2 b3 d3}
- movq %mm3, %mm5
- punpckhwd %mm1, %mm5 # mm5 <- {b0 d0 b1 d1}
-
- movq %mm2, %mm6
- punpcklwd %mm0, %mm6 # mm6 <- {a2 c2 a3 c3}
- movq %mm2, %mm7
- punpckhwd %mm0, %mm7 # mm7 <- {a0 c0 a1 c1}
-
- movq %mm4, %mm3
- punpcklwd %mm6, %mm3 # mm3 <- {a3 b3 c3 d3}
- movq %mm4, %mm2
- punpckhwd %mm6, %mm2 # mm2 <- {a2 b2 c2 d2}
-
- movq %mm5, %mm1
- punpcklwd %mm7, %mm1 # mm1 <- {a1 b1 c1 d1}
- movq %mm5, %mm0
- punpckhwd %mm7, %mm0 # mm0 <- {a0 b0 c0 d0}
-
- .endm
-
-
- # transpose 4x4
-
- # input
- # %mm3 == {d3 d2 d1 d0}
- # %mm2 == {c3 c2 c1 c0}
- # %mm1 == {b3 b2 b1 b0}
- # %mm0 == {a3 a2 a1 a0}
-
- # output
- # %mm3 == {d3 c3 b3 a3}
- # %mm2 == {d2 c2 b2 a2}
- # %mm1 == {d1 c1 b1 a1}
- # %mm0 == {d0 c0 b0 a0}
-
-
-	.macro transpose_4x4
- movq %mm0, %mm4
- punpcklwd %mm2, %mm4 # mm4 <- {c1 a1 c0 a0}
- movq %mm0, %mm5
- punpckhwd %mm2, %mm5 # mm5 <- {c3 a3 c2 a2}
-
- movq %mm1, %mm6
- punpcklwd %mm3, %mm6 # mm6 <- {d1 b1 d0 b0}
- movq %mm1, %mm7
- punpckhwd %mm3, %mm7 # mm7 <- {d3 b3 d2 b2}
-
- movq %mm4, %mm0
- punpcklwd %mm6, %mm0 # mm0 <- {d0 c0 b0 a0}
- movq %mm4, %mm1
- punpckhwd %mm6, %mm1 # mm1 <- {d1 c1 b1 a1}
-
- movq %mm5, %mm2
- punpcklwd %mm7, %mm2 # mm2 <- {d2 c2 b2 a2}
- movq %mm5, %mm3
- punpckhwd %mm7, %mm3 # mm3 <- {d3 c3 b3 a3}
-
- .endm
-
-.globl pixel_cascade_vertb_s16
-.type pixel_cascade_vertb_s16,@function
-
-
-# pixel_cascade_vertb_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
-
-
-pixel_cascade_vertb_s16:
-
-
- pushl %ebp
- movl %esp, %ebp
- push %ebx
- push %esi
- push %edi
-
- movl 8(%ebp), %ebx # pixel array offset
- movl 12(%ebp), %ecx # nb of 4x4 pixblocks
-	movl 16(%ebp), %edx		# line width
-
- movl 20(%ebp), %esi # coefs
- movl 24(%ebp), %edi # state
-
- shll $1, %edx # short int addressing
- subl %edx, %ebx
-
- movq 0(%edi), %mm1 # s1[k-1]
- movq 8(%edi), %mm2 # s2[k-1]
- .align 16
- .cascade_vertb_line_loop:
-
- movq (%ebx,%edx,1), %mm3
- movq %mm3, %mm0
- addl %edx, %ebx
- coupled
- movq %mm0, (%ebx)
-
- movq (%ebx,%edx,1), %mm3
- movq %mm3, %mm0
- addl %edx, %ebx
- coupled
- movq %mm0, (%ebx)
-
- movq (%ebx,%edx,1), %mm3
- movq %mm3, %mm0
- addl %edx, %ebx
- coupled
- movq %mm0, (%ebx)
-
- movq (%ebx,%edx,1), %mm3
- movq %mm3, %mm0
- addl %edx, %ebx
- coupled
- movq %mm0, (%ebx)
-
- decl %ecx
- jnz .cascade_vertb_line_loop
-
- movq %mm1, 0(%edi) # s1[k-1]
- movq %mm2, 8(%edi) # s2[k-1]
-
- emms
-
- pop %edi
- pop %esi
- pop %ebx
- leave
- ret
-
-.globl pixel_cascade_horlr_s16
-.type pixel_cascade_horlr_s16,@function
-
-
-# pixel_cascade_horlr_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
-
-
-pixel_cascade_horlr_s16:
-
-
- pushl %ebp
- movl %esp, %ebp
- push %ebx
- push %esi
- push %edi
-
- movl 8(%ebp), %ebx # pixel array offset
- movl 12(%ebp), %ecx # nb of 4x4 pixblocks
-	movl 16(%ebp), %edx		# line width
-
- movl 20(%ebp), %esi # coefs
- movl 24(%ebp), %edi # state
-
- shll $1, %edx # short int addressing
- movl %edx, %eax
- shll $1, %eax
- addl %edx, %eax # eax = 3 * edx
-
-
- .align 16
- .cascade_horlr_line_loop:
- movq (%edi), %mm1
- movq 8(%edi), %mm2
-
- movq (%ebx), %mm0
- movq (%ebx,%edx,1), %mm1
- movq (%ebx,%edx,2), %mm2
- movq (%ebx,%eax,1), %mm3
-
- transpose_4x4
-
- movq %mm1, (%ebx,%edx,1)
- movq %mm2, (%ebx,%edx,2)
- movq %mm3, (%ebx,%eax,1)
-
- coupled
-
- movq %mm0, (%ebx)
- movq (%ebx,%edx,1), %mm3
- movq %mm3, %mm0
-
- coupled
-
- movq %mm0, (%ebx, %edx,1)
- movq (%ebx,%edx,2), %mm3
- movq %mm3, %mm0
-
- coupled
-
- movq %mm0, (%ebx, %edx,2)
- movq (%ebx,%eax,1), %mm3
- movq %mm3, %mm0
-
- coupled
-
- movq %mm1, 0(%edi) # s1[k-1]
- movq %mm2, 8(%edi) # s2[k-1]
-
- movq %mm0, %mm3
- movq (%ebx), %mm0
- movq (%ebx,%edx,1), %mm1
- movq (%ebx,%edx,2), %mm2
-
- transpose_4x4
-
- movq %mm0, (%ebx)
- movq %mm1, (%ebx,%edx,1)
- movq %mm2, (%ebx,%edx,2)
- movq %mm3, (%ebx,%eax,1)
-
- addl $8, %ebx
- decl %ecx
- jnz .cascade_horlr_line_loop
-
- emms
-
- pop %edi
- pop %esi
- pop %ebx
- leave
- ret
-
-
-
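The coupled-form section in the header comment keeps its two states as the real and imaginary parts of a rotating complex state, a structure usually chosen for its good numerical behaviour. A floating-point reference for one lane (the MMX code runs four in parallel; names are illustrative):

    /*  s1[k] = ar*s1[k-1] + ai*s2[k-1] + x[k]
     *  s2[k] = ar*s2[k-1] - ai*s1[k-1]
     *  y[k]  = c0*x[k] + c1*s1[k-1] + c2*s2[k-1]
     */
    typedef struct { float s1, s2; } coupled_state;

    static float coupled_ref(coupled_state *s, float x,
                             float ar, float ai, float c0, float c1, float c2)
    {
        float s1 = s->s1, s2 = s->s2;            /* states from step k-1 */
        float y  = c0 * x + c1 * s1 + c2 * s2;
        s->s1 = ar * s1 + ai * s2 + x;
        s->s2 = ar * s2 - ai * s1;
        return y;
    }
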
diff --git a/system/mmx/pixel_cheby_s16.s b/system/mmx/pixel_cheby_s16.s
deleted file mode 100644
index 2afe9e2..0000000
--- a/system/mmx/pixel_cheby_s16.s
+++ /dev/null
@@ -1,90 +0,0 @@
-# Pure Data Packet mmx routine.
-# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-.globl pixel_cheby_s16_3plus
-.type pixel_cheby_s16_3plus,@function
-
-# void pixel_cheby_s16_3plus(int *buf, int nb_8pixel_vectors, int order+1, short int *coefs)
-
-
-# coefs are s2.13 fixed point (-4->4)
-pixel_cheby_s16_3plus:
- pushl %ebp
- movl %esp, %ebp
- push %esi
- push %edi
- push %edx
-
- movl 8(%ebp), %esi # input array
- movl 12(%ebp), %ecx # vector count
- movl 16(%ebp), %eax # get order+1
-
- shll $3, %eax
- movl 20(%ebp), %edx
-	addl %eax, %edx		# edx = coef end address
-
-# jmp skip
-
- .align 16
- .loop_cheby:
-
- movl 20(%ebp), %edi # get coefs
- movq (%esi), %mm0 # load 4 pixels from memory (mm0 = x)
- pcmpeqw %mm2, %mm2
- movq %mm0, %mm1 # mm1 (T_n-1) <- x
- psrlw $1, %mm2 # mm2 (T_n-2) <- 1
-
-
- movq (%edi), %mm4 # mm4 (acc) == a0
- psraw $1, %mm4 # mm4 == a0/2
- movq %mm0, %mm5 # mm5 (intermediate)
- pmulhw 8(%edi), %mm5 # mm5 == (x * a1)/2
- paddsw %mm5, %mm4 # acc = c0 + c1 x
- addl $16, %edi
-
- .loop_cheby_inner:
- movq %mm1, %mm3 # mm3 == T_n-1
- psraw $2, %mm2 # mm2 == T_n-2 / 4
- pmulhw %mm0, %mm3 # mm3 == (2 x T_n-1) / 4
- psubsw %mm2, %mm3 # mm3 == (2 x T_n-1 - T_n-2) / 4
- paddsw %mm3, %mm3
- paddsw %mm3, %mm3 # mm3 == T_n
- movq %mm1, %mm2 # mm2 == new T_n-1
- movq %mm3, %mm1 # mm3 == new T_n-2
- pmulhw (%edi), %mm3 # mm3 = a_n * T_n / 2
- paddsw %mm3, %mm4 # accumulate
- addl $8, %edi
- cmpl %edx, %edi
- jne .loop_cheby_inner
-
- paddsw %mm4, %mm4 # compensate for 0.125 factor
- paddsw %mm4, %mm4
- paddsw %mm4, %mm4
- movq %mm4, (%esi) # store result in memory
- addl $8, %esi # increment source/dest pointer
- decl %ecx
- jnz .loop_cheby # loop
-
-skip:
- emms
-
- pop %edx
- pop %edi
- pop %esi
- leave
- ret
-
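pixel_cheby_s16_3plus evaluates a Chebyshev polynomial waveshaper, y = sum_n coef[n] * T_n(x), using the recurrence T_n(x) = 2*x*T_(n-1)(x) - T_(n-2)(x) that the inner loop implements in s2.13 fixed point. A floating-point reference for one pixel (names are illustrative):

    static float cheby_ref(float x, const float *coef, int n_coefs) /* n_coefs = order+1 */
    {
        float t_prev = 1.0f;                     /* T_{n-2} */
        float t_cur  = x;                        /* T_{n-1} */
        float y = coef[0] + (n_coefs > 1 ? coef[1] * x : 0.0f);
        for (int n = 2; n < n_coefs; n++) {
            float t = 2.0f * x * t_cur - t_prev; /* Chebyshev recurrence */
            y += coef[n] * t;
            t_prev = t_cur;
            t_cur  = t;
        }
        return y;
    }
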
diff --git a/system/mmx/pixel_conv_hor_s16.s b/system/mmx/pixel_conv_hor_s16.s
deleted file mode 100644
index e90a692..0000000
--- a/system/mmx/pixel_conv_hor_s16.s
+++ /dev/null
@@ -1,134 +0,0 @@
-# Pure Data Packet mmx routine.
-# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
- # intermediate function
-
- # input in register:
- # %mm0: left 4 pixels
- # %mm1: middle 4 pixels
- # %mm2: right 4 pixels
-
- # %mm5: left 4 pixel masks
- # %mm6: middle 4 pixel masks
- # %mm7: right 4 pixel masks
-
- # return in register:
- # %mm0: middle 4 pixels result
-
-
- .conv_hor_4_pixels:
- .align 16
-
- # compute quadruplet
-
- # get left pixels
- psrlq $48, %mm0 # shift word 3 to byte 0
- movq %mm1, %mm4
- psllq $16, %mm4 # shift word 0,1,2 to 1,2,3
- por %mm4, %mm0 # combine
- pmulhw %mm5, %mm0
- psllw $1, %mm0
-
-
- # get middle pixels
- movq %mm1, %mm4
- pmulhw %mm6, %mm4
- psllw $1, %mm4
- paddsw %mm4, %mm0
-
-
- # get right pixels
- movq %mm2, %mm3
- psllq $48, %mm3 # shift word 0 to word 3
- movq %mm1, %mm4
- psrlq $16, %mm4 # shift word 1,2,3 to 0,1,2
- por %mm4, %mm3 # combine
- pmulhw %mm7, %mm3
- psllw $1, %mm3
- paddsw %mm3, %mm0 # accumulate
-
- ret
-
-.globl pixel_conv_hor_s16
-.type pixel_conv_hor_s16,@function
-
-
-# pixel_conv_hor_s16(short int *pixel_array, int nb_4_pixel_vectors, short int border[4], short int mask[12])
-# horizontal unsigned pixel conv (1/4 1/2 1/4) not tested
-# NOT TESTED
-
-
-pixel_conv_hor_s16:
-
-
- pushl %ebp
- movl %esp, %ebp
- push %esi
- push %edi
-
- movl 8(%ebp), %esi # pixel array offset
-	movl 12(%ebp), %ecx	# nb of 4 pixel vectors in a row (at least 2)
-
- movl 20(%ebp), %edi # mask vector
- movq (%edi), %mm5
- movq 8(%edi), %mm6
- movq 16(%edi), %mm7
-
- movl 16(%ebp), %edi # boundary pixel vector
-
-
-
- movq (%edi), %mm0 # init regs (left edge, so mm0 is zero)
- movq (%esi), %mm1
- movq 8(%esi), %mm2
-
- decl %ecx # loop has 2 terminator stubs
- decl %ecx # todo: handle if ecx < 3
-
- jmp .conv_line_loop
-
-
- .align 16
- .conv_line_loop:
- call .conv_hor_4_pixels # compute conv
- movq %mm0, (%esi) # store result
- movq %mm1, %mm0 # mm0 <- prev (%esi)
- movq %mm2, %mm1 # mm1 <- 8(%esi)
- movq 16(%esi), %mm2 # mm2 <- 16(%esi)
-
- addl $8, %esi # increase pointer
- decl %ecx
- jnz .conv_line_loop
-
- call .conv_hor_4_pixels # compute conv
- movq %mm0, (%esi) # store result
- movq %mm1, %mm0 # mm0 <- prev (%esi)
- movq %mm2, %mm1 # mm1 <- 8(%esi)
- movq (%edi), %mm2 # mm2 <- border
-
- call .conv_hor_4_pixels # compute last vector
- movq %mm0, 8(%esi) # store it
-
- emms
-
- pop %edi
- pop %esi
- leave
- ret
-
-
-
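Both convolution routines (horizontal above, vertical in the next file) apply a caller-supplied 3-tap kernel such as (1/4, 1/2, 1/4) along one axis, substituting a border vector for the missing neighbour at the edges. A floating-point, in-place reference for a single row (editor's sketch):

    /* row[i] = k_left*row[i-1] + k_mid*row[i] + k_right*row[i+1], border at the edges */
    static void conv3_row_ref(float *row, int n, float border,
                              float k_left, float k_mid, float k_right)
    {
        float prev = border;                         /* left-edge neighbour */
        for (int i = 0; i < n; i++) {
            float left  = prev;
            float mid   = row[i];
            float right = (i + 1 < n) ? row[i + 1] : border;
            prev   = mid;                            /* keep original value for next step */
            row[i] = k_left * left + k_mid * mid + k_right * right;
        }
    }
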
diff --git a/system/mmx/pixel_conv_ver_s16.s b/system/mmx/pixel_conv_ver_s16.s
deleted file mode 100644
index ae2456f..0000000
--- a/system/mmx/pixel_conv_ver_s16.s
+++ /dev/null
@@ -1,128 +0,0 @@
-# Pure Data Packet mmx routine.
-# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-#TODO: fix out of bounds access in conv_ver and conv_hor
-
- # intermediate function
-
- # input in register:
- # %mm0: top 4 pixels
- # %mm1: middle 4 pixels
- # %mm2: bottom 4 pixels
-
- # %mm5: top 4 pixel mask
- # %mm6: middle 4 pixel mask
- # %mm7: bottom 4 pixel mask
-
- # return in register:
- # %mm0: middle 4 pixels result
-
-
- .conv_ver_4_pixels:
- .align 16
-
- # compute quadruplet
-
- # get top pixel
- pmulhw %mm5, %mm0
- psllw $1, %mm0
-
- # get middle pixel
- movq %mm1, %mm4
- pmulhw %mm6, %mm4
- psllw $1, %mm4
- paddsw %mm4, %mm0
-
- # get bottom pixel
- movq %mm2, %mm3
- pmulhw %mm7, %mm3
- psllw $1, %mm3 # mm3 <- mm3/4
- paddsw %mm3, %mm0
-
- ret
-
-.globl pixel_conv_ver_s16
-.type pixel_conv_ver_s16,@function
-
-
-# pixel_conv_ver_s16(short int *pixel_array, int nb_4_pixel_vectors, int row_byte_size, short int border[4])
-# vertical unsigned pixel conv (1/4 1/2 1/4) not tested
-# NOT TESTED
-
-
-pixel_conv_ver_s16:
-
-
- pushl %ebp
- movl %esp, %ebp
- push %esi
- push %edi
-
- movl 8(%ebp), %esi # pixel array offset
- movl 12(%ebp), %ecx # nb of 4 pixel vectors in a row (at least 2)
- movl 16(%ebp), %edx # rowsize in bytes
-
- movl 24(%ebp), %edi # mask vector
- movq (%edi), %mm5
- movq 8(%edi), %mm6
- movq 16(%edi), %mm7
-
- movl 20(%ebp), %edi # edge vector
-
-
- shll $1, %edx
- decl %ecx # loop has a terminator stub
- decl %ecx # loop has another terminator stub
-
-
- movq (%edi), %mm0 # init regs (left edge, so mm0 is zero)
- movq (%esi), %mm1
- movq (%esi,%edx,1), %mm2
- jmp .conv_line_loop
-
-
- .align 16
- .conv_line_loop:
- call .conv_ver_4_pixels # compute conv
- movq %mm0, (%esi) # store result
- movq %mm1, %mm0 # mm0 <- prev (%esi)
- movq %mm2, %mm1 # mm1 <- (%esi,%edx,1)
- movq (%esi,%edx,2), %mm2 # mm2 <- (%esi,%edx,2)
-
- addl %edx, %esi # increase pointer
- decl %ecx
- jnz .conv_line_loop
-
- call .conv_ver_4_pixels # compute conv
- movq %mm0, (%esi) # store result
- movq %mm1, %mm0 # mm0 <- prev (%esi)
- movq %mm2, %mm1 # mm1 <- (%esi,%edx,1)
- movq (%edi), %mm2 # clear invalid edge vector
-
- addl %edx, %esi # increase pointer
- call .conv_ver_4_pixels # compute last vector
- movq %mm0, (%esi) # store it
-
- emms
-
- pop %edi
- pop %esi
- leave
- ret
-
-
-
diff --git a/system/mmx/pixel_crot_s16.s b/system/mmx/pixel_crot_s16.s
deleted file mode 100644
index 2427869..0000000
--- a/system/mmx/pixel_crot_s16.s
+++ /dev/null
@@ -1,153 +0,0 @@
-# Pure Data Packet mmx routine.
-# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-.globl pixel_crot3d_s16
-.type pixel_crot3d_s16,@function
-
-
-# 3 dimensional colour space rotation
-# 3x3 matrix is column encoded, each coefficient is a 4x16 bit fixed point vector
-
-# void pixel_crot3d_s16(int *buf, int nb_4pixel_vectors_per_plane, short int *matrix)
-
-pixel_crot3d_s16:
- pushl %ebp
- movl %esp, %ebp
- push %esi
- push %edi
-
-
- movl 8(%ebp), %esi # input array
- movl 12(%ebp), %ecx # pixel count
- movl 16(%ebp), %edi # rotation matrix
- movl %ecx, %edx
- shll $3, %edx # %edx = plane spacing
-
-
- .align 16
- .loop_crot3d:
-
- movq (%esi), %mm0 # get 1st component
- movq (%esi,%edx,1), %mm6 # get 2nd component
- movq (%esi,%edx,2), %mm7 # get 3rd component
-
- movq %mm0, %mm1 # copy 1st component
- movq %mm0, %mm2
-
- pmulhw (%edi), %mm0 # mul first column
- pmulhw 8(%edi), %mm1
- pmulhw 16(%edi), %mm2
-
- movq %mm6, %mm5 # copy 2nd component
- movq %mm6, %mm3
-
- pmulhw 24(%edi), %mm6 # mul second column
- pmulhw 32(%edi), %mm5
- pmulhw 40(%edi), %mm3
-
- paddsw %mm6, %mm0 # accumulate
- paddsw %mm5, %mm1
- paddsw %mm3, %mm2
-
- movq %mm7, %mm4 # copy 3rd component
- movq %mm7, %mm6
-
- pmulhw 48(%edi), %mm4 # mul third column
- pmulhw 56(%edi), %mm6
- pmulhw 64(%edi), %mm7
-
- paddsw %mm4, %mm0 # accumulate
- paddsw %mm6, %mm1
- paddsw %mm7, %mm2
-
- paddsw %mm0, %mm0 # double (fixed point normalization)
- paddsw %mm1, %mm1
- paddsw %mm2, %mm2
-
- movq %mm0, (%esi) # store
- movq %mm1, (%esi, %edx, 1)
- movq %mm2, (%esi, %edx, 2)
-
- addl $8, %esi # increment source pointer
- decl %ecx
- jnz .loop_crot3d # loop
-
- emms
-
- pop %edi
- pop %esi
- leave
- ret
-
-
-.globl pixel_crot2d_s16
-.type pixel_crot2d_s16,@function
-
-# 2 dimensional colour space rotation
-# 2x2 matrix is column encoded, each coefficient is a 4x16 bit fixed point vector
-
-# void pixel_crot2d_s16(int *buf, int nb_4pixel_vectors_per_plane, short int *matrix)
-
-pixel_crot2d_s16:
- pushl %ebp
- movl %esp, %ebp
- push %esi
- push %edi
-
-
- movl 8(%ebp), %esi # input array
- movl 12(%ebp), %ecx # pixel count
- movl 16(%ebp), %edi # rotation matrix
- movl %ecx, %edx
- shll $3, %edx # %edx = plane spacing
-
-
- .align 16
- .loop_crot2d:
-
- movq (%esi), %mm0 # get 1st component
- movq (%esi,%edx,1), %mm2 # get 2nd component
-
- movq %mm0, %mm1 # copy 1st component
- movq %mm2, %mm3 # copy 2nd component
-
- pmulhw (%edi), %mm0 # mul first column
- pmulhw 8(%edi), %mm1
-
- pmulhw 16(%edi), %mm2 # mul second column
- pmulhw 24(%edi), %mm3
-
- paddsw %mm2, %mm0 # accumulate
- paddsw %mm3, %mm1
-
- paddsw %mm0, %mm0 # fixed point gain correction
- paddsw %mm1, %mm1
-
- movq %mm0, (%esi) # store
- movq %mm1, (%esi, %edx, 1)
-
- addl $8, %esi # increment source pointer
- decl %ecx
- jnz .loop_crot2d # loop
-
- emms
-
- pop %edi
- pop %esi
- leave
- ret
-
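pixel_crot3d_s16 applies a column-encoded 3x3 matrix to the three colour planes of each pixel, in fixed point (pmulhw followed by a doubling). The underlying arithmetic as a floating-point sketch for one pixel (names are the editor's):

    /* m is stored column by column: m[0..2] = first column, m[3..5] = second, m[6..8] = third */
    static void crot3d_ref(float *c0, float *c1, float *c2, const float m[9])
    {
        float x = *c0, y = *c1, z = *c2;
        *c0 = m[0] * x + m[3] * y + m[6] * z;
        *c1 = m[1] * x + m[4] * y + m[7] * z;
        *c2 = m[2] * x + m[5] * y + m[8] * z;
    }
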
diff --git a/system/mmx/pixel_gain.s b/system/mmx/pixel_gain.s
deleted file mode 100644
index 5cd5057..0000000
--- a/system/mmx/pixel_gain.s
+++ /dev/null
@@ -1,83 +0,0 @@
-# Pure Data Packet mmx routine.
-# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-.globl pixel_gain
-.type pixel_gain,@function
-
-# mmx rgba pixel gain
-# void pixel_gain(char *pixel_array, int nb_pixels, short int rgba_gain[4])
-# gains are 7.9 fixed point for rgba
-
-pixel_gain:
- pushl %ebp
- movl %esp, %ebp
- push %esi
- push %edi
-
- movl 8(%ebp), %esi # pixel array offset
- movl 12(%ebp), %ecx # nb of elements
- movl 16(%ebp), %edi # int16[4] array of gains
-
- prefetch (%esi)
-
- emms
- sarl $2, %ecx # process 4 pixels per loop iteration
- jz .exit
- movq (%edi), %mm7 # read gain array from memory
- jmp .loop_gain
-
- .align 16
- .loop_gain:
-
- prefetch 128(%esi)
- movq (%esi), %mm5 # load pixel 1-2 from memory
- movq 8(%esi), %mm6 # load pixel 3-4 from memory
- pxor %mm0, %mm0 # zero mm0 - mm3
- pxor %mm1, %mm1
- pxor %mm2, %mm2
- pxor %mm3, %mm3
- punpcklbw %mm5, %mm0 # unpack 1st pixel into 8.8 bit ints
- punpckhbw %mm5, %mm1 # unpack 2nd
- punpcklbw %mm6, %mm2 # unpack 3rd
- punpckhbw %mm6, %mm3 # unpack 4th
- psrlw $0x1, %mm0 # shift right to clear sign bit 9.7
- psrlw $0x1, %mm1
- psrlw $0x1, %mm2
- psrlw $0x1, %mm3
-
- pmulhw %mm7, %mm0 # multiply 1st pixel 9.7 * 7.9 -> 16.0
- pmulhw %mm7, %mm1 # multiply 2nd
- pmulhw %mm7, %mm2 # multiply 3rd
- pmulhw %mm7, %mm3 # multiply 4th
-
- packuswb %mm1, %mm0 # pack & saturate to 8bit vector
- movq %mm0, (%esi) # store result in memory
- packuswb %mm3, %mm2 # pack & saturate to 8bit vector
- movq %mm2, 8(%esi) # store result in memory
-
- addl $16, %esi # increment source pointer
- decl %ecx
- jnz .loop_gain # loop
-
- .exit:
- emms
-
- pop %edi
- pop %esi
- leave
- ret
-
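pixel_gain works on packed 8-bit RGBA: each byte is expanded to 9.7 fixed point (unpacked into the high byte, then shifted right by one to clear the sign bit), multiplied by a 7.9 gain with pmulhw, and packed back with unsigned saturation. A scalar sketch for one channel (names are illustrative):

    #include <stdint.h>

    static uint8_t pixel_gain_ref(uint8_t p, int16_t gain_7_9) /* gain_7_9 = gain * 2^9 */
    {
        int32_t v = ((int32_t)(p << 7) * gain_7_9) >> 16;      /* 9.7 * 7.9 -> 16.0 */
        if (v < 0)   v = 0;                                    /* packuswb saturation */
        if (v > 255) v = 255;
        return (uint8_t)v;
    }
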
diff --git a/system/mmx/pixel_gain_s16.s b/system/mmx/pixel_gain_s16.s
deleted file mode 100644
index adcfdf5..0000000
--- a/system/mmx/pixel_gain_s16.s
+++ /dev/null
@@ -1,71 +0,0 @@
-# Pure Data Packet mmx routine.
-# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-.globl pixel_gain_s16
-.type pixel_gain_s16,@function
-
-# gain is integer, shift count is down
-# void pixel_gain_s16(int *buf, int nb_8pixel_vectors, short int gain[4], unsigned long long *shift)
-
-pixel_gain_s16:
- pushl %ebp
- movl %esp, %ebp
- push %esi
- push %edi
-
- movl 20(%ebp), %edi
- movq (%edi), %mm6 # get shift vector
-
- movl 16(%ebp), %edi
- movq (%edi), %mm7 # get gain vector
-
- movl 8(%ebp), %esi # input array
- movl 12(%ebp), %ecx # pixel count
-
-
- .align 16
- .loop_gain:
-
- movq (%esi), %mm0 # load 4 pixels from memory
- movq %mm0, %mm1
- pmulhw %mm7, %mm1 # apply gain (s15.0) fixed point, high word
- pmullw %mm7, %mm0 # low word
-
- movq %mm0, %mm2 # copy
- movq %mm1, %mm3
-
- punpcklwd %mm1, %mm0 # unpack lsw components
- punpckhwd %mm3, %mm2 # unpack msw components
-
- psrad %mm6, %mm0 # apply signed shift
- psrad %mm6, %mm2
-
- packssdw %mm2, %mm0 # pack result & saturate
- movq %mm0, (%esi) # store result
-
-
- addl $8, %esi # increment source pointer
- decl %ecx
- jnz .loop_gain # loop
-
- emms
-
- pop %edi
- pop %esi
- leave
- ret
-
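pixel_gain_s16 uses an integer gain plus a signed right shift: pmullw and pmulhw together form the full 32-bit product, psrad applies the shift, and packssdw saturates the result back to 16 bits. A scalar sketch for one pixel (parameter names are the editor's):

    #include <stdint.h>

    static int16_t pixel_gain_s16_ref(int16_t x, int16_t gain, unsigned shift)
    {
        int32_t v = (int32_t)x * gain;   /* pmullw (low word) + pmulhw (high word) */
        v >>= shift;                     /* psrad: arithmetic shift right          */
        if (v >  32767) v =  32767;      /* packssdw saturation                    */
        if (v < -32768) v = -32768;
        return (int16_t)v;
    }
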
diff --git a/system/mmx/pixel_mix_s16.s b/system/mmx/pixel_mix_s16.s
deleted file mode 100644
index 9bf41eb..0000000
--- a/system/mmx/pixel_mix_s16.s
+++ /dev/null
@@ -1,68 +0,0 @@
-# Pure Data Packet mmx routine.
-# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-.globl pixel_mix_s16
-.type pixel_mix_s16,@function
-
-# mmx s16 pixel mix (crossfade)
-# void pixel_mix_s16(int *left, int *right, int nb_4pixel_vectors,
-# short int gain_left[4], short int gain_right[4])
-
-pixel_mix_s16:
- pushl %ebp
- movl %esp, %ebp
- push %esi
- push %edi
-
- movl 20(%ebp), %edi # int16[4] array of gains
- movq (%edi), %mm6 # get left gain array
-
- movl 24(%ebp), %edi # int16[4] array of gains
- movq (%edi), %mm7 # get right gain array
-
- movl 8(%ebp), %edi # left array
- movl 12(%ebp), %esi # right array
- movl 16(%ebp), %ecx # pixel count
-
-
- .align 16
- .loop_mix:
-
-# prefetch 128(%esi)
- movq (%esi), %mm1 # load right 4 pixels from memory
- pmulhw %mm7, %mm1 # apply right gain
- movq (%edi), %mm0 # load 4 left pixels from memory
- pmulhw %mm6, %mm0 # apply left gain
-# pslaw $1, %mm1 # shift left (s.15 x s.15 -> s0.14)
-# pslaw $1, %mm0
-	paddsw %mm0, %mm0	# no arithmetic shift left instruction, so use add instead
- paddsw %mm1, %mm1
- paddsw %mm1, %mm0 # mix
- movq %mm0, (%edi)
- addl $8, %esi
- addl $8, %edi
- decl %ecx
- jnz .loop_mix # loop
-
- emms
-
-
- pop %edi
- pop %esi
- leave
- ret
-
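
The mix loop works in s.15 fixed point: pmulhw keeps the high word of each product (an implicit >> 16), the saturating doubling restores the s.15 scale, and the final saturating add blends the two images into the left buffer. A scalar sketch under those assumptions (hypothetical names):

#include <stdint.h>

static int16_t sat_add16(int32_t a, int32_t b)
{
    int32_t s = a + b;
    return s > 32767 ? 32767 : (s < -32768 ? -32768 : (int16_t)s);
}

void mix_s16_ref(int16_t *left, const int16_t *right, int nb_pixels,
                 const int16_t gain_left[4], const int16_t gain_right[4])
{
    for (int i = 0; i < nb_pixels; i++) {
        int32_t l = ((int32_t)left[i]  * gain_left[i % 4])  >> 16;  /* pmulhw */
        int32_t r = ((int32_t)right[i] * gain_right[i % 4]) >> 16;
        left[i] = sat_add16(sat_add16(l, l), sat_add16(r, r));      /* paddsw */
    }
}
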
diff --git a/system/mmx/pixel_mul_s16.s b/system/mmx/pixel_mul_s16.s
deleted file mode 100644
index 240a024..0000000
--- a/system/mmx/pixel_mul_s16.s
+++ /dev/null
@@ -1,56 +0,0 @@
-# Pure Data Packet mmx routine.
-# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-.globl pixel_mul_s16
-.type pixel_mul_s16,@function
-
-# simple multiply
-# void pixel_mul_s16(int *left, int *right, int nb_4pixel_vectors)
-
-pixel_mul_s16:
- pushl %ebp
- movl %esp, %ebp
- push %esi
- push %edi
-
- movl 8(%ebp), %edi # left array
- movl 12(%ebp), %esi # right array
- movl 16(%ebp), %ecx # pixel count
-
-
- .align 16
- .loop_mix:
-
-# prefetch 128(%esi)
- movq (%esi), %mm1 # load right 4 pixels from memory
- movq (%edi), %mm0 # load 4 left pixels from memory
- pmulhw %mm1, %mm0 # mul
- psllw $1, %mm0 # fixed point shift correction
- movq %mm0, (%edi)
- addl $8, %esi
- addl $8, %edi
- decl %ecx
- jnz .loop_mix # loop
-
- emms
-
-
- pop %edi
- pop %esi
- leave
- ret
-
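
Here pmulhw again provides (a * b) >> 16 and the psllw restores the s.15 point, this time without saturation. A scalar sketch (hypothetical names):

#include <stdint.h>

void mul_s16_ref(int16_t *left, const int16_t *right, int nb_pixels)
{
    for (int i = 0; i < nb_pixels; i++) {
        int16_t high = (int16_t)(((int32_t)left[i] * right[i]) >> 16);  /* pmulhw */
        left[i] = (int16_t)(high << 1);   /* fixed point shift correction, wraps   */
    }
}
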
diff --git a/system/mmx/pixel_pack_s16u8.s b/system/mmx/pixel_pack_s16u8.s
deleted file mode 100644
index 57df702..0000000
--- a/system/mmx/pixel_pack_s16u8.s
+++ /dev/null
@@ -1,126 +0,0 @@
-# Pure Data Packet mmx routine.
-# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-.globl pixel_pack_s16u8_y
-.type pixel_pack_s16u8_y,@function
-
-# pack signed 16 bit pixels to unsigned 8 bit, y channel
-# void pixel_pack_s16u8_y(int *input, int *output, int nb_8pixel_vectors)
-
-pixel_pack_s16u8_y:
- pushl %ebp
- movl %esp, %ebp
- push %esi
- push %edi
-
-# movl 20(%ebp), %edi # int16[4] array of gains
-# movq (%edi), %mm7 # get gain array
-# psllw $1, %mm7 # adjust for shifted sign bit
-
- movl 8(%ebp), %esi # input array
- movl 12(%ebp), %edi # output array
- movl 16(%ebp), %ecx # pixel count
-
- pxor %mm6, %mm6
-
- .align 16
- .loop_pack_y:
-
-# prefetch 128(%esi)
- movq (%esi), %mm0 # load 4 pixels from memory
-# pmulhw %mm7, %mm0 # apply gain
- movq 8(%esi), %mm1 # load 4 pixels from memory
-# pmulhw %mm7, %mm1 # apply gain
-
-# movq %mm0, %mm2
-# pcmpgtw %mm6, %mm2 # mm2 > 0 ? 0xffff : 0
-# pand %mm2, %mm0
-
-# movq %mm1, %mm3
-# pcmpgtw %mm6, %mm3 # mm3 > 0 ? 0xffff : 0
-# pand %mm3, %mm1
-
-# psllw $1, %mm0 # shift out sign bit
-# psllw $1, %mm1 # shift out sign bit
-
- psraw $7, %mm0 # shift to lsb
- psraw $7, %mm1 # shift to lsb
-
- packuswb %mm1, %mm0 # pack & saturate to 8bit vector
- movq %mm0, (%edi) # store result in memory
-
- addl $16, %esi # increment source pointer
- addl $8, %edi # increment dest pointer
- decl %ecx
- jnz .loop_pack_y # loop
-
- emms
-
- pop %edi
- pop %esi
- leave
- ret
-
-.globl pixel_pack_s16u8_uv
-.type pixel_pack_s16u8_uv,@function
-
-pixel_pack_s16u8_uv:
- pushl %ebp
- movl %esp, %ebp
- push %esi
- push %edi
-
-# movl 20(%ebp), %edi # int16[4] array of gains
-# movq (%edi), %mm7 # get gain array
-	movl 8(%ebp), %esi	# input array
-	movl 12(%ebp), %edi	# output array
- movl 16(%ebp), %ecx # pixel count
-
- pcmpeqw %mm6, %mm6
- psllw $15, %mm6
- movq %mm6, %mm5
- psrlw $8, %mm5
- por %mm5, %mm6 # mm6 <- 8 times 0x80
-
- .align 16
- .loop_pack_uv:
-
-# prefetch 128(%esi)
- movq (%esi), %mm0 # load 4 pixels from memory
-# pmulhw %mm7, %mm0 # apply gain
- movq 8(%esi), %mm1 # load 4 pixels from memory
-# pmulhw %mm7, %mm1 # apply gain
-
- psraw $8, %mm0 # shift to msb
- psraw $8, %mm1
-
- packsswb %mm1, %mm0 # pack & saturate to 8bit vector
- pxor %mm6, %mm0 # flip sign bits
- movq %mm0, (%edi) # store result in memory
-
- addl $16, %esi # increment source pointer
- addl $8, %edi # increment dest pointer
- decl %ecx
- jnz .loop_pack_uv # loop
-
- emms
-
- pop %edi
- pop %esi
- leave
- ret
-
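
Both pack variants reduce signed 16 bit pixels to bytes: the y path shifts down by 7 and saturates to unsigned 8 bit (packuswb), the uv path shifts down by 8, saturates to signed 8 bit (packsswb) and flips the sign bit to restore the 128 chroma offset. A scalar sketch of the two paths (hypothetical names; arithmetic right shift assumed):

#include <stdint.h>

void pack_y_ref(const int16_t *in, uint8_t *out, int nb_pixels)
{
    for (int i = 0; i < nb_pixels; i++) {
        int32_t v = in[i] >> 7;                               /* psraw $7 */
        out[i] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));  /* packuswb */
    }
}

void pack_uv_ref(const int16_t *in, uint8_t *out, int nb_pixels)
{
    for (int i = 0; i < nb_pixels; i++) {
        int32_t v = in[i] >> 8;                      /* psraw $8 */
        v = v < -128 ? -128 : (v > 127 ? 127 : v);   /* packsswb */
        out[i] = (uint8_t)(v ^ 0x80);                /* flip sign bit -> +128 offset */
    }
}
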
diff --git a/system/mmx/pixel_rand_s16.s b/system/mmx/pixel_rand_s16.s
deleted file mode 100644
index 649400b..0000000
--- a/system/mmx/pixel_rand_s16.s
+++ /dev/null
@@ -1,76 +0,0 @@
-# Pure Data Packet mmx routine.
-# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-.globl pixel_rand_s16
-.type pixel_rand_s16,@function
-
-# mmx pseudo random pixel generator (per-component 16 bit LFSR)
-# void pixel_rand_s16(int *dst, int nb_4pixel_vectors, short int random_seed[4])
-
-pixel_rand_s16:
- pushl %ebp
- movl %esp, %ebp
- push %esi
- push %edi
-
- movl 16(%ebp), %esi # int16[4] array of random seeds
- movl 8(%ebp), %edi # dst array
- movl 12(%ebp), %ecx # pixel count
-
- movq (%esi), %mm6
-
-
- pcmpeqw %mm3, %mm3
- psrlw $15, %mm3 # get bit mask 4 times 0x0001
-
- .align 16
- .loop_rand:
-
-# prefetch 128(%esi)
-
-
- movq %mm6, %mm4 # get random vector
- psrlw $15, %mm4 # get first component
- movq %mm6, %mm5
- psrlw $14, %mm5 # get second component
- pxor %mm5, %mm4
- movq %mm6, %mm5
- psrlw $12, %mm5 # get third component
- pxor %mm5, %mm4
- movq %mm6, %mm5
-	psrlw $3, %mm5		# get fourth component
- pxor %mm5, %mm4
-
- psllw $1, %mm6 # shift left original random vector
- pand %mm3, %mm4 # isolate new bit
- por %mm4, %mm6 # combine into new random vector
-
- movq %mm6, (%edi)
- addl $8, %edi
- decl %ecx
- jnz .loop_rand # loop
-
-
- movq %mm6, (%esi) # store random seeds
-
- emms
-
- pop %edi
- pop %esi
- leave
- ret
-
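
The generator is four independent 16 bit Fibonacci LFSRs, one per word lane: the new bit is the xor of bits 15, 14, 12 and 3 of the seed, shifted in from the right. A scalar sketch (hypothetical names):

#include <stdint.h>

uint16_t lfsr_step(uint16_t s)
{
    uint16_t bit = (uint16_t)(((s >> 15) ^ (s >> 14) ^ (s >> 12) ^ (s >> 3)) & 1);
    return (uint16_t)((s << 1) | bit);   /* shift left, insert feedback bit */
}

void rand_s16_ref(int16_t *dst, int nb_pixels, uint16_t seed[4])
{
    for (int i = 0; i < nb_pixels; i++) {
        seed[i % 4] = lfsr_step(seed[i % 4]);
        dst[i] = (int16_t)seed[i % 4];
    }
}
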
diff --git a/system/mmx/pixel_randmix_s16.s b/system/mmx/pixel_randmix_s16.s
deleted file mode 100644
index 44e1702..0000000
--- a/system/mmx/pixel_randmix_s16.s
+++ /dev/null
@@ -1,91 +0,0 @@
-# Pure Data Packet mmx routine.
-# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-.globl pixel_randmix_s16
-.type pixel_randmix_s16,@function
-
-# mmx random pixel mix: per-pixel random switch between two images
-# void pixel_randmix_s16(int *left, int *right, int nb_4pixel_vectors, short int random_seed[4], short int threshold[4])
-
-pixel_randmix_s16:
- pushl %ebp
- movl %esp, %ebp
- push %esi
- push %edi
-
- movl 20(%ebp), %edi # int16[4] array of random seeds
- movq (%edi), %mm6
-
- movl 24(%ebp), %edi # int16[4] array of thresholds
- movq (%edi), %mm7
-
- movl 8(%ebp), %edi # left array
- movl 12(%ebp), %esi # right array
- movl 16(%ebp), %ecx # pixel count
-
- pcmpeqw %mm3, %mm3
- psrlw $15, %mm3 # get bit mask 4 times 0x0001
-
- .align 16
- .loop_randmix:
-
-# prefetch 128(%esi)
- movq (%esi), %mm1 # load right 4 pixels from memory
- movq (%edi), %mm0 # load 4 left pixels from memory
-
- movq %mm6, %mm2 # get random vector
- pcmpgtw %mm7, %mm2 # compare random vector with threshold
- movq %mm2, %mm5
-
- pand %mm0, %mm2 # get left array's components
- pandn %mm1, %mm5 # get right array's components
- por %mm2, %mm5
-
- movq %mm5, (%edi) # store pixels
-
- movq %mm6, %mm4 # get random vector
- psrlw $15, %mm4 # get first component
- movq %mm6, %mm5
- psrlw $14, %mm5 # get second component
- pxor %mm5, %mm4
- movq %mm6, %mm5
- psrlw $12, %mm5 # get third component
- pxor %mm5, %mm4
- movq %mm6, %mm5
-	psrlw $3, %mm5		# get fourth component
- pxor %mm5, %mm4
-
- psllw $1, %mm6 # shift left original random vector
- pand %mm3, %mm4 # isolate new bit
- por %mm4, %mm6 # combine into new random vector
-
- addl $8, %esi
- addl $8, %edi
- decl %ecx
- jnz .loop_randmix # loop
-
-
- movl 20(%ebp), %edi # int16[4] array of random seeds
- movq %mm6, (%edi) # store random seeds
-
- emms
-
- pop %edi
- pop %esi
- leave
- ret
-
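
The random mix uses the same LFSR as pixel_rand_s16, but the random word is compared (signed) against a threshold and the compare mask selects either the left or the right pixel. A scalar sketch reusing lfsr_step() from the previous sketch (hypothetical names):

#include <stdint.h>

uint16_t lfsr_step(uint16_t s);   /* see the pixel_rand_s16 sketch */

void randmix_s16_ref(int16_t *left, const int16_t *right, int nb_pixels,
                     uint16_t seed[4], const int16_t threshold[4])
{
    for (int i = 0; i < nb_pixels; i++) {
        int j = i % 4;
        /* pcmpgtw: keep the left pixel when seed > threshold, else take the right one */
        if ((int16_t)seed[j] <= threshold[j])
            left[i] = right[i];
        seed[j] = lfsr_step(seed[j]);
    }
}
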
diff --git a/system/mmx/pixel_resample_s16.s b/system/mmx/pixel_resample_s16.s
deleted file mode 100644
index 3959f9c..0000000
--- a/system/mmx/pixel_resample_s16.s
+++ /dev/null
@@ -1,314 +0,0 @@
-
-
-#interpolation data:
-#* 4 vectors: neighbourhood for samples (TL, TR, BL, BR)
-#* 2 vectors: fractional part (unsigned)
-#* 2 vectors: addresses of pixel blocks
-
-#coord conversion data:
-#1 vector: 32bit splatted address
-#1 vector: 16bit splatted w-1
-#1 vector: 16bit splatted h-1
-#1 vector: 16bit splatted w (reuse w-1 with add?)
-#1 dword: 32 bit line offset
-
-#coord generation data: several vectors for parameter update stuff..
-
-#coordinate systems: 16 bit virtual coordinates (signed, center relative)
-#* 2 vectors: virtual coordinates
-#(possibly an intermediate step + conversion to 16 bit virtual)
-
-
-#step 1: generate virtual coords
-
-
-#step 2: virtual coords -> block addresses + fractional addresses
-#* mulhigh: real coords (x,y) (center relative)
-#* add center -> unsigned (top left relative)
-#* mullow: fractional part (x_frac, y_frac)
-#* mulhigh, mullow, pack 32bit: y_offset
-#* pack 32bit: x_offset
-#* add, shift, add start address: real addresses
-
-
-#step 3: data fetch using generated addresses:
-# this step would be much simpler in 4x16bit rgba. life's a bitch..
-
-#step 4: bilinear interpolation
-
-#step 5: store
-
-
-
- # this can be simplified by doing 32 bit unaligned moves
- # and vector unpacking on the data
-
-
-
- # cooked image data structure
- # pixel environment temp storage
- TL1 = 0x00
- TL2 = 0x02
- TL3 = 0x04
- TL4 = 0x06
- TR1 = 0x08
- TR2 = 0x0A
- TR3 = 0x0C
- TR4 = 0x0E
- BL1 = 0x10
- BL2 = 0x12
- BL3 = 0x14
- BL4 = 0x16
- BR1 = 0x18
- BR2 = 0x1A
- BR3 = 0x1C
- BR4 = 0x1E
- # addresses of pixel blocks
- ADDRESS1 = 0x20
- ADDRESS2 = 0x24
- ADDRESS3 = 0x28
- ADDRESS4 = 0x2C
-
- # second env + address buffer (testing: not used)
- SECONDBUFFER = 0x30
-
- # 32bit splatted bitmap address
- V2PLANEADDRESS = 0x60
- # 16bit splatted image constants
- V4TWOWIDTHM1 = 0x68
- V4TWOHEIGHTM1 = 0x70
- V4LINEOFFSET = 0x78
- # data struct size
- RESAMPLEDATASIZE = 0x80
-
-
-
- # interpolation routine
- # input: %mm0, %mm1 4 x 16bit unsigned top left relative virtual x and y coordinates
- # %esi: temp & algo data structure
-
-getpixelsbilin: psrlw $1, %mm0 # convert to range 0->0x7fff [0,0.5[
- psrlw $1, %mm1
- movq %mm0, %mm2
- movq %mm1, %mm3
- movq V4TWOWIDTHM1(%esi), %mm4 # 2 * (width - 1)
- movq V4TWOHEIGHTM1(%esi), %mm5 # 2 * (height - 1)
- pmulhw %mm5, %mm3 # mm3 == y coord (topleft relative)
- pmulhw %mm4, %mm2 # mm2 == x coord (topleft relative)
- pmullw %mm5, %mm1 # mm1 == y frac (unsigned)
- pmullw %mm4, %mm0 # mm0 == x frac (unsigned)
-
- movq %mm3, %mm5 # copy y coord
- pmullw V4LINEOFFSET(%esi), %mm3 # low part of line offset
- pmulhw V4LINEOFFSET(%esi), %mm5 # high part of line offset
-
- movq %mm2, %mm7 # copy x coord vector
- pxor %mm4, %mm4
- punpcklwd %mm4, %mm2 # low part in %mm2
-	punpckhwd %mm4, %mm7		# high part in %mm7
-
- movq %mm3, %mm6 # copy
- punpcklwd %mm5, %mm3 # unpack low part in %mm3
-	punpckhwd %mm5, %mm6		# high part in %mm6
-
- paddd %mm2, %mm3
- paddd %mm7, %mm6
-	pslld $1, %mm3			# convert to word addresses
- pslld $1, %mm6
-
- paddd V2PLANEADDRESS(%esi), %mm3 # add pixel plane address
- paddd V2PLANEADDRESS(%esi), %mm6
-
-	movq %mm3, ADDRESS1(%esi)	# store addresses
- movq %mm6, ADDRESS3(%esi)
-
- pcmpeqw %mm2, %mm2 # all ones
- movq %mm0, %mm4 # copy x frac
- movq %mm1, %mm5 # copy y frac
-	pxor %mm2, %mm4			# compute complement (approx negative)
- pxor %mm2, %mm5
-
-	psrlw $1, %mm0			# shift right: 0.5 * frac x
-	psrlw $1, %mm1			# shift right: 0.5 * frac y
-	psrlw $1, %mm4			# shift right: 0.5 * (1 - frac x)
-	psrlw $1, %mm5			# shift right: 0.5 * (1 - frac y)
-
- movq %mm0, %mm2 # copy of frac x
- movq %mm4, %mm3 # copy of (1-frac x)
- # fetch data
-
- #jmp skipfetch # seems the fetch is the real killer. try to optimize this
- # using 32 bit accesses & shifts
-
- # the src image data struct is padded to the cooked data struct
- movl RESAMPLEDATASIZE(%esi), %edi
- shll $1, %edi
-
- movl ADDRESS1(%esi), %ecx
- movl ADDRESS2(%esi), %edx
-
- movw (%ecx), %ax
- movw (%edx), %bx
- movw %ax, TL1(%esi)
- movw %bx, TL2(%esi)
- movw 2(%ecx), %ax
- movw 2(%edx), %bx
- movw %ax, TR1(%esi)
- movw %bx, TR2(%esi)
-
- addl %edi, %ecx
- addl %edi, %edx
-
- movw (%ecx), %ax
- movw (%edx), %bx
- movw %ax, BL1(%esi)
- movw %bx, BL2(%esi)
- movw 2(%ecx), %ax
- movw 2(%edx), %bx
- movw %ax, BR1(%esi)
- movw %bx, BR2(%esi)
-
-
- movl ADDRESS3(%esi), %ecx
- movl ADDRESS4(%esi), %edx
-
-
- movw (%ecx), %ax
- movw (%edx), %bx
- movw %ax, TL3(%esi)
- movw %bx, TL4(%esi)
- movw 2(%ecx), %ax
- movw 2(%edx), %bx
- movw %ax, TR3(%esi)
- movw %bx, TR4(%esi)
-
- addl %edi, %ecx
- addl %edi, %edx
-
- movw (%ecx), %ax
- movw (%edx), %bx
- movw %ax, BL3(%esi)
- movw %bx, BL4(%esi)
- movw 2(%ecx), %ax
- movw 2(%edx), %bx
- movw %ax, BR3(%esi)
- movw %bx, BR4(%esi)
-
-
-skipfetch:
- pmulhw TL1(%esi), %mm4 # bilin interpolation
- pmulhw TR1(%esi), %mm0
- pmulhw BL1(%esi), %mm3
- pmulhw BR1(%esi), %mm2
-
-
- paddw %mm4, %mm0
- paddw %mm3, %mm2
-
- pmulhw %mm5, %mm0
- pmulhw %mm1, %mm2
-
- paddw %mm2, %mm0
- psllw $2, %mm0 # compensate for gain reduction
-
- ret
-
-
- // linear mapping data struct
- ROWSTATEX = 0x0
- ROWSTATEY = 0x8
- COLSTATEX = 0x10
- COLSTATEY = 0x18
- ROWINCX = 0x20
- ROWINCY = 0x28
- COLINCX = 0x30
- COLINCY = 0x38
-
- // image data struct
- LINEOFFSET = 0x0
- IMAGEADDRESS = 0x4
- WIDTH = 0x8
- HEIGHT = 0xC
- IMAGEDATASIZE = 0x10
-
-
-
-# pixel_resample_linmap_s16(void *x)
-.globl pixel_resample_linmap_s16
-.type pixel_resample_linmap_s16,@function
-
- SOURCEIMAGE = RESAMPLEDATASIZE
- DESTIMAGE = SOURCEIMAGE + IMAGEDATASIZE
- LINMAPDATA = DESTIMAGE + IMAGEDATASIZE
-
-pixel_resample_linmap_s16:
- pushl %ebp
- movl %esp, %ebp
- pushl %esi
- pushl %edi
- pushl %ebx
-
-
- movl 8(%ebp), %esi # get data struct
- movl DESTIMAGE+HEIGHT(%esi), %edx # image height
- movl DESTIMAGE+IMAGEADDRESS(%esi), %edi # dest image address
- movl DESTIMAGE+WIDTH(%esi), %ecx # image width
- shrl $2, %ecx # vector count
- .align 16
-
-linmap_looprow:
- movq LINMAPDATA+ROWSTATEX(%esi), %mm0 # get current coordinates
- movq LINMAPDATA+ROWSTATEY(%esi), %mm1
-
-linmap_loopcol:
- movq %mm0, %mm4 # copy
- movq %mm1, %mm5
- paddd LINMAPDATA+ROWINCX(%esi), %mm4 # increment
- paddd LINMAPDATA+ROWINCY(%esi), %mm5
- movq %mm4, %mm6 # copy
- movq %mm5, %mm7
- paddd LINMAPDATA+ROWINCX(%esi), %mm6 # increment
- paddd LINMAPDATA+ROWINCY(%esi), %mm7
- movq %mm6, LINMAPDATA+ROWSTATEX(%esi) # store next state
- movq %mm7, LINMAPDATA+ROWSTATEY(%esi)
-
- psrad $16, %mm0 # round to 16 bit
- psrad $16, %mm1
- psrad $16, %mm4
- psrad $16, %mm5
- packssdw %mm4, %mm0 # pack new coordinates
- packssdw %mm5, %mm1
-
- push %ecx
- push %edx
- push %edi
-
- call getpixelsbilin # do interpolation
-
- pop %edi
- pop %edx
- pop %ecx
- movq %mm0, (%edi) # store 4 pixels
- addl $0x8, %edi # point to next 4 pixels
- decl %ecx # dec row counter
- jnz linmap_looprow
-
- movq LINMAPDATA+COLSTATEX(%esi), %mm0 # get column state vector
- movq LINMAPDATA+COLSTATEY(%esi), %mm1
- movl DESTIMAGE+WIDTH(%esi), %ecx # image width
- shrl $2, %ecx # vector count
- paddd LINMAPDATA+COLINCX(%esi), %mm0 # increment
- paddd LINMAPDATA+COLINCY(%esi), %mm1
- movq %mm0, LINMAPDATA+COLSTATEX(%esi) # store
- movq %mm1, LINMAPDATA+COLSTATEY(%esi)
- decl %edx # dec column counter
- jnz linmap_loopcol
-
- emms
- popl %ebx
- popl %edi
- popl %esi
- leave
- ret
-
-
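
The resampler maps each destination pixel through a linear coordinate transform, converts the virtual coordinate into a block address plus a fractional part, fetches the four neighbouring pixels and blends them bilinearly. A scalar sketch of the fetch-and-interpolate step, in floating point for clarity where the MMX code stays in 16 bit fixed point (hypothetical names; width and height of at least 2 and coordinates in [0,1) assumed):

#include <stdint.h>

int16_t bilin_ref(const int16_t *image, int width, int height,
                  double x, double y)
{
    double fx = x * (width - 1),  fy = y * (height - 1);
    int    ix = (int)fx,          iy = (int)fy;
    double ax = fx - ix,          ay = fy - iy;     /* fractional parts   */

    const int16_t *p = image + iy * width + ix;     /* top left neighbour */
    double top    = p[0]     + ax * (p[1]         - p[0]);
    double bottom = p[width] + ax * (p[width + 1] - p[width]);
    return (int16_t)(top + ay * (bottom - top));
}
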
diff --git a/system/mmx/pixel_s1.s b/system/mmx/pixel_s1.s
deleted file mode 100644
index d6bc5ca..0000000
--- a/system/mmx/pixel_s1.s
+++ /dev/null
@@ -1,201 +0,0 @@
-# Pure Data Packet mmx routine.
-# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-
- # this file contains ops for binary image processing
- # 8x8 bit tile encoded
- # low byte = bottom row
- # low bit = right column
- # %mm7 = scratch reg for all macros
-
-
- # ************ load mask *******************
- # compute bit masks for rows and columns
- # %mm7: scratch reg
-
- # load mask top
- .macro ldmt count reg
- pcmpeqb \reg, \reg
- psllq $(64-(\count<<3)), \reg
- .endm
-
- # load mask bottom
- .macro ldmb count reg
- pcmpeqb \reg, \reg
- psrlq $(64-(\count<<3)), \reg
- .endm
-
- # load mask top and bottom
- .macro ldmtb count regt regb
- ldmb \count, \regb
- ldmt \count, \regt
- .endm
-
- # load mask right
- .macro ldmr count reg
- pcmpeqb %mm7, %mm7
- psrlw $(16-\count), %mm7
- movq %mm7, \reg
- psllq $8, %mm7
- por %mm7, \reg
- .endm
-
- # load mask left
- .macro ldml count reg
- pcmpeqb %mm7, %mm7
- psllw $(16-\count), %mm7
- movq %mm7, \reg
- psrlq $8, %mm7
- por %mm7, \reg
- .endm
-
- # load mask left and right
- .macro ldmlr count regl regr
- pcmpeqb %mm7, %mm7
- psllw $(16-\count), %mm7
- movq %mm7, \regl
- psrlq $8, %mm7
- por %mm7, \regl
- movq \regl, \regr
- psrlq $(8-\count), \regr
- .endm
-
- # ************* shift square **********
- # shifts a square in reg, fills with zeros
-
- # shift square top
- .macro sst count reg
- psllq $(\count<<3), \reg
- .endm
-
- # shift square bottom
- .macro ssb count reg
- psrlq $(\count<<3), \reg
- .endm
-
- # not tested
- # shift square left
- .macro ssl count reg
- movq \reg, %mm7
- pcmpeqb \reg, \reg
- psllw $(16-\count), \reg
- psrlw $8, \reg
- pandn %mm7, \reg
- psllw $(\count), \reg
- .endm
-
- # shift square right
- .macro ssr count reg
- movq \reg, %mm7
- pcmpeqb \reg, \reg
- psrlw $(16-\count), \reg
- psllw $8, \reg
- pandn %mm7, \reg
- psrlw $(\count), \reg
- .endm
-
-
- # ********** combine square *************
- # combines 2 squares
-
- # combine right
- .macro csr count regr reg
- ssl \count, \reg
- ssr (8-\count), \regr
- por \regr, \reg
- .endm
-
- # combine left
- .macro csl count regl reg
- ssr \count, \reg
- ssl (8-\count), \regl
- por \regl, \reg
- .endm
-
- # combine top
- .macro cst count regt reg
- ssb \count, \reg
- sst (8-\count), \regt
- por \regt, \reg
- .endm
-
-
- # combine bottom
- .macro csb count regb reg
- sst \count, \reg
- ssb (8-\count), \regb
- por \regb, \reg
- .endm
-
-
- # ********** load combine square *************
- # loads combined square using mask
-
- # load combined square left
- # mask should be count bits set right (i.e. 0x01)
- .macro lcsml count mask source sourcel dstreg
- movq \mask, \dstreg
- movq \mask, %mm7
- pandn \source, \dstreg
- pand \sourcel, %mm7
- psrlq $(\count), \dstreg
- psllq $(8-\count), %mm7
- por %mm7, \dstreg
- .endm
-
-
-
-.globl pixel_test_s1
-.type pixel_test_s1,@function
-
-# test routine for the 1 bit pixel macros above
-# void pixel_test_s1(void *dest, void *source, int nb_squares, int spacing)
-
-
-
- #
-
-
-pixel_test_s1:
- pushl %ebp
- movl %esp, %ebp
- push %esi
- push %edi
-
- movl 8(%ebp), %edi # dest
- movl 12(%ebp), %esi # source
- movl 16(%ebp), %ecx # count
- movl 20(%ebp), %edx # row distance
-
- ldmr 1, %mm6
- lcsml 1, %mm6, (%esi), 8(%esi), %mm0
- movq %mm0, (%edi)
-
-
-# movq (%esi), %mm0
-# movq 8(%esi), %mm1
-# csl 4, %mm1, %mm0
-# movq %mm0, (%edi)
-
- emms
-
-
- pop %edi
- pop %esi
- leave
- ret
-
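
The 1 bit routines operate on 8x8 tiles packed into one 64 bit word (low byte = bottom row, low bit = right column); the macros above build row and column masks and shift whole tiles without letting bits leak across row boundaries. A C sketch of that tile representation and two of the shift operations (hypothetical names):

#include <stdint.h>

/* shift the tile 'count' columns to the left, filling with zeros:
   clear the top bits of every row byte so nothing crosses into the
   next row, then shift */
static uint64_t tile_shift_left(uint64_t tile, int count)
{
    uint64_t row_mask = (0xFFULL >> count) * 0x0101010101010101ULL;
    return (tile & row_mask) << count;
}

/* shift the tile 'count' rows towards the top (high bytes), zero fill */
static uint64_t tile_shift_top(uint64_t tile, int count)
{
    return tile << (8 * count);
}
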
diff --git a/system/mmx/pixel_unpack_u8s16.s b/system/mmx/pixel_unpack_u8s16.s
deleted file mode 100644
index 0fc14c2..0000000
--- a/system/mmx/pixel_unpack_u8s16.s
+++ /dev/null
@@ -1,113 +0,0 @@
-# Pure Data Packet mmx routine.
-# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-.globl pixel_unpack_u8s16_y
-.type pixel_unpack_u8s16_y,@function
-
-# unpack unsigned 8 bit pixels to signed 16 bit, y channel
-# void pixel_unpack_u8s16_y(char *input, char *output, int32 nb_pixels_div8)
-
-pixel_unpack_u8s16_y:
- pushl %ebp
- movl %esp, %ebp
- push %esi
- push %edi
-
-# movl 20(%ebp), %edi # int16[4] array of gains
-# movq (%edi), %mm7 # get gain array
-
- movl 8(%ebp), %esi # input uint8 pixel array
- movl 12(%ebp), %edi # output sint16 pixel array
- movl 16(%ebp), %ecx # nb of elements div 8
-
-
- .align 16
- .loop_unpack_y:
-
- movq (%esi), %mm5 # load 8 pixels from memory
- pxor %mm0, %mm0 # zero mm0 - mm3
- pxor %mm1, %mm1
- punpcklbw %mm5, %mm0 # unpack 1st 4 pixels
-	punpckhbw %mm5, %mm1	# unpack 2nd 4 pixels
- psrlw $0x1, %mm0 # shift right to clear sign bit 9.7
- psrlw $0x1, %mm1
-# pmulhw %mm7, %mm0 # apply gain
-# pmulhw %mm7, %mm1
-# paddsw %mm0, %mm0 # correct factor 2
-# paddsw %mm1, %mm1
- movq %mm0, (%edi) # store
- movq %mm1, 8(%edi)
-
- addl $8, %esi # increment source pointer
- addl $16, %edi # increment dest pointer
- decl %ecx
- jnz .loop_unpack_y # loop
-
- emms
-
- pop %edi
- pop %esi
- leave
- ret
-
-.globl pixel_unpack_u8s16_uv
-.type pixel_unpack_u8s16_uv,@function
-pixel_unpack_u8s16_uv:
- pushl %ebp
- movl %esp, %ebp
- push %esi
- push %edi
-
-# movl 20(%ebp), %edi # int16[4] array of gains
-# movq (%edi), %mm7 # get gain array
-
- movl 8(%ebp), %esi # input uint8 pixel array
- movl 12(%ebp), %edi # output sint16 pixel array
- movl 16(%ebp), %ecx # nb of elements div 8
-
- pcmpeqw %mm6, %mm6
- psllw $15, %mm6
-
- .align 16
- .loop_unpack_uv:
-
- movq (%esi), %mm5 # load 8 pixels from memory
- pxor %mm0, %mm0 # zero mm0 - mm3
- pxor %mm1, %mm1
- punpcklbw %mm5, %mm0 # unpack 1st 4 pixels
-	punpckhbw %mm5, %mm1	# unpack 2nd 4 pixels
-	pxor %mm6, %mm0		# flip sign bit (Cr and Cb are offset by 128)
- pxor %mm6, %mm1
-# pmulhw %mm7, %mm0 # apply gain
-# pmulhw %mm7, %mm1
-# paddsw %mm0, %mm0 # correct factor 2
-# paddsw %mm1, %mm1
- movq %mm0, (%edi) # store
- movq %mm1, 8(%edi)
-
- addl $8, %esi # increment source pointer
- addl $16, %edi # increment dest pointer
- decl %ecx
- jnz .loop_unpack_uv # loop
-
- emms
-
- pop %edi
- pop %esi
- leave
- ret
-
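
The unpack routines are the inverse of the pack above: the y path puts the 8 bit luma in the high byte and shifts down one, giving pixel << 7 with the sign bit clear; the uv path puts the chroma byte in the high byte and flips the sign bit, removing the 128 offset. A scalar sketch (hypothetical names):

#include <stdint.h>

void unpack_y_ref(const uint8_t *in, int16_t *out, int nb_pixels)
{
    for (int i = 0; i < nb_pixels; i++)
        out[i] = (int16_t)(in[i] << 7);             /* 0 .. 0x7F80          */
}

void unpack_uv_ref(const uint8_t *in, int16_t *out, int nb_pixels)
{
    for (int i = 0; i < nb_pixels; i++)
        out[i] = (int16_t)((in[i] - 128) << 8);     /* remove chroma offset */
}
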