Diffstat (limited to 'system/mmx')
-rw-r--r--   system/mmx/Makefile                  29
-rw-r--r--   system/mmx/pdp_mmx_test.c            62
-rw-r--r--   system/mmx/pixel_add_s16.s           55
-rw-r--r--   system/mmx/pixel_affine_s16.s        59
-rw-r--r--   system/mmx/pixel_biquad_dirI_s16.s  361
-rw-r--r--   system/mmx/pixel_biquad_s16.s       451
-rw-r--r--   system/mmx/pixel_ca_s1.s            189
-rw-r--r--   system/mmx/pixel_cascade_s16.s      330
-rw-r--r--   system/mmx/pixel_conv_hor_s16.s     134
-rw-r--r--   system/mmx/pixel_conv_ver_s16.s     128
-rw-r--r--   system/mmx/pixel_crot_s16.s         153
-rw-r--r--   system/mmx/pixel_gain.s              83
-rw-r--r--   system/mmx/pixel_gain_s16.s          71
-rw-r--r--   system/mmx/pixel_mix_s16.s           68
-rw-r--r--   system/mmx/pixel_mul_s16.s           56
-rw-r--r--   system/mmx/pixel_pack_s16u8.s       126
-rw-r--r--   system/mmx/pixel_rand_s16.s          76
-rw-r--r--   system/mmx/pixel_randmix_s16.s       91
-rw-r--r--   system/mmx/pixel_s1.s               201
-rw-r--r--   system/mmx/pixel_unpack_u8s16.s     113
20 files changed, 2836 insertions, 0 deletions
diff --git a/system/mmx/Makefile b/system/mmx/Makefile
new file mode 100644
index 0000000..0f8f836
--- /dev/null
+++ b/system/mmx/Makefile
@@ -0,0 +1,29 @@
+include ../../Makefile.config
+
+OBJ = \
+pixel_pack_s16u8.o \
+pixel_unpack_u8s16.o \
+pixel_add_s16.o \
+pixel_mul_s16.o \
+pixel_mix_s16.o \
+pixel_randmix_s16.o \
+pixel_conv_hor_s16.o \
+pixel_conv_ver_s16.o \
+pixel_affine_s16.o \
+pixel_biquad_s16.o \
+pixel_ca_s1.o \
+pixel_rand_s16.o \
+pixel_crot_s16.o \
+pixel_gain_s16.o
+
+all: $(OBJ)
+
+test: pdp_mmx_test.o $(OBJ)
+	gcc -o pdp_mmx_test pdp_mmx_test.o $(OBJ) -g
+
+clean:
+	rm -f *.o
+	rm -f *~
+	rm -f pdp_mmx.a
+	rm -f pdp_mmx_test
+
diff --git a/system/mmx/pdp_mmx_test.c b/system/mmx/pdp_mmx_test.c
new file mode 100644
index 0000000..e93539f
--- /dev/null
+++ b/system/mmx/pdp_mmx_test.c
@@ -0,0 +1,62 @@
+#include <stdio.h>
+#include "pdp_mmx.h"
+
+#define FP(x) ((short int)(((float)(x) * 2 * 256.0f)))
+
+#define nbp 256
+
+short int a1[4] = {0x0100,0x0100,0x0100,0x0100};
+short int a2[4] = {0x0100,0x0100,0x0100,0x0100};
+short int b0[4] = {0x0100,0x0100,0x0100,0x0100};
+short int b1[4] = {0x0100,0x0100,0x0100,0x0100};
+short int b2[4] = {0x0100,0x0100,0x0100,0x0100};
+
+short int u1[4] = {0x0100,0x0100,0x0100,0x0100};
+short int u2[4] = {0x0100,0x0100,0x0100,0x0100};
+
+short int x0[4] = {0x0100,0x0100,0x0100,0x0100};
+short int x1[4] = {0x0100,0x0100,0x0100,0x0100};
+short int x2[4] = {0x0100,0x0100,0x0100,0x0100};
+short int x3[4] = {0x0100,0x0100,0x0100,0x0100};
+
+void print_pixel(unsigned int i)
+{
+    if (i) printf("x ");
+    else printf(". ");
+}
+
+void print_line(void)
+{
+    printf("\n");
+}
+
+void print_square(unsigned char *c)
+{
+    int i,j;
+
+    for(j=7; j>=0; j--){
+        for(i=0; i<8; i++) print_pixel(c[j] & (1<<(7-i)));
+        printf("\n");
+    }
+}
+
+int main(void)
+{
+    unsigned char src[16]={1,2,3,4,5,6,7,8,-1,-2,-3,-4,-5,-6,-7,-8};
+    unsigned char dst[8];
+
+    print_square(src);
+    print_line();
+    print_square(src+8);
+    print_line();
+
+    pixel_test_s1(dst,src,1,1);
+
+    print_square(dst);
+    print_line();
+
+    return 0;
+}
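The routines in this directory share a simple C calling convention: a pointer to a packed pixel buffer plus a vector count (4 signed 16 bit pixels per MMX vector). As a quick orientation, here is a minimal sketch of how one of them is declared and driven from C, using the prototype given in the signature comment of the .s file (the real pdp_mmx.h may declare these differently):

    /* prototype as commented in pixel_add_s16.s */
    void pixel_add_s16(int *left, int *right, int nb_4pixel_vectors);

    #define W 320
    #define H 240

    static short int plane_a[W * H], plane_b[W * H];  /* s16 pixel planes */

    static void add_planes(void)
    {
        /* each MMX vector holds 4 short ints, so pass the count divided by 4 */
        pixel_add_s16((int *)plane_a, (int *)plane_b, (W * H) / 4);
    }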
diff --git a/system/mmx/pixel_add_s16.s b/system/mmx/pixel_add_s16.s
new file mode 100644
index 0000000..8d4c7df
--- /dev/null
+++ b/system/mmx/pixel_add_s16.s
@@ -0,0 +1,55 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_add_s16
+.type pixel_add_s16,@function
+
+# simple add
+# void pixel_add_s16(int *left, int *right, int nb_4pixel_vectors)
+
+pixel_add_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %esi
+	push %edi
+
+	movl 8(%ebp), %edi	# left array
+	movl 12(%ebp), %esi	# right array
+	movl 16(%ebp), %ecx	# pixel count
+
+	.align 16
+	.loop_mix:
+
+#	prefetch 128(%esi)
+	movq (%esi), %mm1	# load right 4 pixels from memory
+	movq (%edi), %mm0	# load 4 left pixels from memory
+	paddsw %mm1, %mm0	# add with saturation
+	movq %mm0, (%edi)
+	addl $8, %esi
+	addl $8, %edi
+	decl %ecx
+	jnz .loop_mix		# loop
+
+	emms
+
+	pop %edi
+	pop %esi
+	leave
+	ret
+
diff --git a/system/mmx/pixel_affine_s16.s b/system/mmx/pixel_affine_s16.s
new file mode 100644
index 0000000..b357de3
--- /dev/null
+++ b/system/mmx/pixel_affine_s16.s
@@ -0,0 +1,59 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_affine_s16
+.type pixel_affine_s16,@function
+
+# void pixel_affine_s16(int *buf, int nb_8pixel_vectors, short int gain[4], short int offset[4])
+
+pixel_affine_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %esi
+	push %edi
+
+	movl 20(%ebp), %edi
+	movq (%edi), %mm6	# get offset vector
+
+	movl 16(%ebp), %edi
+	movq (%edi), %mm7	# get gain vector
+
+	movl 8(%ebp), %esi	# input array
+	movl 12(%ebp), %ecx	# pixel count
+
+	.align 16
+	.loop_affine:
+
+#	prefetch 128(%esi)
+	movq (%esi), %mm0	# load 4 pixels from memory
+	pmulhw %mm7, %mm0	# apply gain (s).15 fixed point
+	psllw $1, %mm0		# apply correction shift
+	paddsw %mm6, %mm0	# add offset
+	movq %mm0, (%esi)	# store result in memory
+
+	addl $8, %esi		# increment source pointer
+	decl %ecx
+	jnz .loop_affine	# loop
+
+	emms
+
+	pop %edi
+	pop %esi
+	leave
+	ret
+
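In scalar terms each 16 bit element in the affine loop is transformed as y = sat(2 * ((x * gain) >> 16) + offset): pmulhw keeps the high word of the signed product, psllw $1 undoes the halving inherent in the s.15 coefficient format, and paddsw adds the offset with saturation. A C model of one lane (a sketch; the helper names are not part of the code above):

    #include <stdint.h>

    static int16_t sat16(int32_t v)
    {
        return v > 32767 ? 32767 : (v < -32768 ? -32768 : (int16_t)v);
    }

    /* gain is s.15 fixed point (0x7fff ~ +1.0), offset is a plain s16 */
    static int16_t affine_one(int16_t x, int16_t gain, int16_t offset)
    {
        int16_t g = (int16_t)(((int32_t)x * gain) >> 16);  /* pmulhw */
        g <<= 1;                                           /* psllw $1 */
        return sat16((int32_t)g + offset);                 /* paddsw */
    }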
diff --git a/system/mmx/pixel_biquad_dirI_s16.s b/system/mmx/pixel_biquad_dirI_s16.s
new file mode 100644
index 0000000..1729502
--- /dev/null
+++ b/system/mmx/pixel_biquad_dirI_s16.s
@@ -0,0 +1,361 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+
+	# TODO: MOVE TO DIRECT FORM II
+	# y[k]  = b0 * x[k] + u1[k-1]
+	# u1[k] = b1 * x[k] + u2[k-1] - a1 * y[k]
+	# u2[k] = b2 * x[k] - a2 * y[k]
+
+	# input in register:
+	# %mm0-%mm3: input 4x4 pixels {x0 x1 x2 x3}
+	# %esi: coef memory (a1, a2, b0, b1, b2)
+	# %edi: state memory (u1, u2)
+
+	# return in register:
+	# %mm0-%mm3: 4x4 pixels result
+
+	.biquad_4x4_pixels:
+	.align 16
+	# prescale
+	movq -8(%esi), %mm4
+	pmulhw %mm4, %mm0
+	pmulhw %mm4, %mm1
+	pmulhw %mm4, %mm2
+	pmulhw %mm4, %mm3
+	psllw $1, %mm0
+	psllw $1, %mm1
+	psllw $1, %mm2
+	psllw $1, %mm3
+
+	# first vector
+	movq 0(%edi), %mm4	# mm4 <- u[-1]
+	movq 8(%edi), %mm5	# mm5 <- u[-2]
+	movq %mm4, %mm6
+	movq %mm5, %mm7
+
+	pmulhw 0(%esi), %mm6	# multiply by a1
+	pmulhw 8(%esi), %mm7	# multiply by a2
+
+	paddsw %mm6, %mm0	# accumulate
+	paddsw %mm7, %mm0	# accumulate
+	paddsw %mm0, %mm0	# scale by 2 (since all fixed point muls are x*y/2)
+
+	movq %mm0, %mm6		# mm6 <- u[0]
+	movq %mm4, %mm7		# mm7 <- u[-1]
+	pmulhw 16(%esi), %mm0	# multiply by b0
+	pmulhw 24(%esi), %mm4	# multiply by b1
+	pmulhw 32(%esi), %mm5	# multiply by b2
+
+	paddsw %mm4, %mm0	# accumulate
+	paddsw %mm5, %mm0	# accumulate
+
+	# mm0 is result 0
+
+	# second vector
+	movq %mm6, %mm4		# mm4 <- u[0]
+	movq %mm7, %mm5		# mm5 <- u[-1]
+
+	pmulhw 0(%esi), %mm6	# multiply by a1
+	pmulhw 8(%esi), %mm7	# multiply by a2
+
+	paddsw %mm6, %mm1	# accumulate
+	paddsw %mm7, %mm1	# accumulate
+	paddsw %mm1, %mm1	# scale by 2
+
+	movq %mm1, %mm6		# mm6 <- u[1]
+	movq %mm4, %mm7		# mm7 <- u[0]
+	pmulhw 16(%esi), %mm1	# multiply by b0
+	pmulhw 24(%esi), %mm4	# multiply by b1
+	pmulhw 32(%esi), %mm5	# multiply by b2
+
+	paddsw %mm4, %mm1	# accumulate
+	paddsw %mm5, %mm1	# accumulate
+
+	# mm1 is result 1
+
+	# third vector
+	movq %mm6, %mm4		# mm4 <- u[1]
+	movq %mm7, %mm5		# mm5 <- u[0]
+
+	pmulhw 0(%esi), %mm6	# multiply by a1
+	pmulhw 8(%esi), %mm7	# multiply by a2
+
+	paddsw %mm6, %mm2	# accumulate
+	paddsw %mm7, %mm2	# accumulate
+	paddsw %mm2, %mm2	# scale by 2
+
+	movq %mm2, %mm6		# mm6 <- u[2]
+	movq %mm4, %mm7		# mm7 <- u[1]
+	pmulhw 16(%esi), %mm2	# multiply by b0
+	pmulhw 24(%esi), %mm4	# multiply by b1
+	pmulhw 32(%esi), %mm5	# multiply by b2
+
+	paddsw %mm4, %mm2	# accumulate
+	paddsw %mm5, %mm2	# accumulate
+
+	# mm2 is result 2
+
+	# fourth vector
+	movq %mm6, %mm4		# mm4 <- u[2]
+	movq %mm7, %mm5		# mm5 <- u[1]
+
+	pmulhw 0(%esi), %mm6	# multiply by a1
+	pmulhw 8(%esi), %mm7	# multiply by a2
+
+	paddsw %mm6, %mm3	# accumulate
+	paddsw %mm7, %mm3	# accumulate
+	paddsw %mm3, %mm3	# scale by 2
+
+	movq %mm3, 0(%edi)	# store u[3]
+	movq %mm4, 8(%edi)	# store u[2]
+	pmulhw 16(%esi), %mm3	# multiply by b0
+	pmulhw 24(%esi), %mm4	# multiply by b1
+	pmulhw 32(%esi), %mm5	# multiply by b2
+
+	paddsw %mm4, %mm3	# accumulate
+	paddsw %mm5, %mm3	# accumulate
+
+	# mm3 is result 3
+
+	ret
+
+	# in order to use the 4 line parallel biquad routine on horizontal
+	# lines, we need to reorder (rotate or transpose) the matrix, since
+	# images are scanline encoded, and we want to work in parallel
+	# on 4 lines.
+	#
+	# since the 4 lines are independent, it doesn't matter in which order
+	# the vector elements are present.
+	#
+	# this allows us to use the same routine for left->right and right->left
+	# processing.
+	#
+	# some comments on the non-abelian group of square isometries consisting of
+	# (I)  identity
+	# (H)  horizontal axis mirror
+	# (V)  vertical axis mirror
+	# (T)  transpose (diagonal axis mirror)
+	# (A)  antitranspose (antidiagonal axis mirror)
+	# (R1) 90deg anticlockwise rotation
+	# (R2) 180deg rotation
+	# (R3) 90deg clockwise rotation
+	#
+	# we basically have two options: (R1,R3) or (T,A)
+	# we opt for T and A because they are self inverting, which improves locality
+	#
+	# use antitranspose for right to left and transpose
+	# for left to right (little endian)
+
+	# antitranspose 4x4
+
+	# input
+	# %mm3 == {d0 d1 d2 d3}
+	# %mm2 == {c0 c1 c2 c3}
+	# %mm1 == {b0 b1 b2 b3}
+	# %mm0 == {a0 a1 a2 a3}
+
+	# output
+	# %mm3 == {a3 b3 c3 d3}
+	# %mm2 == {a2 b2 c2 d2}
+	# %mm1 == {a1 b1 c1 d1}
+	# %mm0 == {a0 b0 c0 d0}
+
+	.antitranspose_4x4:
+	.align 16
+	movq %mm3, %mm4
+	punpcklwd %mm1, %mm4	# mm4 <- {b2 d2 b3 d3}
+	movq %mm3, %mm5
+	punpckhwd %mm1, %mm5	# mm5 <- {b0 d0 b1 d1}
+
+	movq %mm2, %mm6
+	punpcklwd %mm0, %mm6	# mm6 <- {a2 c2 a3 c3}
+	movq %mm2, %mm7
+	punpckhwd %mm0, %mm7	# mm7 <- {a0 c0 a1 c1}
+
+	movq %mm4, %mm3
+	punpcklwd %mm6, %mm3	# mm3 <- {a3 b3 c3 d3}
+	movq %mm4, %mm2
+	punpckhwd %mm6, %mm2	# mm2 <- {a2 b2 c2 d2}
+
+	movq %mm5, %mm1
+	punpcklwd %mm7, %mm1	# mm1 <- {a1 b1 c1 d1}
+	movq %mm5, %mm0
+	punpckhwd %mm7, %mm0	# mm0 <- {a0 b0 c0 d0}
+
+	ret
+
+	# transpose 4x4
+
+	# input
+	# %mm3 == {d3 d2 d1 d0}
+	# %mm2 == {c3 c2 c1 c0}
+	# %mm1 == {b3 b2 b1 b0}
+	# %mm0 == {a3 a2 a1 a0}
+
+	# output
+	# %mm3 == {d3 c3 b3 a3}
+	# %mm2 == {d2 c2 b2 a2}
+	# %mm1 == {d1 c1 b1 a1}
+	# %mm0 == {d0 c0 b0 a0}
+
+	.transpose_4x4:
+	.align 16
+	movq %mm0, %mm4
+	punpcklwd %mm2, %mm4	# mm4 <- {c1 a1 c0 a0}
+	movq %mm0, %mm5
+	punpckhwd %mm2, %mm5	# mm5 <- {c3 a3 c2 a2}
+
+	movq %mm1, %mm6
+	punpcklwd %mm3, %mm6	# mm6 <- {d1 b1 d0 b0}
+	movq %mm1, %mm7
+	punpckhwd %mm3, %mm7	# mm7 <- {d3 b3 d2 b2}
+
+	movq %mm4, %mm0
+	punpcklwd %mm6, %mm0	# mm0 <- {d0 c0 b0 a0}
+	movq %mm4, %mm1
+	punpckhwd %mm6, %mm1	# mm1 <- {d1 c1 b1 a1}
+
+	movq %mm5, %mm2
+	punpcklwd %mm7, %mm2	# mm2 <- {d2 c2 b2 a2}
+	movq %mm5, %mm3
+	punpckhwd %mm7, %mm3	# mm3 <- {d3 c3 b3 a3}
+
+	ret
+
+.globl pixel_biquad_vertb_s16
+.type pixel_biquad_vertb_s16,@function
+
+# pixel_biquad_vertb_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
+
+pixel_biquad_vertb_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %ebx
+	push %esi
+	push %edi
+
+	movl 8(%ebp), %ebx	# pixel array offset
+	movl 12(%ebp), %ecx	# nb of 4x4 pixblocks
+	movl 16(%ebp), %edx	# line width
+
+	movl 20(%ebp), %esi	# coefs
+	movl 24(%ebp), %edi	# state
+
+	shll $1, %edx		# short int addressing
+	movl %edx, %eax
+	shll $1, %eax
+	addl %edx, %eax		# eax = 3 * edx
+
+	.align 16
+	.biquad_vertb_line_loop:
+	movq (%ebx), %mm0
+	movq (%ebx,%edx,1), %mm1
+	movq (%ebx,%edx,2), %mm2
+	movq (%ebx,%eax,1), %mm3
+	call .biquad_4x4_pixels
+	movq %mm0, (%ebx)
+	movq %mm1, (%ebx,%edx,1)
+	movq %mm2, (%ebx,%edx,2)
+	movq %mm3, (%ebx,%eax,1)
+	addl %edx, %ebx
+	addl %eax, %ebx
+	decl %ecx
+	jnz .biquad_vertb_line_loop
+
+	emms
+
+	pop %edi
+	pop %esi
+	pop %ebx
+	leave
+	ret
+
+.globl pixel_biquad_horlr_s16
+.type pixel_biquad_horlr_s16,@function
+
+# pixel_biquad_horlr_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
+
+pixel_biquad_horlr_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %ebx
+	push %esi
+	push %edi
+
+	movl 8(%ebp), %ebx	# pixel array offset
+	movl 12(%ebp), %ecx	# nb of 4x4 pixblocks
+	movl 16(%ebp), %edx	# line width
+
+	movl 20(%ebp), %esi	# coefs
+	movl 24(%ebp), %edi	# state
+
+	shll $1, %edx		# short int addressing
+	movl %edx, %eax
+	shll $1, %eax
+	addl %edx, %eax		# eax = 3 * edx
+
+	.align 16
+	.biquad_horlr_line_loop:
+	movq (%ebx), %mm0
+	movq (%ebx,%edx,1), %mm1
+	movq (%ebx,%edx,2), %mm2
+	movq (%ebx,%eax,1), %mm3
+	call .transpose_4x4
+	call .biquad_4x4_pixels
+	call .transpose_4x4
+	movq %mm0, (%ebx)
+	movq %mm1, (%ebx,%edx,1)
+	movq %mm2, (%ebx,%edx,2)
+	movq %mm3, (%ebx,%eax,1)
+	addl $8, %ebx
+	decl %ecx
+	jnz .biquad_horlr_line_loop
+
+	emms
+
+	pop %edi
+	pop %esi
+	pop %ebx
+	leave
+	ret
+
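The direct form II recurrence that the TODO above refers to (and that pixel_biquad_s16.s below implements with s1.14 coefficients) is easiest to read in scalar floating point; this sketch ignores the fixed point scaling and saturation:

    /* one biquad section and its two state words */
    typedef struct { float b0, b1, b2, a1, a2, u1, u2; } biquad;

    static float biquad_df2(biquad *s, float x)
    {
        float y  = s->b0 * x + s->u1;              /* y[k]  = b0*x[k] + u1[k-1] */
        float u1 = s->b1 * x + s->u2 - s->a1 * y;  /* u1[k] = b1*x[k] + u2[k-1] - a1*y[k] */
        s->u2    = s->b2 * x - s->a2 * y;          /* u2[k] = b2*x[k] - a2*y[k] */
        s->u1    = u1;
        return y;
    }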
diff --git a/system/mmx/pixel_biquad_s16.s b/system/mmx/pixel_biquad_s16.s
new file mode 100644
index 0000000..844b041
--- /dev/null
+++ b/system/mmx/pixel_biquad_s16.s
@@ -0,0 +1,451 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+
+	# DIRECT FORM II BIQUAD
+	#
+	# y[k]  = b0 * x[k] + u1[k-1]
+	# u1[k] = b1 * x[k] + u2[k-1] - a1 * y[k]
+	# u2[k] = b2 * x[k] - a2 * y[k]
+
+	# MACRO: df2 <reg>
+	#
+	# computes a direct form 2 biquad
+	# does not use {mm0-mm3}\<reg>
+	#
+	# input:  <reg>  == input
+	#         %mm4   == state 1
+	#         %mm5   == state 2
+	#         (%esi) == biquad coefs (-a1 -a2 b0 b1 b2) in s1.14
+	# output: <reg>  == output
+	#         %mm4   == state 1
+	#         %mm5   == state 2
+
+	.macro df2 reg
+	movq \reg, %mm6		# mm6 == x[k]
+	movq \reg, %mm7		# mm7 == x[k]
+	pmulhw 16(%esi), %mm6	# mm6 == x[k] * b0
+	pmulhw 24(%esi), %mm7	# mm7 == x[k] * b1
+	paddw %mm4, %mm6	# mm6 == x[k] * b0 + u1[k-1] == y[k]
+	paddw %mm5, %mm7	# mm7 == x[k] * b1 + u2[k-1]
+	paddsw %mm6, %mm6	# compensate for mul = x*y/4 (coefs are s1.14 fixed point)
+	paddsw %mm6, %mm6	# paddsw ensures saturation
+	movq \reg, %mm5		# mm5 == x[k]
+	movq %mm6, %mm4		# mm4 == y[k]
+	movq %mm6, \reg		# reg == y[k] --------------------
+	pmulhw 0(%esi), %mm4	# mm4 == y[k] * (-a1)
+	pmulhw 8(%esi), %mm6	# mm6 == y[k] * (-a2)
+	pmulhw 32(%esi), %mm5	# mm5 == x[k] * b2
+	paddw %mm7, %mm4	# mm4 == u1[k] --------------------
+	paddw %mm6, %mm5	# mm5 == u2[k] --------------------
+	.endm
+
+	# input in register:
+	# %mm0-%mm3: input 4x4 pixels {x0 x1 x2 x3}
+	# %esi: coef memory (-a1, -a2, b0, b1, b2) in s1.14
+	# %edi: state memory (u1, u2)
+
+	# return in register:
+	# %mm0-%mm3: 4x4 pixels result
+
+	.macro biquad_4x4_pixels
+	.align 16
+	movq 0(%edi), %mm4	# get state
+	movq 8(%edi), %mm5
+	df2 %mm0		# compute 4 biquads
+	df2 %mm1
+	df2 %mm2
+	df2 %mm3
+	movq %mm4, 0(%edi)	# store state
+	movq %mm5, 8(%edi)
+	.endm
+
+	# in order to use the 4 line parallel biquad routine on horizontal
+	# lines, we need to reorder (rotate or transpose) the matrix, since
+	# images are scanline encoded, and we want to work in parallel
+	# on 4 lines.
+	#
+	# since the 4 lines are independent, it doesn't matter in which order
+	# the vector elements are present.
+	#
+	# this allows us to use the same routine for left->right and right->left
+	# processing.
+	#
+	# some comments on the non-abelian group of square isometries consisting of
+	# (I)  identity
+	# (H)  horizontal axis mirror
+	# (V)  vertical axis mirror
+	# (T)  transpose (diagonal axis mirror)
+	# (A)  antitranspose (antidiagonal axis mirror)
+	# (R1) 90deg anticlockwise rotation
+	# (R2) 180deg rotation
+	# (R3) 90deg clockwise rotation
+	#
+	# we basically have two options: (R1,R3) or (T,A)
+	# we opt for T and A because they are self inverting, which improves locality
+	#
+	# use antitranspose for right to left and transpose
+	# for left to right (little endian)
+
+	# antitranspose 4x4
+
+	# input
+	# %mm3 == {d0 d1 d2 d3}
+	# %mm2 == {c0 c1 c2 c3}
+	# %mm1 == {b0 b1 b2 b3}
+	# %mm0 == {a0 a1 a2 a3}
+
+	# output
+	# %mm3 == {a3 b3 c3 d3}
+	# %mm2 == {a2 b2 c2 d2}
+	# %mm1 == {a1 b1 c1 d1}
+	# %mm0 == {a0 b0 c0 d0}
+
+	.macro antitranspose_4x4
+	movq %mm3, %mm4
+	punpcklwd %mm1, %mm4	# mm4 <- {b2 d2 b3 d3}
+	movq %mm3, %mm5
+	punpckhwd %mm1, %mm5	# mm5 <- {b0 d0 b1 d1}
+
+	movq %mm2, %mm6
+	punpcklwd %mm0, %mm6	# mm6 <- {a2 c2 a3 c3}
+	movq %mm2, %mm7
+	punpckhwd %mm0, %mm7	# mm7 <- {a0 c0 a1 c1}
+
+	movq %mm4, %mm3
+	punpcklwd %mm6, %mm3	# mm3 <- {a3 b3 c3 d3}
+	movq %mm4, %mm2
+	punpckhwd %mm6, %mm2	# mm2 <- {a2 b2 c2 d2}
+
+	movq %mm5, %mm1
+	punpcklwd %mm7, %mm1	# mm1 <- {a1 b1 c1 d1}
+	movq %mm5, %mm0
+	punpckhwd %mm7, %mm0	# mm0 <- {a0 b0 c0 d0}
+	.endm
+
+	# transpose 4x4
+
+	# input
+	# %mm3 == {d3 d2 d1 d0}
+	# %mm2 == {c3 c2 c1 c0}
+	# %mm1 == {b3 b2 b1 b0}
+	# %mm0 == {a3 a2 a1 a0}
+
+	# output
+	# %mm3 == {d3 c3 b3 a3}
+	# %mm2 == {d2 c2 b2 a2}
+	# %mm1 == {d1 c1 b1 a1}
+	# %mm0 == {d0 c0 b0 a0}
+
+	.macro transpose_4x4
+	movq %mm0, %mm4
+	punpcklwd %mm2, %mm4	# mm4 <- {c1 a1 c0 a0}
+	movq %mm0, %mm5
+	punpckhwd %mm2, %mm5	# mm5 <- {c3 a3 c2 a2}
+
+	movq %mm1, %mm6
+	punpcklwd %mm3, %mm6	# mm6 <- {d1 b1 d0 b0}
+	movq %mm1, %mm7
+	punpckhwd %mm3, %mm7	# mm7 <- {d3 b3 d2 b2}
+
+	movq %mm4, %mm0
+	punpcklwd %mm6, %mm0	# mm0 <- {d0 c0 b0 a0}
+	movq %mm4, %mm1
+	punpckhwd %mm6, %mm1	# mm1 <- {d1 c1 b1 a1}
+
+	movq %mm5, %mm2
+	punpcklwd %mm7, %mm2	# mm2 <- {d2 c2 b2 a2}
+	movq %mm5, %mm3
+	punpckhwd %mm7, %mm3	# mm3 <- {d3 c3 b3 a3}
+	.endm
+
+.globl pixel_biquad_vertb_s16
+.type pixel_biquad_vertb_s16,@function
+
+# pixel_biquad_vertb_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
+
+pixel_biquad_vertb_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %ebx
+	push %esi
+	push %edi
+
+	movl 8(%ebp), %ebx	# pixel array offset
+	movl 12(%ebp), %ecx	# nb of 4x4 pixblocks
+	movl 16(%ebp), %edx	# line width
+
+	movl 20(%ebp), %esi	# coefs
+	movl 24(%ebp), %edi	# state
+
+	shll $1, %edx		# short int addressing
+	movl %edx, %eax
+	shll $1, %eax
+	addl %edx, %eax		# eax = 3 * edx
+
+	.align 16
+	.biquad_vertb_line_loop:
+	movq (%ebx), %mm0
+	movq (%ebx,%edx,1), %mm1
+	movq (%ebx,%edx,2), %mm2
+	movq (%ebx,%eax,1), %mm3
+	biquad_4x4_pixels
+	movq %mm0, (%ebx)
+	movq %mm1, (%ebx,%edx,1)
+	movq %mm2, (%ebx,%edx,2)
+	movq %mm3, (%ebx,%eax,1)
+	addl %edx, %ebx
+	addl %eax, %ebx
+	decl %ecx
+	jnz .biquad_vertb_line_loop
+
+	emms
+
+	pop %edi
+	pop %esi
+	pop %ebx
+	leave
+	ret
+
+.globl pixel_biquad_verbt_s16
+.type pixel_biquad_verbt_s16,@function
+
+# pixel_biquad_verbt_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
+
+pixel_biquad_verbt_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %ebx
+	push %esi
+	push %edi
+
+	movl 8(%ebp), %ebx	# pixel array offset
+	movl 12(%ebp), %ecx	# nb of 4x4 pixblocks
+	movl 16(%ebp), %eax	# line width
+
+	shll $3, %eax		# 4 line byte spacing
+	decl %ecx
+	mul %ecx
+	incl %ecx
+	addl %eax, %ebx		# ebx points to last pixblock
+
+	movl 16(%ebp), %edx	# line width
+
+	movl 20(%ebp), %esi	# coefs
+	movl 24(%ebp), %edi	# state
+
+	shll $1, %edx		# short int addressing
+	movl %edx, %eax
+	shll $1, %eax
+	addl %edx, %eax		# eax = 3 * edx
+
+	.align 16
+	.biquad_verbt_line_loop:
+	movq (%ebx), %mm3
+	movq (%ebx,%edx,1), %mm2
+	movq (%ebx,%edx,2), %mm1
+	movq (%ebx,%eax,1), %mm0
+	biquad_4x4_pixels
+	movq %mm3, (%ebx)
+	movq %mm2, (%ebx,%edx,1)
+	movq %mm1, (%ebx,%edx,2)
+	movq %mm0, (%ebx,%eax,1)
+	subl %edx, %ebx
+	subl %eax, %ebx
+	decl %ecx
+	jnz .biquad_verbt_line_loop
+
+	emms
+
+	pop %edi
+	pop %esi
+	pop %ebx
+	leave
+	ret
+
+.globl pixel_biquad_horlr_s16
+.type pixel_biquad_horlr_s16,@function
+
+# pixel_biquad_horlr_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
+
+pixel_biquad_horlr_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %ebx
+	push %esi
+	push %edi
+
+	movl 8(%ebp), %ebx	# pixel array offset
+	movl 12(%ebp), %ecx	# nb of 4x4 pixblocks
+	movl 16(%ebp), %edx	# line width
+
+	movl 20(%ebp), %esi	# coefs
+	movl 24(%ebp), %edi	# state
+
+	shll $1, %edx		# short int addressing
+	movl %edx, %eax
+	shll $1, %eax
+	addl %edx, %eax		# eax = 3 * edx
+
+	.align 16
+	.biquad_horlr_line_loop:
+	movq (%ebx), %mm0
+	movq (%ebx,%edx,1), %mm1
+	movq (%ebx,%edx,2), %mm2
+	movq (%ebx,%eax,1), %mm3
+	transpose_4x4
+	biquad_4x4_pixels
+	transpose_4x4
+	movq %mm0, (%ebx)
+	movq %mm1, (%ebx,%edx,1)
+	movq %mm2, (%ebx,%edx,2)
+	movq %mm3, (%ebx,%eax,1)
+	addl $8, %ebx
+	decl %ecx
+	jnz .biquad_horlr_line_loop
+
+	emms
+
+	pop %edi
+	pop %esi
+	pop %ebx
+	leave
+	ret
+
+.globl pixel_biquad_horrl_s16
+.type pixel_biquad_horrl_s16,@function
+
+# pixel_biquad_horrl_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
+
+pixel_biquad_horrl_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %ebx
+	push %esi
+	push %edi
+
+	movl 8(%ebp), %ebx	# pixel array offset
+	movl 12(%ebp), %ecx	# nb of 4x4 pixblocks
+	movl 16(%ebp), %edx	# line width
+
+	movl %ecx, %eax
+	decl %eax
+	shll $3, %eax
+	addl %eax, %ebx		# ebx points to last pixblock
+
+	movl 20(%ebp), %esi	# coefs
+	movl 24(%ebp), %edi	# state
+
+	shll $1, %edx		# short int addressing
+	movl %edx, %eax
+	shll $1, %eax
+	addl %edx, %eax		# eax = 3 * edx
+
+	.align 16
+	.biquad_horrl_line_loop:
+	movq (%ebx), %mm0
+	movq (%ebx,%edx,1), %mm1
+	movq (%ebx,%edx,2), %mm2
+	movq (%ebx,%eax,1), %mm3
+	antitranspose_4x4
+	biquad_4x4_pixels
+	antitranspose_4x4
+	movq %mm0, (%ebx)
+	movq %mm1, (%ebx,%edx,1)
+	movq %mm2, (%ebx,%edx,2)
+	movq %mm3, (%ebx,%eax,1)
+	subl $8, %ebx
+	decl %ecx
+	jnz .biquad_horrl_line_loop
+
+	emms
+
+	pop %edi
+	pop %esi
+	pop %ebx
+	leave
+	ret
+
+.globl pixel_biquad_time_s16
+.type pixel_biquad_time_s16,@function
+
+# pixel_biquad_time_s16(short int *pixel_array, short int *s1, short int *s2, short int *coefs, int nb_4_pix_vectors)
+
+pixel_biquad_time_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %ebx
+	push %esi
+	push %edi
+
+	movl 8(%ebp), %ebx	# pixel array offset
+	movl 12(%ebp), %edx	# state 1 array
+	movl 16(%ebp), %edi	# state 2 array
+
+	movl 20(%ebp), %esi	# coefs
+	movl 24(%ebp), %ecx	# nb of 4 pixel vectors
+
+	.align 16
+	.biquad_time_loop:
+	movq (%ebx), %mm0	# get input
+	movq (%edx), %mm4	# get state 1
+	movq (%edi), %mm5	# get state 2
+	df2 %mm0		# compute direct form 2
+	movq %mm0, (%ebx)	# write output
+	movq %mm5, (%edi)	# write state 2
+	movq %mm4, (%edx)	# write state 1
+	addl $8, %ebx
+	addl $8, %edi
+	addl $8, %edx
+	decl %ecx
+	jnz .biquad_time_loop
+
+	emms
+
+	pop %edi
+	pop %esi
+	pop %ebx
+	leave
+	ret
+
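The transpose and antitranspose macros used by these filter files do their work with two rounds of punpcklwd/punpckhwd interleaves, but the net effect on a 4x4 block of 16 bit pixels is an ordinary (anti)diagonal mirror. A plain C equivalent of the transpose, for reference:

    #include <stdint.h>

    /* m[row][col]: 4 image lines of 4 s16 pixels each */
    static void transpose_4x4_ref(int16_t m[4][4])
    {
        for (int r = 0; r < 4; r++)
            for (int c = r + 1; c < 4; c++) {
                int16_t t = m[r][c];
                m[r][c] = m[c][r];
                m[c][r] = t;
            }
    }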
diff --git a/system/mmx/pixel_ca_s1.s b/system/mmx/pixel_ca_s1.s
new file mode 100644
index 0000000..d9c730f
--- /dev/null
+++ b/system/mmx/pixel_ca_s1.s
@@ -0,0 +1,189 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+
+	# this file contains assembler routines for 2D 1 bit cellular automata
+	# processing. it is organized around a feeder kernel and a
+	# stack based bit processor (virtual forth machine)
+	#
+	# the feeder kernel is responsible for loading/storing CA cells
+	# from/to memory. data in memory is organized as a scanline
+	# encoded toroidal bitplane (lsb = left). to simplify the kernel, the top
+	# left corner of the rectangular grid of pixels will shift down
+	# every processing step.
+	#
+	# the stack machine has the following architecture:
+	# CA stack:   %esi, TOS: %mm0 (32x2 pixels, lsw = top row)
+	# CA horizon: %mm4-%mm7 (64x4 pixels, %mm4 = top row)
+	#
+	# the stack size / organization is not known to the stack machine.
+	# it can be thought of as operating on a 3x3 cell neighbourhood.
+	# the only purpose of the forth program is to determine the CA local update rule.
+	#
+	# the machine is supposed to be very minimal. no looping control.
+	# no addressing modes. no conditional code (hey, this is an experiment!)
+	# so recursion is not allowed (no way to stop it)
+	# there are 9 words to load the cell neighbourhood on the stack.
+	# the rest is just logic and stack manips.
+
+	# this file contains pure asm macros. it is to be included before assembly,
+	# after scaforth.pl has processed the .scaf file
+
+	# *************************** CA CELL ACCESS MACROS *****************************
+	# fetchTL - fetchBR
+
+	# shift / load rectangle macros:
+
+	# shift rectangle horizontal
+	# result is in reg1
+	.macro shift reg1 reg2 count
+	psllq $(32+\count), \reg1
+	psrlq $(32-\count), \reg2
+	psrlq $32, \reg1
+	psllq $32, \reg2
+	por \reg2, \reg1
+	.endm
+
+	.macro ldtop reg1 reg2
+	movq %mm4, \reg1
+	movq %mm5, \reg2
+	.endm
+
+	.macro ldcenter reg1 reg2
+	movq %mm5, \reg1
+	movq %mm6, \reg2
+	.endm
+
+	.macro ldbottom reg1 reg2
+	movq %mm6, \reg1
+	movq %mm7, \reg2
+	.endm
+
+	# fetch from top row
+
+	# fetch the top left square
+	.macro fetchTL
+	ldtop %mm0, %mm1
+	shift %mm0, %mm1, -1
+	.endm
+
+	# fetch the top mid square
+	.macro fetchTM
+	ldtop %mm0, %mm1
+	shift %mm0, %mm1, 0
+	.endm
+
+	# fetch the top right square
+	.macro fetchTR
+	ldtop %mm0, %mm1
+	shift %mm0, %mm1, 1
+	.endm
+
+	# fetch from center row
+
+	# fetch the mid left square
+	.macro fetchML
+	ldcenter %mm0, %mm1
+	shift %mm0, %mm1, -1
+	.endm
+
+	# fetch the mid mid square
+	.macro fetchMM
+	ldcenter %mm0, %mm1
+	shift %mm0, %mm1, 0
+	.endm
+
+	# fetch the mid right square
+	.macro fetchMR
+	ldcenter %mm0, %mm1
+	shift %mm0, %mm1, 1
+	.endm
+
+	# fetch from bottom row
+
+	# fetch the bottom left square
+	.macro fetchBL
+	ldbottom %mm0, %mm1
+	shift %mm0, %mm1, -1
+	.endm
+
+	# fetch the bottom mid square
+	.macro fetchBM
+	ldbottom %mm0, %mm1
+	shift %mm0, %mm1, 0
+	.endm
+
+	# fetch the bottom right square
+	.macro fetchBR
+	ldbottom %mm0, %mm1
+	shift %mm0, %mm1, 1
+	.endm
+
+	# *************************** CA STACK MANIP MACROS *****************************
+	# dup drop dropdup swap nip dropover
+
+	.macro dup
+	lea -8(%esi), %esi
+	movq %mm0, (%esi)
+	.endm
+
+	.macro drop
+	movq (%esi), %mm0
+	lea 8(%esi), %esi
+	.endm
+
+	.macro dropdup
+	movq (%esi), %mm0
+	.endm
+
+	.macro swap
+	movq (%esi), %mm1
+	movq %mm0, (%esi)
+	movq %mm1, %mm0
+	.endm
+
+	.macro nip
+	lea 8(%esi), %esi
+	.endm
+
+	.macro dropover
+	movq 8(%esi), %mm0
+	.endm
+
+	# *************************** CA BOOLEAN LOGIC MACROS *****************************
+	# overxor
+
+	.macro overxor
+	pxor (%esi), %mm0
+	.endm
+
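Each register in the CA horizon packs two 32 pixel rows into one 64 bit word (low word = top row), and the shift macro above carves a horizontally shifted 32x2 window out of two adjacent registers. A uint64_t transcription of what the four shifts and the por compute (a sketch of the bit manipulation only; register allocation is as in the macro):

    #include <stdint.h>

    /* reg1, reg2: two 32x2 pixel words; count: -1, 0 or +1 */
    static uint64_t ca_shift(uint64_t reg1, uint64_t reg2, int count)
    {
        reg1 = (reg1 << (32 + count)) >> 32;  /* psllq $(32+count); psrlq $32 */
        reg2 = (reg2 >> (32 - count)) << 32;  /* psrlq $(32-count); psllq $32 */
        return reg1 | reg2;                   /* por */
    }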
diff --git a/system/mmx/pixel_cascade_s16.s b/system/mmx/pixel_cascade_s16.s
new file mode 100644
index 0000000..bf88d08
--- /dev/null
+++ b/system/mmx/pixel_cascade_s16.s
@@ -0,0 +1,330 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+
+	# TODO: COUPLED CASCADE SECOND ORDER SECTION
+	#
+	# s1[k] = ar * s1[k-1] + ai * s2[k-1] + x[k]
+	# s2[k] = ar * s2[k-1] - ai * s1[k-1]
+	# y[k]  = c0 * x[k] + c1 * s1[k-1] + c2 * s2[k-1]
+
+	# MACRO: coupled
+	#
+	# computes a coupled cascade
+	#
+	# input:  %mm0   == input
+	#         %mm1   == state 1
+	#         %mm2   == state 2
+	#         (%esi) == cascade coefs (ar ai c0 c1 c2) in s0.15
+	# output: %mm0   == output
+	#         %mm1   == state 1
+	#         %mm2   == state 2
+
+	.macro coupled
+	movq %mm1, %mm3		# mm3 == s1[k-1]
+	movq %mm1, %mm4		# mm4 == s1[k-1]
+	movq %mm2, %mm5		# mm5 == s2[k-1]
+	movq %mm2, %mm6		# mm6 == s2[k-1]
+	pmulhw (%esi), %mm1	# mm1 == s1[k-1] * ar
+	pmulhw 8(%esi), %mm3	# mm3 == s1[k-1] * ai
+	pmulhw 24(%esi), %mm4	# mm4 == s1[k-1] * c1
+	pmulhw (%esi), %mm2	# mm2 == s2[k-1] * ar
+	pmulhw 8(%esi), %mm5	# mm5 == s2[k-1] * ai
+	pmulhw 32(%esi), %mm6	# mm6 == s2[k-1] * c2
+	paddw %mm5, %mm1	# mm1 == s1[k-1] * ar + s2[k-1] * ai
+	psubw %mm3, %mm2	# mm2 == s2[k-1] * ar - s1[k-1] * ai == s2[k]
+	paddw %mm0, %mm1	# mm1 == s1[k]
+	pmulhw 16(%esi), %mm0	# mm0 == x[k] * c0
+	paddw %mm6, %mm4	# mm4 == s1[k-1] * c1 + s2[k-1] * c2
+	paddw %mm4, %mm0	# mm0 == y[k]
+	.endm
+
+	# in order to use the 4 line parallel cascade routine on horizontal
+	# lines, we need to reorder (rotate or transpose) the matrix, since
+	# images are scanline encoded, and we want to work in parallel
+	# on 4 lines.
+	#
+	# since the 4 lines are independent, it doesn't matter in which order
+	# the vector elements are present.
+	#
+	# this allows us to use the same routine for left->right and right->left
+	# processing.
+	#
+	# some comments on the non-abelian group of square isometries consisting of
+	# (I)  identity
+	# (H)  horizontal axis mirror
+	# (V)  vertical axis mirror
+	# (T)  transpose (diagonal axis mirror)
+	# (A)  antitranspose (antidiagonal axis mirror)
+	# (R1) 90deg anticlockwise rotation
+	# (R2) 180deg rotation
+	# (R3) 90deg clockwise rotation
+	#
+	# we basically have two options: (R1,R3) or (T,A)
+	# we opt for T and A because they are self inverting, which improves locality
+	#
+	# use antitranspose for right to left and transpose
+	# for left to right (little endian)
+
+	# antitranspose 4x4
+
+	# input
+	# %mm3 == {d0 d1 d2 d3}
+	# %mm2 == {c0 c1 c2 c3}
+	# %mm1 == {b0 b1 b2 b3}
+	# %mm0 == {a0 a1 a2 a3}
+
+	# output
+	# %mm3 == {a3 b3 c3 d3}
+	# %mm2 == {a2 b2 c2 d2}
+	# %mm1 == {a1 b1 c1 d1}
+	# %mm0 == {a0 b0 c0 d0}
+
+	.macro antitranspose_4x4
+	movq %mm3, %mm4
+	punpcklwd %mm1, %mm4	# mm4 <- {b2 d2 b3 d3}
+	movq %mm3, %mm5
+	punpckhwd %mm1, %mm5	# mm5 <- {b0 d0 b1 d1}
+
+	movq %mm2, %mm6
+	punpcklwd %mm0, %mm6	# mm6 <- {a2 c2 a3 c3}
+	movq %mm2, %mm7
+	punpckhwd %mm0, %mm7	# mm7 <- {a0 c0 a1 c1}
+
+	movq %mm4, %mm3
+	punpcklwd %mm6, %mm3	# mm3 <- {a3 b3 c3 d3}
+	movq %mm4, %mm2
+	punpckhwd %mm6, %mm2	# mm2 <- {a2 b2 c2 d2}
+
+	movq %mm5, %mm1
+	punpcklwd %mm7, %mm1	# mm1 <- {a1 b1 c1 d1}
+	movq %mm5, %mm0
+	punpckhwd %mm7, %mm0	# mm0 <- {a0 b0 c0 d0}
+	.endm
+
+	# transpose 4x4
+
+	# input
+	# %mm3 == {d3 d2 d1 d0}
+	# %mm2 == {c3 c2 c1 c0}
+	# %mm1 == {b3 b2 b1 b0}
+	# %mm0 == {a3 a2 a1 a0}
+
+	# output
+	# %mm3 == {d3 c3 b3 a3}
+	# %mm2 == {d2 c2 b2 a2}
+	# %mm1 == {d1 c1 b1 a1}
+	# %mm0 == {d0 c0 b0 a0}
+
+	.macro transpose_4x4
+	movq %mm0, %mm4
+	punpcklwd %mm2, %mm4	# mm4 <- {c1 a1 c0 a0}
+	movq %mm0, %mm5
+	punpckhwd %mm2, %mm5	# mm5 <- {c3 a3 c2 a2}
+
+	movq %mm1, %mm6
+	punpcklwd %mm3, %mm6	# mm6 <- {d1 b1 d0 b0}
+	movq %mm1, %mm7
+	punpckhwd %mm3, %mm7	# mm7 <- {d3 b3 d2 b2}
+
+	movq %mm4, %mm0
+	punpcklwd %mm6, %mm0	# mm0 <- {d0 c0 b0 a0}
+	movq %mm4, %mm1
+	punpckhwd %mm6, %mm1	# mm1 <- {d1 c1 b1 a1}
+
+	movq %mm5, %mm2
+	punpcklwd %mm7, %mm2	# mm2 <- {d2 c2 b2 a2}
+	movq %mm5, %mm3
+	punpckhwd %mm7, %mm3	# mm3 <- {d3 c3 b3 a3}
+	.endm
+
+.globl pixel_cascade_vertb_s16
+.type pixel_cascade_vertb_s16,@function
+
+# pixel_cascade_vertb_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
+
+pixel_cascade_vertb_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %ebx
+	push %esi
+	push %edi
+
+	movl 8(%ebp), %ebx	# pixel array offset
+	movl 12(%ebp), %ecx	# nb of 4x4 pixblocks
+	movl 16(%ebp), %edx	# line width
+
+	movl 20(%ebp), %esi	# coefs
+	movl 24(%ebp), %edi	# state
+
+	shll $1, %edx		# short int addressing
+	subl %edx, %ebx
+
+	movq 0(%edi), %mm1	# s1[k-1]
+	movq 8(%edi), %mm2	# s2[k-1]
+	.align 16
+	.cascade_vertb_line_loop:
+
+	movq (%ebx,%edx,1), %mm3
+	movq %mm3, %mm0
+	addl %edx, %ebx
+	coupled
+	movq %mm0, (%ebx)
+
+	movq (%ebx,%edx,1), %mm3
+	movq %mm3, %mm0
+	addl %edx, %ebx
+	coupled
+	movq %mm0, (%ebx)
+
+	movq (%ebx,%edx,1), %mm3
+	movq %mm3, %mm0
+	addl %edx, %ebx
+	coupled
+	movq %mm0, (%ebx)
+
+	movq (%ebx,%edx,1), %mm3
+	movq %mm3, %mm0
+	addl %edx, %ebx
+	coupled
+	movq %mm0, (%ebx)
+
+	decl %ecx
+	jnz .cascade_vertb_line_loop
+
+	movq %mm1, 0(%edi)	# s1[k-1]
+	movq %mm2, 8(%edi)	# s2[k-1]
+
+	emms
+
+	pop %edi
+	pop %esi
+	pop %ebx
+	leave
+	ret
+
+.globl pixel_cascade_horlr_s16
+.type pixel_cascade_horlr_s16,@function
+
+# pixel_cascade_horlr_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
+
+pixel_cascade_horlr_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %ebx
+	push %esi
+	push %edi
+
+	movl 8(%ebp), %ebx	# pixel array offset
+	movl 12(%ebp), %ecx	# nb of 4x4 pixblocks
+	movl 16(%ebp), %edx	# line width
+
+	movl 20(%ebp), %esi	# coefs
+	movl 24(%ebp), %edi	# state
+
+	shll $1, %edx		# short int addressing
+	movl %edx, %eax
+	shll $1, %eax
+	addl %edx, %eax		# eax = 3 * edx
+
+	.align 16
+	.cascade_horlr_line_loop:
+	movq (%edi), %mm1
+	movq 8(%edi), %mm2
+
+	movq (%ebx), %mm0
+	movq (%ebx,%edx,1), %mm1
+	movq (%ebx,%edx,2), %mm2
+	movq (%ebx,%eax,1), %mm3
+
+	transpose_4x4
+
+	movq %mm1, (%ebx,%edx,1)
+	movq %mm2, (%ebx,%edx,2)
+	movq %mm3, (%ebx,%eax,1)
+
+	coupled
+
+	movq %mm0, (%ebx)
+	movq (%ebx,%edx,1), %mm3
+	movq %mm3, %mm0
+
+	coupled
+
+	movq %mm0, (%ebx,%edx,1)
+	movq (%ebx,%edx,2), %mm3
+	movq %mm3, %mm0
+
+	coupled
+
+	movq %mm0, (%ebx,%edx,2)
+	movq (%ebx,%eax,1), %mm3
+	movq %mm3, %mm0
+
+	coupled
+
+	movq %mm1, 0(%edi)	# s1[k-1]
+	movq %mm2, 8(%edi)	# s2[k-1]
+
+	movq %mm0, %mm3
+	movq (%ebx), %mm0
+	movq (%ebx,%edx,1), %mm1
+	movq (%ebx,%edx,2), %mm2
+
+	transpose_4x4
+
+	movq %mm0, (%ebx)
+	movq %mm1, (%ebx,%edx,1)
+	movq %mm2, (%ebx,%edx,2)
+	movq %mm3, (%ebx,%eax,1)
+
+	addl $8, %ebx
+	decl %ecx
+	jnz .cascade_horlr_line_loop
+
+	emms
+
+	pop %edi
+	pop %esi
+	pop %ebx
+	leave
+	ret
+
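The coupled form above rotates the two state words by the complex pole (ar + i*ai) each sample. A scalar floating point reference of the recurrence in the header comment (a sketch, ignoring the s0.15 scaling):

    /* coupled second order section: pole = ar + i*ai, output taps c0..c2 */
    typedef struct { float ar, ai, c0, c1, c2, s1, s2; } cascade;

    static float cascade_step(cascade *s, float x)
    {
        float s1 = s->ar * s->s1 + s->ai * s->s2 + x;          /* s1[k] */
        float s2 = s->ar * s->s2 - s->ai * s->s1;              /* s2[k] */
        float y  = s->c0 * x + s->c1 * s->s1 + s->c2 * s->s2;  /* y[k]  */
        s->s1 = s1;
        s->s2 = s2;
        return y;
    }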
diff --git a/system/mmx/pixel_conv_hor_s16.s b/system/mmx/pixel_conv_hor_s16.s
new file mode 100644
index 0000000..e90a692
--- /dev/null
+++ b/system/mmx/pixel_conv_hor_s16.s
@@ -0,0 +1,134 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+	# intermediate function
+
+	# input in register:
+	# %mm0: left 4 pixels
+	# %mm1: middle 4 pixels
+	# %mm2: right 4 pixels
+
+	# %mm5: left 4 pixel masks
+	# %mm6: middle 4 pixel masks
+	# %mm7: right 4 pixel masks
+
+	# return in register:
+	# %mm0: middle 4 pixels result
+
+	.conv_hor_4_pixels:
+	.align 16
+
+	# compute quadruplet
+
+	# get left pixels
+	psrlq $48, %mm0		# shift word 3 to word 0
+	movq %mm1, %mm4
+	psllq $16, %mm4		# shift words 0,1,2 to 1,2,3
+	por %mm4, %mm0		# combine
+	pmulhw %mm5, %mm0
+	psllw $1, %mm0
+
+	# get middle pixels
+	movq %mm1, %mm4
+	pmulhw %mm6, %mm4
+	psllw $1, %mm4
+	paddsw %mm4, %mm0
+
+	# get right pixels
+	movq %mm2, %mm3
+	psllq $48, %mm3		# shift word 0 to word 3
+	movq %mm1, %mm4
+	psrlq $16, %mm4		# shift words 1,2,3 to 0,1,2
+	por %mm4, %mm3		# combine
+	pmulhw %mm7, %mm3
+	psllw $1, %mm3
+	paddsw %mm3, %mm0	# accumulate
+
+	ret
+
+.globl pixel_conv_hor_s16
+.type pixel_conv_hor_s16,@function
+
+# pixel_conv_hor_s16(short int *pixel_array, int nb_4_pixel_vectors, short int border[4], short int mask[12])
+# horizontal pixel conv (e.g. 1/4 1/2 1/4)
+# NOT TESTED
+
+pixel_conv_hor_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %esi
+	push %edi
+
+	movl 8(%ebp), %esi	# pixel array offset
+	movl 12(%ebp), %ecx	# nb of 4 pixel vectors in a row (at least 3)
+
+	movl 20(%ebp), %edi	# mask vector
+	movq (%edi), %mm5
+	movq 8(%edi), %mm6
+	movq 16(%edi), %mm7
+
+	movl 16(%ebp), %edi	# boundary pixel vector
+
+	movq (%edi), %mm0	# init regs with left border vector
+	movq (%esi), %mm1
+	movq 8(%esi), %mm2
+
+	decl %ecx		# loop has 2 terminator stubs
+	decl %ecx		# todo: handle if ecx < 3
+
+	jmp .conv_line_loop
+
+	.align 16
+	.conv_line_loop:
+	call .conv_hor_4_pixels	# compute conv
+	movq %mm0, (%esi)	# store result
+	movq %mm1, %mm0		# mm0 <- prev (%esi)
+	movq %mm2, %mm1		# mm1 <- 8(%esi)
+	movq 16(%esi), %mm2	# mm2 <- 16(%esi)
+
+	addl $8, %esi		# increase pointer
+	decl %ecx
+	jnz .conv_line_loop
+
+	call .conv_hor_4_pixels	# compute conv
+	movq %mm0, (%esi)	# store result
+	movq %mm1, %mm0		# mm0 <- prev (%esi)
+	movq %mm2, %mm1		# mm1 <- 8(%esi)
+	movq (%edi), %mm2	# mm2 <- border
+
+	call .conv_hor_4_pixels	# compute last vector
+	movq %mm0, 8(%esi)	# store it
+
+	emms
+
+	pop %edi
+	pop %esi
+	leave
+	ret
+
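Per pixel the kernel above computes a 3 tap convolution, with left/middle/right weights taken from the three mask vectors and the border vector used beyond the row ends. A scalar model with float weights instead of the s.15 masks (a sketch, assuming all four words of each mask vector carry the same weight):

    /* dst[i] = wl*src[i-1] + wm*src[i] + wr*src[i+1] */
    static void conv_hor_ref(short *dst, const short *src, int n,
                             float wl, float wm, float wr, short border)
    {
        for (int i = 0; i < n; i++) {
            short l = (i > 0)     ? src[i - 1] : border;
            short r = (i < n - 1) ? src[i + 1] : border;
            dst[i] = (short)(wl * l + wm * src[i] + wr * r);
        }
    }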
diff --git a/system/mmx/pixel_conv_ver_s16.s b/system/mmx/pixel_conv_ver_s16.s
new file mode 100644
index 0000000..ae2456f
--- /dev/null
+++ b/system/mmx/pixel_conv_ver_s16.s
@@ -0,0 +1,128 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+# TODO: fix out of bounds access in conv_ver and conv_hor
+
+	# intermediate function
+
+	# input in register:
+	# %mm0: top 4 pixels
+	# %mm1: middle 4 pixels
+	# %mm2: bottom 4 pixels
+
+	# %mm5: top 4 pixel mask
+	# %mm6: middle 4 pixel mask
+	# %mm7: bottom 4 pixel mask
+
+	# return in register:
+	# %mm0: middle 4 pixels result
+
+	.conv_ver_4_pixels:
+	.align 16
+
+	# compute quadruplet
+
+	# get top pixel
+	pmulhw %mm5, %mm0
+	psllw $1, %mm0
+
+	# get middle pixel
+	movq %mm1, %mm4
+	pmulhw %mm6, %mm4
+	psllw $1, %mm4
+	paddsw %mm4, %mm0
+
+	# get bottom pixel
+	movq %mm2, %mm3
+	pmulhw %mm7, %mm3
+	psllw $1, %mm3
+	paddsw %mm3, %mm0
+
+	ret
+
+.globl pixel_conv_ver_s16
+.type pixel_conv_ver_s16,@function
+
+# pixel_conv_ver_s16(short int *pixel_array, int nb_4_pixel_vectors, int row_byte_size, short int border[4], short int mask[12])
+# vertical pixel conv (e.g. 1/4 1/2 1/4)
+# NOT TESTED
+
+pixel_conv_ver_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %esi
+	push %edi
+
+	movl 8(%ebp), %esi	# pixel array offset
+	movl 12(%ebp), %ecx	# nb of 4 pixel vectors in a column (at least 3)
+	movl 16(%ebp), %edx	# rowsize in bytes
+
+	movl 24(%ebp), %edi	# mask vector
+	movq (%edi), %mm5
+	movq 8(%edi), %mm6
+	movq 16(%edi), %mm7
+
+	movl 20(%ebp), %edi	# edge vector
+
+	shll $1, %edx
+	decl %ecx		# loop has a terminator stub
+	decl %ecx		# loop has another terminator stub
+
+	movq (%edi), %mm0	# init regs with top border vector
+	movq (%esi), %mm1
+	movq (%esi,%edx,1), %mm2
+	jmp .conv_line_loop
+
+	.align 16
+	.conv_line_loop:
+	call .conv_ver_4_pixels	# compute conv
+	movq %mm0, (%esi)	# store result
+	movq %mm1, %mm0		# mm0 <- prev (%esi)
+	movq %mm2, %mm1		# mm1 <- (%esi,%edx,1)
+	movq (%esi,%edx,2), %mm2	# mm2 <- (%esi,%edx,2)
+
+	addl %edx, %esi		# increase pointer
+	decl %ecx
+	jnz .conv_line_loop
+
+	call .conv_ver_4_pixels	# compute conv
+	movq %mm0, (%esi)	# store result
+	movq %mm1, %mm0		# mm0 <- prev (%esi)
+	movq %mm2, %mm1		# mm1 <- (%esi,%edx,1)
+	movq (%edi), %mm2	# clear invalid edge vector
+
+	addl %edx, %esi		# increase pointer
+	call .conv_ver_4_pixels	# compute last vector
+	movq %mm0, (%esi)	# store it
+
+	emms
+
+	pop %edi
+	pop %esi
+	leave
+	ret
+
diff --git a/system/mmx/pixel_crot_s16.s b/system/mmx/pixel_crot_s16.s
new file mode 100644
index 0000000..2427869
--- /dev/null
+++ b/system/mmx/pixel_crot_s16.s
@@ -0,0 +1,153 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_crot3d_s16
+.type pixel_crot3d_s16,@function
+
+# 3 dimensional colour space rotation
+# 3x3 matrix is column encoded, each coefficient is a 4x16 bit fixed point vector
+
+# void pixel_crot3d_s16(int *buf, int nb_4pixel_vectors_per_plane, short int *matrix)
+
+pixel_crot3d_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %esi
+	push %edi
+
+	movl 8(%ebp), %esi	# input array
+	movl 12(%ebp), %ecx	# pixel count
+	movl 16(%ebp), %edi	# rotation matrix
+	movl %ecx, %edx
+	shll $3, %edx		# %edx = plane spacing
+
+	.align 16
+	.loop_crot3d:
+
+	movq (%esi), %mm0	# get 1st component
+	movq (%esi,%edx,1), %mm6	# get 2nd component
+	movq (%esi,%edx,2), %mm7	# get 3rd component
+
+	movq %mm0, %mm1		# copy 1st component
+	movq %mm0, %mm2
+
+	pmulhw (%edi), %mm0	# mul first column
+	pmulhw 8(%edi), %mm1
+	pmulhw 16(%edi), %mm2
+
+	movq %mm6, %mm5		# copy 2nd component
+	movq %mm6, %mm3
+
+	pmulhw 24(%edi), %mm6	# mul second column
+	pmulhw 32(%edi), %mm5
+	pmulhw 40(%edi), %mm3
+
+	paddsw %mm6, %mm0	# accumulate
+	paddsw %mm5, %mm1
+	paddsw %mm3, %mm2
+
+	movq %mm7, %mm4		# copy 3rd component
+	movq %mm7, %mm6
+
+	pmulhw 48(%edi), %mm4	# mul third column
+	pmulhw 56(%edi), %mm6
+	pmulhw 64(%edi), %mm7
+
+	paddsw %mm4, %mm0	# accumulate
+	paddsw %mm6, %mm1
+	paddsw %mm7, %mm2
+
+	paddsw %mm0, %mm0	# double (fixed point normalization)
+	paddsw %mm1, %mm1
+	paddsw %mm2, %mm2
+
+	movq %mm0, (%esi)	# store
+	movq %mm1, (%esi,%edx,1)
+	movq %mm2, (%esi,%edx,2)
+
+	addl $8, %esi		# increment source pointer
+	decl %ecx
+	jnz .loop_crot3d	# loop
+
+	emms
+
+	pop %edi
+	pop %esi
+	leave
+	ret
+
+.globl pixel_crot2d_s16
+.type pixel_crot2d_s16,@function
+
+# 2 dimensional colour space rotation
+# 2x2 matrix is column encoded, each coefficient is a 4x16 bit fixed point vector
+
+# void pixel_crot2d_s16(int *buf, int nb_4pixel_vectors_per_plane, short int *matrix)
+
+pixel_crot2d_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %esi
+	push %edi
+
+	movl 8(%ebp), %esi	# input array
+	movl 12(%ebp), %ecx	# pixel count
+	movl 16(%ebp), %edi	# rotation matrix
+	movl %ecx, %edx
+	shll $3, %edx		# %edx = plane spacing
+
+	.align 16
+	.loop_crot2d:
+
+	movq (%esi), %mm0	# get 1st component
+	movq (%esi,%edx,1), %mm2	# get 2nd component
+
+	movq %mm0, %mm1		# copy 1st component
+	movq %mm2, %mm3		# copy 2nd component
+
+	pmulhw (%edi), %mm0	# mul first column
+	pmulhw 8(%edi), %mm1
+
+	pmulhw 16(%edi), %mm2	# mul second column
+	pmulhw 24(%edi), %mm3
+
+	paddsw %mm2, %mm0	# accumulate
+	paddsw %mm3, %mm1
+
+	paddsw %mm0, %mm0	# fixed point gain correction
+	paddsw %mm1, %mm1
+
+	movq %mm0, (%esi)	# store
+	movq %mm1, (%esi,%edx,1)
+
+	addl $8, %esi		# increment source pointer
+	decl %ecx
+	jnz .loop_crot2d	# loop
+
+	emms
+
+	pop %edi
+	pop %esi
+	leave
+	ret
+
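pixel_crot3d_s16 treats the three colour planes as a vector per pixel and multiplies by a 3x3 matrix stored column by column (each coefficient splatted across a 4x16 bit vector), doubling the result to undo the s.15 halving. Scalar float reference (a sketch):

    /* column-major m[9]: out = M * in, applied pixel by pixel on 3 planes */
    static void crot3d_ref(float *p0, float *p1, float *p2, int n, const float m[9])
    {
        for (int i = 0; i < n; i++) {
            float a = p0[i], b = p1[i], c = p2[i];
            p0[i] = m[0] * a + m[3] * b + m[6] * c;
            p1[i] = m[1] * a + m[4] * b + m[7] * c;
            p2[i] = m[2] * a + m[5] * b + m[8] * c;
        }
    }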
diff --git a/system/mmx/pixel_gain.s b/system/mmx/pixel_gain.s
new file mode 100644
index 0000000..5cd5057
--- /dev/null
+++ b/system/mmx/pixel_gain.s
@@ -0,0 +1,83 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_gain
+.type pixel_gain,@function
+
+# mmx rgba pixel gain
+# void pixel_gain(unsigned char *pixel_array, int nb_pixels, short int rgba_gain[4])
+# gains are 7.9 fixed point for rgba
+
+pixel_gain:
+	pushl %ebp
+	movl %esp, %ebp
+	push %esi
+	push %edi
+
+	movl 8(%ebp), %esi	# pixel array offset
+	movl 12(%ebp), %ecx	# nb of elements
+	movl 16(%ebp), %edi	# int16[4] array of gains
+
+	prefetch (%esi)
+
+	emms
+	sarl $2, %ecx		# process 4 pixels per loop iteration
+	jz .exit
+	movq (%edi), %mm7	# read gain array from memory
+	jmp .loop_gain
+
+	.align 16
+	.loop_gain:
+
+	prefetch 128(%esi)
+	movq (%esi), %mm5	# load pixel 1-2 from memory
+	movq 8(%esi), %mm6	# load pixel 3-4 from memory
+	pxor %mm0, %mm0		# zero mm0 - mm3
+	pxor %mm1, %mm1
+	pxor %mm2, %mm2
+	pxor %mm3, %mm3
+	punpcklbw %mm5, %mm0	# unpack 1st pixel into 8.8 bit ints
+	punpckhbw %mm5, %mm1	# unpack 2nd
+	punpcklbw %mm6, %mm2	# unpack 3rd
+	punpckhbw %mm6, %mm3	# unpack 4th
+	psrlw $0x1, %mm0	# shift right to clear sign bit: 9.7
+	psrlw $0x1, %mm1
+	psrlw $0x1, %mm2
+	psrlw $0x1, %mm3
+
+	pmulhw %mm7, %mm0	# multiply 1st pixel 9.7 * 7.9 -> 16.0
+	pmulhw %mm7, %mm1	# multiply 2nd
+	pmulhw %mm7, %mm2	# multiply 3rd
+	pmulhw %mm7, %mm3	# multiply 4th
+
+	packuswb %mm1, %mm0	# pack & saturate to 8bit vector
+	movq %mm0, (%esi)	# store result in memory
+	packuswb %mm3, %mm2	# pack & saturate to 8bit vector
+	movq %mm2, 8(%esi)	# store result in memory
+
+	addl $16, %esi		# increment source pointer
+	decl %ecx
+	jnz .loop_gain		# loop
+
+	.exit:
+	emms
+
+	pop %edi
+	pop %esi
+	leave
+	ret
+
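With the 7.9 gain convention (512 == unity) the byte pipeline above reduces to out = clamp(v * g >> 9): unpacking against zero yields v<<8, the sign clearing shift makes it v<<7 (the 9.7 format), and pmulhw's implicit >>16 leaves v*g>>9 before packuswb saturates back to bytes. One byte lane in C (a sketch):

    #include <stdint.h>

    /* g is 7.9 fixed point, so g == 512 means gain 1.0 */
    static uint8_t gain_u8(uint8_t v, int16_t g)
    {
        int32_t out = ((int32_t)v * g) >> 9;  /* ((v<<7) * g) >> 16 */
        return out > 255 ? 255 : (out < 0 ? 0 : (uint8_t)out);
    }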
diff --git a/system/mmx/pixel_gain_s16.s b/system/mmx/pixel_gain_s16.s
new file mode 100644
index 0000000..adcfdf5
--- /dev/null
+++ b/system/mmx/pixel_gain_s16.s
@@ -0,0 +1,71 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_gain_s16
+.type pixel_gain_s16,@function
+
+# gain is an integer, shift is a right (down) shift count
+# void pixel_gain_s16(int *buf, int nb_8pixel_vectors, short int gain[4], unsigned long long *shift)
+
+pixel_gain_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %esi
+	push %edi
+
+	movl 20(%ebp), %edi
+	movq (%edi), %mm6	# get shift vector
+
+	movl 16(%ebp), %edi
+	movq (%edi), %mm7	# get gain vector
+
+	movl 8(%ebp), %esi	# input array
+	movl 12(%ebp), %ecx	# pixel count
+
+	.align 16
+	.loop_gain:
+
+	movq (%esi), %mm0	# load 4 pixels from memory
+	movq %mm0, %mm1
+	pmulhw %mm7, %mm1	# apply gain (s15.0) fixed point, high word
+	pmullw %mm7, %mm0	# low word
+
+	movq %mm0, %mm2		# copy
+	movq %mm1, %mm3
+
+	punpcklwd %mm1, %mm0	# unpack lsw components
+	punpckhwd %mm3, %mm2	# unpack msw components
+
+	psrad %mm6, %mm0	# apply signed shift
+	psrad %mm6, %mm2
+
+	packssdw %mm2, %mm0	# pack result & saturate
+	movq %mm0, (%esi)	# store result
+
+	addl $8, %esi		# increment source pointer
+	decl %ecx
+	jnz .loop_gain		# loop
+
+	emms
+
+	pop %edi
+	pop %esi
+	leave
+	ret
+
diff --git a/system/mmx/pixel_mix_s16.s b/system/mmx/pixel_mix_s16.s
new file mode 100644
index 0000000..9bf41eb
--- /dev/null
+++ b/system/mmx/pixel_mix_s16.s
@@ -0,0 +1,68 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_mix_s16
+.type pixel_mix_s16,@function
+
+# mmx pixel mix (crossfade)
+# void pixel_mix_s16(int *left, int *right, int nb_4pixel_vectors,
+#                    short int gain_left[4], short int gain_right[4])
+
+pixel_mix_s16:
+	pushl %ebp
+	movl %esp, %ebp
+	push %esi
+	push %edi
+
+	movl 20(%ebp), %edi	# int16[4] array of gains
+	movq (%edi), %mm6	# get left gain array
+
+	movl 24(%ebp), %edi	# int16[4] array of gains
+	movq (%edi), %mm7	# get right gain array
+
+	movl 8(%ebp), %edi	# left array
+	movl 12(%ebp), %esi	# right array
+	movl 16(%ebp), %ecx	# pixel count
+
+	.align 16
+	.loop_mix:
+
+#	prefetch 128(%esi)
+	movq (%esi), %mm1	# load right 4 pixels from memory
+	pmulhw %mm7, %mm1	# apply right gain
+	movq (%edi), %mm0	# load 4 left pixels from memory
+	pmulhw %mm6, %mm0	# apply left gain
+#	pslaw $1, %mm1		# shift left ((s).15 x (s).15 -> (s0).14)
+#	pslaw $1, %mm0
+	paddsw %mm0, %mm0	# no arithmetic shift left, so use add instead
+	paddsw %mm1, %mm1
+	paddsw %mm1, %mm0	# mix
+	movq %mm0, (%edi)
+	addl $8, %esi
+	addl $8, %edi
+	decl %ecx
+	jnz .loop_mix		# loop
+
+	emms
+
+	pop %edi
+	pop %esi
+	leave
+	ret
+
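pixel_mix_s16 is a plain crossfade; since MMX has no saturating arithmetic left shift, each s.15 product is doubled with paddsw before the final saturating add. Per element (a sketch, folding the three saturations into one):

    /* gl, gr in s.15: y = sat(2*(l*gl >> 16) + 2*(r*gr >> 16)) */
    static short mix_one(short l, short r, short gl, short gr)
    {
        int y = 2 * (((int)l * gl) >> 16) + 2 * (((int)r * gr) >> 16);
        return y > 32767 ? 32767 : (y < -32768 ? -32768 : (short)y);
    }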
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +# +.globl pixel_mul_s16 +.type pixel_mul_s16,@function + +# simple add +# void pixel_mul_s16(int *left, int *right, int nb_4pixel_vectors) + +pixel_mul_s16: + pushl %ebp + movl %esp, %ebp + push %esi + push %edi + + movl 8(%ebp), %edi # left array + movl 12(%ebp), %esi # right array + movl 16(%ebp), %ecx # pixel count + + + .align 16 + .loop_mix: + +# prefetch 128(%esi) + movq (%esi), %mm1 # load right 4 pixels from memory + movq (%edi), %mm0 # load 4 left pixels from memory + pmulhw %mm1, %mm0 # mul + psllw $1, %mm0 # fixed point shift correction + movq %mm0, (%edi) + addl $8, %esi + addl $8, %edi + decl %ecx + jnz .loop_mix # loop + + emms + + + pop %edi + pop %esi + leave + ret + diff --git a/system/mmx/pixel_pack_s16u8.s b/system/mmx/pixel_pack_s16u8.s new file mode 100644 index 0000000..57df702 --- /dev/null +++ b/system/mmx/pixel_pack_s16u8.s @@ -0,0 +1,126 @@ +# Pure Data Packet mmx routine. +# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +# +.globl pixel_pack_s16u8_y +.type pixel_pack_s16u8_y,@function + +# mmx rgba pixel gain +# void pixel_pack_s16u8_y(int *input, int *output, int nb_8pixel_vectors) + +pixel_pack_s16u8_y: + pushl %ebp + movl %esp, %ebp + push %esi + push %edi + +# movl 20(%ebp), %edi # int16[4] array of gains +# movq (%edi), %mm7 # get gain array +# psllw $1, %mm7 # adjust for shifted sign bit + + movl 8(%ebp), %esi # input array + movl 12(%ebp), %edi # output array + movl 16(%ebp), %ecx # pixel count + + pxor %mm6, %mm6 + + .align 16 + .loop_pack_y: + +# prefetch 128(%esi) + movq (%esi), %mm0 # load 4 pixels from memory +# pmulhw %mm7, %mm0 # apply gain + movq 8(%esi), %mm1 # load 4 pixels from memory +# pmulhw %mm7, %mm1 # apply gain + +# movq %mm0, %mm2 +# pcmpgtw %mm6, %mm2 # mm2 > 0 ? 0xffff : 0 +# pand %mm2, %mm0 + +# movq %mm1, %mm3 +# pcmpgtw %mm6, %mm3 # mm3 > 0 ? 
diff --git a/system/mmx/pixel_pack_s16u8.s b/system/mmx/pixel_pack_s16u8.s
new file mode 100644
index 0000000..57df702
--- /dev/null
+++ b/system/mmx/pixel_pack_s16u8.s
@@ -0,0 +1,126 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_pack_s16u8_y
+.type pixel_pack_s16u8_y,@function
+
+# pack signed 16 bit to unsigned 8 bit (y channel)
+# void pixel_pack_s16u8_y(int *input, int *output, int nb_8pixel_vectors)
+
+pixel_pack_s16u8_y:
+        pushl %ebp
+        movl %esp, %ebp
+        push %esi
+        push %edi
+
+#       movl 20(%ebp), %edi      # int16[4] array of gains
+#       movq (%edi), %mm7        # get gain array
+#       psllw $1, %mm7           # adjust for shifted sign bit
+
+        movl 8(%ebp), %esi       # input array
+        movl 12(%ebp), %edi      # output array
+        movl 16(%ebp), %ecx      # vector count
+
+        pxor %mm6, %mm6
+
+        .align 16
+        .loop_pack_y:
+
+#       prefetch 128(%esi)
+        movq (%esi), %mm0        # load 4 pixels from memory
+#       pmulhw %mm7, %mm0        # apply gain
+        movq 8(%esi), %mm1       # load 4 pixels from memory
+#       pmulhw %mm7, %mm1        # apply gain
+
+#       movq %mm0, %mm2
+#       pcmpgtw %mm6, %mm2       # mm2 > 0 ? 0xffff : 0
+#       pand %mm2, %mm0
+
+#       movq %mm1, %mm3
+#       pcmpgtw %mm6, %mm3       # mm3 > 0 ? 0xffff : 0
+#       pand %mm3, %mm1
+
+#       psllw $1, %mm0           # shift out sign bit
+#       psllw $1, %mm1           # shift out sign bit
+
+        psraw $7, %mm0           # shift 9.7 fixed point down to 8 bit integer
+        psraw $7, %mm1
+
+        packuswb %mm1, %mm0      # pack & saturate to 8 bit vector
+        movq %mm0, (%edi)        # store result in memory
+
+        addl $16, %esi           # increment source pointer
+        addl $8, %edi            # increment dest pointer
+        decl %ecx
+        jnz .loop_pack_y         # loop
+
+        emms
+
+        pop %edi
+        pop %esi
+        leave
+        ret
+
+.globl pixel_pack_s16u8_uv
+.type pixel_pack_s16u8_uv,@function
+
+# pack signed 16 bit to unsigned 8 bit, adding a 128 offset (u/v channels)
+# void pixel_pack_s16u8_uv(int *input, int *output, int nb_8pixel_vectors)
+
+pixel_pack_s16u8_uv:
+        pushl %ebp
+        movl %esp, %ebp
+        push %esi
+        push %edi
+
+#       movl 20(%ebp), %edi      # int16[4] array of gains
+#       movq (%edi), %mm7        # get gain array
+        movl 8(%ebp), %esi       # input array
+        movl 12(%ebp), %edi      # output array
+        movl 16(%ebp), %ecx      # vector count
+
+        pcmpeqw %mm6, %mm6
+        psllw $15, %mm6
+        movq %mm6, %mm5
+        psrlw $8, %mm5
+        por %mm5, %mm6           # mm6 <- 8 times 0x80
+
+        .align 16
+        .loop_pack_uv:
+
+#       prefetch 128(%esi)
+        movq (%esi), %mm0        # load 4 pixels from memory
+#       pmulhw %mm7, %mm0        # apply gain
+        movq 8(%esi), %mm1       # load 4 pixels from memory
+#       pmulhw %mm7, %mm1        # apply gain
+
+        psraw $8, %mm0           # shift high byte down to low byte
+        psraw $8, %mm1
+
+        packsswb %mm1, %mm0      # pack & saturate to 8 bit vector
+        pxor %mm6, %mm0          # flip sign bits (add 128 offset)
+        movq %mm0, (%edi)        # store result in memory
+
+        addl $16, %esi           # increment source pointer
+        addl $8, %edi            # increment dest pointer
+        decl %ecx
+        jnz .loop_pack_uv        # loop
+
+        emms
+
+        pop %edi
+        pop %esi
+        leave
+        ret
+
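In C terms the two pack conversions amount to the following sketch (helper names hypothetical). Luma is stored as 9.7 fixed point (pixel value times 128), chroma as a zero-centered value that gets its 128 offset back on output:

    #include <stdint.h>

    static uint8_t pack_y(int16_t v)      /* psraw $7 + packuswb */
    {
        int32_t b = v >> 7;
        return (uint8_t)(b < 0 ? 0 : b > 255 ? 255 : b);
    }

    static uint8_t pack_uv(int16_t v)     /* psraw $8 + packsswb + pxor */
    {
        int32_t b = v >> 8;               /* already -128..127, so the
                                             packsswb saturation never fires */
        return (uint8_t)((b & 0xff) ^ 0x80);  /* flip sign bit = add 128 */
    }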
diff --git a/system/mmx/pixel_rand_s16.s b/system/mmx/pixel_rand_s16.s
new file mode 100644
index 0000000..649400b
--- /dev/null
+++ b/system/mmx/pixel_rand_s16.s
@@ -0,0 +1,76 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_rand_s16
+.type pixel_rand_s16,@function
+
+# 16 bit lfsr noise generator, 4 lanes in parallel
+# void pixel_rand_s16(int *dst, int nb_4pixel_vectors, short int random_seed[4])
+
+pixel_rand_s16:
+        pushl %ebp
+        movl %esp, %ebp
+        push %esi
+        push %edi
+
+        movl 16(%ebp), %esi      # int16[4] array of random seeds
+        movl 8(%ebp), %edi       # dst array
+        movl 12(%ebp), %ecx      # vector count
+
+        movq (%esi), %mm6
+
+
+        pcmpeqw %mm3, %mm3
+        psrlw $15, %mm3          # get bit mask, 4 times 0x0001
+
+        .align 16
+        .loop_rand:
+
+#       prefetch 128(%esi)
+
+
+        movq %mm6, %mm4          # get random vector
+        psrlw $15, %mm4          # get first tap (bit 15)
+        movq %mm6, %mm5
+        psrlw $14, %mm5          # get second tap (bit 14)
+        pxor %mm5, %mm4
+        movq %mm6, %mm5
+        psrlw $12, %mm5          # get third tap (bit 12)
+        pxor %mm5, %mm4
+        movq %mm6, %mm5
+        psrlw $3, %mm5           # get fourth tap (bit 3)
+        pxor %mm5, %mm4
+
+        psllw $1, %mm6           # shift left original random vector
+        pand %mm3, %mm4          # isolate new bit
+        por %mm4, %mm6           # combine into new random vector
+
+        movq %mm6, (%edi)
+        addl $8, %edi
+        decl %ecx
+        jnz .loop_rand           # loop
+
+
+        movq %mm6, (%esi)        # store random seeds
+
+        emms
+
+        pop %edi
+        pop %esi
+        leave
+        ret
+
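Each of the four 16 bit lanes runs an independent Fibonacci LFSR; the shift/xor sequence above taps bits 15, 14, 12 and 3. A one-lane sketch in C (hypothetical name):

    #include <stdint.h>

    /* one step of the 16 bit lfsr used above: the new bit is the xor
       of bits 15, 14, 12 and 3, shifted in from the right */
    static uint16_t lfsr_step(uint16_t s)
    {
        uint16_t bit = ((s >> 15) ^ (s >> 14) ^ (s >> 12) ^ (s >> 3)) & 1;
        return (uint16_t)((s << 1) | bit);
    }

Note that an all-zero seed is a fixed point of the recurrence, so the seed array must be nonzero to produce noise.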
diff --git a/system/mmx/pixel_randmix_s16.s b/system/mmx/pixel_randmix_s16.s
new file mode 100644
index 0000000..44e1702
--- /dev/null
+++ b/system/mmx/pixel_randmix_s16.s
@@ -0,0 +1,91 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_randmix_s16
+.type pixel_randmix_s16,@function
+
+# random mix: per component, select the left or right pixel by comparing a random vector against a threshold
+# void pixel_randmix_s16(int *left, int *right, int nb_4pixel_vectors, short int random_seed[4], short int threshold[4])
+
+pixel_randmix_s16:
+        pushl %ebp
+        movl %esp, %ebp
+        push %esi
+        push %edi
+
+        movl 20(%ebp), %edi      # int16[4] array of random seeds
+        movq (%edi), %mm6
+
+        movl 24(%ebp), %edi      # int16[4] array of thresholds
+        movq (%edi), %mm7
+
+        movl 8(%ebp), %edi       # left array
+        movl 12(%ebp), %esi      # right array
+        movl 16(%ebp), %ecx      # vector count
+
+        pcmpeqw %mm3, %mm3
+        psrlw $15, %mm3          # get bit mask, 4 times 0x0001
+
+        .align 16
+        .loop_randmix:
+
+#       prefetch 128(%esi)
+        movq (%esi), %mm1        # load right 4 pixels from memory
+        movq (%edi), %mm0        # load 4 left pixels from memory
+
+        movq %mm6, %mm2          # get random vector
+        pcmpgtw %mm7, %mm2       # compare random vector with threshold
+        movq %mm2, %mm5
+
+        pand %mm0, %mm2          # get left array's components
+        pandn %mm1, %mm5         # get right array's components
+        por %mm2, %mm5
+
+        movq %mm5, (%edi)        # store pixels
+
+        movq %mm6, %mm4          # get random vector
+        psrlw $15, %mm4          # get first tap (bit 15)
+        movq %mm6, %mm5
+        psrlw $14, %mm5          # get second tap (bit 14)
+        pxor %mm5, %mm4
+        movq %mm6, %mm5
+        psrlw $12, %mm5          # get third tap (bit 12)
+        pxor %mm5, %mm4
+        movq %mm6, %mm5
+        psrlw $3, %mm5           # get fourth tap (bit 3)
+        pxor %mm5, %mm4
+
+        psllw $1, %mm6           # shift left original random vector
+        pand %mm3, %mm4          # isolate new bit
+        por %mm4, %mm6           # combine into new random vector
+
+        addl $8, %esi
+        addl $8, %edi
+        decl %ecx
+        jnz .loop_randmix        # loop
+
+
+        movl 20(%ebp), %edi      # int16[4] array of random seeds
+        movq %mm6, (%edi)        # store random seeds
+
+        emms
+
+        pop %edi
+        pop %esi
+        leave
+        ret
+
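Per 16 bit component the random mix reduces to a compare-and-select; the pcmpgtw mask drives the pand/pandn/por sequence because MMX has no conditional move. Sketch (hypothetical name):

    #include <stdint.h>

    static int16_t randmix_component(int16_t left, int16_t right,
                                     int16_t rnd, int16_t threshold)
    {
        return (rnd > threshold) ? left : right;  /* pcmpgtw/pand/pandn/por */
    }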
diff --git a/system/mmx/pixel_s1.s b/system/mmx/pixel_s1.s
new file mode 100644
index 0000000..d6bc5ca
--- /dev/null
+++ b/system/mmx/pixel_s1.s
@@ -0,0 +1,201 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+
+        # this file contains ops for binary image processing
+        # images are encoded as 8x8 bit tiles
+        # low byte = bottom row
+        # low bit = right column
+        # %mm7 = scratch reg for all macros
+
+
+        # ************ load mask *******************
+        # compute bit masks for rows and columns
+        # %mm7: scratch reg
+
+        # load mask top
+        .macro ldmt count reg
+        pcmpeqb \reg, \reg
+        psllq $(64-(\count<<3)), \reg
+        .endm
+
+        # load mask bottom
+        .macro ldmb count reg
+        pcmpeqb \reg, \reg
+        psrlq $(64-(\count<<3)), \reg
+        .endm
+
+        # load mask top and bottom
+        .macro ldmtb count regt regb
+        ldmb \count, \regb
+        ldmt \count, \regt
+        .endm
+
+        # load mask right
+        .macro ldmr count reg
+        pcmpeqb %mm7, %mm7
+        psrlw $(16-\count), %mm7
+        movq %mm7, \reg
+        psllq $8, %mm7
+        por %mm7, \reg
+        .endm
+
+        # load mask left
+        .macro ldml count reg
+        pcmpeqb %mm7, %mm7
+        psllw $(16-\count), %mm7
+        movq %mm7, \reg
+        psrlq $8, %mm7
+        por %mm7, \reg
+        .endm
+
+        # load mask left and right
+        .macro ldmlr count regl regr
+        pcmpeqb %mm7, %mm7
+        psllw $(16-\count), %mm7
+        movq %mm7, \regl
+        psrlq $8, %mm7
+        por %mm7, \regl
+        movq \regl, \regr
+        psrlq $(8-\count), \regr
+        .endm
+
+        # ************* shift square **********
+        # shifts a square in reg, fills with zeros
+
+        # shift square top
+        .macro sst count reg
+        psllq $(\count<<3), \reg
+        .endm
+
+        # shift square bottom
+        .macro ssb count reg
+        psrlq $(\count<<3), \reg
+        .endm
+
+        # not tested
+        # shift square left
+        .macro ssl count reg
+        movq \reg, %mm7
+        pcmpeqb \reg, \reg
+        psllw $(16-\count), \reg
+        psrlw $8, \reg
+        pandn %mm7, \reg
+        psllw $(\count), \reg
+        .endm
+
+        # shift square right
+        .macro ssr count reg
+        movq \reg, %mm7
+        pcmpeqb \reg, \reg
+        psrlw $(16-\count), \reg
+        psllw $8, \reg
+        pandn %mm7, \reg
+        psrlw $(\count), \reg
+        .endm
+
+
+        # ********** combine square *************
+        # combines 2 squares
+
+        # combine right
+        .macro csr count regr reg
+        ssl \count, \reg
+        ssr (8-\count), \regr
+        por \regr, \reg
+        .endm
+
+        # combine left
+        .macro csl count regl reg
+        ssr \count, \reg
+        ssl (8-\count), \regl
+        por \regl, \reg
+        .endm
+
+        # combine top
+        .macro cst count regt reg
+        ssb \count, \reg
+        sst (8-\count), \regt
+        por \regt, \reg
+        .endm
+
+
+        # combine bottom
+        .macro csb count regb reg
+        sst \count, \reg
+        ssb (8-\count), \regb
+        por \regb, \reg
+        .endm
+
+
+        # ********** load combine square *************
+        # loads combined square using mask
+
+        # load combined square left
+        # mask should be count bits set right (i.e. 0x01)
+        .macro lcsml count mask source sourcel dstreg
+        movq \mask, \dstreg
+        movq \mask, %mm7
+        pandn \source, \dstreg
+        pand \sourcel, %mm7
+        psrlq $(\count), \dstreg
+        psllq $(8-\count), %mm7
+        por %mm7, \dstreg
+        .endm
+
+
+
+.globl pixel_test_s1
+.type pixel_test_s1,@function
+
+# test routine for the 8x8 bit square macros
+# void pixel_test_s1(void *dest, void *source, int nb_squares, int spacing)
+
+pixel_test_s1:
+        pushl %ebp
+        movl %esp, %ebp
+        push %esi
+        push %edi
+
+        movl 8(%ebp), %edi       # dest
+        movl 12(%ebp), %esi      # source
+        movl 16(%ebp), %ecx      # count
+        movl 20(%ebp), %edx      # row spacing (currently unused)
+
+        ldmr 1, %mm6
+        lcsml 1, %mm6, (%esi), 8(%esi), %mm0
+        movq %mm0, (%edi)
+
+
+#       movq (%esi), %mm0
+#       movq 8(%esi), %mm1
+#       csl 4, %mm1, %mm0
+#       movq %mm0, (%edi)
+
+        emms
+
+
+        pop %edi
+        pop %esi
+        leave
+        ret
+
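Assuming the tile encoding described at the top of this file (byte j = row j counted from the bottom, bit i = column i counted from the right), bit addressing and the row shifts can be modelled in C as follows (hypothetical names; a sketch only, since the column macros are marked not tested):

    #include <stdint.h>

    /* read one pixel from an 8x8 bit tile */
    static int tile_get(uint64_t tile, int row_from_bottom, int col_from_right)
    {
        return (int)((tile >> (8 * row_from_bottom + col_from_right)) & 1u);
    }

    /* sst/ssb with count = n: shift the square n rows toward the top
       or bottom, filling with zeros, via a whole-byte shift */
    static uint64_t tile_shift_up(uint64_t tile, int n)   { return tile << (8 * n); }
    static uint64_t tile_shift_down(uint64_t tile, int n) { return tile >> (8 * n); }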
diff --git a/system/mmx/pixel_unpack_u8s16.s b/system/mmx/pixel_unpack_u8s16.s
new file mode 100644
index 0000000..0fc14c2
--- /dev/null
+++ b/system/mmx/pixel_unpack_u8s16.s
@@ -0,0 +1,113 @@
+# Pure Data Packet mmx routine.
+# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+.globl pixel_unpack_u8s16_y
+.type pixel_unpack_u8s16_y,@function
+
+# unpack unsigned 8 bit to signed 16 bit (y channel)
+# void pixel_unpack_u8s16_y(char *input, char *output, int nb_pixels_div8)
+
+pixel_unpack_u8s16_y:
+        pushl %ebp
+        movl %esp, %ebp
+        push %esi
+        push %edi
+
+#       movl 20(%ebp), %edi      # int16[4] array of gains
+#       movq (%edi), %mm7        # get gain array
+
+        movl 8(%ebp), %esi       # input uint8 pixel array
+        movl 12(%ebp), %edi      # output sint16 pixel array
+        movl 16(%ebp), %ecx      # nb of elements div 8
+
+
+        .align 16
+        .loop_unpack_y:
+
+        movq (%esi), %mm5        # load 8 pixels from memory
+        pxor %mm0, %mm0          # zero mm0 and mm1
+        pxor %mm1, %mm1
+        punpcklbw %mm5, %mm0     # unpack 1st 4 pixels into high bytes
+        punpckhbw %mm5, %mm1     # unpack 2nd 4 pixels
+        psrlw $0x1, %mm0         # shift right to clear sign bit (9.7 fixed point)
+        psrlw $0x1, %mm1
+#       pmulhw %mm7, %mm0        # apply gain
+#       pmulhw %mm7, %mm1
+#       paddsw %mm0, %mm0        # correct factor 2
+#       paddsw %mm1, %mm1
+        movq %mm0, (%edi)        # store
+        movq %mm1, 8(%edi)
+
+        addl $8, %esi            # increment source pointer
+        addl $16, %edi           # increment dest pointer
+        decl %ecx
+        jnz .loop_unpack_y       # loop
+
+        emms
+
+        pop %edi
+        pop %esi
+        leave
+        ret
+
+.globl pixel_unpack_u8s16_uv
+.type pixel_unpack_u8s16_uv,@function
+
+# unpack unsigned 8 bit to signed 16 bit, removing the 128 offset (u/v channels)
+# void pixel_unpack_u8s16_uv(char *input, char *output, int nb_pixels_div8)
+
+pixel_unpack_u8s16_uv:
+        pushl %ebp
+        movl %esp, %ebp
+        push %esi
+        push %edi
+
+#       movl 20(%ebp), %edi      # int16[4] array of gains
+#       movq (%edi), %mm7        # get gain array
+
+        movl 8(%ebp), %esi       # input uint8 pixel array
+        movl 12(%ebp), %edi      # output sint16 pixel array
+        movl 16(%ebp), %ecx      # nb of elements div 8
+
+        pcmpeqw %mm6, %mm6
+        psllw $15, %mm6
+
+        .align 16
+        .loop_unpack_uv:
+
+        movq (%esi), %mm5        # load 8 pixels from memory
+        pxor %mm0, %mm0          # zero mm0 and mm1
+        pxor %mm1, %mm1
+        punpcklbw %mm5, %mm0     # unpack 1st 4 pixels into high bytes
+        punpckhbw %mm5, %mm1     # unpack 2nd 4 pixels
+        pxor %mm6, %mm0          # flip sign bit (Cr and Cb are offset by 128)
+        pxor %mm6, %mm1
+#       pmulhw %mm7, %mm0        # apply gain
+#       pmulhw %mm7, %mm1
+#       paddsw %mm0, %mm0        # correct factor 2
+#       paddsw %mm1, %mm1
+        movq %mm0, (%edi)        # store
+        movq %mm1, 8(%edi)
+
+        addl $8, %esi            # increment source pointer
+        addl $16, %edi           # increment dest pointer
+        decl %ecx
+        jnz .loop_unpack_uv      # loop
+
+        emms
+
+        pop %edi
+        pop %esi
+        leave
+        ret
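The unpack conversions are the inverses of the pack routines above; per pixel they amount to the following sketch (hypothetical names). Both place the 8 bit value in the high byte via punpck*bw against a zeroed register; luma then clears the sign bit, chroma flips it to remove the 128 offset:

    #include <stdint.h>

    static int16_t unpack_y(uint8_t v)   /* punpck + psrlw $1 */
    {
        return (int16_t)(((uint16_t)v << 8) >> 1);      /* 9.7: 0..32640 */
    }

    static int16_t unpack_uv(uint8_t v)  /* punpck + pxor sign flip */
    {
        return (int16_t)(((uint16_t)v << 8) ^ 0x8000);  /* -32768..32512 */
    }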