From 37b6643df2df7d784a31ca73f7bb90dc109c2401 Mon Sep 17 00:00:00 2001
From: Hans-Christoph Steiner
Date: Thu, 15 Dec 2005 07:26:47 +0000
Subject: removing PDP source (except debian files) before import of PDP 0.12.4

svn path=/trunk/externals/pdp/; revision=4217
---
 system/mmx/Makefile | 32 ---
 system/mmx/pdp_mmx_test.c | 62 -----
 system/mmx/pixel_add_s16.s | 55 -----
 system/mmx/pixel_affine_s16.s | 59 -----
 system/mmx/pixel_biquad_dirI_s16.s | 361 -----------------------------
 system/mmx/pixel_biquad_s16.s | 451 -------------------------------------
 system/mmx/pixel_ca_s1.s | 189 ----------------
 system/mmx/pixel_cascade_s16.s | 330 ---------------------------
 system/mmx/pixel_cheby_s16.s | 90 --------
 system/mmx/pixel_conv_hor_s16.s | 134 -----------
 system/mmx/pixel_conv_ver_s16.s | 128 -----------
 system/mmx/pixel_crot_s16.s | 153 -------------
 system/mmx/pixel_gain.s | 83 -------
 system/mmx/pixel_gain_s16.s | 71 ------
 system/mmx/pixel_mix_s16.s | 68 ------
 system/mmx/pixel_mul_s16.s | 56 -----
 system/mmx/pixel_pack_s16u8.s | 126 -----------
 system/mmx/pixel_rand_s16.s | 76 ------
 system/mmx/pixel_randmix_s16.s | 91 --------
 system/mmx/pixel_resample_s16.s | 314 --------------------------
 system/mmx/pixel_s1.s | 201 -----------------
 system/mmx/pixel_unpack_u8s16.s | 113 ----------
 22 files changed, 3243 deletions(-)
 delete mode 100644 system/mmx/Makefile
 delete mode 100644 system/mmx/pdp_mmx_test.c
 delete mode 100644 system/mmx/pixel_add_s16.s
 delete mode 100644 system/mmx/pixel_affine_s16.s
 delete mode 100644 system/mmx/pixel_biquad_dirI_s16.s
 delete mode 100644 system/mmx/pixel_biquad_s16.s
 delete mode 100644 system/mmx/pixel_ca_s1.s
 delete mode 100644 system/mmx/pixel_cascade_s16.s
 delete mode 100644 system/mmx/pixel_cheby_s16.s
 delete mode 100644 system/mmx/pixel_conv_hor_s16.s
 delete mode 100644 system/mmx/pixel_conv_ver_s16.s
 delete mode 100644 system/mmx/pixel_crot_s16.s
 delete mode 100644 system/mmx/pixel_gain.s
 delete mode 100644 system/mmx/pixel_gain_s16.s
 delete mode 100644 system/mmx/pixel_mix_s16.s
 delete mode 100644 system/mmx/pixel_mul_s16.s
 delete mode 100644 system/mmx/pixel_pack_s16u8.s
 delete mode 100644 system/mmx/pixel_rand_s16.s
 delete mode 100644 system/mmx/pixel_randmix_s16.s
 delete mode 100644 system/mmx/pixel_resample_s16.s
 delete mode 100644 system/mmx/pixel_s1.s
 delete mode 100644 system/mmx/pixel_unpack_u8s16.s

(limited to 'system/mmx')

diff --git a/system/mmx/Makefile b/system/mmx/Makefile
deleted file mode 100644
index 51e5052..0000000
--- a/system/mmx/Makefile
+++ /dev/null
@@ -1,32 +0,0 @@
-include ../../Makefile.config
-
-OBJ = \
-pixel_pack_s16u8.o \
-pixel_unpack_u8s16.o \
-pixel_add_s16.o \
-pixel_mul_s16.o \
-pixel_mix_s16.o \
-pixel_randmix_s16.o \
-pixel_conv_hor_s16.o \
-pixel_conv_ver_s16.o \
-pixel_affine_s16.o \
-pixel_biquad_s16.o \
-pixel_ca_s1.o \
-pixel_rand_s16.o \
-pixel_crot_s16.o \
-pixel_gain_s16.o \
-pixel_resample_s16.o \
-pixel_cheby_s16.o
-
-all: $(OBJ)
-
-test: pdp_mmx_test.o $(OBJ)
-	gcc -o pdp_mmx_test pdp_mmx_test.o $(OBJ) -g
-
-clean:
-	rm -f *.o
-	rm -f *~
-	rm -f pdp_mmx.a
-	rm -f pdp_mmx_test
-
-
diff --git a/system/mmx/pdp_mmx_test.c b/system/mmx/pdp_mmx_test.c
deleted file mode 100644
index e93539f..0000000
--- a/system/mmx/pdp_mmx_test.c
+++ /dev/null
@@ -1,62 +0,0 @@
-#include "pdp_mmx.h"
-#include <stdio.h>
-#define FP(x) ((short int)(((float)(x) * 2 * 256.0f)))
-
-#define nbp 256
-/* 4 x s16 test vectors, all 0x0100; none of them (nor FP/nbp) is used below */
- short int a1[4] = {0x0100,0x0100,0x0100,0x0100};
- short int a2[4] = {0x0100,0x0100,0x0100,0x0100};
- short int b0[4] = {0x0100,0x0100,0x0100,0x0100};
- short int b1[4] = {0x0100,0x0100,0x0100,0x0100};
- short int b2[4] = {0x0100,0x0100,0x0100,0x0100};
-
- short int u1[4] = {0x0100,0x0100,0x0100,0x0100};
- short int u2[4] = {0x0100,0x0100,0x0100,0x0100};
-
- short int x0[4] = {0x0100,0x0100,0x0100,0x0100};
- short int x1[4] = {0x0100,0x0100,0x0100,0x0100};
- short int x2[4] = {0x0100,0x0100,0x0100,0x0100};
- short int x3[4] = {0x0100,0x0100,0x0100,0x0100};
-/* print one cell: "x " when the tested bit is set (i != 0), ". " otherwise */
-void print_pixel(unsigned int i)
-{
-  if (i) printf("x ");
-  else printf(". ");
-}
-/* end the current block of output with a newline */
-void print_line(void)
-{
-  printf("\n");
-}
-/* dump c[0..7] as an 8x8 bitmap: c[7] is the first row, bit 7 the first column */
-void print_square(unsigned char *c)
-{
-  int i,j;
-
-  for(j=7; j>=0; j--){
-    for(i=0; i<8; i++) print_pixel(c[j] & (1<<(7-i)));
-    printf("\n");
-  }
-
-}
-/* show both 8-byte halves of src, call pixel_test_s1(dst,src,1,1), show dst */
-int main(void)
-{
-
-  unsigned char src[16]={1,2,3,4,5,6,7,8,-1,-2,-3,-4,-5,-6,-7,-8};
-  unsigned char dst[8];
-
-
-  print_square(src);
-  print_line();
-  print_square(src+8);
-  print_line();
-
-  pixel_test_s1(dst,src,1,1);
-
-  print_square(dst);
-  print_line();
-
-  return 0;
-
-}
diff --git a/system/mmx/pixel_add_s16.s b/system/mmx/pixel_add_s16.s
deleted file mode 100644
index 8d4c7df..0000000
--- a/system/mmx/pixel_add_s16.s
+++ /dev/null
@@ -1,55 +0,0 @@
-# Pure Data Packet mmx routine.
-# Copyright (c) by Tom Schouten
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-.globl pixel_add_s16
-.type pixel_add_s16,@function
-
-# in-place saturating add of s16 pixels: left[i] = sat(left[i] + right[i])
-# void pixel_add_s16(int *left, int *right, int nb_4pixel_vectors)
-# one vector = one 8-byte quadword = 4 pixels; a count <= 0 does nothing
-pixel_add_s16:
-        pushl %ebp
-        movl %esp, %ebp
-        push %esi               # %esi/%edi serve as pointers below and are
-        push %edi               # restored before returning
-
-        movl 8(%ebp), %edi      # left array (read and written)
-        movl 12(%ebp), %esi     # right array (read only)
-        movl 16(%ebp), %ecx     # vector count
-        testl %ecx, %ecx        # the loop tests its counter only after the
-        jle .add_done           # body, so skip it when there is no work
-        .align 16
-        .loop_mix:
-
-#       prefetch 128(%esi)
-        movq (%esi), %mm1       # load right 4 pixels from memory
-        movq (%edi), %mm0       # load 4 left pixels from memory
-        paddsw %mm1, %mm0       # mix (signed saturating word add)
-        movq %mm0, (%edi)       # store the sum back into the left array
-        addl $8, %esi           # advance both pointers by one vector
-        addl $8, %edi
-        decl %ecx
-        jnz .loop_mix           # loop
-        .add_done:
-        emms                    # clear MMX state before returning to C code
-
-
-        pop %edi
-        pop %esi
-        leave
-        ret
-
diff --git a/system/mmx/pixel_affine_s16.s b/system/mmx/pixel_affine_s16.s
deleted file mode 100644
index b357de3..0000000
--- a/system/mmx/pixel_affine_s16.s
+++ /dev/null
@@ -1,59 +0,0 @@
-# Pure Data Packet mmx routine.
-# Copyright (c) by Tom Schouten
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-.globl pixel_affine_s16
-.type pixel_affine_s16,@function
-
-# void pixel_affine_s16(int *buf, int nb_4pixel_vectors, short int gain[4], short int offset[4])
-# in place, per s16 pixel: buf = sat((hi16(buf * gain) << 1) + offset); a count <= 0 does nothing
-pixel_affine_s16:
-        pushl %ebp
-        movl %esp, %ebp
-        push %esi               # %esi/%edi serve as pointers below and are
-        push %edi               # restored before returning
-
-        movl 20(%ebp), %edi
-        movq (%edi), %mm6       # get offset vector
-
-        movl 16(%ebp), %edi
-        movq (%edi), %mm7       # get gain vector
-
-        movl 8(%ebp), %esi      # input array (processed in place)
-        movl 12(%ebp), %ecx     # vector count (8 bytes = 4 pixels per vector)
-        testl %ecx, %ecx        # the loop tests its counter only after the
-        jle .affine_done        # body, so skip it when there is no work
-        .align 16
-        .loop_affine:
-
-#       prefetch 128(%esi)
-        movq (%esi), %mm0       # load 4 pixels from memory
-        pmulhw %mm7, %mm0       # apply gain (s).15 fixed point
-        psllw $1, %mm0          # apply correction shift (pmulhw keeps bits 31..16)
-        paddsw %mm6, %mm0       # add offset (saturating)
-        movq %mm0, (%esi)       # store result in memory
-
-        addl $8, %esi           # increment source pointer
-        decl %ecx
-        jnz .loop_affine        # loop
-        .affine_done:
-        emms                    # clear MMX state before returning to C code
-
-        pop %edi
-        pop %esi
-        leave
-        ret
-
diff --git a/system/mmx/pixel_biquad_dirI_s16.s b/system/mmx/pixel_biquad_dirI_s16.s
deleted file mode 100644
index 1729502..0000000
--- a/system/mmx/pixel_biquad_dirI_s16.s
+++ /dev/null
@@ -1,361 +0,0 @@
-# Pure Data Packet mmx routine.
-# Copyright (c) by Tom Schouten
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-# - - - # TODO MOVE TO DIRECT FORM II - # y[k] = b0 * x[k] + u1[k-1] - # u1[k] = b1 * x[k] + u2[k-1] - a1 * y[k] - # u2[k] = b2 * x[k] - a2 * y[k] - - # input in register: - # %mm0-mm3: input 4x4 pixels {x0 x1 x2 x3} - # %esi: coef memory (a1, a2, b0, b1, b2) - # %edi: state memory (u1, u2) - - - # return in register: - # %mm0-mm4: 4x4 pixels result - - - .biquad_4x4_pixels: - .align 16 - # prescale - movq -8(%esi), %mm4 - pmulhw %mm4, %mm0 - pmulhw %mm4, %mm1 - pmulhw %mm4, %mm2 - pmulhw %mm4, %mm3 - psllw $1, %mm0 - psllw $1, %mm1 - psllw $1, %mm2 - psllw $1, %mm3 - - - # first vector - movq 0(%edi), %mm4 # mm4 <- u[-1] - movq 8(%edi), %mm5 # mm5 <- u[-2] - movq %mm4, %mm6 - movq %mm5, %mm7 - - pmulhw 0(%esi), %mm6 # multiply by a1 - pmulhw 8(%esi), %mm7 # multiply by a2 - - paddsw %mm6, %mm0 # accumulate - paddsw %mm7, %mm0 # accumulate - paddsw %mm0, %mm0 # scale by 2 (since all fixed point muls are x*y/2) - - movq %mm0, %mm6 # mm6 <- u[0] - movq %mm4, %mm7 # mm7 <- u[-1] - pmulhw 16(%esi), %mm0 # multiply by b0 - pmulhw 24(%esi), %mm4 # multiply by b1 - pmulhw 32(%esi), %mm5 # multiply by b2 - - paddsw %mm4, %mm0 # accumulate - paddsw %mm5, %mm0 # accumulate - - # mm0 is result 0 - - # second vector - movq %mm6, %mm4 # mm4 <- u[0] - movq %mm7, %mm5 # mm5 <- u[-1] - - pmulhw 0(%esi), %mm6 # multiply by a1 - pmulhw 8(%esi), %mm7 # multiply by a2 - - paddsw %mm6, %mm1 # accumulate - paddsw %mm7, %mm1 # accumulate - paddsw %mm1, %mm1 # scale by 2 - - - movq %mm1, %mm6 # mm6 <- u[1] - movq %mm4, %mm7 # mm7 <- u[0] - pmulhw 16(%esi), %mm1 # multiply by b0 - pmulhw 24(%esi), %mm4 # multiply by b1 - pmulhw 32(%esi), %mm5 # multiply by b2 - - paddsw %mm4, %mm1 # accumulate - paddsw %mm5, %mm1 # accumulate - - # mm1 is result 1 - - # third vector - movq %mm6, %mm4 # mm4 <- u[1] - movq %mm7, %mm5 # mm5 <- u[0] - - pmulhw 0(%esi), %mm6 # multiply by a1 - pmulhw 8(%esi), %mm7 # multiply by a2 - - paddsw %mm6, %mm2 # accumulate - paddsw %mm7, %mm2 # accumulate - paddsw 
%mm2, %mm2 # scale by 2 - - - movq %mm2, %mm6 # mm6 <- u[2] - movq %mm4, %mm7 # mm7 <- u[1] - pmulhw 16(%esi), %mm2 # multiply by b0 - pmulhw 24(%esi), %mm4 # multiply by b1 - pmulhw 32(%esi), %mm5 # multiply by b2 - - paddsw %mm4, %mm2 # accumulate - paddsw %mm5, %mm2 # accumulate - - # mm2 is result 2 - - # fourth vector - movq %mm6, %mm4 # mm4 <- u[2] - movq %mm7, %mm5 # mm5 <- u[1] - - pmulhw 0(%esi), %mm6 # multiply by a1 - pmulhw 8(%esi), %mm7 # multiply by a2 - - paddsw %mm6, %mm3 # accumulate - paddsw %mm7, %mm3 # accumulate - paddsw %mm3, %mm3 # scale by 2 - - - movq %mm3, 0(%edi) # store u[3] - movq %mm4, 8(%edi) # store u[2] - pmulhw 16(%esi), %mm3 # multiply by b0 - pmulhw 24(%esi), %mm4 # multiply by b1 - pmulhw 32(%esi), %mm5 # multiply by b2 - - paddsw %mm4, %mm3 # accumulate - paddsw %mm5, %mm3 # accumulate - - # mm3 is result 3 - - ret - - - # in order to use the 4 line parallel biquad routine on horizontal - # lines, we need to reorder (rotate or transpose) the matrix, since - # images are scanline encoded, and we want to work in parallell - # on 4 lines. - # - # since the 4 lines are independent, it doesnt matter in which order - # the the vector elements are present. - # - # this allows us to use the same routine for left->right and right->left - # processing. 
- # - # some comments on the non-abelean group of square isometries consisting of - # (I) identity - # (H) horizontal axis mirror - # (V) vertical axis mirror - # (T) transpose (diagonal axis mirror) - # (A) antitranspose (antidiagonal axis mirror) - # (R1) 90deg anticlockwize rotation - # (R2) 180deg rotation - # (R3) 90deg clockwize rotation - # - # - # we basicly have two options: (R1,R3) or (T,A) - # we opt for T and A because they are self inverting, which improves locality - # - # use antitranspose for right to left an transpose - # for left to right (little endian) - - - # antitranspose 4x4 - - # input - # %mm3 == {d0 d1 d2 d3} - # %mm2 == {c0 c1 c2 c3} - # %mm1 == {b0 b1 b2 b3} - # %mm0 == {a0 a1 a2 a3} - - # output - # %mm3 == {a3 b3 c3 d3} - # %mm2 == {a2 b2 c2 d2} - # %mm1 == {a1 b1 c1 d1} - # %mm0 == {a0 b0 c0 d0} - - - .antitranspose_4x4: - .align 16 - movq %mm3, %mm4 - punpcklwd %mm1, %mm4 # mm4 <- {b2 d2 b3 d3} - movq %mm3, %mm5 - punpckhwd %mm1, %mm5 # mm5 <- {b0 d0 b1 d1} - - movq %mm2, %mm6 - punpcklwd %mm0, %mm6 # mm6 <- {a2 c2 a3 c3} - movq %mm2, %mm7 - punpckhwd %mm0, %mm7 # mm7 <- {a0 c0 a1 c1} - - movq %mm4, %mm3 - punpcklwd %mm6, %mm3 # mm3 <- {a3 b3 c3 d3} - movq %mm4, %mm2 - punpckhwd %mm6, %mm2 # mm2 <- {a2 b2 c2 d2} - - movq %mm5, %mm1 - punpcklwd %mm7, %mm1 # mm1 <- {a1 b1 c1 d1} - movq %mm5, %mm0 - punpckhwd %mm7, %mm0 # mm0 <- {a0 b0 c0 d0} - - ret - - - - # transpose 4x4 - - # input - # %mm3 == {d3 d2 d1 d0} - # %mm2 == {c3 c2 c1 c0} - # %mm1 == {b3 b2 b1 b0} - # %mm0 == {a3 a2 a1 a0} - - # output - # %mm3 == {d3 c3 b3 a3} - # %mm2 == {d2 c2 b2 a2} - # %mm1 == {d1 c1 b1 a1} - # %mm0 == {d0 c0 b0 a0} - - - .transpose_4x4: - .align 16 - movq %mm0, %mm4 - punpcklwd %mm2, %mm4 # mm4 <- {c1 a1 c0 a0} - movq %mm0, %mm5 - punpckhwd %mm2, %mm5 # mm5 <- {c3 a3 c2 a2} - - movq %mm1, %mm6 - punpcklwd %mm3, %mm6 # mm6 <- {d1 b1 d0 b0} - movq %mm1, %mm7 - punpckhwd %mm3, %mm7 # mm7 <- {d3 b3 d2 b2} - - movq %mm4, %mm0 - punpcklwd %mm6, %mm0 # mm0 
<- {d0 c0 b0 a0} - movq %mm4, %mm1 - punpckhwd %mm6, %mm1 # mm1 <- {d1 c1 b1 a1} - - movq %mm5, %mm2 - punpcklwd %mm7, %mm2 # mm2 <- {d2 c2 b2 a2} - movq %mm5, %mm3 - punpckhwd %mm7, %mm3 # mm3 <- {d3 c3 b3 a3} - - ret - - -.globl pixel_biquad_vertb_s16 -.type pixel_biquad_vertb_s16,@function - - -# pixel_biquad_vertbr_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8]) - - -pixel_biquad_vertb_s16: - - - pushl %ebp - movl %esp, %ebp - push %ebx - push %esi - push %edi - - movl 8(%ebp), %ebx # pixel array offset - movl 12(%ebp), %ecx # nb of 4x4 pixblocks - movl 16(%ebp), %edx # line with - - movl 20(%ebp), %esi # coefs - movl 24(%ebp), %edi # state - - shll $1, %edx # short int addressing - movl %edx, %eax - shll $1, %eax - addl %edx, %eax # eax = 3 * edx - - .align 16 - .biquad_vertb_line_loop: - movq (%ebx), %mm0 - movq (%ebx,%edx,1), %mm1 - movq (%ebx,%edx,2), %mm2 - movq (%ebx,%eax,1), %mm3 - call .biquad_4x4_pixels - movq %mm0, (%ebx) - movq %mm1, (%ebx,%edx,1) - movq %mm2, (%ebx,%edx,2) - movq %mm3, (%ebx,%eax,1) - addl %edx, %ebx - addl %eax, %ebx - decl %ecx - jnz .biquad_vertb_line_loop - - emms - - pop %edi - pop %esi - pop %ebx - leave - ret - -.globl pixel_biquad_horlr_s16 -.type pixel_biquad_horlr_s16,@function - - -# pixel_biquad_hor_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8]) - - -pixel_biquad_horlr_s16: - - - pushl %ebp - movl %esp, %ebp - push %ebx - push %esi - push %edi - - movl 8(%ebp), %ebx # pixel array offset - movl 12(%ebp), %ecx # nb of 4x4 pixblocks - movl 16(%ebp), %edx # line with - - movl 20(%ebp), %esi # coefs - movl 24(%ebp), %edi # state - - shll $1, %edx # short int addressing - movl %edx, %eax - shll $1, %eax - addl %edx, %eax # eax = 3 * edx - - .align 16 - .biquad_horlr_line_loop: - movq (%ebx), %mm0 - movq (%ebx,%edx,1), %mm1 - movq (%ebx,%edx,2), %mm2 - movq (%ebx,%eax,1), %mm3 - call .transpose_4x4 - call .biquad_4x4_pixels - call 
.transpose_4x4 - movq %mm0, (%ebx) - movq %mm1, (%ebx,%edx,1) - movq %mm2, (%ebx,%edx,2) - movq %mm3, (%ebx,%eax,1) - addl $8, %ebx - decl %ecx - jnz .biquad_horlr_line_loop - - emms - - pop %edi - pop %esi - pop %ebx - leave - ret - - - diff --git a/system/mmx/pixel_biquad_s16.s b/system/mmx/pixel_biquad_s16.s deleted file mode 100644 index 844b041..0000000 --- a/system/mmx/pixel_biquad_s16.s +++ /dev/null @@ -1,451 +0,0 @@ -# Pure Data Packet mmx routine. -# Copyright (c) by Tom Schouten -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
-#
-
-
-        # DIRECT FORM II BIQUAD
-        #
-        # y[k] = b0 * x[k] + u1[k-1]
-        # u1[k] = b1 * x[k] + u2[k-1] - a1 * y[k]
-        # u2[k] = b2 * x[k] - a2 * y[k]
-        # MACRO: df2
-        #
-        # computes one step of a direct form 2 biquad on 4 s16 values at once
-        # besides reg it only touches %mm4-%mm7
-        #
-        # input: reg == input
-        #        %mm4 == state 1
-        #        %mm5 == state 2
-        #        (%esi) == biquad coefs (-a1 -a2 b0 b1 b2) in s1.14, one 8-byte vector each
-        # output: reg == output
-        #        %mm4 == state 1
-        #        %mm5 == state 2
-
-        .macro df2 reg
-        movq \reg, %mm6         # mm6 == x[k]
-        movq \reg, %mm7         # mm7 == x[k]
-        pmulhw 16(%esi), %mm6   # mm6 == x[k] * b0
-        pmulhw 24(%esi), %mm7   # mm7 == x[k] * b1
-        paddw %mm4, %mm6        # mm6 == x[k] * b0 + u1[k-1] == y[k]
-        paddw %mm5, %mm7        # mm7 == x[k] * b1 + u2[k-1]
-        paddsw %mm6, %mm6       # compensate for mul = x*y/4 (coefs are s1.14 fixed point)
-        paddsw %mm6, %mm6       # paddsw ensures saturation
-        movq \reg, %mm5         # mm5 == x[k]
-        movq %mm6, %mm4         # mm4 == y[k]
-        movq %mm6, \reg         # reg == y[k] --------------------
-        pmulhw 0(%esi), %mm4    # mm4 == y[k] * (-a1)
-        pmulhw 8(%esi), %mm6    # mm6 == y[k] * (-a2)
-        pmulhw 32(%esi), %mm5   # mm5 == x[k] * b2
-        paddw %mm7, %mm4        # mm4 == u1[k] --------------------
-        paddw %mm6, %mm5        # mm5 == u2[k] --------------------
-        .endm
-
-
-        # input in register:
-        # %mm0-mm3: input 4x4 pixels {x0 x1 x2 x3}
-        # %esi: coef memory (-a1, -a2, b0, b1, b2) in s1.14
-        # %edi: state memory (u1, u2)
-
-        # return in register:
-        # %mm0-mm3: 4x4 pixels result (%mm4-%mm7 are clobbered)
-
-
-
-
-        .macro biquad_4x4_pixels
-        .align 16
-        movq 0(%edi), %mm4      # get state (u1)
-        movq 8(%edi), %mm5      # get state (u2)
-        df2 %mm0                # compute 4 biquads, chaining the state through mm4/mm5
-        df2 %mm1
-        df2 %mm2
-        df2 %mm3
-        movq %mm4, 0(%edi)      # store state (u1)
-        movq %mm5, 8(%edi)      # store state (u2)
-        .endm
-
-
-
-        # in order to use the 4 line parallel biquad routine on horizontal
-        # lines, we need to reorder (rotate or transpose) the matrix, since
-        # images are scanline encoded, and we want to work in parallel
-        # on 4 lines.
-        #
-        # since the 4 lines are independent, it doesn't matter in which order
-        # the vector elements are present.
- # - # this allows us to use the same routine for left->right and right->left - # processing. - # - # some comments on the non-abelean group of square isometries consisting of - # (I) identity - # (H) horizontal axis mirror - # (V) vertical axis mirror - # (T) transpose (diagonal axis mirror) - # (A) antitranspose (antidiagonal axis mirror) - # (R1) 90deg anticlockwize rotation - # (R2) 180deg rotation - # (R3) 90deg clockwize rotation - # - # - # we basicly have two options: (R1,R3) or (T,A) - # we opt for T and A because they are self inverting, which improves locality - # - # use antitranspose for right to left an transpose - # for left to right (little endian) - - - # antitranspose 4x4 - - # input - # %mm3 == {d0 d1 d2 d3} - # %mm2 == {c0 c1 c2 c3} - # %mm1 == {b0 b1 b2 b3} - # %mm0 == {a0 a1 a2 a3} - - # output - # %mm3 == {a3 b3 c3 d3} - # %mm2 == {a2 b2 c2 d2} - # %mm1 == {a1 b1 c1 d1} - # %mm0 == {a0 b0 c0 d0} - - - .macro antitranspose_4x4: - movq %mm3, %mm4 - punpcklwd %mm1, %mm4 # mm4 <- {b2 d2 b3 d3} - movq %mm3, %mm5 - punpckhwd %mm1, %mm5 # mm5 <- {b0 d0 b1 d1} - - movq %mm2, %mm6 - punpcklwd %mm0, %mm6 # mm6 <- {a2 c2 a3 c3} - movq %mm2, %mm7 - punpckhwd %mm0, %mm7 # mm7 <- {a0 c0 a1 c1} - - movq %mm4, %mm3 - punpcklwd %mm6, %mm3 # mm3 <- {a3 b3 c3 d3} - movq %mm4, %mm2 - punpckhwd %mm6, %mm2 # mm2 <- {a2 b2 c2 d2} - - movq %mm5, %mm1 - punpcklwd %mm7, %mm1 # mm1 <- {a1 b1 c1 d1} - movq %mm5, %mm0 - punpckhwd %mm7, %mm0 # mm0 <- {a0 b0 c0 d0} - - .endm - - - # transpose 4x4 - - # input - # %mm3 == {d3 d2 d1 d0} - # %mm2 == {c3 c2 c1 c0} - # %mm1 == {b3 b2 b1 b0} - # %mm0 == {a3 a2 a1 a0} - - # output - # %mm3 == {d3 c3 b3 a3} - # %mm2 == {d2 c2 b2 a2} - # %mm1 == {d1 c1 b1 a1} - # %mm0 == {d0 c0 b0 a0} - - - .macro transpose_4x4: - movq %mm0, %mm4 - punpcklwd %mm2, %mm4 # mm4 <- {c1 a1 c0 a0} - movq %mm0, %mm5 - punpckhwd %mm2, %mm5 # mm5 <- {c3 a3 c2 a2} - - movq %mm1, %mm6 - punpcklwd %mm3, %mm6 # mm6 <- {d1 b1 d0 b0} - movq %mm1, %mm7 - punpckhwd 
%mm3, %mm7 # mm7 <- {d3 b3 d2 b2} - - movq %mm4, %mm0 - punpcklwd %mm6, %mm0 # mm0 <- {d0 c0 b0 a0} - movq %mm4, %mm1 - punpckhwd %mm6, %mm1 # mm1 <- {d1 c1 b1 a1} - - movq %mm5, %mm2 - punpcklwd %mm7, %mm2 # mm2 <- {d2 c2 b2 a2} - movq %mm5, %mm3 - punpckhwd %mm7, %mm3 # mm3 <- {d3 c3 b3 a3} - - .endm - -.globl pixel_biquad_vertb_s16 -.type pixel_biquad_vertb_s16,@function - - -# pixel_biquad_vertbr_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8]) - - -pixel_biquad_vertb_s16: - - - pushl %ebp - movl %esp, %ebp - push %ebx - push %esi - push %edi - - movl 8(%ebp), %ebx # pixel array offset - movl 12(%ebp), %ecx # nb of 4x4 pixblocks - movl 16(%ebp), %edx # line with - - movl 20(%ebp), %esi # coefs - movl 24(%ebp), %edi # state - - shll $1, %edx # short int addressing - movl %edx, %eax - shll $1, %eax - addl %edx, %eax # eax = 3 * edx - - .align 16 - .biquad_vertb_line_loop: - movq (%ebx), %mm0 - movq (%ebx,%edx,1), %mm1 - movq (%ebx,%edx,2), %mm2 - movq (%ebx,%eax,1), %mm3 - biquad_4x4_pixels - movq %mm0, (%ebx) - movq %mm1, (%ebx,%edx,1) - movq %mm2, (%ebx,%edx,2) - movq %mm3, (%ebx,%eax,1) - addl %edx, %ebx - addl %eax, %ebx - decl %ecx - jnz .biquad_vertb_line_loop - - emms - - pop %edi - pop %esi - pop %ebx - leave - ret -.globl pixel_biquad_verbt_s16 -.type pixel_biquad_verbt_s16,@function - - -# pixel_biquad_vertbt_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8]) - - -pixel_biquad_verbt_s16: - - - pushl %ebp - movl %esp, %ebp - push %ebx - push %esi - push %edi - - movl 8(%ebp), %ebx # pixel array offset - movl 12(%ebp), %ecx # nb of 4x4 pixblocks - movl 16(%ebp), %eax # line with - - shll $3, %eax # 4 line byte spacing - decl %ecx - mul %ecx - incl %ecx - addl %eax, %ebx # ebx points to last pixblock - - movl 16(%ebp), %edx # line with - - movl 20(%ebp), %esi # coefs - movl 24(%ebp), %edi # state - - shll $1, %edx # short int addressing - movl %edx, %eax - shll $1, %eax - 
addl %edx, %eax # eax = 3 * edx - - .align 16 - .biquad_verbt_line_loop: - movq (%ebx), %mm3 - movq (%ebx,%edx,1), %mm2 - movq (%ebx,%edx,2), %mm1 - movq (%ebx,%eax,1), %mm0 - biquad_4x4_pixels - movq %mm3, (%ebx) - movq %mm2, (%ebx,%edx,1) - movq %mm1, (%ebx,%edx,2) - movq %mm0, (%ebx,%eax,1) - subl %edx, %ebx - subl %eax, %ebx - decl %ecx - jnz .biquad_verbt_line_loop - - emms - - pop %edi - pop %esi - pop %ebx - leave - ret - -.globl pixel_biquad_horlr_s16 -.type pixel_biquad_horlr_s16,@function -# pixel_biquad_hor_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8]) - -pixel_biquad_horlr_s16: - - - pushl %ebp - movl %esp, %ebp - push %ebx - push %esi - push %edi - - movl 8(%ebp), %ebx # pixel array offset - movl 12(%ebp), %ecx # nb of 4x4 pixblocks - movl 16(%ebp), %edx # line with - - movl 20(%ebp), %esi # coefs - movl 24(%ebp), %edi # state - - shll $1, %edx # short int addressing - movl %edx, %eax - shll $1, %eax - addl %edx, %eax # eax = 3 * edx - - .align 16 - .biquad_horlr_line_loop: - movq (%ebx), %mm0 - movq (%ebx,%edx,1), %mm1 - movq (%ebx,%edx,2), %mm2 - movq (%ebx,%eax,1), %mm3 - transpose_4x4 - biquad_4x4_pixels - transpose_4x4 - movq %mm0, (%ebx) - movq %mm1, (%ebx,%edx,1) - movq %mm2, (%ebx,%edx,2) - movq %mm3, (%ebx,%eax,1) - addl $8, %ebx - decl %ecx - jnz .biquad_horlr_line_loop - - emms - - pop %edi - pop %esi - pop %ebx - leave - ret - - -.globl pixel_biquad_horrl_s16 -.type pixel_biquad_horrl_s16,@function -# pixel_biquad_horrl_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8]) - -pixel_biquad_horrl_s16: - - pushl %ebp - movl %esp, %ebp - push %ebx - push %esi - push %edi - - movl 8(%ebp), %ebx # pixel array offset - movl 12(%ebp), %ecx # nb of 4x4 pixblocks - movl 16(%ebp), %edx # line with - - - movl %ecx, %eax - decl %eax - shll $3, %eax - addl %eax, %ebx # ebx points to last pixblock - - - movl 20(%ebp), %esi # coefs - movl 24(%ebp), %edi # state - - shll $1, 
%edx # short int addressing - movl %edx, %eax - shll $1, %eax - addl %edx, %eax # eax = 3 * edx - - .align 16 - .biquad_horrl_line_loop: - movq (%ebx), %mm0 - movq (%ebx,%edx,1), %mm1 - movq (%ebx,%edx,2), %mm2 - movq (%ebx,%eax,1), %mm3 - antitranspose_4x4 - biquad_4x4_pixels - antitranspose_4x4 - movq %mm0, (%ebx) - movq %mm1, (%ebx,%edx,1) - movq %mm2, (%ebx,%edx,2) - movq %mm3, (%ebx,%eax,1) - subl $8, %ebx - decl %ecx - jnz .biquad_horrl_line_loop - - emms - - pop %edi - pop %esi - pop %ebx - leave - ret - - -.globl pixel_biquad_time_s16 -.type pixel_biquad_time_s16,@function -# pixel_biquad_time_s16(short int *pixel_array, short int *s1, short int *s2, short int *coefs, int nb_4_pix_vectors) - -pixel_biquad_time_s16: - - pushl %ebp - movl %esp, %ebp - push %ebx - push %esi - push %edi - - movl 8(%ebp), %ebx # pixel array offset - movl 12(%ebp), %edx # state 1 array - movl 16(%ebp), %edi # state 2 array - - movl 20(%ebp), %esi # coefs - movl 24(%ebp), %ecx # nb of 4 pixel vectors - - - .align 16 - .biquad_time_loop: - movq (%ebx), %mm0 # get input - movq (%edx), %mm4 # get state 1 - movq (%edi), %mm5 # get state 2 - df2 %mm0 # compute direct form 2 - movq %mm0, (%ebx) # write output - movq %mm5, (%edi) # write state 2 - movq %mm4, (%edx) # write state 1 - addl $8, %ebx - addl $8, %edi - addl $8, %edx - decl %ecx - jnz .biquad_time_loop - - emms - - pop %edi - pop %esi - pop %ebx - leave - ret - - diff --git a/system/mmx/pixel_ca_s1.s b/system/mmx/pixel_ca_s1.s deleted file mode 100644 index d9c730f..0000000 --- a/system/mmx/pixel_ca_s1.s +++ /dev/null @@ -1,189 +0,0 @@ -# Pure Data Packet mmx routine. -# Copyright (c) by Tom Schouten -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. 
-# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -# - - # this file contains assembler routines for 2D 1 bit cellular automata - # processing. it is organized around a feeder kernel and a - # stack based bit processor (virtual forth machine) - # - # the feeder kernel is responsable for loading/storing CA cells - # from/to memory. data in memory is organized as a scanline - # encoded toroidial bitplane (lsb = left). to simplify the kernel, the top - # left corner of the rectangular grid of pixels will shift down - # every processing step. - # - # the stack machine has the following architecture: - # CA stack: %esi, TOS: %mm0 (32x2 pixels. lsw = top row) - # CA horizon: %mm4-%mm7 (64x4 pixels. %mm4 = top row) - # - # the stack size / organization is not known to the stack machine. - # it can be thought of as operating on a 3x3 cell neightbourhood. - # the only purpose of forth program is to determine the CA local update rule. - # - # the machine is supposed to be very minimal. no looping control. - # no adressing modes. no conditional code (hey, this is an experiment!) - # so recursion is not allowed (no way to stop it) - # there are 9 words to load the cell neigbourhood on the stack. - # the rest is just logic and stack manips. - - - # this file contains pure asm macros. 
it is to be included before assembly - # after scaforth.pl has processed the .scaf file - - - # *************************** CA CELL ACCESS MACROS ***************************** - # fetchTL - fetchBR - - # shift / load rectangle macros: - - # shift rectangle horizontal - # result is in reg1 - .macro shift reg1 reg2 count - psllq $(32+\count), \reg1 - psrlq $(32-\count), \reg2 - psrlq $32, \reg1 - psllq $32, \reg2 - por \reg2, \reg1 - .endm - - .macro ldtop reg1 reg2 - movq %mm4, \reg1 - movq %mm5, \reg2 - .endm - - .macro ldcenter reg1 reg2 - movq %mm5, \reg1 - movq %mm6, \reg2 - .endm - - .macro ldbottom reg1 reg2 - movq %mm6, \reg1 - movq %mm7, \reg2 - .endm - - - # fetch from top row - - # fetch the top left square - .macro fetchTL - ldtop %mm0, %mm1 - shift %mm0, %mm1, -1 - .endm - - # fetch the top mid square - .macro fetchTM - ldtop %mm0, %mm1 - shift %mm0, %mm1, 0 - .endm - - # fetch the top right square - .macro fetchTR - ldtop %mm0, %mm1 - shift %mm0, %mm1, 1 - .endm - - - - # fetch from center row - - # fetch the mid left square - .macro fetchML - ldcenter %mm0, %mm1 - shift %mm0, %mm1, -1 - .endm - - # fetch the mid mid square - .macro fetchMM - ldcenter %mm0, %mm1 - shift %mm0, %mm1, 0 - .endm - - # fetch the mid right square - .macro fetchMR - ldcenter %mm0, %mm1 - shift %mm0, %mm1, 1 - .endm - - - - - - # fetch from bottom row - - # fetch the bottom left square - .macro fetchBL - ldbottom %mm0, %mm1 - shift %mm0, %mm1, -1 - .endm - - # fetch the bottom mid square - .macro fetchBM - ldbottom %mm0, %mm1 - shift %mm0, %mm1, 0 - .endm - - # fetch the bottom right square - .macro fetchBR - ldbottom %mm0, %mm1 - shift %mm0, %mm1, 1 - .endm - - - - # *************************** CA STACK MANIP MACROS ***************************** - # dup drop dropdup swap nip dropover - - .macro dup - lea -8(%esi), %esi - movq %mm0, (%esi) - .endm - - .macro drop - movq (%esi), %mm0 - lea 8(%esi), %esi - .endm - - .macro dropdup - movq (%esi), %mm0 - .endm - - .macro swap - 
movq (%esi), %mm1 - movq %mm0, (%esi) - movq %mm1, %mm0 - .endm - - .macro nip - lea 8(%esi), %esi - .endm - - .macro dropover - movq 8(%esi), %mm0 - .endm - - - # *************************** CA BOOLEAN LOGIC MACROS ***************************** - # overxor - - .macro overxor - pxor (%esi), %mm0 - .endm - - - - - diff --git a/system/mmx/pixel_cascade_s16.s b/system/mmx/pixel_cascade_s16.s deleted file mode 100644 index bf88d08..0000000 --- a/system/mmx/pixel_cascade_s16.s +++ /dev/null @@ -1,330 +0,0 @@ -# Pure Data Packet mmx routine. -# Copyright (c) by Tom Schouten -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
-#
-
-
-	# TODO: COUPLED CASCADE SECOND ORDER SECTION
-	#
-	# s1[k] = ar * s1[k-1] + ai * s2[k-1] + x[k]
-	# s2[k] = ar * s2[k-1] - ai * s1[k-1]
-	# y[k] = c0 * x[k] + c1 * s1[k-1] + c2 * s2[k-1]
-
-
-	# MACRO: coupled
-	#
-	# computes one step of a coupled cascade section on 4 parallel s16 lanes
-	#
-	# input: %mm0 == input
-	# %mm1 == state 1
-	# %mm2 == state 2
-	# (%esi) == cascade coefs (ar ai c0 c1 c2) in s0.15, one 8 byte vector each
-	# output: %mm0 == output
-	# %mm1 == state 1
-	# %mm2 == state 2
-	# clobbers: %mm3 %mm4 %mm5 %mm6
-
-	.macro coupled
-	movq %mm1, %mm3	# mm3 == s1[k-1] (was "pmovq", which is not an x86 instruction)
-	movq %mm1, %mm4	# mm4 == s1[k-1]
-	movq %mm2, %mm5	# mm5 == s2[k-1]
-	movq %mm2, %mm6	# mm6 == s2[k-1]
-	pmulhw (%esi), %mm1	# mm1 == s1[k-1] * ar
-	pmulhw 8(%esi), %mm3	# mm3 == s1[k-1] * ai
-	pmulhw 24(%esi), %mm4	# mm4 == s1[k-1] * c1
-	pmulhw (%esi), %mm2	# mm2 == s2[k-1] * ar
-	pmulhw 8(%esi), %mm5	# mm5 == s2[k-1] * ai
-	pmulhw 32(%esi), %mm6	# mm6 == s2[k-1] * c2
-	paddw %mm5, %mm1	# mm1 == s1[k-1] * ar + s2[k-1] * ai
-	psubw %mm3, %mm2	# mm2 == s2[k-1] * ar - s1[k-1] * ai == s2[k]
-	paddw %mm0, %mm1	# mm1 == s1[k]
-	pmulhw 16(%esi), %mm0	# mm0 == x[k] * c0
-	paddw %mm6, %mm4	# mm4 == s1[k-1] * c1 + s2[k-1] * c2
-	paddw %mm4, %mm0	# mm0 == y[k]
-	.endm
-
-
-
-
-	# in order to use the 4 line parallel cascade routine on horizontal
-	# lines, we need to reorder (rotate or transpose) the matrix, since
-	# images are scanline encoded, and we want to work in parallel
-	# on 4 lines.
-	#
-	# since the 4 lines are independent, it doesn't matter in which order
-	# the vector elements are present.
-	#
-	# this allows us to use the same routine for left->right and right->left
-	# processing.
-	#
-	# some comments on the non-abelian group of square isometries consisting of
-	# (I) identity
-	# (H) horizontal axis mirror
-	# (V) vertical axis mirror
-	# (T) transpose (diagonal axis mirror)
-	# (A) antitranspose (antidiagonal axis mirror)
-	# (R1) 90deg anticlockwise rotation
-	# (R2) 180deg rotation
-	# (R3) 90deg clockwise rotation
-	#
-	#
-	# we basically have two options: (R1,R3) or (T,A)
-	# we opt for T and A because they are self inverting, which improves locality
-	#
-	# use antitranspose for right to left and transpose
-	# for left to right (little endian)
-
-
-	# antitranspose 4x4 (clobbers %mm4 - %mm7)
-
-	# input
-	# %mm3 == {d0 d1 d2 d3}
-	# %mm2 == {c0 c1 c2 c3}
-	# %mm1 == {b0 b1 b2 b3}
-	# %mm0 == {a0 a1 a2 a3}
-
-	# output
-	# %mm3 == {a3 b3 c3 d3}
-	# %mm2 == {a2 b2 c2 d2}
-	# %mm1 == {a1 b1 c1 d1}
-	# %mm0 == {a0 b0 c0 d0}
-
-	# (the macro name must not carry a trailing colon: gas parses it as a formal parameter)
-	.macro antitranspose_4x4
-	movq %mm3, %mm4
-	punpcklwd %mm1, %mm4	# mm4 <- {b2 d2 b3 d3}
-	movq %mm3, %mm5
-	punpckhwd %mm1, %mm5	# mm5 <- {b0 d0 b1 d1}
-
-	movq %mm2, %mm6
-	punpcklwd %mm0, %mm6	# mm6 <- {a2 c2 a3 c3}
-	movq %mm2, %mm7
-	punpckhwd %mm0, %mm7	# mm7 <- {a0 c0 a1 c1}
-
-	movq %mm4, %mm3
-	punpcklwd %mm6, %mm3	# mm3 <- {a3 b3 c3 d3}
-	movq %mm4, %mm2
-	punpckhwd %mm6, %mm2	# mm2 <- {a2 b2 c2 d2}
-
-	movq %mm5, %mm1
-	punpcklwd %mm7, %mm1	# mm1 <- {a1 b1 c1 d1}
-	movq %mm5, %mm0
-	punpckhwd %mm7, %mm0	# mm0 <- {a0 b0 c0 d0}
-
-	.endm
-
-
-	# transpose 4x4 (clobbers %mm4 - %mm7)
-
-	# input
-	# %mm3 == {d3 d2 d1 d0}
-	# %mm2 == {c3 c2 c1 c0}
-	# %mm1 == {b3 b2 b1 b0}
-	# %mm0 == {a3 a2 a1 a0}
-
-	# output
-	# %mm3 == {d3 c3 b3 a3}
-	# %mm2 == {d2 c2 b2 a2}
-	# %mm1 == {d1 c1 b1 a1}
-	# %mm0 == {d0 c0 b0 a0}
-
-
-	.macro transpose_4x4
-	movq %mm0, %mm4
-	punpcklwd %mm2, %mm4	# mm4 <- {c1 a1 c0 a0}
-	movq %mm0, %mm5
-	punpckhwd %mm2, %mm5	# mm5 <- {c3 a3 c2 a2}
-
-	movq %mm1, %mm6
-	punpcklwd %mm3, %mm6	# mm6 <- {d1 b1 d0 b0}
-	movq %mm1, %mm7
-	punpckhwd %mm3, %mm7	# mm7 <- {d3 b3 d2 b2}
-
-	movq %mm4, %mm0
-	punpcklwd %mm6, %mm0	# mm0 <- {d0 c0 b0 a0}
-	movq %mm4, %mm1
-	punpckhwd %mm6, %mm1	# mm1 <- {d1 c1 b1 a1}
-
-	movq %mm5, %mm2
-	punpcklwd %mm7, %mm2	# mm2 <- {d2 c2 b2 a2}
-	movq %mm5, %mm3
-	punpckhwd %mm7, %mm3	# mm3 <- {d3 c3 b3 a3}
-
-	.endm
-
-.globl pixel_cascade_vertb_s16
-.type pixel_cascade_vertb_s16,@function
-
-
-# pixel_cascade_vertb_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
-# runs the coupled section down one column of 4 pixels, 4 rows per loop iteration, in place
-
-pixel_cascade_vertb_s16:
-
-
-	pushl %ebp
-	movl %esp, %ebp
-	push %ebx
-	push %esi
-	push %edi
-
-	movl 8(%ebp), %ebx	# pixel array offset
-	movl 12(%ebp), %ecx	# nb of 4x4 pixblocks
-	movl 16(%ebp), %edx	# line width
-
-	movl 20(%ebp), %esi	# coefs
-	movl 24(%ebp), %edi	# state
-
-	shll $1, %edx	# short int addressing
-	subl %edx, %ebx	# start one row early: every step loads (%ebx,%edx,1) and then advances
-
-	movq 0(%edi), %mm1	# s1[k-1]
-	movq 8(%edi), %mm2	# s2[k-1]
-	.align 16
-	.cascade_vertb_line_loop:
-
-	movq (%ebx,%edx,1), %mm3
-	movq %mm3, %mm0
-	addl %edx, %ebx
-	coupled
-	movq %mm0, (%ebx)
-
-	movq (%ebx,%edx,1), %mm3
-	movq %mm3, %mm0
-	addl %edx, %ebx
-	coupled
-	movq %mm0, (%ebx)
-
-	movq (%ebx,%edx,1), %mm3
-	movq %mm3, %mm0
-	addl %edx, %ebx
-	coupled
-	movq %mm0, (%ebx)
-
-	movq (%ebx,%edx,1), %mm3
-	movq %mm3, %mm0
-	addl %edx, %ebx
-	coupled
-	movq %mm0, (%ebx)
-
-	decl %ecx
-	jnz .cascade_vertb_line_loop
-
-	movq %mm1, 0(%edi)	# s1[k-1]
-	movq %mm2, 8(%edi)	# s2[k-1]
-
-	emms
-
-	pop %edi
-	pop %esi
-	pop %ebx
-	leave
-	ret
-
-.globl pixel_cascade_horlr_s16
-.type pixel_cascade_horlr_s16,@function
-
-
-# pixel_cascade_horlr_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
-# left to right pass over 4 scanlines at once: transpose a 4x4 block, filter its 4 columns, transpose back
-
-pixel_cascade_horlr_s16:
-
-
-	pushl %ebp
-	movl %esp, %ebp
-	push %ebx
-	push %esi
-	push %edi
-
-	movl 8(%ebp), %ebx	# pixel array offset
-	movl 12(%ebp), %ecx	# nb of 4x4 pixblocks
-	movl 16(%ebp), %edx	# line width
-
-	movl 20(%ebp), %esi	# coefs
-	movl 24(%ebp), %edi	# state
-
-	shll $1, %edx	# short int addressing
-	movl %edx, %eax
-	shll $1, %eax
-	addl %edx, %eax	# eax = 3 * edx
-
-
-	.align 16
-	.cascade_horlr_line_loop:
-
-	movq (%ebx), %mm0
-	movq (%ebx,%edx,1), %mm1
-	movq (%ebx,%edx,2), %mm2
-	movq (%ebx,%eax,1), %mm3
-
-	transpose_4x4
-
-	movq %mm1, (%ebx,%edx,1)
-	movq %mm2, (%ebx,%edx,2)
-	movq %mm3, (%ebx,%eax,1)
-	movq (%edi), %mm1	# s1[k-1]: load the state only now, mm1/mm2 held pixel rows until here
-	movq 8(%edi), %mm2	# s2[k-1]
-
-	coupled
-
-	movq %mm0, (%ebx)
-	movq (%ebx,%edx,1), %mm3
-	movq %mm3, %mm0
-
-	coupled
-
-	movq %mm0, (%ebx, %edx,1)
-	movq (%ebx,%edx,2), %mm3
-	movq %mm3, %mm0
-
-	coupled
-
-	movq %mm0, (%ebx, %edx,2)
-	movq (%ebx,%eax,1), %mm3
-	movq %mm3, %mm0
-
-	coupled
-
-	movq %mm1, 0(%edi)	# s1[k-1]
-	movq %mm2, 8(%edi)	# s2[k-1]
-
-	movq %mm0, %mm3	# 4th filtered column is still in a register
-	movq (%ebx), %mm0
-	movq (%ebx,%edx,1), %mm1
-	movq (%ebx,%edx,2), %mm2
-
-	transpose_4x4
-
-	movq %mm0, (%ebx)
-	movq %mm1, (%ebx,%edx,1)
-	movq %mm2, (%ebx,%edx,2)
-	movq %mm3, (%ebx,%eax,1)
-
-	addl $8, %ebx	# next 4x4 block to the right
-	decl %ecx
-	jnz .cascade_horlr_line_loop
-
-	emms
-
-	pop %edi
-	pop %esi
-	pop %ebx
-	leave
-	ret
-
-
-
diff --git a/system/mmx/pixel_cheby_s16.s b/system/mmx/pixel_cheby_s16.s
deleted file mode 100644
index 2afe9e2..0000000
--- a/system/mmx/pixel_cheby_s16.s
+++ /dev/null
@@ -1,90 +0,0 @@
-# Pure Data Packet mmx routine.
-# Copyright (c) by Tom Schouten
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-.globl pixel_cheby_s16_3plus
-.type pixel_cheby_s16_3plus,@function
-
-# void pixel_cheby_s16_3plus(int *buf, int nb_8pixel_vectors, int order+1, short int *coefs)
-# evaluates a chebyshev series in place; every outer iteration handles one 8 byte vector (4 s16 values)
-# the inner loop only terminates on pointer equality, so order+1 must be at least 3
-# coefs are s2.13 fixed point (-4->4)
-pixel_cheby_s16_3plus:
-	pushl %ebp
-	movl %esp, %ebp
-	push %esi
-	push %edi
-	push %edx
-
-	movl 8(%ebp), %esi	# input array
-	movl 12(%ebp), %ecx	# vector count
-	movl 16(%ebp), %eax	# get order+1
-
-	shll $3, %eax	# one 8 byte vector per coefficient
-	movl 20(%ebp), %edx
-	addl %eax, %edx	# edx = coef end address
-
-#	jmp skip
-
-	.align 16
-	.loop_cheby:
-
-	movl 20(%ebp), %edi	# get coefs
-	movq (%esi), %mm0	# load 4 pixels from memory (mm0 = x)
-	pcmpeqw %mm2, %mm2	# all ones
-	movq %mm0, %mm1	# mm1 (T_n-1) <- x
-	psrlw $1, %mm2	# mm2 (T_n-2) <- 1 (0x7fff in every word)
-
-
-	movq (%edi), %mm4	# mm4 (acc) == a0
-	psraw $1, %mm4	# mm4 == a0/2
-	movq %mm0, %mm5	# mm5 (intermediate)
-	pmulhw 8(%edi), %mm5	# mm5 == (x * a1)/2
-	paddsw %mm5, %mm4	# acc = a0/2 + (a1 x)/2
-	addl $16, %edi	# skip a0 and a1
-
-	.loop_cheby_inner:
-	movq %mm1, %mm3	# mm3 == T_n-1
-	psraw $2, %mm2	# mm2 == T_n-2 / 4
-	pmulhw %mm0, %mm3	# mm3 == (2 x T_n-1) / 4
-	psubsw %mm2, %mm3	# mm3 == (2 x T_n-1 - T_n-2) / 4
-	paddsw %mm3, %mm3
-	paddsw %mm3, %mm3	# mm3 == T_n
-	movq %mm1, %mm2	# mm2 == new T_n-2 (the old T_n-1)
-	movq %mm3, %mm1	# mm1 == new T_n-1 (the T_n just computed)
-	pmulhw (%edi), %mm3	# mm3 = a_n * T_n / 2
-	paddsw %mm3, %mm4	# accumulate
-	addl $8, %edi
-	cmpl %edx, %edi
-	jne .loop_cheby_inner
-
-	paddsw %mm4, %mm4	# compensate for 0.125 factor
-	paddsw %mm4, %mm4
-	paddsw %mm4, %mm4
-	movq %mm4, (%esi)	# store result in memory
-	addl $8, %esi	# increment source/dest pointer
-	decl %ecx
-	jnz .loop_cheby	# loop
-
-skip:
-	emms
-
-	pop %edx
-	pop %edi
-	pop %esi
-	leave
-	ret
-
diff --git a/system/mmx/pixel_conv_hor_s16.s b/system/mmx/pixel_conv_hor_s16.s
deleted file mode 100644
index e90a692..0000000
--- a/system/mmx/pixel_conv_hor_s16.s
+++ /dev/null
@@ -1,134 +0,0 @@
-# Pure Data Packet mmx routine.
-# Copyright (c) by Tom Schouten
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-	# intermediate function
-
-	# input in register:
-	# %mm0: left 4 pixels
-	# %mm1: middle 4 pixels
-	# %mm2: right 4 pixels
-
-	# %mm5: left 4 pixel masks
-	# %mm6: middle 4 pixel masks
-	# %mm7: right 4 pixel masks
-
-	# return in register:
-	# %mm0: middle 4 pixels result
-	# clobbers %mm3 and %mm4, leaves %mm1 and %mm2 untouched
-
-	.align 16
-	.conv_hor_4_pixels:
-
-	# compute quadruplet
-
-	# get left pixels
-	psrlq $48, %mm0	# shift word 3 to word 0
-	movq %mm1, %mm4
-	psllq $16, %mm4	# shift word 0,1,2 to 1,2,3
-	por %mm4, %mm0	# combine
-	pmulhw %mm5, %mm0
-	psllw $1, %mm0
-
-
-	# get middle pixels
-	movq %mm1, %mm4
-	pmulhw %mm6, %mm4
-	psllw $1, %mm4
-	paddsw %mm4, %mm0
-
-
-	# get right pixels
-	movq %mm2, %mm3
-	psllq $48, %mm3	# shift word 0 to word 3
-	movq %mm1, %mm4
-	psrlq $16, %mm4	# shift word 1,2,3 to 0,1,2
-	por %mm4, %mm3	# combine
-	pmulhw %mm7, %mm3
-	psllw $1, %mm3
-	paddsw %mm3, %mm0	# accumulate
-
-	ret
-
-.globl pixel_conv_hor_s16
-.type pixel_conv_hor_s16,@function
-
-
-# pixel_conv_hor_s16(short int *pixel_array, int nb_4_pixel_vectors, short int border[4], short int mask[12])
-# horizontal 3 tap pixel conv over one row, in place; border[] stands in for the neighbours beyond both row ends
-# NOT TESTED; nb_4_pixel_vectors must be at least 2
-
-
-pixel_conv_hor_s16:
-
-
-	pushl %ebp
-	movl %esp, %ebp
-	push %esi
-	push %edi
-
-	movl 8(%ebp), %esi	# pixel array offset
-	movl 12(%ebp), %ecx	# nb of 4 pixel vectors in a row (at least 2)
-
-	movl 20(%ebp), %edi	# mask vector
-	movq (%edi), %mm5
-	movq 8(%edi), %mm6
-	movq 16(%edi), %mm7
-
-	movl 16(%ebp), %edi	# boundary pixel vector
-
-
-
-	movq (%edi), %mm0	# init regs (left edge: mm0 is the border vector)
-	movq (%esi), %mm1
-	movq 8(%esi), %mm2
-
-	decl %ecx	# loop has 2 terminator stubs
-	decl %ecx	# ecx = main loop trip count (nb vectors - 2)
-
-	jz .conv_hor_tail	# exactly 2 vectors: entering the loop with ecx == 0 would wrap around
-
-
-	.align 16
-	.conv_line_loop:
-	call .conv_hor_4_pixels	# compute conv
-	movq %mm0, (%esi)	# store result
-	movq %mm1, %mm0	# mm0 <- prev (%esi)
-	movq %mm2, %mm1	# mm1 <- 8(%esi)
-	movq 16(%esi), %mm2	# mm2 <- 16(%esi)
-
-	addl $8, %esi	# increase pointer
-	decl %ecx
-	jnz .conv_line_loop
-	.conv_hor_tail:
-	call .conv_hor_4_pixels	# compute conv
-	movq %mm0, (%esi)	# store result
-	movq %mm1, %mm0	# mm0 <- prev (%esi)
-	movq %mm2, %mm1	# mm1 <- 8(%esi)
-	movq (%edi), %mm2	# mm2 <- border
-
-	call .conv_hor_4_pixels	# compute last vector
-	movq %mm0, 8(%esi)	# store it
-
-	emms
-
-	pop %edi
-	pop %esi
-	leave
-	ret
-
-
-
diff --git a/system/mmx/pixel_conv_ver_s16.s b/system/mmx/pixel_conv_ver_s16.s
deleted file mode 100644
index ae2456f..0000000
--- a/system/mmx/pixel_conv_ver_s16.s
+++ /dev/null
@@ -1,128 +0,0 @@
-# Pure Data Packet mmx routine.
-# Copyright (c) by Tom Schouten
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-#NOTE: element counts below 2 are out of contract and still access memory out of bounds
-
-	# intermediate function
-
-	# input in register:
-	# %mm0: top 4 pixels
-	# %mm1: middle 4 pixels
-	# %mm2: bottom 4 pixels
-
-	# %mm5: top 4 pixel mask
-	# %mm6: middle 4 pixel mask
-	# %mm7: bottom 4 pixel mask
-
-	# return in register:
-	# %mm0: middle 4 pixels result
-	# clobbers %mm3 and %mm4, leaves %mm1 and %mm2 untouched
-
-	.align 16
-	.conv_ver_4_pixels:
-
-	# compute quadruplet
-
-	# get top pixel
-	pmulhw %mm5, %mm0
-	psllw $1, %mm0
-
-	# get middle pixel
-	movq %mm1, %mm4
-	pmulhw %mm6, %mm4
-	psllw $1, %mm4
-	paddsw %mm4, %mm0
-
-	# get bottom pixel
-	movq %mm2, %mm3
-	pmulhw %mm7, %mm3
-	psllw $1, %mm3	# mm3 <- 2 * high word of (bottom * mask)
-	paddsw %mm3, %mm0
-
-	ret
-
-.globl pixel_conv_ver_s16
-.type pixel_conv_ver_s16,@function
-
-
-# pixel_conv_ver_s16(short int *pixel_array, int nb_4_pixel_vectors, int row_byte_size, short int border[4], short int mask[12])
-# vertical 3 tap pixel conv down one column of 4 pixels, in place; border[] stands in for the rows beyond both ends
-# NOT TESTED; nb_4_pixel_vectors (the number of rows visited) must be at least 2
-
-
-pixel_conv_ver_s16:
-
-
-	pushl %ebp
-	movl %esp, %ebp
-	push %esi
-	push %edi
-
-	movl 8(%ebp), %esi	# pixel array offset
-	movl 12(%ebp), %ecx	# nb of 4 pixel vectors in a column (at least 2)
-	movl 16(%ebp), %edx	# row size; doubled below, so presumably counted in s16 pixels - TODO confirm
-
-	movl 24(%ebp), %edi	# mask vector
-	movq (%edi), %mm5
-	movq 8(%edi), %mm6
-	movq 16(%edi), %mm7
-
-	movl 20(%ebp), %edi	# edge vector
-
-
-	shll $1, %edx	# edx = distance between two rows as used for addressing
-	movq (%edi), %mm0	# init regs (top edge: mm0 is the border vector)
-	movq (%esi), %mm1
-
-
-	movq (%esi,%edx,1), %mm2
-	decl %ecx	# loop has a terminator stub
-	decl %ecx	# loop has another terminator stub
-	jz .conv_ver_tail	# exactly 2 rows: entering the loop with ecx == 0 would wrap around
-
-
-	.align 16
-	.conv_line_loop:
-	call .conv_ver_4_pixels	# compute conv
-	movq %mm0, (%esi)	# store result
-	movq %mm1, %mm0	# mm0 <- prev (%esi)
-	movq %mm2, %mm1	# mm1 <- (%esi,%edx,1)
-	movq (%esi,%edx,2), %mm2	# mm2 <- (%esi,%edx,2)
-
-	addl %edx, %esi	# increase pointer
-	decl %ecx
-	jnz .conv_line_loop
-	.conv_ver_tail:
-	call .conv_ver_4_pixels	# compute conv
-	movq %mm0, (%esi)	# store result
-	movq %mm1, %mm0	# mm0 <- prev (%esi)
-	movq %mm2, %mm1	# mm1 <- (%esi,%edx,1)
-	movq (%edi), %mm2	# mm2 <- border (there is no row below the last one)
-
-	addl %edx, %esi	# increase pointer
-	call .conv_ver_4_pixels	# compute last vector
-	movq %mm0, (%esi)	# store it
-
-	emms
-
-	pop %edi
-	pop %esi
-	leave
-	ret
-
-
-
diff --git a/system/mmx/pixel_crot_s16.s b/system/mmx/pixel_crot_s16.s
deleted file mode 100644
index 2427869..0000000
--- a/system/mmx/pixel_crot_s16.s
+++ /dev/null
@@ -1,153 +0,0 @@
-# Pure Data Packet mmx routine.
-# Copyright (c) by Tom Schouten
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-.globl pixel_crot3d_s16
-.type pixel_crot3d_s16,@function
-
-
-# 3 dimensional colour space rotation, in place, on 3 consecutive planes of equal size
-# 3x3 matrix is column encoded, each coefficient is a 4x16 bit fixed point vector
-
-# void pixel_crot3d_s16(int *buf, int nb_4pixel_vectors_per_plane, short int *matrix)
-
-pixel_crot3d_s16:
-	pushl %ebp
-	movl %esp, %ebp
-	push %esi
-	push %edi
-
-
-	movl 8(%ebp), %esi	# input array
-	movl 12(%ebp), %ecx	# nb of 4 pixel vectors per plane (loop count)
-	movl 16(%ebp), %edi	# rotation matrix
-	movl %ecx, %edx
-	shll $3, %edx	# %edx = plane spacing
-
-
-	.align 16
-	.loop_crot3d:
-
-	movq (%esi), %mm0	# get 1st component
-	movq (%esi,%edx,1), %mm6	# get 2nd component
-	movq (%esi,%edx,2), %mm7	# get 3rd component
-
-	movq %mm0, %mm1	# copy 1st component
-	movq %mm0, %mm2
-
-	pmulhw (%edi), %mm0	# mul first column
-	pmulhw 8(%edi), %mm1
-	pmulhw 16(%edi), %mm2
-
-	movq %mm6, %mm5	# copy 2nd component
-	movq %mm6, %mm3
-
-	pmulhw 24(%edi), %mm6	# mul second column
-	pmulhw 32(%edi), %mm5
-	pmulhw 40(%edi), %mm3
-
-	paddsw %mm6, %mm0	# accumulate
-	paddsw %mm5, %mm1
-	paddsw %mm3, %mm2
-
-	movq %mm7, %mm4	# copy 3rd component
-	movq %mm7, %mm6	# mm6 is free again after the accumulate above
-
-	pmulhw 48(%edi), %mm4	# mul third column
-	pmulhw 56(%edi), %mm6
-	pmulhw 64(%edi), %mm7
-
-	paddsw %mm4, %mm0	# accumulate
-	paddsw %mm6, %mm1
-	paddsw %mm7, %mm2
-
-	paddsw %mm0, %mm0	# double (fixed point normalization)
-	paddsw %mm1, %mm1
-	paddsw %mm2, %mm2
-
-	movq %mm0, (%esi)	# store
-	movq %mm1, (%esi, %edx, 1)
-	movq %mm2, (%esi, %edx, 2)
-
-	addl $8, %esi	# increment source pointer
-	decl %ecx
-	jnz .loop_crot3d	# loop
-
-	emms
-
-	pop %edi
-	pop %esi
-	leave
-	ret
-
-
-.globl pixel_crot2d_s16
-.type pixel_crot2d_s16,@function
-
-# 2 dimensional colour space rotation, in place, on 2 consecutive planes of equal size
-# 2x2 matrix is column encoded, each coefficient is a 4x16 bit fixed point vector
-
-# void pixel_crot2d_s16(int *buf, int nb_4pixel_vectors_per_plane, short int *matrix)
-
-pixel_crot2d_s16:
-	pushl %ebp
-	movl %esp, %ebp
-	push %esi
-	push %edi
-
-
-	movl 8(%ebp), %esi	# input array
-	movl 12(%ebp), %ecx	# nb of 4 pixel vectors per plane (loop count)
-	movl 16(%ebp), %edi	# rotation matrix
-	movl %ecx, %edx
-	shll $3, %edx	# %edx = plane spacing
-
-
-	.align 16
-	.loop_crot2d:
-
-	movq (%esi), %mm0	# get 1st component
-	movq (%esi,%edx,1), %mm2	# get 2nd component
-
-	movq %mm0, %mm1	# copy 1st component
-	movq %mm2, %mm3	# copy 2nd component
-
-	pmulhw (%edi), %mm0	# mul first column
-	pmulhw 8(%edi), %mm1
-
-	pmulhw 16(%edi), %mm2	# mul second column
-	pmulhw 24(%edi), %mm3
-
-	paddsw %mm2, %mm0	# accumulate
-	paddsw %mm3, %mm1
-
-	paddsw %mm0, %mm0	# fixed point gain correction
-	paddsw %mm1, %mm1
-
-	movq %mm0, (%esi)	# store
-	movq %mm1, (%esi, %edx, 1)
-
-	addl $8, %esi	# increment source pointer
-	decl %ecx
-	jnz .loop_crot2d	# loop
-
-	emms
-
-	pop %edi
-	pop %esi
-	leave
-	ret
-
diff --git a/system/mmx/pixel_gain.s b/system/mmx/pixel_gain.s
deleted file mode 100644
index 5cd5057..0000000
--- a/system/mmx/pixel_gain.s
+++ /dev/null
@@ -1,83 +0,0 @@
-# Pure Data Packet mmx routine.
-# Copyright (c) by Tom Schouten
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-.globl pixel_gain
-.type pixel_gain,@function
-
-# mmx rgba pixel gain, in place, 4 pixels (16 bytes) per loop iteration
-# void pixel_gain(char *pixelarray, int32 nbpixels, int *rgba_gain)   (rgba_gain is read as int16[4])
-# gains are 7.9 fixed point for rgba
-# NOTE(review): "prefetch" is the 3DNow! form of the instruction - confirm the target CPUs support it
-pixel_gain:
-	pushl %ebp
-	movl %esp, %ebp
-	push %esi
-	push %edi
-
-	movl 8(%ebp), %esi	# pixel array offset
-	movl 12(%ebp), %ecx	# nb of elements
-	movl 16(%ebp), %edi	# int16[4] array of gains
-
-	prefetch (%esi)
-
-	emms
-	sarl $2, %ecx	# process 4 pixels per loop iteration
-	jz .exit	# fewer than 4 pixels: nothing is processed
-	movq (%edi), %mm7	# read gain array from memory
-	jmp .loop_gain
-
-	.align 16
-	.loop_gain:
-
-	prefetch 128(%esi)
-	movq (%esi), %mm5	# load pixel 1-2 from memory
-	movq 8(%esi), %mm6	# load pixel 3-4 from memory
-	pxor %mm0, %mm0	# zero mm0 - mm3
-	pxor %mm1, %mm1
-	pxor %mm2, %mm2
-	pxor %mm3, %mm3
-	punpcklbw %mm5, %mm0	# unpack 1st pixel into 8.8 bit ints (channel byte in the high half of each word)
-	punpckhbw %mm5, %mm1	# unpack 2nd
-	punpcklbw %mm6, %mm2	# unpack 3rd
-	punpckhbw %mm6, %mm3	# unpack 4th
-	psrlw $0x1, %mm0	# shift right to clear sign bit 9.7
-	psrlw $0x1, %mm1
-	psrlw $0x1, %mm2
-	psrlw $0x1, %mm3
-
-	pmulhw %mm7, %mm0	# multiply 1st pixel 9.7 * 7.9 -> 16.0
-	pmulhw %mm7, %mm1	# multiply 2nd
-	pmulhw %mm7, %mm2	# multiply 3rd
-	pmulhw %mm7, %mm3	# multiply 4th
-
-	packuswb %mm1, %mm0	# pack & saturate to 8bit vector
-	movq %mm0, (%esi)	# store result in memory
-	packuswb %mm3, %mm2	# pack & saturate to 8bit vector
-	movq %mm2, 8(%esi)	# store result in memory
-
-	addl $16, %esi	# increment source pointer
-	decl %ecx
-	jnz .loop_gain	# loop
-
-	.exit:
-	emms
-
-	pop %edi
-	pop %esi
-	leave
-	ret
-
diff --git a/system/mmx/pixel_gain_s16.s b/system/mmx/pixel_gain_s16.s
deleted file mode 100644
index adcfdf5..0000000
--- a/system/mmx/pixel_gain_s16.s
+++ /dev/null
@@ -1,71 +0,0 @@
-# Pure Data Packet mmx routine.
-# Copyright (c) by Tom Schouten
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-.globl pixel_gain_s16
-.type pixel_gain_s16,@function
-
-# in place s16 gain: out = saturate((pixel * gain) >> shift); gain is integer, shift count is down
-# void pixel_gain_s16(int *buf, int nb_8pixel_vectors, short int gain[4], unsigned long long *shift)
-# each loop iteration handles one 8 byte vector (4 s16 pixels); the count must be at least 1
-pixel_gain_s16:
-	pushl %ebp
-	movl %esp, %ebp
-	push %esi
-	push %edi
-
-	movl 20(%ebp), %edi
-	movq (%edi), %mm6	# get shift vector
-
-	movl 16(%ebp), %edi
-	movq (%edi), %mm7	# get gain vector
-
-	movl 8(%ebp), %esi	# input array
-	movl 12(%ebp), %ecx	# vector count (loop count)
-
-
-	.align 16
-	.loop_gain:
-
-	movq (%esi), %mm0	# load 4 pixels from memory
-	movq %mm0, %mm1
-	pmulhw %mm7, %mm1	# apply gain (s15.0) fixed point, high word
-	pmullw %mm7, %mm0	# low word
-
-	movq %mm0, %mm2	# copy
-	movq %mm1, %mm3
-
-	punpcklwd %mm1, %mm0	# 32 bit products of pixels 0 and 1
-	punpckhwd %mm3, %mm2	# 32 bit products of pixels 2 and 3
-
-	psrad %mm6, %mm0	# apply signed shift
-	psrad %mm6, %mm2
-
-	packssdw %mm2, %mm0	# pack result & saturate
-	movq %mm0, (%esi)	# store result
-
-
-	addl $8, %esi	# increment source pointer
-	decl %ecx
-	jnz .loop_gain	# loop
-
-	emms
-
-	pop %edi
-	pop %esi
-	leave
-	ret
-
diff --git a/system/mmx/pixel_mix_s16.s b/system/mmx/pixel_mix_s16.s
deleted file mode 100644
index 9bf41eb..0000000
--- a/system/mmx/pixel_mix_s16.s
+++ /dev/null
@@ -1,68 +0,0 @@
-# Pure Data Packet mmx routine.
-# Copyright (c) by Tom Schouten
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-.globl pixel_mix_s16
-.type pixel_mix_s16,@function
-
-# s16 mix: left <- sat(2 * hi16(left * gain_left)) + sat(2 * hi16(right * gain_right)), saturating, written to left
-# void pixel_mix_s16(int *left, int *right, int nb_4pixel_vectors,
-# short int gain_left[4], short int gain_right[4])
-
-pixel_mix_s16:
-	pushl %ebp
-	movl %esp, %ebp
-	push %esi
-	push %edi
-
-	movl 20(%ebp), %edi	# int16[4] array of gains
-	movq (%edi), %mm6	# get left gain array
-
-	movl 24(%ebp), %edi	# int16[4] array of gains
-	movq (%edi), %mm7	# get right gain array
-
-	movl 8(%ebp), %edi	# left array
-	movl 12(%ebp), %esi	# right array
-	movl 16(%ebp), %ecx	# vector count (loop count, must be at least 1)
-
-
-	.align 16
-	.loop_mix:
-
-#	prefetch 128(%esi)
-	movq (%esi), %mm1	# load right 4 pixels from memory
-	pmulhw %mm7, %mm1	# apply right gain
-	movq (%edi), %mm0	# load 4 left pixels from memory
-	pmulhw %mm6, %mm0	# apply left gain
-#	pslaw $1, %mm1	# shift left ((s).15 x (s).15 -> (s0).14))
-#	pslaw $1, %mm0
-	paddsw %mm0, %mm0	# no arithmetic shift left, so use a saturating add instead
-	paddsw %mm1, %mm1
-	paddsw %mm1, %mm0	# mix
-	movq %mm0, (%edi)	# result overwrites the left array
-	addl $8, %esi
-	addl $8, %edi
-	decl %ecx
-	jnz .loop_mix	# loop
-
-	emms
-
-
-	pop %edi
-	pop %esi
-	leave
-	ret
-
diff --git a/system/mmx/pixel_mul_s16.s b/system/mmx/pixel_mul_s16.s
deleted file mode 100644
index
240a024..0000000
--- a/system/mmx/pixel_mul_s16.s
+++ /dev/null
@@ -1,56 +0,0 @@
-# Pure Data Packet mmx routine.
-# Copyright (c) by Tom Schouten
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-.globl pixel_mul_s16
-.type pixel_mul_s16,@function
-
-# s0.15 fixed point multiply: left <- sat(2 * hi16(left * right)), written to left
-# void pixel_mul_s16(int *left, int *right, int nb_4pixel_vectors)
-
-pixel_mul_s16:
-	pushl %ebp
-	movl %esp, %ebp
-	push %esi
-	push %edi
-
-	movl 8(%ebp), %edi	# left array
-	movl 12(%ebp), %esi	# right array
-	movl 16(%ebp), %ecx	# vector count (loop count, must be at least 1)
-
-
-	.align 16
-	.loop_mix:
-
-#	prefetch 128(%esi)
-	movq (%esi), %mm1	# load right 4 pixels from memory
-	movq (%edi), %mm0	# load 4 left pixels from memory
-	pmulhw %mm1, %mm0	# mul
-	paddsw %mm0, %mm0	# fixed point correction; saturating, so 0x8000 * 0x8000 gives 0x7fff, not 0x8000
-	movq %mm0, (%edi)	# result overwrites the left array
-	addl $8, %esi
-	addl $8, %edi
-	decl %ecx
-	jnz .loop_mix	# loop
-
-	emms
-
-
-	pop %edi
-	pop %esi
-	leave
-	ret
-
diff --git a/system/mmx/pixel_pack_s16u8.s b/system/mmx/pixel_pack_s16u8.s
deleted file mode 100644
index 57df702..0000000
--- a/system/mmx/pixel_pack_s16u8.s
+++ /dev/null
@@ -1,126 +0,0 @@
-# Pure Data Packet mmx routine.
-# Copyright (c) by Tom Schouten
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-.globl pixel_pack_s16u8_y
-.type pixel_pack_s16u8_y,@function
-
-# pack s16 pixels to u8: out = unsigned saturate(in >> 7), 8 pixels per loop iteration
-# void pixel_pack_s16u8_y(int *input, int *output, int nb_8pixel_vectors)
-
-pixel_pack_s16u8_y:
-	pushl %ebp
-	movl %esp, %ebp
-	push %esi
-	push %edi
-
-#	movl 20(%ebp), %edi	# int16[4] array of gains
-#	movq (%edi), %mm7	# get gain array
-#	psllw $1, %mm7	# adjust for shifted sign bit
-
-	movl 8(%ebp), %esi	# input array
-	movl 12(%ebp), %edi	# output array
-	movl 16(%ebp), %ecx	# nb of 8 pixel vectors (loop count)
-
-	pxor %mm6, %mm6	# only referenced by the commented out code below
-
-	.align 16
-	.loop_pack_y:
-
-#	prefetch 128(%esi)
-	movq (%esi), %mm0	# load 4 pixels from memory
-#	pmulhw %mm7, %mm0	# apply gain
-	movq 8(%esi), %mm1	# load 4 pixels from memory
-#	pmulhw %mm7, %mm1	# apply gain
-
-#	movq %mm0, %mm2
-#	pcmpgtw %mm6, %mm2	# mm2 > 0 ? 0xffff : 0
-#	pand %mm2, %mm0
-
-#	movq %mm1, %mm3
-#	pcmpgtw %mm6, %mm3	# mm3 > 0 ? 0xffff : 0
-#	pand %mm3, %mm1
-
-#	psllw $1, %mm0	# shift out sign bit
-#	psllw $1, %mm1	# shift out sign bit
-
-	psraw $7, %mm0	# shift to lsb
-	psraw $7, %mm1	# shift to lsb
-
-	packuswb %mm1, %mm0	# pack & saturate to 8bit vector (negative values become 0)
-	movq %mm0, (%edi)	# store result in memory
-
-	addl $16, %esi	# increment source pointer
-	addl $8, %edi	# increment dest pointer
-	decl %ecx
-	jnz .loop_pack_y	# loop
-
-	emms
-
-	pop %edi
-	pop %esi
-	leave
-	ret
-
-.globl pixel_pack_s16u8_uv
-.type pixel_pack_s16u8_uv,@function
-# pack s16 to offset binary u8: out = signed saturate(in >> 8) xor 0x80; same arguments as pixel_pack_s16u8_y
-pixel_pack_s16u8_uv:
-	pushl %ebp
-	movl %esp, %ebp
-	push %esi
-	push %edi
-
-#	movl 20(%ebp), %edi	# int16[4] array of gains
-#	movq (%edi), %mm7	# get gain array
-	movl 8(%ebp), %esi	# input array
-	movl 12(%ebp), %edi	# output array
-	movl 16(%ebp), %ecx	# nb of 8 pixel vectors (loop count)
-
-	pcmpeqw %mm6, %mm6	# 0xffff in every word
-	psllw $15, %mm6	# 0x8000
-	movq %mm6, %mm5
-	psrlw $8, %mm5	# 0x0080
-	por %mm5, %mm6	# mm6 <- 8 times 0x80
-
-	.align 16
-	.loop_pack_uv:
-
-#	prefetch 128(%esi)
-	movq (%esi), %mm0	# load 4 pixels from memory
-#	pmulhw %mm7, %mm0	# apply gain
-	movq 8(%esi), %mm1	# load 4 pixels from memory
-#	pmulhw %mm7, %mm1	# apply gain
-
-	psraw $8, %mm0	# move the high byte down to the low byte (sign extended)
-	psraw $8, %mm1
-
-	packsswb %mm1, %mm0	# pack & saturate to 8bit vector
-	pxor %mm6, %mm0	# flip sign bits
-	movq %mm0, (%edi)	# store result in memory
-
-	addl $16, %esi	# increment source pointer
-	addl $8, %edi	# increment dest pointer
-	decl %ecx
-	jnz .loop_pack_uv	# loop
-
-	emms
-
-	pop %edi
-	pop %esi
-	leave
-	ret
-
diff --git a/system/mmx/pixel_rand_s16.s b/system/mmx/pixel_rand_s16.s
deleted file mode 100644
index 649400b..0000000
--- a/system/mmx/pixel_rand_s16.s
+++ /dev/null
@@ -1,76 +0,0 @@
-# Pure Data Packet mmx routine.
-# Copyright (c) by Tom Schouten
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-.globl pixel_rand_s16
-.type pixel_rand_s16,@function
-
-# fill dst with 4 parallel 16 bit shift register sequences; the final state is written back to random_seed
-# void pixel_rand_s16(int *dst, nb_4pixel_vectors, short int random_seed[4])
-
-pixel_rand_s16:
-	pushl %ebp
-	movl %esp, %ebp
-	push %esi
-	push %edi
-
-	movl 16(%ebp), %esi	# int16[4] array of random seeds
-	movl 8(%ebp), %edi	# dst array
-	movl 12(%ebp), %ecx	# nb of 4 pixel vectors (loop count, must be at least 1)
-
-	movq (%esi), %mm6	# mm6 holds the 4 register states for the whole loop
-
-
-	pcmpeqw %mm3, %mm3
-	psrlw $15, %mm3	# get bit mask 4 times 0x0001
-
-	.align 16
-	.loop_rand:
-
-#	prefetch 128(%esi)
-
-	# new bit = bit15 xor bit14 xor bit12 xor bit3 of every word
-	movq %mm6, %mm4	# get random vector
-	psrlw $15, %mm4	# get first component
-	movq %mm6, %mm5
-	psrlw $14, %mm5	# get second component
-	pxor %mm5, %mm4
-	movq %mm6, %mm5
-	psrlw $12, %mm5	# get third component
-	pxor %mm5, %mm4
-	movq %mm6, %mm5
-	psrlw $3, %mm5	# get fourth component
-	pxor %mm5, %mm4
-
-	psllw $1, %mm6	# shift left original random vector
-	pand %mm3, %mm4	# isolate new bit
-	por %mm4, %mm6	# combine into new random vector
-
-	movq %mm6, (%edi)	# emit the new state as 4 output pixels
-	addl $8, %edi
-	decl %ecx
-	jnz .loop_rand	# loop
-
-
-	movq %mm6, (%esi)	# store random seeds
-
-	emms
-
-	pop %edi
-	pop %esi
-	leave
-	ret
-
diff --git a/system/mmx/pixel_randmix_s16.s b/system/mmx/pixel_randmix_s16.s
deleted file mode 100644
index 44e1702..0000000
---
a/system/mmx/pixel_randmix_s16.s +++ /dev/null @@ -1,91 +0,0 @@ -# Pure Data Packet mmx routine. -# Copyright (c) by Tom Schouten -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -# -.globl pixel_randmix_s16 -.type pixel_randmix_s16,@function - -# mmx rgba pixel gain -# void pixel_randmix_s16(int *left, int *right, int nb_4pixel_vectors, short int random_seed[4], short int threshold[4]) - -pixel_randmix_s16: - pushl %ebp - movl %esp, %ebp - push %esi - push %edi - - movl 20(%ebp), %edi # int16[4] array of random seeds - movq (%edi), %mm6 - - movl 24(%ebp), %edi # int16[4] array of thresholds - movq (%edi), %mm7 - - movl 8(%ebp), %edi # left array - movl 12(%ebp), %esi # right array - movl 16(%ebp), %ecx # pixel count - - pcmpeqw %mm3, %mm3 - psrlw $15, %mm3 # get bit mask 4 times 0x0001 - - .align 16 - .loop_randmix: - -# prefetch 128(%esi) - movq (%esi), %mm1 # load right 4 pixels from memory - movq (%edi), %mm0 # load 4 left pixels from memory - - movq %mm6, %mm2 # get random vector - pcmpgtw %mm7, %mm2 # compare random vector with threshold - movq %mm2, %mm5 - - pand %mm0, %mm2 # get left array's components - pandn %mm1, %mm5 # get right array's components - por %mm2, %mm5 - - movq %mm5, (%edi) # store pixels - - movq %mm6, %mm4 # get random vector - psrlw $15, %mm4 # get first component - movq %mm6, %mm5 - psrlw $14, %mm5 # 
get second component - pxor %mm5, %mm4 - movq %mm6, %mm5 - psrlw $12, %mm5 # get third component - pxor %mm5, %mm4 - movq %mm6, %mm5 - psrlw $3, %mm5 # get forth component - pxor %mm5, %mm4 - - psllw $1, %mm6 # shift left original random vector - pand %mm3, %mm4 # isolate new bit - por %mm4, %mm6 # combine into new random vector - - addl $8, %esi - addl $8, %edi - decl %ecx - jnz .loop_randmix # loop - - - movl 20(%ebp), %edi # int16[4] array of random seeds - movq %mm6, (%edi) # store random seeds - - emms - - pop %edi - pop %esi - leave - ret - diff --git a/system/mmx/pixel_resample_s16.s b/system/mmx/pixel_resample_s16.s deleted file mode 100644 index 3959f9c..0000000 --- a/system/mmx/pixel_resample_s16.s +++ /dev/null @@ -1,314 +0,0 @@ - - -#interpolation data: -#* 4 vectors: neighbourhood for samples (TL, TR, BL, BR) -#* 2 vectors: fractional part (unsigned) -#* 2 vectors: addresses of pixel blocks - -#coord conversion data: -#1 vector: 32bit splatted address -#1 vector: 16bit splatted w-1 -#1 vector: 16bit splatted h-1 -#1 vector: 16bit splatted w (reuse w-1 with add?) -#1 dword: 32 bit line offset - -#coord generation data: several vectors for parameter update stuff.. - -#coordinate systems: 16 bit virtual coordinates (signed, center relative) -#* 2 vectors: virtual coordinates -#(evt tussenstap + conversie naar 16 bit virtual) - - -#step 1: generate virtual coords - - -#step 2: virtual coords -> block adresses + fractional adresses -#* mulhigh: real coords (x,y) (center relative) -#* add center -> unsigned (top left relative) -#* mullow: fractional part (x_frac, y_frac) -#* mulhigh, mullow, pack 32bit: y_offset -#* pack 32bit: x_offset -#* add, shift, add start address: real addresses - - -#step3: data fetch using generated addresses: -# this step would be much simpler in 4x16bit rgba. life's a bitch.. 
- -#step4: billinear interpolation - -#stat5: store - - - - # this can be simplified by doing 32 bit unaligned moves - # and vector unpacking on the data - - - - # cooked image data structure - # pixel environment temp storage - TL1 = 0x00 - TL2 = 0x02 - TL3 = 0x04 - TL4 = 0x06 - TR1 = 0x08 - TR2 = 0x0A - TR3 = 0x0C - TR4 = 0x0E - BL1 = 0x10 - BL2 = 0x12 - BL3 = 0x14 - BL4 = 0x16 - BR1 = 0x18 - BR2 = 0x1A - BR3 = 0x1C - BR4 = 0x1E - # addresses of pixel blocks - ADDRESS1 = 0x20 - ADDRESS2 = 0x24 - ADDRESS3 = 0x28 - ADDRESS4 = 0x2C - - # second env + address buffer (testing: not used) - SECONDBUFFER = 0x30 - - # 32bit splatted bitmap address - V2PLANEADDRESS = 0x60 - # 16bit splatted image constants - V4TWOWIDTHM1 = 0x68 - V4TWOHEIGHTM1 = 0x70 - V4LINEOFFSET = 0x78 - # data struct size - RESAMPLEDATASIZE = 0x80 - - - - # interpolation routine - # input: %mm0, %mm1 4 x 16bit unsigned top left relative virtual x and y coordinates - # %esi: temp & algo data structure - -getpixelsbilin: psrlw $1, %mm0 # convert to range 0->0x7fff [0,0.5[ - psrlw $1, %mm1 - movq %mm0, %mm2 - movq %mm1, %mm3 - movq V4TWOWIDTHM1(%esi), %mm4 # 2 * (width - 1) - movq V4TWOHEIGHTM1(%esi), %mm5 # 2 * (height - 1) - pmulhw %mm5, %mm3 # mm3 == y coord (topleft relative) - pmulhw %mm4, %mm2 # mm2 == x coord (topleft relative) - pmullw %mm5, %mm1 # mm1 == y frac (unsigned) - pmullw %mm4, %mm0 # mm0 == x frac (unsigned) - - movq %mm3, %mm5 # copy y coord - pmullw V4LINEOFFSET(%esi), %mm3 # low part of line offset - pmulhw V4LINEOFFSET(%esi), %mm5 # high part of line offset - - movq %mm2, %mm7 # copy x coord vector - pxor %mm4, %mm4 - punpcklwd %mm4, %mm2 # low part in %mm2 - punpckhwd %mm4, %mm7 # hight part in %mm7 - - movq %mm3, %mm6 # copy - punpcklwd %mm5, %mm3 # unpack low part in %mm3 - punpckhwd %mm5, %mm6 # high part int %mm6 - - paddd %mm2, %mm3 - paddd %mm7, %mm6 - pslld $1, %mm3 # convert to word adresses - pslld $1, %mm6 - - paddd V2PLANEADDRESS(%esi), %mm3 # add pixel plane address - 
paddd V2PLANEADDRESS(%esi), %mm6 - - movq %mm3, ADDRESS1(%esi) # store adresses - movq %mm6, ADDRESS3(%esi) - - pcmpeqw %mm2, %mm2 # all ones - movq %mm0, %mm4 # copy x frac - movq %mm1, %mm5 # copy y frac - pxor %mm2, %mm4 # compute compliment (approx negative) - pxor %mm2, %mm5 - - psrlw $1, %mm0 # shift right (0.5 * (frac x) - psrlw $1, %mm1 # shift right (0.5 * (frac y) - psrlw $1, %mm4 # shift right (0.5 * (1 - frac x) - psrlw $1, %mm5 # shift right (0.5 * (1 - frac y) - - movq %mm0, %mm2 # copy of frac x - movq %mm4, %mm3 # copy of (1-frac x) - # fetch data - - #jmp skipfetch # seems the fetch is the real killer. try to optimize this - # using 32 bit accesses & shifts - - # the src image data struct is padded to the cooked data struct - movl RESAMPLEDATASIZE(%esi), %edi - shll $1, %edi - - movl ADDRESS1(%esi), %ecx - movl ADDRESS2(%esi), %edx - - movw (%ecx), %ax - movw (%edx), %bx - movw %ax, TL1(%esi) - movw %bx, TL2(%esi) - movw 2(%ecx), %ax - movw 2(%edx), %bx - movw %ax, TR1(%esi) - movw %bx, TR2(%esi) - - addl %edi, %ecx - addl %edi, %edx - - movw (%ecx), %ax - movw (%edx), %bx - movw %ax, BL1(%esi) - movw %bx, BL2(%esi) - movw 2(%ecx), %ax - movw 2(%edx), %bx - movw %ax, BR1(%esi) - movw %bx, BR2(%esi) - - - movl ADDRESS3(%esi), %ecx - movl ADDRESS4(%esi), %edx - - - movw (%ecx), %ax - movw (%edx), %bx - movw %ax, TL3(%esi) - movw %bx, TL4(%esi) - movw 2(%ecx), %ax - movw 2(%edx), %bx - movw %ax, TR3(%esi) - movw %bx, TR4(%esi) - - addl %edi, %ecx - addl %edi, %edx - - movw (%ecx), %ax - movw (%edx), %bx - movw %ax, BL3(%esi) - movw %bx, BL4(%esi) - movw 2(%ecx), %ax - movw 2(%edx), %bx - movw %ax, BR3(%esi) - movw %bx, BR4(%esi) - - -skipfetch: - pmulhw TL1(%esi), %mm4 # bilin interpolation - pmulhw TR1(%esi), %mm0 - pmulhw BL1(%esi), %mm3 - pmulhw BR1(%esi), %mm2 - - - paddw %mm4, %mm0 - paddw %mm3, %mm2 - - pmulhw %mm5, %mm0 - pmulhw %mm1, %mm2 - - paddw %mm2, %mm0 - psllw $2, %mm0 # compensate for gain reduction - - ret - - - // linear mapping data 
struct - ROWSTATEX = 0x0 - ROWSTATEY = 0x8 - COLSTATEX = 0x10 - COLSTATEY = 0x18 - ROWINCX = 0x20 - ROWINCY = 0x28 - COLINCX = 0x30 - COLINCY = 0x38 - - // image data struct - LINEOFFSET = 0x0 - IMAGEADDRESS = 0x4 - WIDTH = 0x8 - HEIGHT = 0xC - IMAGEDATASIZE = 0x10 - - - -# pixel_resample_linmap_s16(void *x) -.globl pixel_resample_linmap_s16 -.type pixel_resample_linmap_s16,@function - - SOURCEIMAGE = RESAMPLEDATASIZE - DESTIMAGE = SOURCEIMAGE + IMAGEDATASIZE - LINMAPDATA = DESTIMAGE + IMAGEDATASIZE - -pixel_resample_linmap_s16: - pushl %ebp - movl %esp, %ebp - pushl %esi - pushl %edi - pushl %ebx - - - movl 8(%ebp), %esi # get data struct - movl DESTIMAGE+HEIGHT(%esi), %edx # image height - movl DESTIMAGE+IMAGEADDRESS(%esi), %edi # dest image address - movl DESTIMAGE+WIDTH(%esi), %ecx # image width - shrl $2, %ecx # vector count - .align 16 - -linmap_looprow: - movq LINMAPDATA+ROWSTATEX(%esi), %mm0 # get current coordinates - movq LINMAPDATA+ROWSTATEY(%esi), %mm1 - -linmap_loopcol: - movq %mm0, %mm4 # copy - movq %mm1, %mm5 - paddd LINMAPDATA+ROWINCX(%esi), %mm4 # increment - paddd LINMAPDATA+ROWINCY(%esi), %mm5 - movq %mm4, %mm6 # copy - movq %mm5, %mm7 - paddd LINMAPDATA+ROWINCX(%esi), %mm6 # increment - paddd LINMAPDATA+ROWINCY(%esi), %mm7 - movq %mm6, LINMAPDATA+ROWSTATEX(%esi) # store next state - movq %mm7, LINMAPDATA+ROWSTATEY(%esi) - - psrad $16, %mm0 # round to 16 bit - psrad $16, %mm1 - psrad $16, %mm4 - psrad $16, %mm5 - packssdw %mm4, %mm0 # pack new coordinates - packssdw %mm5, %mm1 - - push %ecx - push %edx - push %edi - - call getpixelsbilin # do interpolation - - pop %edi - pop %edx - pop %ecx - movq %mm0, (%edi) # store 4 pixels - addl $0x8, %edi # point to next 4 pixels - decl %ecx # dec row counter - jnz linmap_looprow - - movq LINMAPDATA+COLSTATEX(%esi), %mm0 # get column state vector - movq LINMAPDATA+COLSTATEY(%esi), %mm1 - movl DESTIMAGE+WIDTH(%esi), %ecx # image width - shrl $2, %ecx # vector count - paddd LINMAPDATA+COLINCX(%esi), %mm0 # 
increment - paddd LINMAPDATA+COLINCY(%esi), %mm1 - movq %mm0, LINMAPDATA+COLSTATEX(%esi) # store - movq %mm1, LINMAPDATA+COLSTATEY(%esi) - decl %edx # dec column counter - jnz linmap_loopcol - - emms - popl %ebx - popl %edi - popl %esi - leave - ret - - diff --git a/system/mmx/pixel_s1.s b/system/mmx/pixel_s1.s deleted file mode 100644 index d6bc5ca..0000000 --- a/system/mmx/pixel_s1.s +++ /dev/null @@ -1,201 +0,0 @@ -# Pure Data Packet mmx routine. -# Copyright (c) by Tom Schouten -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
-# - - # this file contains ops for binary image processing - # 8x8 bit tile encoded - # low byte = bottom row - # low bit = right column - # %mm7 = scratch reg for all macros - - - # ************ load mask ******************* - # compute bit masks for rows and columns - # %mm7: scratch reg - - # load mask top - .macro ldmt count reg - pcmpeqb \reg, \reg - psllq $(64-(\count<<3)), \reg - .endm - - # load mask bottom - .macro ldmb count reg - pcmpeqb \reg, \reg - psrlq $(64-(\count<<3)), \reg - .endm - - # load mask top and bottom - .macro ldmtb count regt regb - ldmb \count, \regb - ldmt \count, \regt - .endm - - # load mask right - .macro ldmr count reg - pcmpeqb %mm7, %mm7 - psrlw $(16-\count), %mm7 - movq %mm7, \reg - psllq $8, %mm7 - por %mm7, \reg - .endm - - # load mask left - .macro ldml count reg - pcmpeqb %mm7, %mm7 - psllw $(16-\count), %mm7 - movq %mm7, \reg - psrlq $8, %mm7 - por %mm7, \reg - .endm - - # load mask left and right - .macro ldmlr count regl regr - pcmpeqb %mm7, %mm7 - psllw $(16-\count), %mm7 - movq %mm7, \regl - psrlq $8, %mm7 - por %mm7, \regl - movq \regl, \regr - psrlq $(8-\count), \regr - .endm - - # ************* shift square ********** - # shifts a square in reg, fills with zeros - - # shift square top - .macro sst count reg - psllq $(\count<<3), \reg - .endm - - # shift square bottom - .macro ssb count reg - psrlq $(\count<<3), \reg - .endm - - # not tested - # shift square left - .macro ssl count reg - movq \reg, %mm7 - pcmpeqb \reg, \reg - psllw $(16-\count), \reg - psrlw $8, \reg - pandn %mm7, \reg - psllw $(\count), \reg - .endm - - # shift square right - .macro ssr count reg - movq \reg, %mm7 - pcmpeqb \reg, \reg - psrlw $(16-\count), \reg - psllw $8, \reg - pandn %mm7, \reg - psrlw $(\count), \reg - .endm - - - # ********** combine square ************* - # combines 2 squares - - # combine right - .macro csr count regr reg - ssl \count, \reg - ssr (8-\count), \regr - por \regr, \reg - .endm - - # combine left - .macro csl 
count regl reg - ssr \count, \reg - ssl (8-\count), \regl - por \regl, \reg - .endm - - # combine top - .macro cst count regt reg - ssb \count, \reg - sst (8-\count), \regt - por \regt, \reg - .endm - - - # combine bottom - .macro csb count regb reg - sst \count, \reg - ssb (8-\count), \regb - por \regb, \reg - .endm - - - # ********** load combine square ************* - # loads combined square using mask - - # load combined square left - # mask should be count bits set right (i.e. 0x01) - .macro lcsml count mask source sourcel dstreg - movq \mask, \dstreg - movq \mask, %mm7 - pandn \source, \dstreg - pand \sourcel, %mm7 - psrlq $(\count), \dstreg - psllq $(8-\count), %mm7 - por %mm7, \dstreg - .endm - - - -.globl pixel_test_s1 -.type pixel_test_s1,@function - -# simple add -# void pixel_add_s16(void *dest, void *source, int nb_squares, int spacing) - - - - # - - -pixel_test_s1: - pushl %ebp - movl %esp, %ebp - push %esi - push %edi - - movl 8(%ebp), %edi # dest - movl 12(%ebp), %esi # source - movl 16(%ebp), %ecx # count - movl 20(%ebp), %edx # row distance - - ldmr 1, %mm6 - lcsml 1, %mm6, (%esi), 8(%esi), %mm0 - movq %mm0, (%edi) - - -# movq (%esi), %mm0 -# movq 8(%esi), %mm1 -# csl 4, %mm1, %mm0 -# movq %mm0, (%edi) - - emms - - - pop %edi - pop %esi - leave - ret - diff --git a/system/mmx/pixel_unpack_u8s16.s b/system/mmx/pixel_unpack_u8s16.s deleted file mode 100644 index 0fc14c2..0000000 --- a/system/mmx/pixel_unpack_u8s16.s +++ /dev/null @@ -1,113 +0,0 @@ -# Pure Data Packet mmx routine. -# Copyright (c) by Tom Schouten -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. 
-# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -# -.globl pixel_unpack_u8s16_y -.type pixel_unpack_u8s16_y,@function - -# mmx rgba pixel gain -# void pixel_unpack_u8s16_y(char *input, char *output, int32 nb_pixels_div8) - -pixel_unpack_u8s16_y: - pushl %ebp - movl %esp, %ebp - push %esi - push %edi - -# movl 20(%ebp), %edi # int16[4] array of gains -# movq (%edi), %mm7 # get gain array - - movl 8(%ebp), %esi # input uint8 pixel array - movl 12(%ebp), %edi # output sint16 pixel array - movl 16(%ebp), %ecx # nb of elements div 8 - - - .align 16 - .loop_unpack_y: - - movq (%esi), %mm5 # load 8 pixels from memory - pxor %mm0, %mm0 # zero mm0 - mm3 - pxor %mm1, %mm1 - punpcklbw %mm5, %mm0 # unpack 1st 4 pixels - punpckhbw %mm5, %mm1 # unpack 2nd 4 pixles - psrlw $0x1, %mm0 # shift right to clear sign bit 9.7 - psrlw $0x1, %mm1 -# pmulhw %mm7, %mm0 # apply gain -# pmulhw %mm7, %mm1 -# paddsw %mm0, %mm0 # correct factor 2 -# paddsw %mm1, %mm1 - movq %mm0, (%edi) # store - movq %mm1, 8(%edi) - - addl $8, %esi # increment source pointer - addl $16, %edi # increment dest pointer - decl %ecx - jnz .loop_unpack_y # loop - - emms - - pop %edi - pop %esi - leave - ret - -.globl pixel_unpack_u8s16_uv -.type pixel_unpack_u8s16_uv,@function -pixel_unpack_u8s16_uv: - pushl %ebp - movl %esp, %ebp - push %esi - push %edi - -# movl 20(%ebp), %edi # int16[4] array of gains -# movq (%edi), %mm7 # get gain array - - movl 8(%ebp), %esi # input uint8 pixel array - movl 12(%ebp), %edi # output sint16 pixel array - movl 16(%ebp), %ecx # nb of elements div 8 - - pcmpeqw %mm6, %mm6 - psllw $15, %mm6 - - 
.align 16 - .loop_unpack_uv: - - movq (%esi), %mm5 # load 8 pixels from memory - pxor %mm0, %mm0 # zero mm0 - mm3 - pxor %mm1, %mm1 - punpcklbw %mm5, %mm0 # unpack 1st 4 pixels - punpckhbw %mm5, %mm1 # unpack 2nd 4 pixles - pxor %mm6, %mm0 # flip sign bit (Cr and Cb are ofset by 128) - pxor %mm6, %mm1 -# pmulhw %mm7, %mm0 # apply gain -# pmulhw %mm7, %mm1 -# paddsw %mm0, %mm0 # correct factor 2 -# paddsw %mm1, %mm1 - movq %mm0, (%edi) # store - movq %mm1, 8(%edi) - - addl $8, %esi # increment source pointer - addl $16, %edi # increment dest pointer - decl %ecx - jnz .loop_unpack_uv # loop - - emms - - pop %edi - pop %esi - leave - ret - -- cgit v1.2.1