aboutsummaryrefslogtreecommitdiff
path: root/system/mmx/pixel_resample_s16.s
diff options
context:
space:
mode:
Diffstat (limited to 'system/mmx/pixel_resample_s16.s')
-rw-r--r--system/mmx/pixel_resample_s16.s314
1 files changed, 0 insertions, 314 deletions
diff --git a/system/mmx/pixel_resample_s16.s b/system/mmx/pixel_resample_s16.s
deleted file mode 100644
index 3959f9c..0000000
--- a/system/mmx/pixel_resample_s16.s
+++ /dev/null
@@ -1,314 +0,0 @@
-
-
-#interpolation data:
-#* 4 vectors: neighbourhood for samples (TL, TR, BL, BR)
-#* 2 vectors: fractional part (unsigned)
-#* 2 vectors: addresses of pixel blocks
-
-#coord conversion data:
-#1 vector: 32bit splatted address
-#1 vector: 16bit splatted w-1
-#1 vector: 16bit splatted h-1
-#1 vector: 16bit splatted w (reuse w-1 with add?)
-#1 dword: 32 bit line offset
-
-#coord generation data: several vectors for parameter update stuff..
-
-#coordinate systems: 16 bit virtual coordinates (signed, center relative)
-#* 2 vectors: virtual coordinates
-#(evt tussenstap + conversie naar 16 bit virtual)
-
-
-#step 1: generate virtual coords
-
-
-#step 2: virtual coords -> block adresses + fractional adresses
-#* mulhigh: real coords (x,y) (center relative)
-#* add center -> unsigned (top left relative)
-#* mullow: fractional part (x_frac, y_frac)
-#* mulhigh, mullow, pack 32bit: y_offset
-#* pack 32bit: x_offset
-#* add, shift, add start address: real addresses
-
-
-#step3: data fetch using generated addresses:
-# this step would be much simpler in 4x16bit rgba. life's a bitch..
-
-#step4: billinear interpolation
-
-#stat5: store
-
-
-
- # this can be simplified by doing 32 bit unaligned moves
- # and vector unpacking on the data
-
-
-
- # cooked image data structure
- # pixel environment temp storage
- TL1 = 0x00
- TL2 = 0x02
- TL3 = 0x04
- TL4 = 0x06
- TR1 = 0x08
- TR2 = 0x0A
- TR3 = 0x0C
- TR4 = 0x0E
- BL1 = 0x10
- BL2 = 0x12
- BL3 = 0x14
- BL4 = 0x16
- BR1 = 0x18
- BR2 = 0x1A
- BR3 = 0x1C
- BR4 = 0x1E
- # addresses of pixel blocks
- ADDRESS1 = 0x20
- ADDRESS2 = 0x24
- ADDRESS3 = 0x28
- ADDRESS4 = 0x2C
-
- # second env + address buffer (testing: not used)
- SECONDBUFFER = 0x30
-
- # 32bit splatted bitmap address
- V2PLANEADDRESS = 0x60
- # 16bit splatted image constants
- V4TWOWIDTHM1 = 0x68
- V4TWOHEIGHTM1 = 0x70
- V4LINEOFFSET = 0x78
- # data struct size
- RESAMPLEDATASIZE = 0x80
-
-
-
- # interpolation routine
- # input: %mm0, %mm1 4 x 16bit unsigned top left relative virtual x and y coordinates
- # %esi: temp & algo data structure
-
-getpixelsbilin: psrlw $1, %mm0 # convert to range 0->0x7fff [0,0.5[
- psrlw $1, %mm1
- movq %mm0, %mm2
- movq %mm1, %mm3
- movq V4TWOWIDTHM1(%esi), %mm4 # 2 * (width - 1)
- movq V4TWOHEIGHTM1(%esi), %mm5 # 2 * (height - 1)
- pmulhw %mm5, %mm3 # mm3 == y coord (topleft relative)
- pmulhw %mm4, %mm2 # mm2 == x coord (topleft relative)
- pmullw %mm5, %mm1 # mm1 == y frac (unsigned)
- pmullw %mm4, %mm0 # mm0 == x frac (unsigned)
-
- movq %mm3, %mm5 # copy y coord
- pmullw V4LINEOFFSET(%esi), %mm3 # low part of line offset
- pmulhw V4LINEOFFSET(%esi), %mm5 # high part of line offset
-
- movq %mm2, %mm7 # copy x coord vector
- pxor %mm4, %mm4
- punpcklwd %mm4, %mm2 # low part in %mm2
- punpckhwd %mm4, %mm7 # hight part in %mm7
-
- movq %mm3, %mm6 # copy
- punpcklwd %mm5, %mm3 # unpack low part in %mm3
- punpckhwd %mm5, %mm6 # high part int %mm6
-
- paddd %mm2, %mm3
- paddd %mm7, %mm6
- pslld $1, %mm3 # convert to word adresses
- pslld $1, %mm6
-
- paddd V2PLANEADDRESS(%esi), %mm3 # add pixel plane address
- paddd V2PLANEADDRESS(%esi), %mm6
-
- movq %mm3, ADDRESS1(%esi) # store adresses
- movq %mm6, ADDRESS3(%esi)
-
- pcmpeqw %mm2, %mm2 # all ones
- movq %mm0, %mm4 # copy x frac
- movq %mm1, %mm5 # copy y frac
- pxor %mm2, %mm4 # compute compliment (approx negative)
- pxor %mm2, %mm5
-
- psrlw $1, %mm0 # shift right (0.5 * (frac x)
- psrlw $1, %mm1 # shift right (0.5 * (frac y)
- psrlw $1, %mm4 # shift right (0.5 * (1 - frac x)
- psrlw $1, %mm5 # shift right (0.5 * (1 - frac y)
-
- movq %mm0, %mm2 # copy of frac x
- movq %mm4, %mm3 # copy of (1-frac x)
- # fetch data
-
- #jmp skipfetch # seems the fetch is the real killer. try to optimize this
- # using 32 bit accesses & shifts
-
- # the src image data struct is padded to the cooked data struct
- movl RESAMPLEDATASIZE(%esi), %edi
- shll $1, %edi
-
- movl ADDRESS1(%esi), %ecx
- movl ADDRESS2(%esi), %edx
-
- movw (%ecx), %ax
- movw (%edx), %bx
- movw %ax, TL1(%esi)
- movw %bx, TL2(%esi)
- movw 2(%ecx), %ax
- movw 2(%edx), %bx
- movw %ax, TR1(%esi)
- movw %bx, TR2(%esi)
-
- addl %edi, %ecx
- addl %edi, %edx
-
- movw (%ecx), %ax
- movw (%edx), %bx
- movw %ax, BL1(%esi)
- movw %bx, BL2(%esi)
- movw 2(%ecx), %ax
- movw 2(%edx), %bx
- movw %ax, BR1(%esi)
- movw %bx, BR2(%esi)
-
-
- movl ADDRESS3(%esi), %ecx
- movl ADDRESS4(%esi), %edx
-
-
- movw (%ecx), %ax
- movw (%edx), %bx
- movw %ax, TL3(%esi)
- movw %bx, TL4(%esi)
- movw 2(%ecx), %ax
- movw 2(%edx), %bx
- movw %ax, TR3(%esi)
- movw %bx, TR4(%esi)
-
- addl %edi, %ecx
- addl %edi, %edx
-
- movw (%ecx), %ax
- movw (%edx), %bx
- movw %ax, BL3(%esi)
- movw %bx, BL4(%esi)
- movw 2(%ecx), %ax
- movw 2(%edx), %bx
- movw %ax, BR3(%esi)
- movw %bx, BR4(%esi)
-
-
-skipfetch:
- pmulhw TL1(%esi), %mm4 # bilin interpolation
- pmulhw TR1(%esi), %mm0
- pmulhw BL1(%esi), %mm3
- pmulhw BR1(%esi), %mm2
-
-
- paddw %mm4, %mm0
- paddw %mm3, %mm2
-
- pmulhw %mm5, %mm0
- pmulhw %mm1, %mm2
-
- paddw %mm2, %mm0
- psllw $2, %mm0 # compensate for gain reduction
-
- ret
-
-
- // linear mapping data struct
- ROWSTATEX = 0x0
- ROWSTATEY = 0x8
- COLSTATEX = 0x10
- COLSTATEY = 0x18
- ROWINCX = 0x20
- ROWINCY = 0x28
- COLINCX = 0x30
- COLINCY = 0x38
-
- // image data struct
- LINEOFFSET = 0x0
- IMAGEADDRESS = 0x4
- WIDTH = 0x8
- HEIGHT = 0xC
- IMAGEDATASIZE = 0x10
-
-
-
-# pixel_resample_linmap_s16(void *x)
-.globl pixel_resample_linmap_s16
-.type pixel_resample_linmap_s16,@function
-
- SOURCEIMAGE = RESAMPLEDATASIZE
- DESTIMAGE = SOURCEIMAGE + IMAGEDATASIZE
- LINMAPDATA = DESTIMAGE + IMAGEDATASIZE
-
-pixel_resample_linmap_s16:
- pushl %ebp
- movl %esp, %ebp
- pushl %esi
- pushl %edi
- pushl %ebx
-
-
- movl 8(%ebp), %esi # get data struct
- movl DESTIMAGE+HEIGHT(%esi), %edx # image height
- movl DESTIMAGE+IMAGEADDRESS(%esi), %edi # dest image address
- movl DESTIMAGE+WIDTH(%esi), %ecx # image width
- shrl $2, %ecx # vector count
- .align 16
-
-linmap_looprow:
- movq LINMAPDATA+ROWSTATEX(%esi), %mm0 # get current coordinates
- movq LINMAPDATA+ROWSTATEY(%esi), %mm1
-
-linmap_loopcol:
- movq %mm0, %mm4 # copy
- movq %mm1, %mm5
- paddd LINMAPDATA+ROWINCX(%esi), %mm4 # increment
- paddd LINMAPDATA+ROWINCY(%esi), %mm5
- movq %mm4, %mm6 # copy
- movq %mm5, %mm7
- paddd LINMAPDATA+ROWINCX(%esi), %mm6 # increment
- paddd LINMAPDATA+ROWINCY(%esi), %mm7
- movq %mm6, LINMAPDATA+ROWSTATEX(%esi) # store next state
- movq %mm7, LINMAPDATA+ROWSTATEY(%esi)
-
- psrad $16, %mm0 # round to 16 bit
- psrad $16, %mm1
- psrad $16, %mm4
- psrad $16, %mm5
- packssdw %mm4, %mm0 # pack new coordinates
- packssdw %mm5, %mm1
-
- push %ecx
- push %edx
- push %edi
-
- call getpixelsbilin # do interpolation
-
- pop %edi
- pop %edx
- pop %ecx
- movq %mm0, (%edi) # store 4 pixels
- addl $0x8, %edi # point to next 4 pixels
- decl %ecx # dec row counter
- jnz linmap_looprow
-
- movq LINMAPDATA+COLSTATEX(%esi), %mm0 # get column state vector
- movq LINMAPDATA+COLSTATEY(%esi), %mm1
- movl DESTIMAGE+WIDTH(%esi), %ecx # image width
- shrl $2, %ecx # vector count
- paddd LINMAPDATA+COLINCX(%esi), %mm0 # increment
- paddd LINMAPDATA+COLINCY(%esi), %mm1
- movq %mm0, LINMAPDATA+COLSTATEX(%esi) # store
- movq %mm1, LINMAPDATA+COLSTATEY(%esi)
- decl %edx # dec column counter
- jnz linmap_loopcol
-
- emms
- popl %ebx
- popl %edi
- popl %esi
- leave
- ret
-
-