aboutsummaryrefslogtreecommitdiff
path: root/system/mmx/pixel_resample_s16.s
diff options
context:
space:
mode:
Diffstat (limited to 'system/mmx/pixel_resample_s16.s')
-rw-r--r--system/mmx/pixel_resample_s16.s314
1 files changed, 314 insertions, 0 deletions
diff --git a/system/mmx/pixel_resample_s16.s b/system/mmx/pixel_resample_s16.s
new file mode 100644
index 0000000..3959f9c
--- /dev/null
+++ b/system/mmx/pixel_resample_s16.s
@@ -0,0 +1,314 @@
+
+
+#interpolation data:
+#* 4 vectors: neighbourhood for samples (TL, TR, BL, BR)
+#* 2 vectors: fractional part (unsigned)
+#* 2 vectors: addresses of pixel blocks
+
+#coord conversion data:
+#1 vector: 32bit splatted address
+#1 vector: 16bit splatted w-1
+#1 vector: 16bit splatted h-1
+#1 vector: 16bit splatted w (reuse w-1 with add?)
+#1 dword: 32 bit line offset
+
+#coord generation data: several vectors for parameter update stuff..
+
+#coordinate systems: 16 bit virtual coordinates (signed, center relative)
+#* 2 vectors: virtual coordinates
+#(evt tussenstap + conversie naar 16 bit virtual)
+
+
+#step 1: generate virtual coords
+
+
+#step 2: virtual coords -> block adresses + fractional adresses
+#* mulhigh: real coords (x,y) (center relative)
+#* add center -> unsigned (top left relative)
+#* mullow: fractional part (x_frac, y_frac)
+#* mulhigh, mullow, pack 32bit: y_offset
+#* pack 32bit: x_offset
+#* add, shift, add start address: real addresses
+
+
+#step3: data fetch using generated addresses:
+# this step would be much simpler in 4x16bit rgba. life's a bitch..
+
+#step4: billinear interpolation
+
+#stat5: store
+
+
+
+ # this can be simplified by doing 32 bit unaligned moves
+ # and vector unpacking on the data
+
+
+
+ # cooked image data structure
+ # pixel environment temp storage
+ TL1 = 0x00
+ TL2 = 0x02
+ TL3 = 0x04
+ TL4 = 0x06
+ TR1 = 0x08
+ TR2 = 0x0A
+ TR3 = 0x0C
+ TR4 = 0x0E
+ BL1 = 0x10
+ BL2 = 0x12
+ BL3 = 0x14
+ BL4 = 0x16
+ BR1 = 0x18
+ BR2 = 0x1A
+ BR3 = 0x1C
+ BR4 = 0x1E
+ # addresses of pixel blocks
+ ADDRESS1 = 0x20
+ ADDRESS2 = 0x24
+ ADDRESS3 = 0x28
+ ADDRESS4 = 0x2C
+
+ # second env + address buffer (testing: not used)
+ SECONDBUFFER = 0x30
+
+ # 32bit splatted bitmap address
+ V2PLANEADDRESS = 0x60
+ # 16bit splatted image constants
+ V4TWOWIDTHM1 = 0x68
+ V4TWOHEIGHTM1 = 0x70
+ V4LINEOFFSET = 0x78
+ # data struct size
+ RESAMPLEDATASIZE = 0x80
+
+
+
+ # interpolation routine
+ # input: %mm0, %mm1 4 x 16bit unsigned top left relative virtual x and y coordinates
+ # %esi: temp & algo data structure
+
+getpixelsbilin: psrlw $1, %mm0 # convert to range 0->0x7fff [0,0.5[
+ psrlw $1, %mm1
+ movq %mm0, %mm2
+ movq %mm1, %mm3
+ movq V4TWOWIDTHM1(%esi), %mm4 # 2 * (width - 1)
+ movq V4TWOHEIGHTM1(%esi), %mm5 # 2 * (height - 1)
+ pmulhw %mm5, %mm3 # mm3 == y coord (topleft relative)
+ pmulhw %mm4, %mm2 # mm2 == x coord (topleft relative)
+ pmullw %mm5, %mm1 # mm1 == y frac (unsigned)
+ pmullw %mm4, %mm0 # mm0 == x frac (unsigned)
+
+ movq %mm3, %mm5 # copy y coord
+ pmullw V4LINEOFFSET(%esi), %mm3 # low part of line offset
+ pmulhw V4LINEOFFSET(%esi), %mm5 # high part of line offset
+
+ movq %mm2, %mm7 # copy x coord vector
+ pxor %mm4, %mm4
+ punpcklwd %mm4, %mm2 # low part in %mm2
+ punpckhwd %mm4, %mm7 # hight part in %mm7
+
+ movq %mm3, %mm6 # copy
+ punpcklwd %mm5, %mm3 # unpack low part in %mm3
+ punpckhwd %mm5, %mm6 # high part int %mm6
+
+ paddd %mm2, %mm3
+ paddd %mm7, %mm6
+ pslld $1, %mm3 # convert to word adresses
+ pslld $1, %mm6
+
+ paddd V2PLANEADDRESS(%esi), %mm3 # add pixel plane address
+ paddd V2PLANEADDRESS(%esi), %mm6
+
+ movq %mm3, ADDRESS1(%esi) # store adresses
+ movq %mm6, ADDRESS3(%esi)
+
+ pcmpeqw %mm2, %mm2 # all ones
+ movq %mm0, %mm4 # copy x frac
+ movq %mm1, %mm5 # copy y frac
+ pxor %mm2, %mm4 # compute compliment (approx negative)
+ pxor %mm2, %mm5
+
+ psrlw $1, %mm0 # shift right (0.5 * (frac x)
+ psrlw $1, %mm1 # shift right (0.5 * (frac y)
+ psrlw $1, %mm4 # shift right (0.5 * (1 - frac x)
+ psrlw $1, %mm5 # shift right (0.5 * (1 - frac y)
+
+ movq %mm0, %mm2 # copy of frac x
+ movq %mm4, %mm3 # copy of (1-frac x)
+ # fetch data
+
+ #jmp skipfetch # seems the fetch is the real killer. try to optimize this
+ # using 32 bit accesses & shifts
+
+ # the src image data struct is padded to the cooked data struct
+ movl RESAMPLEDATASIZE(%esi), %edi
+ shll $1, %edi
+
+ movl ADDRESS1(%esi), %ecx
+ movl ADDRESS2(%esi), %edx
+
+ movw (%ecx), %ax
+ movw (%edx), %bx
+ movw %ax, TL1(%esi)
+ movw %bx, TL2(%esi)
+ movw 2(%ecx), %ax
+ movw 2(%edx), %bx
+ movw %ax, TR1(%esi)
+ movw %bx, TR2(%esi)
+
+ addl %edi, %ecx
+ addl %edi, %edx
+
+ movw (%ecx), %ax
+ movw (%edx), %bx
+ movw %ax, BL1(%esi)
+ movw %bx, BL2(%esi)
+ movw 2(%ecx), %ax
+ movw 2(%edx), %bx
+ movw %ax, BR1(%esi)
+ movw %bx, BR2(%esi)
+
+
+ movl ADDRESS3(%esi), %ecx
+ movl ADDRESS4(%esi), %edx
+
+
+ movw (%ecx), %ax
+ movw (%edx), %bx
+ movw %ax, TL3(%esi)
+ movw %bx, TL4(%esi)
+ movw 2(%ecx), %ax
+ movw 2(%edx), %bx
+ movw %ax, TR3(%esi)
+ movw %bx, TR4(%esi)
+
+ addl %edi, %ecx
+ addl %edi, %edx
+
+ movw (%ecx), %ax
+ movw (%edx), %bx
+ movw %ax, BL3(%esi)
+ movw %bx, BL4(%esi)
+ movw 2(%ecx), %ax
+ movw 2(%edx), %bx
+ movw %ax, BR3(%esi)
+ movw %bx, BR4(%esi)
+
+
+skipfetch:
+ pmulhw TL1(%esi), %mm4 # bilin interpolation
+ pmulhw TR1(%esi), %mm0
+ pmulhw BL1(%esi), %mm3
+ pmulhw BR1(%esi), %mm2
+
+
+ paddw %mm4, %mm0
+ paddw %mm3, %mm2
+
+ pmulhw %mm5, %mm0
+ pmulhw %mm1, %mm2
+
+ paddw %mm2, %mm0
+ psllw $2, %mm0 # compensate for gain reduction
+
+ ret
+
+
+ // linear mapping data struct
+ ROWSTATEX = 0x0
+ ROWSTATEY = 0x8
+ COLSTATEX = 0x10
+ COLSTATEY = 0x18
+ ROWINCX = 0x20
+ ROWINCY = 0x28
+ COLINCX = 0x30
+ COLINCY = 0x38
+
+ // image data struct
+ LINEOFFSET = 0x0
+ IMAGEADDRESS = 0x4
+ WIDTH = 0x8
+ HEIGHT = 0xC
+ IMAGEDATASIZE = 0x10
+
+
+
+# pixel_resample_linmap_s16(void *x)
+.globl pixel_resample_linmap_s16
+.type pixel_resample_linmap_s16,@function
+
+ SOURCEIMAGE = RESAMPLEDATASIZE
+ DESTIMAGE = SOURCEIMAGE + IMAGEDATASIZE
+ LINMAPDATA = DESTIMAGE + IMAGEDATASIZE
+
+pixel_resample_linmap_s16:
+ pushl %ebp
+ movl %esp, %ebp
+ pushl %esi
+ pushl %edi
+ pushl %ebx
+
+
+ movl 8(%ebp), %esi # get data struct
+ movl DESTIMAGE+HEIGHT(%esi), %edx # image height
+ movl DESTIMAGE+IMAGEADDRESS(%esi), %edi # dest image address
+ movl DESTIMAGE+WIDTH(%esi), %ecx # image width
+ shrl $2, %ecx # vector count
+ .align 16
+
+linmap_looprow:
+ movq LINMAPDATA+ROWSTATEX(%esi), %mm0 # get current coordinates
+ movq LINMAPDATA+ROWSTATEY(%esi), %mm1
+
+linmap_loopcol:
+ movq %mm0, %mm4 # copy
+ movq %mm1, %mm5
+ paddd LINMAPDATA+ROWINCX(%esi), %mm4 # increment
+ paddd LINMAPDATA+ROWINCY(%esi), %mm5
+ movq %mm4, %mm6 # copy
+ movq %mm5, %mm7
+ paddd LINMAPDATA+ROWINCX(%esi), %mm6 # increment
+ paddd LINMAPDATA+ROWINCY(%esi), %mm7
+ movq %mm6, LINMAPDATA+ROWSTATEX(%esi) # store next state
+ movq %mm7, LINMAPDATA+ROWSTATEY(%esi)
+
+ psrad $16, %mm0 # round to 16 bit
+ psrad $16, %mm1
+ psrad $16, %mm4
+ psrad $16, %mm5
+ packssdw %mm4, %mm0 # pack new coordinates
+ packssdw %mm5, %mm1
+
+ push %ecx
+ push %edx
+ push %edi
+
+ call getpixelsbilin # do interpolation
+
+ pop %edi
+ pop %edx
+ pop %ecx
+ movq %mm0, (%edi) # store 4 pixels
+ addl $0x8, %edi # point to next 4 pixels
+ decl %ecx # dec row counter
+ jnz linmap_looprow
+
+ movq LINMAPDATA+COLSTATEX(%esi), %mm0 # get column state vector
+ movq LINMAPDATA+COLSTATEY(%esi), %mm1
+ movl DESTIMAGE+WIDTH(%esi), %ecx # image width
+ shrl $2, %ecx # vector count
+ paddd LINMAPDATA+COLINCX(%esi), %mm0 # increment
+ paddd LINMAPDATA+COLINCY(%esi), %mm1
+ movq %mm0, LINMAPDATA+COLSTATEX(%esi) # store
+ movq %mm1, LINMAPDATA+COLSTATEY(%esi)
+ decl %edx # dec column counter
+ jnz linmap_loopcol
+
+ emms
+ popl %ebx
+ popl %edi
+ popl %esi
+ leave
+ ret
+
+