1 files changed, 314 insertions, 0 deletions
diff --git a/system/mmx/pixel_resample_s16.s b/system/mmx/pixel_resample_s16.s
new file mode 100644
index 0000000..3959f9c
--- /dev/null
+++ b/system/mmx/pixel_resample_s16.s
@@ -0,0 +1,314 @@
+	
+
+#interpolation data:
+#* 4 vectors: neighbourhood for samples (TL, TR, BL, BR)
+#* 2 vectors: fractional part (unsigned)
+#* 2 vectors: addresses of pixel blocks
+
+#coord conversion data:
+#1 vector: 32bit splatted address	
+#1 vector: 16bit splatted w-1
+#1 vector: 16bit splatted h-1
+#1 vector: 16bit splatted w (reuse w-1 with add?)
+#1 dword:  32 bit line offset
+
+#coord generation data:	several vectors for parameter update stuff..
+
+#coordinate systems: 16 bit virtual coordinates (signed, center relative)
+#* 2 vectors: virtual coordinates
+#(possibly an intermediate step + conversion to 16 bit virtual)
+
+
+#step 1:	generate virtual coords
+
+		
+#step 2:	virtual coords -> block addresses + fractional addresses
+#* mulhigh: real coords (x,y) (center relative)
+#* add center -> unsigned (top left relative)
+#* mullow: fractional part (x_frac, y_frac)
+#* mulhigh, mullow, pack 32bit: y_offset
+#* pack 32bit: x_offset
+#* add, shift, add start address: real addresses
+	
+
+#step3:		data fetch using generated addresses: 
+#		this step would be much simpler in 4x16bit rgba. life's a bitch..
+
+#step4:		bilinear interpolation
+
+#step5:		store
+
+
+
+		# this can be simplified by doing 32 bit unaligned moves
+		# and vector unpacking on the data
+
+	
+
+		# cooked image data structure: byte offsets from the pointer held in %esi
+		# scratch for the fetched pixel neighbourhood (top/bottom, left/right; 4 words each)
+		.equ TL1, 0x00
+		.equ TL2, 0x02
+		.equ TL3, 0x04
+		.equ TL4, 0x06
+		.equ TR1, 0x08
+		.equ TR2, 0x0A
+		.equ TR3, 0x0C
+		.equ TR4, 0x0E
+		.equ BL1, 0x10
+		.equ BL2, 0x12
+		.equ BL3, 0x14
+		.equ BL4, 0x16
+		.equ BR1, 0x18
+		.equ BR2, 0x1A
+		.equ BR3, 0x1C
+		.equ BR4, 0x1E
+		# the four pixel block addresses (one dword each)
+		.equ ADDRESS1, 0x20
+		.equ ADDRESS2, 0x24
+		.equ ADDRESS3, 0x28
+		.equ ADDRESS4, 0x2C
+
+		# room for a second neighbourhood + address buffer (testing: not used)
+		.equ SECONDBUFFER, 0x30
+	
+		# bitmap address, splatted into 2 x 32bit
+		.equ V2PLANEADDRESS, 0x60
+		# image constants, splatted into 4 x 16bit
+		.equ V4TWOWIDTHM1, 0x68
+		.equ V4TWOHEIGHTM1, 0x70
+		.equ V4LINEOFFSET, 0x78
+		# total size of this structure
+		.equ RESAMPLEDATASIZE, 0x80
+	
+	
+
+		# getpixelsbilin: bilinear interpolation of 4 pixels (register-argument helper, reached via call)
+		# input:	%mm0, %mm1 4 x 16bit unsigned top left relative virtual x and y coordinates
+		#		%esi: temp & algo data structure
+		# output:	%mm0 = 4 x 16bit results; overwrites %mm1-%mm7, %ax, %bx, %ecx, %edx, %edi and the TL1..ADDRESS4 scratch area
+getpixelsbilin:	psrlw $1, %mm0			# convert to range 0->0x7fff [0,0.5[
+		psrlw $1, %mm1
+		movq %mm0, %mm2
+		movq %mm1, %mm3
+		movq V4TWOWIDTHM1(%esi), %mm4	# 2 * (width - 1)
+		movq V4TWOHEIGHTM1(%esi), %mm5	# 2 * (height - 1)
+		pmulhw %mm5, %mm3		# mm3 == y coord (topleft relative)
+		pmulhw %mm4, %mm2		# mm2 == x coord (topleft relative)
+		pmullw %mm5, %mm1		# mm1 == y frac (unsigned)
+		pmullw %mm4, %mm0		# mm0 == x frac (unsigned)
+		# y coord * line offset, computed as separate low and high 16 bit halves
+		movq %mm3, %mm5			# copy y coord 
+		pmullw V4LINEOFFSET(%esi), %mm3	# low part of line offset
+		pmulhw V4LINEOFFSET(%esi), %mm5	# high part of line offset
+
+		movq %mm2, %mm7			# copy x coord vector
+		pxor %mm4, %mm4			# zero, used to widen x to 32 bit
+		punpcklwd %mm4, %mm2		# low part in %mm2
+		punpckhwd %mm4, %mm7		# high part in %mm7
+	
+		movq %mm3, %mm6			# copy
+		punpcklwd %mm5, %mm3		# unpack low part in %mm3
+		punpckhwd %mm5, %mm6		# high part in %mm6
+		# offset = y * line offset + x (pixels 1,2 in %mm3, pixels 3,4 in %mm6)
+		paddd %mm2, %mm3
+		paddd %mm7, %mm6
+		pslld $1, %mm3			# offset * 2 (pixels are fetched as 16 bit words below)
+		pslld $1, %mm6
+
+		paddd V2PLANEADDRESS(%esi), %mm3	# add pixel plane address
+		paddd V2PLANEADDRESS(%esi), %mm6
+
+		movq %mm3, ADDRESS1(%esi)	# store addresses (fills ADDRESS1 and ADDRESS2)
+		movq %mm6, ADDRESS3(%esi)	# (fills ADDRESS3 and ADDRESS4)
+
+		pcmpeqw %mm2, %mm2		# all ones
+		movq %mm0, %mm4			# copy x frac
+		movq %mm1, %mm5			# copy y frac
+		pxor %mm2, %mm4			# compute complement (approx negative)
+		pxor %mm2, %mm5
+
+		psrlw $1, %mm0			# shift right (0.5 * frac x)
+		psrlw $1, %mm1			# shift right (0.5 * frac y)
+		psrlw $1, %mm4			# shift right (0.5 * (1 - frac x))
+		psrlw $1, %mm5			# shift right (0.5 * (1 - frac y))
+
+		movq %mm0, %mm2			# copy of frac x
+		movq %mm4, %mm3			# copy of (1-frac x)
+						# fetch data
+
+		#jmp skipfetch			# seems the fetch is the real killer. try to optimize this
+						# using 32 bit accesses & shifts
+
+						# the src image data struct is padded to the cooked data struct
+		movl RESAMPLEDATASIZE(%esi), %edi	# = SOURCEIMAGE+LINEOFFSET as defined further down in this file
+		shll $1, %edi				# * 2, same scaling as the pslld $1 above
+
+		movl ADDRESS1(%esi), %ecx 		# pixel 1
+		movl ADDRESS2(%esi), %edx		# pixel 2
+	
+		movw (%ecx), %ax			# top left
+		movw (%edx), %bx
+		movw %ax, TL1(%esi)
+		movw %bx, TL2(%esi)
+		movw 2(%ecx), %ax			# top right (next word)
+		movw 2(%edx), %bx
+		movw %ax, TR1(%esi)
+		movw %bx, TR2(%esi)
+
+		addl %edi, %ecx				# advance by the scaled line offset
+		addl %edi, %edx
+
+		movw (%ecx), %ax			# bottom left
+		movw (%edx), %bx
+		movw %ax, BL1(%esi)
+		movw %bx, BL2(%esi)
+		movw 2(%ecx), %ax			# bottom right
+		movw 2(%edx), %bx
+		movw %ax, BR1(%esi)
+		movw %bx, BR2(%esi)
+
+		
+		movl ADDRESS3(%esi), %ecx 		# pixel 3
+		movl ADDRESS4(%esi), %edx		# pixel 4
+
+
+		movw (%ecx), %ax			# top left
+		movw (%edx), %bx
+		movw %ax, TL3(%esi)
+		movw %bx, TL4(%esi)
+		movw 2(%ecx), %ax			# top right
+		movw 2(%edx), %bx
+		movw %ax, TR3(%esi)
+		movw %bx, TR4(%esi)
+	
+		addl %edi, %ecx				# advance by the scaled line offset
+		addl %edi, %edx
+
+		movw (%ecx), %ax			# bottom left
+		movw (%edx), %bx
+		movw %ax, BL3(%esi)
+		movw %bx, BL4(%esi)
+		movw 2(%ecx), %ax			# bottom right
+		movw 2(%edx), %bx
+		movw %ax, BR3(%esi)
+		movw %bx, BR4(%esi)
+
+	
+skipfetch:	
+		pmulhw TL1(%esi), %mm4		# bilin interpolation: TL * 0.5 * (1 - frac x)
+		pmulhw TR1(%esi), %mm0		# TR * 0.5 * frac x
+		pmulhw BL1(%esi), %mm3		# BL * 0.5 * (1 - frac x)
+		pmulhw BR1(%esi), %mm2		# BR * 0.5 * frac x
+
+
+		paddw %mm4, %mm0		# top row blend
+		paddw %mm3, %mm2		# bottom row blend
+
+		pmulhw %mm5, %mm0		# top * 0.5 * (1 - frac y)
+		pmulhw %mm1, %mm2		# bottom * 0.5 * frac y
+
+		paddw %mm2, %mm0
+		psllw $2, %mm0			# compensate for gain reduction (two factors of 0.5)
+
+		ret
+
+
+		# linear mapping data struct (offsets relative to LINMAPDATA; 8 bytes per field)
+		.equ ROWSTATEX, 0x0
+		.equ ROWSTATEY, 0x8
+		.equ COLSTATEX, 0x10
+		.equ COLSTATEY, 0x18
+		.equ ROWINCX, 0x20
+		.equ ROWINCY, 0x28
+		.equ COLINCX, 0x30
+		.equ COLINCY, 0x38
+
+		# image data struct (one dword per field)
+		.equ LINEOFFSET, 0x0
+		.equ IMAGEADDRESS, 0x4
+		.equ WIDTH, 0x8
+		.equ HEIGHT, 0xC
+		.equ IMAGEDATASIZE, 0x10
+		
+
+
+# pixel_resample_linmap_s16(void *x): fill the dest image with bilinearly resampled source pixels
+.globl pixel_resample_linmap_s16
+.type  pixel_resample_linmap_s16,@function
+# x (stack argument) = cooked resample data, then source image, dest image and linear map structs
+		SOURCEIMAGE = RESAMPLEDATASIZE
+		DESTIMAGE = SOURCEIMAGE + IMAGEDATASIZE
+		LINMAPDATA = DESTIMAGE + IMAGEDATASIZE
+# writes width/4 vectors of 4 pixels per row; returns at once when that count or the height is zero
+pixel_resample_linmap_s16:	
+		pushl %ebp
+		movl %esp, %ebp
+		pushl %esi
+		pushl %edi
+		pushl %ebx				# %bx is overwritten by getpixelsbilin
+		# loop counters: %ecx = 4-pixel vectors left in the row, %edx = rows left
+		movl 8(%ebp),  %esi			# get data struct
+		movl DESTIMAGE+HEIGHT(%esi), %edx	# image height
+		movl DESTIMAGE+IMAGEADDRESS(%esi), %edi # dest image address
+		movl DESTIMAGE+WIDTH(%esi), %ecx	# image width
+		shrl $2, %ecx				# vector count (sets ZF when it is zero)
+		jz linmap_done				# no full vector: decl/jnz below would wrap around
+		testl %edx, %edx
+		jz linmap_done				# no rows: same wrap hazard for %edx
+		.align 16
+linmap_looprow:
+		movq LINMAPDATA+ROWSTATEX(%esi), %mm0	# get current coordinates
+		movq LINMAPDATA+ROWSTATEY(%esi), %mm1
+linmap_loopcol:		
+		movq %mm0, %mm4				# copy
+		movq %mm1, %mm5
+		paddd LINMAPDATA+ROWINCX(%esi), %mm4	# increment
+		paddd LINMAPDATA+ROWINCY(%esi), %mm5
+		movq %mm4, %mm6				# copy
+		movq %mm5, %mm7	
+		paddd LINMAPDATA+ROWINCX(%esi), %mm6	# increment
+		paddd LINMAPDATA+ROWINCY(%esi), %mm7
+		movq %mm6, LINMAPDATA+ROWSTATEX(%esi)	# store next state
+		movq %mm7, LINMAPDATA+ROWSTATEY(%esi) 
+		psrad $16, %mm0				# round to 16 bit
+		psrad $16, %mm1
+		psrad $16, %mm4
+		psrad $16, %mm5
+		packssdw %mm4, %mm0			# pack new coordinates
+		packssdw %mm5, %mm1
+	
+		push %ecx				# getpixelsbilin overwrites %ecx, %edx and %edi
+		push %edx
+		push %edi
+	
+		call getpixelsbilin			# do interpolation
+
+		pop %edi
+		pop %edx
+		pop %ecx
+		movq %mm0, (%edi)			# store 4 pixels
+		addl $0x8, %edi				# point to next 4 pixels
+		decl %ecx				# one vector of this row done
+		jnz linmap_looprow
+
+		# row finished: step the column state; it enters linmap_loopcol as the next row's start
+		movq LINMAPDATA+COLSTATEX(%esi), %mm0	# get column state vector
+		movq LINMAPDATA+COLSTATEY(%esi), %mm1
+		movl DESTIMAGE+WIDTH(%esi), %ecx	# image width
+		shrl $2, %ecx				# vector count
+		paddd LINMAPDATA+COLINCX(%esi), %mm0	# increment
+		paddd LINMAPDATA+COLINCY(%esi), %mm1
+		movq %mm0, LINMAPDATA+COLSTATEX(%esi)	# store
+		movq %mm1, LINMAPDATA+COLSTATEY(%esi)
+		decl %edx				# one row done
+		jnz linmap_loopcol
+linmap_done:
+		emms
+		popl %ebx
+		popl %edi
+		popl %esi
+		leave
+		ret
+
+