system/mmx/pixel_resample_s16.s


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314

	
#interpolation data:
#* 4 vectors: neighbourhood for samples (TL, TR, BL, BR)
#* 2 vectors: fractional part (unsigned)
#* 2 vectors: addresses of pixel blocks

#coord conversion data:
#1 vector: 32bit splatted address	
#1 vector: 16bit splatted w-1
#1 vector: 16bit splatted h-1
#1 vector: 16bit splatted w (reuse w-1 with add?)
#1 dword:  32 bit line offset

#coord generation data:	several vectors for parameter update stuff..

#coordinate systems: 16 bit virtual coordinates (signed, center relative)
#* 2 vectors: virtual coordinates
#(evt tussenstap + conversie naar 16 bit virtual)


#step 1:	generate virtual coords

		
#step 2:	virtual coords -> block adresses + fractional adresses
#* mulhigh: real coords (x,y) (center relative)
#* add center -> unsigned (top left relative)
#* mullow: fractional part (x_frac, y_frac)
#* mulhigh, mullow, pack 32bit: y_offset
#* pack 32bit: x_offset
#* add, shift, add start address: real addresses
	

#step3:		data fetch using generated addresses: 
#		this step would be much simpler in 4x16bit rgba. life's a bitch..

#step4:		billinear interpolation

#stat5:		store


		# this can be simplified by doing 32 bit unaligned moves
		# and vector unpacking on the data

	
		# cooked image data structure
		# pixel environment temp storage
		TL1 = 0x00
		TL2 = 0x02
		TL3 = 0x04
		TL4 = 0x06
		TR1 = 0x08
		TR2 = 0x0A
		TR3 = 0x0C
		TR4 = 0x0E
		BL1 = 0x10
		BL2 = 0x12
		BL3 = 0x14
		BL4 = 0x16
		BR1 = 0x18
		BR2 = 0x1A
		BR3 = 0x1C
		BR4 = 0x1E
		# addresses of pixel blocks
		ADDRESS1  = 0x20
		ADDRESS2  = 0x24
		ADDRESS3  = 0x28
		ADDRESS4  = 0x2C

		# second env + address buffer (testing:	 not used)
		SECONDBUFFER = 0x30
	
		# 32bit splatted bitmap address
		V2PLANEADDRESS = 0x60
		# 16bit splatted image constants
		V4TWOWIDTHM1 = 0x68
		V4TWOHEIGHTM1 = 0x70
		V4LINEOFFSET = 0x78
		# data struct size
		RESAMPLEDATASIZE = 0x80
	
	
		# interpolation routine
		# input:	%mm0, %mm1 4 x 16bit unsigned top left relative virtual x and y coordinates
		#		%esi: temp & algo data structure

getpixelsbilin:	psrlw $1, %mm0			# convert to range 0->0x7fff [0,0.5[
		psrlw $1, %mm1
		movq %mm0, %mm2
		movq %mm1, %mm3
		movq V4TWOWIDTHM1(%esi), %mm4	# 2 * (width - 1)
		movq V4TWOHEIGHTM1(%esi), %mm5	# 2 * (height - 1)
		pmulhw %mm5, %mm3		# mm3 == y coord (topleft relative)
		pmulhw %mm4, %mm2		# mm2 == x coord (topleft relative)
		pmullw %mm5, %mm1		# mm1 == y frac (unsigned)
		pmullw %mm4, %mm0		# mm0 == x frac (unsigned)

		movq %mm3, %mm5			# copy y coord 
		pmullw V4LINEOFFSET(%esi), %mm3	# low part of line offset
		pmulhw V4LINEOFFSET(%esi), %mm5	# high part of line offset

		movq %mm2, %mm7			# copy x coord vector
		pxor %mm4, %mm4
		punpcklwd %mm4, %mm2		# low part in %mm2
		punpckhwd %mm4, %mm7		# hight part in %mm7
	
		movq %mm3, %mm6			# copy
		punpcklwd %mm5, %mm3		# unpack low part in %mm3
		punpckhwd %mm5, %mm6		# high part int %mm6

		paddd %mm2, %mm3
		paddd %mm7, %mm6
		pslld $1, %mm3			# convert to word adresses
		pslld $1, %mm6

		paddd V2PLANEADDRESS(%esi), %mm3	# add pixel plane address
		paddd V2PLANEADDRESS(%esi), %mm6

		movq %mm3, ADDRESS1(%esi)	# store adresses
		movq %mm6, ADDRESS3(%esi)

		pcmpeqw %mm2, %mm2		# all ones
		movq %mm0, %mm4			# copy x frac
		movq %mm1, %mm5			# copy y frac
		pxor %mm2, %mm4			# compute compliment (approx negative)
		pxor %mm2, %mm5

		psrlw $1, %mm0			# shift right (0.5 * (frac x)
		psrlw $1, %mm1			# shift right (0.5 * (frac y)
		psrlw $1, %mm4			# shift right (0.5 * (1 - frac x)
		psrlw $1, %mm5			# shift right (0.5 * (1 - frac y)

		movq %mm0, %mm2			# copy of frac x
		movq %mm4, %mm3			# copy of (1-frac x)
						# fetch data

		#jmp skipfetch			# seems the fetch is the real killer. try to optimize this
						# using 32 bit accesses & shifts

						# the src image data struct is padded to the cooked data struct
		movl RESAMPLEDATASIZE(%esi), %edi
		shll $1, %edi

		movl ADDRESS1(%esi), %ecx 
		movl ADDRESS2(%esi), %edx
	
		movw (%ecx), %ax
		movw (%edx), %bx
		movw %ax, TL1(%esi)
		movw %bx, TL2(%esi)
		movw 2(%ecx), %ax
		movw 2(%edx), %bx
		movw %ax, TR1(%esi)
		movw %bx, TR2(%esi)

		addl %edi, %ecx
		addl %edi, %edx

		movw (%ecx), %ax
		movw (%edx), %bx
		movw %ax, BL1(%esi)
		movw %bx, BL2(%esi)
		movw 2(%ecx), %ax
		movw 2(%edx), %bx
		movw %ax, BR1(%esi)
		movw %bx, BR2(%esi)

		
		movl ADDRESS3(%esi), %ecx 
		movl ADDRESS4(%esi), %edx


		movw (%ecx), %ax
		movw (%edx), %bx
		movw %ax, TL3(%esi)
		movw %bx, TL4(%esi)
		movw 2(%ecx), %ax
		movw 2(%edx), %bx
		movw %ax, TR3(%esi)
		movw %bx, TR4(%esi)
	
		addl %edi, %ecx
		addl %edi, %edx

		movw (%ecx), %ax
		movw (%edx), %bx
		movw %ax, BL3(%esi)
		movw %bx, BL4(%esi)
		movw 2(%ecx), %ax
		movw 2(%edx), %bx
		movw %ax, BR3(%esi)
		movw %bx, BR4(%esi)

	
skipfetch:	
		pmulhw TL1(%esi), %mm4		# bilin interpolation
		pmulhw TR1(%esi), %mm0
		pmulhw BL1(%esi), %mm3
		pmulhw BR1(%esi), %mm2


		paddw %mm4, %mm0
		paddw %mm3, %mm2

		pmulhw %mm5, %mm0
		pmulhw %mm1, %mm2

		paddw %mm2, %mm0
		psllw $2, %mm0			# compensate for gain reduction

		ret


		// linear mapping data struct
		ROWSTATEX = 0x0
		ROWSTATEY = 0x8
		COLSTATEX = 0x10
		COLSTATEY = 0x18
		ROWINCX = 0x20		
		ROWINCY = 0x28
		COLINCX = 0x30		
		COLINCY = 0x38

		// image data struct
		LINEOFFSET = 0x0
		IMAGEADDRESS = 0x4
		WIDTH = 0x8
		HEIGHT = 0xC
		IMAGEDATASIZE = 0x10
		

# pixel_resample_linmap_s16(void *x)		
.globl pixel_resample_linmap_s16
.type  pixel_resample_linmap_s16,@function

		SOURCEIMAGE = RESAMPLEDATASIZE
		DESTIMAGE = SOURCEIMAGE + IMAGEDATASIZE
		LINMAPDATA = DESTIMAGE + IMAGEDATASIZE
	
pixel_resample_linmap_s16:	
		pushl %ebp
		movl %esp, %ebp
		pushl %esi
		pushl %edi
		pushl %ebx


		movl 8(%ebp),  %esi			# get data struct
		movl DESTIMAGE+HEIGHT(%esi), %edx	# image height
		movl DESTIMAGE+IMAGEADDRESS(%esi), %edi # dest image address
		movl DESTIMAGE+WIDTH(%esi), %ecx	# image width
		shrl $2, %ecx				# vector count
		.align 16
	
linmap_looprow:
		movq LINMAPDATA+ROWSTATEX(%esi), %mm0	# get current coordinates
		movq LINMAPDATA+ROWSTATEY(%esi), %mm1

linmap_loopcol:		
		movq %mm0, %mm4				# copy
		movq %mm1, %mm5
		paddd LINMAPDATA+ROWINCX(%esi), %mm4	# increment
		paddd LINMAPDATA+ROWINCY(%esi), %mm5
		movq %mm4, %mm6				# copy
		movq %mm5, %mm7	
		paddd LINMAPDATA+ROWINCX(%esi), %mm6	# increment
		paddd LINMAPDATA+ROWINCY(%esi), %mm7
		movq %mm6, LINMAPDATA+ROWSTATEX(%esi)	# store next state
		movq %mm7, LINMAPDATA+ROWSTATEY(%esi) 

		psrad $16, %mm0				# round to 16 bit
		psrad $16, %mm1
		psrad $16, %mm4
		psrad $16, %mm5
		packssdw %mm4, %mm0			# pack new coordinates
		packssdw %mm5, %mm1
	
		push %ecx
		push %edx
		push %edi
	
		call getpixelsbilin			# do interpolation

		pop %edi
		pop %edx
		pop %ecx
		movq %mm0, (%edi)			# store 4 pixels
		addl $0x8, %edi				# point to next 4 pixels
		decl %ecx				# dec row counter
		jnz linmap_looprow

		movq LINMAPDATA+COLSTATEX(%esi), %mm0	# get column state vector
		movq LINMAPDATA+COLSTATEY(%esi), %mm1
		movl DESTIMAGE+WIDTH(%esi), %ecx	# image width
		shrl $2, %ecx				# vector count
		paddd LINMAPDATA+COLINCX(%esi), %mm0	# increment
		paddd LINMAPDATA+COLINCY(%esi), %mm1
		movq %mm0, LINMAPDATA+COLSTATEX(%esi)	# store
		movq %mm1, LINMAPDATA+COLSTATEY(%esi)
		decl %edx				# dec column counter
		jnz linmap_loopcol
		
		emms
		popl %ebx
		popl %edi
		popl %esi
		leave
		ret