# system/mmx/pixel_crot_s16.s

#    Pure Data Packet mmx routine.
#    Copyright (c) by Tom Schouten <tom@zwizwa.be>
# 
#    This program is free software; you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation; either version 2 of the License, or
#    (at your option) any later version.
# 
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
# 
#    You should have received a copy of the GNU General Public License
#    along with this program; if not, write to the Free Software
#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
.globl pixel_crot3d_s16
.type  pixel_crot3d_s16,@function


# 3 dimensional colour space rotation
# 3x3 matrix is column encoded, each coefficient is a 4x16 bit fixed point vector
#
# void pixel_crot3d_s16(int *buf, int nb_4pixel_vectors_per_plane, short int *matrix)
#
# Syntax: GNU as, AT&T operand order, 32-bit x86 with MMX.
# In:     8(%ebp)  buf    three planes stored back to back, each plane is
#                         nb_4pixel_vectors_per_plane * 8 bytes; updated in place
#         12(%ebp) count  number of 8-byte (4 x s16) vectors per plane
#         16(%ebp) matrix nine 8-byte coefficient vectors at offsets 0..64:
#                         0/8/16   scale component 1 into outputs 1/2/3
#                         24/32/40 scale component 2 into outputs 1/2/3
#                         48/56/64 scale component 3 into outputs 1/2/3
# Out:    none (buf is overwritten)
# Clobb:  %ecx, %edx, %mm0-%mm7, flags; %esi and %edi are saved and restored
#
# Arithmetic: pmulhw keeps the high 16 bits of each signed 16x16 product,
# partial sums use saturating adds, and the sum is doubled (saturating)
# before it is stored.
#
# A count <= 0 returns immediately without touching buf.  The loop tests
# the counter only after decrementing it, so without this check a zero
# count would wrap around and run far past the end of the buffer.

pixel_crot3d_s16:
	pushl %ebp
	movl %esp, %ebp
	pushl %esi
	pushl %edi

	movl 8(%ebp),  %esi		# %esi = current vector in plane 1
	movl 12(%ebp), %ecx		# %ecx = vectors left to process
	movl 16(%ebp), %edi		# %edi = coefficient table

	testl %ecx, %ecx
	jle .Lcrot3d_done		# nothing to do for an empty (or negative) count

	movl %ecx, %edx
	shll $3, %edx			# %edx = plane spacing in bytes (count * 8)

	.p2align 4
.Lcrot3d_loop:

	movq (%esi), %mm0		# get 1st component
	movq (%esi,%edx,1), %mm6	# get 2nd component
	movq (%esi,%edx,2), %mm7	# get 3rd component

	movq %mm0, %mm1			# copy 1st component
	movq %mm0, %mm2

	pmulhw (%edi), %mm0		# mul first column
	pmulhw 8(%edi), %mm1
	pmulhw 16(%edi), %mm2

	movq %mm6, %mm5			# copy 2nd component
	movq %mm6, %mm3

	pmulhw 24(%edi), %mm6		# mul second column
	pmulhw 32(%edi), %mm5
	pmulhw 40(%edi), %mm3

	paddsw %mm6, %mm0		# accumulate
	paddsw %mm5, %mm1
	paddsw %mm3, %mm2

	movq %mm7, %mm4			# copy 3rd component
	movq %mm7, %mm6			# %mm6 is free again after the accumulate above

	pmulhw 48(%edi), %mm4		# mul third column
	pmulhw 56(%edi), %mm6
	pmulhw 64(%edi), %mm7

	paddsw %mm4, %mm0		# accumulate
	paddsw %mm6, %mm1
	paddsw %mm7, %mm2

	paddsw %mm0, %mm0		# double (fixed point normalization)
	paddsw %mm1, %mm1
	paddsw %mm2, %mm2

	movq %mm0, (%esi)		# store results over the inputs
	movq %mm1, (%esi, %edx, 1)
	movq %mm2, (%esi, %edx, 2)

	addl $8, %esi			# advance to the next 4-pixel vector
	decl %ecx
	jnz .Lcrot3d_loop		# loop while vectors remain

	emms				# leave MMX state so x87 code can run again

.Lcrot3d_done:
	popl %edi
	popl %esi
	leave
	ret

.size pixel_crot3d_s16, .-pixel_crot3d_s16
	

.globl pixel_crot2d_s16
.type  pixel_crot2d_s16,@function

# 2 dimensional colour space rotation
# 2x2 matrix is column encoded, each coefficient is a 4x16 bit fixed point vector
#
# void pixel_crot2d_s16(int *buf, int nb_4pixel_vectors_per_plane, short int *matrix)
#
# Syntax: GNU as, AT&T operand order, 32-bit x86 with MMX.
# In:     8(%ebp)  buf    two planes stored back to back, each plane is
#                         nb_4pixel_vectors_per_plane * 8 bytes; updated in place
#         12(%ebp) count  number of 8-byte (4 x s16) vectors per plane
#         16(%ebp) matrix four 8-byte coefficient vectors at offsets 0..24:
#                         0/8   scale component 1 into outputs 1/2
#                         16/24 scale component 2 into outputs 1/2
# Out:    none (buf is overwritten)
# Clobb:  %ecx, %edx, %mm0-%mm3, flags; %esi and %edi are saved and restored
#
# Arithmetic: pmulhw keeps the high 16 bits of each signed 16x16 product,
# partial sums use saturating adds, and the sum is doubled (saturating)
# before it is stored.
#
# A count <= 0 returns immediately without touching buf.  The loop tests
# the counter only after decrementing it, so without this check a zero
# count would wrap around and run far past the end of the buffer.

pixel_crot2d_s16:
	pushl %ebp
	movl %esp, %ebp
	pushl %esi
	pushl %edi

	movl 8(%ebp),  %esi		# %esi = current vector in plane 1
	movl 12(%ebp), %ecx		# %ecx = vectors left to process
	movl 16(%ebp), %edi		# %edi = coefficient table

	testl %ecx, %ecx
	jle .Lcrot2d_done		# nothing to do for an empty (or negative) count

	movl %ecx, %edx
	shll $3, %edx			# %edx = plane spacing in bytes (count * 8)

	.p2align 4
.Lcrot2d_loop:

	movq (%esi), %mm0		# get 1st component
	movq (%esi,%edx,1), %mm2	# get 2nd component

	movq %mm0, %mm1			# copy 1st component
	movq %mm2, %mm3			# copy 2nd component

	pmulhw (%edi), %mm0		# mul first column
	pmulhw 8(%edi), %mm1

	pmulhw 16(%edi), %mm2		# mul second column
	pmulhw 24(%edi), %mm3

	paddsw %mm2, %mm0		# accumulate
	paddsw %mm3, %mm1

	paddsw %mm0, %mm0		# double (fixed point gain correction)
	paddsw %mm1, %mm1

	movq %mm0, (%esi)		# store results over the inputs
	movq %mm1, (%esi, %edx, 1)

	addl $8, %esi			# advance to the next 4-pixel vector
	decl %ecx
	jnz .Lcrot2d_loop		# loop while vectors remain

	emms				# leave MMX state so x87 code can run again

.Lcrot2d_done:
	popl %edi
	popl %esi
	leave
	ret

.size pixel_crot2d_s16, .-pixel_crot2d_s16