@@ -31,6 +31,268 @@ namespace lsp
3131 namespace sse
3232 {
3333
        /* Parametric (per-sample) mix of two signals, in place:
         *     dst[i] = dst[i] + (src[i] - dst[i]) * k[i]
         * i.e. a linear cross-fade between dst and src driven by an individual
         * mix coefficient per sample (k = 0 keeps dst, k = 1 takes src).
         *
         * @param dst   destination buffer, read and updated in place
         * @param src   source buffer to mix in
         * @param k     buffer of per-sample mix coefficients
         * @param count number of float samples to process
         *
         * Uses unaligned loads/stores (movups/movss), so no buffer alignment
         * is required. Processes 8 samples per iteration, then a 4-sample
         * block, then single samples for the remainder.
         */
        void pmix_v1(float *dst, const float *src, const float *k, size_t count)
        {
            IF_ARCH_X86(size_t off);    // common byte offset into dst/src/k
            ARCH_X86_ASM
            (
                __ASM_EMIT("xor         %[off], %[off]")
                // 8x blocks: count -= 8; if it underflowed, fewer than 8 remain
                __ASM_EMIT("sub         $8, %[count]")
                __ASM_EMIT("jb          2f")
                __ASM_EMIT("1:")
                __ASM_EMIT("movups      0x00(%[src], %[off]), %%xmm0")  /* xmm0 = s */
                __ASM_EMIT("movups      0x10(%[src], %[off]), %%xmm1")
                __ASM_EMIT("movups      0x00(%[dst], %[off]), %%xmm2")  /* xmm2 = d */
                __ASM_EMIT("movups      0x10(%[dst], %[off]), %%xmm3")
                __ASM_EMIT("movups      0x00(%[k], %[off]), %%xmm4")    /* xmm4 = k */
                __ASM_EMIT("movups      0x10(%[k], %[off]), %%xmm5")
                __ASM_EMIT("subps       %%xmm2, %%xmm0")                /* xmm0 = s - d */
                __ASM_EMIT("subps       %%xmm3, %%xmm1")
                __ASM_EMIT("mulps       %%xmm4, %%xmm0")                /* xmm0 = (s-d) * k */
                __ASM_EMIT("mulps       %%xmm5, %%xmm1")
                __ASM_EMIT("addps       %%xmm2, %%xmm0")                /* xmm0 = mix = d + (s-d) * k */
                __ASM_EMIT("addps       %%xmm3, %%xmm1")
                __ASM_EMIT("movups      %%xmm0, 0x00(%[dst], %[off])")
                __ASM_EMIT("movups      %%xmm1, 0x10(%[dst], %[off])")
                __ASM_EMIT("add         $0x20, %[off]")                 /* 8 floats = 0x20 bytes */
                __ASM_EMIT("sub         $8, %[count]")
                __ASM_EMIT("jae         1b")
                // 4x block: count is now remainder-8; +4 < 0 means fewer than 4 left
                __ASM_EMIT("2:")
                __ASM_EMIT("add         $4, %[count]")
                __ASM_EMIT("jl          4f")
                __ASM_EMIT("movups      0x00(%[src], %[off]), %%xmm0")  /* xmm0 = s */
                __ASM_EMIT("movups      0x00(%[dst], %[off]), %%xmm2")  /* xmm2 = d */
                __ASM_EMIT("movups      0x00(%[k], %[off]), %%xmm4")    /* xmm4 = k */
                __ASM_EMIT("subps       %%xmm2, %%xmm0")                /* xmm0 = s - d */
                __ASM_EMIT("mulps       %%xmm4, %%xmm0")                /* xmm0 = (s-d) * k */
                __ASM_EMIT("addps       %%xmm2, %%xmm0")                /* xmm0 = mix = d + (s-d) * k */
                __ASM_EMIT("movups      %%xmm0, 0x00(%[dst], %[off])")
                __ASM_EMIT("sub         $4, %[count]")
                __ASM_EMIT("add         $0x10, %[off]")
                // 1x blocks: bias count so 'dec/jge' executes exactly remainder times
                __ASM_EMIT("4:")
                __ASM_EMIT("add         $3, %[count]")
                __ASM_EMIT("jl          6f")
                __ASM_EMIT("5:")
                __ASM_EMIT("movss       0x00(%[src], %[off]), %%xmm0")  /* xmm0 = s */
                __ASM_EMIT("movss       0x00(%[dst], %[off]), %%xmm2")  /* xmm2 = d */
                __ASM_EMIT("movss       0x00(%[k], %[off]), %%xmm4")    /* xmm4 = k */
                __ASM_EMIT("subss       %%xmm2, %%xmm0")                /* xmm0 = s - d */
                __ASM_EMIT("mulss       %%xmm4, %%xmm0")                /* xmm0 = (s-d) * k */
                __ASM_EMIT("addss       %%xmm2, %%xmm0")                /* xmm0 = mix = d + (s-d) * k */
                __ASM_EMIT("movss       %%xmm0, 0x00(%[dst], %[off])")
                __ASM_EMIT("add         $0x04, %[off]")
                __ASM_EMIT("dec         %[count]")
                __ASM_EMIT("jge         5b")
                // End
                __ASM_EMIT("6:")
                : [off] "=&r" (off), [count] "+r" (count)
                : [dst] "r" (dst), [src] "r" (src), [k] "r" (k)
                : "cc", "memory",
                  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
                  "%xmm4", "%xmm5"
            );
        }
98+
        /* Parametric (per-sample) mix of two source signals into a destination:
         *     dst[i] = src1[i] + (src2[i] - src1[i]) * k[i]
         * i.e. a linear cross-fade between src1 and src2 driven by an individual
         * mix coefficient per sample (k = 0 takes src1, k = 1 takes src2).
         *
         * @param dst   destination buffer (write-only)
         * @param src1  first source buffer (the "dry" signal)
         * @param src2  second source buffer (the "wet" signal)
         * @param k     buffer of per-sample mix coefficients
         * @param count number of float samples to process
         *
         * NOTE(review): unlike pmix_v1, the count/off arithmetic uses the
         * __ASM_EMIT32/__ASM_EMIT64 split and __ASM_ARG_RW — presumably because
         * four pointer operands exhaust the i386 register set, forcing [count]
         * into memory on 32-bit builds; confirm against the macro definitions.
         */
        void pmix_v2(float *dst, const float *src1, const float *src2, const float *k, size_t count)
        {
            IF_ARCH_X86(size_t off);    // common byte offset into all four buffers
            ARCH_X86_ASM
            (
                __ASM_EMIT("xor         %[off], %[off]")
                // 8x blocks: count -= 8; if it underflowed, fewer than 8 remain
                __ASM_EMIT32("subl      $8, %[count]")
                __ASM_EMIT64("sub       $8, %[count]")
                __ASM_EMIT("jb          2f")
                __ASM_EMIT("1:")
                __ASM_EMIT("movups      0x00(%[src2], %[off]), %%xmm0")     /* xmm0 = s */
                __ASM_EMIT("movups      0x10(%[src2], %[off]), %%xmm1")
                __ASM_EMIT("movups      0x00(%[src1], %[off]), %%xmm2")     /* xmm2 = d */
                __ASM_EMIT("movups      0x10(%[src1], %[off]), %%xmm3")
                __ASM_EMIT("movups      0x00(%[k], %[off]), %%xmm4")        /* xmm4 = k */
                __ASM_EMIT("movups      0x10(%[k], %[off]), %%xmm5")
                __ASM_EMIT("subps       %%xmm2, %%xmm0")                    /* xmm0 = s - d */
                __ASM_EMIT("subps       %%xmm3, %%xmm1")
                __ASM_EMIT("mulps       %%xmm4, %%xmm0")                    /* xmm0 = (s-d) * k */
                __ASM_EMIT("mulps       %%xmm5, %%xmm1")
                __ASM_EMIT("addps       %%xmm2, %%xmm0")                    /* xmm0 = mix = d + (s-d) * k */
                __ASM_EMIT("addps       %%xmm3, %%xmm1")
                __ASM_EMIT("movups      %%xmm0, 0x00(%[dst], %[off])")
                __ASM_EMIT("movups      %%xmm1, 0x10(%[dst], %[off])")
                __ASM_EMIT32("addl      $0x20, %[off]")                     /* 8 floats = 0x20 bytes */
                __ASM_EMIT64("add       $0x20, %[off]")
                __ASM_EMIT("sub         $8, %[count]")
                __ASM_EMIT("jae         1b")
                // 4x block: count is now remainder-8; +4 < 0 means fewer than 4 left
                __ASM_EMIT("2:")
                __ASM_EMIT32("addl      $4, %[count]")
                __ASM_EMIT64("add       $4, %[count]")
                __ASM_EMIT("jl          4f")
                __ASM_EMIT("movups      0x00(%[src2], %[off]), %%xmm0")     /* xmm0 = s */
                __ASM_EMIT("movups      0x00(%[src1], %[off]), %%xmm2")     /* xmm2 = d */
                __ASM_EMIT("movups      0x00(%[k], %[off]), %%xmm4")        /* xmm4 = k */
                __ASM_EMIT("subps       %%xmm2, %%xmm0")                    /* xmm0 = s - d */
                __ASM_EMIT("mulps       %%xmm4, %%xmm0")                    /* xmm0 = (s-d) * k */
                __ASM_EMIT("addps       %%xmm2, %%xmm0")                    /* xmm0 = mix = d + (s-d) * k */
                __ASM_EMIT("movups      %%xmm0, 0x00(%[dst], %[off])")
                __ASM_EMIT32("subl      $4, %[count]")
                __ASM_EMIT64("sub       $4, %[count]")
                __ASM_EMIT("add         $0x10, %[off]")
                // 1x blocks: bias count so 'dec/jge' executes exactly remainder times
                __ASM_EMIT("4:")
                __ASM_EMIT32("addl      $3, %[count]")
                __ASM_EMIT64("add       $3, %[count]")
                __ASM_EMIT("jl          6f")
                __ASM_EMIT("5:")
                __ASM_EMIT("movss       0x00(%[src2], %[off]), %%xmm0")     /* xmm0 = s */
                __ASM_EMIT("movss       0x00(%[src1], %[off]), %%xmm2")     /* xmm2 = d */
                __ASM_EMIT("movss       0x00(%[k], %[off]), %%xmm4")        /* xmm4 = k */
                __ASM_EMIT("subss       %%xmm2, %%xmm0")                    /* xmm0 = s - d */
                __ASM_EMIT("mulss       %%xmm4, %%xmm0")                    /* xmm0 = (s-d) * k */
                __ASM_EMIT("addss       %%xmm2, %%xmm0")                    /* xmm0 = mix = d + (s-d) * k */
                __ASM_EMIT("movss       %%xmm0, 0x00(%[dst], %[off])")
                __ASM_EMIT("add         $0x04, %[off]")
                __ASM_EMIT32("decl      %[count]")
                __ASM_EMIT64("dec       %[count]")
                __ASM_EMIT("jge         5b")
                // End
                __ASM_EMIT("6:")
                : [off] "=&r" (off), [count] __ASM_ARG_RW(count)
                : [dst] "r" (dst), [src1] "r" (src1), [src2] "r" (src2), [k] "r" (k)
                : "cc", "memory",
                  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
                  "%xmm4", "%xmm5"
            );
        }
169+
        /* Mix a source signal into the destination with a single scalar gain:
         *     dst[i] = dst[i] + (src[i] - dst[i]) * k
         * i.e. a constant-coefficient linear cross-fade between dst and src
         * (k = 0 keeps dst, k = 1 takes src).
         *
         * @param dst   destination buffer, read and updated in place
         * @param src   source buffer to mix in
         * @param k     scalar mix coefficient, broadcast to all lanes
         * @param count number of float samples to process
         *
         * The "+Yz" constraint pins k into %xmm0 (see GCC x86 machine
         * constraints), which is why %xmm0 is absent from the clobber list:
         * it is already declared as a read-write operand.
         */
        void pmix_k1(float *dst, const float *src, float k, size_t count)
        {
            IF_ARCH_X86(size_t off);    // common byte offset into dst/src
            ARCH_X86_ASM
            (
                __ASM_EMIT("shufps      $0x00, %[k], %[k]")     /* broadcast k to all 4 lanes */
                __ASM_EMIT("xor         %[off], %[off]")
                __ASM_EMIT("movaps      %[k], %%xmm4")          /* xmm4 = k k k k; frees xmm0 for data */
                // 8x blocks: count -= 8; if it underflowed, fewer than 8 remain
                __ASM_EMIT("sub         $8, %[count]")
                __ASM_EMIT("jb          2f")
                __ASM_EMIT("1:")
                __ASM_EMIT("movups      0x00(%[src], %[off]), %%xmm0")  /* xmm0 = s */
                __ASM_EMIT("movups      0x10(%[src], %[off]), %%xmm1")
                __ASM_EMIT("movups      0x00(%[dst], %[off]), %%xmm2")  /* xmm2 = d */
                __ASM_EMIT("movups      0x10(%[dst], %[off]), %%xmm3")
                __ASM_EMIT("subps       %%xmm2, %%xmm0")                /* xmm0 = s - d */
                __ASM_EMIT("subps       %%xmm3, %%xmm1")
                __ASM_EMIT("mulps       %%xmm4, %%xmm0")                /* xmm0 = (s-d) * k */
                __ASM_EMIT("mulps       %%xmm4, %%xmm1")
                __ASM_EMIT("addps       %%xmm2, %%xmm0")                /* xmm0 = mix = d + (s-d) * k */
                __ASM_EMIT("addps       %%xmm3, %%xmm1")
                __ASM_EMIT("movups      %%xmm0, 0x00(%[dst], %[off])")
                __ASM_EMIT("movups      %%xmm1, 0x10(%[dst], %[off])")
                __ASM_EMIT("add         $0x20, %[off]")                 /* 8 floats = 0x20 bytes */
                __ASM_EMIT("sub         $8, %[count]")
                __ASM_EMIT("jae         1b")
                // 4x block: count is now remainder-8; +4 < 0 means fewer than 4 left
                __ASM_EMIT("2:")
                __ASM_EMIT("add         $4, %[count]")
                __ASM_EMIT("jl          4f")
                __ASM_EMIT("movups      0x00(%[src], %[off]), %%xmm0")  /* xmm0 = s */
                __ASM_EMIT("movups      0x00(%[dst], %[off]), %%xmm2")  /* xmm2 = d */
                __ASM_EMIT("subps       %%xmm2, %%xmm0")                /* xmm0 = s - d */
                __ASM_EMIT("mulps       %%xmm4, %%xmm0")                /* xmm0 = (s-d) * k */
                __ASM_EMIT("addps       %%xmm2, %%xmm0")                /* xmm0 = mix = d + (s-d) * k */
                __ASM_EMIT("movups      %%xmm0, 0x00(%[dst], %[off])")
                __ASM_EMIT("sub         $4, %[count]")
                __ASM_EMIT("add         $0x10, %[off]")
                // 1x blocks: bias count so 'dec/jge' executes exactly remainder times
                __ASM_EMIT("4:")
                __ASM_EMIT("add         $3, %[count]")
                __ASM_EMIT("jl          6f")
                __ASM_EMIT("5:")
                __ASM_EMIT("movss       0x00(%[src], %[off]), %%xmm0")  /* xmm0 = s */
                __ASM_EMIT("movss       0x00(%[dst], %[off]), %%xmm2")  /* xmm2 = d */
                __ASM_EMIT("subss       %%xmm2, %%xmm0")                /* xmm0 = s - d */
                __ASM_EMIT("mulss       %%xmm4, %%xmm0")                /* xmm0 = (s-d) * k */
                __ASM_EMIT("addss       %%xmm2, %%xmm0")                /* xmm0 = mix = d + (s-d) * k */
                __ASM_EMIT("movss       %%xmm0, 0x00(%[dst], %[off])")
                __ASM_EMIT("add         $0x04, %[off]")
                __ASM_EMIT("dec         %[count]")
                __ASM_EMIT("jge         5b")
                // End
                __ASM_EMIT("6:")
                : [off] "=&r" (off), [count] "+r" (count), [k] "+Yz" (k)
                : [dst] "r" (dst), [src] "r" (src)
                : "cc", "memory",
                  "%xmm1", "%xmm2", "%xmm3",
                  "%xmm4"
            );
        }
232+
        /* Mix two source signals into a destination with a single scalar gain:
         *     dst[i] = src1[i] + (src2[i] - src1[i]) * k
         * i.e. a constant-coefficient linear cross-fade between src1 and src2
         * (k = 0 takes src1, k = 1 takes src2).
         *
         * @param dst   destination buffer (write-only)
         * @param src1  first source buffer (the "dry" signal)
         * @param src2  second source buffer (the "wet" signal)
         * @param k     scalar mix coefficient, broadcast to all lanes
         * @param count number of float samples to process
         *
         * The "+Yz" constraint pins k into %xmm0 (see GCC x86 machine
         * constraints), which is why %xmm0 is absent from the clobber list:
         * it is already declared as a read-write operand.
         */
        void pmix_k2(float *dst, const float *src1, const float *src2, float k, size_t count)
        {
            IF_ARCH_X86(size_t off);    // common byte offset into all three buffers
            ARCH_X86_ASM
            (
                __ASM_EMIT("shufps      $0x00, %[k], %[k]")     /* broadcast k to all 4 lanes */
                __ASM_EMIT("xor         %[off], %[off]")
                __ASM_EMIT("movaps      %[k], %%xmm4")          /* xmm4 = k k k k; frees xmm0 for data */
                // 8x blocks: count -= 8; if it underflowed, fewer than 8 remain
                __ASM_EMIT("sub         $8, %[count]")
                __ASM_EMIT("jb          2f")
                __ASM_EMIT("1:")
                __ASM_EMIT("movups      0x00(%[src2], %[off]), %%xmm0")     /* xmm0 = s */
                __ASM_EMIT("movups      0x10(%[src2], %[off]), %%xmm1")
                __ASM_EMIT("movups      0x00(%[src1], %[off]), %%xmm2")     /* xmm2 = d */
                __ASM_EMIT("movups      0x10(%[src1], %[off]), %%xmm3")
                __ASM_EMIT("subps       %%xmm2, %%xmm0")                    /* xmm0 = s - d */
                __ASM_EMIT("subps       %%xmm3, %%xmm1")
                __ASM_EMIT("mulps       %%xmm4, %%xmm0")                    /* xmm0 = (s-d) * k */
                __ASM_EMIT("mulps       %%xmm4, %%xmm1")
                __ASM_EMIT("addps       %%xmm2, %%xmm0")                    /* xmm0 = mix = d + (s-d) * k */
                __ASM_EMIT("addps       %%xmm3, %%xmm1")
                __ASM_EMIT("movups      %%xmm0, 0x00(%[dst], %[off])")
                __ASM_EMIT("movups      %%xmm1, 0x10(%[dst], %[off])")
                __ASM_EMIT("add         $0x20, %[off]")                     /* 8 floats = 0x20 bytes */
                __ASM_EMIT("sub         $8, %[count]")
                __ASM_EMIT("jae         1b")
                // 4x block: count is now remainder-8; +4 < 0 means fewer than 4 left
                __ASM_EMIT("2:")
                __ASM_EMIT("add         $4, %[count]")
                __ASM_EMIT("jl          4f")
                __ASM_EMIT("movups      0x00(%[src2], %[off]), %%xmm0")     /* xmm0 = s */
                __ASM_EMIT("movups      0x00(%[src1], %[off]), %%xmm2")     /* xmm2 = d */
                __ASM_EMIT("subps       %%xmm2, %%xmm0")                    /* xmm0 = s - d */
                __ASM_EMIT("mulps       %%xmm4, %%xmm0")                    /* xmm0 = (s-d) * k */
                __ASM_EMIT("addps       %%xmm2, %%xmm0")                    /* xmm0 = mix = d + (s-d) * k */
                __ASM_EMIT("movups      %%xmm0, 0x00(%[dst], %[off])")
                __ASM_EMIT("sub         $4, %[count]")
                __ASM_EMIT("add         $0x10, %[off]")
                // 1x blocks: bias count so 'dec/jge' executes exactly remainder times
                __ASM_EMIT("4:")
                __ASM_EMIT("add         $3, %[count]")
                __ASM_EMIT("jl          6f")
                __ASM_EMIT("5:")
                __ASM_EMIT("movss       0x00(%[src2], %[off]), %%xmm0")     /* xmm0 = s */
                __ASM_EMIT("movss       0x00(%[src1], %[off]), %%xmm2")     /* xmm2 = d */
                __ASM_EMIT("subss       %%xmm2, %%xmm0")                    /* xmm0 = s - d */
                __ASM_EMIT("mulss       %%xmm4, %%xmm0")                    /* xmm0 = (s-d) * k */
                __ASM_EMIT("addss       %%xmm2, %%xmm0")                    /* xmm0 = mix = d + (s-d) * k */
                __ASM_EMIT("movss       %%xmm0, 0x00(%[dst], %[off])")
                __ASM_EMIT("add         $0x04, %[off]")
                __ASM_EMIT("dec         %[count]")
                __ASM_EMIT("jge         5b")
                // End
                __ASM_EMIT("6:")
                : [off] "=&r" (off), [count] "+r" (count), [k] "+Yz" (k)
                : [dst] "r" (dst), [src1] "r" (src1), [src2] "r" (src2)
                : "cc", "memory",
                  "%xmm1", "%xmm2", "%xmm3",
                  "%xmm4"
            );
        }
295+
34296 } /* namespace sse */
35297} /* namespace lsp */
36298
0 commit comments