Skip to content

Commit ffee91a

Browse files
committed
Submitted SSE-optimized pmix functions
1 parent a9fe98c commit ffee91a

File tree

11 files changed

+1313
-2
lines changed

11 files changed

+1313
-2
lines changed

include/private/dsp/arch/generic/pmath/pmix.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ namespace lsp
3939
/**
 * Per-sample linear mix of two sources (generic, portable implementation):
 *   dst[i] = src1[i] + (src2[i] - src1[i]) * k[i]
 *
 * With k[i] == 0 the output equals src1[i]; with k[i] == 1 it equals src2[i].
 * The earlier form of this loop omitted the src1[i] base term and therefore
 * produced only the scaled difference instead of the mix.
 *
 * @param dst   destination buffer, at least count elements
 * @param src1  first source buffer (result when k[i] == 0)
 * @param src2  second source buffer (result when k[i] == 1)
 * @param k     per-sample mix coefficients
 * @param count number of samples to process; nothing is written when it is 0
 */
void pmix_v2(float *dst, const float *src1, const float *src2, const float *k, size_t count)
{
    for (size_t i=0; i<count; ++i)
        dst[i] = src1[i] + (src2[i] - src1[i])*k[i];
}
4444

4545
void pmix_k1(float *dst, const float *src, float k, size_t count)
@@ -51,7 +51,7 @@ namespace lsp
5151
/**
 * Linear mix of two sources with a single constant coefficient (generic,
 * portable implementation):
 *   dst[i] = src1[i] + (src2[i] - src1[i]) * k
 *
 * With k == 0 the output equals src1; with k == 1 it equals src2. The earlier
 * form of this loop omitted the src1[i] base term and therefore produced only
 * the scaled difference instead of the mix.
 *
 * @param dst   destination buffer, at least count elements
 * @param src1  first source buffer (result when k == 0)
 * @param src2  second source buffer (result when k == 1)
 * @param k     mix coefficient applied to every sample
 * @param count number of samples to process; nothing is written when it is 0
 */
void pmix_k2(float *dst, const float *src1, const float *src2, float k, size_t count)
{
    for (size_t i=0; i<count; ++i)
        dst[i] = src1[i] + (src2[i] - src1[i])*k;
}
5656

5757
} /* namespace generic */

include/private/dsp/arch/x86/sse/pmath/pmix.h

Lines changed: 262 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,268 @@ namespace lsp
3131
namespace sse
3232
{
3333

34+
/**
 * In-place per-sample linear mix implemented with SSE inline assembly:
 *   dst[i] = dst[i] + (src[i] - dst[i]) * k[i]
 *
 * The data is processed in three stages: a loop handling 8 floats per
 * iteration, at most one 4-float block, and a scalar loop for what remains.
 * All packed loads and stores use movups, so the instructions themselves do
 * not require aligned buffers.
 *
 * @param dst   buffer that is read as the first mix operand and overwritten
 * @param src   second mix operand
 * @param k     per-sample mix coefficients
 * @param count number of float samples to process
 */
void pmix_v1(float *dst, const float *src, const float *k, size_t count)
35+
{
36+
// NOTE(review): IF_ARCH_X86 is defined elsewhere; presumably it declares off only for x86 builds
IF_ARCH_X86(size_t off);
37+
ARCH_X86_ASM
38+
(
39+
__ASM_EMIT("xor %[off], %[off]") /* off = 0, byte offset shared by dst, src and k */
40+
// 8x blocks: count -= 8; skip the loop when fewer than 8 samples are available
41+
__ASM_EMIT("sub $8, %[count]")
42+
__ASM_EMIT("jb 2f")
43+
__ASM_EMIT("1:")
44+
__ASM_EMIT("movups 0x00(%[src],%[off]), %%xmm0") /* xmm0 = s */
45+
__ASM_EMIT("movups 0x10(%[src],%[off]), %%xmm1") /* xmm1 = s (next 4 samples) */
46+
__ASM_EMIT("movups 0x00(%[dst],%[off]), %%xmm2") /* xmm2 = d */
47+
__ASM_EMIT("movups 0x10(%[dst],%[off]), %%xmm3") /* xmm3 = d (next 4 samples) */
48+
__ASM_EMIT("movups 0x00(%[k],%[off]), %%xmm4") /* xmm4 = k */
49+
__ASM_EMIT("movups 0x10(%[k],%[off]), %%xmm5") /* xmm5 = k (next 4 samples) */
50+
__ASM_EMIT("subps %%xmm2, %%xmm0") /* xmm0 = s - d */
51+
__ASM_EMIT("subps %%xmm3, %%xmm1")
52+
__ASM_EMIT("mulps %%xmm4, %%xmm0") /* xmm0 = (s-d) * k */
53+
__ASM_EMIT("mulps %%xmm5, %%xmm1")
54+
__ASM_EMIT("addps %%xmm2, %%xmm0") /* xmm0 = mix = d + (s-d) * k */
55+
__ASM_EMIT("addps %%xmm3, %%xmm1")
56+
__ASM_EMIT("movups %%xmm0, 0x00(%[dst],%[off])")
57+
__ASM_EMIT("movups %%xmm1, 0x10(%[dst],%[off])")
58+
__ASM_EMIT("add $0x20, %[off]") /* advance by 8 floats = 32 bytes */
59+
__ASM_EMIT("sub $8, %[count]")
60+
__ASM_EMIT("jae 1b") /* repeat while no borrow, i.e. at least 8 more samples */
61+
// 4x block: count holds (remaining - 8) here; after +4 a negative value means fewer than 4 remain
62+
__ASM_EMIT("2:")
63+
__ASM_EMIT("add $4, %[count]")
64+
__ASM_EMIT("jl 4f")
65+
__ASM_EMIT("movups 0x00(%[src],%[off]), %%xmm0") /* xmm0 = s */
66+
__ASM_EMIT("movups 0x00(%[dst],%[off]), %%xmm2") /* xmm2 = d */
67+
__ASM_EMIT("movups 0x00(%[k],%[off]), %%xmm4") /* xmm4 = k */
68+
__ASM_EMIT("subps %%xmm2, %%xmm0") /* xmm0 = s - d */
69+
__ASM_EMIT("mulps %%xmm4, %%xmm0") /* xmm0 = (s-d) * k */
70+
__ASM_EMIT("addps %%xmm2, %%xmm0") /* xmm0 = mix = d + (s-d) * k */
71+
__ASM_EMIT("movups %%xmm0, 0x00(%[dst],%[off])")
72+
__ASM_EMIT("sub $4, %[count]") /* restore the (remaining - 4) bias expected at label 4 */
73+
__ASM_EMIT("add $0x10, %[off]") /* advance by 4 floats = 16 bytes */
74+
// 1x blocks: after +3 count holds (remaining - 1); negative means nothing is left
75+
__ASM_EMIT("4:")
76+
__ASM_EMIT("add $3, %[count]")
77+
__ASM_EMIT("jl 6f")
78+
__ASM_EMIT("5:")
79+
__ASM_EMIT("movss 0x00(%[src],%[off]), %%xmm0") /* xmm0 = s */
80+
__ASM_EMIT("movss 0x00(%[dst],%[off]), %%xmm2") /* xmm2 = d */
81+
__ASM_EMIT("movss 0x00(%[k],%[off]), %%xmm4") /* xmm4 = k */
82+
__ASM_EMIT("subss %%xmm2, %%xmm0") /* xmm0 = s - d */
83+
__ASM_EMIT("mulss %%xmm4, %%xmm0") /* xmm0 = (s-d) * k */
84+
__ASM_EMIT("addss %%xmm2, %%xmm0") /* xmm0 = mix = d + (s-d) * k */
85+
__ASM_EMIT("movss %%xmm0, 0x00(%[dst],%[off])")
86+
__ASM_EMIT("add $0x04, %[off]") /* advance by one float */
87+
__ASM_EMIT("dec %[count]")
88+
__ASM_EMIT("jge 5b") /* loop until count drops below zero */
89+
// End
90+
__ASM_EMIT("6:")
91+
// Outputs: off is a scratch register (early-clobber), count is consumed in place
: [off] "=&r" (off), [count] "+r" (count)
92+
// Inputs: the three buffer base pointers
: [dst] "r" (dst), [src] "r" (src), [k] "r" (k)
93+
// Clobbers: flags, memory (dst is written through a pointer) and every XMM register used above
: "cc", "memory",
94+
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
95+
"%xmm4", "%xmm5"
96+
);
97+
}
98+
99+
void pmix_v2(float *dst, const float *src1, const float *src2, const float *k, size_t count)
100+
{
101+
IF_ARCH_X86(size_t off);
102+
ARCH_X86_ASM
103+
(
104+
__ASM_EMIT("xor %[off], %[off]")
105+
// 8x blocks
106+
__ASM_EMIT32("subl $8, %[count]")
107+
__ASM_EMIT64("sub $8, %[count]")
108+
__ASM_EMIT("jb 2f")
109+
__ASM_EMIT("1:")
110+
__ASM_EMIT("movups 0x00(%[src2],%[off]), %%xmm0") /* xmm0 = s */
111+
__ASM_EMIT("movups 0x10(%[src2],%[off]), %%xmm1")
112+
__ASM_EMIT("movups 0x00(%[src1],%[off]), %%xmm2") /* xmm2 = d */
113+
__ASM_EMIT("movups 0x10(%[src1],%[off]), %%xmm3")
114+
__ASM_EMIT("movups 0x00(%[k],%[off]), %%xmm4") /* xmm4 = k */
115+
__ASM_EMIT("movups 0x10(%[k],%[off]), %%xmm5")
116+
__ASM_EMIT("subps %%xmm2, %%xmm0") /* xmm0 = s - d */
117+
__ASM_EMIT("subps %%xmm3, %%xmm1")
118+
__ASM_EMIT("mulps %%xmm4, %%xmm0") /* xmm0 = (s-d) * k */
119+
__ASM_EMIT("mulps %%xmm5, %%xmm1")
120+
__ASM_EMIT("addps %%xmm2, %%xmm0") /* xmm0 = mix = d + (s-d) * k */
121+
__ASM_EMIT("addps %%xmm3, %%xmm1")
122+
__ASM_EMIT("movups %%xmm0, 0x00(%[dst],%[off])")
123+
__ASM_EMIT("movups %%xmm1, 0x10(%[dst],%[off])")
124+
__ASM_EMIT32("addl $0x20, %[off]")
125+
__ASM_EMIT64("add $0x20, %[off]")
126+
__ASM_EMIT("sub $8, %[count]")
127+
__ASM_EMIT("jae 1b")
128+
// 4x block
129+
__ASM_EMIT("2:")
130+
__ASM_EMIT32("addl $4, %[count]")
131+
__ASM_EMIT64("add $4, %[count]")
132+
__ASM_EMIT("jl 4f")
133+
__ASM_EMIT("movups 0x00(%[src2],%[off]), %%xmm0") /* xmm0 = s */
134+
__ASM_EMIT("movups 0x00(%[src1],%[off]), %%xmm2") /* xmm2 = d */
135+
__ASM_EMIT("movups 0x00(%[k],%[off]), %%xmm4") /* xmm4 = k */
136+
__ASM_EMIT("subps %%xmm2, %%xmm0") /* xmm0 = s - d */
137+
__ASM_EMIT("mulps %%xmm4, %%xmm0") /* xmm0 = (s-d) * k */
138+
__ASM_EMIT("addps %%xmm2, %%xmm0") /* xmm0 = mix = d + (s-d) * k */
139+
__ASM_EMIT("movups %%xmm0, 0x00(%[dst],%[off])")
140+
__ASM_EMIT32("subl $4, %[count]")
141+
__ASM_EMIT64("sub $4, %[count]")
142+
__ASM_EMIT("add $0x10, %[off]")
143+
// 1x blocks
144+
__ASM_EMIT("4:")
145+
__ASM_EMIT32("addl $3, %[count]")
146+
__ASM_EMIT64("add $3, %[count]")
147+
__ASM_EMIT("jl 6f")
148+
__ASM_EMIT("5:")
149+
__ASM_EMIT("movss 0x00(%[src2],%[off]), %%xmm0") /* xmm0 = s */
150+
__ASM_EMIT("movss 0x00(%[src1],%[off]), %%xmm2") /* xmm2 = d */
151+
__ASM_EMIT("movss 0x00(%[k],%[off]), %%xmm4") /* xmm4 = k */
152+
__ASM_EMIT("subss %%xmm2, %%xmm0") /* xmm0 = s - d */
153+
__ASM_EMIT("mulss %%xmm4, %%xmm0") /* xmm0 = (s-d) * k */
154+
__ASM_EMIT("addss %%xmm2, %%xmm0") /* xmm0 = mix = d + (s-d) * k */
155+
__ASM_EMIT("movss %%xmm0, 0x00(%[dst],%[off])")
156+
__ASM_EMIT("add $0x04, %[off]")
157+
__ASM_EMIT32("decl %[count]")
158+
__ASM_EMIT64("dec %[count]")
159+
__ASM_EMIT("jge 5b")
160+
// End
161+
__ASM_EMIT("6:")
162+
: [off] "=&r" (off), [count] __ASM_ARG_RW(count)
163+
: [dst] "r" (dst), [src1] "r" (src1), [src2] "r" (src2), [k] "r" (k)
164+
: "cc", "memory",
165+
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
166+
"%xmm4", "%xmm5"
167+
);
168+
}
169+
170+
/**
 * In-place linear mix with a single constant coefficient, implemented with
 * SSE inline assembly:
 *   dst[i] = dst[i] + (src[i] - dst[i]) * k
 *
 * The scalar k is first replicated into all four lanes of an XMM register and
 * kept in xmm4 for the whole routine. The data is then processed in three
 * stages: a loop handling 8 floats per iteration, at most one 4-float block,
 * and a scalar loop for what remains. All packed loads and stores use movups,
 * so the instructions themselves do not require aligned buffers.
 *
 * @param dst   buffer that is read as the first mix operand and overwritten
 * @param src   second mix operand
 * @param k     mix coefficient applied to every sample
 * @param count number of float samples to process
 */
void pmix_k1(float *dst, const float *src, float k, size_t count)
171+
{
172+
// NOTE(review): IF_ARCH_X86 is defined elsewhere; presumably it declares off only for x86 builds
IF_ARCH_X86(size_t off);
173+
ARCH_X86_ASM
174+
(
175+
__ASM_EMIT("shufps $0x00, %[k], %[k]") /* replicate lane 0 of k into all four lanes */
176+
__ASM_EMIT("xor %[off], %[off]") /* off = 0, byte offset shared by dst and src */
177+
__ASM_EMIT("movaps %[k], %%xmm4") /* xmm4 = k k k k; the k register itself is reused as scratch below */
178+
// 8x blocks: count -= 8; skip the loop when fewer than 8 samples are available
179+
__ASM_EMIT("sub $8, %[count]")
180+
__ASM_EMIT("jb 2f")
181+
__ASM_EMIT("1:")
182+
__ASM_EMIT("movups 0x00(%[src],%[off]), %%xmm0") /* xmm0 = s */
183+
__ASM_EMIT("movups 0x10(%[src],%[off]), %%xmm1") /* xmm1 = s (next 4 samples) */
184+
__ASM_EMIT("movups 0x00(%[dst],%[off]), %%xmm2") /* xmm2 = d */
185+
__ASM_EMIT("movups 0x10(%[dst],%[off]), %%xmm3") /* xmm3 = d (next 4 samples) */
186+
__ASM_EMIT("subps %%xmm2, %%xmm0") /* xmm0 = s - d */
187+
__ASM_EMIT("subps %%xmm3, %%xmm1")
188+
__ASM_EMIT("mulps %%xmm4, %%xmm0") /* xmm0 = (s-d) * k */
189+
__ASM_EMIT("mulps %%xmm4, %%xmm1")
190+
__ASM_EMIT("addps %%xmm2, %%xmm0") /* xmm0 = mix = d + (s-d) * k */
191+
__ASM_EMIT("addps %%xmm3, %%xmm1")
192+
__ASM_EMIT("movups %%xmm0, 0x00(%[dst],%[off])")
193+
__ASM_EMIT("movups %%xmm1, 0x10(%[dst],%[off])")
194+
__ASM_EMIT("add $0x20, %[off]") /* advance by 8 floats = 32 bytes */
195+
__ASM_EMIT("sub $8, %[count]")
196+
__ASM_EMIT("jae 1b") /* repeat while no borrow, i.e. at least 8 more samples */
197+
// 4x block: count holds (remaining - 8) here; after +4 a negative value means fewer than 4 remain
198+
__ASM_EMIT("2:")
199+
__ASM_EMIT("add $4, %[count]")
200+
__ASM_EMIT("jl 4f")
201+
__ASM_EMIT("movups 0x00(%[src],%[off]), %%xmm0") /* xmm0 = s */
202+
__ASM_EMIT("movups 0x00(%[dst],%[off]), %%xmm2") /* xmm2 = d */
203+
__ASM_EMIT("subps %%xmm2, %%xmm0") /* xmm0 = s - d */
204+
__ASM_EMIT("mulps %%xmm4, %%xmm0") /* xmm0 = (s-d) * k */
205+
__ASM_EMIT("addps %%xmm2, %%xmm0") /* xmm0 = mix = d + (s-d) * k */
206+
__ASM_EMIT("movups %%xmm0, 0x00(%[dst],%[off])")
207+
__ASM_EMIT("sub $4, %[count]") /* restore the (remaining - 4) bias expected at label 4 */
208+
__ASM_EMIT("add $0x10, %[off]") /* advance by 4 floats = 16 bytes */
209+
// 1x blocks: after +3 count holds (remaining - 1); negative means nothing is left
210+
__ASM_EMIT("4:")
211+
__ASM_EMIT("add $3, %[count]")
212+
__ASM_EMIT("jl 6f")
213+
__ASM_EMIT("5:")
214+
__ASM_EMIT("movss 0x00(%[src],%[off]), %%xmm0") /* xmm0 = s */
215+
__ASM_EMIT("movss 0x00(%[dst],%[off]), %%xmm2") /* xmm2 = d */
216+
__ASM_EMIT("subss %%xmm2, %%xmm0") /* xmm0 = s - d */
217+
__ASM_EMIT("mulss %%xmm4, %%xmm0") /* xmm0 = (s-d) * k */
218+
__ASM_EMIT("addss %%xmm2, %%xmm0") /* xmm0 = mix = d + (s-d) * k */
219+
__ASM_EMIT("movss %%xmm0, 0x00(%[dst],%[off])")
220+
__ASM_EMIT("add $0x04, %[off]") /* advance by one float */
221+
__ASM_EMIT("dec %[count]")
222+
__ASM_EMIT("jge 5b") /* loop until count drops below zero */
223+
// End
224+
__ASM_EMIT("6:")
225+
// Outputs: off is scratch, count is consumed in place, k is an in/out SSE operand
// NOTE(review): "Yz" presumably pins k to %xmm0 (GCC x86 machine constraint) - confirm
: [off] "=&r" (off), [count] "+r" (count), [k] "+Yz" (k)
226+
// Inputs: the two buffer base pointers
: [dst] "r" (dst), [src] "r" (src)
227+
// Clobbers: %xmm0 is written above but not listed here; that is consistent with it
// being the register bound to the in/out operand k (see the note on "Yz" above)
: "cc", "memory",
228+
"%xmm1", "%xmm2", "%xmm3",
229+
"%xmm4"
230+
);
231+
}
232+
233+
/**
 * Linear mix of two sources with a single constant coefficient, implemented
 * with SSE inline assembly:
 *   dst[i] = src1[i] + (src2[i] - src1[i]) * k
 *
 * The scalar k is first replicated into all four lanes of an XMM register and
 * kept in xmm4 for the whole routine. The data is then processed in three
 * stages: a loop handling 8 floats per iteration, at most one 4-float block,
 * and a scalar loop for what remains. All packed loads and stores use movups,
 * so the instructions themselves do not require aligned buffers.
 *
 * @param dst   destination buffer, at least count elements
 * @param src1  first source buffer (result when k == 0)
 * @param src2  second source buffer (result when k == 1)
 * @param k     mix coefficient applied to every sample
 * @param count number of float samples to process
 */
void pmix_k2(float *dst, const float *src1, const float *src2, float k, size_t count)
234+
{
235+
// NOTE(review): IF_ARCH_X86 is defined elsewhere; presumably it declares off only for x86 builds
IF_ARCH_X86(size_t off);
236+
ARCH_X86_ASM
237+
(
238+
__ASM_EMIT("shufps $0x00, %[k], %[k]") /* replicate lane 0 of k into all four lanes */
239+
__ASM_EMIT("xor %[off], %[off]") /* off = 0, byte offset shared by all buffers */
240+
__ASM_EMIT("movaps %[k], %%xmm4") /* xmm4 = k k k k; the k register itself is reused as scratch below */
241+
// 8x blocks: count -= 8; skip the loop when fewer than 8 samples are available
242+
__ASM_EMIT("sub $8, %[count]")
243+
__ASM_EMIT("jb 2f")
244+
__ASM_EMIT("1:")
245+
__ASM_EMIT("movups 0x00(%[src2],%[off]), %%xmm0") /* xmm0 = s2 */
246+
__ASM_EMIT("movups 0x10(%[src2],%[off]), %%xmm1") /* xmm1 = s2 (next 4 samples) */
247+
__ASM_EMIT("movups 0x00(%[src1],%[off]), %%xmm2") /* xmm2 = s1 */
248+
__ASM_EMIT("movups 0x10(%[src1],%[off]), %%xmm3") /* xmm3 = s1 (next 4 samples) */
249+
__ASM_EMIT("subps %%xmm2, %%xmm0") /* xmm0 = s2 - s1 */
250+
__ASM_EMIT("subps %%xmm3, %%xmm1")
251+
__ASM_EMIT("mulps %%xmm4, %%xmm0") /* xmm0 = (s2-s1) * k */
252+
__ASM_EMIT("mulps %%xmm4, %%xmm1")
253+
__ASM_EMIT("addps %%xmm2, %%xmm0") /* xmm0 = mix = s1 + (s2-s1) * k */
254+
__ASM_EMIT("addps %%xmm3, %%xmm1")
255+
__ASM_EMIT("movups %%xmm0, 0x00(%[dst],%[off])")
256+
__ASM_EMIT("movups %%xmm1, 0x10(%[dst],%[off])")
257+
__ASM_EMIT("add $0x20, %[off]") /* advance by 8 floats = 32 bytes */
258+
__ASM_EMIT("sub $8, %[count]")
259+
__ASM_EMIT("jae 1b") /* repeat while no borrow, i.e. at least 8 more samples */
260+
// 4x block: count holds (remaining - 8) here; after +4 a negative value means fewer than 4 remain
261+
__ASM_EMIT("2:")
262+
__ASM_EMIT("add $4, %[count]")
263+
__ASM_EMIT("jl 4f")
264+
__ASM_EMIT("movups 0x00(%[src2],%[off]), %%xmm0") /* xmm0 = s2 */
265+
__ASM_EMIT("movups 0x00(%[src1],%[off]), %%xmm2") /* xmm2 = s1 */
266+
__ASM_EMIT("subps %%xmm2, %%xmm0") /* xmm0 = s2 - s1 */
267+
__ASM_EMIT("mulps %%xmm4, %%xmm0") /* xmm0 = (s2-s1) * k */
268+
__ASM_EMIT("addps %%xmm2, %%xmm0") /* xmm0 = mix = s1 + (s2-s1) * k */
269+
__ASM_EMIT("movups %%xmm0, 0x00(%[dst],%[off])")
270+
__ASM_EMIT("sub $4, %[count]") /* restore the (remaining - 4) bias expected at label 4 */
271+
__ASM_EMIT("add $0x10, %[off]") /* advance by 4 floats = 16 bytes */
272+
// 1x blocks: after +3 count holds (remaining - 1); negative means nothing is left
273+
__ASM_EMIT("4:")
274+
__ASM_EMIT("add $3, %[count]")
275+
__ASM_EMIT("jl 6f")
276+
__ASM_EMIT("5:")
277+
__ASM_EMIT("movss 0x00(%[src2],%[off]), %%xmm0") /* xmm0 = s2 */
278+
__ASM_EMIT("movss 0x00(%[src1],%[off]), %%xmm2") /* xmm2 = s1 */
279+
__ASM_EMIT("subss %%xmm2, %%xmm0") /* xmm0 = s2 - s1 */
280+
__ASM_EMIT("mulss %%xmm4, %%xmm0") /* xmm0 = (s2-s1) * k */
281+
__ASM_EMIT("addss %%xmm2, %%xmm0") /* xmm0 = mix = s1 + (s2-s1) * k */
282+
__ASM_EMIT("movss %%xmm0, 0x00(%[dst],%[off])")
283+
__ASM_EMIT("add $0x04, %[off]") /* advance by one float */
284+
__ASM_EMIT("dec %[count]")
285+
__ASM_EMIT("jge 5b") /* loop until count drops below zero */
286+
// End
287+
__ASM_EMIT("6:")
288+
// Outputs: off is scratch, count is consumed in place, k is an in/out SSE operand
// NOTE(review): "Yz" presumably pins k to %xmm0 (GCC x86 machine constraint) - confirm
: [off] "=&r" (off), [count] "+r" (count), [k] "+Yz" (k)
289+
// Inputs: the three buffer base pointers
: [dst] "r" (dst), [src1] "r" (src1), [src2] "r" (src2)
290+
// Clobbers: %xmm0 is written above but not listed here; that is consistent with it
// being the register bound to the in/out operand k (see the note on "Yz" above)
: "cc", "memory",
291+
"%xmm1", "%xmm2", "%xmm3",
292+
"%xmm4"
293+
);
294+
}
295+
34296
} /* namespace sse */
35297
} /* namespace lsp */
36298

src/main/x86/sse.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -297,6 +297,11 @@
297297
EXPORT1(clamp_kk1);
298298
EXPORT1(clamp_kk2);
299299

300+
EXPORT1(pmix_v1);
301+
EXPORT1(pmix_v2);
302+
EXPORT1(pmix_k1);
303+
EXPORT1(pmix_k2);
304+
300305
EXPORT1(direct_fft);
301306
EXPORT1(reverse_fft);
302307
EXPORT1(normalize_fft2);

0 commit comments

Comments
 (0)