@@ -30,6 +30,126 @@ namespace lsp
 {
     namespace asimd
     {
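+        // Byte-select mask for the BIT instruction: 0x00ff00ff marks bytes
+        // 0 and 2 of each 32-bit lane, the positions that hold B and R
+        // after the REV32 below.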
+        IF_ARCH_AARCH64(
+            static const uint32_t rgba32_to_bgra32_const[] __lsp_aligned16 =
+            {
+                LSP_DSP_VEC4(0x00ff00ff),
+                LSP_DSP_VEC4(0x00ff00ff),
+            };
+        );
+
+        void rgba32_to_bgra32(void *dst, const void *src, size_t count)
+        {
+            ARCH_AARCH64_ASM(
+                __ASM_EMIT("ldp         q16, q17, [%[XC]]")
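+                // q16/q17 now hold two copies of the 0x00ff00ff byte-select mask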
+
+                // 32x blocks
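+                // rev32 on .8h lanes swaps the 16-bit halves of every 32-bit
+                // pixel (R G B A -> B A R G); bit then copies bytes 0 and 2
+                // (B and R) of that result over the original, yielding
+                // B G R A in two instructions per vector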
+                __ASM_EMIT("subs        %[count], %[count], #32")
+                __ASM_EMIT("b.lo        2f")
+                __ASM_EMIT("1:")
+                __ASM_EMIT("ldp         q0, q1, [%[src], 0x00]")    // v0 = R G B A
+                __ASM_EMIT("ldp         q2, q3, [%[src], 0x20]")
+                __ASM_EMIT("ldp         q4, q5, [%[src], 0x40]")
+                __ASM_EMIT("ldp         q6, q7, [%[src], 0x60]")
+                __ASM_EMIT("rev32       v8.8h, v0.8h")              // v8 = B A R G
+                __ASM_EMIT("rev32       v9.8h, v1.8h")
+                __ASM_EMIT("rev32       v10.8h, v2.8h")
+                __ASM_EMIT("rev32       v11.8h, v3.8h")
+                __ASM_EMIT("rev32       v12.8h, v4.8h")
+                __ASM_EMIT("rev32       v13.8h, v5.8h")
+                __ASM_EMIT("rev32       v14.8h, v6.8h")
+                __ASM_EMIT("rev32       v15.8h, v7.8h")
+                __ASM_EMIT("bit         v0.16b, v8.16b, v16.16b")   // v0 = B G R A
+                __ASM_EMIT("bit         v1.16b, v9.16b, v17.16b")
+                __ASM_EMIT("bit         v2.16b, v10.16b, v16.16b")
+                __ASM_EMIT("bit         v3.16b, v11.16b, v17.16b")
+                __ASM_EMIT("bit         v4.16b, v12.16b, v16.16b")
+                __ASM_EMIT("bit         v5.16b, v13.16b, v17.16b")
+                __ASM_EMIT("bit         v6.16b, v14.16b, v16.16b")
+                __ASM_EMIT("bit         v7.16b, v15.16b, v17.16b")
+                __ASM_EMIT("stp         q0, q1, [%[dst], 0x00]")
+                __ASM_EMIT("stp         q2, q3, [%[dst], 0x20]")
+                __ASM_EMIT("stp         q4, q5, [%[dst], 0x40]")
+                __ASM_EMIT("stp         q6, q7, [%[dst], 0x60]")
+                __ASM_EMIT("subs        %[count], %[count], #32")
+                __ASM_EMIT("add         %[src], %[src], 0x80")
+                __ASM_EMIT("add         %[dst], %[dst], 0x80")
+                __ASM_EMIT("b.hs        1b")
+
+                // 16x blocks
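+                // (the 16x/8x/4x tails below repeat the same rev32 + bit
+                // pattern on progressively fewer registers)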
+                __ASM_EMIT("2:")
+                __ASM_EMIT("adds        %[count], %[count], #16")
+                __ASM_EMIT("b.lt        4f")
+                __ASM_EMIT("ldp         q0, q1, [%[src], 0x00]")    // v0 = R G B A
+                __ASM_EMIT("ldp         q2, q3, [%[src], 0x20]")
+                __ASM_EMIT("rev32       v8.8h, v0.8h")              // v8 = B A R G
+                __ASM_EMIT("rev32       v9.8h, v1.8h")
+                __ASM_EMIT("rev32       v10.8h, v2.8h")
+                __ASM_EMIT("rev32       v11.8h, v3.8h")
+                __ASM_EMIT("bit         v0.16b, v8.16b, v16.16b")   // v0 = B G R A
+                __ASM_EMIT("bit         v1.16b, v9.16b, v17.16b")
+                __ASM_EMIT("bit         v2.16b, v10.16b, v16.16b")
+                __ASM_EMIT("bit         v3.16b, v11.16b, v17.16b")
+                __ASM_EMIT("stp         q0, q1, [%[dst], 0x00]")
+                __ASM_EMIT("stp         q2, q3, [%[dst], 0x20]")
+                __ASM_EMIT("sub         %[count], %[count], #16")
+                __ASM_EMIT("add         %[src], %[src], 0x40")
+                __ASM_EMIT("add         %[dst], %[dst], 0x40")
+
+                // 8x blocks
+                __ASM_EMIT("4:")
+                __ASM_EMIT("adds        %[count], %[count], #8")
+                __ASM_EMIT("b.lt        6f")
+                __ASM_EMIT("ldp         q0, q1, [%[src], 0x00]")    // v0 = R G B A
+                __ASM_EMIT("rev32       v8.8h, v0.8h")              // v8 = B A R G
+                __ASM_EMIT("rev32       v9.8h, v1.8h")
+                __ASM_EMIT("bit         v0.16b, v8.16b, v16.16b")   // v0 = B G R A
+                __ASM_EMIT("bit         v1.16b, v9.16b, v17.16b")
+                __ASM_EMIT("stp         q0, q1, [%[dst], 0x00]")
+                __ASM_EMIT("sub         %[count], %[count], #8")
+                __ASM_EMIT("add         %[src], %[src], 0x20")
+                __ASM_EMIT("add         %[dst], %[dst], 0x20")
+
+                // 4x blocks
+                __ASM_EMIT("6:")
+                __ASM_EMIT("adds        %[count], %[count], #4")
+                __ASM_EMIT("b.lt        8f")
+                __ASM_EMIT("ldr         q0, [%[src], 0x00]")        // v0 = R G B A
+                __ASM_EMIT("rev32       v8.8h, v0.8h")              // v8 = B A R G
+                __ASM_EMIT("bit         v0.16b, v8.16b, v16.16b")   // v0 = B G R A
+                __ASM_EMIT("str         q0, [%[dst], 0x00]")
+                __ASM_EMIT("sub         %[count], %[count], #4")
+                __ASM_EMIT("add         %[src], %[src], 0x10")
+                __ASM_EMIT("add         %[dst], %[dst], 0x10")
+
+                // 1x blocks
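+                // count is in [-4..-1] here; adding 3 leaves (pixels left - 1),
+                // so b.lt skips the loop only when no pixels remain; each pixel
+                // is broadcast with ld1r and only lane 0 is stored back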
+                __ASM_EMIT("8:")
+                __ASM_EMIT("adds        %[count], %[count], #3")
+                __ASM_EMIT("b.lt        10f")
+                __ASM_EMIT("9:")
+                __ASM_EMIT("ld1r        {v0.4s}, [%[src]]")         // v0 = R G B A
+                __ASM_EMIT("rev32       v8.8h, v0.8h")              // v8 = B A R G
+                __ASM_EMIT("bit         v0.16b, v8.16b, v16.16b")   // v0 = B G R A
+                __ASM_EMIT("st1         {v0.s}[0], [%[dst]]")
+                __ASM_EMIT("add         %[src], %[src], 0x04")
+                __ASM_EMIT("add         %[dst], %[dst], 0x04")
+                __ASM_EMIT("subs        %[count], %[count], #1")
+                __ASM_EMIT("b.ge        9b")
+
+                // End
+                __ASM_EMIT("10:")
+                : [src] "+r" (src), [dst] "+r" (dst),
+                  [count] "+r" (count)
+                : [XC] "r" (&rgba32_to_bgra32_const[0])
+                : "cc", "memory",
+                  "v0", "v1", "v2", "v3",
+                  "v4", "v5", "v6", "v7",
+                  "v8", "v9", "v10", "v11",
+                  "v12", "v13", "v14", "v15",
+                  "v16", "v17"
+            );
+        }
+
         IF_ARCH_AARCH64(
             static const uint32_t abgr32_to_bgrff32_const[] __lsp_aligned32 =
             {
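For reference, the rgba32_to_bgra32 routine above amounts to the following
scalar loop (an illustrative sketch, assuming 8-bit RGBA pixels stored in
memory as bytes R, G, B, A; the _ref name is hypothetical and not part of
the library). Reading all four bytes before writing keeps it safe for
in-place conversion, matching the vector code, which loads whole registers
before storing:

    #include <stddef.h>
    #include <stdint.h>

    static void rgba32_to_bgra32_ref(void *dst, const void *src, size_t count)
    {
        const uint8_t *s = static_cast<const uint8_t *>(src);
        uint8_t *d       = static_cast<uint8_t *>(dst);
        for (size_t i = 0; i < count; ++i, s += 4, d += 4)
        {
            const uint8_t r = s[0], g = s[1], b = s[2], a = s[3];
            d[0] = b;   // B and R trade places
            d[1] = g;
            d[2] = r;
            d[3] = a;   // G and A stay put
        }
    }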