@@ -292,7 +292,7 @@ namespace lsp
292292
293293 : [dst] " +r" (dst), [src] " +r" (v), [count] " +r" (count),
294294 [eff] " +r" (eff)
295- : [XC] " r " (&EFF_HSLA_HUE_XC[ 0 ])
295+ :
296296 : " cc" , " memory" ,
297297 " v0" , " v1" , " v2" , " v3" ,
298298 " v4" , " v5" , " v6" , " v7" ,
@@ -301,6 +301,130 @@ namespace lsp
301301 );
302302 }
303303
/* EFF_HSLA_LIGHT_CORE: processes two vectors of 4 source values (v0 and v1). */
/* On exit v0..v3 = {h, s, L, A} for the values that were in v0 and */
/* v4..v7 = {h, s, L1, A1} for the values that were in v1, i.e. two register groups ready for st4. */
304+ #define EFF_HSLA_LIGHT_CORE \
305+ /* in: v0 = v[0] (first 4 source values, overwritten) */ \
306+ /* in: v1 = v[1] (next 4 source values, overwritten) */ \
307+ /* in: v8 = h */ \
308+ /* in: v9 = s */ \
309+ /* in: v10 = l */ \
310+ /* in: v11 = a (not read by this macro) */ \
311+ /* in: v14 = T */ \
312+ /* in: v15 = KT */ \
313+ __ASM_EMIT (" fabs v5.4s, v1.4s" ) /* v5 = V1 = abs(v1) */ \
314+ __ASM_EMIT (" fabs v1.4s, v0.4s" ) /* v1 = V = abs(v) */ \
315+ __ASM_EMIT (" fsub v3.4s, v14.4s, v1.4s" ) /* v3 = T - V */ \
316+ __ASM_EMIT (" fsub v7.4s, v14.4s, v5.4s" ) /* v7 = T - V1 */ \
317+ __ASM_EMIT (" fcmgt v2.4s, v3.4s, #0.0" ) /* v2 = [(T-V) > 0] */ \
318+ __ASM_EMIT (" fcmgt v6.4s, v7.4s, #0.0" ) /* v6 = [(T-V1) > 0] */ \
319+ __ASM_EMIT (" fmul v3.4s, v3.4s, v15.4s" ) /* v3 = (T-V)*KT */ \
320+ __ASM_EMIT (" fmul v7.4s, v7.4s, v15.4s" ) /* v7 = (T-V1)*KT */ \
321+ __ASM_EMIT (" bit v1.16b, v14.16b, v2.16b" ) /* v1 = EL = V&[(T-V) <= 0] | T&[(T-V) > 0] */ \
322+ __ASM_EMIT (" bit v5.16b, v14.16b, v6.16b" ) /* v5 = EL1 = V1&[(T-V1) <= 0] | T&[(T-V1) > 0] */ \
323+ __ASM_EMIT (" and v3.16b, v3.16b, v2.16b" ) /* v3 = A = ((T-V)*KT) & [(T-V) > 0] */ \
324+ __ASM_EMIT (" and v7.16b, v7.16b, v6.16b" ) /* v7 = A1 = ((T-V1)*KT) & [(T-V1) > 0] */ \
325+ __ASM_EMIT (" fmul v2.4s, v1.4s, v10.4s" ) /* v2 = EL*l = L */ \
326+ __ASM_EMIT (" fmul v6.4s, v5.4s, v10.4s" ) /* v6 = EL1*l = L1 */ \
327+ __ASM_EMIT (" mov v0.16b, v8.16b" ) /* v0 = h (first group) */ \
328+ __ASM_EMIT (" mov v1.16b, v9.16b" ) /* v1 = s (first group) */ \
329+ __ASM_EMIT (" mov v4.16b, v8.16b" ) /* v4 = h (second group) */ \
330+ __ASM_EMIT (" mov v5.16b, v9.16b" ) /* v5 = s (second group) */
331+
332+ /*
333+ kt = 1.0f / eff->thresh;
334+ value = (value >= 0.0f) ? value : -value;
335+
336+ if ((eff->thresh - value) <= 0)
337+ {
338+ dst[2] = eff->l * value;
339+ dst[3] = 0.0f;
340+ }
341+ else
342+ {
343+ dst[2] = eff->l * eff->thresh;
344+ dst[3] = (eff->thresh - value) * kt;
345+ }
346+
347+ dst[0] = eff->h;
348+ dst[1] = eff->s;
349+ */
350+
/**
 * AArch64 ASIMD implementation of the HSLA "light" effect.
 *
 * For every source value four floats are stored to dst in the order
 * (h, s, L, A), where L and A are derived from abs(value) and the threshold.
 *
 * @param dst   destination buffer; the code stores 4 floats per source value
 * @param v     source values; count floats are read
 * @param eff   effect settings; the code reads five consecutive floats from
 *              this address (h, s, l, a at 0x00..0x0c and T at 0x10) -
 *              NOTE(review): confirm this matches dsp::hsla_light_eff_t layout
 * @param count number of source values to process
 */
351+ void eff_hsla_light (float *dst, const float *v, const dsp::hsla_light_eff_t *eff, size_t count)
352+ {
353+ ARCH_AARCH64_ASM
354+ (
355+ __ASM_EMIT (" ld4r {v8.4s, v9.4s, v10.4s, v11.4s}, [%[eff]]" ) /* v8 = h, v9 = s, v10 = l, v11 = a (v11 is loaded but not read below) */
356+ __ASM_EMIT (" add %[eff], %[eff], #0x10" ) /* advance to the fifth float */
357+ __ASM_EMIT (" ld1r {v14.4s}, [%[eff]]" ) /* v14 = T */
358+ __ASM_EMIT (" frecpe v0.4s, v14.4s" ) /* v0 = TD (reciprocal estimate of T) */
359+ __ASM_EMIT (" frecps v1.4s, v0.4s, v14.4s" ) /* v1 = (2 - TD*T) */
360+ __ASM_EMIT (" fmul v0.4s, v1.4s, v0.4s" ) /* v0 = t' = TD * (2 - TD*T) */
361+ __ASM_EMIT (" frecps v1.4s, v0.4s, v14.4s" ) /* v1 = (2 - t'*T) */
362+ __ASM_EMIT (" fmul v15.4s, v1.4s, v0.4s" ) /* v15 = KT ~ 1/T = t' * (2 - t'*T) */
363+
364+ // -----------------------------------------------------------------
365+ // 8x blocks
366+ __ASM_EMIT (" subs %[count], %[count], #8" ) /* count is kept biased by -8 inside the loop */
367+ __ASM_EMIT (" b.lo 2f" ) /* fewer than 8 values: go to the tail */
368+ __ASM_EMIT (" 1:" )
369+ __ASM_EMIT (" ldp q0, q1, [%[src]]" ) /* v0 = v[0], v1 = v[1] */
370+ EFF_HSLA_LIGHT_CORE
371+ __ASM_EMIT (" subs %[count], %[count], #8" ) /* flags are consumed by b.hs below */
372+ __ASM_EMIT (" st4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[dst]]" ) /* interleaved (h, s, L, A) for values 0..3 */
373+ __ASM_EMIT (" add %[dst], %[dst], 0x40" )
374+ __ASM_EMIT (" add %[src], %[src], 0x20" )
375+ __ASM_EMIT (" st4 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[dst]]" ) /* interleaved (h, s, L, A) for values 4..7 */
376+ __ASM_EMIT (" add %[dst], %[dst], 0x40" )
377+ __ASM_EMIT (" b.hs 1b" ) /* loop while at least 8 values remain */
378+
379+ // -----------------------------------------------------------------
380+ // 1x-8x block
381+ __ASM_EMIT (" 2:" )
382+ __ASM_EMIT (" adds %[count], %[count], #8" ) /* undo the -8 bias: count = remaining values */
383+ __ASM_EMIT (" b.ls 14f" ) /* nothing left: done */
384+ __ASM_EMIT (" tst %[count], #4" )
385+ __ASM_EMIT (" b.eq 4f" )
386+ __ASM_EMIT (" ldr q0, [%[src]]" ) /* v0 = 4 values */
387+ __ASM_EMIT (" add %[src], %[src], 0x10" )
388+ __ASM_EMIT (" 4:" )
389+ __ASM_EMIT (" tst %[count], #2" )
390+ __ASM_EMIT (" b.eq 6f" )
391+ __ASM_EMIT (" ld1 {v1.2s}, [%[src]]" ) /* v1 lanes 0..1 = 2 values */
392+ __ASM_EMIT (" add %[src], %[src], 0x08" )
393+ __ASM_EMIT (" 6:" )
394+ __ASM_EMIT (" tst %[count], #1" )
395+ __ASM_EMIT (" b.eq 8f" )
396+ __ASM_EMIT (" ld1 {v1.s}[2], [%[src]]" ) /* v1 lane 2 = last value */
397+ __ASM_EMIT (" 8:" )
398+ EFF_HSLA_LIGHT_CORE
399+ __ASM_EMIT (" tst %[count], #4" )
400+ __ASM_EMIT (" b.eq 10f" )
401+ __ASM_EMIT (" st4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[dst]]" ) /* results for the 4 values loaded into v0 */
402+ __ASM_EMIT (" add %[dst], %[dst], 0x40" )
403+ __ASM_EMIT (" 10:" )
404+ __ASM_EMIT (" tst %[count], #2" )
405+ __ASM_EMIT (" b.eq 12f" )
406+ __ASM_EMIT (" st4 {v4.2s, v5.2s, v6.2s, v7.2s}, [%[dst]]" ) /* results for v1 lanes 0..1 */
407+ __ASM_EMIT (" add %[dst], %[dst], 0x20" )
408+ __ASM_EMIT (" 12:" )
409+ __ASM_EMIT (" tst %[count], #1" )
410+ __ASM_EMIT (" b.eq 14f" )
411+ __ASM_EMIT (" st4 {v4.s, v5.s, v6.s, v7.s}[2], [%[dst]]" ) /* result for v1 lane 2 */
412+ // End
413+ __ASM_EMIT (" 14:" )
414+
415+ : [dst] " +r" (dst), [src] " +r" (v), [count] " +r" (count),
416+ [eff] " +r" (eff)
417+ :
418+ : " cc" , " memory" ,
419+ " v0" , " v1" , " v2" , " v3" ,
420+ " v4" , " v5" , " v6" , " v7" ,
421+ " v8" , " v9" , " v10" , " v11" ,
422+ " v12" , " v13" , " v14" , " v15"
423+ );
424+ }
425+
426+ #undef EFF_HSLA_LIGHT_CORE
427+
304428 } /* namespace asimd */
305429} /* namespace lsp */
306430
0 commit comments