diff --git a/src/audio/eq_fir/eq_fir_hifi2ep.c b/src/audio/eq_fir/eq_fir_hifi2ep.c index 2a687623ea51..71b40edcee32 100644 --- a/src/audio/eq_fir/eq_fir_hifi2ep.c +++ b/src/audio/eq_fir/eq_fir_hifi2ep.c @@ -61,8 +61,7 @@ void eq_fir_2x_s32(struct fir_state_32x16 fir[], struct input_stream_buffer *bso for (i = 0; i < (frames >> 1); i++) { x1 = x0 + nch; y1 = y0 + nch; - fir_32x16_2x_hifiep(f, *x0, *x1, y0, y1, - lshift, rshift); + fir_32x16_2x(f, *x0, *x1, y0, y1, lshift, rshift); x0 += inc; y0 += inc; } @@ -107,8 +106,7 @@ void eq_fir_2x_s24(struct fir_state_32x16 fir[], struct input_stream_buffer *bso for (i = 0; i < (frames >> 1); i++) { x1 = x0 + nch; y1 = y0 + nch; - fir_32x16_2x_hifiep(f, *x0 << 8, *x1 << 8, &z0, &z1, - lshift, rshift); + fir_32x16_2x(f, *x0 << 8, *x1 << 8, &z0, &z1, lshift, rshift); *y0 = sat_int24(Q_SHIFT_RND(z0, 31, 23)); *y1 = sat_int24(Q_SHIFT_RND(z1, 31, 23)); x0 += inc; @@ -155,8 +153,7 @@ void eq_fir_2x_s16(struct fir_state_32x16 fir[], struct input_stream_buffer *bso for (i = 0; i < (frames >> 1); i++) { x1 = x0 + nch; y1 = y0 + nch; - fir_32x16_2x_hifiep(f, *x0 << 16, *x1 << 16, &z0, &z1, - lshift, rshift); + fir_32x16_2x(f, *x0 << 16, *x1 << 16, &z0, &z1, lshift, rshift); *y0 = sat_int16(Q_SHIFT_RND(z0, 31, 15)); *y1 = sat_int16(Q_SHIFT_RND(z1, 31, 15)); x0 += inc; diff --git a/src/audio/eq_fir/eq_fir_hifi3.c b/src/audio/eq_fir/eq_fir_hifi3.c index 638b296a74c4..0d25851ea447 100644 --- a/src/audio/eq_fir/eq_fir_hifi3.c +++ b/src/audio/eq_fir/eq_fir_hifi3.c @@ -70,7 +70,7 @@ void eq_fir_2x_s32(struct fir_state_32x16 fir[], struct input_stream_buffer *bso /* Load two input samples via input pointer x */ AE_L32_XP(d0, x, inc_nch_s); AE_L32_XP(d1, x, inc_nch_s); - fir_32x16_2x_hifi3(f, d0, d1, y0, y1, shift); + fir_32x16_2x(f, d0, d1, y0, y1, shift); AE_L32_XC(d0, y0, inc_2nch_s); AE_L32_XC(d1, y1, inc_2nch_s); } @@ -131,7 +131,7 @@ void eq_fir_2x_s24(struct fir_state_32x16 fir[], struct input_stream_buffer *bso d0 = AE_SLAA32(d0, 8); 
d1 = AE_SLAA32(d1, 8); - fir_32x16_2x_hifi3(f, d0, d1, &z0, &z1, shift); + fir_32x16_2x(f, d0, d1, &z0, &z1, shift); /* Shift and round to Q1.23 format */ d0 = AE_SRAI32R(z0, 8); @@ -205,7 +205,7 @@ void eq_fir_2x_s16(struct fir_state_32x16 fir[], struct input_stream_buffer *bso x0 = AE_CVT32X2F16_32(d0); x1 = AE_CVT32X2F16_32(d1); - fir_32x16_2x_hifi3(f, x0, x1, &z0, &z1, shift); + fir_32x16_2x(f, x0, x1, &z0, &z1, shift); /* Round to Q1.15 format */ d0 = AE_ROUND16X4F32SSYM(z0, z0); diff --git a/src/audio/tdfb/tdfb_hifi3.c b/src/audio/tdfb/tdfb_hifi3.c index 77225ceefbeb..f5484d337395 100644 --- a/src/audio/tdfb/tdfb_hifi3.c +++ b/src/audio/tdfb/tdfb_hifi3.c @@ -76,8 +76,7 @@ void tdfb_fir_s16(struct tdfb_comp_data *cd, struct input_stream_buffer *bsource /* Compute FIR and mix as Q5.27*/ fir_core_setup_circular(f); - fir_32x16_2x_hifi3(f, cd->in[is], cd->in[is2], &y0, &y1, - shift); + fir_32x16_2x(f, cd->in[is], cd->in[is2], &y0, &y1, shift); for (k = 0; k < out_nch; k++) { if (om & 1) { cd->out[k] += (int32_t)y0 >> 4; @@ -167,8 +166,7 @@ void tdfb_fir_s24(struct tdfb_comp_data *cd, struct input_stream_buffer *bsource /* Compute FIR and mix as Q5.27*/ fir_core_setup_circular(f); - fir_32x16_2x_hifi3(f, cd->in[is], cd->in[is2], &y0, &y1, - shift); + fir_32x16_2x(f, cd->in[is], cd->in[is2], &y0, &y1, shift); for (k = 0; k < out_nch; k++) { if (om & 1) { cd->out[k] += (int32_t)y0 >> 4; @@ -257,8 +255,7 @@ void tdfb_fir_s32(struct tdfb_comp_data *cd, struct input_stream_buffer *bsource /* Compute FIR and mix as Q5.27*/ fir_core_setup_circular(f); - fir_32x16_2x_hifi3(f, cd->in[is], cd->in[is2], &y0, &y1, - shift); + fir_32x16_2x(f, cd->in[is], cd->in[is2], &y0, &y1, shift); for (k = 0; k < out_nch; k++) { if (om & 1) { cd->out[k] += (int32_t)y0 >> 4; diff --git a/src/audio/tdfb/tdfb_hifiep.c b/src/audio/tdfb/tdfb_hifiep.c index 1a8cdc4e397a..d6fea124b542 100644 --- a/src/audio/tdfb/tdfb_hifiep.c +++ b/src/audio/tdfb/tdfb_hifiep.c @@ -42,7 +42,7 @@ static inline void 
tdfb_core(struct tdfb_comp_data *cd, int in_nch, int out_nch) fir_hifiep_setup_circular(f); fir_get_lrshifts(f, &lshift, &rshift); /* Process two samples */ - fir_32x16_2x_hifiep(f, cd->in[is], cd->in[is2], &y0, &y1, lshift, rshift); + fir_32x16_2x(f, cd->in[is], cd->in[is2], &y0, &y1, lshift, rshift); /* Mix as Q5.27 */ for (k = 0; k < out_nch; k++) { if (om & 1) { diff --git a/src/include/sof/math/fir_hifi2ep.h b/src/include/sof/math/fir_hifi2ep.h index 85561c770c23..f3625104b3dc 100644 --- a/src/include/sof/math/fir_hifi2ep.h +++ b/src/include/sof/math/fir_hifi2ep.h @@ -30,7 +30,6 @@ struct fir_state_32x16 { ae_p16x2s *coef; /* Pointer to FIR coefficients */ int taps; /* Number of FIR taps */ int length; /* Number of FIR taps plus input length (even) */ - int in_shift; /* Amount of right shifts at input */ int out_shift; /* Amount of right shifts at output */ }; @@ -53,10 +52,10 @@ static inline void fir_hifiep_setup_circular(struct fir_state_32x16 *fir) void fir_get_lrshifts(struct fir_state_32x16 *fir, int *lshift, int *rshift); -void fir_32x16_hifiep(struct fir_state_32x16 *fir, int32_t x, int32_t *y, int lshift, int rshift); +void fir_32x16(struct fir_state_32x16 *fir, int32_t x, int32_t *y, int lshift, int rshift); -void fir_32x16_2x_hifiep(struct fir_state_32x16 *fir, int32_t x0, int32_t x1, - int32_t *y0, int32_t *y1, int lshift, int rshift); +void fir_32x16_2x(struct fir_state_32x16 *fir, int32_t x0, int32_t x1, + int32_t *y0, int32_t *y1, int lshift, int rshift); #endif #endif /* __SOF_MATH_FIR_HIFI2EP_H__ */ diff --git a/src/include/sof/math/fir_hifi3.h b/src/include/sof/math/fir_hifi3.h index 1a9e626247d7..abcf46df5f6f 100644 --- a/src/include/sof/math/fir_hifi3.h +++ b/src/include/sof/math/fir_hifi3.h @@ -28,7 +28,6 @@ struct fir_state_32x16 { ae_f16x4 *coef; /* Pointer to FIR coefficients */ int taps; /* Number of FIR taps */ int length; /* Number of FIR taps plus input length (even) */ - int in_shift; /* Amount of right shifts at input */ int 
out_shift; /* Amount of right shifts at output */ }; @@ -55,14 +54,12 @@ static inline void fir_comp_setup_circular(const struct audio_stream *buffer) AE_SETCEND0(audio_stream_get_end_addr(buffer)); } -void fir_get_lrshifts(struct fir_state_32x16 *fir, int *lshift, - int *rshift); +void fir_get_lrshifts(struct fir_state_32x16 *fir, int *lshift, int *rshift); -void fir_32x16_hifi3(struct fir_state_32x16 *fir, ae_int32 x, ae_int32 *y, - int shift); +void fir_32x16(struct fir_state_32x16 *fir, ae_int32 x, ae_int32 *y, int shift); -void fir_32x16_2x_hifi3(struct fir_state_32x16 *fir, ae_int32 x0, ae_int32 x1, - ae_int32 *y0, ae_int32 *y1, int shift); +void fir_32x16_2x(struct fir_state_32x16 *fir, ae_int32 x0, ae_int32 x1, + ae_int32 *y0, ae_int32 *y1, int shift); #endif #endif /* __SOF_MATH_FIR_HIFI3_H__ */ diff --git a/src/math/CMakeLists.txt b/src/math/CMakeLists.txt index 9b506eb0b85e..ef0c409b69a6 100644 --- a/src/math/CMakeLists.txt +++ b/src/math/CMakeLists.txt @@ -29,7 +29,7 @@ add_local_sources_ifdef(CONFIG_POWER_FIXED sof power.c) add_local_sources_ifdef(CONFIG_BINARY_LOGARITHM_FIXED sof base2log.c) -add_local_sources_ifdef(CONFIG_MATH_FIR sof fir_generic.c fir_hifi2ep.c fir_hifi3.c) +add_local_sources_ifdef(CONFIG_MATH_FIR sof fir_generic.c fir_hifi2ep.c fir_hifi3.c fir_hifi5.c) if(CONFIG_MATH_FFT) add_subdirectory(fft) diff --git a/src/math/fir_hifi2ep.c b/src/math/fir_hifi2ep.c index 2172e3e98d53..1dd03d7c1485 100644 --- a/src/math/fir_hifi2ep.c +++ b/src/math/fir_hifi2ep.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: BSD-3-Clause // -// Copyright(c) 2017 Intel Corporation. All rights reserved. +// Copyright(c) 2017-2025 Intel Corporation. 
// // Author: Seppo Ingalsuo @@ -89,7 +89,7 @@ EXPORT_SYMBOL(fir_get_lrshifts); * 8x 48 bit registers in register file P */ -void fir_32x16_hifiep(struct fir_state_32x16 *fir, int32_t x, int32_t *y, int lshift, int rshift) +void fir_32x16(struct fir_state_32x16 *fir, int32_t x, int32_t *y, int lshift, int rshift) { /* This function uses * 1x 56 bit registers Q, @@ -163,8 +163,8 @@ EXPORT_SYMBOL(fir_32x16_hifiep); * 8x 48 bit registers in register file P */ -void fir_32x16_2x_hifiep(struct fir_state_32x16 *fir, int32_t x0, int32_t x1, - int32_t *y0, int32_t *y1, int lshift, int rshift) +void fir_32x16_2x(struct fir_state_32x16 *fir, int32_t x0, int32_t x1, + int32_t *y0, int32_t *y1, int lshift, int rshift) { /* This function uses * 2x 56 bit registers Q, diff --git a/src/math/fir_hifi3.c b/src/math/fir_hifi3.c index a485bf235f79..dd3e0edf8bde 100644 --- a/src/math/fir_hifi3.c +++ b/src/math/fir_hifi3.c @@ -1,13 +1,13 @@ // SPDX-License-Identifier: BSD-3-Clause // -// Copyright(c) 2017 Intel Corporation. All rights reserved. +// Copyright(c) 2017-2025 Intel Corporation. 
// // Author: Seppo Ingalsuo #include #include -#if SOF_USE_MIN_HIFI(3, FILTER) +#if SOF_USE_HIFI(3, FILTER) || SOF_USE_HIFI(4, FILTER) #include #include @@ -90,8 +90,7 @@ EXPORT_SYMBOL(fir_get_lrshifts); * 8x 48 bit registers in register file P */ -void fir_32x16_hifi3(struct fir_state_32x16 *fir, ae_int32 x, ae_int32 *y, - int shift) +void fir_32x16(struct fir_state_32x16 *fir, ae_int32 x, ae_int32 *y, int shift) { /* This function uses * 1x 56 bit registers Q, @@ -162,15 +161,15 @@ void fir_32x16_hifi3(struct fir_state_32x16 *fir, ae_int32 x, ae_int32 *y, a = AE_SLAA64S(a, shift); AE_S32_L_I(AE_ROUND32F48SSYM(a), (ae_int32 *)y, 0); } -EXPORT_SYMBOL(fir_32x16_hifi3); +EXPORT_SYMBOL(fir_32x16); /* HiFi EP has the follow number of reqisters that should not be exceeded * 4x 56 bit registers in register file Q * 8x 48 bit registers in register file P */ -void fir_32x16_2x_hifi3(struct fir_state_32x16 *fir, ae_int32 x0, ae_int32 x1, - ae_int32 *y0, ae_int32 *y1, int shift) +void fir_32x16_2x(struct fir_state_32x16 *fir, ae_int32 x0, ae_int32 x1, + ae_int32 *y0, ae_int32 *y1, int shift) { /* This function uses * 2x 56 bit registers Q, @@ -252,6 +251,6 @@ void fir_32x16_2x_hifi3(struct fir_state_32x16 *fir, ae_int32 x0, ae_int32 x1, AE_S32_L_I(AE_ROUND32F48SSYM(b), (ae_int32 *)y1, 0); AE_S32_L_I(AE_ROUND32F48SSYM(a), (ae_int32 *)y0, 0); } -EXPORT_SYMBOL(fir_32x16_2x_hifi3); +EXPORT_SYMBOL(fir_32x16_2x); #endif diff --git a/src/math/fir_hifi5.c b/src/math/fir_hifi5.c new file mode 100644 index 000000000000..d4b699ceabb2 --- /dev/null +++ b/src/math/fir_hifi5.c @@ -0,0 +1,239 @@ +// SPDX-License-Identifier: BSD-3-Clause +// +// Copyright(c) 2017-2025 Intel Corporation. 
+// +// Author: Seppo Ingalsuo + +#include +#include + +#if SOF_USE_MIN_HIFI(5, FILTER) + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * EQ FIR algorithm code + */ + +void fir_reset(struct fir_state_32x16 *fir) +{ + fir->taps = 0; + fir->length = 0; + fir->out_shift = 0; + fir->coef = NULL; + /* The start of the dynamic allocation may still be needed after + * reset, so fir->delay is intentionally not set to NULL here. + */ +} +EXPORT_SYMBOL(fir_reset); + +int fir_delay_size(struct sof_fir_coef_data *config) +{ + /* Check FIR tap count for implementation specific constraints */ + if (config->length > SOF_FIR_MAX_LENGTH || config->length < 4) + return -EINVAL; + + /* The optimization requires the tap count to be multiple of four */ + if (config->length & 0x3) + return -EINVAL; + + /* The dual sample version needs one more delay entry. To preserve + * 64 bit alignment two entries are added. + */ + return (config->length + 2) * sizeof(int32_t); +} +EXPORT_SYMBOL(fir_delay_size); + +int fir_init_coef(struct fir_state_32x16 *fir, + struct sof_fir_coef_data *config) +{ + /* The length is taps plus two since the filter computes two + * samples per call. Taps plus one would be the minimum but the addition + * must be even. The even length is needed for 64 bit loads from delay + * lines with 32 bit samples. + */ + fir->taps = (int)config->length; + fir->length = fir->taps + 2; + fir->out_shift = (int)config->out_shift; + fir->coef = (ae_f16x4 *)&config->coef[0]; + return 0; +} +EXPORT_SYMBOL(fir_init_coef); + +void fir_init_delay(struct fir_state_32x16 *fir, int32_t **data) +{ + fir->delay = (ae_int32 *)*data; + fir->delay_end = fir->delay + fir->length; + fir->rwp = (ae_int32 *)(fir->delay + fir->length - 1); + *data += fir->length; /* Point to next delay line start */ +} +EXPORT_SYMBOL(fir_init_delay); + +void fir_get_lrshifts(struct fir_state_32x16 *fir, int *lshift, + int *rshift) +{ + *lshift = (fir->out_shift < 0) ? 
-fir->out_shift : 0; + *rshift = (fir->out_shift > 0) ? fir->out_shift : 0; +} +EXPORT_SYMBOL(fir_get_lrshifts); + +void fir_32x16(struct fir_state_32x16 *fir, ae_int32 x, ae_int32 *y, int shift) +{ + /* This function uses + * 1x 56 bit registers Q, + * 4x 48 bit registers P + * 3x integers + * 2x address pointers, + */ + ae_f64 a; + ae_valign u; + ae_f32x2 data2; + ae_f16x4 coefs; + ae_f32x2 d0; + ae_f32x2 d1; + int i; + ae_int32 *dp = fir->rwp; + ae_int16x4 *coefp = (ae_int16x4 *)fir->coef; + const int taps_div_4 = fir->taps >> 2; + const int inc = sizeof(int32_t); + + /* Bypass samples if taps count is less than four. */ + if (!taps_div_4) { + *y = x; + return; + } + + /* Write sample to delay */ + AE_S32_L_XC(x, fir->rwp, -sizeof(int32_t)); + + /* Prime the coefficients stream */ + u = AE_LA64_PP(coefp); + + /* Note: If the next function is converted to handle two samples + * per call the data load can be done with single instruction + * AE_LP24X2F_C(data2, dp, sizeof(ae_p24x2f)); + */ + a = AE_ZEROQ56(); + for (i = 0; i < taps_div_4; i++) { + /* Load four coefficients. Coef_3 contains tap h[n], + * coef_2 contains h[n+1], coef_1 contains h[n+2], and + * coef_0 contains h[n+3]; + */ + AE_LA16X4_IP(coefs, u, coefp); + + /* Load two data samples and pack d0 to data2_h and + * d1 to data2_l. + */ + AE_L32_XC(d0, dp, inc); + AE_L32_XC(d1, dp, inc); + data2 = AE_SEL32_LL(d0, d1); + + /* Accumulate + * a += data2_h * coefs_3 + data2_l * coefs_2. The Q1.31 + * data and Q1.15 coefficients are used as 24 bits as + * Q1.23 values. + */ + AE_MULAAFD32X16_H3_L2(a, data2, coefs); + + /* Repeat the same for next two taps and increase coefp. + * a += data2_h * coefs_1 + data2_l * coefs_0. + */ + AE_L32_XC(d0, dp, inc); + AE_L32_XC(d1, dp, inc); + data2 = AE_SEL32_LL(d0, d1); + AE_MULAAFD32X16_H1_L0(a, data2, coefs); + } + + /* Do scaling shifts and store sample. 
 */ + a = AE_SLAA64S(a, shift); + AE_S32_L_I(AE_ROUND32F48SSYM(a), (ae_int32 *)y, 0); +} +EXPORT_SYMBOL(fir_32x16); + +void fir_32x16_2x(struct fir_state_32x16 *fir, ae_int32 x0, ae_int32 x1, + ae_int32 *y0, ae_int32 *y1, int shift) +{ + /* This function uses + * 7x 64 bit AE registers + * 3x integers + * 2x address pointers, + */ + ae_valign u; + ae_f64 a = AE_ZERO64(); + ae_f64 b = AE_ZERO64(); + ae_f32x2 d0; + ae_f32x2 d1; + ae_f32x2 d2; + ae_f16x4 coefs; + ae_f32x2 *dp; + ae_f16x4 *coefp = fir->coef; + const int taps_div_4 = fir->taps >> 2; + const int inc = 2 * sizeof(int32_t); + int i; + + /* Bypass samples if taps count is less than four. */ + if (!taps_div_4) { + *y0 = x0; + *y1 = x1; + return; + } + + /* Write samples to delay */ + AE_S32_L_XC(x0, fir->rwp, -sizeof(int32_t)); + dp = (ae_f32x2 *)fir->rwp; + AE_S32_L_XC(x1, fir->rwp, -sizeof(int32_t)); + + /* Prime the coefficients stream */ + u = AE_LA64_PP(coefp); + + /* Load the two newest samples first and then proceed + * to older input samples in delay line. + */ + AE_L32X2_XC(d0, dp, inc); + for (i = 0; i < taps_div_4; i++) { + /* Load four coefficients. Coef_3 contains tap h[n], + * coef_2 contains h[n+1], coef_1 contains h[n+2], and + * coef_0 contains h[n+3]; + */ + AE_LA16X4_IP(coefs, u, coefp); + + /* Load two more sample pairs. + * d0.H is x[n] the newest sample + * d0.L is x[n-1] + * d1.H is x[n-2] + * d1.L is x[n-3] + * d2.H is x[n-4] + */ + AE_L32X2_XC(d1, dp, inc); + AE_L32X2_XC(d2, dp, inc); + + /* Calculate four FIR taps for current (x1 -> b) and previous input (x0 -> a) + * b = b + d0.H * c.3 + d0.L * c.2 + d1.H * c.1 + d1.L * c.0 + * a = a + d0.L * c.3 + d1.H * c.2 + d1.L * c.1 + d2.H * c.0 + */ + AE_MULA2Q32X16_FIR_H(b, a, d0, d1, d2, coefs); + + /* Prepare for next four taps, d2 overlaps to next loop iteration as d0 */ + d0 = d2; + } + + /* Shift left by one Q1.31 x Q1.15 -> Q2.46 format for Q2.47 round and + * store output samples. 
+ */ + b = AE_SLAA64S(b, shift + 1); + a = AE_SLAA64S(a, shift + 1); + d0 = AE_ROUND32X2F48SASYM(b, a); + AE_S32_H_I(d0, (ae_int32 *)y1, 0); + AE_S32_L_I(d0, (ae_int32 *)y0, 0); +} +EXPORT_SYMBOL(fir_32x16_2x); + +#endif diff --git a/src/math/fir_llext/CMakeLists.txt b/src/math/fir_llext/CMakeLists.txt index 9b8d2531bea3..39bb2d8ce89b 100644 --- a/src/math/fir_llext/CMakeLists.txt +++ b/src/math/fir_llext/CMakeLists.txt @@ -6,5 +6,6 @@ sof_llext_build("fir" ../fir_generic.c ../fir_hifi2ep.c ../fir_hifi3.c + ../fir_hifi5.c LIB openmodules ) diff --git a/zephyr/CMakeLists.txt b/zephyr/CMakeLists.txt index 54face8b974d..b387fc011de1 100644 --- a/zephyr/CMakeLists.txt +++ b/zephyr/CMakeLists.txt @@ -691,6 +691,7 @@ elseif(CONFIG_MATH_FIR) ${SOF_MATH_PATH}/fir_generic.c ${SOF_MATH_PATH}/fir_hifi2ep.c ${SOF_MATH_PATH}/fir_hifi3.c + ${SOF_MATH_PATH}/fir_hifi5.c ) endif()