From 2d04d474636346aee7ff15f3a44f9e4acb462424 Mon Sep 17 00:00:00 2001 From: Jean-Marc Valin Date: Tue, 28 May 2024 13:03:10 -0400 Subject: [PATCH] Support 32-bit window and comb filter --- celt/arch.h | 19 +++++++++ celt/celt.c | 79 ++++++++++++++++++------------------ celt/celt.h | 3 +- celt/celt_decoder.c | 8 ++-- celt/celt_lpc.c | 7 ++-- celt/celt_lpc.h | 2 +- celt/dump_modes/dump_modes.c | 12 +++++- celt/fixed_debug.h | 1 + celt/fixed_generic.h | 7 ++++ celt/mdct.c | 24 +++++------ celt/mdct.h | 8 ++-- celt/modes.c | 9 +++- celt/modes.h | 2 +- celt/static_modes_fixed.h | 29 ++++++++++++- celt/tests/test_unit_mdct.c | 8 +++- src/opus_decoder.c | 10 ++--- src/opus_encoder.c | 13 +++--- 17 files changed, 160 insertions(+), 81 deletions(-) diff --git a/celt/arch.h b/celt/arch.h index d23889c05..f7618541f 100644 --- a/celt/arch.h +++ b/celt/arch.h @@ -175,8 +175,20 @@ typedef opus_val16 opus_res; #ifdef ENABLE_QEXT typedef opus_val32 celt_coef; +#define COEF_ONE Q31ONE +#define MULT_COEF_32(a, b) MULT32_32_Q31(a,b) +#define MAC_COEF_32_ARM(c, a, b) ADD32((c), MULT32_32_Q32(a,b)) +#define MULT_COEF(a, b) MULT32_32_Q31(a,b) +#define MULT_COEF_TAPS(a, b) SHL32(MULT16_16(a,b), 1) +#define COEF2VAL16(x) EXTRACT16(SHR32(x, 16)) #else typedef opus_val16 celt_coef; +#define COEF_ONE Q15ONE +#define MULT_COEF_32(a, b) MULT16_32_Q15(a,b) +#define MAC_COEF_32_ARM(a, b, c) MAC16_32_Q16(a,b,c) +#define MULT_COEF(a, b) MULT16_16_Q15(a,b) +#define MULT_COEF_TAPS(a, b) MULT16_16_P15(a,b) +#define COEF2VAL16(x) (x) #endif #define celt_isnan(x) 0 @@ -265,6 +277,8 @@ static OPUS_INLINE int celt_isnan(float x) #define Q15ONE 1.0f #define Q31ONE 1.0f +#define COEF_ONE 1.0f +#define COEF2VAL16(x) (x) #define NORM_SCALING 1.f @@ -321,6 +335,7 @@ static OPUS_INLINE int celt_isnan(float x) #define MAC16_32_Q15(c,a,b) ((c)+(a)*(b)) #define MAC16_32_Q16(c,a,b) ((c)+(a)*(b)) +#define MAC_COEF_32_ARM(c,a,b) ((c)+(a)*(b)) #define MULT16_16_Q11_32(a,b) ((a)*(b)) #define MULT16_16_Q11(a,b) ((a)*(b)) @@ -332,6 +347,10 @@ static OPUS_INLINE int celt_isnan(float x) #define MULT16_16_P14(a,b) ((a)*(b)) #define MULT16_32_P16(a,b) ((a)*(b)) +#define MULT_COEF_32(a, b) ((a)*(b)) +#define MULT_COEF(a, b) ((a)*(b)) +#define MULT_COEF_TAPS(a, b) ((a)*(b)) + #define DIV32_16(a,b) (((opus_val32)(a))/(opus_val16)(b)) #define DIV32(a,b) (((opus_val32)(a))/(opus_val32)(b)) diff --git a/celt/celt.c b/celt/celt.c index 9ce234695..8809b4755 100644 --- a/celt/celt.c +++ b/celt/celt.c @@ -89,6 +89,7 @@ int resampling_factor(opus_int32 rate) return ret; } + #if !defined(OVERRIDE_COMB_FILTER_CONST) || defined(NON_STATIC_COMB_FILTER_CONST_C) /* This version should be faster on ARM */ #ifdef OPUS_ARM_ASM @@ -96,7 +97,7 @@ int resampling_factor(opus_int32 rate) static #endif void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N, - opus_val16 g10, opus_val16 g11, opus_val16 g12) + celt_coef g10, celt_coef g11, celt_coef g12) { opus_val32 x0, x1, x2, x3, x4; int i; @@ -108,33 +109,33 @@ void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N, { opus_val32 t; x0=SHL32(x[i-T+2],1); - t = MAC16_32_Q16(x[i], g10, x2); - t = MAC16_32_Q16(t, g11, ADD32(x1,x3)); - t = MAC16_32_Q16(t, g12, ADD32(x0,x4)); + t = MAC_COEF_32_ARM(x[i], g10, x2); + t = MAC_COEF_32_ARM(t, g11, ADD32(x1,x3)); + t = MAC_COEF_32_ARM(t, g12, ADD32(x0,x4)); t = SATURATE(t, SIG_SAT); y[i] = t; x4=SHL32(x[i-T+3],1); - t = MAC16_32_Q16(x[i+1], g10, x1); - t = MAC16_32_Q16(t, g11, ADD32(x0,x2)); - t = MAC16_32_Q16(t, g12, ADD32(x4,x3)); + t = MAC_COEF_32_ARM(x[i+1], g10, x1); + t = MAC_COEF_32_ARM(t, g11, ADD32(x0,x2)); + t = MAC_COEF_32_ARM(t, g12, ADD32(x4,x3)); t = SATURATE(t, SIG_SAT); y[i+1] = t; x3=SHL32(x[i-T+4],1); - t = MAC16_32_Q16(x[i+2], g10, x0); - t = MAC16_32_Q16(t, g11, ADD32(x4,x1)); - t = MAC16_32_Q16(t, g12, ADD32(x3,x2)); + t = MAC_COEF_32_ARM(x[i+2], g10, x0); + t = MAC_COEF_32_ARM(t, g11, ADD32(x4,x1)); + t = MAC_COEF_32_ARM(t, g12, ADD32(x3,x2)); t = SATURATE(t, SIG_SAT); y[i+2] = t; x2=SHL32(x[i-T+5],1); - t = MAC16_32_Q16(x[i+3], g10, x4); - t = MAC16_32_Q16(t, g11, ADD32(x3,x0)); - t = MAC16_32_Q16(t, g12, ADD32(x2,x1)); + t = MAC_COEF_32_ARM(x[i+3], g10, x4); + t = MAC_COEF_32_ARM(t, g11, ADD32(x3,x0)); + t = MAC_COEF_32_ARM(t, g12, ADD32(x2,x1)); t = SATURATE(t, SIG_SAT); y[i+3] = t; x1=SHL32(x[i-T+6],1); - t = MAC16_32_Q16(x[i+4], g10, x3); - t = MAC16_32_Q16(t, g11, ADD32(x2,x4)); - t = MAC16_32_Q16(t, g12, ADD32(x1,x0)); + t = MAC_COEF_32_ARM(x[i+4], g10, x3); + t = MAC_COEF_32_ARM(t, g11, ADD32(x2,x4)); + t = MAC_COEF_32_ARM(t, g12, ADD32(x1,x0)); t = SATURATE(t, SIG_SAT); y[i+4] = t; } @@ -143,9 +144,9 @@ void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N, { opus_val32 t; x0=SHL32(x[i-T+2],1); - t = MAC16_32_Q16(x[i], g10, x2); - t = MAC16_32_Q16(t, g11, ADD32(x1,x3)); - t = MAC16_32_Q16(t, g12, ADD32(x0,x4)); + t = MAC_COEF_32_ARM(x[i], g10, x2); + t = MAC_COEF_32_ARM(t, g11, ADD32(x1,x3)); + t = MAC_COEF_32_ARM(t, g12, ADD32(x0,x4)); t = SATURATE(t, SIG_SAT); y[i] = t; x4=x3; @@ -160,7 +161,7 @@ void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N, static #endif void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N, - opus_val16 g10, opus_val16 g11, opus_val16 g12) + celt_coef g10, celt_coef g11, celt_coef g12) { opus_val32 x0, x1, x2, x3, x4; int i; @@ -172,9 +173,9 @@ void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N, { x0=x[i-T+2]; y[i] = x[i] - + MULT16_32_Q15(g10,x2) - + MULT16_32_Q15(g11,ADD32(x1,x3)) - + MULT16_32_Q15(g12,ADD32(x0,x4)); + + MULT_COEF_32(g10,x2) + + MULT_COEF_32(g11,ADD32(x1,x3)) + + MULT_COEF_32(g12,ADD32(x0,x4)); y[i] = SATURATE(y[i], SIG_SAT); x4=x3; x3=x2; @@ -189,11 +190,11 @@ void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N, #ifndef OVERRIDE_comb_filter void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N, opus_val16 g0, opus_val16 g1, int tapset0, int tapset1, - const opus_val16 *window, int overlap, int arch) + const celt_coef *window, int overlap, int arch) { int i; /* printf ("%d %d %f %f\n", T0, T1, g0, g1); */ - opus_val16 g00, g01, g02, g10, g11, g12; + celt_coef g00, g01, g02, g10, g11, g12; opus_val32 x0, x1, x2, x3, x4; static const opus_val16 gains[3][3] = { {QCONST16(0.3066406250f, 15), QCONST16(0.2170410156f, 15), QCONST16(0.1296386719f, 15)}, @@ -211,12 +212,12 @@ void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N, to have then be at least 2 to avoid processing garbage data. */ T0 = IMAX(T0, COMBFILTER_MINPERIOD); T1 = IMAX(T1, COMBFILTER_MINPERIOD); - g00 = MULT16_16_P15(g0, gains[tapset0][0]); - g01 = MULT16_16_P15(g0, gains[tapset0][1]); - g02 = MULT16_16_P15(g0, gains[tapset0][2]); - g10 = MULT16_16_P15(g1, gains[tapset1][0]); - g11 = MULT16_16_P15(g1, gains[tapset1][1]); - g12 = MULT16_16_P15(g1, gains[tapset1][2]); + g00 = MULT_COEF_TAPS(g0, gains[tapset0][0]); + g01 = MULT_COEF_TAPS(g0, gains[tapset0][1]); + g02 = MULT_COEF_TAPS(g0, gains[tapset0][2]); + g10 = MULT_COEF_TAPS(g1, gains[tapset1][0]); + g11 = MULT_COEF_TAPS(g1, gains[tapset1][1]); + g12 = MULT_COEF_TAPS(g1, gains[tapset1][2]); x1 = x[-T1+1]; x2 = x[-T1 ]; x3 = x[-T1-1]; @@ -226,16 +227,16 @@ void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N, overlap=0; for (i=0;iwindow[i], etmp[overlap-1-i]) - + MULT16_32_Q15(mode->window[overlap-i-1], etmp[i]); + MULT16_32_Q15(COEF2VAL16(mode->window[i]), etmp[overlap-1-i]) + + MULT16_32_Q15 (COEF2VAL16(mode->window[overlap-i-1]), etmp[i]); } } while (++coverlap); fprintf(file, "#define DEF_WINDOW%d\n", mode->overlap); - fprintf (file, "static const opus_val16 window%d[%d] = {\n", mode->overlap, mode->overlap); + fprintf (file, "static const celt_coef window%d[%d] = {\n", mode->overlap, mode->overlap); +#if defined(FIXED_POINT) && defined(ENABLE_QEXT) + fprintf(file, "#ifdef ENABLE_QEXT\n"); + for (j=0;joverlap;j++) + fprintf (file, WORD32 ",%c", mode->window[j],(j+6)%5==0?'\n':' '); + fprintf(file, "#else\n"); + for (j=0;joverlap;j++) + fprintf (file, WORD16 ",%c", COEF16(mode->window[j], 16),(j+6)%5==0?'\n':' '); + fprintf(file, "#endif\n"); +#else for (j=0;joverlap;j++) fprintf (file, WORD16 ",%c", mode->window[j],(j+6)%5==0?'\n':' '); +#endif fprintf (file, "};\n"); fprintf(file, "#endif\n"); fprintf(file, "\n"); diff --git a/celt/fixed_debug.h b/celt/fixed_debug.h index 89a011d0f..9731168de 100644 --- a/celt/fixed_debug.h +++ b/celt/fixed_debug.h @@ -43,6 +43,7 @@ extern opus_int64 celt_mips; #define MULT16_16SU(a,b) ((opus_val32)(opus_val16)(a)*(opus_val32)(opus_uint16)(b)) #define MULT32_32_Q31(a,b) ADD32(ADD32(SHL32(MULT16_16(SHR32((a),16),SHR((b),16)),1), SHR32(MULT16_16SU(SHR32((a),16),((b)&0x0000ffff)),15)), SHR32(MULT16_16SU(SHR32((b),16),((a)&0x0000ffff)),15)) +#define MULT32_32_Q32(a,b) ADD32(ADD32(MULT16_16(SHR((a),16),SHR((b),16)), SHR(MULT16_16SU(SHR((a),16),((b)&0x0000ffff)),16)), SHR(MULT16_16SU(SHR((b),16),((a)&0x0000ffff)),16)) /** 16x32 multiplication, followed by a 16-bit shift right. Results fits in 32 bits */ #define MULT16_32_Q16(a,b) ADD32(MULT16_16((a),SHR32((b),16)), SHR32(MULT16_16SU((a),((b)&0x0000ffff)),16)) diff --git a/celt/fixed_generic.h b/celt/fixed_generic.h index a8988889d..dc6c93e7a 100644 --- a/celt/fixed_generic.h +++ b/celt/fixed_generic.h @@ -71,6 +71,13 @@ #define MULT32_32_Q31(a,b) ADD32(ADD32(SHL(MULT16_16(SHR((a),16),SHR((b),16)),1), SHR(MULT16_16SU(SHR((a),16),((b)&0x0000ffff)),15)), SHR(MULT16_16SU(SHR((b),16),((a)&0x0000ffff)),15)) #endif +/** 32x32 multiplication, followed by a 32-bit shift right. Results fits in 32 bits */ +#if OPUS_FAST_INT64 +#define MULT32_32_Q32(a,b) ((opus_val32)SHR((opus_int64)(a)*(opus_int64)(b),32)) +#else +#define MULT32_32_Q32(a,b) ADD32(ADD32(MULT16_16(SHR((a),16),SHR((b),16)), SHR(MULT16_16SU(SHR((a),16),((b)&0x0000ffff)),16)), SHR(MULT16_16SU(SHR((b),16),((a)&0x0000ffff)),16)) +#endif + /** Compile-time conversion of float constant to 16-bit value */ #define QCONST16(x,bits) ((opus_val16)(.5+(x)*(((opus_val32)1)<<(bits)))) diff --git a/celt/mdct.c b/celt/mdct.c index 94129158b..4e8fc242f 100644 --- a/celt/mdct.c +++ b/celt/mdct.c @@ -120,7 +120,7 @@ void clt_mdct_clear(mdct_lookup *l, int arch) /* Forward MDCT trashes the input array */ #ifndef OVERRIDE_clt_mdct_forward void clt_mdct_forward_c(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out, - const opus_val16 *window, int overlap, int shift, int stride, int arch) + const celt_coef *window, int overlap, int shift, int stride, int arch) { int i; int N, N2, N4; @@ -159,13 +159,13 @@ void clt_mdct_forward_c(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scal const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1); const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1); kiss_fft_scalar * OPUS_RESTRICT yp = f; - const opus_val16 * OPUS_RESTRICT wp1 = window+(overlap>>1); - const opus_val16 * OPUS_RESTRICT wp2 = window+(overlap>>1)-1; + const celt_coef * OPUS_RESTRICT wp1 = window+(overlap>>1); + const celt_coef * OPUS_RESTRICT wp2 = window+(overlap>>1)-1; for(i=0;i<((overlap+3)>>2);i++) { /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/ - *yp++ = MULT16_32_Q15(*wp2, xp1[N2]) + MULT16_32_Q15(*wp1,*xp2); - *yp++ = MULT16_32_Q15(*wp1, *xp1) - MULT16_32_Q15(*wp2, xp2[-N2]); + *yp++ = S_MUL(xp1[N2], *wp2) + S_MUL(*xp2, *wp1); + *yp++ = S_MUL(*xp1, *wp1) - S_MUL(xp2[-N2], *wp2); xp1+=2; xp2-=2; wp1+=2; @@ -184,8 +184,8 @@ void clt_mdct_forward_c(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scal for(;iallocVectors==NULL) goto failure; - window = (opus_val16*)opus_alloc(mode->overlap*sizeof(opus_val16)); + window = (celt_coef*)opus_alloc(mode->overlap*sizeof(*window)); if (window==NULL) goto failure; @@ -378,8 +378,13 @@ CELTMode *opus_custom_mode_create(opus_int32 Fs, int frame_size, int *error) for (i=0;ioverlap;i++) window[i] = Q15ONE*sin(.5*M_PI* sin(.5*M_PI*(i+.5)/mode->overlap) * sin(.5*M_PI*(i+.5)/mode->overlap)); #else +# ifdef ENABLE_QEXT + for (i=0;ioverlap;i++) + window[i] = 2147483647*sin(.5*M_PI* sin(.5*M_PI*(i+.5)/mode->overlap) * sin(.5*M_PI*(i+.5)/mode->overlap)); +# else for (i=0;ioverlap;i++) window[i] = MIN32(32767,floor(.5+32768.*sin(.5*M_PI* sin(.5*M_PI*(i+.5)/mode->overlap) * sin(.5*M_PI*(i+.5)/mode->overlap)))); +# endif #endif mode->window = window; diff --git a/celt/modes.h b/celt/modes.h index be813ccc8..997257d60 100644 --- a/celt/modes.h +++ b/celt/modes.h @@ -66,7 +66,7 @@ struct OpusCustomMode { const unsigned char *allocVectors; /**< Number of bits in each band for several rates */ const opus_int16 *logN; - const opus_val16 *window; + const celt_coef *window; mdct_lookup mdct; PulseCache cache; }; diff --git a/celt/static_modes_fixed.h b/celt/static_modes_fixed.h index 0d885a510..49f94b596 100644 --- a/celt/static_modes_fixed.h +++ b/celt/static_modes_fixed.h @@ -11,7 +11,33 @@ #ifndef DEF_WINDOW120 #define DEF_WINDOW120 -static const opus_val16 window120[120] = { +static const celt_coef window120[120] = { +#ifdef ENABLE_QEXT +144497, 1300330, 3611201, 7075520, 11690888, +17454086, 24361057, 32406886, 41585775, 51891010, +63314937, 75848919, 89483305, 104207389, 120009370, +136876310, 154794092, 173747378, 193719571, 214692768, +236647730, 259563841, 283419076, 308189974, 333851610, +360377579, 387739975, 415909390, 444854905, 474544098, +504943052, 536016380, 567727246, 600037405, 632907246, +666295841, 700161014, 734459402, 769146541, 804176949, +839504226, 875081151, 910859801, 946791664, 982827766, +1018918806, 1055015289, 1091067669, 1127026498, 1162842572, +1198467087, 1233851789, 1268949131, 1303712427, 1338096005, +1372055357, 1405547287, 1438530057, 1470963523, 1502809271, +1534030739, 1564593342, 1594464576, 1623614127, 1652013955, +1679638381, 1706464157, 1732470523, 1757639262, 1781954728, +1805403878, 1827976281, 1849664119, 1870462176, 1890367815, +1909380945, 1927503971, 1944741740, 1961101474, 1976592691, +1991227121, 2005018606, 2017983003, 2030138066, 2041503334, +2052100005, 2061950805, 2071079860, 2079512552, 2087275383, +2094395834, 2100902217, 2106823531, 2112189320, 2117029526, +2121374346, 2125254091, 2128699048, 2131739342, 2134404803, +2136724837, 2138728300, 2140443379, 2141897477, 2143117096, +2144127739, 2144953806, 2145618501, 2146143740, 2146550076, +2146856617, 2147080957, 2147239112, 2147345466, 2147412715, +2147451824, 2147471990, 2147480610, 2147483253, 2147483642, +#else 2, 20, 55, 108, 178, 266, 372, 494, 635, 792, 966, 1157, 1365, 1590, 1831, @@ -36,6 +62,7 @@ static const opus_val16 window120[120] = { 32717, 32729, 32740, 32748, 32754, 32758, 32762, 32764, 32766, 32767, 32767, 32767, 32767, 32767, 32767, +#endif }; #endif diff --git a/celt/tests/test_unit_mdct.c b/celt/tests/test_unit_mdct.c index 70dc042ef..00b3840de 100644 --- a/celt/tests/test_unit_mdct.c +++ b/celt/tests/test_unit_mdct.c @@ -109,7 +109,7 @@ void test1d(int nfft,int isinverse,int arch) kiss_fft_scalar *in; kiss_fft_scalar *in_copy; kiss_fft_scalar *out; - opus_val16 *window; + celt_coef *window; int k; #ifdef CUSTOM_MODES @@ -133,14 +133,18 @@ void test1d(int nfft,int isinverse,int arch) in = (kiss_fft_scalar*)malloc(buflen); in_copy = (kiss_fft_scalar*)malloc(buflen); out = (kiss_fft_scalar*)malloc(buflen); - window = (opus_val16*)malloc(sizeof(opus_val16)*nfft/2); + window = (celt_coef*)malloc(sizeof(*window)*nfft/2); for (k=0;k