Support 32-bit window and comb filter

xiph · Dec 19, 2024 · commit 2d04d47 (1 parent: 5b0d1f8)
Show file tree

Hide file tree

Showing 17 changed files with 160 additions and 81 deletions.
diff --git a/celt/arch.h b/celt/arch.h
@@ -175,8 +175,20 @@ typedef opus_val16 opus_res;
 
 #ifdef ENABLE_QEXT
 typedef opus_val32 celt_coef;
+#define COEF_ONE Q31ONE /* "one" for a 32-bit celt_coef */
+#define MULT_COEF_32(a, b) MULT32_32_Q31(a,b) /* celt_coef a times 32-bit value b */
+#define MAC_COEF_32_ARM(c, a, b) ADD32((c), MULT32_32_Q32(a,b)) /* c plus the Q32 product of a and b; used by the OPUS_ARM_ASM comb filter */
+#define MULT_COEF(a, b) MULT32_32_Q31(a,b) /* celt_coef times celt_coef */
+#define MULT_COEF_TAPS(a, b) SHL32(MULT16_16(a,b), 1) /* product of two 16-bit gains, shifted left by 1, stored as a celt_coef */
+#define COEF2VAL16(x) EXTRACT16(SHR32(x, 16)) /* shift a celt_coef right by 16 and narrow it to 16 bits */
 #else
 typedef opus_val16 celt_coef;
+#define COEF_ONE Q15ONE /* "one" for a 16-bit celt_coef */
+#define MULT_COEF_32(a, b) MULT16_32_Q15(a,b) /* celt_coef a times 32-bit value b */
+#define MAC_COEF_32_ARM(c, a, b) MAC16_32_Q16(c,a,b) /* accumulator first, matching the other MAC_COEF_32_ARM definitions */
+#define MULT_COEF(a, b) MULT16_16_Q15(a,b) /* celt_coef times celt_coef */
+#define MULT_COEF_TAPS(a, b) MULT16_16_P15(a,b) /* product of two 16-bit gains, stored as a celt_coef */
+#define COEF2VAL16(x) (x) /* celt_coef is already opus_val16 here */
 #endif
 
 #define celt_isnan(x) 0
@@ -265,6 +277,8 @@ static OPUS_INLINE int celt_isnan(float x)
 
 #define Q15ONE 1.0f
 #define Q31ONE 1.0f
+#define COEF_ONE 1.0f /* same value as Q15ONE and Q31ONE above */
+#define COEF2VAL16(x) (x) /* identity: the argument is passed through unchanged */
 
 #define NORM_SCALING 1.f
 
@@ -321,6 +335,7 @@ static OPUS_INLINE int celt_isnan(float x)
 
 #define MAC16_32_Q15(c,a,b)     ((c)+(a)*(b))
 #define MAC16_32_Q16(c,a,b)     ((c)+(a)*(b))
+#define MAC_COEF_32_ARM(c,a,b)     ((c)+(a)*(b)) /* plain multiply-accumulate, same expansion as MAC16_32_Q16 above */
 
 #define MULT16_16_Q11_32(a,b)     ((a)*(b))
 #define MULT16_16_Q11(a,b)     ((a)*(b))
@@ -332,6 +347,10 @@ static OPUS_INLINE int celt_isnan(float x)
 #define MULT16_16_P14(a,b)     ((a)*(b))
 #define MULT16_32_P16(a,b)     ((a)*(b))
 
+#define MULT_COEF_32(a, b)      ((a)*(b)) /* all three coefficient multiplies in this group expand to a plain product */
+#define MULT_COEF(a, b)   ((a)*(b))
+#define MULT_COEF_TAPS(a, b)   ((a)*(b))
+
 #define DIV32_16(a,b)     (((opus_val32)(a))/(opus_val16)(b))
 #define DIV32(a,b)     (((opus_val32)(a))/(opus_val32)(b))
 

diff --git a/celt/celt.c b/celt/celt.c
@@ -89,14 +89,15 @@ int resampling_factor(opus_int32 rate)
    return ret;
 }
 
+
 #if !defined(OVERRIDE_COMB_FILTER_CONST) || defined(NON_STATIC_COMB_FILTER_CONST_C)
 /* This version should be faster on ARM */
 #ifdef OPUS_ARM_ASM
 #ifndef NON_STATIC_COMB_FILTER_CONST_C
 static
 #endif
 void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N,
-      opus_val16 g10, opus_val16 g11, opus_val16 g12)
+      celt_coef g10, celt_coef g11, celt_coef g12)
 {
    opus_val32 x0, x1, x2, x3, x4;
    int i;
@@ -108,33 +109,33 @@ void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N,
    {
       opus_val32 t;
       x0=SHL32(x[i-T+2],1);
-      t = MAC16_32_Q16(x[i], g10, x2);
-      t = MAC16_32_Q16(t, g11, ADD32(x1,x3));
-      t = MAC16_32_Q16(t, g12, ADD32(x0,x4));
+      t = MAC_COEF_32_ARM(x[i], g10, x2);
+      t = MAC_COEF_32_ARM(t, g11, ADD32(x1,x3));
+      t = MAC_COEF_32_ARM(t, g12, ADD32(x0,x4));
       t = SATURATE(t, SIG_SAT);
       y[i] = t;
       x4=SHL32(x[i-T+3],1);
-      t = MAC16_32_Q16(x[i+1], g10, x1);
-      t = MAC16_32_Q16(t, g11, ADD32(x0,x2));
-      t = MAC16_32_Q16(t, g12, ADD32(x4,x3));
+      t = MAC_COEF_32_ARM(x[i+1], g10, x1);
+      t = MAC_COEF_32_ARM(t, g11, ADD32(x0,x2));
+      t = MAC_COEF_32_ARM(t, g12, ADD32(x4,x3));
       t = SATURATE(t, SIG_SAT);
       y[i+1] = t;
       x3=SHL32(x[i-T+4],1);
-      t = MAC16_32_Q16(x[i+2], g10, x0);
-      t = MAC16_32_Q16(t, g11, ADD32(x4,x1));
-      t = MAC16_32_Q16(t, g12, ADD32(x3,x2));
+      t = MAC_COEF_32_ARM(x[i+2], g10, x0);
+      t = MAC_COEF_32_ARM(t, g11, ADD32(x4,x1));
+      t = MAC_COEF_32_ARM(t, g12, ADD32(x3,x2));
       t = SATURATE(t, SIG_SAT);
       y[i+2] = t;
       x2=SHL32(x[i-T+5],1);
-      t = MAC16_32_Q16(x[i+3], g10, x4);
-      t = MAC16_32_Q16(t, g11, ADD32(x3,x0));
-      t = MAC16_32_Q16(t, g12, ADD32(x2,x1));
+      t = MAC_COEF_32_ARM(x[i+3], g10, x4);
+      t = MAC_COEF_32_ARM(t, g11, ADD32(x3,x0));
+      t = MAC_COEF_32_ARM(t, g12, ADD32(x2,x1));
       t = SATURATE(t, SIG_SAT);
       y[i+3] = t;
       x1=SHL32(x[i-T+6],1);
-      t = MAC16_32_Q16(x[i+4], g10, x3);
-      t = MAC16_32_Q16(t, g11, ADD32(x2,x4));
-      t = MAC16_32_Q16(t, g12, ADD32(x1,x0));
+      t = MAC_COEF_32_ARM(x[i+4], g10, x3);
+      t = MAC_COEF_32_ARM(t, g11, ADD32(x2,x4));
+      t = MAC_COEF_32_ARM(t, g12, ADD32(x1,x0));
       t = SATURATE(t, SIG_SAT);
       y[i+4] = t;
    }
@@ -143,9 +144,9 @@ void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N,
    {
       opus_val32 t;
       x0=SHL32(x[i-T+2],1);
-      t = MAC16_32_Q16(x[i], g10, x2);
-      t = MAC16_32_Q16(t, g11, ADD32(x1,x3));
-      t = MAC16_32_Q16(t, g12, ADD32(x0,x4));
+      t = MAC_COEF_32_ARM(x[i], g10, x2);
+      t = MAC_COEF_32_ARM(t, g11, ADD32(x1,x3));
+      t = MAC_COEF_32_ARM(t, g12, ADD32(x0,x4));
       t = SATURATE(t, SIG_SAT);
       y[i] = t;
       x4=x3;
@@ -160,7 +161,7 @@ void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N,
 static
 #endif
 void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N,
-      opus_val16 g10, opus_val16 g11, opus_val16 g12)
+      celt_coef g10, celt_coef g11, celt_coef g12)
 {
    opus_val32 x0, x1, x2, x3, x4;
    int i;
@@ -172,9 +173,9 @@ void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N,
    {
       x0=x[i-T+2];
       y[i] = x[i]
-               + MULT16_32_Q15(g10,x2)
-               + MULT16_32_Q15(g11,ADD32(x1,x3))
-               + MULT16_32_Q15(g12,ADD32(x0,x4));
+               + MULT_COEF_32(g10,x2)
+               + MULT_COEF_32(g11,ADD32(x1,x3))
+               + MULT_COEF_32(g12,ADD32(x0,x4));
       y[i] = SATURATE(y[i], SIG_SAT);
       x4=x3;
       x3=x2;
@@ -189,11 +190,11 @@ void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N,
 #ifndef OVERRIDE_comb_filter
 void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
       opus_val16 g0, opus_val16 g1, int tapset0, int tapset1,
-      const opus_val16 *window, int overlap, int arch)
+      const celt_coef *window, int overlap, int arch)
 {
    int i;
    /* printf ("%d %d %f %f\n", T0, T1, g0, g1); */
-   opus_val16 g00, g01, g02, g10, g11, g12;
+   celt_coef g00, g01, g02, g10, g11, g12;
    opus_val32 x0, x1, x2, x3, x4;
    static const opus_val16 gains[3][3] = {
          {QCONST16(0.3066406250f, 15), QCONST16(0.2170410156f, 15), QCONST16(0.1296386719f, 15)},
@@ -211,12 +212,12 @@ void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
       to have then be at least 2 to avoid processing garbage data. */
    T0 = IMAX(T0, COMBFILTER_MINPERIOD);
    T1 = IMAX(T1, COMBFILTER_MINPERIOD);
-   g00 = MULT16_16_P15(g0, gains[tapset0][0]);
-   g01 = MULT16_16_P15(g0, gains[tapset0][1]);
-   g02 = MULT16_16_P15(g0, gains[tapset0][2]);
-   g10 = MULT16_16_P15(g1, gains[tapset1][0]);
-   g11 = MULT16_16_P15(g1, gains[tapset1][1]);
-   g12 = MULT16_16_P15(g1, gains[tapset1][2]);
+   g00 = MULT_COEF_TAPS(g0, gains[tapset0][0]);
+   g01 = MULT_COEF_TAPS(g0, gains[tapset0][1]);
+   g02 = MULT_COEF_TAPS(g0, gains[tapset0][2]);
+   g10 = MULT_COEF_TAPS(g1, gains[tapset1][0]);
+   g11 = MULT_COEF_TAPS(g1, gains[tapset1][1]);
+   g12 = MULT_COEF_TAPS(g1, gains[tapset1][2]);
    x1 = x[-T1+1];
    x2 = x[-T1  ];
    x3 = x[-T1-1];
@@ -226,16 +227,16 @@ void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
       overlap=0;
    for (i=0;i<overlap;i++)
    {
-      opus_val16 f;
+      celt_coef f;
       x0=x[i-T1+2];
-      f = MULT16_16_Q15(window[i],window[i]);
+      f = MULT_COEF(window[i],window[i]);
       y[i] = x[i]
-               + MULT16_32_Q15(MULT16_16_Q15((Q15ONE-f),g00),x[i-T0])
-               + MULT16_32_Q15(MULT16_16_Q15((Q15ONE-f),g01),ADD32(x[i-T0+1],x[i-T0-1]))
-               + MULT16_32_Q15(MULT16_16_Q15((Q15ONE-f),g02),ADD32(x[i-T0+2],x[i-T0-2]))
-               + MULT16_32_Q15(MULT16_16_Q15(f,g10),x2)
-               + MULT16_32_Q15(MULT16_16_Q15(f,g11),ADD32(x1,x3))
-               + MULT16_32_Q15(MULT16_16_Q15(f,g12),ADD32(x0,x4));
+               + MULT_COEF_32(MULT_COEF((COEF_ONE-f),g00),x[i-T0])
+               + MULT_COEF_32(MULT_COEF((COEF_ONE-f),g01),ADD32(x[i-T0+1],x[i-T0-1]))
+               + MULT_COEF_32(MULT_COEF((COEF_ONE-f),g02),ADD32(x[i-T0+2],x[i-T0-2]))
+               + MULT_COEF_32(MULT_COEF(f,g10),x2)
+               + MULT_COEF_32(MULT_COEF(f,g11),ADD32(x1,x3))
+               + MULT_COEF_32(MULT_COEF(f,g12),ADD32(x0,x4));
       y[i] = SATURATE(y[i], SIG_SAT);
       x4=x3;
       x3=x2;

diff --git a/celt/celt.h b/celt/celt.h
@@ -41,6 +41,7 @@
 #include "entenc.h"
 #include "entdec.h"
 #include "arch.h"
+#include "kiss_fft.h"
 
 #ifdef ENABLE_DEEP_PLC
 #include "lpcnet.h"
@@ -236,7 +237,7 @@ void celt_preemphasis(const opus_res * OPUS_RESTRICT pcmp, celt_sig * OPUS_RESTR
 
 void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
       opus_val16 g0, opus_val16 g1, int tapset0, int tapset1,
-      const opus_val16 *window, int overlap, int arch);
+      const celt_coef *window, int overlap, int arch); /* window is a celt_coef table; the gains g0/g1 remain opus_val16 */
 
 void init_caps(const CELTMode *m,int *cap,int LM,int C);
 

diff --git a/celt/celt_decoder.c b/celt/celt_decoder.c
@@ -535,8 +535,8 @@ static void prefilter_and_fold(CELTDecoder * OPUS_RESTRICT st, int N)
       for (i=0;i<overlap/2;i++)
       {
          decode_mem[c][DECODE_BUFFER_SIZE-N+i] =
-            MULT16_32_Q15(mode->window[i], etmp[overlap-1-i])
-            + MULT16_32_Q15(mode->window[overlap-i-1], etmp[i]);
+            MULT16_32_Q15(COEF2VAL16(mode->window[i]), etmp[overlap-1-i])
+            + MULT16_32_Q15 (COEF2VAL16(mode->window[overlap-i-1]), etmp[i]);
       }
    } while (++c<CC);
 }
@@ -692,7 +692,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM
    } else {
       int exc_length;
       /* Pitch-based PLC */
-      const opus_val16 *window;
+      const celt_coef *window;
       opus_val16 *exc;
       opus_val16 fade = Q15ONE;
       int pitch_index;
@@ -880,7 +880,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM
                for (i=0;i<overlap;i++)
                {
                   opus_val16 tmp_g = Q15ONE
-                        - MULT16_16_Q15(window[i], Q15ONE-ratio);
+                        - MULT16_16_Q15(COEF2VAL16(window[i]), Q15ONE-ratio);
                   buf[DECODE_BUFFER_SIZE-N+i] =
                         MULT16_32_Q15(tmp_g, buf[DECODE_BUFFER_SIZE-N+i]);
                }

diff --git a/celt/celt_lpc.c b/celt/celt_lpc.c
@@ -277,7 +277,7 @@ void celt_iir(const opus_val32 *_x,
 int _celt_autocorr(
                    const opus_val16 *x,   /*  in: [0...n-1] samples x   */
                    opus_val32       *ac,  /* out: [0...lag-1] ac values */
-                   const opus_val16       *window,
+                   const celt_coef  *window,
                    int          overlap,
                    int          lag,
                    int          n,
@@ -302,8 +302,9 @@ int _celt_autocorr(
          xx[i] = x[i];
       for (i=0;i<overlap;i++)
       {
-         xx[i] = MULT16_16_Q15(x[i],window[i]);
-         xx[n-i-1] = MULT16_16_Q15(x[n-i-1],window[i]);
+         opus_val16 w = COEF2VAL16(window[i]);
+         xx[i] = MULT16_16_Q15(x[i],w);
+         xx[n-i-1] = MULT16_16_Q15(x[n-i-1],w);
       }
       xptr = xx;
    }

diff --git a/celt/celt_lpc.h b/celt/celt_lpc.h
@@ -61,6 +61,6 @@ void celt_iir(const opus_val32 *x,
          int arch);
 
 int _celt_autocorr(const opus_val16 *x, opus_val32 *ac,
-         const opus_val16 *window, int overlap, int lag, int n, int arch);
+         const celt_coef *window, int overlap, int lag, int n, int arch); /* window is a celt_coef table; the samples in x remain opus_val16 */
 
 #endif /* PLC_H */
diff --git a/celt/dump_modes/dump_modes.c b/celt/dump_modes/dump_modes.c
@@ -99,9 +99,19 @@ void dump_modes(FILE *file, CELTMode **modes, int nb_modes)
 
       fprintf(file, "#ifndef DEF_WINDOW%d\n", mode->overlap);
       fprintf(file, "#define DEF_WINDOW%d\n", mode->overlap);
-      fprintf (file, "static const opus_val16 window%d[%d] = {\n", mode->overlap, mode->overlap);
+      fprintf (file, "static const celt_coef window%d[%d] = {\n", mode->overlap, mode->overlap);
+#if defined(FIXED_POINT) && defined(ENABLE_QEXT) /* emit the table twice: raw values under ENABLE_QEXT, COEF16-converted values otherwise */
+      fprintf(file, "#ifdef ENABLE_QEXT\n");
+      for (j=0;j<mode->overlap;j++)
+         fprintf (file, WORD32 ",%c", mode->window[j],(j+6)%5==0?'\n':' ');
+      fprintf(file, "%s#else\n", mode->overlap%5 ? "\n" : ""); /* a directive must start its own line; the last value row is only newline-terminated when overlap is a multiple of 5 */
+      for (j=0;j<mode->overlap;j++)
+         fprintf (file, WORD16 ",%c", COEF16(mode->window[j], 16),(j+6)%5==0?'\n':' ');
+      fprintf(file, "%s#endif\n", mode->overlap%5 ? "\n" : ""); /* same line-start guard as for #else above */
+#else
       for (j=0;j<mode->overlap;j++)
          fprintf (file, WORD16 ",%c", mode->window[j],(j+6)%5==0?'\n':' ');
+#endif
       fprintf (file, "};\n");
       fprintf(file, "#endif\n");
       fprintf(file, "\n");

diff --git a/celt/fixed_debug.h b/celt/fixed_debug.h
@@ -43,6 +43,7 @@ extern opus_int64 celt_mips;
 
 #define MULT16_16SU(a,b) ((opus_val32)(opus_val16)(a)*(opus_val32)(opus_uint16)(b))
 #define MULT32_32_Q31(a,b) ADD32(ADD32(SHL32(MULT16_16(SHR32((a),16),SHR((b),16)),1), SHR32(MULT16_16SU(SHR32((a),16),((b)&0x0000ffff)),15)), SHR32(MULT16_16SU(SHR32((b),16),((a)&0x0000ffff)),15))
+#define MULT32_32_Q32(a,b) ADD32(ADD32(MULT16_16(SHR((a),16),SHR((b),16)), SHR(MULT16_16SU(SHR((a),16),((b)&0x0000ffff)),16)), SHR(MULT16_16SU(SHR((b),16),((a)&0x0000ffff)),16)) /* sums the high*high product and both high*low cross products (each shifted right by 16); the low*low product is not included */
 
 /** 16x32 multiplication, followed by a 16-bit shift right. Results fits in 32 bits */
 #define MULT16_32_Q16(a,b) ADD32(MULT16_16((a),SHR32((b),16)), SHR32(MULT16_16SU((a),((b)&0x0000ffff)),16))

diff --git a/celt/fixed_generic.h b/celt/fixed_generic.h
@@ -71,6 +71,13 @@
 #define MULT32_32_Q31(a,b) ADD32(ADD32(SHL(MULT16_16(SHR((a),16),SHR((b),16)),1), SHR(MULT16_16SU(SHR((a),16),((b)&0x0000ffff)),15)), SHR(MULT16_16SU(SHR((b),16),((a)&0x0000ffff)),15))
 #endif
 
+/** 32x32 multiplication, followed by a 32-bit shift right. The result fits in 32 bits. Without OPUS_FAST_INT64 the value is assembled from 16-bit partial products and the low*low product is left out. */
+#if OPUS_FAST_INT64
+#define MULT32_32_Q32(a,b) ((opus_val32)SHR((opus_int64)(a)*(opus_int64)(b),32))
+#else
+#define MULT32_32_Q32(a,b) ADD32(ADD32(MULT16_16(SHR((a),16),SHR((b),16)), SHR(MULT16_16SU(SHR((a),16),((b)&0x0000ffff)),16)), SHR(MULT16_16SU(SHR((b),16),((a)&0x0000ffff)),16))
+#endif
+
 /** Compile-time conversion of float constant to 16-bit value */
 #define QCONST16(x,bits) ((opus_val16)(.5+(x)*(((opus_val32)1)<<(bits))))
 

diff --git a/celt/mdct.c b/celt/mdct.c
@@ -120,7 +120,7 @@ void clt_mdct_clear(mdct_lookup *l, int arch)
 /* Forward MDCT trashes the input array */
 #ifndef OVERRIDE_clt_mdct_forward
 void clt_mdct_forward_c(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
-      const opus_val16 *window, int overlap, int shift, int stride, int arch)
+      const celt_coef *window, int overlap, int shift, int stride, int arch)
 {
    int i;
    int N, N2, N4;
@@ -159,13 +159,13 @@ void clt_mdct_forward_c(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scal
       const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1);
       const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1);
       kiss_fft_scalar * OPUS_RESTRICT yp = f;
-      const opus_val16 * OPUS_RESTRICT wp1 = window+(overlap>>1);
-      const opus_val16 * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
+      const celt_coef * OPUS_RESTRICT wp1 = window+(overlap>>1);
+      const celt_coef * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
       for(i=0;i<((overlap+3)>>2);i++)
       {
          /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
-         *yp++ = MULT16_32_Q15(*wp2, xp1[N2]) + MULT16_32_Q15(*wp1,*xp2);
-         *yp++ = MULT16_32_Q15(*wp1, *xp1)    - MULT16_32_Q15(*wp2, xp2[-N2]);
+         *yp++ = S_MUL(xp1[N2], *wp2) + S_MUL(*xp2, *wp1);
+         *yp++ = S_MUL(*xp1, *wp1)    - S_MUL(xp2[-N2], *wp2);
          xp1+=2;
          xp2-=2;
          wp1+=2;
@@ -184,8 +184,8 @@ void clt_mdct_forward_c(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scal
       for(;i<N4;i++)
       {
          /* Real part arranged as a-bR, Imag part arranged as -c-dR */
-         *yp++ =  -MULT16_32_Q15(*wp1, xp1[-N2]) + MULT16_32_Q15(*wp2, *xp2);
-         *yp++ = MULT16_32_Q15(*wp2, *xp1)     + MULT16_32_Q15(*wp1, xp2[N2]);
+         *yp++ =  -S_MUL(xp1[-N2], *wp1) + S_MUL(*xp2, *wp2);
+         *yp++ = S_MUL(*xp1, *wp2)     + S_MUL(xp2[N2], *wp1);
          xp1+=2;
          xp2-=2;
          wp1+=2;
@@ -258,7 +258,7 @@ void clt_mdct_forward_c(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scal
 
 #ifndef OVERRIDE_clt_mdct_backward
 void clt_mdct_backward_c(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
-      const opus_val16 * OPUS_RESTRICT window, int overlap, int shift, int stride, int arch)
+      const celt_coef * OPUS_RESTRICT window, int overlap, int shift, int stride, int arch)
 {
    int i;
    int N, N2, N4;
@@ -346,16 +346,16 @@ void clt_mdct_backward_c(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_sca
    {
       kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1;
       kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
-      const opus_val16 * OPUS_RESTRICT wp1 = window;
-      const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1;
+      const celt_coef * OPUS_RESTRICT wp1 = window;
+      const celt_coef * OPUS_RESTRICT wp2 = window+overlap-1;
 
       for(i = 0; i < overlap/2; i++)
       {
          kiss_fft_scalar x1, x2;
          x1 = *xp1;
          x2 = *yp1;
-         *yp1++ = SUB32_ovflw(MULT16_32_Q15(*wp2, x2), MULT16_32_Q15(*wp1, x1));
-         *xp1-- = ADD32_ovflw(MULT16_32_Q15(*wp1, x2), MULT16_32_Q15(*wp2, x1));
+         *yp1++ = SUB32_ovflw(S_MUL(x2, *wp2), S_MUL(x1, *wp1));
+         *xp1-- = ADD32_ovflw(S_MUL(x2, *wp1), S_MUL(x1, *wp2));
          wp1++;
          wp2--;
       }