lavu/tx: optimize and simplify inverse MDCTs

Convert the input from a scatter to a gather instead, which is faster and better for SIMD. Also, add a pre-shuffled exptab version to avoid gathering there at all. This doubles the exptab size, but the speedup makes it worth it. In SIMD, the exptab will likely be purged to a higher cache anyway because of the FFT in the middle, and the amount of loads stays identical. For a 960-point inverse MDCT, the speedup is 10%. This makes it possible to write sane and fast SIMD versions of inverse MDCTs.
2024-09-19 12:56:40 +00:00 · 2022-08-16 01:11:40 +02:00 · 2022-08-16 01:11:40 +02:00 · ae66a9db7b
commit ae66a9db7b
parent 412922cc6f
3 changed files with 45 additions and 20 deletions
--- a/libavutil/tx.c
+++ b/libavutil/tx.c
@ -44,7 +44,6 @@ int ff_tx_gen_compound_mapping(AVTXContext *s, int n, int m)
    int *in_map, *out_map;
    const int inv = s->inv;
    const int len = n*m;    /* Will not be equal to s->len for MDCTs */
-    const int mdct = TYPE_IS(MDCT, s->type);
    int m_inv, n_inv;

    /* Make sure the numbers are coprime */
@ -63,8 +62,7 @@ int ff_tx_gen_compound_mapping(AVTXContext *s, int n, int m)
    /* Ruritanian map for input, CRT map for output, can be swapped */
    for (int j = 0; j < m; j++) {
        for (int i = 0; i < n; i++) {
-            /* Shifted by 1 to simplify MDCTs */
-            in_map[j*n + i] = ((i*m + j*n) % len) << mdct;
+            in_map[j*n + i] = (i*m + j*n) % len;
            out_map[(i*m*m_inv + j*n*n_inv) % len] = i*m + j;
        }
    }
--- a/libavutil/tx_priv.h
+++ b/libavutil/tx_priv.h
@ -297,10 +297,13 @@ void ff_tx_init_tabs_float (int len);
 void ff_tx_init_tabs_double(int len);
 void ff_tx_init_tabs_int32 (int len);

-/* Typed init function to initialize an MDCT exptab in a context. */
-int  ff_tx_mdct_gen_exp_float (AVTXContext *s);
-int  ff_tx_mdct_gen_exp_double(AVTXContext *s);
-int  ff_tx_mdct_gen_exp_int32 (AVTXContext *s);
+/* Typed init function to initialize an MDCT exptab in a context.
+ * If pre_tab is set, duplicates the entire table, with the first
+ * copy being shuffled according to pre_tab, and the second copy
+ * being the original. */
+int ff_tx_mdct_gen_exp_float (AVTXContext *s, int *pre_tab);
+int ff_tx_mdct_gen_exp_double(AVTXContext *s, int *pre_tab);
+int ff_tx_mdct_gen_exp_int32 (AVTXContext *s, int *pre_tab);

 /* Lists of codelets */
 extern const FFTXCodelet * const ff_tx_codelet_list_float_c       [];
--- a/libavutil/tx_template.c
+++ b/libavutil/tx_template.c
@ -948,7 +948,7 @@ static av_cold int TX_NAME(ff_tx_mdct_sr_init)(AVTXContext *s,
                                               const void *scale)
 {
    int ret;
-    FFTXCodeletOptions sub_opts = { .invert_lookup = 0 };
+    FFTXCodeletOptions sub_opts = { .invert_lookup = inv };

    s->scale_d = *((SCALE_TYPE *)scale);
    s->scale_f = s->scale_d;
@ -961,9 +961,14 @@ static av_cold int TX_NAME(ff_tx_mdct_sr_init)(AVTXContext *s,
                                inv, scale)))
        return ret;

-    if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s)))
+    if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->sub->map : NULL)))
        return ret;

+    /* Saves a multiply in a hot path. */
+    if (inv)
+        for (int i = 0; i < (s->len >> 1); i++)
+            s->sub->map[i] <<= 1;
+
    return 0;
 }

@ -1020,12 +1025,14 @@ static void TX_NAME(ff_tx_mdct_sr_inv)(AVTXContext *s, void *_dst, void *_src,
    in2 = src + ((len2*2) - 1) * stride;

    for (int i = 0; i < len2; i++) {
-        TXComplex tmp = { in2[-2*i*stride], in1[2*i*stride] };
-        CMUL3(z[sub_map[i]], tmp, exp[i]);
+        int k = sub_map[i];
+        TXComplex tmp = { in2[-k*stride], in1[k*stride] };
+        CMUL3(z[i], tmp, exp[i]);
    }

    s->fn[0](&s->sub[0], z, z, sizeof(TXComplex));

+    exp += len2;
    for (int i = 0; i < len4; i++) {
        const int i0 = len4 + i, i1 = len4 - i - 1;
        TXComplex src1 = { z[i1].im, z[i1].re };
@ -1141,9 +1148,13 @@ static av_cold int TX_NAME(ff_tx_mdct_pfa_init)(AVTXContext *s,
    if ((ret = ff_tx_gen_compound_mapping(s, cd->factors[0], sub_len)))
        return ret;

-    if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s)))
+    if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->map : NULL)))
        return ret;

+    /* Saves multiplies in loops. */
+    for (int i = 0; i < len; i++)
+        s->map[i] <<= 1;
+
    if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
        return AVERROR(ENOMEM);

@ -1160,6 +1171,7 @@ static void TX_NAME(ff_tx_mdct_pfa_##N##xM_inv)(AVTXContext *s, void *_dst,    \
    TXComplex *z = _dst, *exp = s->exp;                                        \
    const TXSample *src = _src, *in1, *in2;                                    \
    const int len4 = s->len >> 2;                                              \
+    const int len2 = s->len >> 1;                                              \
    const int m = s->sub->len;                                                 \
    const int *in_map = s->map, *out_map = in_map + N*m;                       \
    const int *sub_map = s->sub->map;                                          \
@ -1168,13 +1180,15 @@ static void TX_NAME(ff_tx_mdct_pfa_##N##xM_inv)(AVTXContext *s, void *_dst,    \
    in1 = src;                                                                 \
    in2 = src + ((N*m*2) - 1) * stride;                                        \
                                                                               \
-    for (int i = 0; i < m; i++) {                                              \
+    for (int i = 0; i < len2; i += N) {                                        \
        for (int j = 0; j < N; j++) {                                          \
-            const int k = in_map[i*N + j];                                     \
+            const int k = in_map[j];                                           \
            TXComplex tmp = { in2[-k*stride], in1[k*stride] };                 \
-            CMUL3(fft##N##in[j], tmp, exp[k >> 1]);                            \
+            CMUL3(fft##N##in[j], tmp, exp[j]);                                 \
        }                                                                      \
-        fft##N(s->tmp + sub_map[i], fft##N##in, m);                            \
+        fft##N(s->tmp + *(sub_map++), fft##N##in, m);                          \
+        exp += N;                                                              \
+        in_map += N;                                                           \
    }                                                                          \
                                                                               \
    for (int i = 0; i < N; i++)                                                \
@ -1405,22 +1419,32 @@ static const FFTXCodelet TX_NAME(ff_tx_rdft_c2r_def) = {
    .prio       = FF_TX_PRIO_BASE,
 };

-int TX_TAB(ff_tx_mdct_gen_exp)(AVTXContext *s)
+int TX_TAB(ff_tx_mdct_gen_exp)(AVTXContext *s, int *pre_tab)
 {
+    int off = 0;
    int len4 = s->len >> 1;
    double scale = s->scale_d;
    const double theta = (scale < 0 ? len4 : 0) + 1.0/8.0;
+    size_t alloc = pre_tab ? 2*len4 : len4;

-    if (!(s->exp = av_malloc_array(len4, sizeof(*s->exp))))
+    if (!(s->exp = av_malloc_array(alloc, sizeof(*s->exp))))
        return AVERROR(ENOMEM);

    scale = sqrt(fabs(scale));
+
+    if (pre_tab)
+        off = len4;
+
    for (int i = 0; i < len4; i++) {
        const double alpha = M_PI_2 * (i + theta) / len4;
-        s->exp[i].re = RESCALE(cos(alpha) * scale);
-        s->exp[i].im = RESCALE(sin(alpha) * scale);
+        s->exp[off + i] = (TXComplex){ RESCALE(cos(alpha) * scale),
+                                       RESCALE(sin(alpha) * scale) };
    }

+    if (pre_tab)
+        for (int i = 0; i < len4; i++)
+            s->exp[i] = s->exp[len4 + pre_tab[i]];
+
    return 0;
 }