mirror of
https://git.ffmpeg.org/ffmpeg.git
synced 2024-09-19 12:56:40 +00:00
lavu/tx: optimize and simplify inverse MDCTs
Convert the input mapping from a scatter into a gather, which is faster and better suited to SIMD. Also, add a pre-shuffled copy of the exptab so that no gathering is needed there at all. This doubles the exptab size, but the speedup makes it worth it. In SIMD, the exptab will likely be evicted to a higher-level cache anyway because of the FFT in the middle, and the number of loads stays identical. For a 960-point inverse MDCT, the speedup is 10%. This makes it possible to write sane and fast SIMD versions of inverse MDCTs.
This commit is contained in:
parent
412922cc6f
commit
ae66a9db7b
@ -44,7 +44,6 @@ int ff_tx_gen_compound_mapping(AVTXContext *s, int n, int m)
|
||||
int *in_map, *out_map;
|
||||
const int inv = s->inv;
|
||||
const int len = n*m; /* Will not be equal to s->len for MDCTs */
|
||||
const int mdct = TYPE_IS(MDCT, s->type);
|
||||
int m_inv, n_inv;
|
||||
|
||||
/* Make sure the numbers are coprime */
|
||||
@ -63,8 +62,7 @@ int ff_tx_gen_compound_mapping(AVTXContext *s, int n, int m)
|
||||
/* Ruritanian map for input, CRT map for output, can be swapped */
|
||||
for (int j = 0; j < m; j++) {
|
||||
for (int i = 0; i < n; i++) {
|
||||
/* Shifted by 1 to simplify MDCTs */
|
||||
in_map[j*n + i] = ((i*m + j*n) % len) << mdct;
|
||||
in_map[j*n + i] = (i*m + j*n) % len;
|
||||
out_map[(i*m*m_inv + j*n*n_inv) % len] = i*m + j;
|
||||
}
|
||||
}
|
||||
|
@ -297,10 +297,13 @@ void ff_tx_init_tabs_float (int len);
|
||||
void ff_tx_init_tabs_double(int len);
|
||||
void ff_tx_init_tabs_int32 (int len);
|
||||
|
||||
/* Typed init function to initialize an MDCT exptab in a context. */
|
||||
int ff_tx_mdct_gen_exp_float (AVTXContext *s);
|
||||
int ff_tx_mdct_gen_exp_double(AVTXContext *s);
|
||||
int ff_tx_mdct_gen_exp_int32 (AVTXContext *s);
|
||||
/* Typed init function to initialize an MDCT exptab in a context.
|
||||
* If pre_tab is set, duplicates the entire table, with the first
|
||||
* copy being shuffled according to pre_tab, and the second copy
|
||||
* being the original. */
|
||||
int ff_tx_mdct_gen_exp_float (AVTXContext *s, int *pre_tab);
|
||||
int ff_tx_mdct_gen_exp_double(AVTXContext *s, int *pre_tab);
|
||||
int ff_tx_mdct_gen_exp_int32 (AVTXContext *s, int *pre_tab);
|
||||
|
||||
/* Lists of codelets */
|
||||
extern const FFTXCodelet * const ff_tx_codelet_list_float_c [];
|
||||
|
@ -948,7 +948,7 @@ static av_cold int TX_NAME(ff_tx_mdct_sr_init)(AVTXContext *s,
|
||||
const void *scale)
|
||||
{
|
||||
int ret;
|
||||
FFTXCodeletOptions sub_opts = { .invert_lookup = 0 };
|
||||
FFTXCodeletOptions sub_opts = { .invert_lookup = inv };
|
||||
|
||||
s->scale_d = *((SCALE_TYPE *)scale);
|
||||
s->scale_f = s->scale_d;
|
||||
@ -961,9 +961,14 @@ static av_cold int TX_NAME(ff_tx_mdct_sr_init)(AVTXContext *s,
|
||||
inv, scale)))
|
||||
return ret;
|
||||
|
||||
if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s)))
|
||||
if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->sub->map : NULL)))
|
||||
return ret;
|
||||
|
||||
/* Saves a multiply in a hot path. */
|
||||
if (inv)
|
||||
for (int i = 0; i < (s->len >> 1); i++)
|
||||
s->sub->map[i] <<= 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -1020,12 +1025,14 @@ static void TX_NAME(ff_tx_mdct_sr_inv)(AVTXContext *s, void *_dst, void *_src,
|
||||
in2 = src + ((len2*2) - 1) * stride;
|
||||
|
||||
for (int i = 0; i < len2; i++) {
|
||||
TXComplex tmp = { in2[-2*i*stride], in1[2*i*stride] };
|
||||
CMUL3(z[sub_map[i]], tmp, exp[i]);
|
||||
int k = sub_map[i];
|
||||
TXComplex tmp = { in2[-k*stride], in1[k*stride] };
|
||||
CMUL3(z[i], tmp, exp[i]);
|
||||
}
|
||||
|
||||
s->fn[0](&s->sub[0], z, z, sizeof(TXComplex));
|
||||
|
||||
exp += len2;
|
||||
for (int i = 0; i < len4; i++) {
|
||||
const int i0 = len4 + i, i1 = len4 - i - 1;
|
||||
TXComplex src1 = { z[i1].im, z[i1].re };
|
||||
@ -1141,9 +1148,13 @@ static av_cold int TX_NAME(ff_tx_mdct_pfa_init)(AVTXContext *s,
|
||||
if ((ret = ff_tx_gen_compound_mapping(s, cd->factors[0], sub_len)))
|
||||
return ret;
|
||||
|
||||
if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s)))
|
||||
if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->map : NULL)))
|
||||
return ret;
|
||||
|
||||
/* Saves multiplies in loops. */
|
||||
for (int i = 0; i < len; i++)
|
||||
s->map[i] <<= 1;
|
||||
|
||||
if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
|
||||
return AVERROR(ENOMEM);
|
||||
|
||||
@ -1160,6 +1171,7 @@ static void TX_NAME(ff_tx_mdct_pfa_##N##xM_inv)(AVTXContext *s, void *_dst, \
|
||||
TXComplex *z = _dst, *exp = s->exp; \
|
||||
const TXSample *src = _src, *in1, *in2; \
|
||||
const int len4 = s->len >> 2; \
|
||||
const int len2 = s->len >> 1; \
|
||||
const int m = s->sub->len; \
|
||||
const int *in_map = s->map, *out_map = in_map + N*m; \
|
||||
const int *sub_map = s->sub->map; \
|
||||
@ -1168,13 +1180,15 @@ static void TX_NAME(ff_tx_mdct_pfa_##N##xM_inv)(AVTXContext *s, void *_dst, \
|
||||
in1 = src; \
|
||||
in2 = src + ((N*m*2) - 1) * stride; \
|
||||
\
|
||||
for (int i = 0; i < m; i++) { \
|
||||
for (int i = 0; i < len2; i += N) { \
|
||||
for (int j = 0; j < N; j++) { \
|
||||
const int k = in_map[i*N + j]; \
|
||||
const int k = in_map[j]; \
|
||||
TXComplex tmp = { in2[-k*stride], in1[k*stride] }; \
|
||||
CMUL3(fft##N##in[j], tmp, exp[k >> 1]); \
|
||||
CMUL3(fft##N##in[j], tmp, exp[j]); \
|
||||
} \
|
||||
fft##N(s->tmp + sub_map[i], fft##N##in, m); \
|
||||
fft##N(s->tmp + *(sub_map++), fft##N##in, m); \
|
||||
exp += N; \
|
||||
in_map += N; \
|
||||
} \
|
||||
\
|
||||
for (int i = 0; i < N; i++) \
|
||||
@ -1405,22 +1419,32 @@ static const FFTXCodelet TX_NAME(ff_tx_rdft_c2r_def) = {
|
||||
.prio = FF_TX_PRIO_BASE,
|
||||
};
|
||||
|
||||
int TX_TAB(ff_tx_mdct_gen_exp)(AVTXContext *s)
|
||||
int TX_TAB(ff_tx_mdct_gen_exp)(AVTXContext *s, int *pre_tab)
|
||||
{
|
||||
int off = 0;
|
||||
int len4 = s->len >> 1;
|
||||
double scale = s->scale_d;
|
||||
const double theta = (scale < 0 ? len4 : 0) + 1.0/8.0;
|
||||
size_t alloc = pre_tab ? 2*len4 : len4;
|
||||
|
||||
if (!(s->exp = av_malloc_array(len4, sizeof(*s->exp))))
|
||||
if (!(s->exp = av_malloc_array(alloc, sizeof(*s->exp))))
|
||||
return AVERROR(ENOMEM);
|
||||
|
||||
scale = sqrt(fabs(scale));
|
||||
|
||||
if (pre_tab)
|
||||
off = len4;
|
||||
|
||||
for (int i = 0; i < len4; i++) {
|
||||
const double alpha = M_PI_2 * (i + theta) / len4;
|
||||
s->exp[i].re = RESCALE(cos(alpha) * scale);
|
||||
s->exp[i].im = RESCALE(sin(alpha) * scale);
|
||||
s->exp[off + i] = (TXComplex){ RESCALE(cos(alpha) * scale),
|
||||
RESCALE(sin(alpha) * scale) };
|
||||
}
|
||||
|
||||
if (pre_tab)
|
||||
for (int i = 0; i < len4; i++)
|
||||
s->exp[i] = s->exp[len4 + pre_tab[i]];
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user