From 8bb59e6742eeb00a418e49ef8ea601d1d637e51a Mon Sep 17 00:00:00 2001
From: James Almer
Date: Mon, 12 Jun 2017 19:27:29 -0300
Subject: [PATCH] x86/aacpsdsp: add ff_ps_hybrid_analysis_ileave_sse

About 2x faster than the c version.
---
 libavcodec/x86/aacpsdsp.asm    | 106 +++++++++++++++++++++++++++++++++
 libavcodec/x86/aacpsdsp_init.c |   3 +
 2 files changed, 109 insertions(+)

diff --git a/libavcodec/x86/aacpsdsp.asm b/libavcodec/x86/aacpsdsp.asm
index 66bbbf4e44..a7327d39ce 100644
--- a/libavcodec/x86/aacpsdsp.asm
+++ b/libavcodec/x86/aacpsdsp.asm
@@ -166,6 +166,112 @@ align 16
     jl .loop
     REP_RET
 
+;**********************************************************
+;void ps_hybrid_analysis_ileave_sse(float (*out)[32][2],
+;                                   float L[2][38][64],
+;                                   int i, int len)
+;**********************************************************
+INIT_XMM sse
+cglobal ps_hybrid_analysis_ileave, 3, 7, 5, out, in, i, len, in0, in1, tmp
+    movsxdifnidn        iq, id              ; presumably widens i to pointer width where needed (x86inc helper)
+    mov               lend, 32 << 3         ; len = 256 (constant); the incoming len value is never read
+    lea                inq, [inq+iq*4]      ; in += i floats (first column to process)
+    mov               tmpd, id
+    shl               tmpd, 8
+    add               outq, tmpq            ; out += i * 256 bytes (one [32][2] float row per i)
+    mov               tmpd, 64
+    sub               tmpd, id
+    mov                 id, tmpd            ; i = 64 - i: number of columns left, counted down below
+
+    test                id, 1
+    jne .loop4                              ; odd count: do a single column first
+    test                id, 2
+    jne .loop8                              ; bit 1 set: do two columns first; else fall into .loop16 (NOTE(review): also when the count is 0, i.e. i == 64 - no guard)
+
+align 16
+.loop16:                                    ; four columns per pass; NOTE(review): movaps needs 16-byte aligned in/out here - confirm base alignment with callers
+    mov               in0q, inq
+    mov               in1q, 38*64*4
+    add               in1q, in0q            ; in1 = in0 + 38*64 floats (second plane, L[1] in the C prototype)
+    mov               tmpd, lend            ; inner byte counter: 256, stepped down by mmsize
+
+.inner_loop16:
+    movaps              m0, [in0q]          ; four columns of the current row, first plane
+    movaps              m1, [in1q]          ; same row, second plane
+    movaps              m2, [in0q+lenq]     ; next row (256 bytes further), first plane
+    movaps              m3, [in1q+lenq]     ; next row, second plane
+    TRANSPOSE4x4PS 0, 1, 2, 3, 4            ; macro defined elsewhere; presumably a 4x4 float transpose of m0-m3 with m4 as scratch
+    movaps          [outq], m0              ; one result vector per output row, rows are 256 bytes apart
+    movaps     [outq+lenq], m1
+    movaps   [outq+lenq*2], m2
+    movaps [outq+3*32*2*4], m3              ; 3*32*2*4 = 3 * 256 bytes: fourth output row
+    lea               in0q, [in0q+lenq*2]   ; advance both planes by two rows
+    lea               in1q, [in1q+lenq*2]
+    add               outq, mmsize
+    sub               tmpd, mmsize
+    jg .inner_loop16
+    add                inq, 16              ; next four columns
+    add               outq, 3*32*2*4        ; inner loop already moved outq by 256, so 4 * 256 bytes in total
+    sub                 id, 4
+    jg .loop16
+    RET
+
+align 16
+.loop8:                                     ; two columns per pass
+    mov               in0q, inq
+    mov               in1q, 38*64*4
+    add               in1q, in0q            ; in1 = in0 + 38*64 floats (second plane)
+    mov               tmpd, lend            ; inner byte counter: 256, stepped down by mmsize
+
+.inner_loop8:
+    movlps              m0, [in0q]          ; low half: two columns of the current row, first plane
+    movlps              m1, [in1q]          ; low half: same row, second plane
+    movhps              m0, [in0q+lenq]     ; high half: next row, first plane
+    movhps              m1, [in1q+lenq]     ; high half: next row, second plane
+    SBUTTERFLYPS 0, 1, 2                    ; macros defined elsewhere; presumably interleave 32-bit lanes of m0/m1 (m2 scratch)
+    SBUTTERFLYPD 0, 1, 2                    ; presumably interleave 64-bit lanes of m0/m1 (m2 scratch)
+    movaps          [outq], m0              ; first of the two output rows
+    movaps     [outq+lenq], m1              ; second output row, 256 bytes further
+    lea               in0q, [in0q+lenq*2]   ; advance both planes by two rows
+    lea               in1q, [in1q+lenq*2]
+    add               outq, mmsize
+    sub               tmpd, mmsize
+    jg .inner_loop8
+    add                inq, 8               ; next two columns
+    add               outq, lenq            ; inner loop already moved outq by 256, so 2 * 256 bytes in total
+    sub                 id, 2
+    jg .loop16                              ; anything still left is handled four columns at a time
+    RET
+
+align 16
+.loop4:                                     ; a single column
+    mov               in0q, inq
+    mov               in1q, 38*64*4
+    add               in1q, in0q            ; in1 = in0 + 38*64 floats (second plane)
+    mov               tmpd, lend            ; inner byte counter: 256, stepped down by mmsize
+
+.inner_loop4:
+    movss               m0, [in0q]          ; current row, first plane
+    movss               m1, [in1q]          ; current row, second plane
+    movss               m2, [in0q+lenq]     ; next row, first plane
+    movss               m3, [in1q+lenq]     ; next row, second plane
+    movlhps             m0, m1              ; m0 high half = m1 low half
+    movlhps             m2, m3              ; m2 high half = m3 low half
+    shufps              m0, m2, q2020       ; assuming q2020 selects elements 0,2 of m0 then 0,2 of m2: packs the four loaded floats
+    movaps          [outq], m0
+    lea               in0q, [in0q+lenq*2]   ; advance both planes by two rows
+    lea               in1q, [in1q+lenq*2]
+    add               outq, mmsize
+    sub               tmpd, mmsize
+    jg .inner_loop4
+    add                inq, 4               ; next column; outq was advanced by 256 in the inner loop
+    sub                 id, 1
+    test                id, 2
+    jne .loop8                              ; bit 1 of the remaining count set: two-column pass next
+    cmp                 id, 4
+    jge .loop16                             ; at least four columns left: continue four at a time
+    RET
+
 ;***********************************************************
 ;void ps_hybrid_synthesis_deint_sse4(float out[2][38][64],
 ;                                    float (*in)[32][2],
diff --git a/libavcodec/x86/aacpsdsp_init.c b/libavcodec/x86/aacpsdsp_init.c
index 25e089c395..056e23e59e 100644
--- a/libavcodec/x86/aacpsdsp_init.c
+++ b/libavcodec/x86/aacpsdsp_init.c
@@ -44,6 +44,8 @@ void ff_ps_hybrid_synthesis_deint_sse(float out[2][38][64], float (*in)[32][2],
                                       int i, int len);
 void ff_ps_hybrid_synthesis_deint_sse4(float out[2][38][64], float (*in)[32][2],
                                        int i, int len);
+void ff_ps_hybrid_analysis_ileave_sse(float (*out)[32][2], float L[2][38][64],
+                                      int i, int len);
 
 av_cold void ff_psdsp_init_x86(PSDSPContext *s)
 {
@@ -52,6 +54,7 @@ av_cold void ff_psdsp_init_x86(PSDSPContext *s)
     if (EXTERNAL_SSE(cpu_flags)) {
         s->add_squares            = ff_ps_add_squares_sse;
         s->mul_pair_single        = ff_ps_mul_pair_single_sse;
+        s->hybrid_analysis_ileave = ff_ps_hybrid_analysis_ileave_sse;
         s->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_sse;
         s->hybrid_analysis        = ff_ps_hybrid_analysis_sse;
     }