diff --git a/libavcodec/x86/fft.asm b/libavcodec/x86/fft.asm index cdbfd66e82..53cfd64b3a 100644 --- a/libavcodec/x86/fft.asm +++ b/libavcodec/x86/fft.asm @@ -191,6 +191,23 @@ SECTION .text addps %2, %2, %5 ; {i0,i1,i2,i3} %endmacro +%macro INTERL 5 +%if cpuflag(avx) + vunpckhps %3, %2, %1 + vunpcklps %2, %2, %1 + vextractf128 %4(%5), %2, 0 + vextractf128 %4 %+ H(%5), %3, 0 + vextractf128 %4(%5 + 1), %2, 1 + vextractf128 %4 %+ H(%5 + 1), %3, 1 +%elif cpuflag(sse) + mova %3, %2 + unpcklps %2, %1 + unpckhps %3, %1 + mova %4(%5), %2 + mova %4(%5+1), %3 +%endif +%endmacro + ; scheduled for cpu-bound sizes %macro PASS_SMALL 3 ; (to load m4-m7), wre, wim IF%1 mova m4, Z(4) @@ -541,17 +558,6 @@ DEFINE_ARGS zc, w, n, o1, o3 INIT_YMM avx %if HAVE_AVX_EXTERNAL -%macro INTERL_AVX 5 - vunpckhps %3, %2, %1 - vunpcklps %2, %2, %1 - vextractf128 %4(%5), %2, 0 - vextractf128 %4 %+ H(%5), %3, 0 - vextractf128 %4(%5 + 1), %2, 1 - vextractf128 %4 %+ H(%5 + 1), %3, 1 -%endmacro - -%define INTERL INTERL_AVX - DECL_PASS pass_avx, PASS_BIG 1 DECL_PASS pass_interleave_avx, PASS_BIG 0 @@ -566,16 +572,6 @@ cglobal fft_calc, 2,5,8 INIT_XMM sse -%macro INTERL_SSE 5 - mova %3, %2 - unpcklps %2, %1 - unpckhps %3, %1 - mova %4(%5), %2 - mova %4(%5+1), %3 -%endmacro - -%define INTERL INTERL_SSE - DECL_PASS pass_sse, PASS_BIG 1 DECL_PASS pass_interleave_sse, PASS_BIG 0 @@ -861,16 +857,30 @@ INIT_XMM sse %endmacro %macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5 +%if cpuflag(sse) mulps m6, %3, [%5+%1] mulps m7, %2, [%5+%1] mulps %2, %2, [%6+%1] mulps %3, %3, [%6+%1] subps %2, %2, m6 addps %3, %3, m7 +%elif cpuflag(3dnow) + mova m6, [%1+%2*2] + mova %3, [%1+%2*2+8] + mova %4, m6 + mova m7, %3 + pfmul m6, [%5+%2] + pfmul %3, [%6+%2] + pfmul %4, [%6+%2] + pfmul m7, [%5+%2] + pfsub %3, m6 + pfadd %4, m7 +%endif %endmacro -%macro POSROTATESHUF_AVX 5 ;j, k, z+n8, tcos+n8, tsin+n8 +%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8 .post: +%if cpuflag(avx) vmovaps ymm1, [%3+%1*2] vmovaps ymm0, [%3+%1*2+0x20] vmovaps ymm3, [%3+%2*2] @@ -899,10 +909,7 @@ INIT_XMM sse sub %2, 0x20 add %1, 0x20 jl .post -%endmacro - -%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8 -.post: +%elif cpuflag(sse) movaps xmm1, [%3+%1*2] movaps xmm0, [%3+%1*2+0x10] CMUL %1, xmm0, xmm1, %3, %4, %5 @@ -924,25 +931,9 @@ INIT_XMM sse sub %2, 0x10 add %1, 0x10 jl .post -%endmacro - -%macro CMUL_3DNOW 6 - mova m6, [%1+%2*2] - mova %3, [%1+%2*2+8] - mova %4, m6 - mova m7, %3 - pfmul m6, [%5+%2] - pfmul %3, [%6+%2] - pfmul %4, [%6+%2] - pfmul m7, [%5+%2] - pfsub %3, m6 - pfadd %4, m7 -%endmacro - -%macro POSROTATESHUF_3DNOW 5 ;j, k, z+n8, tcos+n8, tsin+n8 -.post: - CMUL_3DNOW %3, %1, m0, m1, %4, %5 - CMUL_3DNOW %3, %2, m2, m3, %4, %5 +%elif cpuflag(3dnow) + CMUL %3, %1, m0, m1, %4, %5 + CMUL %3, %2, m2, m3, %4, %5 movd [%3+%1*2+ 0], m0 movd [%3+%2*2+12], m1 movd [%3+%2*2+ 0], m2 @@ -958,9 +949,10 @@ INIT_XMM sse sub %2, 8 add %1, 8 jl .post +%endif %endmacro -%macro DECL_IMDCT 1 +%macro DECL_IMDCT 0 cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input %if ARCH_X86_64 %define rrevtab r7 @@ -1066,7 +1058,7 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *i neg r0 mov r1, -mmsize sub r1, r0 - %1 r0, r1, r6, rtcos, rtsin + POSROTATESHUF r0, r1, r6, rtcos, rtsin %if ARCH_X86_64 == 0 add esp, 12 %endif @@ -1076,18 +1068,18 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *i RET %endmacro -DECL_IMDCT POSROTATESHUF +DECL_IMDCT %if ARCH_X86_32 INIT_MMX 3dnow -DECL_IMDCT POSROTATESHUF_3DNOW +DECL_IMDCT INIT_MMX 3dnowext -DECL_IMDCT POSROTATESHUF_3DNOW +DECL_IMDCT %endif INIT_YMM avx %if HAVE_AVX_EXTERNAL -DECL_IMDCT POSROTATESHUF_AVX +DECL_IMDCT %endif