From d2a7314f1e0d8404eb82316ad7f12665c24d48e7 Mon Sep 17 00:00:00 2001 From: James Almer Date: Fri, 17 Jan 2014 03:55:44 -0300 Subject: [PATCH] vp9/x86: add ff_vp9_loop_filter_[vh]_16_16_sse2(). Similar gains in performance as the SSSE3 version Signed-off-by: James Almer --- libavcodec/x86/vp9dsp_init.c | 19 +++++++++++++++---- libavcodec/x86/vp9lpf.asm | 14 ++++++++------ 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c index 900efb3a0b..ab3396e098 100644 --- a/libavcodec/x86/vp9dsp_init.c +++ b/libavcodec/x86/vp9dsp_init.c @@ -177,10 +177,17 @@ itxfm_func(idct, idct, 32, avx); #undef itxfm_func #undef itxfm_funcs -void ff_vp9_loop_filter_v_16_16_ssse3(uint8_t *dst, ptrdiff_t stride, int E, int I, int H); -void ff_vp9_loop_filter_v_16_16_avx (uint8_t *dst, ptrdiff_t stride, int E, int I, int H); -void ff_vp9_loop_filter_h_16_16_ssse3(uint8_t *dst, ptrdiff_t stride, int E, int I, int H); -void ff_vp9_loop_filter_h_16_16_avx (uint8_t *dst, ptrdiff_t stride, int E, int I, int H); +#define lpf_funcs(size1, size2, opt) \ +void ff_vp9_loop_filter_v_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \ + int E, int I, int H); \ +void ff_vp9_loop_filter_h_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \ + int E, int I, int H) + +lpf_funcs(16, 16, sse2); +lpf_funcs(16, 16, ssse3); +lpf_funcs(16, 16, avx); + +#undef lpf_funcs #endif /* HAVE_YASM */ @@ -230,6 +237,10 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp) init_fpel(2, 1, 16, avg, sse2); init_fpel(1, 1, 32, avg, sse2); init_fpel(0, 1, 64, avg, sse2); + if (ARCH_X86_64) { + dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_sse2; + dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_sse2; + } } if (EXTERNAL_SSSE3(cpu_flags)) { diff --git a/libavcodec/x86/vp9lpf.asm b/libavcodec/x86/vp9lpf.asm index c5e5df9097..75ce849e59 100644 --- a/libavcodec/x86/vp9lpf.asm +++ b/libavcodec/x86/vp9lpf.asm @@ -284,11 +284,11 @@ SECTION .text %endif ; calc fm mask +%if cpuflag(ssse3) pxor m0, m0 - movd m2, Id - movd m3, Ed - pshufb m2, m0 ; I I I I ... - pshufb m3, m0 ; E E E E ... +%endif + SPLATB_REG m2, I, m0 ; I I I I ... + SPLATB_REG m3, E, m0 ; E E E E ... mova m0, [pb_80] pxor m2, m0 pxor m3, m0 @@ -340,9 +340,10 @@ SECTION .text ABSSUB_CMP m1, m9, m11, m6, m4, m5, m8 ; abs(p2 - p0) <= 1 pand m2, m1 ABSSUB m4, m10, m11, m5 ; abs(p1 - p0) +%if cpuflag(ssse3) pxor m0, m0 - movd m7, Hd - pshufb m7, m0 ; H H H H ... +%endif + SPLATB_REG m7, H, m0 ; H H H H ... pxor m7, m8 pxor m4, m8 pcmpgtb m0, m4, m7 ; abs(p1 - p0) > H (1/2 hev condition) @@ -665,6 +666,7 @@ cglobal vp9_loop_filter_h_16_16, 5,8,16, 256, dst, stride, E, I, H, mstride, dst RET %endmacro +LPF_16_16_VH sse2 LPF_16_16_VH ssse3 LPF_16_16_VH avx