From 630967ef63d0f2a5cc12b06815af0ec6cb5c9d2a Mon Sep 17 00:00:00 2001
From: Martin Vignali
Date: Sat, 2 Dec 2017 19:46:42 +0100
Subject: [PATCH] avcodec/utvideodec : add SIMD (SSSE3 and AVX2) for gradient_pred

---
Notes (below the "---" marker, so not part of the commit message):

  * Adds an add_gradient_pred entry to LLVidDSPContext, with a C
    implementation (add_gradient_pred_c) and x86 SSSE3 / AVX2 versions
    generated from the ADD_GRADIENT_PRED macro.
  * restore_gradient_planar() keeps the scalar loop for the first
    FFMIN(width, 32) bytes of each row and, when width > 32, calls
    add_gradient_pred(bsrc + 32, stride, width - 32) for the rest.
  * The asm loop advances by mmsize bytes per iteration until the negated
    width counter reaches zero, so it processes the width rounded up to a
    multiple of mmsize, using mova on the current and the previous row.
  * Line breaks and indentation of this copy were restored from the hunk
    line counts. Runs of spaces inside a line (e.g. '=' alignment) could
    not be recovered and are written as single spaces; if a context line
    does not match the tree, apply with `git apply --ignore-whitespace`.

 libavcodec/lossless_videodsp.c          | 11 ++++
 libavcodec/lossless_videodsp.h          |  1 +
 libavcodec/utvideodec.c                 |  5 +-
 libavcodec/x86/lossless_videodsp.asm    | 80 +++++++++++++++++++++++++
 libavcodec/x86/lossless_videodsp_init.c |  5 ++
 5 files changed, 101 insertions(+), 1 deletion(-)

diff --git a/libavcodec/lossless_videodsp.c b/libavcodec/lossless_videodsp.c
index b5b96e6129..cff94c234d 100644
--- a/libavcodec/lossless_videodsp.c
+++ b/libavcodec/lossless_videodsp.c
@@ -98,6 +98,16 @@ static int add_left_pred_int16_c(uint16_t *dst, const uint16_t *src, unsigned ma
 
     return acc;
 }
+static void add_gradient_pred_c(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width){
+    int A, B, C, i;
+
+    for (i = 0; i < width; i++) {
+        A = src[i - stride];
+        B = src[i - (stride + 1)];
+        C = src[i - 1];
+        src[i] = (A - B + C + src[i]) & 0xFF;
+    }
+}
 
 void ff_llviddsp_init(LLVidDSPContext *c)
 {
@@ -106,6 +116,7 @@ void ff_llviddsp_init(LLVidDSPContext *c)
     c->add_left_pred = add_left_pred_c;
 
     c->add_left_pred_int16 = add_left_pred_int16_c;
+    c->add_gradient_pred = add_gradient_pred_c;
 
     if (ARCH_PPC)
         ff_llviddsp_init_ppc(c);
diff --git a/libavcodec/lossless_videodsp.h b/libavcodec/lossless_videodsp.h
index ccab39bac6..8077898d1a 100644
--- a/libavcodec/lossless_videodsp.h
+++ b/libavcodec/lossless_videodsp.h
@@ -39,6 +39,7 @@ typedef struct LLVidDSPContext {
 
     int (*add_left_pred_int16)(uint16_t *dst, const uint16_t *src,
                                unsigned mask, ptrdiff_t w, unsigned left);
+    void (*add_gradient_pred)(uint8_t *src /* align 32 */, const ptrdiff_t stride, const ptrdiff_t width);
 } LLVidDSPContext;
 
 void ff_llviddsp_init(LLVidDSPContext *llviddsp);
diff --git a/libavcodec/utvideodec.c b/libavcodec/utvideodec.c
index d2da825fbf..b85cb5daa6 100644
--- a/libavcodec/utvideodec.c
+++ b/libavcodec/utvideodec.c
@@ -460,6 +460,7 @@ static void restore_gradient_planar(UtvideoContext *c, uint8_t *src, ptrdiff_t s
     uint8_t *bsrc;
     int slice_start, slice_height;
     const int cmask = ~rmode;
+    int min_width = FFMIN(width, 32);
 
     for (slice = 0; slice < slices; slice++) {
         slice_start = ((slice * height) / slices) & cmask;
@@ -479,12 +480,14 @@ static void restore_gradient_planar(UtvideoContext *c, uint8_t *src, ptrdiff_t s
         for (j = 1; j < slice_height; j++) {
             // second line - first element has top prediction, the rest uses gradient
             bsrc[0] = (bsrc[0] + bsrc[-stride]) & 0xFF;
-            for (i = 1; i < width; i++) {
+            for (i = 1; i < min_width; i++) { /* dsp need align 32 */
                 A = bsrc[i - stride];
                 B = bsrc[i - (stride + 1)];
                 C = bsrc[i - 1];
                 bsrc[i] = (A - B + C + bsrc[i]) & 0xFF;
             }
+            if (width > 32)
+                c->llviddsp.add_gradient_pred(bsrc + 32, stride, width - 32);
             bsrc += stride;
         }
     }
diff --git a/libavcodec/x86/lossless_videodsp.asm b/libavcodec/x86/lossless_videodsp.asm
index cfa0620fd1..9a169fe314 100644
--- a/libavcodec/x86/lossless_videodsp.asm
+++ b/libavcodec/x86/lossless_videodsp.asm
@@ -2,6 +2,7 @@
 ;* SIMD lossless video DSP utils
 ;* Copyright (c) 2008 Loren Merritt
 ;* Copyright (c) 2014 Michael Niedermayer
+;* Copyright (c) 2017 Jokyo Images
 ;*
 ;* This file is part of FFmpeg.
 ;*
@@ -325,3 +326,82 @@ cglobal add_left_pred_int16, 4,4,8, dst, src, mask, w, left
     ADD_HFYU_LEFT_LOOP_INT16 u, a
 .src_unaligned:
     ADD_HFYU_LEFT_LOOP_INT16 u, u
+
+
+;---------------------------------------------------------------------------------------------
+; void add_gradient_pred(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width)
+;---------------------------------------------------------------------------------------------
+%macro ADD_GRADIENT_PRED 0
+cglobal add_gradient_pred, 3,4,5, src, stride, width, tmp
+    mova xm0, [pb_15]
+
+;load src - 1 in xm1
+    movd xm1, [srcq-1]
+%if cpuflag(avx2)
+    vpbroadcastb xm1, xm1
+%else
+    pxor xm2, xm2
+    pshufb xm1, xm2
+%endif
+
+    add srcq, widthq
+    neg widthq
+    neg strideq
+
+.loop:
+    lea tmpq, [srcq + strideq]
+    mova m2, [tmpq + widthq] ; A = src[x-stride]
+    movu m3, [tmpq + widthq - 1] ; B = src[x - (stride + 1)]
+    mova m4, [srcq + widthq] ; current val (src[x])
+
+    psubb m2, m3; A - B
+
+; prefix sum A-B
+    pslldq m3, m2, 1
+    paddb m2, m3
+    pslldq m3, m2, 2
+    paddb m2, m3
+    pslldq m3, m2, 4
+    paddb m2, m3
+    pslldq m3, m2, 8
+    paddb m2, m3
+
+; prefix sum current val
+    pslldq m3, m4, 1
+    paddb m4, m3
+    pslldq m3, m4, 2
+    paddb m4, m3
+    pslldq m3, m4, 4
+    paddb m4, m3
+    pslldq m3, m4, 8
+    paddb m4, m3
+
+; last sum
+    paddb m2, m4 ; current + (A - B)
+
+    paddb xm1, xm2 ; += C
+    mova [srcq + widthq], xm1 ; store
+
+    pshufb xm1, xm0 ; put last val in all val of xm1
+
+%if mmsize == 32
+    vextracti128 xm2, m2, 1 ; get second lane of the ymm
+    paddb xm1, xm2; += C
+
+    mova [srcq + widthq + 16], xm1 ; store
+    pshufb xm1, xm0 ; put last val in all val of m1
+%endif
+
+    add widthq, mmsize
+    jl .loop
+    RET
+
+%endmacro
+
+INIT_XMM ssse3
+ADD_GRADIENT_PRED
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+ADD_GRADIENT_PRED
+%endif
diff --git a/libavcodec/x86/lossless_videodsp_init.c b/libavcodec/x86/lossless_videodsp_init.c
index beae317cc2..e3063de462 100644
--- a/libavcodec/x86/lossless_videodsp_init.c
+++ b/libavcodec/x86/lossless_videodsp_init.c
@@ -44,6 +44,9 @@ int ff_add_left_pred_unaligned_avx2(uint8_t *dst, const uint8_t *src,
 int ff_add_left_pred_int16_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
 int ff_add_left_pred_int16_sse4(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
 
+void ff_add_gradient_pred_ssse3(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width);
+void ff_add_gradient_pred_avx2(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width);
+
 #if HAVE_INLINE_ASM && HAVE_7REGS && ARCH_X86_32
 static void add_median_pred_cmov(uint8_t *dst, const uint8_t *top,
                                  const uint8_t *diff, ptrdiff_t w,
@@ -109,6 +112,7 @@ void ff_llviddsp_init_x86(LLVidDSPContext *c)
     if (EXTERNAL_SSSE3(cpu_flags)) {
         c->add_left_pred = ff_add_left_pred_ssse3;
         c->add_left_pred_int16 = ff_add_left_pred_int16_ssse3;
+        c->add_gradient_pred = ff_add_gradient_pred_ssse3;
     }
 
     if (EXTERNAL_SSSE3_FAST(cpu_flags)) {
@@ -121,5 +125,6 @@ void ff_llviddsp_init_x86(LLVidDSPContext *c)
     if (EXTERNAL_AVX2_FAST(cpu_flags)) {
         c->add_bytes = ff_add_bytes_avx2;
         c->add_left_pred = ff_add_left_pred_unaligned_avx2;
+        c->add_gradient_pred = ff_add_gradient_pred_avx2;
     }
 }