FFmpeg/libavcodec/x86/lossless_videodsp.asm
Lynne bbe95f7353
x86: replace explicit REP_RETs with RETs
From x86inc:
> On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
> a branch or a branch target. So switch to a 2-byte form of ret in that case.
> We can automatically detect "follows a branch", but not a branch target.
> (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)

x86inc can automatically determine whether to use REP_RET rather than
REP in most of these cases, so impact is minimal. Additionally, a few
REP_RETs were used unnecessary, despite the return being nowhere near a
branch.

The only CPUs affected were AMD K10s, made between 2007 and 2011, 16
years ago and 12 years ago, respectively.

In the future, everyone involved with x86inc should consider dropping
REP_RETs altogether.
2023-02-01 04:23:55 +01:00

395 lines
9.6 KiB
NASM

;******************************************************************************
;* SIMD lossless video DSP utils
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2014 Michael Niedermayer
;* Copyright (c) 2017 Jokyo Images
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
cextern pb_15
pb_zzzzzzzz77777777: times 8 db -1
pb_7: times 8 db 7
pb_ef: times 8 db 14,15
pb_67: times 8 db 6, 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
pb_zzzz2323zzzzabab: db -1,-1,-1,-1, 2, 3, 2, 3,-1,-1,-1,-1,10,11,10,11
pb_zzzzzzzz67676767: db -1,-1,-1,-1,-1,-1,-1,-1, 6, 7, 6, 7, 6, 7, 6, 7
SECTION .text
;------------------------------------------------------------------------------
; void ff_add_median_pred(uint8_t *dst, const uint8_t *top,
; const uint8_t *diff, int w,
; int *left, int *left_top)
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal add_median_pred, 6,6,8, dst, top, diff, w, left, left_top
movu m0, [topq]
mova m2, m0
movd m4, [left_topq]
LSHIFT m2, 1
mova m1, m0
por m4, m2
movd m3, [leftq]
psubb m0, m4 ; t-tl
add dstq, wq
add topq, wq
add diffq, wq
neg wq
jmp .skip
.loop:
movu m4, [topq+wq]
mova m0, m4
LSHIFT m4, 1
por m4, m1
mova m1, m0 ; t
psubb m0, m4 ; t-tl
.skip:
movu m2, [diffq+wq]
%assign i 0
%rep mmsize
mova m4, m0
paddb m4, m3 ; t-tl+l
mova m5, m3
pmaxub m3, m1
pminub m5, m1
pminub m3, m4
pmaxub m3, m5 ; median
paddb m3, m2 ; +residual
%if i==0
mova m7, m3
LSHIFT m7, mmsize-1
%else
mova m6, m3
RSHIFT m7, 1
LSHIFT m6, mmsize-1
por m7, m6
%endif
%if i<mmsize-1
RSHIFT m0, 1
RSHIFT m1, 1
RSHIFT m2, 1
%endif
%assign i i+1
%endrep
movu [dstq+wq], m7
add wq, mmsize
jl .loop
movzx r2d, byte [dstq-1]
mov [leftq], r2d
movzx r2d, byte [topq-1]
mov [left_topq], r2d
RET
%macro ADD_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
add srcq, wq
add dstq, wq
neg wq
%%.loop:
pshufb xm0, xm5
%if %2
mova m1, [srcq+wq]
%else
movu m1, [srcq+wq]
%endif
psllw m2, m1, 8
paddb m1, m2
pshufb m2, m1, m3
paddb m1, m2
pshufb m2, m1, m4
paddb m1, m2
%if mmsize >= 16
pshufb m2, m1, m6
paddb m1, m2
%endif
paddb xm0, xm1
%if %1
mova [dstq+wq], xm0
%else
movq [dstq+wq], xm0
movhps [dstq+wq+8], xm0
%endif
%if mmsize == 32
vextracti128 xm2, m1, 1 ; get second lane of the ymm
pshufb xm0, xm5 ; set alls val to last val of the first lane
paddb xm0, xm2
;store val
%if %1
mova [dstq+wq+16], xm0
%else;
movq [dstq+wq+16], xm0
movhps [dstq+wq+16+8], xm0
%endif
%endif
add wq, mmsize
jl %%.loop
%if mmsize == 32
movzx eax, byte [dstq - 1]
%else;
mov eax, mmsize-1
sub eax, wd
movd m1, eax
pshufb m0, m1
movd eax, m0
%endif
RET
%endmacro
;------------------------------------------------------------------------------
; int ff_add_left_pred(uint8_t *dst, const uint8_t *src, int w, int left)
;------------------------------------------------------------------------------
INIT_MMX ssse3
cglobal add_left_pred, 3,3,7, dst, src, w, left
.skip_prologue:
mova m5, [pb_7]
mova m4, [pb_zzzz3333zzzzbbbb]
mova m3, [pb_zz11zz55zz99zzdd]
movd m0, leftm
psllq m0, 56
ADD_LEFT_LOOP 1, 1
%macro ADD_LEFT_PRED_UNALIGNED 0
cglobal add_left_pred_unaligned, 3,3,7, dst, src, w, left
mova xm5, [pb_15]
VBROADCASTI128 m6, [pb_zzzzzzzz77777777]
VBROADCASTI128 m4, [pb_zzzz3333zzzzbbbb]
VBROADCASTI128 m3, [pb_zz11zz55zz99zzdd]
movd xm0, leftm
pslldq xm0, 15
test srcq, mmsize - 1
jnz .src_unaligned
test dstq, mmsize - 1
jnz .dst_unaligned
ADD_LEFT_LOOP 1, 1
.dst_unaligned:
ADD_LEFT_LOOP 0, 1
.src_unaligned:
ADD_LEFT_LOOP 0, 0
%endmacro
INIT_XMM ssse3
ADD_LEFT_PRED_UNALIGNED
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
ADD_LEFT_PRED_UNALIGNED
%endif
;------------------------------------------------------------------------------
; void ff_add_bytes(uint8_t *dst, uint8_t *src, ptrdiff_t w);
;------------------------------------------------------------------------------
%macro ADD_BYTES 0
cglobal add_bytes, 3,4,2, dst, src, w, size
mov sizeq, wq
and sizeq, -2*mmsize
jz .2
add dstq, sizeq
add srcq, sizeq
neg sizeq
.1:
mova m0, [srcq + sizeq]
mova m1, [srcq + sizeq + mmsize]
paddb m0, [dstq + sizeq]
paddb m1, [dstq + sizeq + mmsize]
mova [dstq + sizeq], m0
mova [dstq + sizeq + mmsize], m1
add sizeq, 2*mmsize
jl .1
.2:
and wq, 2*mmsize-1
jz .end
add dstq, wq
add srcq, wq
neg wq
.3:
mov sizeb, [srcq + wq]
add [dstq + wq], sizeb
inc wq
jl .3
.end:
RET
%endmacro
INIT_XMM sse2
ADD_BYTES
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
ADD_BYTES
%endif
%macro ADD_HFYU_LEFT_LOOP_INT16 2 ; %1 = dst alignment (a/u), %2 = src alignment (a/u)
add wd, wd
add srcq, wq
add dstq, wq
neg wq
%%.loop:
mov%2 m1, [srcq+wq]
mova m2, m1
pslld m1, 16
paddw m1, m2
mova m2, m1
pshufb m1, m3
paddw m1, m2
pshufb m0, m5
%if mmsize == 16
mova m2, m1
pshufb m1, m4
paddw m1, m2
%endif
paddw m0, m1
pand m0, m7
%ifidn %1, a
mova [dstq+wq], m0
%else
movq [dstq+wq], m0
movhps [dstq+wq+8], m0
%endif
add wq, mmsize
jl %%.loop
mov eax, mmsize-1
sub eax, wd
mov wd, eax
shl wd, 8
lea eax, [wd+eax-1]
movd m1, eax
pshufb m0, m1
movd eax, m0
RET
%endmacro
;---------------------------------------------------------------------------------------------
; int add_left_pred_int16(uint16_t *dst, const uint16_t *src, unsigned mask, int w, int left)
;---------------------------------------------------------------------------------------------
INIT_MMX ssse3
cglobal add_left_pred_int16, 4,4,8, dst, src, mask, w, left
.skip_prologue:
mova m5, [pb_67]
mova m3, [pb_zzzz2323zzzzabab]
movd m0, leftm
psllq m0, 48
movd m7, maskm
SPLATW m7 ,m7
ADD_HFYU_LEFT_LOOP_INT16 a, a
INIT_XMM ssse3
cglobal add_left_pred_int16_unaligned, 4,4,8, dst, src, mask, w, left
mova m5, [pb_ef]
mova m4, [pb_zzzzzzzz67676767]
mova m3, [pb_zzzz2323zzzzabab]
movd m0, leftm
pslldq m0, 14
movd m7, maskm
SPLATW m7 ,m7
test srcq, 15
jnz .src_unaligned
test dstq, 15
jnz .dst_unaligned
ADD_HFYU_LEFT_LOOP_INT16 a, a
.dst_unaligned:
ADD_HFYU_LEFT_LOOP_INT16 u, a
.src_unaligned:
ADD_HFYU_LEFT_LOOP_INT16 u, u
;---------------------------------------------------------------------------------------------
; void add_gradient_pred(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width)
;---------------------------------------------------------------------------------------------
%macro ADD_GRADIENT_PRED 0
cglobal add_gradient_pred, 3,4,5, src, stride, width, tmp
mova xm0, [pb_15]
;load src - 1 in xm1
movd xm1, [srcq-1]
%if cpuflag(avx2)
vpbroadcastb xm1, xm1
%else
pxor xm2, xm2
pshufb xm1, xm2
%endif
add srcq, widthq
neg widthq
neg strideq
.loop:
lea tmpq, [srcq + strideq]
mova m2, [tmpq + widthq] ; A = src[x-stride]
movu m3, [tmpq + widthq - 1] ; B = src[x - (stride + 1)]
mova m4, [srcq + widthq] ; current val (src[x])
psubb m2, m3; A - B
; prefix sum A-B
pslldq m3, m2, 1
paddb m2, m3
pslldq m3, m2, 2
paddb m2, m3
pslldq m3, m2, 4
paddb m2, m3
pslldq m3, m2, 8
paddb m2, m3
; prefix sum current val
pslldq m3, m4, 1
paddb m4, m3
pslldq m3, m4, 2
paddb m4, m3
pslldq m3, m4, 4
paddb m4, m3
pslldq m3, m4, 8
paddb m4, m3
; last sum
paddb m2, m4 ; current + (A - B)
paddb xm1, xm2 ; += C
mova [srcq + widthq], xm1 ; store
pshufb xm1, xm0 ; put last val in all val of xm1
%if mmsize == 32
vextracti128 xm2, m2, 1 ; get second lane of the ymm
paddb xm1, xm2; += C
mova [srcq + widthq + 16], xm1 ; store
pshufb xm1, xm0 ; put last val in all val of m1
%endif
add widthq, mmsize
jl .loop
RET
%endmacro
INIT_XMM ssse3
ADD_GRADIENT_PRED
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
ADD_GRADIENT_PRED
%endif