SSSE3 versions of width4 VP8 6-tap MC functions

Also make some small changes to saturation order of 4-tap SSSE3 MC to fix a
non-bitexactness bug.

Patch mostly by Eli Friedman <eli.friedman AT gmail DOT com>.

Originally committed as revision 23965 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
Jason Garrett-Glaser 2010-07-02 05:27:41 +00:00
parent 301ab19dd9
commit dcc602d802
2 changed files with 192 additions and 161 deletions

View File

@ -54,6 +54,18 @@ extern void ff_put_vp8_epel8_v6_sse2 (uint8_t *dst, int dststride,
uint8_t *src, int srcstride,
int height, int mx, int my);
extern void ff_put_vp8_epel4_h4_ssse3 (uint8_t *dst, int dststride,
uint8_t *src, int srcstride,
int height, int mx, int my);
extern void ff_put_vp8_epel4_h6_ssse3 (uint8_t *dst, int dststride,
uint8_t *src, int srcstride,
int height, int mx, int my);
extern void ff_put_vp8_epel4_v4_ssse3 (uint8_t *dst, int dststride,
uint8_t *src, int srcstride,
int height, int mx, int my);
extern void ff_put_vp8_epel4_v6_ssse3 (uint8_t *dst, int dststride,
uint8_t *src, int srcstride,
int height, int mx, int my);
extern void ff_put_vp8_epel8_h4_ssse3 (uint8_t *dst, int dststride,
uint8_t *src, int srcstride,
int height, int mx, int my);
@ -173,6 +185,11 @@ HVTAPSSE2(6, 4, 8)
HVTAPSSE2(6, 6, 8)
HVTAPSSE2(6, 6, 16)
HVTAP(ssse3, 16, 4, 4, 4, 8)
HVTAP(ssse3, 16, 4, 6, 4, 8)
HVTAP(ssse3, 16, 6, 4, 4, 8)
HVTAP(ssse3, 16, 6, 6, 4, 8)
#define HVBILIN(OPT, ALIGN, SIZE, MAXHEIGHT) \
static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
uint8_t *dst, int dststride, uint8_t *src, \
@ -264,6 +281,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
if (mm_flags & FF_MM_SSSE3) {
VP8_LUMA_MC_FUNC(0, 16, ssse3);
VP8_MC_FUNC(1, 8, ssse3);
VP8_MC_FUNC(2, 4, ssse3);
VP8_BILINEAR_MC_FUNC(0, 16, ssse3);
VP8_BILINEAR_MC_FUNC(1, 8, ssse3);
}

View File

@ -44,14 +44,14 @@ sixtap_filter_hw_m: times 4 dw 2, -11
times 4 dw 36, 108
times 4 dw -11, 2
fourtap_filter_hb_m: times 8 db -6, -1
times 8 db 123, 12
times 8 db -9, -6
times 8 db 93, 50
times 8 db -6, -9
times 8 db 50, 93
times 8 db -1, -6
times 8 db 12, 123
fourtap_filter_hb_m: times 8 db -6, 123
times 8 db 12, -1
times 8 db -9, 93
times 8 db 50, -6
times 8 db -6, 50
times 8 db 93, -9
times 8 db -1, 12
times 8 db 123, -6
sixtap_filter_hb_m: times 8 db 2, 1
times 8 db -11, 108
@ -136,7 +136,7 @@ bilinear_filter_vb_m: times 8 db 7, 1
%endif
filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
filter_h4_shuf: db 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10
filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
@ -159,6 +159,171 @@ SECTION .text
; int height, int mx, int my);
;-----------------------------------------------------------------------------
%macro FILTER_SSSE3 3
cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2
lea r5d, [r5*3]
mova m3, [filter_h6_shuf2]
mova m4, [filter_h6_shuf3]
%ifdef PIC
lea r11, [sixtap_filter_hb_m]
%endif
mova m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes
mova m6, [sixtap_filter_hb+r5*8-32]
mova m7, [sixtap_filter_hb+r5*8-16]
.nextrow
movu m0, [r2-2]
mova m1, m0
mova m2, m0
%ifidn %1, 4
; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
; shuffle with a memory operand
punpcklbw m0, [r2+3]
%else
pshufb m0, [filter_h6_shuf1]
%endif
pshufb m1, m3
pshufb m2, m4
pmaddubsw m0, m5
pmaddubsw m1, m6
pmaddubsw m2, m7
paddsw m0, m1
paddsw m0, m2
paddsw m0, [pw_64]
psraw m0, 7
packuswb m0, m0
movh [r0], m0 ; store
; go to next line
add r0, r1
add r2, r3
dec r4 ; next row
jg .nextrow
REP_RET
cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3
shl r5d, 4
mova m2, [pw_64]
mova m3, [filter_h2_shuf]
mova m4, [filter_h4_shuf]
%ifdef PIC
lea r11, [fourtap_filter_hb_m]
%endif
mova m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes
mova m6, [fourtap_filter_hb+r5]
.nextrow
movu m0, [r2-1]
mova m1, m0
pshufb m0, m3
pshufb m1, m4
pmaddubsw m0, m5
pmaddubsw m1, m6
paddsw m0, m2
paddsw m0, m1
psraw m0, 7
packuswb m0, m0
movh [r0], m0 ; store
; go to next line
add r0, r1
add r2, r3
dec r4 ; next row
jg .nextrow
REP_RET
cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2
shl r6d, 4
%ifdef PIC
lea r11, [fourtap_filter_hb_m]
%endif
mova m5, [fourtap_filter_hb+r6-16]
mova m6, [fourtap_filter_hb+r6]
mova m7, [pw_64]
; read 3 lines
sub r2, r3
movh m0, [r2]
movh m1, [r2+ r3]
movh m2, [r2+2*r3]
add r2, r3
.nextrow
movh m3, [r2+2*r3] ; read new row
mova m4, m0
mova m0, m1
punpcklbw m4, m1
mova m1, m2
punpcklbw m2, m3
pmaddubsw m4, m5
pmaddubsw m2, m6
paddsw m4, m2
mova m2, m3
paddsw m4, m7
psraw m4, 7
packuswb m4, m4
movh [r0], m4
; go to next line
add r0, r1
add r2, r3
dec r4 ; next row
jg .nextrow
REP_RET
cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2
lea r6d, [r6*3]
%ifdef PIC
lea r11, [sixtap_filter_hb_m]
%endif
lea r6, [sixtap_filter_hb+r6*8]
; read 5 lines
sub r2, r3
sub r2, r3
movh m0, [r2]
movh m1, [r2+r3]
movh m2, [r2+r3*2]
lea r2, [r2+r3*2]
add r2, r3
movh m3, [r2]
movh m4, [r2+r3]
.nextrow
movh m5, [r2+2*r3] ; read new row
mova m6, m0
punpcklbw m6, m5
mova m0, m1
punpcklbw m1, m2
mova m7, m3
punpcklbw m7, m4
pmaddubsw m6, [r6-48]
pmaddubsw m1, [r6-32]
pmaddubsw m7, [r6-16]
paddsw m6, m1
paddsw m6, m7
mova m1, m2
paddsw m6, [pw_64]
mova m2, m3
psraw m6, 7
mova m3, m4
packuswb m6, m6
mova m4, m5
movh [r0], m6
; go to next line
add r0, r1
add r2, r3
dec r4 ; next row
jg .nextrow
REP_RET
%endmacro
INIT_MMX
FILTER_SSSE3 4, 0, 0
INIT_XMM
FILTER_SSSE3 8, 8, 7
; 4x4 block, H-only 4-tap filter
cglobal put_vp8_epel4_h4_mmxext, 6, 6
shl r5d, 4
@ -383,72 +548,6 @@ cglobal put_vp8_epel8_h6_sse2, 6, 6, 8
jg .nextrow
REP_RET
cglobal put_vp8_epel8_h4_ssse3, 6, 6, 7
shl r5d, 4
mova m2, [pw_64]
mova m3, [filter_h4_shuf]
mova m4, [filter_h6_shuf2]
%ifdef PIC
lea r11, [fourtap_filter_hb_m]
%endif
mova m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes
mova m6, [fourtap_filter_hb+r5]
.nextrow
movu m0, [r2-1]
mova m1, m0
pshufb m0, m3
pshufb m1, m4
pmaddubsw m0, m5
pmaddubsw m1, m6
paddsw m0, m2
paddsw m0, m1
psraw m0, 7
packuswb m0, m0
movh [r0], m0 ; store
; go to next line
add r0, r1
add r2, r3
dec r4 ; next row
jg .nextrow
REP_RET
cglobal put_vp8_epel8_h6_ssse3, 6, 6, 8
lea r5d, [r5*3]
mova m3, [filter_h6_shuf1]
mova m4, [filter_h6_shuf2]
%ifdef PIC
lea r11, [sixtap_filter_hb_m]
%endif
mova m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes
mova m6, [sixtap_filter_hb+r5*8-32]
mova m7, [sixtap_filter_hb+r5*8-16]
.nextrow
movu m0, [r2-2]
mova m1, m0
mova m2, m0
pshufb m0, m3
pshufb m1, m4
pshufb m2, [filter_h6_shuf3]
pmaddubsw m0, m5
pmaddubsw m1, m6
pmaddubsw m2, m7
paddsw m0, m1
paddsw m0, m2
paddsw m0, [pw_64]
psraw m0, 7
packuswb m0, m0
movh [r0], m0 ; store
; go to next line
add r0, r1
add r2, r3
dec r4 ; next row
jg .nextrow
REP_RET
%macro FILTER_V 3
; 4x4 block, V-only 4-tap filter
cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
@ -573,92 +672,6 @@ FILTER_V mmxext, 4, 0
INIT_XMM
FILTER_V sse2, 8, 8
cglobal put_vp8_epel8_v4_ssse3, 7, 7, 8
shl r6d, 4
%ifdef PIC
lea r11, [fourtap_filter_hb_m]
%endif
mova m5, [fourtap_filter_hb+r6-16]
mova m6, [fourtap_filter_hb+r6]
mova m7, [pw_64]
; read 3 lines
sub r2, r3
movh m0, [r2]
movh m1, [r2+ r3]
movh m2, [r2+2*r3]
add r2, r3
.nextrow
movh m3, [r2+2*r3] ; read new row
mova m4, m0
mova m0, m1
punpcklbw m4, m3
punpcklbw m1, m2
pmaddubsw m4, m5
pmaddubsw m1, m6
paddsw m4, m1
mova m1, m2
paddsw m4, m7
mova m2, m3
psraw m4, 7
packuswb m4, m4
movh [r0], m4
; go to next line
add r0, r1
add r2, r3
dec r4 ; next row
jg .nextrow
REP_RET
cglobal put_vp8_epel8_v6_ssse3, 7, 7, 8
lea r6d, [r6*3]
%ifdef PIC
lea r11, [sixtap_filter_hb_m]
%endif
lea r6, [sixtap_filter_hb+r6*8]
; read 5 lines
sub r2, r3
sub r2, r3
movh m0, [r2]
movh m1, [r2+r3]
movh m2, [r2+r3*2]
lea r2, [r2+r3*2]
add r2, r3
movh m3, [r2]
movh m4, [r2+r3]
.nextrow
movh m5, [r2+2*r3] ; read new row
mova m6, m0
punpcklbw m6, m5
mova m0, m1
punpcklbw m1, m2
mova m7, m3
punpcklbw m7, m4
pmaddubsw m6, [r6-48]
pmaddubsw m1, [r6-32]
pmaddubsw m7, [r6-16]
paddsw m6, m1
paddsw m6, m7
mova m1, m2
paddsw m6, [pw_64]
mova m2, m3
psraw m6, 7
mova m3, m4
packuswb m6, m6
mova m4, m5
movh [r0], m6
; go to next line
add r0, r1
add r2, r3
dec r4 ; next row
jg .nextrow
REP_RET
%macro FILTER_BILINEAR 3
cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
mov r5d, 8*16