x86: hevc_mc: load less data in epel filters

Before:
5679 decicycles in epel_bi, 2059976 runs, 37176 skips
3468 decicycles in epel_uni, 1040886 runs, 7690 skips

After:
5323 decicycles in epel_bi, 2059493 runs, 37659 skips
3262 decicycles in epel_uni, 1040871 runs, 7705 skips

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
Christophe Gisquet 2014-07-25 15:08:49 +02:00 committed by Michael Niedermayer
parent 36284ae981
commit 81943a10b5

View File

@@ -176,15 +176,23 @@ QPEL_TABLE 12, 4, w, sse4
%else
%define rfilterq %2
%endif
movdqu m0, [rfilterq ] ;load 128bit of x
%ifnum %3
movdqu m1, [rfilterq+ %3] ;load 128bit of x+stride
movdqu m2, [rfilterq+2*%3] ;load 128bit of x+2*stride
movdqu m3, [rfilterq+3*%3] ;load 128bit of x+3*stride
%if (%1 == 8 && %4 <= 4)
%define %%load movd
%elif (%1 == 8 && %4 <= 8) || (%1 > 8 && %4 <= 4)
%define %%load movq
%else
movdqu m1, [rfilterq+ %3q] ;load 128bit of x+stride
movdqu m2, [rfilterq+2*%3q] ;load 128bit of x+2*stride
movdqu m3, [rfilterq+r3srcq] ;load 128bit of x+2*stride
%define %%load movdqu
%endif
%%load m0, [rfilterq ]
%ifnum %3
%%load m1, [rfilterq+ %3]
%%load m2, [rfilterq+2*%3]
%%load m3, [rfilterq+3*%3]
%else
%%load m1, [rfilterq+ %3q]
%%load m2, [rfilterq+2*%3q]
%%load m3, [rfilterq+r3srcq]
%endif
%if %1 == 8