lavc/aarch64: Add neon implementation for vsse_intra16

Provide optimized implementation for vsse_intra16 for arm64.

Performance tests are shown below.
- vsse_4_c: 155.2
- vsse_4_neon: 36.2

Benchmarks and tests are run with checkasm tool on AWS Graviton 3.

Signed-off-by: Hubert Mazur <hum@semihalf.com>
Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
Hubert Mazur 2022-09-08 11:25:06 +02:00 committed by Martin Storsjö
parent ce03ea3e79
commit 908abe8032
2 changed files with 66 additions and 0 deletions

View File

@ -47,6 +47,8 @@ int vsad_intra16_neon(MpegEncContext *c, const uint8_t *s, const uint8_t *dummy,
ptrdiff_t stride, int h) ;
int vsse16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
ptrdiff_t stride, int h);
int vsse_intra16_neon(MpegEncContext *c, const uint8_t *s, const uint8_t *dummy,
ptrdiff_t stride, int h);
av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
{
@ -69,5 +71,6 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
c->vsad[4] = vsad_intra16_neon;
c->vsse[0] = vsse16_neon;
c->vsse[4] = vsse_intra16_neon;
}
}

View File

@ -784,3 +784,66 @@ function vsad_intra16_neon, export=1
ret
endfunc
function vsse_intra16_neon, export=1
        // Vertical sum of squared errors inside one 16-pixel-wide block:
        // for every pair of vertically adjacent rows, the per-column
        // difference is squared and added to the score.
        //
        //   v     = abs( pix1[x] - pix1[x + stride] )   for x = 0..15
        //   score = sum( v * v )                         over the h-1 row pairs
        //
        // x0           unused
        // x1           uint8_t *pix1
        // x2           uint8_t *dummy (never read)
        // x3           ptrdiff_t stride
        // w4           int h
        //
        // Returns the score in w0.
        // Modifies x1, w4, the flags, v0-v3, v16-v17 and v24-v30
        // (all caller-saved registers under AAPCS64).

        movi            v16.4s, #0              // accumulator fed by the low 8 columns
        movi            v17.4s, #0              // accumulator fed by the high 8 columns

        subs            w4, w4, #1              // w4 = number of row pairs = h - 1
        b.le            3f                      // h <= 1: no row pairs, return 0 without reading pix1

        ld1             {v0.16b}, [x1], x3      // v0 always holds the previous row
        cmp             w4, #3
        b.lt            2f                      // fewer than 3 pairs: single-step loop only

// Main loop: three row pairs per iteration. Loads, absolute differences,
// widening multiplies and accumulations are interleaved on purpose.
1:
        ld1             {v1.16b}, [x1], x3
        ld1             {v2.16b}, [x1], x3
        uabd            v30.16b, v0.16b, v1.16b         // |row0 - row1|
        ld1             {v3.16b}, [x1], x3
        umull           v29.8h, v30.8b, v30.8b          // squares, low 8 columns
        umull2          v28.8h, v30.16b, v30.16b        // squares, high 8 columns
        uabd            v27.16b, v1.16b, v2.16b         // |row1 - row2|
        uadalp          v16.4s, v29.8h                  // pairwise widen to 32 bit and accumulate
        umull           v26.8h, v27.8b, v27.8b
        umull2          v27.8h, v27.16b, v27.16b
        uadalp          v17.4s, v28.8h
        uabd            v25.16b, v2.16b, v3.16b         // |row2 - row3|
        uadalp          v16.4s, v26.8h
        umull           v24.8h, v25.8b, v25.8b
        umull2          v25.8h, v25.16b, v25.16b
        uadalp          v17.4s, v27.8h
        sub             w4, w4, #3                      // three row pairs consumed
        uadalp          v16.4s, v24.8h
        cmp             w4, #3                          // flags for b.ge; nothing below touches them
        uadalp          v17.4s, v25.8h
        mov             v0.16b, v3.16b                  // last row becomes the next previous row
        b.ge            1b
        cbz             w4, 3f                          // no leftover row pairs

// Tail loop: one row pair per iteration; w4 is always > 0 on entry.
2:
        ld1             {v1.16b}, [x1], x3
        subs            w4, w4, #1
        uabd            v30.16b, v0.16b, v1.16b
        mov             v0.16b, v1.16b                  // current row becomes the previous row
        umull           v29.8h, v30.8b, v30.8b
        umull2          v30.8h, v30.16b, v30.16b
        uadalp          v16.4s, v29.8h
        uadalp          v17.4s, v30.8h
        cbnz            w4, 2b

// Reduce both accumulators to a single scalar result.
3:
        add             v16.4s, v16.4s, v17.4s
        uaddlv          d17, v16.4s                     // horizontal sum of the four lanes
        fmov            w0, s17                         // low 32 bits are the int return value

        ret
endfunc