lavc/aarch64: Add neon implementation of vsse16

Provide optimized implementation of vsse16 for arm64.

Performance comparison tests are shown below.
- vsse_0_c: 257.7
- vsse_0_neon: 59.2

Benchmarks and tests are run with checkasm tool on AWS Graviton 3.

Signed-off-by: Hubert Mazur <hum@semihalf.com>
Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
Hubert Mazur 2022-09-08 11:25:04 +02:00 committed by Martin Storsjö
parent 200f5e578f
commit c495a4b32d
2 changed files with 91 additions and 0 deletions

View File

@ -43,6 +43,8 @@ int sse4_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
int vsad16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
ptrdiff_t stride, int h);
int vsse16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
ptrdiff_t stride, int h);
av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
{
@ -62,5 +64,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
c->sse[2] = sse4_neon;
c->vsad[0] = vsad16_neon;
c->vsse[0] = vsse16_neon;
}
}

View File

@ -649,3 +649,90 @@ function vsad16_neon, export=1
ret
endfunc
function vsse16_neon, export=1
        // Vertical sum of squared errors over a 16-pixel-wide block.
        // With d[y][x] = pix1[y*stride + x] - pix2[y*stride + x], computes
        //   sum over y = 1..h-1, x = 0..15 of (d[y-1][x] - d[y][x])^2
        //
        // x0           unused
        // x1           uint8_t *pix1
        // x2           uint8_t *pix2
        // x3           ptrdiff_t stride
        // w4           int h
        //
        // Returns the sum in w0; 0 when h <= 1 (no vertical row pairs).
        //
        // Register roles:
        //   v16, v17     32-bit accumulators (low / high halves of each product set)
        //   v31, v30     difference of the previous row (pixels 0-7 / 8-15)
        //   v24-v29      differences of the rows loaded in the current iteration
        // Only v0-v5 and v16-v31 are written, so no callee-saved vector
        // registers need to be preserved.

        subs            w4, w4, #1                      // we need to make h-1 iterations
        movi            v16.4s, #0
        movi            v17.4s, #0
        b.le            3f                              // h <= 1: nothing to compare, return 0 without touching memory

        ld1             {v0.16b}, [x1], x3              // Load pix1[0], first iteration
        ld1             {v1.16b}, [x2], x3              // Load pix2[0], first iteration

        cmp             w4, #3                          // check if we can make 3 iterations at once
        usubl           v31.8h, v0.8b, v1.8b            // Signed difference of pix1[0] - pix2[0], first iteration
        usubl2          v30.8h, v0.16b, v1.16b          // Signed difference of pix1[0] - pix2[0], first iteration
        b.lt            2f                              // fewer than 3 row pairs left: handle them one by one

1:
        // Three row pairs per pass.
        // x = abs(pix1[0] - pix2[0] - pix1[0 + stride] + pix2[0 + stride])
        // res = (x) * (x)
        ld1             {v0.16b}, [x1], x3              // Load pix1[0 + stride], first iteration
        ld1             {v1.16b}, [x2], x3              // Load pix2[0 + stride], first iteration
        ld1             {v2.16b}, [x1], x3              // Load pix1[0 + stride], second iteration
        ld1             {v3.16b}, [x2], x3              // Load pix2[0 + stride], second iteration
        usubl           v29.8h, v0.8b, v1.8b
        usubl2          v28.8h, v0.16b, v1.16b
        ld1             {v4.16b}, [x1], x3              // Load pix1[0 + stride], third iteration
        ld1             {v5.16b}, [x2], x3              // Load pix2[0 + stride], third iteration
        sabd            v31.8h, v31.8h, v29.8h          // |prev - row1|, pixels 0-7
        sabd            v30.8h, v30.8h, v28.8h          // |prev - row1|, pixels 8-15
        usubl           v27.8h, v2.8b, v3.8b
        usubl2          v26.8h, v2.16b, v3.16b
        usubl           v25.8h, v4.8b, v5.8b
        usubl2          v24.8h, v4.16b, v5.16b
        sabd            v29.8h, v29.8h, v27.8h          // |row1 - row2|, pixels 0-7
        sabd            v27.8h, v27.8h, v25.8h          // |row2 - row3|, pixels 0-7
        umlal           v16.4s, v31.4h, v31.4h
        umlal2          v17.4s, v31.8h, v31.8h
        sabd            v28.8h, v28.8h, v26.8h          // |row1 - row2|, pixels 8-15
        sabd            v26.8h, v26.8h, v24.8h          // |row2 - row3|, pixels 8-15
        umlal           v16.4s, v30.4h, v30.4h
        umlal2          v17.4s, v30.8h, v30.8h
        mov             v31.16b, v25.16b                // row3 difference becomes "previous row" for the next pass
        umlal           v16.4s, v29.4h, v29.4h
        umlal2          v17.4s, v29.8h, v29.8h
        mov             v30.16b, v24.16b
        umlal           v16.4s, v28.4h, v28.4h
        umlal2          v17.4s, v28.8h, v28.8h
        sub             w4, w4, #3
        umlal           v16.4s, v27.4h, v27.4h
        umlal2          v17.4s, v27.8h, v27.8h
        cmp             w4, #3
        umlal           v16.4s, v26.4h, v26.4h
        umlal2          v17.4s, v26.8h, v26.8h

        b.ge            1b

        cbz             w4, 3f                          // no leftover row pairs

// iterate by once
2:
        // Invariant: w4 > 0 row pairs remain; v31/v30 hold the previous row difference.
        ld1             {v0.16b}, [x1], x3
        ld1             {v1.16b}, [x2], x3
        subs            w4, w4, #1
        usubl           v29.8h, v0.8b, v1.8b
        usubl2          v28.8h, v0.16b, v1.16b
        sabd            v31.8h, v31.8h, v29.8h
        sabd            v30.8h, v30.8h, v28.8h
        umlal           v16.4s, v31.4h, v31.4h
        umlal2          v17.4s, v31.8h, v31.8h
        mov             v31.16b, v29.16b                // keep current row difference for the next pair
        umlal           v16.4s, v30.4h, v30.4h
        umlal2          v17.4s, v30.8h, v30.8h
        mov             v30.16b, v28.16b

        b.ne            2b

3:
        // Fold both accumulators and reduce the four lanes to a scalar result.
        add             v16.4s, v16.4s, v17.4s
        uaddlv          d17, v16.4s
        fmov            w0, s17

        ret
endfunc