avcodec/aarch64/mpegvideoencdsp: add dotprod implementation for pix_norm1

                     A55             A76
pix_norm1_c:        484.3           235.2
pix_norm1_neon:     193.8 ( 2.50x)   44.7 ( 5.26x)
pix_norm1_dotprod:   91.8 ( 5.28x)   21.2 (11.09x)
This commit is contained in:
Ramiro Polla 2024-08-21 16:55:52 +02:00
parent 9f68a3712e
commit 8c203ea7c7
2 changed files with 38 additions and 0 deletions

View File

@ -27,6 +27,10 @@
/* Prototypes for the AArch64 routines that the init function below assigns
 * to the pix_sum / pix_norm1 members of the DSP context. */
int ff_pix_sum16_neon(const uint8_t *pix, int line_size);
int ff_pix_norm1_neon(const uint8_t *pix, int line_size);
#if HAVE_DOTPROD
/* Dot-product variant of pix_norm1. Only declared when the build defines
 * HAVE_DOTPROD, and only selected at init time when have_dotprod(cpu_flags)
 * is true. */
int ff_pix_norm1_neon_dotprod(const uint8_t *pix, int line_size);
#endif
av_cold void ff_mpegvideoencdsp_init_aarch64(MpegvideoEncDSPContext *c,
AVCodecContext *avctx)
{
@ -36,4 +40,10 @@ av_cold void ff_mpegvideoencdsp_init_aarch64(MpegvideoEncDSPContext *c,
c->pix_sum = ff_pix_sum16_neon;
c->pix_norm1 = ff_pix_norm1_neon;
}
#if HAVE_DOTPROD
if (have_dotprod(cpu_flags)) {
c->pix_norm1 = ff_pix_norm1_neon_dotprod;
}
#endif
}

View File

@ -66,3 +66,31 @@ function ff_pix_norm1_neon, export=1
ret
endfunc
#if HAVE_DOTPROD
ENABLE_DOTPROD

// int ff_pix_norm1_neon_dotprod(const uint8_t *pix, int line_size)
//
// Sums the squares of the bytes in 16 rows of 16 bytes each, where
// consecutive rows start line_size bytes apart.
//
// In:   x0 = pix        (advanced by line_size after every row load)
//       w1 = line_size  (signed 32-bit; widened to x1 before use)
// Out:  w0 = sum of squares
// Uses: w2 (row-pair counter), v0 (accumulator), v1/v2 (row data), flags
function ff_pix_norm1_neon_dotprod, export=1
        mov             w2, #8                  // 8 iterations x 2 rows = 16 rows
        sxtw            x1, w1                  // post-index stride must be a 64-bit register
        movi            v0.4s, #0               // clear the four 32-bit partial sums

1:
        ld1             {v1.16b}, [x0], x1      // load one row,  pix += line_size
        ld1             {v2.16b}, [x0], x1      // load the next, pix += line_size
        subs            w2, w2, #1              // one row pair consumed; sets Z on the last pair
        udot            v0.4s, v1.16b, v1.16b   // each lane += sum of four squared bytes
        udot            v0.4s, v2.16b, v2.16b
        b.ne            1b

        uaddlv          d0, v0.4s               // add the four lanes into a single scalar
        fmov            w0, s0                  // return its low 32 bits
        ret
endfunc

DISABLE_DOTPROD
#endif