aarch64: hevc: Specialize put_hevc_\type\()_h*_8_neon for horizontal looping

For widths of 32 pixels and more, loop first horizontally,
then vertically.

Previously, this function would process a 16 pixel wide slice
of the block, looping vertically. After processing the whole
height, it would backtrack and process the next 16 pixel wide
slice.

When doing 8-tap filtering horizontally, the function must load
7 more pixels (in practice, 8) following the actual inputs, and
this was done again for each slice.

By iterating first horizontally throughout each line, then
vertically, we access data in a more cache-friendly order, and
we don't need to reload data unnecessarily.

Keep the original order in put_hevc_\type\()_h12_8_neon; the
only suboptimal case there is for width=24. But specializing
an optimal variant for that would require more code, which
might not be worth it.

For the h16 case, the new horizontally looping implementation would
give a slowdown, as it loads the first 8 pixels separately from the
rest, but for larger widths, it is a gain. Therefore, keep the h16 case
as it was (but remove the outer loop), and create a new specialized
version (h32) for horizontal looping with 16 pixels at a time.

Before:                  Cortex A53      A72      A73  Graviton 3
put_hevc_qpel_h16_8_neon:     710.5    667.7    692.5   211.0
put_hevc_qpel_h32_8_neon:    2791.5   2643.5   2732.0   883.5
put_hevc_qpel_h64_8_neon:   10954.0  10657.0  10874.2  3241.5
After:
put_hevc_qpel_h16_8_neon:     697.5    663.5    705.7   212.5
put_hevc_qpel_h32_8_neon:    2767.2   2684.5   2791.2   920.5
put_hevc_qpel_h64_8_neon:   10559.2  10471.5  10932.2  3051.7

Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
Martin Storsjö 2024-03-24 12:54:13 +02:00
parent e3a54cabde
commit 717cc82d28
2 changed files with 94 additions and 29 deletions

View File

@@ -109,6 +109,8 @@ void ff_hevc_put_hevc_qpel_h12_8_neon(int16_t *dst, const uint8_t *_src, ptrdiff
intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_qpel_h16_8_neon(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height,
intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_qpel_h32_8_neon(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height,
intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_qpel_uni_h4_8_neon(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,
int width);
@@ -124,6 +126,9 @@ void ff_hevc_put_hevc_qpel_uni_h12_8_neon(uint8_t *_dst, ptrdiff_t _dststride, c
void ff_hevc_put_hevc_qpel_uni_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t
my, int width);
void ff_hevc_put_hevc_qpel_uni_h32_8_neon(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t
my, int width);
void ff_hevc_put_hevc_qpel_bi_h4_8_neon(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t
mx, intptr_t my, int width);
@@ -139,6 +144,9 @@ void ff_hevc_put_hevc_qpel_bi_h12_8_neon(uint8_t *_dst, ptrdiff_t _dststride, co
void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t
mx, intptr_t my, int width);
void ff_hevc_put_hevc_qpel_bi_h32_8_neon(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t
mx, intptr_t my, int width);
#define NEON8_FNPROTO(fn, args, ext) \
void ff_hevc_put_hevc_##fn##4_8_neon##ext args; \
@@ -335,28 +343,28 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_qpel[3][0][1] = ff_hevc_put_hevc_qpel_h8_8_neon;
c->put_hevc_qpel[4][0][1] =
c->put_hevc_qpel[6][0][1] = ff_hevc_put_hevc_qpel_h12_8_neon;
c->put_hevc_qpel[5][0][1] =
c->put_hevc_qpel[5][0][1] = ff_hevc_put_hevc_qpel_h16_8_neon;
c->put_hevc_qpel[7][0][1] =
c->put_hevc_qpel[8][0][1] =
c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h16_8_neon;
c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h32_8_neon;
c->put_hevc_qpel_uni[1][0][1] = ff_hevc_put_hevc_qpel_uni_h4_8_neon;
c->put_hevc_qpel_uni[2][0][1] = ff_hevc_put_hevc_qpel_uni_h6_8_neon;
c->put_hevc_qpel_uni[3][0][1] = ff_hevc_put_hevc_qpel_uni_h8_8_neon;
c->put_hevc_qpel_uni[4][0][1] =
c->put_hevc_qpel_uni[6][0][1] = ff_hevc_put_hevc_qpel_uni_h12_8_neon;
c->put_hevc_qpel_uni[5][0][1] =
c->put_hevc_qpel_uni[5][0][1] = ff_hevc_put_hevc_qpel_uni_h16_8_neon;
c->put_hevc_qpel_uni[7][0][1] =
c->put_hevc_qpel_uni[8][0][1] =
c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_qpel_uni_h16_8_neon;
c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_qpel_uni_h32_8_neon;
c->put_hevc_qpel_bi[1][0][1] = ff_hevc_put_hevc_qpel_bi_h4_8_neon;
c->put_hevc_qpel_bi[2][0][1] = ff_hevc_put_hevc_qpel_bi_h6_8_neon;
c->put_hevc_qpel_bi[3][0][1] = ff_hevc_put_hevc_qpel_bi_h8_8_neon;
c->put_hevc_qpel_bi[4][0][1] =
c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_hevc_qpel_bi_h12_8_neon;
c->put_hevc_qpel_bi[5][0][1] =
c->put_hevc_qpel_bi[5][0][1] = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
c->put_hevc_qpel_bi[7][0][1] =
c->put_hevc_qpel_bi[8][0][1] =
c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_qpel_bi_h32_8_neon;
NEON8_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels,);
NEON8_FNASSIGN(c->put_hevc_epel, 1, 0, epel_v,);

View File

@@ -383,11 +383,9 @@ endfunc
.ifc \type, qpel
function ff_hevc_put_hevc_h16_8_neon, export=0
uxtl v16.8h, v16.8b
uxtl v17.8h, v17.8b
uxtl v18.8h, v18.8b
uxtl v19.8h, v19.8b
uxtl v20.8h, v20.8b
uxtl v21.8h, v21.8b
@@ -408,7 +406,6 @@ function ff_hevc_put_hevc_h16_8_neon, export=0
mla v28.8h, v24.8h, v0.h[\i]
mla v29.8h, v25.8h, v0.h[\i]
.endr
subs x9, x9, #2
ret
endfunc
.endif
@@ -439,7 +436,10 @@ function ff_hevc_put_hevc_\type\()_h12_8_neon, export=1
1: ld1 {v16.8b-v18.8b}, [src], x13
ld1 {v19.8b-v21.8b}, [x12], x13
uxtl v16.8h, v16.8b
uxtl v19.8h, v19.8b
bl ff_hevc_put_hevc_h16_8_neon
subs x9, x9, #2
.ifc \type, qpel
st1 {v26.8h}, [dst], #16
@@ -504,7 +504,6 @@ function ff_hevc_put_hevc_\type\()_h16_8_neon, export=1
.ifc \type, qpel_bi
ldrh w8, [sp] // width
mov x16, #(MAX_PB_SIZE << 2) // src2bstridel
lsl x17, x5, #7 // src2b reset
add x15, x4, #(MAX_PB_SIZE << 1) // src2b
.endif
sub src, src, #3
@@ -519,11 +518,14 @@ function ff_hevc_put_hevc_\type\()_h16_8_neon, export=1
.endif
add x10, dst, dststride // dstb
add x12, src, srcstride // srcb
0: mov x9, height
1: ld1 {v16.8b-v18.8b}, [src], x13
ld1 {v19.8b-v21.8b}, [x12], x13
uxtl v16.8h, v16.8b
uxtl v19.8h, v19.8b
bl ff_hevc_put_hevc_h16_8_neon
subs height, height, #2
.ifc \type, qpel
st1 {v26.8h, v27.8h}, [dst], x14
@@ -550,28 +552,83 @@ function ff_hevc_put_hevc_\type\()_h16_8_neon, export=1
st1 {v28.8b, v29.8b}, [x10], x14
.endif
b.gt 1b // double line
subs width, width, #16
// reset src
msub src, srcstride, height, src
msub x12, srcstride, height, x12
// reset dst
msub dst, dststride, height, dst
msub x10, dststride, height, x10
ret mx
endfunc
function ff_hevc_put_hevc_\type\()_h32_8_neon, export=1
load_filter mx
sxtw height, heightw
mov mx, x30
.ifc \type, qpel_bi
// reset xsrc
sub x4, x4, x17
sub x15, x15, x17
add x4, x4, #32
add x15, x15, #32
ldrh w8, [sp] // width
mov x16, #(MAX_PB_SIZE << 2) // src2bstridel
lsl x17, x5, #7 // src2b reset
add x15, x4, #(MAX_PB_SIZE << 1) // src2b
sub x16, x16, width, uxtw #1
.endif
add src, src, #16
add x12, x12, #16
sub src, src, #3
mov mx, x30
.ifc \type, qpel
add dst, dst, #32
add x10, x10, #32
mov dststride, #(MAX_PB_SIZE << 1)
lsl x13, srcstride, #1 // srcstridel
mov x14, #(MAX_PB_SIZE << 2)
sub x14, x14, width, uxtw #1
.else
add dst, dst, #16
add x10, x10, #16
lsl x14, dststride, #1 // dststridel
lsl x13, srcstride, #1 // srcstridel
sub x14, x14, width, uxtw
.endif
sub x13, x13, width, uxtw
sub x13, x13, #8
add x10, dst, dststride // dstb
add x12, src, srcstride // srcb
0: mov w9, width
ld1 {v16.8b}, [src], #8
ld1 {v19.8b}, [x12], #8
uxtl v16.8h, v16.8b
uxtl v19.8h, v19.8b
1:
ld1 {v17.8b-v18.8b}, [src], #16
ld1 {v20.8b-v21.8b}, [x12], #16
bl ff_hevc_put_hevc_h16_8_neon
subs w9, w9, #16
mov v16.16b, v18.16b
mov v19.16b, v21.16b
.ifc \type, qpel
st1 {v26.8h, v27.8h}, [dst], #32
st1 {v28.8h, v29.8h}, [x10], #32
.else
.ifc \type, qpel_bi
ld1 {v20.8h, v21.8h}, [ x4], #32
ld1 {v22.8h, v23.8h}, [x15], #32
sqadd v26.8h, v26.8h, v20.8h
sqadd v27.8h, v27.8h, v21.8h
sqadd v28.8h, v28.8h, v22.8h
sqadd v29.8h, v29.8h, v23.8h
sqrshrun v26.8b, v26.8h, #7
sqrshrun v27.8b, v27.8h, #7
sqrshrun v28.8b, v28.8h, #7
sqrshrun v29.8b, v29.8h, #7
.else
sqrshrun v26.8b, v26.8h, #6
sqrshrun v27.8b, v27.8h, #6
sqrshrun v28.8b, v28.8h, #6
sqrshrun v29.8b, v29.8h, #6
.endif
st1 {v26.8b, v27.8b}, [dst], #16
st1 {v28.8b, v29.8b}, [x10], #16
.endif
b.gt 1b // double line
subs height, height, #2
add src, src, x13
add x12, x12, x13
add dst, dst, x14
add x10, x10, x14
.ifc \type, qpel_bi
add x4, x4, x16
add x15, x15, x16
.endif
b.gt 0b
ret mx