avcodec/vc1: Arm 64-bit NEON unescape fast path

checkasm benchmarks on 1.5 GHz Cortex-A72 are as follows.

vc1dsp.vc1_unescape_buffer_c: 655617.7
vc1dsp.vc1_unescape_buffer_neon: 118237.0

Signed-off-by: Ben Avison <bavison@riscosopen.org>
Signed-off-by: Martin Storsjö <martin@martin.st>
Authored by Ben Avison on 2022-03-31 18:23:50 +01:00; committed by Martin Storsjö.
parent 5379412ed0
commit 6eee650289
2 changed files with 237 additions and 0 deletions

View File

@ -21,6 +21,7 @@
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/aarch64/cpu.h"
#include "libavutil/intreadwrite.h"
#include "libavcodec/vc1dsp.h"
#include "config.h"
@ -51,6 +52,64 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y);
int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst);
/* Remove VC-1 escape bytes (00 00 03 0x, x <= 3) from src into dst,
 * returning the unescaped length.  The heavy scanning is delegated to a
 * NEON helper; this wrapper handles alignment, the escape removal itself,
 * and the short tail.  Assumes a little-endian machine that supports
 * unaligned loads. */
static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst)
{
    int out_len = 0;

    while (size >= 4)
    {
        int esc = 0;

        /* Copy byte-by-byte until dst is 8-byte aligned for the NEON
         * helper, stopping early if an escape sequence starts here. */
        while (!esc && (((uintptr_t) dst) & 7) && size >= 4)
        {
            esc = (AV_RL32(src) & ~0x03000000) == 0x00030000;
            if (!esc)
            {
                *dst++ = *src++;
                --size;
                ++out_len;
            }
        }

        if (!esc)
        {
            /* Bulk copy via the assembly inner loop; it returns the number
             * of bytes it did NOT consume, so the amount processed is the
             * difference. */
            int done = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst);
            dst     += done;
            src     += done;
            size    -= done;
            out_len += done;

            /* The helper stops short of any escape candidate; rescan the
             * remainder one byte at a time to pin it down (or finish). */
            while (!esc && size >= 4)
            {
                esc = (AV_RL32(src) & ~0x03000000) == 0x00030000;
                if (!esc)
                {
                    *dst++ = *src++;
                    --size;
                    ++out_len;
                }
            }
        }

        if (esc)
        {
            /* Keep the two zero bytes, drop the 0x03 escape byte. */
            *dst++ = *src++;
            *dst++ = *src++;
            ++src;
            size    -= 3;
            out_len += 2;
        }
    }

    /* Fewer than 4 bytes left: no escape sequence can fit, plain copy. */
    for (; size > 0; --size, ++out_len)
        *dst++ = *src++;

    return out_len;
}
av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp)
{
int cpu_flags = av_get_cpu_flags();
@ -76,5 +135,7 @@ av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp)
dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon;
dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon;
dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon;
dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon;
}
}

View File

@ -1368,3 +1368,179 @@ function ff_vc1_h_loop_filter16_neon, export=1
st2 {v2.b, v3.b}[7], [x6]
4: ret
endfunc
// Copy at most the specified number of bytes from the source buffer to the
// destination buffer. The number of bytes copied is a multiple of 32, and
// copying stops before any 32-byte chunk containing the start of an escape sequence.
// On entry:
// x0 -> source buffer
// w1 = max number of bytes to copy
// x2 -> destination buffer, optimally 8-byte aligned
// On exit:
// w0 = number of bytes not copied
function ff_vc1_unescape_buffer_helper_neon, export=1
// Strategy: for each 32-byte chunk, test the 32-bit word starting at every
// byte offset for the escape pattern (word & ~0x03000000) == 0x00030000,
// i.e. bytes 00 00 03 0x (x <= 3) in memory order.  The shifted copies are
// built with EXT, masked with BIC, XORed against the pattern and compared
// to zero, so a lane is all-ones iff an escape starts at that offset.
// The loop is software-pipelined as two half-iterations (labels 2: and 3:)
// that ping-pong between register sets, finishing the test for one chunk
// while loading and shifting the next; 16 bytes of lookahead are needed so
// the last byte offsets of a chunk still have a full word to examine.
        // Offset by 80 to screen out cases that are too short for us to handle,
        // and also make it easy to test for loop termination, or to determine
        // whether we need an odd number of half-iterations of the loop.
subs w1, w1, #80
b.mi 90f                               // < 80 bytes: copy nothing, return size
        // Set up useful constants
movi v20.4s, #3, lsl #24               // 0x03000000/lane: don't-care bits
movi v21.4s, #3, lsl #16               // 0x00030000/lane: the escape pattern
// Choose the entry point so the half-iteration count comes out right for
// this size; both prologues prime 48 bytes and start one chunk's test.
tst w1, #32
b.ne 1f
// Even entry: chunk under test is v0/v1, lookahead in v2.
ld1 {v0.16b, v1.16b, v2.16b}, [x0], #48
// Shifted copies: each lane of v25..v27 / v29..v31 holds the word starting
// 1..3 bytes later than the corresponding lane of v0 / v1.
ext v25.16b, v0.16b, v1.16b, #1
ext v26.16b, v0.16b, v1.16b, #2
ext v27.16b, v0.16b, v1.16b, #3
ext v29.16b, v1.16b, v2.16b, #1
ext v30.16b, v1.16b, v2.16b, #2
ext v31.16b, v1.16b, v2.16b, #3
// Clear the don't-care bits of every candidate word...
bic v24.16b, v0.16b, v20.16b
bic v25.16b, v25.16b, v20.16b
bic v26.16b, v26.16b, v20.16b
bic v27.16b, v27.16b, v20.16b
bic v28.16b, v1.16b, v20.16b
bic v29.16b, v29.16b, v20.16b
bic v30.16b, v30.16b, v20.16b
bic v31.16b, v31.16b, v20.16b
// ...XOR with the pattern: a lane becomes all-zero iff it matched...
eor v24.16b, v24.16b, v21.16b
eor v25.16b, v25.16b, v21.16b
eor v26.16b, v26.16b, v21.16b
eor v27.16b, v27.16b, v21.16b
eor v28.16b, v28.16b, v21.16b
eor v29.16b, v29.16b, v21.16b
eor v30.16b, v30.16b, v21.16b
eor v31.16b, v31.16b, v21.16b
// ...and start the ==0 tests; v28..v31 are finished inside the loop.
cmeq v24.4s, v24.4s, #0
cmeq v25.4s, v25.4s, #0
cmeq v26.4s, v26.4s, #0
cmeq v27.4s, v27.4s, #0
add w1, w1, #32                        // compensate for entering mid-loop at 3:
b 3f
// Odd entry: same as above but the chunk under test is v3/v4, lookahead v5.
1: ld1 {v3.16b, v4.16b, v5.16b}, [x0], #48
ext v25.16b, v3.16b, v4.16b, #1
ext v26.16b, v3.16b, v4.16b, #2
ext v27.16b, v3.16b, v4.16b, #3
ext v29.16b, v4.16b, v5.16b, #1
ext v30.16b, v4.16b, v5.16b, #2
ext v31.16b, v4.16b, v5.16b, #3
bic v24.16b, v3.16b, v20.16b
bic v25.16b, v25.16b, v20.16b
bic v26.16b, v26.16b, v20.16b
bic v27.16b, v27.16b, v20.16b
bic v28.16b, v4.16b, v20.16b
bic v29.16b, v29.16b, v20.16b
bic v30.16b, v30.16b, v20.16b
bic v31.16b, v31.16b, v20.16b
eor v24.16b, v24.16b, v21.16b
eor v25.16b, v25.16b, v21.16b
eor v26.16b, v26.16b, v21.16b
eor v27.16b, v27.16b, v21.16b
eor v28.16b, v28.16b, v21.16b
eor v29.16b, v29.16b, v21.16b
eor v30.16b, v30.16b, v21.16b
eor v31.16b, v31.16b, v21.16b
cmeq v24.4s, v24.4s, #0
cmeq v25.4s, v25.4s, #0
cmeq v26.4s, v26.4s, #0
cmeq v27.4s, v27.4s, #0
        // Drop through...
// First half: finish testing the chunk in v3/v4 while loading the next
// chunk (v0 = recycled lookahead, v1/v2 freshly loaded) and shifting it.
2: mov v0.16b, v5.16b                  // lookahead becomes the new chunk
ld1 {v1.16b, v2.16b}, [x0], #32
cmeq v28.4s, v28.4s, #0                // complete the deferred ==0 tests
cmeq v29.4s, v29.4s, #0
cmeq v30.4s, v30.4s, #0
cmeq v31.4s, v31.4s, #0
// OR-reduce the eight per-offset match masks towards a single vector.
orr v24.16b, v24.16b, v25.16b
orr v26.16b, v26.16b, v27.16b
orr v28.16b, v28.16b, v29.16b
orr v30.16b, v30.16b, v31.16b
ext v25.16b, v0.16b, v1.16b, #1        // meanwhile, shift the new chunk
orr v22.16b, v24.16b, v26.16b
ext v26.16b, v0.16b, v1.16b, #2
ext v27.16b, v0.16b, v1.16b, #3
ext v29.16b, v1.16b, v2.16b, #1
orr v23.16b, v28.16b, v30.16b
ext v30.16b, v1.16b, v2.16b, #2
ext v31.16b, v1.16b, v2.16b, #3
bic v24.16b, v0.16b, v20.16b
bic v25.16b, v25.16b, v20.16b
bic v26.16b, v26.16b, v20.16b
orr v22.16b, v22.16b, v23.16b          // v22: any escape anywhere in v3/v4?
bic v27.16b, v27.16b, v20.16b
bic v28.16b, v1.16b, v20.16b
bic v29.16b, v29.16b, v20.16b
bic v30.16b, v30.16b, v20.16b
bic v31.16b, v31.16b, v20.16b
addv s22, v22.4s                       // lanes are 0/-1: sum != 0 iff any hit
eor v24.16b, v24.16b, v21.16b
eor v25.16b, v25.16b, v21.16b
eor v26.16b, v26.16b, v21.16b
eor v27.16b, v27.16b, v21.16b
eor v28.16b, v28.16b, v21.16b
mov w3, v22.s[0]
eor v29.16b, v29.16b, v21.16b
eor v30.16b, v30.16b, v21.16b
eor v31.16b, v31.16b, v21.16b
cmeq v24.4s, v24.4s, #0
cmeq v25.4s, v25.4s, #0
cmeq v26.4s, v26.4s, #0
cmeq v27.4s, v27.4s, #0
cbnz w3, 90f                           // escape found: stop before this chunk
st1 {v3.16b, v4.16b}, [x2], #32        // chunk is clean: copy 32 bytes out
// Second half: mirror image of 2:, testing v0/v1 and preparing v3/v4.
3: mov v3.16b, v2.16b
ld1 {v4.16b, v5.16b}, [x0], #32
cmeq v28.4s, v28.4s, #0
cmeq v29.4s, v29.4s, #0
cmeq v30.4s, v30.4s, #0
cmeq v31.4s, v31.4s, #0
orr v24.16b, v24.16b, v25.16b
orr v26.16b, v26.16b, v27.16b
orr v28.16b, v28.16b, v29.16b
orr v30.16b, v30.16b, v31.16b
ext v25.16b, v3.16b, v4.16b, #1
orr v22.16b, v24.16b, v26.16b
ext v26.16b, v3.16b, v4.16b, #2
ext v27.16b, v3.16b, v4.16b, #3
ext v29.16b, v4.16b, v5.16b, #1
orr v23.16b, v28.16b, v30.16b
ext v30.16b, v4.16b, v5.16b, #2
ext v31.16b, v4.16b, v5.16b, #3
bic v24.16b, v3.16b, v20.16b
bic v25.16b, v25.16b, v20.16b
bic v26.16b, v26.16b, v20.16b
orr v22.16b, v22.16b, v23.16b          // v22: any escape anywhere in v0/v1?
bic v27.16b, v27.16b, v20.16b
bic v28.16b, v4.16b, v20.16b
bic v29.16b, v29.16b, v20.16b
bic v30.16b, v30.16b, v20.16b
bic v31.16b, v31.16b, v20.16b
addv s22, v22.4s
eor v24.16b, v24.16b, v21.16b
eor v25.16b, v25.16b, v21.16b
eor v26.16b, v26.16b, v21.16b
eor v27.16b, v27.16b, v21.16b
eor v28.16b, v28.16b, v21.16b
mov w3, v22.s[0]
eor v29.16b, v29.16b, v21.16b
eor v30.16b, v30.16b, v21.16b
eor v31.16b, v31.16b, v21.16b
cmeq v24.4s, v24.4s, #0
cmeq v25.4s, v25.4s, #0
cmeq v26.4s, v26.4s, #0
cmeq v27.4s, v27.4s, #0
cbnz w3, 91f
st1 {v0.16b, v1.16b}, [x2], #32
subs w1, w1, #64                       // 64 bytes per full iteration
b.pl 2b
// Return w0 = bytes NOT copied: undo the -80 bias applied on entry.
90: add w0, w1, #80
ret
// Stopped in the second half: 32 bytes were already copied this
// iteration but w1 has not yet been decremented, so account for them.
91: sub w1, w1, #32
b 90b
endfunc