aarch64/vvc: Add put_epel_hv

On Apple M1:

put_chroma_hv_8_4x4_c:                                   1.7 ( 1.00x)
put_chroma_hv_8_4x4_neon:                                0.2 ( 7.67x)
put_chroma_hv_8_8x8_c:                                   5.5 ( 1.00x)
put_chroma_hv_8_8x8_neon:                                0.5 (11.53x)
put_chroma_hv_8_16x16_c:                                18.5 ( 1.00x)
put_chroma_hv_8_16x16_neon:                              1.5 (12.53x)
put_chroma_hv_8_32x32_c:                                72.5 ( 1.00x)
put_chroma_hv_8_32x32_neon:                              4.7 (15.34x)
put_chroma_hv_8_64x64_c:                               274.0 ( 1.00x)
put_chroma_hv_8_64x64_neon:                             18.5 (14.83x)
put_chroma_hv_8_128x128_c:                            1058.7 ( 1.00x)
put_chroma_hv_8_128x128_neon:                           75.2 (14.07x)

On Android Pixel 8 Pro:

put_chroma_hv_8_4x4_c:                                   1.2 ( 1.00x)
put_chroma_hv_8_4x4_neon:                                0.0 ( 0.00x)
put_chroma_hv_8_4x4_i8mm:                                0.2 ( 5.00x)
put_chroma_hv_8_8x8_c:                                   4.0 ( 1.00x)
put_chroma_hv_8_8x8_neon:                                0.5 ( 8.00x)
put_chroma_hv_8_8x8_i8mm:                                0.5 ( 8.00x)
put_chroma_hv_8_16x16_c:                                15.2 ( 1.00x)
put_chroma_hv_8_16x16_neon:                              2.5 ( 6.10x)
put_chroma_hv_8_16x16_i8mm:                              2.2 ( 6.78x)
put_chroma_hv_8_32x32_c:                                61.0 ( 1.00x)
put_chroma_hv_8_32x32_neon:                              9.8 ( 6.26x)
put_chroma_hv_8_32x32_i8mm:                              8.5 ( 7.18x)
put_chroma_hv_8_64x64_c:                               229.5 ( 1.00x)
put_chroma_hv_8_64x64_neon:                             38.5 ( 5.96x)
put_chroma_hv_8_64x64_i8mm:                             34.0 ( 6.75x)
put_chroma_hv_8_128x128_c:                             919.8 ( 1.00x)
put_chroma_hv_8_128x128_neon:                          154.5 ( 5.95x)
put_chroma_hv_8_128x128_i8mm:                          140.0 ( 6.57x)
This commit is contained in:
Zhao Zhili 2024-09-11 17:13:38 +08:00 committed by Nuo Mi
parent 0dcf204e5d
commit 1be5a2374f
3 changed files with 147 additions and 0 deletions

View File

@ -297,4 +297,12 @@ NEON8_FNPROTO_PARTIAL_6(qpel_hv, (int16_t *dst,
const uint8_t *src, ptrdiff_t srcstride, int height, const uint8_t *src, ptrdiff_t srcstride, int height,
const int8_t *hf, const int8_t *vf, int width), _i8mm); const int8_t *hf, const int8_t *vf, int width), _i8mm);
/*
 * Prototypes for the epel_hv put functions (no extra suffix).
 * The assembly entry points pass vf (sixth argument) to the vertical
 * filter load and forward the remaining arguments to the epel_h routine.
 * NOTE(review): NEON8_FNPROTO_PARTIAL_6 is defined outside this view; it
 * presumably expands to one declaration per supported block width - confirm.
 */
NEON8_FNPROTO_PARTIAL_6(epel_hv, (int16_t *dst,
const uint8_t *src, ptrdiff_t srcstride, int height,
const int8_t *hf, const int8_t *vf, int width),);
/* Same signature, declared again for the variants carrying the _i8mm suffix. */
NEON8_FNPROTO_PARTIAL_6(epel_hv, (int16_t *dst,
const uint8_t *src, ptrdiff_t srcstride, int height,
const int8_t *hf, const int8_t *vf, int width), _i8mm);
#endif #endif

View File

@ -72,6 +72,11 @@ endconst
sxtl v0.8h, v0.8b sxtl v0.8h, v0.8b
.endm .endm
// vvc_load_epel_filterh freg
// Loads filter coefficients through the pointer held in register freg and
// widens them to 16 bit in v0, the register that calc_epelh reads by lane.
// Clobbers: v0.
// NOTE(review): 8 bytes are read from the pointer although calc_epelh takes
// four source operands; confirm that reading past a 4-tap filter is always
// within valid memory for every caller.
.macro vvc_load_epel_filterh freg
ld1 {v0.8b}, [\freg]  // v0.8b = 8 signed bytes at the given address
sxtl v0.8h, v0.8b  // sign-extend each byte to a 16-bit lane
.endm
.macro calc_epelh dst, src0, src1, src2, src3 .macro calc_epelh dst, src0, src1, src2, src3
smull \dst\().4s, \src0\().4h, v0.h[0] smull \dst\().4s, \src0\().4h, v0.h[0]
smlal \dst\().4s, \src1\().4h, v0.h[1] smlal \dst\().4s, \src1\().4h, v0.h[1]
@ -2299,10 +2304,16 @@ endfunc
DISABLE_I8MM DISABLE_I8MM
#endif #endif
// VVC entry into the shared 4-wide vertical pass.
// Expects x5 to hold the pointer to the vertical filter coefficients and sp to
// point at the intermediate rows. Sets up v0 and x10, then joins the HEVC
// routine that follows at its local label 0.
function vvc_put_epel_hv4_8_end_neon
vvc_load_epel_filterh x5  // v0 = coefficients read directly through x5
mov x10, #(VVC_MAX_PB_SIZE * 2)  // x10 = byte distance between intermediate rows
b 0f  // continue in hevc_put_hevc_epel_hv4_8_end_neon after its own setup
endfunc
function hevc_put_hevc_epel_hv4_8_end_neon function hevc_put_hevc_epel_hv4_8_end_neon
load_epel_filterh x5, x4 load_epel_filterh x5, x4
mov x10, #(HEVC_MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
0:
ldr d16, [sp] ldr d16, [sp]
ldr d17, [sp, x10] ldr d17, [sp, x10]
add sp, sp, x10, lsl #1 add sp, sp, x10, lsl #1
@ -2339,9 +2350,16 @@ function hevc_put_hevc_epel_hv6_8_end_neon
2: ret 2: ret
endfunc endfunc
// VVC entry into the shared 8-wide vertical pass.
// Expects x5 to hold the pointer to the vertical filter coefficients and sp to
// point at the intermediate rows. Sets up v0 and x10, then joins the HEVC
// routine that follows at its local label 0.
function vvc_put_epel_hv8_8_end_neon
vvc_load_epel_filterh x5  // v0 = coefficients read directly through x5
mov x10, #(VVC_MAX_PB_SIZE * 2)  // x10 = byte distance between intermediate rows
b 0f  // continue in hevc_put_hevc_epel_hv8_8_end_neon after its own setup
endfunc
function hevc_put_hevc_epel_hv8_8_end_neon function hevc_put_hevc_epel_hv8_8_end_neon
load_epel_filterh x5, x4 load_epel_filterh x5, x4
mov x10, #(HEVC_MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
0:
ldr q16, [sp] ldr q16, [sp]
ldr q17, [sp, x10] ldr q17, [sp, x10]
add sp, sp, x10, lsl #1 add sp, sp, x10, lsl #1
@ -2379,9 +2397,16 @@ function hevc_put_hevc_epel_hv12_8_end_neon
2: ret 2: ret
endfunc endfunc
// VVC entry into the shared 16-wide vertical pass.
// Expects x5 to hold the pointer to the vertical filter coefficients and sp to
// point at the intermediate rows. Sets up v0 and x10, then joins the HEVC
// routine that follows at its local label 0.
function vvc_put_epel_hv16_8_end_neon
vvc_load_epel_filterh x5  // v0 = coefficients read directly through x5
mov x10, #(VVC_MAX_PB_SIZE * 2)  // x10 = byte distance between intermediate rows
b 0f  // continue in hevc_put_hevc_epel_hv16_8_end_neon after its own setup
endfunc
function hevc_put_hevc_epel_hv16_8_end_neon function hevc_put_hevc_epel_hv16_8_end_neon
load_epel_filterh x5, x4 load_epel_filterh x5, x4
mov x10, #(HEVC_MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
0:
ld1 {v16.8h, v17.8h}, [sp], x10 ld1 {v16.8h, v17.8h}, [sp], x10
ld1 {v18.8h, v19.8h}, [sp], x10 ld1 {v18.8h, v19.8h}, [sp], x10
ld1 {v20.8h, v21.8h}, [sp], x10 ld1 {v20.8h, v21.8h}, [sp], x10
@ -2437,6 +2462,21 @@ function ff_hevc_put_hevc_epel_hv4_8_\suffix, export=1
b hevc_put_hevc_epel_hv4_8_end_neon b hevc_put_hevc_epel_hv4_8_end_neon
endfunc endfunc
// Two-pass put for 4-wide blocks.
// Register use on entry, per the epel_hv prototype:
//   x0 = dst, x1 = src, x2 = srcstride, w3 = height, x4 = hf, x5 = vf.
// Pass 1 calls the epel_h routine to fill a temporary array on the stack;
// pass 2 is the tail branch to vvc_put_epel_hv4_8_end_neon, which reads that
// array starting at sp.
function ff_vvc_put_epel_hv4_8_\suffix, export=1
add w10, w3, #3  // rows needed by pass 1: height + 3
lsl x10, x10, #8  // 256 bytes reserved per row (TODO confirm: equals VVC_MAX_PB_SIZE * 2)
sub sp, sp, x10 // tmp_array: reserve the intermediate buffer on the stack
stp x5, x30, [sp, #-32]!  // 32-byte save area below the buffer: vf and return address
stp x0, x3, [sp, #16]  // also keep dst and height across the call
add x0, sp, #32  // pass 1 writes into the temporary array
sub x1, x1, x2  // start one source row above the block
add w3, w3, #3  // pass 1 processes height + 3 rows
bl X(ff_vvc_put_epel_h4_8_\suffix)
ldp x0, x3, [sp, #16]  // restore dst and height
ldp x5, x30, [sp], #32  // restore vf and return address; sp is back at the temporary array
b vvc_put_epel_hv4_8_end_neon  // tail branch: vertical pass returns to our caller
endfunc
function ff_hevc_put_hevc_epel_hv6_8_\suffix, export=1 function ff_hevc_put_hevc_epel_hv6_8_\suffix, export=1
add w10, w3, #3 add w10, w3, #3
lsl x10, x10, #7 lsl x10, x10, #7
@ -2467,6 +2507,21 @@ function ff_hevc_put_hevc_epel_hv8_8_\suffix, export=1
b hevc_put_hevc_epel_hv8_8_end_neon b hevc_put_hevc_epel_hv8_8_end_neon
endfunc endfunc
// Two-pass put for 8-wide blocks.
// Register use on entry, per the epel_hv prototype:
//   x0 = dst, x1 = src, x2 = srcstride, w3 = height, x4 = hf, x5 = vf.
// Pass 1 calls the epel_h routine to fill a temporary array on the stack;
// pass 2 is the tail branch to vvc_put_epel_hv8_8_end_neon, which reads that
// array starting at sp.
function ff_vvc_put_epel_hv8_8_\suffix, export=1
add w10, w3, #3  // rows needed by pass 1: height + 3
lsl x10, x10, #8  // 256 bytes reserved per row (TODO confirm: equals VVC_MAX_PB_SIZE * 2)
sub sp, sp, x10 // tmp_array: reserve the intermediate buffer on the stack
stp x5, x30, [sp, #-32]!  // 32-byte save area below the buffer: vf and return address
stp x0, x3, [sp, #16]  // also keep dst and height across the call
add x0, sp, #32  // pass 1 writes into the temporary array
sub x1, x1, x2  // start one source row above the block
add w3, w3, #3  // pass 1 processes height + 3 rows
bl X(ff_vvc_put_epel_h8_8_\suffix)
ldp x0, x3, [sp, #16]  // restore dst and height
ldp x5, x30, [sp], #32  // restore vf and return address; sp is back at the temporary array
b vvc_put_epel_hv8_8_end_neon  // tail branch: vertical pass returns to our caller
endfunc
function ff_hevc_put_hevc_epel_hv12_8_\suffix, export=1 function ff_hevc_put_hevc_epel_hv12_8_\suffix, export=1
add w10, w3, #3 add w10, w3, #3
lsl x10, x10, #7 lsl x10, x10, #7
@ -2497,6 +2552,21 @@ function ff_hevc_put_hevc_epel_hv16_8_\suffix, export=1
b hevc_put_hevc_epel_hv16_8_end_neon b hevc_put_hevc_epel_hv16_8_end_neon
endfunc endfunc
// Two-pass put for 16-wide blocks.
// Register use on entry, per the epel_hv prototype:
//   x0 = dst, x1 = src, x2 = srcstride, w3 = height, x4 = hf, x5 = vf.
// Pass 1 calls the epel_h routine to fill a temporary array on the stack;
// pass 2 is the tail branch to vvc_put_epel_hv16_8_end_neon, which reads that
// array starting at sp.
function ff_vvc_put_epel_hv16_8_\suffix, export=1
add w10, w3, #3  // rows needed by pass 1: height + 3
lsl x10, x10, #8  // 256 bytes reserved per row (TODO confirm: equals VVC_MAX_PB_SIZE * 2)
sub sp, sp, x10 // tmp_array: reserve the intermediate buffer on the stack
stp x5, x30, [sp, #-32]!  // 32-byte save area below the buffer: vf and return address
stp x0, x3, [sp, #16]  // also keep dst and height across the call
add x0, sp, #32  // pass 1 writes into the temporary array
sub x1, x1, x2  // start one source row above the block
add w3, w3, #3  // pass 1 processes height + 3 rows
bl X(ff_vvc_put_epel_h16_8_\suffix)
ldp x0, x3, [sp, #16]  // restore dst and height
ldp x5, x30, [sp], #32  // restore vf and return address; sp is back at the temporary array
b vvc_put_epel_hv16_8_end_neon  // tail branch: vertical pass returns to our caller
endfunc
function ff_hevc_put_hevc_epel_hv24_8_\suffix, export=1 function ff_hevc_put_hevc_epel_hv24_8_\suffix, export=1
add w10, w3, #3 add w10, w3, #3
lsl x10, x10, #7 lsl x10, x10, #7
@ -2530,6 +2600,24 @@ function ff_hevc_put_hevc_epel_hv32_8_\suffix, export=1
ret ret
endfunc endfunc
// 32-wide put, done as two calls to the 16-wide routine (left half, then
// right half). All argument registers are saved in a 64-byte stack frame so
// they can be reloaded for the second call.
function ff_vvc_put_epel_hv32_8_\suffix, export=1
stp x4, x5, [sp, #-64]!  // open the frame; save hf and vf
stp x2, x3, [sp, #16]  // save srcstride and height
stp x0, x1, [sp, #32]  // save dst and src
str x30, [sp, #48]  // save the return address
mov x6, #16  // width argument for the 16-wide call
bl X(ff_vvc_put_epel_hv16_8_\suffix)
ldp x0, x1, [sp, #32]  // reload the original arguments
ldp x2, x3, [sp, #16]
ldp x4, x5, [sp], #48  // pop 48 bytes; the saved x30 is now at [sp]
add x0, x0, #32  // dst advances 32 bytes = 16 int16_t samples
add x1, x1, #16  // src advances 16 bytes = 16 uint8_t pixels
mov x6, #16
bl X(ff_vvc_put_epel_hv16_8_\suffix)
ldr x30, [sp], #16  // restore the return address and release the rest of the frame
ret
endfunc
function ff_hevc_put_hevc_epel_hv48_8_\suffix, export=1 function ff_hevc_put_hevc_epel_hv48_8_\suffix, export=1
stp x4, x5, [sp, #-64]! stp x4, x5, [sp, #-64]!
stp x2, x3, [sp, #16] stp x2, x3, [sp, #16]
@ -2579,6 +2667,43 @@ function ff_hevc_put_hevc_epel_hv64_8_\suffix, export=1
ldr x30, [sp], #16 ldr x30, [sp], #16
ret ret
endfunc endfunc
// 64-wide put, done as two calls to the 32-wide routine (left half, then
// right half). All argument registers are saved in a 64-byte stack frame so
// they can be reloaded for the second call.
function ff_vvc_put_epel_hv64_8_\suffix, export=1
stp x4, x5, [sp, #-64]!  // open the frame; save hf and vf
stp x2, x3, [sp, #16]  // save srcstride and height
stp x0, x1, [sp, #32]  // save dst and src
str x30, [sp, #48]  // save the return address
mov x6, #32  // width argument for the 32-wide call
bl X(ff_vvc_put_epel_hv32_8_\suffix)
ldp x0, x1, [sp, #32]  // reload the original arguments
ldp x2, x3, [sp, #16]
ldp x4, x5, [sp], #48  // pop 48 bytes; the saved x30 is now at [sp]
add x0, x0, #64  // dst advances 64 bytes = 32 int16_t samples
add x1, x1, #32  // src advances 32 bytes = 32 uint8_t pixels
mov x6, #32
bl X(ff_vvc_put_epel_hv32_8_\suffix)
ldr x30, [sp], #16  // restore the return address and release the rest of the frame
ret
endfunc
// 128-wide put, done as two calls to the 64-wide routine (left half, then
// right half). All argument registers are saved in a 64-byte stack frame so
// they can be reloaded for the second call.
function ff_vvc_put_epel_hv128_8_\suffix, export=1
stp x4, x5, [sp, #-64]!  // open the frame; save hf and vf
stp x2, x3, [sp, #16]  // save srcstride and height
stp x0, x1, [sp, #32]  // save dst and src
str x30, [sp, #48]  // save the return address
mov x6, #64  // width argument for the 64-wide call
bl X(ff_vvc_put_epel_hv64_8_\suffix)
ldp x0, x1, [sp, #32]  // reload the original arguments
ldp x2, x3, [sp, #16]
ldp x4, x5, [sp], #48  // pop 48 bytes; the saved x30 is now at [sp]
add x0, x0, #128  // dst advances 128 bytes = 64 int16_t samples
add x1, x1, #64  // src advances 64 bytes = 64 uint8_t pixels
mov x6, #64
bl X(ff_vvc_put_epel_hv64_8_\suffix)
ldr x30, [sp], #16  // restore the return address and release the rest of the frame
ret
endfunc
.endm .endm
epel_hv neon epel_hv neon

View File

@ -84,6 +84,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
c->inter.put[1][5][0][1] = c->inter.put[1][5][0][1] =
c->inter.put[1][6][0][1] = ff_vvc_put_epel_h32_8_neon; c->inter.put[1][6][0][1] = ff_vvc_put_epel_h32_8_neon;
c->inter.put[1][1][1][1] = ff_vvc_put_epel_hv4_8_neon;
c->inter.put[1][2][1][1] = ff_vvc_put_epel_hv8_8_neon;
c->inter.put[1][3][1][1] = ff_vvc_put_epel_hv16_8_neon;
c->inter.put[1][4][1][1] = ff_vvc_put_epel_hv32_8_neon;
c->inter.put[1][5][1][1] = ff_vvc_put_epel_hv64_8_neon;
c->inter.put[1][6][1][1] = ff_vvc_put_epel_hv128_8_neon;
c->inter.put_uni[0][1][0][0] = ff_vvc_put_pel_uni_pixels4_8_neon; c->inter.put_uni[0][1][0][0] = ff_vvc_put_pel_uni_pixels4_8_neon;
c->inter.put_uni[0][2][0][0] = ff_vvc_put_pel_uni_pixels8_8_neon; c->inter.put_uni[0][2][0][0] = ff_vvc_put_pel_uni_pixels8_8_neon;
c->inter.put_uni[0][3][0][0] = ff_vvc_put_pel_uni_pixels16_8_neon; c->inter.put_uni[0][3][0][0] = ff_vvc_put_pel_uni_pixels16_8_neon;
@ -134,6 +141,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
c->inter.put[1][4][0][1] = ff_vvc_put_epel_h32_8_neon_i8mm; c->inter.put[1][4][0][1] = ff_vvc_put_epel_h32_8_neon_i8mm;
c->inter.put[1][5][0][1] = ff_vvc_put_epel_h64_8_neon_i8mm; c->inter.put[1][5][0][1] = ff_vvc_put_epel_h64_8_neon_i8mm;
c->inter.put[1][6][0][1] = ff_vvc_put_epel_h128_8_neon_i8mm; c->inter.put[1][6][0][1] = ff_vvc_put_epel_h128_8_neon_i8mm;
c->inter.put[1][1][1][1] = ff_vvc_put_epel_hv4_8_neon_i8mm;
c->inter.put[1][2][1][1] = ff_vvc_put_epel_hv8_8_neon_i8mm;
c->inter.put[1][3][1][1] = ff_vvc_put_epel_hv16_8_neon_i8mm;
c->inter.put[1][4][1][1] = ff_vvc_put_epel_hv32_8_neon_i8mm;
c->inter.put[1][5][1][1] = ff_vvc_put_epel_hv64_8_neon_i8mm;
c->inter.put[1][6][1][1] = ff_vvc_put_epel_hv128_8_neon_i8mm;
} }
} else if (bd == 10) { } else if (bd == 10) {
c->alf.filter[LUMA] = alf_filter_luma_10_neon; c->alf.filter[LUMA] = alf_filter_luma_10_neon;