diff --git a/libavcodec/aarch64/h26x/dsp.h b/libavcodec/aarch64/h26x/dsp.h
index 90a42d7108..0fefb4d70f 100644
--- a/libavcodec/aarch64/h26x/dsp.h
+++ b/libavcodec/aarch64/h26x/dsp.h
@@ -297,4 +297,12 @@ NEON8_FNPROTO_PARTIAL_6(qpel_hv, (int16_t *dst,
         const uint8_t *src, ptrdiff_t srcstride, int height,
         const int8_t *hf, const int8_t *vf, int width), _i8mm);
 
+NEON8_FNPROTO_PARTIAL_6(epel_hv, (int16_t *dst,
+        const uint8_t *src, ptrdiff_t srcstride, int height,
+        const int8_t *hf, const int8_t *vf, int width),);
+
+NEON8_FNPROTO_PARTIAL_6(epel_hv, (int16_t *dst,
+        const uint8_t *src, ptrdiff_t srcstride, int height,
+        const int8_t *hf, const int8_t *vf, int width), _i8mm);
+
 #endif
diff --git a/libavcodec/aarch64/h26x/epel_neon.S b/libavcodec/aarch64/h26x/epel_neon.S
index cad8f2a5f4..e44a448b1f 100644
--- a/libavcodec/aarch64/h26x/epel_neon.S
+++ b/libavcodec/aarch64/h26x/epel_neon.S
@@ -72,6 +72,11 @@ endconst
         sxtl            v0.8h, v0.8b
 .endm
 
+.macro vvc_load_epel_filterh freg
+        ldr             s0, [\freg]                     // load only the 4 tap bytes at \freg (an 8-byte load would over-read); rest of v0 is zeroed
+        sxtl            v0.8h, v0.8b                    // sign-extend to 16 bit: taps end up in v0.h[0..3], v0.h[4..7] = 0
+.endm
+
 .macro calc_epelh dst, src0, src1, src2, src3
         smull           \dst\().4s, \src0\().4h, v0.h[0]
         smlal           \dst\().4s, \src1\().4h, v0.h[1]
@@ -2299,10 +2304,16 @@ endfunc
 
 DISABLE_I8MM
 #endif
+function vvc_put_epel_hv4_8_end_neon
+        vvc_load_epel_filterh x5                        // x5 = vf (6th argument of the epel_hv prototype)
+        mov             x10, #(VVC_MAX_PB_SIZE * 2)     // x10 = byte stride between rows of the tmp array
+        b               0f                              // share the HEVC body, entering at its 0: label
+endfunc
 
 function hevc_put_hevc_epel_hv4_8_end_neon
         load_epel_filterh x5, x4
         mov             x10, #(HEVC_MAX_PB_SIZE * 2)
+0:                                                      // VVC entry point (reached via "b 0f"); v0 and x10 already set
         ldr             d16, [sp]
         ldr             d17, [sp, x10]
         add             sp, sp, x10, lsl #1
@@ -2339,9 +2350,16 @@ function hevc_put_hevc_epel_hv6_8_end_neon
 2:      ret
 endfunc
 
+function vvc_put_epel_hv8_8_end_neon
+        vvc_load_epel_filterh x5                        // x5 = vf (6th argument of the epel_hv prototype)
+        mov             x10, #(VVC_MAX_PB_SIZE * 2)     // x10 = byte stride between rows of the tmp array
+        b               0f                              // share the HEVC body, entering at its 0: label
+endfunc
+
 function hevc_put_hevc_epel_hv8_8_end_neon
         load_epel_filterh x5, x4
         mov             x10, #(HEVC_MAX_PB_SIZE * 2)
+0:                                                      // VVC entry point (reached via "b 0f"); v0 and x10 already set
         ldr             q16, [sp]
         ldr             q17, [sp, x10]
         add             sp, sp, x10, lsl #1
@@ -2379,9 +2397,16 @@ function hevc_put_hevc_epel_hv12_8_end_neon
 2:      ret
 endfunc
 
+function vvc_put_epel_hv16_8_end_neon
+        vvc_load_epel_filterh x5                        // x5 = vf (6th argument of the epel_hv prototype)
+        mov             x10, #(VVC_MAX_PB_SIZE * 2)     // x10 = byte stride between rows of the tmp array
+        b               0f                              // share the HEVC body, entering at its 0: label
+endfunc
+
 function hevc_put_hevc_epel_hv16_8_end_neon
         load_epel_filterh x5, x4
         mov             x10, #(HEVC_MAX_PB_SIZE * 2)
+0:                                                      // VVC entry point (reached via "b 0f"); v0 and x10 already set
         ld1             {v16.8h, v17.8h}, [sp], x10
         ld1             {v18.8h, v19.8h}, [sp], x10
         ld1             {v20.8h, v21.8h}, [sp], x10
@@ -2437,6 +2462,21 @@ function ff_hevc_put_hevc_epel_hv4_8_\suffix, export=1
         b               hevc_put_hevc_epel_hv4_8_end_neon
 endfunc
 
+function ff_vvc_put_epel_hv4_8_\suffix, export=1
+        add             w10, w3, #3                     // height + 3 rows go through the horizontal pass
+        lsl             x10, x10, #8                    // 256 bytes per row; NOTE(review): presumably VVC_MAX_PB_SIZE * 2 - confirm
+        sub             sp, sp, x10                     // tmp_array
+        stp             x5, x30, [sp, #-32]!            // keep vf and the return address across the call
+        stp             x0, x3, [sp, #16]               // keep dst and height
+        add             x0, sp, #32                     // horizontal pass writes into tmp_array (above the 32-byte save area)
+        sub             x1, x1, x2                      // start one row above src
+        add             w3, w3, #3                      // row count for the horizontal pass
+        bl              X(ff_vvc_put_epel_h4_8_\suffix)
+        ldp             x0, x3, [sp, #16]
+        ldp             x5, x30, [sp], #32              // sp points at tmp_array again
+        b               vvc_put_epel_hv4_8_end_neon     // vertical pass reads the rows back from sp
+endfunc
+
 function ff_hevc_put_hevc_epel_hv6_8_\suffix, export=1
         add             w10, w3, #3
         lsl             x10, x10, #7
@@ -2467,6 +2507,21 @@ function ff_hevc_put_hevc_epel_hv8_8_\suffix, export=1
         b               hevc_put_hevc_epel_hv8_8_end_neon
 endfunc
 
+function ff_vvc_put_epel_hv8_8_\suffix, export=1
+        add             w10, w3, #3                     // height + 3 rows go through the horizontal pass
+        lsl             x10, x10, #8                    // 256 bytes per row; NOTE(review): presumably VVC_MAX_PB_SIZE * 2 - confirm
+        sub             sp, sp, x10                     // tmp_array
+        stp             x5, x30, [sp, #-32]!            // keep vf and the return address across the call
+        stp             x0, x3, [sp, #16]               // keep dst and height
+        add             x0, sp, #32                     // horizontal pass writes into tmp_array (above the 32-byte save area)
+        sub             x1, x1, x2                      // start one row above src
+        add             w3, w3, #3                      // row count for the horizontal pass
+        bl              X(ff_vvc_put_epel_h8_8_\suffix)
+        ldp             x0, x3, [sp, #16]
+        ldp             x5, x30, [sp], #32              // sp points at tmp_array again
+        b               vvc_put_epel_hv8_8_end_neon     // vertical pass reads the rows back from sp
+endfunc
+
 function ff_hevc_put_hevc_epel_hv12_8_\suffix, export=1
         add             w10, w3, #3
         lsl             x10, x10, #7
@@ -2497,6 +2552,21 @@ function ff_hevc_put_hevc_epel_hv16_8_\suffix, export=1
         b               hevc_put_hevc_epel_hv16_8_end_neon
 endfunc
 
+function ff_vvc_put_epel_hv16_8_\suffix, export=1
+        add             w10, w3, #3                     // height + 3 rows go through the horizontal pass
+        lsl             x10, x10, #8                    // 256 bytes per row; NOTE(review): presumably VVC_MAX_PB_SIZE * 2 - confirm
+        sub             sp, sp, x10                     // tmp_array
+        stp             x5, x30, [sp, #-32]!            // keep vf and the return address across the call
+        stp             x0, x3, [sp, #16]               // keep dst and height
+        add             x0, sp, #32                     // horizontal pass writes into tmp_array (above the 32-byte save area)
+        sub             x1, x1, x2                      // start one row above src
+        add             w3, w3, #3                      // row count for the horizontal pass
+        bl              X(ff_vvc_put_epel_h16_8_\suffix)
+        ldp             x0, x3, [sp, #16]
+        ldp             x5, x30, [sp], #32              // sp points at tmp_array again
+        b               vvc_put_epel_hv16_8_end_neon    // vertical pass reads the rows back from sp
+endfunc
+
 function ff_hevc_put_hevc_epel_hv24_8_\suffix, export=1
         add             w10, w3, #3
         lsl             x10, x10, #7
@@ -2530,6 +2600,24 @@ function ff_hevc_put_hevc_epel_hv32_8_\suffix, export=1
         ret
 endfunc
 
+function ff_vvc_put_epel_hv32_8_\suffix, export=1
+        stp             x4, x5, [sp, #-64]!             // save the arguments and x30 so the second call can reuse them
+        stp             x2, x3, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        str             x30, [sp, #48]
+        mov             x6, #16                         // width argument for the left half
+        bl              X(ff_vvc_put_epel_hv16_8_\suffix)
+        ldp             x0, x1, [sp, #32]
+        ldp             x2, x3, [sp, #16]
+        ldp             x4, x5, [sp], #48               // pop everything except the x30 slot
+        add             x0, x0, #32                     // dst += 16 int16_t
+        add             x1, x1, #16                     // src += 16 bytes
+        mov             x6, #16                         // width argument for the right half
+        bl              X(ff_vvc_put_epel_hv16_8_\suffix)
+        ldr             x30, [sp], #16
+        ret
+endfunc
+
 function ff_hevc_put_hevc_epel_hv48_8_\suffix, export=1
         stp             x4, x5, [sp, #-64]!
         stp             x2, x3, [sp, #16]
@@ -2579,6 +2667,43 @@ function ff_hevc_put_hevc_epel_hv64_8_\suffix, export=1
         ldr             x30, [sp], #16
         ret
 endfunc
+
+function ff_vvc_put_epel_hv64_8_\suffix, export=1
+        stp             x4, x5, [sp, #-64]!             // save the arguments and x30 so the second call can reuse them
+        stp             x2, x3, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        str             x30, [sp, #48]
+        mov             x6, #32                         // width argument for the left half
+        bl              X(ff_vvc_put_epel_hv32_8_\suffix)
+        ldp             x0, x1, [sp, #32]
+        ldp             x2, x3, [sp, #16]
+        ldp             x4, x5, [sp], #48               // pop everything except the x30 slot
+        add             x0, x0, #64                     // dst += 32 int16_t
+        add             x1, x1, #32                     // src += 32 bytes
+        mov             x6, #32                         // width argument for the right half
+        bl              X(ff_vvc_put_epel_hv32_8_\suffix)
+        ldr             x30, [sp], #16
+        ret
+endfunc
+
+function ff_vvc_put_epel_hv128_8_\suffix, export=1
+        stp             x4, x5, [sp, #-64]!             // save the arguments and x30 so the second call can reuse them
+        stp             x2, x3, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        str             x30, [sp, #48]
+        mov             x6, #64                         // width argument for the left half
+        bl              X(ff_vvc_put_epel_hv64_8_\suffix)
+        ldp             x0, x1, [sp, #32]
+        ldp             x2, x3, [sp, #16]
+        ldp             x4, x5, [sp], #48               // pop everything except the x30 slot
+        add             x0, x0, #128                    // dst += 64 int16_t
+        add             x1, x1, #64                     // src += 64 bytes
+        mov             x6, #64                         // width argument for the right half
+        bl              X(ff_vvc_put_epel_hv64_8_\suffix)
+        ldr             x30, [sp], #16
+        ret
+endfunc
+
 .endm
 
 epel_hv neon
diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index c947885145..4867491620 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -84,6 +84,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
             c->inter.put[1][5][0][1] =
             c->inter.put[1][6][0][1] = ff_vvc_put_epel_h32_8_neon;
 
+            c->inter.put[1][1][1][1] = ff_vvc_put_epel_hv4_8_neon;
+            c->inter.put[1][2][1][1] = ff_vvc_put_epel_hv8_8_neon;
+            c->inter.put[1][3][1][1] = ff_vvc_put_epel_hv16_8_neon;
+            c->inter.put[1][4][1][1] = ff_vvc_put_epel_hv32_8_neon;
+            c->inter.put[1][5][1][1] = ff_vvc_put_epel_hv64_8_neon;
+            c->inter.put[1][6][1][1] = ff_vvc_put_epel_hv128_8_neon;
+
             c->inter.put_uni[0][1][0][0] = ff_vvc_put_pel_uni_pixels4_8_neon;
             c->inter.put_uni[0][2][0][0] = ff_vvc_put_pel_uni_pixels8_8_neon;
             c->inter.put_uni[0][3][0][0] = ff_vvc_put_pel_uni_pixels16_8_neon;
@@ -134,6 +141,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
             c->inter.put[1][4][0][1] = ff_vvc_put_epel_h32_8_neon_i8mm;
             c->inter.put[1][5][0][1] = ff_vvc_put_epel_h64_8_neon_i8mm;
             c->inter.put[1][6][0][1] = ff_vvc_put_epel_h128_8_neon_i8mm;
+
+            c->inter.put[1][1][1][1] = ff_vvc_put_epel_hv4_8_neon_i8mm;
+            c->inter.put[1][2][1][1] = ff_vvc_put_epel_hv8_8_neon_i8mm;
+            c->inter.put[1][3][1][1] = ff_vvc_put_epel_hv16_8_neon_i8mm;
+            c->inter.put[1][4][1][1] = ff_vvc_put_epel_hv32_8_neon_i8mm;
+            c->inter.put[1][5][1][1] = ff_vvc_put_epel_hv64_8_neon_i8mm;
+            c->inter.put[1][6][1][1] = ff_vvc_put_epel_hv128_8_neon_i8mm;
         }
     } else if (bd == 10) {
         c->alf.filter[LUMA] = alf_filter_luma_10_neon;