aarch64/vvc: Add w_avg

w_avg_8_2x2_c:                                           0.0 ( 0.00x)
w_avg_8_2x2_neon:                                        0.0 ( 0.00x)
w_avg_8_4x4_c:                                           0.2 ( 1.00x)
w_avg_8_4x4_neon:                                        0.0 ( 0.00x)
w_avg_8_8x8_c:                                           1.2 ( 1.00x)
w_avg_8_8x8_neon:                                        0.2 ( 5.00x)
w_avg_8_16x16_c:                                         4.2 ( 1.00x)
w_avg_8_16x16_neon:                                      0.8 ( 5.67x)
w_avg_8_32x32_c:                                        16.2 ( 1.00x)
w_avg_8_32x32_neon:                                      2.5 ( 6.50x)
w_avg_8_64x64_c:                                        64.5 ( 1.00x)
w_avg_8_64x64_neon:                                      9.0 ( 7.17x)
w_avg_8_128x128_c:                                     269.5 ( 1.00x)
w_avg_8_128x128_neon:                                   35.5 ( 7.59x)
w_avg_10_2x2_c:                                          0.2 ( 1.00x)
w_avg_10_2x2_neon:                                       0.2 ( 1.00x)
w_avg_10_4x4_c:                                          0.2 ( 1.00x)
w_avg_10_4x4_neon:                                       0.2 ( 1.00x)
w_avg_10_8x8_c:                                          1.0 ( 1.00x)
w_avg_10_8x8_neon:                                       0.2 ( 4.00x)
w_avg_10_16x16_c:                                        4.2 ( 1.00x)
w_avg_10_16x16_neon:                                     0.8 ( 5.67x)
w_avg_10_32x32_c:                                       16.2 ( 1.00x)
w_avg_10_32x32_neon:                                     2.5 ( 6.50x)
w_avg_10_64x64_c:                                       66.2 ( 1.00x)
w_avg_10_64x64_neon:                                    10.0 ( 6.62x)
w_avg_10_128x128_c:                                    277.8 ( 1.00x)
w_avg_10_128x128_neon:                                  39.8 ( 6.99x)
w_avg_12_2x2_c:                                          0.0 ( 0.00x)
w_avg_12_2x2_neon:                                       0.2 ( 0.00x)
w_avg_12_4x4_c:                                          0.2 ( 1.00x)
w_avg_12_4x4_neon:                                       0.0 ( 0.00x)
w_avg_12_8x8_c:                                          1.2 ( 1.00x)
w_avg_12_8x8_neon:                                       0.5 ( 2.50x)
w_avg_12_16x16_c:                                        4.8 ( 1.00x)
w_avg_12_16x16_neon:                                     0.8 ( 6.33x)
w_avg_12_32x32_c:                                       17.0 ( 1.00x)
w_avg_12_32x32_neon:                                     2.8 ( 6.18x)
w_avg_12_64x64_c:                                       64.0 ( 1.00x)
w_avg_12_64x64_neon:                                    10.0 ( 6.40x)
w_avg_12_128x128_c:                                    269.2 ( 1.00x)
w_avg_12_128x128_neon:                                  42.0 ( 6.41x)

Signed-off-by: Zhao Zhili <zhilizhao@tencent.com>
Zhao Zhili, 2024-09-29 20:02:27 +08:00; committed by Nuo Mi
parent 76eb3e5ff3, commit 0ba9e8d0d4
2 files changed, 118 insertions(+), 17 deletions(-)

@@ -52,6 +52,39 @@ void ff_vvc_avg_12_neon(uint8_t *dst, ptrdiff_t dst_stride,
                          const int16_t *src0, const int16_t *src1, int width,
                          int height);
 
+void ff_vvc_w_avg_8_neon(uint8_t *_dst, ptrdiff_t _dst_stride,
+                         const int16_t *src0, const int16_t *src1,
+                         int width, int height,
+                         uintptr_t w0_w1, uintptr_t offset_shift);
+void ff_vvc_w_avg_10_neon(uint8_t *_dst, ptrdiff_t _dst_stride,
+                          const int16_t *src0, const int16_t *src1,
+                          int width, int height,
+                          uintptr_t w0_w1, uintptr_t offset_shift);
+void ff_vvc_w_avg_12_neon(uint8_t *_dst, ptrdiff_t _dst_stride,
+                          const int16_t *src0, const int16_t *src1,
+                          int width, int height,
+                          uintptr_t w0_w1, uintptr_t offset_shift);
+
+/* When passing arguments to functions, Apple platforms diverge from the ARM64
+ * standard ABI for functions that require passing arguments on the stack. To
+ * simplify portability in the assembly function interface, use a different
+ * function signature that doesn't require passing arguments on the stack.
+ */
+#define W_AVG_FUN(bit_depth) \
+static void vvc_w_avg_ ## bit_depth(uint8_t *dst, ptrdiff_t dst_stride, \
+        const int16_t *src0, const int16_t *src1, int width, int height, \
+        int denom, int w0, int w1, int o0, int o1) \
+{ \
+    int shift = denom + FFMAX(3, 15 - bit_depth); \
+    int offset = ((o0 + o1) * (1 << (bit_depth - 8)) + 1) * (1 << (shift - 1)); \
+    uintptr_t w0_w1 = ((uintptr_t)w0 << 32) | (uint32_t)w1; \
+    uintptr_t offset_shift = ((uintptr_t)offset << 32) | (uint32_t)shift; \
+    ff_vvc_w_avg_ ## bit_depth ## _neon(dst, dst_stride, src0, src1, width, height, w0_w1, offset_shift); \
+}
+
+W_AVG_FUN(8)
+W_AVG_FUN(10)
+W_AVG_FUN(12)
+
 void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -123,6 +156,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
         c->inter.put_uni_w[0][6][0][0] = ff_vvc_put_pel_uni_w_pixels128_8_neon;
         c->inter.avg = ff_vvc_avg_8_neon;
+        c->inter.w_avg = vvc_w_avg_8;
 
         for (int i = 0; i < FF_ARRAY_ELEMS(c->sao.band_filter); i++)
             c->sao.band_filter[i] = ff_h26x_sao_band_filter_8x8_8_neon;
@@ -163,11 +197,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
         }
     } else if (bd == 10) {
         c->inter.avg = ff_vvc_avg_10_neon;
+        c->inter.w_avg = vvc_w_avg_10;
 
         c->alf.filter[LUMA] = alf_filter_luma_10_neon;
         c->alf.filter[CHROMA] = alf_filter_chroma_10_neon;
     } else if (bd == 12) {
         c->inter.avg = ff_vvc_avg_12_neon;
+        c->inter.w_avg = vvc_w_avg_12;
 
         c->alf.filter[LUMA] = alf_filter_luma_12_neon;
         c->alf.filter[CHROMA] = alf_filter_chroma_12_neon;
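
As a sanity check of the packed-argument scheme and the offset/shift math in
W_AVG_FUN, here is a small standalone C sketch. It mirrors the macro's
arithmetic and the high:low packing that the assembly later splits apart; the
helper name pack2 and the sample parameter values are illustrative only, not
part of the patch:

    #include <stdint.h>
    #include <stdio.h>

    #define FFMAX(a, b) ((a) > (b) ? (a) : (b))

    /* Pack two 32-bit values into one 64-bit argument, high:low, as
     * W_AVG_FUN does before calling into the assembly. */
    static uintptr_t pack2(int hi, int lo)
    {
        return ((uintptr_t)hi << 32) | (uint32_t)lo;
    }

    int main(void)
    {
        /* Example: 10-bit, denom = 2, weights 3/5, zero offsets. */
        int bit_depth = 10, denom = 2, w0 = 3, w1 = 5, o0 = 0, o1 = 0;

        int shift  = denom + FFMAX(3, 15 - bit_depth);        /* 2 + 5 = 7  */
        int offset = ((o0 + o1) * (1 << (bit_depth - 8)) + 1)
                     * (1 << (shift - 1));                    /* 1 * 64 = 64 */

        uintptr_t w0_w1        = pack2(w0, w1);
        uintptr_t offset_shift = pack2(offset, shift);

        /* The assembly recovers the halves with a 32-bit right shift
         * (high half) and a 32-bit move (low half). */
        printf("w0=%d w1=%d offset=%d shift=%d\n",
               (int)(w0_w1 >> 32), (int)(uint32_t)w0_w1,
               (int)(offset_shift >> 32), (int)(uint32_t)offset_shift);
        return 0;
    }

Keeping all eleven parameters in eight integer register arguments sidesteps
the Apple arm64 calling-convention divergence mentioned in the comment: with
nothing passed on the stack, the same assembly entry point works on both ABIs.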


@@ -22,9 +22,9 @@
 
 #define VVC_MAX_PB_SIZE 128
 
-.macro vvc_avg, bit_depth
+.macro vvc_avg type, bit_depth
 
-.macro vvc_avg_\bit_depth\()_2_4, tap
+.macro vvc_\type\()_\bit_depth\()_2_4 tap
         .if \tap == 2
             ldr             s0, [src0]
             ldr             s2, [src1]
@@ -32,9 +32,19 @@
             ldr             d0, [src0]
             ldr             d2, [src1]
         .endif
+
+.ifc \type, avg
         saddl           v4.4s, v0.4h, v2.4h
         add             v4.4s, v4.4s, v16.4s
         sqshrn          v4.4h, v4.4s, #(15 - \bit_depth)
+.else
+        mov             v4.16b, v16.16b
+        smlal           v4.4s, v0.4h, v19.4h
+        smlal           v4.4s, v2.4h, v20.4h
+        sqshl           v4.4s, v4.4s, v22.4s
+        sqxtn           v4.4h, v4.4s
+.endif
         .if \bit_depth == 8
             sqxtun          v4.8b, v4.8h
         .if \tap == 2
@@ -57,7 +67,7 @@
         add             dst, dst, dst_stride
 .endm
 
-function ff_vvc_avg_\bit_depth\()_neon, export=1
+function ff_vvc_\type\()_\bit_depth\()_neon, export=1
         dst             .req x0
         dst_stride      .req x1
         src0            .req x2
@@ -67,42 +77,64 @@ function ff_vvc_avg_\bit_depth\()_neon, export=1
 
         mov             x10, #(VVC_MAX_PB_SIZE * 2)
         cmp             width, #8
-        .if \bit_depth == 8
-            movi            v16.4s, #64
-        .else
-        .if \bit_depth == 10
-            mov             w6, #1023
-            movi            v16.4s, #16
-        .else
-            mov             w6, #4095
-            movi            v16.4s, #4
-        .endif
-            dup             v17.8h, w6
-            movi            v18.8h, #0
-        .endif
+.ifc \type, avg
+        movi            v16.4s, #(1 << (14 - \bit_depth))
+.else
+        lsr             x11, x6, #32            // weight0
+        mov             w12, w6                 // weight1
+        lsr             x13, x7, #32            // offset
+        mov             w14, w7                 // shift
+        dup             v19.8h, w11
+        neg             w14, w14                // so we can use sqshl
+        dup             v20.8h, w12
+        dup             v16.4s, w13
+        dup             v22.4s, w14
+.endif  // avg
+
+.if \bit_depth >= 10
+        // clip pixel
+        mov             w6, #((1 << \bit_depth) - 1)
+        movi            v18.8h, #0
+        dup             v17.8h, w6
+.endif
+
         b.eq            8f
         b.hi            16f
         cmp             width, #4
         b.eq            4f
 2:  // width == 2
         subs            height, height, #1
-        vvc_avg_\bit_depth\()_2_4 2
+        vvc_\type\()_\bit_depth\()_2_4 2
         b.ne            2b
         b               32f
 4:  // width == 4
         subs            height, height, #1
-        vvc_avg_\bit_depth\()_2_4 4
+        vvc_\type\()_\bit_depth\()_2_4 4
         b.ne            4b
         b               32f
 8:  // width == 8
         ld1             {v0.8h}, [src0], x10
         ld1             {v2.8h}, [src1], x10
+.ifc \type, avg
         saddl           v4.4s, v0.4h, v2.4h
         saddl2          v5.4s, v0.8h, v2.8h
         add             v4.4s, v4.4s, v16.4s
         add             v5.4s, v5.4s, v16.4s
         sqshrn          v4.4h, v4.4s, #(15 - \bit_depth)
         sqshrn2         v4.8h, v5.4s, #(15 - \bit_depth)
+.else
+        mov             v4.16b, v16.16b
+        mov             v5.16b, v16.16b
+        smlal           v4.4s, v0.4h, v19.4h
+        smlal           v4.4s, v2.4h, v20.4h
+        smlal2          v5.4s, v0.8h, v19.8h
+        smlal2          v5.4s, v2.8h, v20.8h
+        sqshl           v4.4s, v4.4s, v22.4s
+        sqshl           v5.4s, v5.4s, v22.4s
+        sqxtn           v4.4h, v4.4s
+        sqxtn2          v4.8h, v5.4s
+.endif
         subs            height, height, #1
         .if \bit_depth == 8
             sqxtun          v4.8b, v4.8h
@@ -122,6 +154,7 @@ function ff_vvc_avg_\bit_depth\()_neon, export=1
 17:
         ldp             q0, q1, [x7], #32
         ldp             q2, q3, [x8], #32
+.ifc \type, avg
         saddl           v4.4s, v0.4h, v2.4h
         saddl2          v5.4s, v0.8h, v2.8h
         saddl           v6.4s, v1.4h, v3.4h
@@ -134,6 +167,28 @@ function ff_vvc_avg_\bit_depth\()_neon, export=1
         sqshrn2         v4.8h, v5.4s, #(15 - \bit_depth)
         sqshrn          v6.4h, v6.4s, #(15 - \bit_depth)
         sqshrn2         v6.8h, v7.4s, #(15 - \bit_depth)
+.else  // avg
+        mov             v4.16b, v16.16b
+        mov             v5.16b, v16.16b
+        mov             v6.16b, v16.16b
+        mov             v7.16b, v16.16b
+        smlal           v4.4s, v0.4h, v19.4h
+        smlal           v4.4s, v2.4h, v20.4h
+        smlal2          v5.4s, v0.8h, v19.8h
+        smlal2          v5.4s, v2.8h, v20.8h
+        smlal           v6.4s, v1.4h, v19.4h
+        smlal           v6.4s, v3.4h, v20.4h
+        smlal2          v7.4s, v1.8h, v19.8h
+        smlal2          v7.4s, v3.8h, v20.8h
+        sqshl           v4.4s, v4.4s, v22.4s
+        sqshl           v5.4s, v5.4s, v22.4s
+        sqshl           v6.4s, v6.4s, v22.4s
+        sqshl           v7.4s, v7.4s, v22.4s
+        sqxtn           v4.4h, v4.4s
+        sqxtn           v6.4h, v6.4s
+        sqxtn2          v4.8h, v5.4s
+        sqxtn2          v6.8h, v7.4s
+.endif  // w_avg
         subs            w6, w6, #16
         .if \bit_depth == 8
             sqxtun          v4.8b, v4.8h
@@ -155,9 +210,19 @@ function ff_vvc_avg_\bit_depth\()_neon, export=1
         b.ne            16b
 32:
         ret
+
+.unreq          dst
+.unreq          dst_stride
+.unreq          src0
+.unreq          src1
+.unreq          width
+.unreq          height
 endfunc
 .endm
 
-vvc_avg 8
-vvc_avg 10
-vvc_avg 12
+vvc_avg avg, 8
+vvc_avg avg, 10
+vvc_avg avg, 12
+vvc_avg w_avg, 8
+vvc_avg w_avg, 10
+vvc_avg w_avg, 12
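
A note on the w_avg arithmetic above: AArch64 NEON has no variable
right-shift by register, only SSHL/SQSHL, which shift each lane left by a
signed per-lane amount and shift right when that amount is negative; hence
the neg on the shift value before it is broadcast into v22. Per sample the
kernel computes offset + src0*w0 + src1*w1, shifts right by shift, then
narrows and clips to the pixel range. A scalar C model of one sample (my own
sketch, assuming the same operand meanings as the assembly; the saturation
points of sqshl/sqxtn are folded into the final clamp):

    #include <stdint.h>

    static uint16_t w_avg_pixel(int16_t s0, int16_t s1, int w0, int w1,
                                int offset, int shift, int bit_depth)
    {
        int64_t acc = offset;        /* accumulator seeded from v16 (offset) */
        acc += (int64_t)s0 * w0;     /* smlal v4.4s, v0.4h, v19.4h           */
        acc += (int64_t)s1 * w1;     /* smlal v4.4s, v2.4h, v20.4h           */
        acc >>= shift;               /* sqshl by the negated shift amount    */

        /* sqxtn/sqxtun plus the v17/v18 bounds: clip to [0, 2^bd - 1] */
        int max = (1 << bit_depth) - 1;
        if (acc < 0)
            acc = 0;
        if (acc > max)
            acc = max;
        return (uint16_t)acc;
    }

Since offset already contains the rounding term (1 << (shift - 1)) from
W_AVG_FUN, a plain truncating right shift is sufficient here; no rounding
shift instruction is needed.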