aarch64/vvc: Add w_avg

w_avg_8_2x2_c:                                           0.0 ( 0.00x)
w_avg_8_2x2_neon:                                        0.0 ( 0.00x)
w_avg_8_4x4_c:                                           0.2 ( 1.00x)
w_avg_8_4x4_neon:                                        0.0 ( 0.00x)
w_avg_8_8x8_c:                                           1.2 ( 1.00x)
w_avg_8_8x8_neon:                                        0.2 ( 5.00x)
w_avg_8_16x16_c:                                         4.2 ( 1.00x)
w_avg_8_16x16_neon:                                      0.8 ( 5.67x)
w_avg_8_32x32_c:                                        16.2 ( 1.00x)
w_avg_8_32x32_neon:                                      2.5 ( 6.50x)
w_avg_8_64x64_c:                                        64.5 ( 1.00x)
w_avg_8_64x64_neon:                                      9.0 ( 7.17x)
w_avg_8_128x128_c:                                     269.5 ( 1.00x)
w_avg_8_128x128_neon:                                   35.5 ( 7.59x)
w_avg_10_2x2_c:                                          0.2 ( 1.00x)
w_avg_10_2x2_neon:                                       0.2 ( 1.00x)
w_avg_10_4x4_c:                                          0.2 ( 1.00x)
w_avg_10_4x4_neon:                                       0.2 ( 1.00x)
w_avg_10_8x8_c:                                          1.0 ( 1.00x)
w_avg_10_8x8_neon:                                       0.2 ( 4.00x)
w_avg_10_16x16_c:                                        4.2 ( 1.00x)
w_avg_10_16x16_neon:                                     0.8 ( 5.67x)
w_avg_10_32x32_c:                                       16.2 ( 1.00x)
w_avg_10_32x32_neon:                                     2.5 ( 6.50x)
w_avg_10_64x64_c:                                       66.2 ( 1.00x)
w_avg_10_64x64_neon:                                    10.0 ( 6.62x)
w_avg_10_128x128_c:                                    277.8 ( 1.00x)
w_avg_10_128x128_neon:                                  39.8 ( 6.99x)
w_avg_12_2x2_c:                                          0.0 ( 0.00x)
w_avg_12_2x2_neon:                                       0.2 ( 0.00x)
w_avg_12_4x4_c:                                          0.2 ( 1.00x)
w_avg_12_4x4_neon:                                       0.0 ( 0.00x)
w_avg_12_8x8_c:                                          1.2 ( 1.00x)
w_avg_12_8x8_neon:                                       0.5 ( 2.50x)
w_avg_12_16x16_c:                                        4.8 ( 1.00x)
w_avg_12_16x16_neon:                                     0.8 ( 6.33x)
w_avg_12_32x32_c:                                       17.0 ( 1.00x)
w_avg_12_32x32_neon:                                     2.8 ( 6.18x)
w_avg_12_64x64_c:                                       64.0 ( 1.00x)
w_avg_12_64x64_neon:                                    10.0 ( 6.40x)
w_avg_12_128x128_c:                                    269.2 ( 1.00x)
w_avg_12_128x128_neon:                                  42.0 ( 6.41x)

Signed-off-by: Zhao Zhili <zhilizhao@tencent.com>
Zhao Zhili, 2024-09-29 20:02:27 +08:00; committed by Nuo Mi
parent 76eb3e5ff3, commit 0ba9e8d0d4
2 files changed, 118 insertions(+), 17 deletions(-)

@@ -52,6 +52,39 @@ void ff_vvc_avg_12_neon(uint8_t *dst, ptrdiff_t dst_stride,
                          const int16_t *src0, const int16_t *src1, int width,
                          int height);
 
+void ff_vvc_w_avg_8_neon(uint8_t *_dst, ptrdiff_t _dst_stride,
+                         const int16_t *src0, const int16_t *src1,
+                         int width, int height,
+                         uintptr_t w0_w1, uintptr_t offset_shift);
+void ff_vvc_w_avg_10_neon(uint8_t *_dst, ptrdiff_t _dst_stride,
+                          const int16_t *src0, const int16_t *src1,
+                          int width, int height,
+                          uintptr_t w0_w1, uintptr_t offset_shift);
+void ff_vvc_w_avg_12_neon(uint8_t *_dst, ptrdiff_t _dst_stride,
+                          const int16_t *src0, const int16_t *src1,
+                          int width, int height,
+                          uintptr_t w0_w1, uintptr_t offset_shift);
+
+/* When passing arguments to functions, Apple platforms diverge from the ARM64
+ * standard ABI for functions that require passing arguments on the stack. To
+ * simplify portability in the assembly function interface, use a different
+ * function signature that doesn't require passing arguments on the stack.
+ */
+#define W_AVG_FUN(bit_depth) \
+static void vvc_w_avg_ ## bit_depth(uint8_t *dst, ptrdiff_t dst_stride, \
+        const int16_t *src0, const int16_t *src1, int width, int height, \
+        int denom, int w0, int w1, int o0, int o1) \
+{ \
+    int shift = denom + FFMAX(3, 15 - bit_depth); \
+    int offset = ((o0 + o1) * (1 << (bit_depth - 8)) + 1) * (1 << (shift - 1)); \
+    uintptr_t w0_w1 = ((uintptr_t)w0 << 32) | (uint32_t)w1; \
+    uintptr_t offset_shift = ((uintptr_t)offset << 32) | (uint32_t)shift; \
+    ff_vvc_w_avg_ ## bit_depth ## _neon(dst, dst_stride, src0, src1, width, height, w0_w1, offset_shift); \
+}
+
+W_AVG_FUN(8)
+W_AVG_FUN(10)
+W_AVG_FUN(12)
+
 void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -123,6 +156,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
         c->inter.put_uni_w[0][6][0][0] = ff_vvc_put_pel_uni_w_pixels128_8_neon;
         c->inter.avg = ff_vvc_avg_8_neon;
+        c->inter.w_avg = vvc_w_avg_8;
 
         for (int i = 0; i < FF_ARRAY_ELEMS(c->sao.band_filter); i++)
             c->sao.band_filter[i] = ff_h26x_sao_band_filter_8x8_8_neon;
@@ -163,11 +197,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
         }
     } else if (bd == 10) {
         c->inter.avg = ff_vvc_avg_10_neon;
+        c->inter.w_avg = vvc_w_avg_10;
 
         c->alf.filter[LUMA] = alf_filter_luma_10_neon;
         c->alf.filter[CHROMA] = alf_filter_chroma_10_neon;
     } else if (bd == 12) {
         c->inter.avg = ff_vvc_avg_12_neon;
+        c->inter.w_avg = vvc_w_avg_12;
 
         c->alf.filter[LUMA] = alf_filter_luma_12_neon;
         c->alf.filter[CHROMA] = alf_filter_chroma_12_neon;
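
As a sanity check of the packed-argument scheme and the offset/shift math in
W_AVG_FUN, here is a small standalone C sketch. It mirrors the macro's
arithmetic and the high:low packing that the assembly later splits apart; the
helper name pack2 and the sample parameter values are illustrative only, not
part of the patch:

    #include <stdint.h>
    #include <stdio.h>

    #define FFMAX(a, b) ((a) > (b) ? (a) : (b))

    /* Pack two 32-bit values into one 64-bit argument, high:low, as
     * W_AVG_FUN does before calling into the assembly. */
    static uintptr_t pack2(int hi, int lo)
    {
        return ((uintptr_t)hi << 32) | (uint32_t)lo;
    }

    int main(void)
    {
        /* Example: 10-bit, denom = 2, weights 3/5, zero offsets. */
        int bit_depth = 10, denom = 2, w0 = 3, w1 = 5, o0 = 0, o1 = 0;

        int shift  = denom + FFMAX(3, 15 - bit_depth);        /* 2 + 5 = 7  */
        int offset = ((o0 + o1) * (1 << (bit_depth - 8)) + 1)
                     * (1 << (shift - 1));                    /* 1 * 64 = 64 */

        uintptr_t w0_w1        = pack2(w0, w1);
        uintptr_t offset_shift = pack2(offset, shift);

        /* The assembly recovers the halves with a 32-bit right shift
         * (high half) and a 32-bit move (low half). */
        printf("w0=%d w1=%d offset=%d shift=%d\n",
               (int)(w0_w1 >> 32), (int)(uint32_t)w0_w1,
               (int)(offset_shift >> 32), (int)(uint32_t)offset_shift);
        return 0;
    }

Keeping all eleven parameters in eight integer register arguments sidesteps
the Apple arm64 calling-convention divergence mentioned in the comment: with
nothing passed on the stack, the same assembly entry point works on both ABIs.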


@@ -22,9 +22,9 @@
 
 #define VVC_MAX_PB_SIZE 128
 
-.macro vvc_avg, bit_depth
+.macro vvc_avg type, bit_depth
 
-.macro vvc_avg_\bit_depth\()_2_4, tap
+.macro vvc_\type\()_\bit_depth\()_2_4 tap
         .if \tap == 2
             ldr             s0, [src0]
             ldr             s2, [src1]
@@ -32,9 +32,19 @@
             ldr             d0, [src0]
             ldr             d2, [src1]
         .endif
+
+.ifc \type, avg
         saddl           v4.4s, v0.4h, v2.4h
         add             v4.4s, v4.4s, v16.4s
         sqshrn          v4.4h, v4.4s, #(15 - \bit_depth)
+.else
+        mov             v4.16b, v16.16b
+        smlal           v4.4s, v0.4h, v19.4h
+        smlal           v4.4s, v2.4h, v20.4h
+        sqshl           v4.4s, v4.4s, v22.4s
+        sqxtn           v4.4h, v4.4s
+.endif
         .if \bit_depth == 8
             sqxtun          v4.8b, v4.8h
         .if \tap == 2
@@ -57,7 +67,7 @@
         add             dst, dst, dst_stride
 .endm
 
-function ff_vvc_avg_\bit_depth\()_neon, export=1
+function ff_vvc_\type\()_\bit_depth\()_neon, export=1
         dst             .req x0
         dst_stride      .req x1
         src0            .req x2
@@ -67,42 +77,64 @@ function ff_vvc_avg_\bit_depth\()_neon, export=1
 
         mov             x10, #(VVC_MAX_PB_SIZE * 2)
         cmp             width, #8
-        .if \bit_depth == 8
-            movi            v16.4s, #64
-        .else
-        .if \bit_depth == 10
-            mov             w6, #1023
-            movi            v16.4s, #16
-        .else
-            mov             w6, #4095
-            movi            v16.4s, #4
-        .endif
-            dup             v17.8h, w6
-            movi            v18.8h, #0
-        .endif
+.ifc \type, avg
+        movi            v16.4s, #(1 << (14 - \bit_depth))
+.else
+        lsr             x11, x6, #32            // weight0
+        mov             w12, w6                 // weight1
+        lsr             x13, x7, #32            // offset
+        mov             w14, w7                 // shift
+        dup             v19.8h, w11
+        neg             w14, w14                // so we can use sqshl
+        dup             v20.8h, w12
+        dup             v16.4s, w13
+        dup             v22.4s, w14
+.endif  // avg
+
+.if \bit_depth >= 10
+        // clip pixel
+        mov             w6, #((1 << \bit_depth) - 1)
+        movi            v18.8h, #0
+        dup             v17.8h, w6
+.endif
+
         b.eq            8f
         b.hi            16f
         cmp             width, #4
         b.eq            4f
 2:  // width == 2
         subs            height, height, #1
-        vvc_avg_\bit_depth\()_2_4 2
+        vvc_\type\()_\bit_depth\()_2_4 2
         b.ne            2b
         b               32f
 4:  // width == 4
         subs            height, height, #1
-        vvc_avg_\bit_depth\()_2_4 4
+        vvc_\type\()_\bit_depth\()_2_4 4
         b.ne            4b
         b               32f
 8:  // width == 8
         ld1             {v0.8h}, [src0], x10
         ld1             {v2.8h}, [src1], x10
+.ifc \type, avg
         saddl           v4.4s, v0.4h, v2.4h
         saddl2          v5.4s, v0.8h, v2.8h
         add             v4.4s, v4.4s, v16.4s
         add             v5.4s, v5.4s, v16.4s
         sqshrn          v4.4h, v4.4s, #(15 - \bit_depth)
         sqshrn2         v4.8h, v5.4s, #(15 - \bit_depth)
+.else
+        mov             v4.16b, v16.16b
+        mov             v5.16b, v16.16b
+        smlal           v4.4s, v0.4h, v19.4h
+        smlal           v4.4s, v2.4h, v20.4h
+        smlal2          v5.4s, v0.8h, v19.8h
+        smlal2          v5.4s, v2.8h, v20.8h
+        sqshl           v4.4s, v4.4s, v22.4s
+        sqshl           v5.4s, v5.4s, v22.4s
+        sqxtn           v4.4h, v4.4s
+        sqxtn2          v4.8h, v5.4s
+.endif
         subs            height, height, #1
         .if \bit_depth == 8
             sqxtun          v4.8b, v4.8h
@@ -122,6 +154,7 @@ function ff_vvc_avg_\bit_depth\()_neon, export=1
 17:
         ldp             q0, q1, [x7], #32
         ldp             q2, q3, [x8], #32
+.ifc \type, avg
         saddl           v4.4s, v0.4h, v2.4h
         saddl2          v5.4s, v0.8h, v2.8h
         saddl           v6.4s, v1.4h, v3.4h
@@ -134,6 +167,28 @@ function ff_vvc_avg_\bit_depth\()_neon, export=1
         sqshrn2         v4.8h, v5.4s, #(15 - \bit_depth)
         sqshrn          v6.4h, v6.4s, #(15 - \bit_depth)
         sqshrn2         v6.8h, v7.4s, #(15 - \bit_depth)
+.else  // avg
+        mov             v4.16b, v16.16b
+        mov             v5.16b, v16.16b
+        mov             v6.16b, v16.16b
+        mov             v7.16b, v16.16b
+        smlal           v4.4s, v0.4h, v19.4h
+        smlal           v4.4s, v2.4h, v20.4h
+        smlal2          v5.4s, v0.8h, v19.8h
+        smlal2          v5.4s, v2.8h, v20.8h
+        smlal           v6.4s, v1.4h, v19.4h
+        smlal           v6.4s, v3.4h, v20.4h
+        smlal2          v7.4s, v1.8h, v19.8h
+        smlal2          v7.4s, v3.8h, v20.8h
+        sqshl           v4.4s, v4.4s, v22.4s
+        sqshl           v5.4s, v5.4s, v22.4s
+        sqshl           v6.4s, v6.4s, v22.4s
+        sqshl           v7.4s, v7.4s, v22.4s
+        sqxtn           v4.4h, v4.4s
+        sqxtn           v6.4h, v6.4s
+        sqxtn2          v4.8h, v5.4s
+        sqxtn2          v6.8h, v7.4s
+.endif  // w_avg
         subs            w6, w6, #16
         .if \bit_depth == 8
             sqxtun          v4.8b, v4.8h
@@ -155,9 +210,19 @@ function ff_vvc_avg_\bit_depth\()_neon, export=1
         b.ne            16b
 32:
         ret
+
+.unreq          dst
+.unreq          dst_stride
+.unreq          src0
+.unreq          src1
+.unreq          width
+.unreq          height
 endfunc
 .endm
 
-vvc_avg 8
-vvc_avg 10
-vvc_avg 12
+vvc_avg avg, 8
+vvc_avg avg, 10
+vvc_avg avg, 12
+vvc_avg w_avg, 8
+vvc_avg w_avg, 10
+vvc_avg w_avg, 12
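
A note on the w_avg arithmetic above: AArch64 NEON has no variable
right-shift by register, only SSHL/SQSHL, which shift each lane left by a
signed per-lane amount and shift right when that amount is negative; hence
the neg on the shift value before it is broadcast into v22. Per sample the
kernel computes offset + src0*w0 + src1*w1, shifts right by shift, then
narrows and clips to the pixel range. A scalar C model of one sample (my own
sketch, assuming the same operand meanings as the assembly; the saturation
points of sqshl/sqxtn are folded into the final clamp):

    #include <stdint.h>

    static uint16_t w_avg_pixel(int16_t s0, int16_t s1, int w0, int w1,
                                int offset, int shift, int bit_depth)
    {
        int64_t acc = offset;        /* accumulator seeded from v16 (offset) */
        acc += (int64_t)s0 * w0;     /* smlal v4.4s, v0.4h, v19.4h           */
        acc += (int64_t)s1 * w1;     /* smlal v4.4s, v2.4h, v20.4h           */
        acc >>= shift;               /* sqshl by the negated shift amount    */

        /* sqxtn/sqxtun plus the v17/v18 bounds: clip to [0, 2^bd - 1] */
        int max = (1 << bit_depth) - 1;
        if (acc < 0)
            acc = 0;
        if (acc > max)
            acc = max;
        return (uint16_t)acc;
    }

Since offset already contains the rounding term (1 << (shift - 1)) from
W_AVG_FUN, a plain truncating right shift is sufficient here; no rounding
shift instruction is needed.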