avcodec/mips: MSA (MIPS-SIMD-Arch) optimizations for hpel functions

This patch adds MSA (MIPS-SIMD-Arch) optimizations for hpel functions in new file hpeldsp_msa.c Adds new generic macros (needed for this patch) in libavutil/mips/generic_macros_msa.h Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
2024-09-19 21:06:42 +00:00 · 2015-06-14 23:26:24 +05:30 · 2015-06-14 23:26:24 +05:30 · ee3ef5fda2
commit ee3ef5fda2
parent d277b05c51
7 changed files with 1825 additions and 0 deletions
--- a/libavcodec/hpeldsp.c
+++ b/libavcodec/hpeldsp.c
@ -365,4 +365,6 @@ av_cold void ff_hpeldsp_init(HpelDSPContext *c, int flags)
        ff_hpeldsp_init_ppc(c, flags);
    if (ARCH_X86)
        ff_hpeldsp_init_x86(c, flags);
+    if (ARCH_MIPS)
+        ff_hpeldsp_init_mips(c, flags);
 }
--- a/libavcodec/hpeldsp.h
+++ b/libavcodec/hpeldsp.h
@ -99,5 +99,6 @@ void ff_hpeldsp_init_alpha(HpelDSPContext *c, int flags);
 void ff_hpeldsp_init_arm(HpelDSPContext *c, int flags);
 void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags);
 void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags);
+void ff_hpeldsp_init_mips(HpelDSPContext *c, int flags);

 #endif /* AVCODEC_HPELDSP_H */
--- a/libavcodec/mips/Makefile
+++ b/libavcodec/mips/Makefile
@ -26,6 +26,7 @@ OBJS-$(CONFIG_H264CHROMA)                 += mips/h264chroma_init_mips.o
 OBJS-$(CONFIG_H264PRED)                   += mips/h264pred_init_mips.o
 OBJS-$(CONFIG_H263DSP)                    += mips/h263dsp_init_mips.o
 OBJS-$(CONFIG_QPELDSP)                    += mips/qpeldsp_init_mips.o
+OBJS-$(CONFIG_HPELDSP)                    += mips/hpeldsp_init_mips.o
 MSA-OBJS-$(CONFIG_HEVC_DECODER)           += mips/hevcdsp_msa.o            \
                                             mips/hevc_mc_uni_msa.o        \
                                             mips/hevc_mc_uniw_msa.o       \
@ -41,5 +42,6 @@ MSA-OBJS-$(CONFIG_H264CHROMA)             += mips/h264chroma_msa.o
 MSA-OBJS-$(CONFIG_H264PRED)               += mips/h264pred_msa.o
 MSA-OBJS-$(CONFIG_H263DSP)                += mips/h263dsp_msa.o
 MSA-OBJS-$(CONFIG_QPELDSP)                += mips/qpeldsp_msa.o
+MSA-OBJS-$(CONFIG_HPELDSP)                += mips/hpeldsp_msa.o
 LOONGSON3-OBJS-$(CONFIG_H264DSP)          += mips/h264dsp_mmi.o
 LOONGSON3-OBJS-$(CONFIG_H264CHROMA)       += mips/h264chroma_mmi.o
--- a/libavcodec/mips/hpeldsp_init_mips.c
+++ b/libavcodec/mips/hpeldsp_init_mips.c
@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "../hpeldsp.h"
+#include "libavcodec/mips/hpeldsp_mips.h"
+
+#if HAVE_MSA
+static void ff_hpeldsp_init_msa(HpelDSPContext *c, int flags)
+{
+    c->put_pixels_tab[0][0] = ff_put_pixels16_msa;
+    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_msa;
+    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_msa;
+    c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_msa;
+
+    c->put_pixels_tab[1][0] = ff_put_pixels8_msa;
+    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_msa;
+    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_msa;
+    c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_msa;
+
+    c->put_pixels_tab[2][1] = ff_put_pixels4_x2_msa;
+    c->put_pixels_tab[2][2] = ff_put_pixels4_y2_msa;
+    c->put_pixels_tab[2][3] = ff_put_pixels4_xy2_msa;
+
+    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_msa;
+    c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_msa;
+    c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_msa;
+    c->put_no_rnd_pixels_tab[0][3] = ff_put_no_rnd_pixels16_xy2_msa;
+
+    c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_msa;
+    c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_msa;
+    c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_msa;
+    c->put_no_rnd_pixels_tab[1][3] = ff_put_no_rnd_pixels8_xy2_msa;
+
+    c->avg_pixels_tab[0][0] = ff_avg_pixels16_msa;
+    c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_msa;
+    c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_msa;
+    c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_msa;
+
+    c->avg_pixels_tab[1][0] = ff_avg_pixels8_msa;
+    c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_msa;
+    c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_msa;
+    c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_msa;
+
+    c->avg_pixels_tab[2][0] = ff_avg_pixels4_msa;
+    c->avg_pixels_tab[2][1] = ff_avg_pixels4_x2_msa;
+    c->avg_pixels_tab[2][2] = ff_avg_pixels4_y2_msa;
+    c->avg_pixels_tab[2][3] = ff_avg_pixels4_xy2_msa;
+}
+#endif  // #if HAVE_MSA
+
+void ff_hpeldsp_init_mips(HpelDSPContext *c, int flags)
+{
+#if HAVE_MSA
+    ff_hpeldsp_init_msa(c, flags);
+#endif  // #if HAVE_MSA
+}
--- a/libavcodec/mips/hpeldsp_mips.h
+++ b/libavcodec/mips/hpeldsp_mips.h
@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_HPELDSP_MIPS_H
+#define AVCODEC_MIPS_HPELDSP_MIPS_H
+
+#include "libavcodec/bit_depth_template.c"
+
+void ff_put_pixels16_msa(uint8_t *block, const uint8_t *pixels,
+                         ptrdiff_t line_size, int32_t h);
+void ff_put_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int32_t h);
+void ff_put_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int32_t h);
+void ff_put_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int32_t h);
+void ff_put_pixels8_msa(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int32_t h);
+void ff_put_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int32_t h);
+void ff_put_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int32_t h);
+void ff_put_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int32_t h);
+void ff_put_pixels4_msa(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int32_t h);
+void ff_put_pixels4_x2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int32_t h);
+void ff_put_pixels4_y2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int32_t h);
+void ff_put_pixels4_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
+                                   ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
+                                   ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                                    ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
+                                  ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
+                                  ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                                   ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels16_msa(uint8_t *block, const uint8_t *pixels,
+                         ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels8_msa(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels4_msa(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels4_x2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels4_y2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels4_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int32_t h);
+
+#endif  // #ifndef AVCODEC_MIPS_HPELDSP_MIPS_H
--- a/libavcodec/mips/hpeldsp_msa.c
+++ b/libavcodec/mips/hpeldsp_msa.c
--- a/libavutil/mips/generic_macros_msa.h
+++ b/libavutil/mips/generic_macros_msa.h
@ -804,6 +804,15 @@
 #define SLDI_B2_0_SB(...) SLDI_B2_0(v16i8, __VA_ARGS__)
 #define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)

+#define SLDI_B3_0(RTYPE, in0, in1, in2, out0, out1, out2,  slide_val)     \
+{                                                                         \
+    v16i8 zero_m = { 0 };                                                 \
+    SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val);                    \
+    out2 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in2, slide_val);  \
+}
+#define SLDI_B3_0_UB(...) SLDI_B3_0(v16u8, __VA_ARGS__)
+#define SLDI_B3_0_SB(...) SLDI_B3_0(v16i8, __VA_ARGS__)
+
 #define SLDI_B4_0(RTYPE, in0, in1, in2, in3,            \
                  out0, out1, out2, out3, slide_val)    \
 {                                                       \
@ -1174,6 +1183,13 @@
 }
 #define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)

+#define HADD_UB3(RTYPE, in0, in1, in2, out0, out1, out2)      \
+{                                                             \
+    HADD_UB2(RTYPE, in0, in1, out0, out1);                    \
+    out2 = (RTYPE) __msa_hadd_u_h((v16u8) in2, (v16u8) in2);  \
+}
+#define HADD_UB3_UH(...) HADD_UB3(v8u16, __VA_ARGS__)
+
 /* Description : Horizontal subtraction of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
@ -2408,6 +2424,67 @@
    out3 = (v4i32) __msa_ilvl_d((v2i64) s3_m, (v2i64) s1_m);            \
 }

+/* Description : Average byte elements from pair of vectors and store 8x4 byte
+                 block in destination memory
+   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
+                 Outputs -
+                 Return Type -
+   Details     : Each byte element from input vector pair 'in0' and 'in1' are
+                 averaged (a + b)/2 and stored in 'tmp0_m'
+                 Each byte element from input vector pair 'in2' and 'in3' are
+                 averaged (a + b)/2 and stored in 'tmp1_m'
+                 Each byte element from input vector pair 'in4' and 'in5' are
+                 averaged (a + b)/2 and stored in 'tmp2_m'
+                 Each byte element from input vector pair 'in6' and 'in7' are
+                 averaged (a + b)/2 and stored in 'tmp3_m'
+                 The half vector results from all 4 vectors are stored in
+                 destination memory as 8x4 byte block
+*/
+#define AVE_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
+{                                                                           \
+    uint64_t out0_m, out1_m, out2_m, out3_m;                                \
+    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
+                                                                            \
+    tmp0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1);                       \
+    tmp1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3);                       \
+    tmp2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5);                       \
+    tmp3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7);                       \
+                                                                            \
+    out0_m = __msa_copy_u_d((v2i64) tmp0_m, 0);                             \
+    out1_m = __msa_copy_u_d((v2i64) tmp1_m, 0);                             \
+    out2_m = __msa_copy_u_d((v2i64) tmp2_m, 0);                             \
+    out3_m = __msa_copy_u_d((v2i64) tmp3_m, 0);                             \
+    SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride);                      \
+}
+
+/* Description : Average byte elements from pair of vectors and store 16x4 byte
+                 block in destination memory
+   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
+                 Outputs -
+                 Return Type -
+   Details     : Each byte element from input vector pair 'in0' and 'in1' are
+                 averaged (a + b)/2 and stored in 'tmp0_m'
+                 Each byte element from input vector pair 'in2' and 'in3' are
+                 averaged (a + b)/2 and stored in 'tmp1_m'
+                 Each byte element from input vector pair 'in4' and 'in5' are
+                 averaged (a + b)/2 and stored in 'tmp2_m'
+                 Each byte element from input vector pair 'in6' and 'in7' are
+                 averaged (a + b)/2 and stored in 'tmp3_m'
+                 The results from all 4 vectors are stored in destination
+                 memory as 16x4 byte block
+*/
+#define AVE_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
+{                                                                            \
+    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
+                                                                             \
+    tmp0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1);                        \
+    tmp1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3);                        \
+    tmp2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5);                        \
+    tmp3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7);                        \
+                                                                             \
+    ST_UB4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, pdst, stride);                    \
+}
+
 /* Description : Average rounded byte elements from pair of vectors and store
                 8x4 byte block in destination memory
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
@ -2439,6 +2516,91 @@
    SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride);                       \
 }

+/* Description : Average rounded byte elements from pair of vectors and store
+                 16x4 byte block in destination memory
+   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
+                 Outputs -
+                 Return Type -
+   Details     : Each byte element from input vector pair 'in0' and 'in1' are
+                 average rounded (a + b + 1)/2 and stored in 'tmp0_m'
+                 Each byte element from input vector pair 'in2' and 'in3' are
+                 average rounded (a + b + 1)/2 and stored in 'tmp1_m'
+                 Each byte element from input vector pair 'in4' and 'in5' are
+                 average rounded (a + b + 1)/2 and stored in 'tmp2_m'
+                 Each byte element from input vector pair 'in6' and 'in7' are
+                 average rounded (a + b + 1)/2 and stored in 'tmp3_m'
+                 The vector results from all 4 vectors are stored in
+                 destination memory as 16x4 byte block
+*/
+#define AVER_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
+{                                                                             \
+    v16u8 t0_m, t1_m, t2_m, t3_m;                                             \
+                                                                              \
+    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,                       \
+                t0_m, t1_m, t2_m, t3_m);                                      \
+    ST_UB4(t0_m, t1_m, t2_m, t3_m, pdst, stride);                             \
+}
+
+/* Description : Average rounded byte elements from pair of vectors,
+                 average rounded with destination and store 8x4 byte block
+                 in destination memory
+   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
+                 Outputs -
+                 Return Type -
+   Details     : Each byte element from input vector pair 'in0' and 'in1' are
+                 average rounded (a + b + 1)/2 and stored in 'tmp0_m'
+                 Each byte element from input vector pair 'in2' and 'in3' are
+                 average rounded (a + b + 1)/2 and stored in 'tmp1_m'
+                 Each byte element from input vector pair 'in4' and 'in5' are
+                 average rounded (a + b + 1)/2 and stored in 'tmp2_m'
+                 Each byte element from input vector pair 'in6' and 'in7' are
+                 average rounded (a + b + 1)/2 and stored in 'tmp3_m'
+                 The half vector results from all 4 vectors are stored in
+                 destination memory as 8x4 byte block
+*/
+#define AVER_DST_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
+                          pdst, stride)                            \
+{                                                                  \
+    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                          \
+    v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                          \
+                                                                   \
+    LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m);          \
+    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,            \
+                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                   \
+    AVER_ST8x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m,                  \
+                  dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride);   \
+}
+
+/* Description : Average rounded byte elements from pair of vectors,
+                 average rounded with destination and store 16x4 byte block
+                 in destination memory
+   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
+                 Outputs -
+                 Return Type -
+   Details     : Each byte element from input vector pair 'in0' and 'in1' are
+                 average rounded (a + b + 1)/2 and stored in 'tmp0_m'
+                 Each byte element from input vector pair 'in2' and 'in3' are
+                 average rounded (a + b + 1)/2 and stored in 'tmp1_m'
+                 Each byte element from input vector pair 'in4' and 'in5' are
+                 average rounded (a + b + 1)/2 and stored in 'tmp2_m'
+                 Each byte element from input vector pair 'in6' and 'in7' are
+                 average rounded (a + b + 1)/2 and stored in 'tmp3_m'
+                 The vector results from all 4 vectors are stored in
+                 destination memory as 16x4 byte block
+*/
+#define AVER_DST_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
+                           pdst, stride)                            \
+{                                                                   \
+    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                           \
+    v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                           \
+                                                                    \
+    LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m);           \
+    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,             \
+                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                    \
+    AVER_ST16x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m,                  \
+                   dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride);   \
+}
+
 /* Description : Add block 4x4
   Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
                 Outputs -