lavc/opusdsp: rewrite R-V V postfilter

This uses a more traditional approach, allowing processing of up to
period minus two elements per iteration. This also allows the algorithm
to work for any vector length.

As the T-Head C908 device under test can load 16 elements per loop
iteration, there is unsurprisingly a small performance drop when the period
is minimal and the parallelism is capped at 13 elements:

Before:
postfilter_15_c:         21222.2
postfilter_15_rvv_f32:   22007.7
postfilter_512_c:        20189.7
postfilter_512_rvv_f32:  22004.2
postfilter_1022_c:       20189.7
postfilter_1022_rvv_f32: 22004.2

After:
postfilter_15_c:         20189.5
postfilter_15_rvv_f32:    7057.2
postfilter_512_c:        20189.5
postfilter_512_rvv_f32:   5667.2
postfilter_1022_c:       20192.7
postfilter_1022_rvv_f32:  5667.2
This commit is contained in:
Rémi Denis-Courmont 2023-11-02 21:08:56 +02:00
parent 02594c8c01
commit adc87a5f7c
2 changed files with 43 additions and 67 deletions

View File

@ -25,30 +25,15 @@
#include "libavutil/riscv/cpu.h"
#include "libavcodec/opusdsp.h"
void ff_opus_postfilter_rvv_128(float *data, int period, float *g, int len);
void ff_opus_postfilter_rvv_256(float *data, int period, float *g, int len);
void ff_opus_postfilter_rvv_512(float *data, int period, float *g, int len);
void ff_opus_postfilter_rvv_1024(float *data, int period, float *g, int len);
void ff_opus_postfilter_rvv(float *data, int period, float *g, int len);
av_cold void ff_opus_dsp_init_riscv(OpusDSP *d)
{
#if HAVE_RVV
int flags = av_get_cpu_flags();
if (flags & AV_CPU_FLAG_RVV_F32)
switch (ff_get_rv_vlenb()) {
case 16:
d->postfilter = ff_opus_postfilter_rvv_128;
break;
case 32:
d->postfilter = ff_opus_postfilter_rvv_256;
break;
case 64:
d->postfilter = ff_opus_postfilter_rvv_512;
break;
case 128:
d->postfilter = ff_opus_postfilter_rvv_512;
break;
}
if ((flags & AV_CPU_FLAG_RVV_F32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
(flags & AV_CPU_FLAG_RVB_BASIC))
d->postfilter = ff_opus_postfilter_rvv;
#endif
}

View File

@ -20,56 +20,47 @@
#include "libavutil/riscv/asm.S"
func ff_opus_postfilter_rvv_128, zve32f
lvtypei a5, e32, m2, ta, ma
j 1f
endfunc
func ff_opus_postfilter_rvv, zve32f
flw fa0, 0(a2) // g0
slli t1, a1, 2
flw fa1, 4(a2) // g1
sub t0, a0, t1
flw fa2, 8(a2) // g2
addi t0, t0, 2 * 4 // data - (period - 2) = initial &x0
func ff_opus_postfilter_rvv_512, zve32f
lvtypei a5, e32, mf2, ta, ma
j 1f
endfunc
func ff_opus_postfilter_rvv_1024, zve32f
lvtypei a5, e32, mf4, ta, ma
j 1f
endfunc
func ff_opus_postfilter_rvv_256, zve32f
lvtypei a5, e32, m1, ta, ma
flw ft4, -16(t0)
addi t3, a1, -2 // maximum parallelism w/o stepping our tail
flw ft3, -12(t0)
flw ft2, -8(t0)
flw ft1, -4(t0)
1:
li a4, 5
addi a1, a1, 2
slli a1, a1, 2
lw t1, 4(a2)
vsetivli zero, 3, e32, m1, ta, ma
vle32.v v24, (a2)
sub a1, a0, a1 // a1 = &x4 = &data[-(period + 2)]
vsetvl zero, a4, a5
vslide1up.vx v8, v24, t1
lw t2, 8(a2)
vle32.v v16, (a1)
vslide1up.vx v24, v8, t2 // v24 = { g[2], g[1], g[0], g[1], g[2] }
2:
vsetvl t0, a3, a5
vle32.v v0, (a0)
sub a3, a3, t0
3:
vsetvl zero, a4, a5
lw t2, 20(a1)
vfmul.vv v8, v24, v16
addi a0, a0, 4
vslide1down.vx v16, v16, t2
addi a1, a1, 4
vfredusum.vs v0, v8, v0
vsetvl zero, t0, a5
vmv.x.s t1, v0
addi t0, t0, -1
vslide1down.vx v0, v0, zero
sw t1, -4(a0)
bnez t0, 3b
bnez a3, 2b
min t1, a3, t3
vsetvli t1, t1, e32, m4, ta, ma
vle32.v v0, (t0) // x0
sub a3, a3, t1
vle32.v v28, (a0)
sh2add t0, t1, t0
vfslide1up.vf v4, v0, ft1
addi t2, t1, -4
vfslide1up.vf v8, v4, ft2
vfslide1up.vf v12, v8, ft3
vfslide1up.vf v16, v12, ft4
vfadd.vv v20, v4, v12
vfadd.vv v24, v0, v16
vslidedown.vx v12, v0, t2
vfmacc.vf v28, fa0, v8
vslidedown.vi v4, v12, 2
vfmacc.vf v28, fa1, v20
vslide1down.vx v8, v12, zero
vfmacc.vf v28, fa2, v24
vslide1down.vx v0, v4, zero
vse32.v v28, (a0)
vfmv.f.s ft4, v12
sh2add a0, t1, a0
vfmv.f.s ft2, v4
vfmv.f.s ft3, v8
vfmv.f.s ft1, v0
bnez a3, 1b
ret
endfunc