swscale/aarch64/rgb2rgb: add neon implementation for rgb24toyv12

A55               A76
rgb24toyv12_16_200_c:     36890.6           17275.5
rgb24toyv12_16_200_neon:  12460.1 ( 2.96x)   5360.8 ( 3.22x)
rgb24toyv12_128_60_c:     83205.1           39884.8
rgb24toyv12_128_60_neon:  27468.4 ( 3.03x)  13552.5 ( 2.94x)
rgb24toyv12_512_16_c:     88111.6           42346.8
rgb24toyv12_512_16_neon:  29126.6 ( 3.03x)  14411.2 ( 2.94x)
rgb24toyv12_1920_4_c:     82068.1           39620.0
rgb24toyv12_1920_4_neon:  27011.6 ( 3.04x)  13492.2 ( 2.94x)
This commit is contained in:
Ramiro Polla 2024-08-28 00:52:09 +02:00
parent caaec2ea95
commit c0666d8bed
2 changed files with 164 additions and 0 deletions

View File

@ -27,6 +27,9 @@
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
/* NEON implementation of rgb24toyv12; rgb2rgb_init_aarch64() installs it into
 * ff_rgb24toyv12 when have_neon(cpu_flags) is true. */
void ff_rgb24toyv12_neon(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
uint8_t *vdst, int width, int height, int lumStride,
int chromStride, int srcStride, int32_t *rgb2yuv);
/* NEON byte interleaver; rgb2rgb_init_aarch64() installs it into
 * interleaveBytes when have_neon(cpu_flags) is true. */
void ff_interleave_bytes_neon(const uint8_t *src1, const uint8_t *src2,
uint8_t *dest, int width, int height,
int src1Stride, int src2Stride, int dstStride);
@ -39,6 +42,7 @@ av_cold void rgb2rgb_init_aarch64(void)
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags)) {
ff_rgb24toyv12 = ff_rgb24toyv12_neon;
interleaveBytes = ff_interleave_bytes_neon;
deinterleaveBytes = ff_deinterleave_bytes_neon;
}

View File

@ -1,5 +1,6 @@
/*
* Copyright (c) 2020 Martin Storsjo
* Copyright (c) 2024 Ramiro Polla
*
* This file is part of FFmpeg.
*
@ -20,6 +21,165 @@
#include "libavutil/aarch64/asm.S"
// Byte offset that ff_rgb24toyv12_neon adds to its rgb2yuv pointer before
// loading the 16-bit coefficients into v0-v2.
// NOTE(review): the 16*4 + 16*32 split presumably mirrors the layout of the
// rgb2yuv table built by libswscale; confirm against the table definition.
// The expansion is not parenthesized, so only use it where a bare expression
// is safe (e.g. as an immediate operand).
#define RGB2YUV_COEFFS 16*4+16*32
// Luma (Y) coefficients for B, G and R: lanes 0-2 of v0
#define BY v0.h[0]
#define GY v0.h[1]
#define RY v0.h[2]
// Chroma U coefficients for B, G and R: lanes 0-2 of v1
#define BU v1.h[0]
#define GU v1.h[1]
#define RU v1.h[2]
// Chroma V coefficients for B, G and R: lanes 0-2 of v2
#define BV v2.h[0]
#define GV v2.h[1]
#define RV v2.h[2]
// Vector registers that hold the per-lane offsets added while narrowing
// the 16-bit results to bytes (Y_OFFSET for luma, UV_OFFSET for chroma)
#define Y_OFFSET v22
#define UV_OFFSET v23
// rgbconv16: weighted sum of three colour planes, reduced to 16 bits
//   dst         destination vector, written as .8h
//   b, g, r     source vectors, read as .8h (signed 16-bit lanes)
//   bc, gc, rc  by-element coefficient operands (e.g. v0.h[0])
// For every lane: dst = low 16 bits of ((b * bc + g * gc + r * rc) >> 7)
// Clobbers v3 (accumulator for lanes 0-3) and v4 (accumulator for lanes 4-7).
.macro rgbconv16 dst, b, g, r, bc, gc, rc
        // blue contribution starts both 32-bit accumulators
        smull           v3.4s, \b\().4h, \bc
        smull2          v4.4s, \b\().8h, \bc
        // accumulate green
        smlal           v3.4s, \g\().4h, \gc
        smlal2          v4.4s, \g\().8h, \gc
        // accumulate red
        smlal           v3.4s, \r\().4h, \rc
        smlal2          v4.4s, \r\().8h, \rc
        // drop 7 fractional bits and pack both halves into dst
        shrn            \dst\().4h, v3.4s, #7
        shrn2           \dst\().8h, v4.4s, #7
.endm
// void ff_rgb24toyv12_neon(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
//                          uint8_t *vdst, int width, int height, int lumStride,
//                          int chromStride, int srcStride, int32_t *rgb2yuv);
//
// Converts packed 3-bytes-per-pixel input (bytes labelled B, G, R in memory
// order below) to planar Y, U and V. Two source rows are processed per outer
// iteration and 16 pixels per inner iteration:
//   - one Y byte is produced for every pixel of both rows;
//   - one U and one V byte are produced per 2x2 pixel block, computed from
//     the truncated average ((sum of 4) >> 2) of that block's B, G and R.
// Each result is (b*cb + g*cg + r*cr) >> 7 kept as 16 bits; the offset
// (0x1000 for Y, 0x8000 for U/V) is then added and the high byte is stored.
//
// Returns immediately when width <= 0 or height <= 0.
//
// The row paddings are derived from the exact width while the inner loop
// always advances by 16 pixels, so width is expected to be a multiple of 16;
// rows are consumed in pairs, so height is expected to be even.
// NOTE(review): confirm that all callers guarantee both.
//
// Clobbers: x0-x7, x10, x11, x14, x15, v0-v7, v16-v29, condition flags.
function ff_rgb24toyv12_neon, export=1
// x0  const uint8_t *src
// x1  uint8_t *ydst
// x2  uint8_t *udst
// x3  uint8_t *vdst
// w4  int width
// w5  int height
// w6  int lumStride
// w7  int chromStride
        // both loops below are bottom-tested, so bail out on empty input
        // before anything is loaded or stored
        cmp             w4, #0
        b.le            3f
        cmp             w5, #0
        b.le            3f

        // the ninth and tenth arguments are passed on the stack
        ldrsw           x14, [sp]
        ldr             x15, [sp, #8]
// x14 int srcStride
// x15 int32_t *rgb2yuv

        // extend width and stride parameters
        uxtw            x4, w4
        sxtw            x6, w6
        sxtw            x7, w7

        // src1 = x0
        // src2 = x10
        add             x10, x0, x14                // x10 = src + srcStride
        lsl             x14, x14, #1                // srcStride *= 2
        add             x11, x4, x4, lsl #1         // x11 = 3 * width
        sub             x14, x14, x11               // srcPadding = (2 * srcStride) - (3 * width)

        // ydst1 = x1
        // ydst2 = x11
        add             x11, x1, x6                 // x11 = ydst + lumStride
        lsl             x6, x6, #1                  // lumStride *= 2
        sub             x6, x6, x4                  // lumPadding = (2 * lumStride) - width

        sub             x7, x7, x4, lsr #1          // chromPadding = chromStride - (width / 2)

        // load rgb2yuv coefficients into v0, v1, and v2
        add             x15, x15, #RGB2YUV_COEFFS
        ld1             {v0.8h-v2.8h}, [x15]        // load 24 values

        // load offset constants
        movi            Y_OFFSET.8h, #0x10, lsl #8
        movi            UV_OFFSET.8h, #0x80, lsl #8

1:      // outer loop: one pair of rows per iteration
        mov             w15, w4                     // w15 = width

2:      // inner loop: 16 pixels of both rows per iteration
        // load first line
        ld3             {v26.16b, v27.16b, v28.16b}, [x0], #48

        // widen first line to 16-bit
        uxtl            v16.8h, v26.8b              // v16 = B11
        uxtl            v17.8h, v27.8b              // v17 = G11
        uxtl            v18.8h, v28.8b              // v18 = R11
        uxtl2           v19.8h, v26.16b             // v19 = B12
        uxtl2           v20.8h, v27.16b             // v20 = G12
        uxtl2           v21.8h, v28.16b             // v21 = R12

        // calculate Y values for first line
        rgbconv16       v24, v16, v17, v18, BY, GY, RY // v24 = Y11
        rgbconv16       v25, v19, v20, v21, BY, GY, RY // v25 = Y12

        // load second line
        ld3             {v26.16b, v27.16b, v28.16b}, [x10], #48

        // pairwise add and save rgb values to calculate average
        addp            v5.8h, v16.8h, v19.8h
        addp            v6.8h, v17.8h, v20.8h
        addp            v7.8h, v18.8h, v21.8h

        // widen second line to 16-bit
        uxtl            v16.8h, v26.8b              // v16 = B21
        uxtl            v17.8h, v27.8b              // v17 = G21
        uxtl            v18.8h, v28.8b              // v18 = R21
        uxtl2           v19.8h, v26.16b             // v19 = B22
        uxtl2           v20.8h, v27.16b             // v20 = G22
        uxtl2           v21.8h, v28.16b             // v21 = R22

        // calculate Y values for second line
        rgbconv16       v26, v16, v17, v18, BY, GY, RY // v26 = Y21
        rgbconv16       v27, v19, v20, v21, BY, GY, RY // v27 = Y22

        // pairwise add rgb values to calculate average
        addp            v16.8h, v16.8h, v19.8h
        addp            v17.8h, v17.8h, v20.8h
        addp            v18.8h, v18.8h, v21.8h

        // calculate average
        add             v16.8h, v16.8h, v5.8h
        add             v17.8h, v17.8h, v6.8h
        add             v18.8h, v18.8h, v7.8h
        ushr            v16.8h, v16.8h, #2
        ushr            v17.8h, v17.8h, #2
        ushr            v18.8h, v18.8h, #2

        // calculate U and V values
        rgbconv16       v28, v16, v17, v18, BU, GU, RU // v28 = U
        rgbconv16       v29, v16, v17, v18, BV, GV, RV // v29 = V

        // add offsets and narrow all values
        addhn           v24.8b, v24.8h, Y_OFFSET.8h
        addhn           v25.8b, v25.8h, Y_OFFSET.8h
        addhn           v26.8b, v26.8h, Y_OFFSET.8h
        addhn           v27.8b, v27.8h, Y_OFFSET.8h
        addhn           v28.8b, v28.8h, UV_OFFSET.8h
        addhn           v29.8b, v29.8h, UV_OFFSET.8h

        // the stores below leave the flags untouched, so the b.gt that
        // follows them still tests the result of this subs
        subs            w15, w15, #16

        // store output
        st1             {v24.8b, v25.8b}, [x1], #16 // store ydst1
        st1             {v26.8b, v27.8b}, [x11], #16 // store ydst2
        st1             {v28.8b}, [x2], #8          // store udst
        st1             {v29.8b}, [x3], #8          // store vdst

        b.gt            2b

        // the adds below do not set flags, so the final b.gt tests this subs
        subs            w5, w5, #2

        // row += 2
        add             x0, x0, x14                 // src1  += srcPadding
        add             x10, x10, x14               // src2  += srcPadding
        add             x1, x1, x6                  // ydst1 += lumPadding
        add             x11, x11, x6                // ydst2 += lumPadding
        add             x2, x2, x7                  // udst  += chromPadding
        add             x3, x3, x7                  // vdst  += chromPadding
        b.gt            1b

3:
        ret
endfunc
// void ff_interleave_bytes_neon(const uint8_t *src1, const uint8_t *src2,
// uint8_t *dest, int width, int height,
// int src1Stride, int src2Stride, int dstStride);