mirror of
https://git.ffmpeg.org/ffmpeg.git
synced 2024-09-19 21:06:42 +00:00
x86/vvc_alf: avoid overwriting for non-16 aligned widths
Previously, the code always wrote full 16-pixel-wide blocks, overwriting past the requested width. This was acceptable when there were no picture virtual boundaries, because both CTU sizes and strides were 16-aligned. With picture virtual boundaries, however, each CTU is divided into four ALF blocks, so the overwrite can clobber pixels that belong to later CTUs. When picture virtual boundaries are involved, each ALF block is 8-pixel aligned. For luma, the width is therefore always 8-aligned. For chroma in 4:2:0 format, we must also handle 4-aligned widths.
This commit is contained in:
parent
1fa9f5b17f
commit
6b0e6a98b5
@ -324,18 +324,69 @@ SECTION .text
|
|||||||
%endif
|
%endif
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
; STORE_PIXELS(dst, src)
|
; STORE_PIXELS_W16(dst, src)
|
||||||
%macro STORE_PIXELS 2
|
%macro STORE_PIXELS_W16 2
|
||||||
%if ps == 2
|
%if ps == 2
|
||||||
movu %1, m%2
|
movu [%1], m%2
|
||||||
%else
|
%else
|
||||||
packuswb m%2, m%2
|
movu [%1], xm%2
|
||||||
vpermq m%2, m%2, 0x8
|
|
||||||
movu %1, xm%2
|
|
||||||
%endif
|
%endif
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
%macro FILTER_16x4 0
|
%macro STORE_PIXELS_W8 2
|
||||||
|
%if ps == 2
|
||||||
|
movu [%1], xm%2
|
||||||
|
%else
|
||||||
|
movq [%1], xm%2
|
||||||
|
%endif
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
; STORE_PIXELS_W4(dst, src, offset)
|
||||||
|
%macro STORE_PIXELS_W4 3
|
||||||
|
%if ps == 2
|
||||||
|
movq [%1 + %3 * ps], xm%2
|
||||||
|
%else
|
||||||
|
movd [%1 + %3], xm%2
|
||||||
|
%endif
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
%macro STORE_PIXELS_W8LE 3
|
||||||
|
cmp %3, 8
|
||||||
|
jl .w4
|
||||||
|
STORE_PIXELS_W8 %1, %2
|
||||||
|
cmp %3, 12
|
||||||
|
%if ps == 2
|
||||||
|
vpermq m%2, m%2, q0302
|
||||||
|
%else
|
||||||
|
vpermq m%2, m%2, q0101
|
||||||
|
%endif
|
||||||
|
jl .end
|
||||||
|
STORE_PIXELS_W4 %1, %2, 8
|
||||||
|
jmp .end
|
||||||
|
.w4:
|
||||||
|
STORE_PIXELS_W4 %1, %2, 0
|
||||||
|
.end:
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
; STORE_PIXELS(dst, src, width)
|
||||||
|
%macro STORE_PIXELS 3
|
||||||
|
%if ps == 1
|
||||||
|
packuswb m%2, m%2
|
||||||
|
vpermq m%2, m%2, 0x8
|
||||||
|
%endif
|
||||||
|
|
||||||
|
%ifidn %3, 16
|
||||||
|
STORE_PIXELS_W16 %1, %2
|
||||||
|
%else
|
||||||
|
%if LUMA
|
||||||
|
STORE_PIXELS_W8 %1, %2
|
||||||
|
%else
|
||||||
|
STORE_PIXELS_W8LE %1, %2, %3
|
||||||
|
%endif
|
||||||
|
%endif
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
%macro FILTER_16x4 1
|
||||||
%if LUMA
|
%if LUMA
|
||||||
push clipq
|
push clipq
|
||||||
push strideq
|
push strideq
|
||||||
@ -362,7 +413,7 @@ SECTION .text
|
|||||||
; clip to pixel
|
; clip to pixel
|
||||||
CLIPW m0, m14, m15
|
CLIPW m0, m14, m15
|
||||||
|
|
||||||
STORE_PIXELS [dstq], 0
|
STORE_PIXELS dstq, 0, %1
|
||||||
|
|
||||||
lea srcq, [srcq + src_strideq]
|
lea srcq, [srcq + src_strideq]
|
||||||
lea dstq, [dstq + dst_strideq]
|
lea dstq, [dstq + dst_strideq]
|
||||||
@ -399,7 +450,7 @@ SECTION .text
|
|||||||
; const uint8_t *src, ptrdiff_t src_stride, const ptrdiff_t width, const ptrdiff_t height,
|
; const uint8_t *src, ptrdiff_t src_stride, const ptrdiff_t width, const ptrdiff_t height,
|
||||||
; const int16_t *filter, const int16_t *clip, ptrdiff_t stride, ptrdiff_t vb_pos, ptrdiff_t pixel_max);
|
; const int16_t *filter, const int16_t *clip, ptrdiff_t stride, ptrdiff_t vb_pos, ptrdiff_t pixel_max);
|
||||||
; ******************************
|
; ******************************
|
||||||
cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 16, 0-0x28, dst, dst_stride, src, src_stride, width, height, filter, clip, stride, vb_pos, pixel_max, \
|
cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 16, 0-0x30, dst, dst_stride, src, src_stride, width, height, filter, clip, stride, vb_pos, pixel_max, \
|
||||||
offset, x, s5, s6
|
offset, x, s5, s6
|
||||||
%define ps (%1 / 8) ; pixel size
|
%define ps (%1 / 8) ; pixel size
|
||||||
movd xm15, pixel_maxd
|
movd xm15, pixel_maxd
|
||||||
@ -409,18 +460,32 @@ cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 16, 0-0x28, dst, dst_stride, src, src_s
|
|||||||
.loop:
|
.loop:
|
||||||
push srcq
|
push srcq
|
||||||
push dstq
|
push dstq
|
||||||
|
push widthq
|
||||||
xor xq, xq
|
xor xq, xq
|
||||||
|
|
||||||
.loop_w:
|
.loop_w:
|
||||||
|
cmp widthq, 16
|
||||||
|
jl .loop_w_end
|
||||||
|
|
||||||
LOAD_PARAMS
|
LOAD_PARAMS
|
||||||
FILTER_16x4
|
FILTER_16x4 16
|
||||||
|
|
||||||
add srcq, 16 * ps
|
add srcq, 16 * ps
|
||||||
add dstq, 16 * ps
|
add dstq, 16 * ps
|
||||||
add xq, 16
|
add xq, 16
|
||||||
cmp xq, widthq
|
sub widthq, 16
|
||||||
jl .loop_w
|
jmp .loop_w
|
||||||
|
|
||||||
|
.loop_w_end:
|
||||||
|
cmp widthq, 0
|
||||||
|
je .w_end
|
||||||
|
|
||||||
|
LOAD_PARAMS
|
||||||
|
FILTER_16x4 widthq
|
||||||
|
|
||||||
|
.w_end:
|
||||||
|
|
||||||
|
pop widthq
|
||||||
pop dstq
|
pop dstq
|
||||||
pop srcq
|
pop srcq
|
||||||
lea srcq, [srcq + 4 * src_strideq]
|
lea srcq, [srcq + 4 * src_strideq]
|
||||||
|
Loading…
Reference in New Issue
Block a user