From 42d622fab31dbcae91fd4b0810e0ac07027e2df1 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Fri, 8 Jul 2011 14:52:52 -0700 Subject: [PATCH 01/22] swscale: fix 16-bit scaling when output is 8-bits. We would use the second half of the U plane buffer, rather than the V plane buffer, to output the V plane pixels. --- libswscale/utils.c | 2 +- libswscale/x86/swscale_template.c | 24 ++++++++++++------------ tests/ref/lavfi/pixdesc | 4 ++-- tests/ref/lavfi/pixfmts_copy | 4 ++-- tests/ref/lavfi/pixfmts_crop | 4 ++-- tests/ref/lavfi/pixfmts_hflip | 4 ++-- tests/ref/lavfi/pixfmts_null | 4 ++-- tests/ref/lavfi/pixfmts_scale | 4 ++-- tests/ref/lavfi/pixfmts_vflip | 4 ++-- 9 files changed, 27 insertions(+), 27 deletions(-) diff --git a/libswscale/utils.c b/libswscale/utils.c index 9f0bb7a8b9..fd10fa03fb 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c @@ -1053,7 +1053,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter) for (i=0; ivChrBufSize; i++) { FF_ALLOC_OR_GOTO(c, c->chrUPixBuf[i+c->vChrBufSize], dst_stride*2+1, fail); c->chrUPixBuf[i] = c->chrUPixBuf[i+c->vChrBufSize]; - c->chrVPixBuf[i] = c->chrVPixBuf[i+c->vChrBufSize] = c->chrUPixBuf[i] + dst_stride_px; + c->chrVPixBuf[i] = c->chrVPixBuf[i+c->vChrBufSize] = c->chrUPixBuf[i] + (dst_stride >> 1); } if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) for (i=0; ivLumBufSize; i++) { diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c index f58ac520e1..26cd2742a3 100644 --- a/libswscale/x86/swscale_template.c +++ b/libswscale/x86/swscale_template.c @@ -81,7 +81,7 @@ static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, *aDest = CONFIG_SWSCALE_ALPHA ? 
dest[3] : NULL; if (uDest) { - x86_reg uv_off = c->uv_off; + x86_reg uv_off = c->uv_offx2 >> 1; YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0) YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off) } @@ -164,7 +164,7 @@ static void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter, *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL; if (uDest) { - x86_reg uv_off = c->uv_off; + x86_reg uv_off = c->uv_offx2 >> 1; YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0) YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off) } @@ -473,7 +473,7 @@ static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off << 1; + x86_reg uv_off = c->uv_offx2; if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { YSCALEYUV2PACKEDX_ACCURATE @@ -506,7 +506,7 @@ static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off << 1; + x86_reg uv_off = c->uv_offx2; if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { YSCALEYUV2PACKEDX @@ -563,7 +563,7 @@ static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off << 1; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX_ACCURATE YSCALEYUV2RGBX @@ -587,7 +587,7 @@ static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off << 1; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX YSCALEYUV2RGBX @@ -640,7 +640,7 @@ static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off << 1; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX_ACCURATE YSCALEYUV2RGBX @@ -664,7 +664,7 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter, { 
x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off << 1; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX YSCALEYUV2RGBX @@ -797,7 +797,7 @@ static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off << 1; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX_ACCURATE YSCALEYUV2RGBX @@ -821,7 +821,7 @@ static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off << 1; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX YSCALEYUV2RGBX @@ -862,7 +862,7 @@ static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off << 1; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX_ACCURATE /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ @@ -883,7 +883,7 @@ static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off << 1; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ diff --git a/tests/ref/lavfi/pixdesc b/tests/ref/lavfi/pixdesc index 8695b1dea2..ee28c4b78f 100644 --- a/tests/ref/lavfi/pixdesc +++ b/tests/ref/lavfi/pixdesc @@ -42,8 +42,8 @@ yuv422p16be 86ad3447f97969ce095afeef81fa9abf yuv422p16le a53a9b451f4a81eeae33362c1bbd07f8 yuv440p 5a064afe2b453bb52cdb3f176b1aa1cf yuv444p 0a98447b78fd476aa39686da6a74fa2e -yuv444p16be 95db370ae765dd3d10b7def14704ae73 -yuv444p16le 36b969179b5ad9d312a0d1e1cd6bc402 +yuv444p16be 12554ba5f143126dc5e886b9a8be37e9 +yuv444p16le 35872903aefd0f545255a4452ccc262e yuva420p a29884f3f3dfe1e00b961bc17bef3d47 yuvj420p 32eec78ba51857b16ce9b813a49b7189 yuvj422p 0dfa0ed434f73be51428758c69e082cb diff --git a/tests/ref/lavfi/pixfmts_copy b/tests/ref/lavfi/pixfmts_copy index 8695b1dea2..ee28c4b78f 100644 --- a/tests/ref/lavfi/pixfmts_copy +++ 
b/tests/ref/lavfi/pixfmts_copy @@ -42,8 +42,8 @@ yuv422p16be 86ad3447f97969ce095afeef81fa9abf yuv422p16le a53a9b451f4a81eeae33362c1bbd07f8 yuv440p 5a064afe2b453bb52cdb3f176b1aa1cf yuv444p 0a98447b78fd476aa39686da6a74fa2e -yuv444p16be 95db370ae765dd3d10b7def14704ae73 -yuv444p16le 36b969179b5ad9d312a0d1e1cd6bc402 +yuv444p16be 12554ba5f143126dc5e886b9a8be37e9 +yuv444p16le 35872903aefd0f545255a4452ccc262e yuva420p a29884f3f3dfe1e00b961bc17bef3d47 yuvj420p 32eec78ba51857b16ce9b813a49b7189 yuvj422p 0dfa0ed434f73be51428758c69e082cb diff --git a/tests/ref/lavfi/pixfmts_crop b/tests/ref/lavfi/pixfmts_crop index c7b59527fb..fb5c838a4e 100644 --- a/tests/ref/lavfi/pixfmts_crop +++ b/tests/ref/lavfi/pixfmts_crop @@ -33,8 +33,8 @@ yuv422p16be 6647fe1c381c148f8207c988c0e22bf0 yuv422p16le e1548c9dc51202db38a9625c8954203f yuv440p 2472417d980e395ad6843cbb8b633b29 yuv444p 1f151980486848c96bc5585ced99003e -yuv444p16be ac3b159f8c858fcdf475a8c024ee79b6 -yuv444p16le 9a6863bfc5faee206065c11dc994bf0c +yuv444p16be 02d78b564a23df2f68cf6895d3bfe6bf +yuv444p16le cbea9591b954ea31d6a0cb25a9aed599 yuva420p 7536753dfbc7932560fb50c921369a0e yuvj420p 21f891093006d42d7683b0e1d773a657 yuvj422p 9a43d474c407590ad8f213880586b45e diff --git a/tests/ref/lavfi/pixfmts_hflip b/tests/ref/lavfi/pixfmts_hflip index 7f4342f35c..77102524b4 100644 --- a/tests/ref/lavfi/pixfmts_hflip +++ b/tests/ref/lavfi/pixfmts_hflip @@ -33,8 +33,8 @@ yuv422p16be 9dbe0af0eb877987611cf04bfa577202 yuv422p16le 2d8f37231110177cc5e1b61c8cb4b163 yuv440p a99e2b57ed601f39852715c9d675d0d3 yuv444p 947e47f7bb5fdccc659d19b7df2b6fc3 -yuv444p16be debc96a7ec4fec0a412f9d8995bc48a2 -yuv444p16le 5b5e1348a631fc2206bb7ff851a52687 +yuv444p16be cdc7bfb08b8286d05d6a639d1bfc0d26 +yuv444p16le 969caecaaca795477874420540d21e8b yuva420p d83ec0c01498189f179ec574918185f1 yuvj420p df3aaaec3bb157c3bde5f0365af30f4f yuvj422p d113871528d510a192797af59df9c05c diff --git a/tests/ref/lavfi/pixfmts_null b/tests/ref/lavfi/pixfmts_null index 8695b1dea2..ee28c4b78f 
100644 --- a/tests/ref/lavfi/pixfmts_null +++ b/tests/ref/lavfi/pixfmts_null @@ -42,8 +42,8 @@ yuv422p16be 86ad3447f97969ce095afeef81fa9abf yuv422p16le a53a9b451f4a81eeae33362c1bbd07f8 yuv440p 5a064afe2b453bb52cdb3f176b1aa1cf yuv444p 0a98447b78fd476aa39686da6a74fa2e -yuv444p16be 95db370ae765dd3d10b7def14704ae73 -yuv444p16le 36b969179b5ad9d312a0d1e1cd6bc402 +yuv444p16be 12554ba5f143126dc5e886b9a8be37e9 +yuv444p16le 35872903aefd0f545255a4452ccc262e yuva420p a29884f3f3dfe1e00b961bc17bef3d47 yuvj420p 32eec78ba51857b16ce9b813a49b7189 yuvj422p 0dfa0ed434f73be51428758c69e082cb diff --git a/tests/ref/lavfi/pixfmts_scale b/tests/ref/lavfi/pixfmts_scale index 8901fe9132..0cb27ef387 100644 --- a/tests/ref/lavfi/pixfmts_scale +++ b/tests/ref/lavfi/pixfmts_scale @@ -42,8 +42,8 @@ yuv422p16be 837945d3a771366a5a72a4ed095a4f53 yuv422p16le b8292ae9b52eb7afc3d8b93e8fd895b4 yuv440p 461503fdb9b90451020aa3b25ddf041c yuv444p 81b2eba962d12e8d64f003ac56f6faf2 -yuv444p16be cc7460f76477aa4b4c33442f67c06a89 -yuv444p16le 9a5ed60d68c0a4a5155f9d376174cdf7 +yuv444p16be fe2c4a3708c4f44a2d91f3c413f33caf +yuv444p16le 1b4fce808e546cd75ef01cdb91da26b5 yuva420p 8673a9131fb47de69788863f93a50eb7 yuvj420p 30427bd6caf5bda93a173dbebe759e09 yuvj422p fc8288f64fd149573f73cf8da05d8e6d diff --git a/tests/ref/lavfi/pixfmts_vflip b/tests/ref/lavfi/pixfmts_vflip index 89502d1745..acead3d990 100644 --- a/tests/ref/lavfi/pixfmts_vflip +++ b/tests/ref/lavfi/pixfmts_vflip @@ -42,8 +42,8 @@ yuv422p16be 8cdfbddf2dd4c44c3efef4ee00170eba yuv422p16le a2f421f6a1af950544081c1797de01ae yuv440p 876385e96165acf51271b20e5d85a416 yuv444p 9c3c667d1613b72d15bc6d851c5eb8f7 -yuv444p16be b092690d22f0b26360fbf5cfd739be17 -yuv444p16le 18768b4ddca92d06f9713fef467276a9 +yuv444p16be c73d2f57f90060d4126241aba04876d3 +yuv444p16le c9ab60ec1ae3ff50da524e83e3553add yuva420p c705d1cf061d8c6580ac690b55f92276 yuvj420p 41fd02b204da0ab62452cd14b595e2e4 yuvj422p 7f6ca9bc1812cde02036d7d29a7cce43 From f44d50a94c120135faeba6b4a1e5551b4397810f Mon Sep 17 
00:00:00 2001 From: "Ronald S. Bultje" Date: Fri, 8 Jul 2011 14:54:31 -0700 Subject: [PATCH 02/22] swscale: fix 16-bit horizontal scaling underflow. When using e.g. lanczos scaling, values can drop below 0, so they should never be unsigned. --- libswscale/swscale.c | 2 +- tests/ref/lavfi/pixdesc | 4 ++-- tests/ref/lavfi/pixfmts_copy | 4 ++-- tests/ref/lavfi/pixfmts_hflip | 4 ++-- tests/ref/lavfi/pixfmts_null | 4 ++-- tests/ref/lavfi/pixfmts_scale | 24 ++++++++++++------------ tests/ref/lavfi/pixfmts_vflip | 4 ++-- 7 files changed, 23 insertions(+), 23 deletions(-) diff --git a/libswscale/swscale.c b/libswscale/swscale.c index 14807fdc46..a302373b85 100644 --- a/libswscale/swscale.c +++ b/libswscale/swscale.c @@ -1854,7 +1854,7 @@ static void hScale16_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t *_s for (i = 0; i < dstW; i++) { int j; int srcPos = filterPos[i]; - unsigned int val = 0; + int val = 0; for (j = 0; j < filterSize; j++) { val += src[srcPos + j] * filter[filterSize * i + j]; diff --git a/tests/ref/lavfi/pixdesc b/tests/ref/lavfi/pixdesc index ee28c4b78f..76d845abb8 100644 --- a/tests/ref/lavfi/pixdesc +++ b/tests/ref/lavfi/pixdesc @@ -42,8 +42,8 @@ yuv422p16be 86ad3447f97969ce095afeef81fa9abf yuv422p16le a53a9b451f4a81eeae33362c1bbd07f8 yuv440p 5a064afe2b453bb52cdb3f176b1aa1cf yuv444p 0a98447b78fd476aa39686da6a74fa2e -yuv444p16be 12554ba5f143126dc5e886b9a8be37e9 -yuv444p16le 35872903aefd0f545255a4452ccc262e +yuv444p16be 2a8e2bddfe4c208df4119aaa7dc5db28 +yuv444p16le f2117fc9cf66d3a832183e159ff71803 yuva420p a29884f3f3dfe1e00b961bc17bef3d47 yuvj420p 32eec78ba51857b16ce9b813a49b7189 yuvj422p 0dfa0ed434f73be51428758c69e082cb diff --git a/tests/ref/lavfi/pixfmts_copy b/tests/ref/lavfi/pixfmts_copy index ee28c4b78f..76d845abb8 100644 --- a/tests/ref/lavfi/pixfmts_copy +++ b/tests/ref/lavfi/pixfmts_copy @@ -42,8 +42,8 @@ yuv422p16be 86ad3447f97969ce095afeef81fa9abf yuv422p16le a53a9b451f4a81eeae33362c1bbd07f8 yuv440p 5a064afe2b453bb52cdb3f176b1aa1cf 
yuv444p 0a98447b78fd476aa39686da6a74fa2e -yuv444p16be 12554ba5f143126dc5e886b9a8be37e9 -yuv444p16le 35872903aefd0f545255a4452ccc262e +yuv444p16be 2a8e2bddfe4c208df4119aaa7dc5db28 +yuv444p16le f2117fc9cf66d3a832183e159ff71803 yuva420p a29884f3f3dfe1e00b961bc17bef3d47 yuvj420p 32eec78ba51857b16ce9b813a49b7189 yuvj422p 0dfa0ed434f73be51428758c69e082cb diff --git a/tests/ref/lavfi/pixfmts_hflip b/tests/ref/lavfi/pixfmts_hflip index 77102524b4..4a32e35a1c 100644 --- a/tests/ref/lavfi/pixfmts_hflip +++ b/tests/ref/lavfi/pixfmts_hflip @@ -33,8 +33,8 @@ yuv422p16be 9dbe0af0eb877987611cf04bfa577202 yuv422p16le 2d8f37231110177cc5e1b61c8cb4b163 yuv440p a99e2b57ed601f39852715c9d675d0d3 yuv444p 947e47f7bb5fdccc659d19b7df2b6fc3 -yuv444p16be cdc7bfb08b8286d05d6a639d1bfc0d26 -yuv444p16le 969caecaaca795477874420540d21e8b +yuv444p16be 38cdb28061ebb8e7aa3797238615e77f +yuv444p16le 80d654986d6f3754e924310a045fdb24 yuva420p d83ec0c01498189f179ec574918185f1 yuvj420p df3aaaec3bb157c3bde5f0365af30f4f yuvj422p d113871528d510a192797af59df9c05c diff --git a/tests/ref/lavfi/pixfmts_null b/tests/ref/lavfi/pixfmts_null index ee28c4b78f..76d845abb8 100644 --- a/tests/ref/lavfi/pixfmts_null +++ b/tests/ref/lavfi/pixfmts_null @@ -42,8 +42,8 @@ yuv422p16be 86ad3447f97969ce095afeef81fa9abf yuv422p16le a53a9b451f4a81eeae33362c1bbd07f8 yuv440p 5a064afe2b453bb52cdb3f176b1aa1cf yuv444p 0a98447b78fd476aa39686da6a74fa2e -yuv444p16be 12554ba5f143126dc5e886b9a8be37e9 -yuv444p16le 35872903aefd0f545255a4452ccc262e +yuv444p16be 2a8e2bddfe4c208df4119aaa7dc5db28 +yuv444p16le f2117fc9cf66d3a832183e159ff71803 yuva420p a29884f3f3dfe1e00b961bc17bef3d47 yuvj420p 32eec78ba51857b16ce9b813a49b7189 yuvj422p 0dfa0ed434f73be51428758c69e082cb diff --git a/tests/ref/lavfi/pixfmts_scale b/tests/ref/lavfi/pixfmts_scale index 0cb27ef387..14c748ff10 100644 --- a/tests/ref/lavfi/pixfmts_scale +++ b/tests/ref/lavfi/pixfmts_scale @@ -11,8 +11,8 @@ bgr565le 3a514a298c6161a071ddf9963c06509d bgr8 7f007fa6c153a16e808a9c51605a4016 bgra 
a5e7040f9a80cccd65e5acf2ca09ace5 gray d7786a7d9d99ac74230cc045cab5632c -gray16be bba98532da29a31599df2feec3b08e3e -gray16le 30267f127d5734c4767f3944f1729a33 +gray16be b554d6c1cc8da23967445be4dd3e4a86 +gray16le 715a33aa1c19cb26b14f5cc000e7a3d1 monob 88c4c050758e64d120f50c7eff694381 monow d31772ebaa877fc2a78565937f7f9673 nv12 4676d59db43d657dc12841f6bc3ab452 @@ -31,19 +31,19 @@ uyvy422 314bd486277111a95d9369b944fa0400 yuv410p 7df8f6d69b56a8dcb6c7ee908e5018b5 yuv411p 1143e7c5cc28fe0922b051b17733bc4c yuv420p fdad2d8df8985e3d17e73c71f713cb14 -yuv420p10be 2343beaf83fccc2ab23a590b2049d38b -yuv420p10le 94d511d783d175f573e7be5cce75ba4d -yuv420p16be f6ef3ba90f238b467c7e72ade927083d -yuv420p16le faf6aab3b1c16e8afbe160686dd360e0 -yuv420p9be fdafb9ad473a559246c4cb0a1f416cd8 -yuv420p9le fccfd3c3941da635b13739f579819b5a +yuv420p10be aad747a7634ba4ed48f149cdfc78792e +yuv420p10le 46547f19a7f58638dff73657477b11b9 +yuv420p16be d7270efce54eb59c7b01c14157a1b890 +yuv420p16le e85abf00bad940a922b623c91c9026d7 +yuv420p9be a073b2d93b2a7dce2069ba252bc43175 +yuv420p9le b67233c3c7d93763d07d88f697c145e1 yuv422p 918e37701ee7377d16a8a6c119c56a40 -yuv422p16be 837945d3a771366a5a72a4ed095a4f53 -yuv422p16le b8292ae9b52eb7afc3d8b93e8fd895b4 +yuv422p16be 5cd8fe1da161d21b65bf75bf4cb50c75 +yuv422p16le 3b545317b1f5e627751525fb2958d88e yuv440p 461503fdb9b90451020aa3b25ddf041c yuv444p 81b2eba962d12e8d64f003ac56f6faf2 -yuv444p16be fe2c4a3708c4f44a2d91f3c413f33caf -yuv444p16le 1b4fce808e546cd75ef01cdb91da26b5 +yuv444p16be 2bd3e992d1533f8e0978a8e0c7008243 +yuv444p16le daee5a461ae3bc53295c392e423aa1d7 yuva420p 8673a9131fb47de69788863f93a50eb7 yuvj420p 30427bd6caf5bda93a173dbebe759e09 yuvj422p fc8288f64fd149573f73cf8da05d8e6d diff --git a/tests/ref/lavfi/pixfmts_vflip b/tests/ref/lavfi/pixfmts_vflip index acead3d990..b440efa26b 100644 --- a/tests/ref/lavfi/pixfmts_vflip +++ b/tests/ref/lavfi/pixfmts_vflip @@ -42,8 +42,8 @@ yuv422p16be 8cdfbddf2dd4c44c3efef4ee00170eba yuv422p16le a2f421f6a1af950544081c1797de01ae 
yuv440p 876385e96165acf51271b20e5d85a416 yuv444p 9c3c667d1613b72d15bc6d851c5eb8f7 -yuv444p16be c73d2f57f90060d4126241aba04876d3 -yuv444p16le c9ab60ec1ae3ff50da524e83e3553add +yuv444p16be bb86de32c67dd49469989ac184b89592 +yuv444p16le 64b899f52d820b14d05ff95954c15790 yuva420p c705d1cf061d8c6580ac690b55f92276 yuvj420p 41fd02b204da0ab62452cd14b595e2e4 yuvj422p 7f6ca9bc1812cde02036d7d29a7cce43 From bf2cba453244a74331238a472fe0e309f116f4d9 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Fri, 8 Jul 2011 12:28:28 -0700 Subject: [PATCH 03/22] swscale: fix crash in 8-bpc bilinear output without alpha. We accessed the alpha array even when it wasn't used and didn't exist, hence leading to a NULL pointer segfault. --- libswscale/swscale.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libswscale/swscale.c b/libswscale/swscale.c index a302373b85..527dd80e0c 100644 --- a/libswscale/swscale.c +++ b/libswscale/swscale.c @@ -1136,7 +1136,8 @@ yuv2rgb_2_c_template(SwsContext *c, const int16_t *buf[2], const int16_t *buf0 = buf[0], *buf1 = buf[1], *ubuf0 = ubuf[0], *ubuf1 = ubuf[1], *vbuf0 = vbuf[0], *vbuf1 = vbuf[1], - *abuf0 = abuf[0], *abuf1 = abuf[1]; + *abuf0 = hasAlpha ? abuf[0] : NULL, + *abuf1 = hasAlpha ? abuf[1] : NULL; int yalpha1 = 4095 - yalpha; int uvalpha1 = 4095 - uvalpha; int i; From 7d7bacf0f1f43f5cc112977fc3b8438e3e4a4a9b Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Fri, 8 Jul 2011 14:57:07 -0700 Subject: [PATCH 04/22] swscale: fix overflow in 16-bit vertical scaling. We operated on 31-bits, but with e.g. lanczos scaling, values can add up to beyond 0x80000000, thus leading to output of zeroes. Dropping one bit of precision fixes this. 
--- libswscale/swscale.c | 18 +++++++++--------- tests/ref/lavfi/pixdesc | 8 ++++---- tests/ref/lavfi/pixfmts_copy | 8 ++++---- tests/ref/lavfi/pixfmts_crop | 8 ++++---- tests/ref/lavfi/pixfmts_hflip | 8 ++++---- tests/ref/lavfi/pixfmts_null | 8 ++++---- tests/ref/lavfi/pixfmts_scale | 16 ++++++++-------- tests/ref/lavfi/pixfmts_vflip | 8 ++++---- 8 files changed, 41 insertions(+), 41 deletions(-) diff --git a/libswscale/swscale.c b/libswscale/swscale.c index 527dd80e0c..db4d231e13 100644 --- a/libswscale/swscale.c +++ b/libswscale/swscale.c @@ -195,7 +195,7 @@ yuv2yuvX16_c_template(const int16_t *lumFilter, const int32_t **lumSrc, int i; uint16_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2], *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL; - int shift = 15 + 16 - output_bits; + int shift = 15 + 16 - output_bits - 1; #define output_pixel(pos, val) \ if (big_endian) { \ @@ -212,24 +212,24 @@ yuv2yuvX16_c_template(const int16_t *lumFilter, const int32_t **lumSrc, } \ } for (i = 0; i < dstW; i++) { - int val = 1 << (30-output_bits); + int val = 1 << (30-output_bits - 1); int j; for (j = 0; j < lumFilterSize; j++) - val += lumSrc[j][i] * lumFilter[j]; + val += (lumSrc[j][i] * lumFilter[j]) >> 1; output_pixel(&yDest[i], val); } if (uDest) { for (i = 0; i < chrDstW; i++) { - int u = 1 << (30-output_bits); - int v = 1 << (30-output_bits); + int u = 1 << (30-output_bits - 1); + int v = 1 << (30-output_bits - 1); int j; for (j = 0; j < chrFilterSize; j++) { - u += chrUSrc[j][i] * chrFilter[j]; - v += chrVSrc[j][i] * chrFilter[j]; + u += (chrUSrc[j][i] * chrFilter[j]) >> 1; + v += (chrVSrc[j][i] * chrFilter[j]) >> 1; } output_pixel(&uDest[i], u); @@ -239,11 +239,11 @@ yuv2yuvX16_c_template(const int16_t *lumFilter, const int32_t **lumSrc, if (CONFIG_SWSCALE_ALPHA && aDest) { for (i = 0; i < dstW; i++) { - int val = 1 << (30-output_bits); + int val = 1 << (30-output_bits - 1); int j; for (j = 0; j < lumFilterSize; j++) - val += alpSrc[j][i] * lumFilter[j]; + val += 
(alpSrc[j][i] * lumFilter[j]) >> 1; output_pixel(&aDest[i], val); } diff --git a/tests/ref/lavfi/pixdesc b/tests/ref/lavfi/pixdesc index 76d845abb8..a82f8ed9b6 100644 --- a/tests/ref/lavfi/pixdesc +++ b/tests/ref/lavfi/pixdesc @@ -38,12 +38,12 @@ yuv420p16le 2d59c4f1d0314a5a957a7cfc4b6fabcc yuv420p9be ce880fa07830e5297c22acf6e20555ce yuv420p9le 16543fda8f87d94a6cf857d2e8d4461a yuv422p c9bba4529821d796a6ab09f6a5fd355a -yuv422p16be 86ad3447f97969ce095afeef81fa9abf -yuv422p16le a53a9b451f4a81eeae33362c1bbd07f8 +yuv422p16be dc9886f2fccf87cc54b27e071a2c251e +yuv422p16le f181c8d8436f1233ba566d9bc88005ec yuv440p 5a064afe2b453bb52cdb3f176b1aa1cf yuv444p 0a98447b78fd476aa39686da6a74fa2e -yuv444p16be 2a8e2bddfe4c208df4119aaa7dc5db28 -yuv444p16le f2117fc9cf66d3a832183e159ff71803 +yuv444p16be af555dbaa401b142a995566864f47545 +yuv444p16le a803e8016997dad95c5b2a72f54c34d6 yuva420p a29884f3f3dfe1e00b961bc17bef3d47 yuvj420p 32eec78ba51857b16ce9b813a49b7189 yuvj422p 0dfa0ed434f73be51428758c69e082cb diff --git a/tests/ref/lavfi/pixfmts_copy b/tests/ref/lavfi/pixfmts_copy index 76d845abb8..a82f8ed9b6 100644 --- a/tests/ref/lavfi/pixfmts_copy +++ b/tests/ref/lavfi/pixfmts_copy @@ -38,12 +38,12 @@ yuv420p16le 2d59c4f1d0314a5a957a7cfc4b6fabcc yuv420p9be ce880fa07830e5297c22acf6e20555ce yuv420p9le 16543fda8f87d94a6cf857d2e8d4461a yuv422p c9bba4529821d796a6ab09f6a5fd355a -yuv422p16be 86ad3447f97969ce095afeef81fa9abf -yuv422p16le a53a9b451f4a81eeae33362c1bbd07f8 +yuv422p16be dc9886f2fccf87cc54b27e071a2c251e +yuv422p16le f181c8d8436f1233ba566d9bc88005ec yuv440p 5a064afe2b453bb52cdb3f176b1aa1cf yuv444p 0a98447b78fd476aa39686da6a74fa2e -yuv444p16be 2a8e2bddfe4c208df4119aaa7dc5db28 -yuv444p16le f2117fc9cf66d3a832183e159ff71803 +yuv444p16be af555dbaa401b142a995566864f47545 +yuv444p16le a803e8016997dad95c5b2a72f54c34d6 yuva420p a29884f3f3dfe1e00b961bc17bef3d47 yuvj420p 32eec78ba51857b16ce9b813a49b7189 yuvj422p 0dfa0ed434f73be51428758c69e082cb diff --git a/tests/ref/lavfi/pixfmts_crop 
b/tests/ref/lavfi/pixfmts_crop index fb5c838a4e..e3bb88c101 100644 --- a/tests/ref/lavfi/pixfmts_crop +++ b/tests/ref/lavfi/pixfmts_crop @@ -29,12 +29,12 @@ yuv420p bfea0188ddd4889787c403caae119cc7 yuv420p16be 8365eff38b8c329aeb95fc605fa229bb yuv420p16le 5e8dd38d973d5854abe1ad4efad20cc1 yuv422p f2f930a91fe00d4252c4720b5ecd8961 -yuv422p16be 6647fe1c381c148f8207c988c0e22bf0 -yuv422p16le e1548c9dc51202db38a9625c8954203f +yuv422p16be 93f9b6f33f9529db6de6a9f0ddd70eb5 +yuv422p16le 2e66dcfec54ca6b57aa4bbd9ac234639 yuv440p 2472417d980e395ad6843cbb8b633b29 yuv444p 1f151980486848c96bc5585ced99003e -yuv444p16be 02d78b564a23df2f68cf6895d3bfe6bf -yuv444p16le cbea9591b954ea31d6a0cb25a9aed599 +yuv444p16be e7d1ecf0c11a41b5db192f761f55bd3c +yuv444p16le 3298a0043d982e7cf1a33a1292fa11f0 yuva420p 7536753dfbc7932560fb50c921369a0e yuvj420p 21f891093006d42d7683b0e1d773a657 yuvj422p 9a43d474c407590ad8f213880586b45e diff --git a/tests/ref/lavfi/pixfmts_hflip b/tests/ref/lavfi/pixfmts_hflip index 4a32e35a1c..2084d581e1 100644 --- a/tests/ref/lavfi/pixfmts_hflip +++ b/tests/ref/lavfi/pixfmts_hflip @@ -29,12 +29,12 @@ yuv420p 2d5c80f9ba2ddd85b2aeda3564cc7d64 yuv420p16be 758b0c1e2113b15e7afde48da4e4d024 yuv420p16le 480ccd951dcb806bc875d307e02e50a0 yuv422p 6e728f4eb9eae287c224f396d84be6ea -yuv422p16be 9dbe0af0eb877987611cf04bfa577202 -yuv422p16le 2d8f37231110177cc5e1b61c8cb4b163 +yuv422p16be 8657d2c8d443940300fdb4028d555631 +yuv422p16le 4ab27609981e50de5b1150125718ae76 yuv440p a99e2b57ed601f39852715c9d675d0d3 yuv444p 947e47f7bb5fdccc659d19b7df2b6fc3 -yuv444p16be 38cdb28061ebb8e7aa3797238615e77f -yuv444p16le 80d654986d6f3754e924310a045fdb24 +yuv444p16be a5154ce329db0d2caf0bd43f1347dba3 +yuv444p16le 1f703308b90feb048191b3bccc695671 yuva420p d83ec0c01498189f179ec574918185f1 yuvj420p df3aaaec3bb157c3bde5f0365af30f4f yuvj422p d113871528d510a192797af59df9c05c diff --git a/tests/ref/lavfi/pixfmts_null b/tests/ref/lavfi/pixfmts_null index 76d845abb8..a82f8ed9b6 100644 --- a/tests/ref/lavfi/pixfmts_null 
+++ b/tests/ref/lavfi/pixfmts_null @@ -38,12 +38,12 @@ yuv420p16le 2d59c4f1d0314a5a957a7cfc4b6fabcc yuv420p9be ce880fa07830e5297c22acf6e20555ce yuv420p9le 16543fda8f87d94a6cf857d2e8d4461a yuv422p c9bba4529821d796a6ab09f6a5fd355a -yuv422p16be 86ad3447f97969ce095afeef81fa9abf -yuv422p16le a53a9b451f4a81eeae33362c1bbd07f8 +yuv422p16be dc9886f2fccf87cc54b27e071a2c251e +yuv422p16le f181c8d8436f1233ba566d9bc88005ec yuv440p 5a064afe2b453bb52cdb3f176b1aa1cf yuv444p 0a98447b78fd476aa39686da6a74fa2e -yuv444p16be 2a8e2bddfe4c208df4119aaa7dc5db28 -yuv444p16le f2117fc9cf66d3a832183e159ff71803 +yuv444p16be af555dbaa401b142a995566864f47545 +yuv444p16le a803e8016997dad95c5b2a72f54c34d6 yuva420p a29884f3f3dfe1e00b961bc17bef3d47 yuvj420p 32eec78ba51857b16ce9b813a49b7189 yuvj422p 0dfa0ed434f73be51428758c69e082cb diff --git a/tests/ref/lavfi/pixfmts_scale b/tests/ref/lavfi/pixfmts_scale index 14c748ff10..094e52462f 100644 --- a/tests/ref/lavfi/pixfmts_scale +++ b/tests/ref/lavfi/pixfmts_scale @@ -31,19 +31,19 @@ uyvy422 314bd486277111a95d9369b944fa0400 yuv410p 7df8f6d69b56a8dcb6c7ee908e5018b5 yuv411p 1143e7c5cc28fe0922b051b17733bc4c yuv420p fdad2d8df8985e3d17e73c71f713cb14 -yuv420p10be aad747a7634ba4ed48f149cdfc78792e -yuv420p10le 46547f19a7f58638dff73657477b11b9 -yuv420p16be d7270efce54eb59c7b01c14157a1b890 -yuv420p16le e85abf00bad940a922b623c91c9026d7 +yuv420p10be af5429f27b9f95bf955e795921c65cdc +yuv420p10le d0b47e6a8a44e6b5ca0fe4349a4e393b +yuv420p16be 9688e33e03b8c8275ab2fb1df0f06bee +yuv420p16le cba8b390ad5e7b8678e419b8ce79c008 yuv420p9be a073b2d93b2a7dce2069ba252bc43175 yuv420p9le b67233c3c7d93763d07d88f697c145e1 yuv422p 918e37701ee7377d16a8a6c119c56a40 -yuv422p16be 5cd8fe1da161d21b65bf75bf4cb50c75 -yuv422p16le 3b545317b1f5e627751525fb2958d88e +yuv422p16be 2cf502d7d386db1f1b3b946679d897b1 +yuv422p16le 3002a4e47520731dcee5929aff49eb74 yuv440p 461503fdb9b90451020aa3b25ddf041c yuv444p 81b2eba962d12e8d64f003ac56f6faf2 -yuv444p16be 2bd3e992d1533f8e0978a8e0c7008243 -yuv444p16le 
daee5a461ae3bc53295c392e423aa1d7 +yuv444p16be b9f051ce7335923fe33efd162e48da1d +yuv444p16le fa47e317efac988b4a7fa55141c89126 yuva420p 8673a9131fb47de69788863f93a50eb7 yuvj420p 30427bd6caf5bda93a173dbebe759e09 yuvj422p fc8288f64fd149573f73cf8da05d8e6d diff --git a/tests/ref/lavfi/pixfmts_vflip b/tests/ref/lavfi/pixfmts_vflip index b440efa26b..8702eca796 100644 --- a/tests/ref/lavfi/pixfmts_vflip +++ b/tests/ref/lavfi/pixfmts_vflip @@ -38,12 +38,12 @@ yuv420p16le 0f609e588e5a258644ef85170d70e030 yuv420p9be be40ec975fb2873891643cbbbddbc3b0 yuv420p9le 7e606310d3f5ff12badf911e8f333471 yuv422p d7f5cb44d9b0210d66d6a8762640ab34 -yuv422p16be 8cdfbddf2dd4c44c3efef4ee00170eba -yuv422p16le a2f421f6a1af950544081c1797de01ae +yuv422p16be 51d9aa4e78d121c226d919ce97976fe4 +yuv422p16le 12965c54bda8932ca72da194419a9908 yuv440p 876385e96165acf51271b20e5d85a416 yuv444p 9c3c667d1613b72d15bc6d851c5eb8f7 -yuv444p16be bb86de32c67dd49469989ac184b89592 -yuv444p16le 64b899f52d820b14d05ff95954c15790 +yuv444p16be 6502abd75030d462c58d99a8673ec517 +yuv444p16le cd7e88b6d08425450a57555bc86ab210 yuva420p c705d1cf061d8c6580ac690b55f92276 yuvj420p 41fd02b204da0ab62452cd14b595e2e4 yuvj422p 7f6ca9bc1812cde02036d7d29a7cce43 From 4e3e333a79272944b40695166438359b376d7864 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Tue, 5 Jul 2011 12:49:11 -0700 Subject: [PATCH 05/22] swscale: error dithering for 16/9/10-bit to 8-bit. Based on a somewhat similar idea in FFmpeg's swscale copy. 
--- libswscale/swscale.c | 49 ++++++++--- libswscale/swscale_internal.h | 6 ++ libswscale/x86/swscale_template.c | 135 ++++++++++++++++++++++++++---- 3 files changed, 160 insertions(+), 30 deletions(-) diff --git a/libswscale/swscale.c b/libswscale/swscale.c index db4d231e13..dd9f4a108f 100644 --- a/libswscale/swscale.c +++ b/libswscale/swscale.c @@ -182,6 +182,18 @@ DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={ { 77, 23, 60, 15, 72, 21, 56, 14, }, }; #endif +DECLARE_ALIGNED(8, const uint8_t, dither_8x8_128)[8][8] = { +{ 36, 68, 60, 92, 34, 66, 58, 90,}, +{ 100, 4,124, 28, 98, 2,122, 26,}, +{ 52, 84, 44, 76, 50, 82, 42, 74,}, +{ 116, 20,108, 12,114, 18,106, 10,}, +{ 32, 64, 56, 88, 38, 70, 62, 94,}, +{ 96, 0,120, 24,102, 6,126, 30,}, +{ 48, 80, 40, 72, 54, 86, 46, 78,}, +{ 112, 16,104, 8,118, 22,110, 14,}, +}; +DECLARE_ALIGNED(8, const uint8_t, ff_sws_pb_64)[8] = +{ 64, 64, 64, 64, 64, 64, 64, 64 }; static av_always_inline void yuv2yuvX16_c_template(const int16_t *lumFilter, const int32_t **lumSrc, @@ -285,10 +297,11 @@ static void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter, uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2], *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL; int i; + const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8; //FIXME Optimize (just quickly written not optimized..) 
for (i=0; ilumDither8, *chrDither = c->chrDither8; for (i=0; i>7; + int val = (lumSrc[i]+ lumDither[i & 7]) >> 7; yDest[i]= av_clip_uint8(val); } if (uDest) for (i=0; i>7; - int v=(chrVSrc[i]+64)>>7; + int u = (chrUSrc[i] + chrDither[i & 7]) >> 7; + int v = (chrVSrc[i] + chrDither[(i + 3) & 7]) >> 7; uDest[i]= av_clip_uint8(u); vDest[i]= av_clip_uint8(v); } if (CONFIG_SWSCALE_ALPHA && aDest) for (i=0; i>7; + int val = (alpSrc[i] + lumDither[i & 7]) >> 7; aDest[i]= av_clip_uint8(val); } } @@ -359,11 +373,12 @@ static void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter, { uint8_t *yDest = dest[0], *uDest = dest[1]; enum PixelFormat dstFormat = c->dstFormat; + const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8; //FIXME Optimize (just quickly written not optimized..) int i; for (i=0; iyuv2packed1; yuv2packed2_fn yuv2packed2 = c->yuv2packed2; yuv2packedX_fn yuv2packedX = c->yuv2packedX; + int should_dither = is9_OR_10BPS(c->srcFormat) || is16BPS(c->srcFormat); /* vars which will change and which we need to store back in the context */ int dstY= c->dstY; @@ -2401,6 +2417,9 @@ static int swScale(SwsContext *c, const uint8_t* src[], lastInChrBuf= -1; } + if (!should_dither) { + c->chrDither8 = c->lumDither8 = ff_sws_pb_64; + } lastDstY= dstY; for (;dstY < dstH; dstY++) { @@ -2490,6 +2509,10 @@ static int swScale(SwsContext *c, const uint8_t* src[], #if HAVE_MMX updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf); #endif + if (should_dither) { + c->chrDither8 = dither_8x8_128[chrDstY & 7]; + c->lumDither8 = dither_8x8_128[dstY & 7]; + } if (dstY >= dstH-2) { // hmm looks like we can't use MMX here without overwriting this array's tail find_c_packed_planar_out_funcs(c, &yuv2yuv1, &yuv2yuvX, diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h index b3698a3d94..efb8aff088 100644 --- a/libswscale/swscale_internal.h +++ b/libswscale/swscale_internal.h @@ -321,6 +321,8 @@ typedef struct SwsContext { 
#define ALP_MMX_FILTER_OFFSET "11*8+4*4*256*2+48" #define UV_OFF "11*8+4*4*256*3+48" #define UV_OFFx2 "11*8+4*4*256*3+56" +#define DITHER16 "11*8+4*4*256*3+64" +#define DITHER32 "11*8+4*4*256*3+80" DECLARE_ALIGNED(8, uint64_t, redDither); DECLARE_ALIGNED(8, uint64_t, greenDither); @@ -345,6 +347,10 @@ typedef struct SwsContext { int32_t alpMmxFilter[4*MAX_FILTER_SIZE]; DECLARE_ALIGNED(8, ptrdiff_t, uv_off); ///< offset (in pixels) between u and v planes DECLARE_ALIGNED(8, ptrdiff_t, uv_offx2); ///< offset (in bytes) between u and v planes + uint16_t dither16[8]; + uint32_t dither32[8]; + + const uint8_t *chrDither8, *lumDither8; #if HAVE_ALTIVEC vector signed short CY; diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c index 26cd2742a3..fd6ec3a793 100644 --- a/libswscale/x86/swscale_template.c +++ b/libswscale/x86/swscale_template.c @@ -37,8 +37,8 @@ #define YSCALEYUV2YV12X(offset, dest, end, pos) \ __asm__ volatile(\ - "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\ - "movq %%mm3, %%mm4 \n\t"\ + "movq "DITHER16"+0(%0), %%mm3 \n\t"\ + "movq "DITHER16"+8(%0), %%mm4 \n\t"\ "lea " offset "(%0), %%"REG_d" \n\t"\ "mov (%%"REG_d"), %%"REG_S" \n\t"\ ".p2align 4 \n\t" /* FIXME Unroll? 
*/\ @@ -60,8 +60,8 @@ MOVNTQ(%%mm3, (%1, %3))\ "add $8, %3 \n\t"\ "cmp %2, %3 \n\t"\ - "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\ - "movq %%mm3, %%mm4 \n\t"\ + "movq "DITHER16"+0(%0), %%mm3 \n\t"\ + "movq "DITHER16"+8(%0), %%mm4 \n\t"\ "lea " offset "(%0), %%"REG_d" \n\t"\ "mov (%%"REG_d"), %%"REG_S" \n\t"\ "jb 1b \n\t"\ @@ -70,6 +70,42 @@ : "%"REG_d, "%"REG_S\ ); +#if !COMPILE_TEMPLATE_MMX2 +static av_always_inline void +dither_8to16(SwsContext *c, const uint8_t *srcDither, int rot) +{ + if (rot) { + __asm__ volatile("pxor %%mm0, %%mm0\n\t" + "movq (%0), %%mm3\n\t" + "movq %%mm3, %%mm4\n\t" + "psrlq $24, %%mm3\n\t" + "psllq $40, %%mm4\n\t" + "por %%mm4, %%mm3\n\t" + "movq %%mm3, %%mm4\n\t" + "punpcklbw %%mm0, %%mm3\n\t" + "punpckhbw %%mm0, %%mm4\n\t" + "psraw $4, %%mm3\n\t" + "psraw $4, %%mm4\n\t" + "movq %%mm3, "DITHER16"+0(%1)\n\t" + "movq %%mm4, "DITHER16"+8(%1)\n\t" + :: "r"(srcDither), "r"(&c->redDither) + ); + } else { + __asm__ volatile("pxor %%mm0, %%mm0\n\t" + "movq (%0), %%mm3\n\t" + "movq %%mm3, %%mm4\n\t" + "punpcklbw %%mm0, %%mm3\n\t" + "punpckhbw %%mm0, %%mm4\n\t" + "psraw $4, %%mm3\n\t" + "psraw $4, %%mm4\n\t" + "movq %%mm3, "DITHER16"+0(%1)\n\t" + "movq %%mm4, "DITHER16"+8(%1)\n\t" + :: "r"(srcDither), "r"(&c->redDither) + ); + } +} +#endif + static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, @@ -79,12 +115,16 @@ static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, { uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2], *aDest = CONFIG_SWSCALE_ALPHA ? 
dest[3] : NULL; + const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8; if (uDest) { x86_reg uv_off = c->uv_offx2 >> 1; + dither_8to16(c, chrDither, 0); YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0) + dither_8to16(c, chrDither, 1); YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off) } + dither_8to16(c, lumDither, 0); if (CONFIG_SWSCALE_ALPHA && aDest) { YSCALEYUV2YV12X(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0) } @@ -95,10 +135,10 @@ static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, #define YSCALEYUV2YV12X_ACCURATE(offset, dest, end, pos) \ __asm__ volatile(\ "lea " offset "(%0), %%"REG_d" \n\t"\ - "pxor %%mm4, %%mm4 \n\t"\ - "pxor %%mm5, %%mm5 \n\t"\ - "pxor %%mm6, %%mm6 \n\t"\ - "pxor %%mm7, %%mm7 \n\t"\ + "movq "DITHER32"+0(%0), %%mm4 \n\t"\ + "movq "DITHER32"+8(%0), %%mm5 \n\t"\ + "movq "DITHER32"+16(%0), %%mm6 \n\t"\ + "movq "DITHER32"+24(%0), %%mm7 \n\t"\ "mov (%%"REG_d"), %%"REG_S" \n\t"\ ".p2align 4 \n\t"\ "1: \n\t"\ @@ -142,10 +182,10 @@ static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, "add $8, %3 \n\t"\ "cmp %2, %3 \n\t"\ "lea " offset "(%0), %%"REG_d" \n\t"\ - "pxor %%mm4, %%mm4 \n\t"\ - "pxor %%mm5, %%mm5 \n\t"\ - "pxor %%mm6, %%mm6 \n\t"\ - "pxor %%mm7, %%mm7 \n\t"\ + "movq "DITHER32"+0(%0), %%mm4 \n\t"\ + "movq "DITHER32"+8(%0), %%mm5 \n\t"\ + "movq "DITHER32"+16(%0), %%mm6 \n\t"\ + "movq "DITHER32"+24(%0), %%mm7 \n\t"\ "mov (%%"REG_d"), %%"REG_S" \n\t"\ "jb 1b \n\t"\ :: "r" (&c->redDither),\ @@ -153,6 +193,62 @@ static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, : "%"REG_a, "%"REG_d, "%"REG_S\ ); +#if !COMPILE_TEMPLATE_MMX2 +static av_always_inline void +dither_8to32(SwsContext *c, const uint8_t *srcDither, int rot) +{ + if (rot) { + __asm__ volatile("pxor %%mm0, %%mm0\n\t" + "movq (%0), %%mm4\n\t" + "movq %%mm4, %%mm5\n\t" + "psrlq $24, %%mm4\n\t" + "psllq $40, %%mm5\n\t" + "por %%mm5, %%mm4\n\t" + "movq %%mm4, %%mm6\n\t" + "punpcklbw 
%%mm0, %%mm4\n\t" + "punpckhbw %%mm0, %%mm6\n\t" + "movq %%mm4, %%mm5\n\t" + "movq %%mm6, %%mm7\n\t" + "punpcklwd %%mm0, %%mm4\n\t" + "punpckhwd %%mm0, %%mm5\n\t" + "punpcklwd %%mm0, %%mm6\n\t" + "punpckhwd %%mm0, %%mm7\n\t" + "psllw $12, %%mm4\n\t" + "psllw $12, %%mm5\n\t" + "psllw $12, %%mm6\n\t" + "psllw $12, %%mm7\n\t" + "movq %%mm3, "DITHER32"+0(%1)\n\t" + "movq %%mm4, "DITHER32"+8(%1)\n\t" + "movq %%mm4, "DITHER32"+16(%1)\n\t" + "movq %%mm4, "DITHER32"+24(%1)\n\t" + :: "r"(srcDither), "r"(&c->redDither) + ); + } else { + __asm__ volatile("pxor %%mm0, %%mm0\n\t" + "movq (%0), %%mm4\n\t" + "movq %%mm4, %%mm6\n\t" + "punpcklbw %%mm0, %%mm4\n\t" + "punpckhbw %%mm0, %%mm6\n\t" + "movq %%mm4, %%mm5\n\t" + "movq %%mm6, %%mm7\n\t" + "punpcklwd %%mm0, %%mm4\n\t" + "punpckhwd %%mm0, %%mm5\n\t" + "punpcklwd %%mm0, %%mm6\n\t" + "punpckhwd %%mm0, %%mm7\n\t" + "psllw $12, %%mm4\n\t" + "psllw $12, %%mm5\n\t" + "psllw $12, %%mm6\n\t" + "psllw $12, %%mm7\n\t" + "movq %%mm3, "DITHER32"+0(%1)\n\t" + "movq %%mm4, "DITHER32"+8(%1)\n\t" + "movq %%mm4, "DITHER32"+16(%1)\n\t" + "movq %%mm4, "DITHER32"+24(%1)\n\t" + :: "r"(srcDither), "r"(&c->redDither) + ); + } +} +#endif + static void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, @@ -162,12 +258,16 @@ static void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter, { uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2], *aDest = CONFIG_SWSCALE_ALPHA ? 
dest[3] : NULL; + const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8; if (uDest) { x86_reg uv_off = c->uv_offx2 >> 1; + dither_8to32(c, chrDither, 0); YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0) + dither_8to32(c, chrDither, 1); YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off) } + dither_8to32(c, lumDither, 0); if (CONFIG_SWSCALE_ALPHA && aDest) { YSCALEYUV2YV12X_ACCURATE(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0) } @@ -220,19 +320,20 @@ static void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc, chrVSrc + chrDstW, alpSrc + dstW }; x86_reg counter[4]= { dstW, chrDstW, chrDstW, dstW }; + const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8; while (p--) { if (dst[p]) { + dither_8to16(c, (p == 2 || p == 3) ? chrDither : lumDither, p == 2); __asm__ volatile( "mov %2, %%"REG_a" \n\t" - "pcmpeqw %%mm7, %%mm7 \n\t" - "psrlw $15, %%mm7 \n\t" - "psllw $6, %%mm7 \n\t" + "movq "DITHER16"+0(%3), %%mm6 \n\t" + "movq "DITHER16"+8(%3), %%mm7 \n\t" ".p2align 4 \n\t" /* FIXME Unroll? */ "1: \n\t" "movq (%0, %%"REG_a", 2), %%mm0 \n\t" "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t" - "paddsw %%mm7, %%mm0 \n\t" + "paddsw %%mm6, %%mm0 \n\t" "paddsw %%mm7, %%mm1 \n\t" "psraw $7, %%mm0 \n\t" "psraw $7, %%mm1 \n\t" @@ -241,7 +342,7 @@ static void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc, "add $8, %%"REG_a" \n\t" "jnc 1b \n\t" :: "r" (src[p]), "r" (dst[p] + counter[p]), - "g" (-counter[p]) + "g" (-counter[p]), "r"(&c->redDither) : "%"REG_a ); } From 1ce724ee393a04a79cab9e7c28290fe7d82dadf2 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Tue, 5 Jul 2011 20:48:13 -0700 Subject: [PATCH 06/22] swscale: implement error dithering in planarCopyWrapper. Based on a somewhat similar idea in FFmpeg's swscale. 
--- libswscale/swscale_unscaled.c | 109 ++++++++++++++++++++++++++++------ 1 file changed, 90 insertions(+), 19 deletions(-) diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c index 87cd655a46..b8daa6b4e7 100644 --- a/libswscale/swscale_unscaled.c +++ b/libswscale/swscale_unscaled.c @@ -34,6 +34,48 @@ #include "libavutil/bswap.h" #include "libavutil/pixdesc.h" +DECLARE_ALIGNED(8, const uint8_t, dither_8x8_1)[8][8] = { + { 0, 1, 0, 1, 0, 1, 0, 1,}, + { 1, 0, 1, 0, 1, 0, 1, 0,}, + { 0, 1, 0, 1, 0, 1, 0, 1,}, + { 1, 0, 1, 0, 1, 0, 1, 0,}, + { 0, 1, 0, 1, 0, 1, 0, 1,}, + { 1, 0, 1, 0, 1, 0, 1, 0,}, + { 0, 1, 0, 1, 0, 1, 0, 1,}, + { 1, 0, 1, 0, 1, 0, 1, 0,}, +}; +DECLARE_ALIGNED(8, const uint8_t, dither_8x8_3)[8][8] = { + { 1, 2, 1, 2, 1, 2, 1, 2,}, + { 3, 0, 3, 0, 3, 0, 3, 0,}, + { 1, 2, 1, 2, 1, 2, 1, 2,}, + { 3, 0, 3, 0, 3, 0, 3, 0,}, + { 1, 2, 1, 2, 1, 2, 1, 2,}, + { 3, 0, 3, 0, 3, 0, 3, 0,}, + { 1, 2, 1, 2, 1, 2, 1, 2,}, + { 3, 0, 3, 0, 3, 0, 3, 0,}, +}; +DECLARE_ALIGNED(8, const uint8_t, dither_8x8_64)[8][8] = { + { 18, 34, 30, 46, 17, 33, 29, 45,}, + { 50, 2, 62, 14, 49, 1, 61, 13,}, + { 26, 42, 22, 38, 25, 41, 21, 37,}, + { 58, 10, 54, 6, 57, 9, 53, 5,}, + { 16, 32, 28, 44, 19, 35, 31, 47,}, + { 48, 0, 60, 12, 51, 3, 63, 15,}, + { 24, 40, 20, 36, 27, 43, 23, 39,}, + { 56, 8, 52, 4, 59, 11, 55, 7,}, +}; +extern const uint8_t dither_8x8_128[8][8]; +DECLARE_ALIGNED(8, const uint8_t, dither_8x8_256)[8][8] = { + { 72, 136, 120, 184, 68, 132, 116, 180,}, + { 200, 8, 248, 56, 196, 4, 244, 52,}, + { 104, 168, 88, 152, 100, 164, 84, 148,}, + { 232, 40, 216, 24, 228, 36, 212, 20,}, + { 64, 128, 102, 176, 76, 140, 124, 188,}, + { 192, 0, 240, 48, 204, 12, 252, 60,}, + { 96, 160, 80, 144, 108, 172, 92, 156,}, + { 224, 32, 208, 16, 236, 44, 220, 28,}, +}; + #define RGB2YUV_SHIFT 15 #define BY ( (int)(0.114*219/255*(1<> shift); \ + wfunc(&dst[j + 1], (rfunc(&src[j + 1]) + dither[1]) >> shift); \ + wfunc(&dst[j + 2], (rfunc(&src[j + 2]) + dither[2]) >> 
shift); \ + wfunc(&dst[j + 3], (rfunc(&src[j + 3]) + dither[3]) >> shift); \ + wfunc(&dst[j + 4], (rfunc(&src[j + 4]) + dither[4]) >> shift); \ + wfunc(&dst[j + 5], (rfunc(&src[j + 5]) + dither[5]) >> shift); \ + wfunc(&dst[j + 6], (rfunc(&src[j + 6]) + dither[6]) >> shift); \ + wfunc(&dst[j + 7], (rfunc(&src[j + 7]) + dither[7]) >> shift); \ + } \ + for (; j < length; j++) \ + wfunc(&dst[j], (rfunc(&src[j]) + dither[j & 7]) >> shift); \ + dst += dstStride; \ + src += srcStride; \ + } + static int planarCopyWrapper(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY, int srcSliceH, uint8_t* dst[], int dstStride[]) { @@ -475,7 +536,9 @@ static int planarCopyWrapper(SwsContext *c, const uint8_t* src[], int srcStride[ COPY9_OR_10TO9_OR_10(int srcpx = rfunc(&srcPtr2[j]); \ wfunc(&dstPtr2[j], (srcpx << 1) | (srcpx >> 9))); \ } else if (dst_depth < src_depth) { \ - COPY9_OR_10TO9_OR_10(wfunc(&dstPtr2[j], rfunc(&srcPtr2[j]) >> 1)); \ + DITHER_COPY(dstPtr2, dstStride[plane]/2, wfunc, \ + srcPtr2, srcStride[plane]/2, rfunc, \ + dither_8x8_1, 1); \ } else { \ COPY9_OR_10TO9_OR_10(wfunc(&dstPtr2[j], rfunc(&srcPtr2[j]))); \ } @@ -493,14 +556,16 @@ static int planarCopyWrapper(SwsContext *c, const uint8_t* src[], int srcStride[ } } } else { - // FIXME Maybe dither instead. 
+#define W8(a, b) { *(a) = (b); } #define COPY9_OR_10TO8(rfunc) \ - for (i = 0; i < height; i++) { \ - for (j = 0; j < length; j++) { \ - dstPtr[j] = rfunc(&srcPtr2[j])>>(src_depth-8); \ - } \ - dstPtr += dstStride[plane]; \ - srcPtr2 += srcStride[plane]/2; \ + if (src_depth == 9) { \ + DITHER_COPY(dstPtr, dstStride[plane], W8, \ + srcPtr2, srcStride[plane]/2, rfunc, \ + dither_8x8_1, 1); \ + } else { \ + DITHER_COPY(dstPtr, dstStride[plane], W8, \ + srcPtr2, srcStride[plane]/2, rfunc, \ + dither_8x8_3, 2); \ } if (isBE(c->srcFormat)) { COPY9_OR_10TO8(AV_RB16); @@ -515,12 +580,14 @@ static int planarCopyWrapper(SwsContext *c, const uint8_t* src[], int srcStride[ if (is16BPS(c->srcFormat)) { const uint16_t *srcPtr2 = (const uint16_t*)srcPtr; #define COPY16TO9_OR_10(rfunc, wfunc) \ - for (i = 0; i < height; i++) { \ - for (j = 0; j < length; j++) { \ - wfunc(&dstPtr2[j], rfunc(&srcPtr2[j])>>(16-dst_depth)); \ - } \ - dstPtr2 += dstStride[plane]/2; \ - srcPtr2 += srcStride[plane]/2; \ + if (dst_depth == 9) { \ + DITHER_COPY(dstPtr2, dstStride[plane]/2, wfunc, \ + srcPtr2, srcStride[plane]/2, rfunc, \ + dither_8x8_128, 7); \ + } else { \ + DITHER_COPY(dstPtr2, dstStride[plane]/2, wfunc, \ + srcPtr2, srcStride[plane]/2, rfunc, \ + dither_8x8_64, 6); \ } if (isBE(c->dstFormat)) { if (isBE(c->srcFormat)) { @@ -552,11 +619,15 @@ static int planarCopyWrapper(SwsContext *c, const uint8_t* src[], int srcStride[ } } } else if(is16BPS(c->srcFormat) && !is16BPS(c->dstFormat)) { - if (!isBE(c->srcFormat)) srcPtr++; - for (i=0; isrcFormat)) { + COPY16TO8(AV_RB16); + } else { + COPY16TO8(AV_RL16); } } else if(!is16BPS(c->srcFormat) && is16BPS(c->dstFormat)) { for (i=0; i Date: Fri, 8 Jul 2011 14:39:04 -0700 Subject: [PATCH 07/22] swscale: rename uv_off/uv_off2 to uv_off_px/byte. 
--- libswscale/swscale_internal.h | 8 ++-- libswscale/utils.c | 4 +- libswscale/x86/swscale_template.c | 80 +++++++++++++++---------------- 3 files changed, 46 insertions(+), 46 deletions(-) diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h index efb8aff088..b602541044 100644 --- a/libswscale/swscale_internal.h +++ b/libswscale/swscale_internal.h @@ -319,8 +319,8 @@ typedef struct SwsContext { #define V_TEMP "11*8+4*4*256*2+32" #define Y_TEMP "11*8+4*4*256*2+40" #define ALP_MMX_FILTER_OFFSET "11*8+4*4*256*2+48" -#define UV_OFF "11*8+4*4*256*3+48" -#define UV_OFFx2 "11*8+4*4*256*3+56" +#define UV_OFF_PX "11*8+4*4*256*3+48" +#define UV_OFF_BYTE "11*8+4*4*256*3+56" #define DITHER16 "11*8+4*4*256*3+64" #define DITHER32 "11*8+4*4*256*3+80" @@ -345,8 +345,8 @@ typedef struct SwsContext { DECLARE_ALIGNED(8, uint64_t, v_temp); DECLARE_ALIGNED(8, uint64_t, y_temp); int32_t alpMmxFilter[4*MAX_FILTER_SIZE]; - DECLARE_ALIGNED(8, ptrdiff_t, uv_off); ///< offset (in pixels) between u and v planes - DECLARE_ALIGNED(8, ptrdiff_t, uv_offx2); ///< offset (in bytes) between u and v planes + DECLARE_ALIGNED(8, ptrdiff_t, uv_off_px); ///< offset (in pixels) between u and v planes + DECLARE_ALIGNED(8, ptrdiff_t, uv_off_byte); ///< offset (in bytes) between u and v planes uint16_t dither16[8]; uint32_t dither32[8]; diff --git a/libswscale/utils.c b/libswscale/utils.c index fd10fa03fb..296c84557d 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c @@ -1048,8 +1048,8 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter) FF_ALLOCZ_OR_GOTO(c, c->lumPixBuf[i+c->vLumBufSize], dst_stride+1, fail); c->lumPixBuf[i] = c->lumPixBuf[i+c->vLumBufSize]; } - c->uv_off = dst_stride_px; - c->uv_offx2 = dst_stride; + c->uv_off_px = dst_stride_px; + c->uv_off_byte = dst_stride; for (i=0; ivChrBufSize; i++) { FF_ALLOC_OR_GOTO(c, c->chrUPixBuf[i+c->vChrBufSize], dst_stride*2+1, fail); c->chrUPixBuf[i] = c->chrUPixBuf[i+c->vChrBufSize]; diff --git 
a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c index fd6ec3a793..28ec4d2d9c 100644 --- a/libswscale/x86/swscale_template.c +++ b/libswscale/x86/swscale_template.c @@ -118,7 +118,7 @@ static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8; if (uDest) { - x86_reg uv_off = c->uv_offx2 >> 1; + x86_reg uv_off = c->uv_off_byte >> 1; dither_8to16(c, chrDither, 0); YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0) dither_8to16(c, chrDither, 1); @@ -213,14 +213,14 @@ dither_8to32(SwsContext *c, const uint8_t *srcDither, int rot) "punpckhwd %%mm0, %%mm5\n\t" "punpcklwd %%mm0, %%mm6\n\t" "punpckhwd %%mm0, %%mm7\n\t" - "psllw $12, %%mm4\n\t" - "psllw $12, %%mm5\n\t" - "psllw $12, %%mm6\n\t" - "psllw $12, %%mm7\n\t" - "movq %%mm3, "DITHER32"+0(%1)\n\t" - "movq %%mm4, "DITHER32"+8(%1)\n\t" - "movq %%mm4, "DITHER32"+16(%1)\n\t" - "movq %%mm4, "DITHER32"+24(%1)\n\t" + "pslld $12, %%mm4\n\t" + "pslld $12, %%mm5\n\t" + "pslld $12, %%mm6\n\t" + "pslld $12, %%mm7\n\t" + "movq %%mm4, "DITHER32"+0(%1)\n\t" + "movq %%mm5, "DITHER32"+8(%1)\n\t" + "movq %%mm6, "DITHER32"+16(%1)\n\t" + "movq %%mm7, "DITHER32"+24(%1)\n\t" :: "r"(srcDither), "r"(&c->redDither) ); } else { @@ -235,14 +235,14 @@ dither_8to32(SwsContext *c, const uint8_t *srcDither, int rot) "punpckhwd %%mm0, %%mm5\n\t" "punpcklwd %%mm0, %%mm6\n\t" "punpckhwd %%mm0, %%mm7\n\t" - "psllw $12, %%mm4\n\t" - "psllw $12, %%mm5\n\t" - "psllw $12, %%mm6\n\t" - "psllw $12, %%mm7\n\t" - "movq %%mm3, "DITHER32"+0(%1)\n\t" - "movq %%mm4, "DITHER32"+8(%1)\n\t" - "movq %%mm4, "DITHER32"+16(%1)\n\t" - "movq %%mm4, "DITHER32"+24(%1)\n\t" + "pslld $12, %%mm4\n\t" + "pslld $12, %%mm5\n\t" + "pslld $12, %%mm6\n\t" + "pslld $12, %%mm7\n\t" + "movq %%mm4, "DITHER32"+0(%1)\n\t" + "movq %%mm5, "DITHER32"+8(%1)\n\t" + "movq %%mm6, "DITHER32"+16(%1)\n\t" + "movq %%mm7, "DITHER32"+24(%1)\n\t" :: "r"(srcDither), "r"(&c->redDither) ); } @@ 
-261,7 +261,7 @@ static void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter, const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8; if (uDest) { - x86_reg uv_off = c->uv_offx2 >> 1; + x86_reg uv_off = c->uv_off_byte >> 1; dither_8to32(c, chrDither, 0); YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0) dither_8to32(c, chrDither, 1); @@ -574,7 +574,7 @@ static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_offx2; + x86_reg uv_off = c->uv_off_byte; if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { YSCALEYUV2PACKEDX_ACCURATE @@ -607,7 +607,7 @@ static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_offx2; + x86_reg uv_off = c->uv_off_byte; if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { YSCALEYUV2PACKEDX @@ -664,7 +664,7 @@ static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_offx2; + x86_reg uv_off = c->uv_off_byte; YSCALEYUV2PACKEDX_ACCURATE YSCALEYUV2RGBX @@ -688,7 +688,7 @@ static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_offx2; + x86_reg uv_off = c->uv_off_byte; YSCALEYUV2PACKEDX YSCALEYUV2RGBX @@ -741,7 +741,7 @@ static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_offx2; + x86_reg uv_off = c->uv_off_byte; YSCALEYUV2PACKEDX_ACCURATE YSCALEYUV2RGBX @@ -765,7 +765,7 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_offx2; + x86_reg uv_off = c->uv_off_byte; YSCALEYUV2PACKEDX YSCALEYUV2RGBX @@ -898,7 +898,7 @@ static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter, { 
x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_offx2; + x86_reg uv_off = c->uv_off_byte; YSCALEYUV2PACKEDX_ACCURATE YSCALEYUV2RGBX @@ -922,7 +922,7 @@ static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_offx2; + x86_reg uv_off = c->uv_off_byte; YSCALEYUV2PACKEDX YSCALEYUV2RGBX @@ -963,7 +963,7 @@ static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_offx2; + x86_reg uv_off = c->uv_off_byte; YSCALEYUV2PACKEDX_ACCURATE /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ @@ -984,7 +984,7 @@ static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_offx2; + x86_reg uv_off = c->uv_off_byte; YSCALEYUV2PACKEDX /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ @@ -1002,10 +1002,10 @@ static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter, "1: \n\t"\ "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ - "add "UV_OFFx2"("#c"), "#index" \n\t" \ + "add "UV_OFF_PX"("#c"), "#index" \n\t" \ "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ - "sub "UV_OFFx2"("#c"), "#index" \n\t" \ + "sub "UV_OFF_PX"("#c"), "#index" \n\t" \ "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ @@ -1233,10 +1233,10 @@ static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2], "1: \n\t"\ "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ - "add "UV_OFFx2"("#c"), "#index" \n\t" \ + "add "UV_OFF_PX"("#c"), "#index" \n\t" \ "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ "movq (%3, "#index"), %%mm4 \n\t" /* 
uvbuf1[eax+2048]*/\ - "sub "UV_OFFx2"("#c"), "#index" \n\t" \ + "sub "UV_OFF_PX"("#c"), "#index" \n\t" \ "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ @@ -1288,9 +1288,9 @@ static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2], ".p2align 4 \n\t"\ "1: \n\t"\ "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\ - "add "UV_OFFx2"("#c"), "#index" \n\t" \ + "add "UV_OFF_PX"("#c"), "#index" \n\t" \ "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ - "sub "UV_OFFx2"("#c"), "#index" \n\t" \ + "sub "UV_OFF_PX"("#c"), "#index" \n\t" \ "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ @@ -1341,10 +1341,10 @@ static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2], "1: \n\t"\ "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ - "add "UV_OFFx2"("#c"), "#index" \n\t" \ + "add "UV_OFF_PX"("#c"), "#index" \n\t" \ "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ - "sub "UV_OFFx2"("#c"), "#index" \n\t" \ + "sub "UV_OFF_PX"("#c"), "#index" \n\t" \ "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\ "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\ @@ -1608,9 +1608,9 @@ static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0, ".p2align 4 \n\t"\ "1: \n\t"\ "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\ - "add "UV_OFFx2"("#c"), "#index" \n\t" \ + "add "UV_OFF_PX"("#c"), "#index" \n\t" \ "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ - "sub "UV_OFFx2"("#c"), "#index" \n\t" \ + "sub "UV_OFF_PX"("#c"), "#index" \n\t" \ "psraw $7, %%mm3 \n\t" \ "psraw $7, %%mm4 \n\t" \ "movq (%0, 
"#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ @@ -1626,10 +1626,10 @@ static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0, "1: \n\t"\ "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ - "add "UV_OFFx2"("#c"), "#index" \n\t" \ + "add "UV_OFF_PX"("#c"), "#index" \n\t" \ "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ - "sub "UV_OFFx2"("#c"), "#index" \n\t" \ + "sub "UV_OFF_PX"("#c"), "#index" \n\t" \ "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\ "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ "psrlw $8, %%mm3 \n\t" \ From c0483d0c7a8f12564f615dcd73890bcc2f7c2796 Mon Sep 17 00:00:00 2001 From: Daniel Kang Date: Sun, 3 Jul 2011 14:56:09 -0400 Subject: [PATCH 08/22] H.264: Add x86 assembly for 10-bit H.264 predict functions Mainly ported from 8-bit H.264 predict. Some code ported from x264. LGPL ok by author. Signed-off-by: Ronald S. 
Bultje --- libavcodec/x86/dsputil_mmx.c | 2 +- libavcodec/x86/h264_intrapred_10bit.asm | 931 ++++++++++++++++++++++++ libavcodec/x86/h264_intrapred_init.c | 58 ++ libavcodec/x86/x86util.asm | 8 + 4 files changed, 998 insertions(+), 1 deletion(-) diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index 7842370fcd..b06a34ddce 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -42,7 +42,7 @@ DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL; DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] = {0x8000000080000000ULL, 0x8000000080000000ULL}; -DECLARE_ALIGNED(8, const uint64_t, ff_pw_1 ) = 0x0001000100010001ULL; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1 ) = {0x0001000100010001ULL, 0x0001000100010001ULL}; DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2 ) = {0x0002000200020002ULL, 0x0002000200020002ULL}; DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3 ) = {0x0003000300030003ULL, 0x0003000300030003ULL}; DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4 ) = {0x0004000400040004ULL, 0x0004000400040004ULL}; diff --git a/libavcodec/x86/h264_intrapred_10bit.asm b/libavcodec/x86/h264_intrapred_10bit.asm index 5cb593ac38..d57fc79754 100644 --- a/libavcodec/x86/h264_intrapred_10bit.asm +++ b/libavcodec/x86/h264_intrapred_10bit.asm @@ -29,9 +29,13 @@ SECTION_RODATA SECTION .text +cextern pw_8 cextern pw_4 +cextern pw_2 cextern pw_1 +; dest, left, right, src +; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2 %macro PRED4x4_LOWPASS 4 paddw %2, %3 psrlw %2, 1 @@ -335,3 +339,930 @@ cglobal pred8x8_horizontal_10_sse2, 2,3 dec r2 jg .loop REP_RET + +;----------------------------------------------------------------------------- +; void predict_8x8_dc(pixel *src, int stride) +;----------------------------------------------------------------------------- +%macro MOV8 2-3 +; sort of a hack, but it works +%if mmsize==8 + movq [%1+0], %2 + movq [%1+8], %3 +%else + movdqa [%1], %2 +%endif +%endmacro + +%macro PRED8x8_DC 2 +cglobal 
pred8x8_dc_10_%1, 2,4 +%ifdef ARCH_X86_64 +%define t0 r10 +%else +%define t0 r0m +%endif + sub r0, r1 + pxor m4, m4 + movq m0, [r0+0] + movq m1, [r0+8] + HADDW m0, m2 + mov t0, r0 + HADDW m1, m2 + + movzx r2d, word [r0+r1*1-2] + movzx r3d, word [r0+r1*2-2] + lea r0, [r0+r1*2] + add r2d, r3d + movzx r3d, word [r0+r1*1-2] + add r2d, r3d + movzx r3d, word [r0+r1*2-2] + add r2d, r3d + lea r0, [r0+r1*2] + movd m2, r2d ; s2 + + movzx r2d, word [r0+r1*1-2] + movzx r3d, word [r0+r1*2-2] + lea r0, [r0+r1*2] + add r2d, r3d + movzx r3d, word [r0+r1*1-2] + add r2d, r3d + movzx r3d, word [r0+r1*2-2] + add r2d, r3d + movd m3, r2d ; s3 + + punpcklwd m0, m1 + mov r0, t0 + punpcklwd m2, m3 + punpckldq m0, m2 ; s0, s1, s2, s3 + %2 m3, m0, 11110110b ; s2, s1, s3, s3 + lea r2, [r1+r1*2] + %2 m0, m0, 01110100b ; s0, s1, s3, s1 + paddw m0, m3 + lea r3, [r0+r1*4] + psrlw m0, 2 + pavgw m0, m4 ; s0+s2, s1, s3, s1+s3 +%ifidn %1, sse2 + punpcklwd m0, m0 + pshufd m3, m0, 11111010b + punpckldq m0, m0 + SWAP 0,1 +%else + pshufw m1, m0, 0x00 + pshufw m2, m0, 0x55 + pshufw m3, m0, 0xaa + pshufw m4, m0, 0xff +%endif + MOV8 r0+r1*1, m1, m2 + MOV8 r0+r1*2, m1, m2 + MOV8 r0+r2*1, m1, m2 + MOV8 r0+r1*4, m1, m2 + MOV8 r3+r1*1, m3, m4 + MOV8 r3+r1*2, m3, m4 + MOV8 r3+r2*1, m3, m4 + MOV8 r3+r1*4, m3, m4 + RET +%endmacro + +INIT_MMX +PRED8x8_DC mmxext, pshufw +INIT_XMM +PRED8x8_DC sse2 , pshuflw + +;----------------------------------------------------------------------------- +; void pred8x8_top_dc(pixel *src, int stride) +;----------------------------------------------------------------------------- +%macro PRED8x8_TOP_DC 2 +cglobal pred8x8_top_dc_10_%1, 2,4 + sub r0, r1 + movq m0, [r0+0] + movq m1, [r0+8] + HADDW m0, m2 + HADDW m1, m3 + lea r2, [r1+r1*2] + paddw m0, [pw_2] + paddw m1, [pw_2] + lea r3, [r0+r1*4] + psrlw m0, 2 + psrlw m1, 2 + %2 m0, m0, 0 + %2 m1, m1, 0 +%ifidn %1, sse2 + punpcklqdq m0, m1 +%endif + MOV8 r0+r1*1, m0, m1 + MOV8 r0+r1*2, m0, m1 + MOV8 r0+r2*1, m0, m1 + MOV8 r0+r1*4, m0, m1 
+ MOV8 r3+r1*1, m0, m1 + MOV8 r3+r1*2, m0, m1 + MOV8 r3+r2*1, m0, m1 + MOV8 r3+r1*4, m0, m1 + RET +%endmacro + +INIT_MMX +PRED8x8_TOP_DC mmxext, pshufw +INIT_XMM +PRED8x8_TOP_DC sse2 , pshuflw + + + +;----------------------------------------------------------------------------- +; void pred8x8l_top_dc(pixel *src, int has_topleft, int has_topright, int stride) +;----------------------------------------------------------------------------- +%macro PRED8x8L_TOP_DC 1 +cglobal pred8x8l_top_dc_10_%1, 4,4,6 + sub r0, r3 + pxor m7, m7 + mova m0, [r0-16] + mova m3, [r0] + mova m1, [r0+16] + mova m2, m3 + mova m4, m3 + PALIGNR m2, m0, 14, m0 + PALIGNR m1, m4, 2, m4 + test r1, r1 ; top_left + jz .fix_lt_2 + test r2, r2 ; top_right + jz .fix_tr_1 + jmp .body +.fix_lt_2: + mova m5, m3 + pxor m5, m2 + pslldq m5, 14 + psrldq m5, 14 + pxor m2, m5 + test r2, r2 ; top_right + jnz .body +.fix_tr_1: + mova m5, m3 + pxor m5, m1 + psrldq m5, 14 + pslldq m5, 14 + pxor m1, m5 +.body + lea r1, [r3+r3*2] + lea r2, [r0+r3*4] + PRED4x4_LOWPASS m0, m2, m1, m3 + HADDW m0, m1 + paddw m0, [pw_4] + psrlw m0, 3 + SPLATW m0, m0, 0 + mova [r0+r3*1], m0 + mova [r0+r3*2], m0 + mova [r0+r1*1], m0 + mova [r0+r3*4], m0 + mova [r2+r3*1], m0 + mova [r2+r3*2], m0 + mova [r2+r1*1], m0 + mova [r2+r3*4], m0 + RET +%endmacro + +INIT_XMM +%define PALIGNR PALIGNR_MMX +PRED8x8L_TOP_DC sse2 +%define PALIGNR PALIGNR_SSSE3 +PRED8x8L_TOP_DC ssse3 + +;----------------------------------------------------------------------------- +;void pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride) +;----------------------------------------------------------------------------- +;TODO: see if scalar is faster +%macro PRED8x8L_DC 1 +cglobal pred8x8l_dc_10_%1, 4,5,8 + sub r0, r3 + lea r4, [r0+r3*2] + mova m0, [r0+r3*1-16] + punpckhwd m0, [r0+r3*0-16] + mova m1, [r4+r3*1-16] + punpckhwd m1, [r0+r3*2-16] + mov r4, r0 + punpckhdq m1, m0 + lea r0, [r0+r3*4] + mova m2, [r0+r3*1-16] + punpckhwd m2, [r0+r3*0-16] + lea r0, 
[r0+r3*2] + mova m3, [r0+r3*1-16] + punpckhwd m3, [r0+r3*0-16] + punpckhdq m3, m2 + punpckhqdq m3, m1 + lea r0, [r0+r3*2] + mova m0, [r0+r3*0-16] + mova m1, [r4] + mov r0, r4 + mova m4, m3 + mova m2, m3 + PALIGNR m4, m0, 14, m0 + PALIGNR m1, m2, 2, m2 + test r1, r1 + jnz .do_left +.fix_lt_1: + mova m5, m3 + pxor m5, m4 + psrldq m5, 14 + pslldq m5, 12 + pxor m1, m5 + jmp .do_left +.fix_lt_2: + mova m5, m3 + pxor m5, m2 + pslldq m5, 14 + psrldq m5, 14 + pxor m2, m5 + test r2, r2 + jnz .body +.fix_tr_1: + mova m5, m3 + pxor m5, m1 + psrldq m5, 14 + pslldq m5, 14 + pxor m1, m5 + jmp .body +.do_left: + mova m0, m4 + PRED4x4_LOWPASS m2, m1, m4, m3 + mova m4, m0 + mova m7, m2 + PRED4x4_LOWPASS m1, m3, m0, m4 + pslldq m1, 14 + PALIGNR m7, m1, 14, m3 + mova m0, [r0-16] + mova m3, [r0] + mova m1, [r0+16] + mova m2, m3 + mova m4, m3 + PALIGNR m2, m0, 14, m0 + PALIGNR m1, m4, 2, m4 + test r1, r1 + jz .fix_lt_2 + test r2, r2 + jz .fix_tr_1 +.body + lea r1, [r3+r3*2] + PRED4x4_LOWPASS m6, m2, m1, m3 + HADDW m7, m0 + HADDW m6, m0 + lea r2, [r0+r3*4] + paddw m7, [pw_8] + paddw m7, m6 + psrlw m7, 4 + SPLATW m7, m7 + mova [r0+r3*1], m7 + mova [r0+r3*2], m7 + mova [r0+r1*1], m7 + mova [r0+r3*4], m7 + mova [r2+r3*1], m7 + mova [r2+r3*2], m7 + mova [r2+r1*1], m7 + mova [r2+r3*4], m7 + RET +%endmacro + +INIT_XMM +%define PALIGNR PALIGNR_MMX +PRED8x8L_DC sse2 +%define PALIGNR PALIGNR_SSSE3 +PRED8x8L_DC ssse3 + +;----------------------------------------------------------------------------- +; void pred8x8l_vertical(pixel *src, int has_topleft, int has_topright, int stride) +;----------------------------------------------------------------------------- +%macro PRED8x8L_VERTICAL 1 +cglobal pred8x8l_vertical_10_%1, 4,4,6 + sub r0, r3 + mova m0, [r0-16] + mova m3, [r0] + mova m1, [r0+16] + mova m2, m3 + mova m4, m3 + PALIGNR m2, m0, 14, m0 + PALIGNR m1, m4, 2, m4 + test r1, r1 ; top_left + jz .fix_lt_2 + test r2, r2 ; top_right + jz .fix_tr_1 + jmp .body +.fix_lt_2: + mova m5, m3 + pxor m5, 
m2 + pslldq m5, 14 + psrldq m5, 14 + pxor m2, m5 + test r2, r2 ; top_right + jnz .body +.fix_tr_1: + mova m5, m3 + pxor m5, m1 + psrldq m5, 14 + pslldq m5, 14 + pxor m1, m5 +.body + lea r1, [r3+r3*2] + lea r2, [r0+r3*4] + PRED4x4_LOWPASS m0, m2, m1, m3 + mova [r0+r3*1], m0 + mova [r0+r3*2], m0 + mova [r0+r1*1], m0 + mova [r0+r3*4], m0 + mova [r2+r3*1], m0 + mova [r2+r3*2], m0 + mova [r2+r1*1], m0 + mova [r2+r3*4], m0 + RET +%endmacro + +INIT_XMM +%define PALIGNR PALIGNR_MMX +PRED8x8L_VERTICAL sse2 +%define PALIGNR PALIGNR_SSSE3 +PRED8x8L_VERTICAL ssse3 + +;----------------------------------------------------------------------------- +; void pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, int stride) +;----------------------------------------------------------------------------- +%macro PRED8x8L_HORIZONTAL 1 +cglobal pred8x8l_horizontal_10_%1, 4,4,8 + sub r0, r3 + lea r2, [r0+r3*2] + mova m0, [r0+r3*1-16] + test r1, r1 + lea r1, [r0+r3] + cmovnz r1, r0 + punpckhwd m0, [r1+r3*0-16] + mova m1, [r2+r3*1-16] + punpckhwd m1, [r0+r3*2-16] + mov r2, r0 + punpckhdq m1, m0 + lea r0, [r0+r3*4] + mova m2, [r0+r3*1-16] + punpckhwd m2, [r0+r3*0-16] + lea r0, [r0+r3*2] + mova m3, [r0+r3*1-16] + punpckhwd m3, [r0+r3*0-16] + punpckhdq m3, m2 + punpckhqdq m3, m1 + lea r0, [r0+r3*2] + mova m0, [r0+r3*0-16] + mova m1, [r1+r3*0-16] + mov r0, r2 + mova m4, m3 + mova m2, m3 + PALIGNR m4, m0, 14, m0 + PALIGNR m1, m2, 2, m2 + mova m0, m4 + PRED4x4_LOWPASS m2, m1, m4, m3 + mova m4, m0 + mova m7, m2 + PRED4x4_LOWPASS m1, m3, m0, m4 + pslldq m1, 14 + PALIGNR m7, m1, 14, m3 + lea r1, [r3+r3*2] + punpckhwd m3, m7, m7 + punpcklwd m7, m7 + pshufd m0, m3, 0xff + pshufd m1, m3, 0xaa + lea r2, [r0+r3*4] + pshufd m2, m3, 0x55 + pshufd m3, m3, 0x00 + pshufd m4, m7, 0xff + pshufd m5, m7, 0xaa + pshufd m6, m7, 0x55 + pshufd m7, m7, 0x00 + mova [r0+r3*1], m0 + mova [r0+r3*2], m1 + mova [r0+r1*1], m2 + mova [r0+r3*4], m3 + mova [r2+r3*1], m4 + mova [r2+r3*2], m5 + mova [r2+r1*1], m6 + 
mova [r2+r3*4], m7 + RET +%endmacro + +INIT_XMM +%define PALIGNR PALIGNR_MMX +PRED8x8L_HORIZONTAL sse2 +%define PALIGNR PALIGNR_SSSE3 +PRED8x8L_HORIZONTAL ssse3 + +;----------------------------------------------------------------------------- +;void pred8x8l_down_left(pixel *src, int has_topleft, int has_topright, int stride) +;----------------------------------------------------------------------------- +%macro PRED8x8L_DOWN_LEFT 1 +cglobal pred8x8l_down_left_10_%1, 4,4,8 + sub r0, r3 + mova m0, [r0-16] + mova m3, [r0] + mova m1, [r0+16] + mova m2, m3 + mova m4, m3 + PALIGNR m2, m0, 14, m0 + PALIGNR m1, m4, 2, m4 + test r1, r1 + jz .fix_lt_2 + test r2, r2 + jz .fix_tr_1 + jmp .do_top +.fix_lt_2: + mova m5, m3 + pxor m5, m2 + pslldq m5, 14 + psrldq m5, 14 + pxor m2, m5 + test r2, r2 + jnz .do_top +.fix_tr_1: + mova m5, m3 + pxor m5, m1 + psrldq m5, 14 + pslldq m5, 14 + pxor m1, m5 + jmp .do_top +.fix_tr_2: + punpckhwd m3, m3 + pshufd m1, m3, 0xFF + jmp .do_topright +.do_top: + PRED4x4_LOWPASS m4, m2, m1, m3 + mova m7, m4 + test r2, r2 + jz .fix_tr_2 + mova m0, [r0+16] + mova m5, m0 + mova m2, m0 + mova m4, m0 + psrldq m5, 14 + PALIGNR m2, m3, 14, m3 + PALIGNR m5, m4, 2, m4 + PRED4x4_LOWPASS m1, m2, m5, m0 +.do_topright: + lea r1, [r3+r3*2] + mova m6, m1 + psrldq m1, 14 + mova m4, m1 + lea r2, [r0+r3*4] + mova m2, m6 + PALIGNR m2, m7, 2, m0 + mova m3, m6 + PALIGNR m3, m7, 14, m0 + PALIGNR m4, m6, 2, m0 + mova m5, m7 + mova m1, m7 + mova m7, m6 + pslldq m1, 2 + PRED4x4_LOWPASS m0, m1, m2, m5 + PRED4x4_LOWPASS m1, m3, m4, m7 + mova [r2+r3*4], m1 + mova m2, m0 + pslldq m1, 2 + psrldq m2, 14 + pslldq m0, 2 + por m1, m2 + mova [r2+r1*1], m1 + mova m2, m0 + pslldq m1, 2 + psrldq m2, 14 + pslldq m0, 2 + por m1, m2 + mova [r2+r3*2], m1 + mova m2, m0 + pslldq m1, 2 + psrldq m2, 14 + pslldq m0, 2 + por m1, m2 + mova [r2+r3*1], m1 + mova m2, m0 + pslldq m1, 2 + psrldq m2, 14 + pslldq m0, 2 + por m1, m2 + mova [r0+r3*4], m1 + mova m2, m0 + pslldq m1, 2 + psrldq m2, 14 + pslldq 
m0, 2 + por m1, m2 + mova [r0+r1*1], m1 + mova m2, m0 + pslldq m1, 2 + psrldq m2, 14 + pslldq m0, 2 + por m1, m2 + mova [r0+r3*2], m1 + pslldq m1, 2 + psrldq m0, 14 + por m1, m0 + mova [r0+r3*1], m1 + RET +%endmacro + +INIT_XMM +%define PALIGNR PALIGNR_MMX +PRED8x8L_DOWN_LEFT sse2 +%define PALIGNR PALIGNR_SSSE3 +PRED8x8L_DOWN_LEFT ssse3 + +;----------------------------------------------------------------------------- +;void pred8x8l_down_right_mxext(pixel *src, int has_topleft, int has_topright, int stride) +;----------------------------------------------------------------------------- +%macro PRED8x8L_DOWN_RIGHT 1 +cglobal pred8x8l_down_right_10_%1, 4,5,8 + sub r0, r3 + lea r4, [r0+r3*2] + mova m0, [r0+r3*1-16] + punpckhwd m0, [r0+r3*0-16] + mova m1, [r4+r3*1-16] + punpckhwd m1, [r0+r3*2-16] + mov r4, r0 + punpckhdq m1, m0 + lea r0, [r0+r3*4] + mova m2, [r0+r3*1-16] + punpckhwd m2, [r0+r3*0-16] + lea r0, [r0+r3*2] + mova m3, [r0+r3*1-16] + punpckhwd m3, [r0+r3*0-16] + punpckhdq m3, m2 + punpckhqdq m3, m1 + lea r0, [r0+r3*2] + mova m0, [r0+r3*0-16] + mova m1, [r4] + mov r0, r4 + mova m4, m3 + mova m2, m3 + PALIGNR m4, m0, 14, m0 + PALIGNR m1, m2, 2, m2 + test r1, r1 ; top_left + jz .fix_lt_1 +.do_left: + mova m0, m4 + PRED4x4_LOWPASS m2, m1, m4, m3 + mova m4, m0 + mova m7, m2 + mova m6, m2 + PRED4x4_LOWPASS m1, m3, m0, m4 + pslldq m1, 14 + PALIGNR m7, m1, 14, m3 + mova m0, [r0-16] + mova m3, [r0] + mova m1, [r0+16] + mova m2, m3 + mova m4, m3 + PALIGNR m2, m0, 14, m0 + PALIGNR m1, m4, 2, m4 + test r1, r1 ; top_left + jz .fix_lt_2 + test r2, r2 ; top_right + jz .fix_tr_1 +.do_top: + PRED4x4_LOWPASS m4, m2, m1, m3 + mova m5, m4 + jmp .body +.fix_lt_1: + mova m5, m3 + pxor m5, m4 + psrldq m5, 14 + pslldq m5, 12 + pxor m1, m5 + jmp .do_left +.fix_lt_2: + mova m5, m3 + pxor m5, m2 + pslldq m5, 14 + psrldq m5, 14 + pxor m2, m5 + test r2, r2 ; top_right + jnz .do_top +.fix_tr_1: + mova m5, m3 + pxor m5, m1 + psrldq m5, 14 + pslldq m5, 14 + pxor m1, m5 + jmp .do_top +.body 
+ lea r1, [r3+r3*2] + mova m1, m7 + mova m7, m5 + mova m5, m6 + mova m2, m7 + lea r2, [r0+r3*4] + PALIGNR m2, m6, 2, m0 + mova m3, m7 + PALIGNR m3, m6, 14, m0 + mova m4, m7 + psrldq m4, 2 + PRED4x4_LOWPASS m0, m1, m2, m5 + PRED4x4_LOWPASS m1, m3, m4, m7 + mova [r2+r3*4], m0 + mova m2, m1 + psrldq m0, 2 + pslldq m2, 14 + psrldq m1, 2 + por m0, m2 + mova [r2+r1*1], m0 + mova m2, m1 + psrldq m0, 2 + pslldq m2, 14 + psrldq m1, 2 + por m0, m2 + mova [r2+r3*2], m0 + mova m2, m1 + psrldq m0, 2 + pslldq m2, 14 + psrldq m1, 2 + por m0, m2 + mova [r2+r3*1], m0 + mova m2, m1 + psrldq m0, 2 + pslldq m2, 14 + psrldq m1, 2 + por m0, m2 + mova [r0+r3*4], m0 + mova m2, m1 + psrldq m0, 2 + pslldq m2, 14 + psrldq m1, 2 + por m0, m2 + mova [r0+r1*1], m0 + mova m2, m1 + psrldq m0, 2 + pslldq m2, 14 + psrldq m1, 2 + por m0, m2 + mova [r0+r3*2], m0 + psrldq m0, 2 + pslldq m1, 14 + por m0, m1 + mova [r0+r3*1], m0 + RET +%endmacro + +INIT_XMM +%define PALIGNR PALIGNR_MMX +PRED8x8L_DOWN_RIGHT sse2 +%define PALIGNR PALIGNR_SSSE3 +PRED8x8L_DOWN_RIGHT ssse3 + +;----------------------------------------------------------------------------- +; void pred8x8l_vertical_right(pixel *src, int has_topleft, int has_topright, int stride) +;----------------------------------------------------------------------------- +%macro PRED8x8L_VERTICAL_RIGHT 1 +cglobal pred8x8l_vertical_right_10_%1, 4,5,8 + sub r0, r3 + lea r4, [r0+r3*2] + mova m0, [r0+r3*1-16] + punpckhwd m0, [r0+r3*0-16] + mova m1, [r4+r3*1-16] + punpckhwd m1, [r0+r3*2-16] + mov r4, r0 + punpckhdq m1, m0 + lea r0, [r0+r3*4] + mova m2, [r0+r3*1-16] + punpckhwd m2, [r0+r3*0-16] + lea r0, [r0+r3*2] + mova m3, [r0+r3*1-16] + punpckhwd m3, [r0+r3*0-16] + punpckhdq m3, m2 + punpckhqdq m3, m1 + lea r0, [r0+r3*2] + mova m0, [r0+r3*0-16] + mova m1, [r4] + mov r0, r4 + mova m4, m3 + mova m2, m3 + PALIGNR m4, m0, 14, m0 + PALIGNR m1, m2, 2, m2 + test r1, r1 + jz .fix_lt_1 + jmp .do_left +.fix_lt_1: + mova m5, m3 + pxor m5, m4 + psrldq m5, 14 + pslldq m5, 
12 + pxor m1, m5 + jmp .do_left +.fix_lt_2: + mova m5, m3 + pxor m5, m2 + pslldq m5, 14 + psrldq m5, 14 + pxor m2, m5 + test r2, r2 + jnz .do_top +.fix_tr_1: + mova m5, m3 + pxor m5, m1 + psrldq m5, 14 + pslldq m5, 14 + pxor m1, m5 + jmp .do_top +.do_left: + mova m0, m4 + PRED4x4_LOWPASS m2, m1, m4, m3 + mova m7, m2 + mova m0, [r0-16] + mova m3, [r0] + mova m1, [r0+16] + mova m2, m3 + mova m4, m3 + PALIGNR m2, m0, 14, m0 + PALIGNR m1, m4, 2, m4 + test r1, r1 + jz .fix_lt_2 + test r2, r2 + jz .fix_tr_1 +.do_top + PRED4x4_LOWPASS m6, m2, m1, m3 + lea r1, [r3+r3*2] + mova m2, m6 + mova m3, m6 + PALIGNR m3, m7, 14, m0 + PALIGNR m6, m7, 12, m1 + mova m4, m3 + pavgw m3, m2 + lea r2, [r0+r3*4] + PRED4x4_LOWPASS m0, m6, m2, m4 + mova [r0+r3*1], m3 + mova [r0+r3*2], m0 + mova m5, m0 + mova m6, m3 + mova m1, m7 + mova m2, m1 + pslldq m2, 2 + mova m3, m1 + pslldq m3, 4 + PRED4x4_LOWPASS m0, m1, m3, m2 + PALIGNR m6, m0, 14, m2 + mova [r0+r1*1], m6 + pslldq m0, 2 + PALIGNR m5, m0, 14, m1 + mova [r0+r3*4], m5 + pslldq m0, 2 + PALIGNR m6, m0, 14, m2 + mova [r2+r3*1], m6 + pslldq m0, 2 + PALIGNR m5, m0, 14, m1 + mova [r2+r3*2], m5 + pslldq m0, 2 + PALIGNR m6, m0, 14, m2 + mova [r2+r1*1], m6 + pslldq m0, 2 + PALIGNR m5, m0, 14, m1 + mova [r2+r3*4], m5 + RET +%endmacro + +INIT_XMM +%define PALIGNR PALIGNR_MMX +PRED8x8L_VERTICAL_RIGHT sse2 +%define PALIGNR PALIGNR_SSSE3 +PRED8x8L_VERTICAL_RIGHT ssse3 + +;----------------------------------------------------------------------------- +; void pred8x8l_horizontal_up(pixel *src, int has_topleft, int has_topright, int stride) +;----------------------------------------------------------------------------- +%macro PRED8x8L_HORIZONTAL_UP 1 +cglobal pred8x8l_horizontal_up_10_%1, 4,4,8 + sub r0, r3 + lea r2, [r0+r3*2] + mova m0, [r0+r3*1-16] + test r1, r1 + lea r1, [r0+r3] + cmovnz r1, r0 + punpckhwd m0, [r1+r3*0-16] + mova m1, [r2+r3*1-16] + punpckhwd m1, [r0+r3*2-16] + mov r2, r0 + punpckhdq m1, m0 + lea r0, [r0+r3*4] + mova m2, [r0+r3*1-16] + 
punpckhwd m2, [r0+r3*0-16] + lea r0, [r0+r3*2] + mova m3, [r0+r3*1-16] + punpckhwd m3, [r0+r3*0-16] + punpckhdq m3, m2 + punpckhqdq m3, m1 + lea r0, [r0+r3*2] + mova m0, [r0+r3*0-16] + mova m1, [r1+r3*0-16] + mov r0, r2 + mova m4, m3 + mova m2, m3 + PALIGNR m4, m0, 14, m0 + PALIGNR m1, m2, 2, m2 + mova m0, m4 + PRED4x4_LOWPASS m2, m1, m4, m3 + mova m4, m0 + mova m7, m2 + PRED4x4_LOWPASS m1, m3, m0, m4 + pslldq m1, 14 + PALIGNR m7, m1, 14, m3 + lea r1, [r3+r3*2] + pshufd m0, m7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1 + pslldq m7, 14 ; l7 .. .. .. .. .. .. .. + mova m2, m0 + pslld m0, 16 + psrld m2, 16 + por m2, m0 ; l7 l6 l5 l4 l3 l2 l1 l0 + mova m3, m2 + mova m4, m2 + mova m5, m2 + psrldq m2, 2 + psrldq m3, 4 + lea r2, [r0+r3*4] + por m2, m7 ; l7 l7 l6 l5 l4 l3 l2 l1 + punpckhwd m7, m7 + por m3, m7 ; l7 l7 l7 l6 l5 l4 l3 l2 + pavgw m4, m2 + PRED4x4_LOWPASS m1, m3, m5, m2 + mova m5, m4 + punpcklwd m4, m1 ; p4 p3 p2 p1 + punpckhwd m5, m1 ; p8 p7 p6 p5 + mova m6, m5 + mova m7, m5 + mova m0, m5 + PALIGNR m5, m4, 4, m1 + pshufd m1, m6, 11111001b + PALIGNR m6, m4, 8, m2 + pshufd m2, m7, 11111110b + PALIGNR m7, m4, 12, m3 + pshufd m3, m0, 11111111b + mova [r0+r3*1], m4 + mova [r0+r3*2], m5 + mova [r0+r1*1], m6 + mova [r0+r3*4], m7 + mova [r2+r3*1], m0 + mova [r2+r3*2], m1 + mova [r2+r1*1], m2 + mova [r2+r3*4], m3 + RET +%endmacro + +INIT_XMM +%define PALIGNR PALIGNR_MMX +PRED8x8L_HORIZONTAL_UP sse2 +%define PALIGNR PALIGNR_SSSE3 +PRED8x8L_HORIZONTAL_UP ssse3 + + + +;----------------------------------------------------------------------------- +; void pred16x16_vertical(pixel *src, int stride) +;----------------------------------------------------------------------------- +%macro MOV16 3-5 + mova [%1+ 0], %2 + mova [%1+mmsize], %3 +%if mmsize==8 + mova [%1+ 16], %4 + mova [%1+ 24], %5 +%endif +%endmacro + +%macro PRED16x16_VERTICAL 1 +cglobal pred16x16_vertical_10_%1, 2,3 + sub r0, r1 + mov r2, 8 + mova m0, [r0+ 0] + mova m1, [r0+mmsize] +%if mmsize==8 + mova m2, [r0+16] + 
mova m3, [r0+24] +%endif +.loop: + MOV16 r0+r1*1, m0, m1, m2, m3 + MOV16 r0+r1*2, m0, m1, m2, m3 + lea r0, [r0+r1*2] + dec r2 + jg .loop + REP_RET +%endmacro + +INIT_MMX +PRED16x16_VERTICAL mmxext +INIT_XMM +PRED16x16_VERTICAL sse2 + +;----------------------------------------------------------------------------- +; void pred16x16_horizontal(pixel *src, int stride) +;----------------------------------------------------------------------------- +%macro PRED16x16_HORIZONTAL 1 +cglobal pred16x16_horizontal_10_%1, 2,3 + mov r2, 8 +.vloop: + movd m0, [r0+r1*0-4] + movd m1, [r0+r1*1-4] + SPLATW m0, m0, 1 + SPLATW m1, m1, 1 + MOV16 r0+r1*0, m0, m0, m0, m0 + MOV16 r0+r1*1, m1, m1, m1, m1 + lea r0, [r0+r1*2] + dec r2 + jg .vloop ; 8 passes of 2 rows each = 16 rows (jge would run a 9th pass) + REP_RET +%endmacro + +INIT_MMX +PRED16x16_HORIZONTAL mmxext +INIT_XMM +PRED16x16_HORIZONTAL sse2 diff --git a/libavcodec/x86/h264_intrapred_init.c b/libavcodec/x86/h264_intrapred_init.c index 9d6726c31c..a0c5164db1 100644 --- a/libavcodec/x86/h264_intrapred_init.c +++ b/libavcodec/x86/h264_intrapred_init.c @@ -43,9 +43,41 @@ PRED4x4(horizontal_down, 10, avx) #define PRED8x8(TYPE, DEPTH, OPT) \ void ff_pred8x8_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, int stride); +PRED8x8(dc, 10, mmxext) +PRED8x8(dc, 10, sse2) +PRED8x8(top_dc, 10, mmxext) +PRED8x8(top_dc, 10, sse2) PRED8x8(vertical, 10, sse2) PRED8x8(horizontal, 10, sse2) +#define PRED8x8L(TYPE, DEPTH, OPT)\ +void ff_pred8x8l_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, int has_topleft, int has_topright, int stride); + +PRED8x8L(dc, 10, sse2) +PRED8x8L(dc, 10, ssse3) +PRED8x8L(top_dc, 10, sse2) +PRED8x8L(top_dc, 10, ssse3) +PRED8x8L(vertical, 10, sse2) +PRED8x8L(vertical, 10, ssse3) +PRED8x8L(horizontal, 10, sse2) +PRED8x8L(horizontal, 10, ssse3) +PRED8x8L(down_left, 10, sse2) +PRED8x8L(down_left, 10, ssse3) +PRED8x8L(down_right, 10, sse2) +PRED8x8L(down_right, 10, ssse3) +PRED8x8L(vertical_right, 10, sse2) +PRED8x8L(vertical_right, 10, ssse3) +PRED8x8L(horizontal_up, 10, sse2)
+PRED8x8L(horizontal_up, 10, ssse3) + +#define PRED16x16(TYPE, DEPTH, OPT)\ +void ff_pred16x16_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, int stride); + +PRED16x16(vertical, 10, mmxext) +PRED16x16(vertical, 10, sse2) +PRED16x16(horizontal, 10, mmxext) +PRED16x16(horizontal, 10, sse2) + void ff_pred16x16_vertical_mmx (uint8_t *src, int stride); void ff_pred16x16_vertical_sse (uint8_t *src, int stride); void ff_pred16x16_horizontal_mmx (uint8_t *src, int stride); @@ -253,6 +285,12 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth if (mm_flags & AV_CPU_FLAG_MMX2) { h->pred4x4[DC_PRED ] = ff_pred4x4_dc_10_mmxext; h->pred4x4[HOR_UP_PRED ] = ff_pred4x4_horizontal_up_10_mmxext; + + h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_mmxext; + h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_10_mmxext; + + h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_10_mmxext; + h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_10_mmxext; } if (mm_flags & AV_CPU_FLAG_SSE2) { h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_sse2; @@ -261,13 +299,33 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_sse2; h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_sse2; + h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_sse2; + h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_10_sse2; h->pred8x8[VERT_PRED8x8 ] = ff_pred8x8_vertical_10_sse2; h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_horizontal_10_sse2; + + h->pred8x8l[VERT_PRED ] = ff_pred8x8l_vertical_10_sse2; + h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_sse2; + h->pred8x8l[DC_PRED ] = ff_pred8x8l_dc_10_sse2; + h->pred8x8l[TOP_DC_PRED ] = ff_pred8x8l_top_dc_10_sse2; + h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_sse2; + h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_sse2; + h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_sse2; + h->pred8x8l[HOR_UP_PRED ] = 
ff_pred8x8l_horizontal_up_10_sse2; + + h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_10_sse2; + h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_10_sse2; } if (mm_flags & AV_CPU_FLAG_SSSE3) { h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_ssse3; h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_ssse3; h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_ssse3; + + h->pred8x8l[VERT_PRED ] = ff_pred8x8l_vertical_10_ssse3; + h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_ssse3; + h->pred8x8l[DC_PRED ] = ff_pred8x8l_dc_10_ssse3; + h->pred8x8l[TOP_DC_PRED ] = ff_pred8x8l_top_dc_10_ssse3; + h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_ssse3; } #if HAVE_AVX if (mm_flags & AV_CPU_FLAG_AVX) { diff --git a/libavcodec/x86/x86util.asm b/libavcodec/x86/x86util.asm index 141e96000c..45196625fe 100644 --- a/libavcodec/x86/x86util.asm +++ b/libavcodec/x86/x86util.asm @@ -528,6 +528,14 @@ %endif %endmacro +%macro SPLATD 2-3 0 +%if mmsize == 16 + pshufd %1, %2, (%3)*0x55 +%else + pshufw %1, %2, (%3)*0x11 + ((%3)+1)*0x44 +%endif +%endmacro + %macro CLIPW 3 ;(dst, min, max) pmaxsw %1, %2 pminsw %1, %3 From bbdd52ed343cb594d4af07f9b369df6a00117774 Mon Sep 17 00:00:00 2001 From: Jason Garrett-Glaser Date: Tue, 5 Jul 2011 18:08:22 -0700 Subject: [PATCH 09/22] H.264: av_always_inline some more functions These weren't getting inlined all the time in all gcc versions. 
--- libavcodec/h264.c | 4 ++-- libavcodec/h264.h | 4 ++-- libavcodec/h264_mvpred.h | 8 ++++---- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/libavcodec/h264.c b/libavcodec/h264.c index dbac4fda95..f7c0345daf 100644 --- a/libavcodec/h264.c +++ b/libavcodec/h264.c @@ -1443,7 +1443,7 @@ static void decode_postinit(H264Context *h, int setup_finished){ ff_thread_finish_setup(s->avctx); } -static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int chroma444, int simple){ +static av_always_inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int chroma444, int simple){ MpegEncContext * const s = &h->s; uint8_t *top_border; int top_idx = 1; @@ -1518,7 +1518,7 @@ static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src } } -static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, +static av_always_inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int chroma444, diff --git a/libavcodec/h264.h b/libavcodec/h264.h index dad06e2007..a2abab9d9b 100644 --- a/libavcodec/h264.h +++ b/libavcodec/h264.h @@ -766,11 +766,11 @@ static av_always_inline uint16_t pack8to16(int a, int b){ /** * gets the chroma qp. 
*/ -static inline int get_chroma_qp(H264Context *h, int t, int qscale){ +static av_always_inline int get_chroma_qp(H264Context *h, int t, int qscale){ return h->pps.chroma_qp_table[t][qscale]; } -static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my); +static av_always_inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my); static void fill_decode_neighbors(H264Context *h, int mb_type){ MpegEncContext * const s = &h->s; diff --git a/libavcodec/h264_mvpred.h b/libavcodec/h264_mvpred.h index f603e7ff38..c4e65b5847 100644 --- a/libavcodec/h264_mvpred.h +++ b/libavcodec/h264_mvpred.h @@ -35,7 +35,7 @@ //#undef NDEBUG #include -static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){ +static av_always_inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){ const int topright_ref= h->ref_cache[list][ i - 8 + part_width ]; MpegEncContext *s = &h->s; @@ -92,7 +92,7 @@ static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, in * @param mx the x component of the predicted motion vector * @param my the y component of the predicted motion vector */ -static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){ +static av_always_inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){ const int index8= scan8[n]; const int top_ref= h->ref_cache[list][ index8 - 8 ]; const int left_ref= h->ref_cache[list][ index8 - 1 ]; @@ -147,7 +147,7 @@ static inline void pred_motion(H264Context * const h, int n, int part_width, int * @param mx the x component of the predicted motion vector * @param my the y component of the predicted motion vector */ -static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){ +static 
av_always_inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){ if(n==0){ const int top_ref= h->ref_cache[list][ scan8[0] - 8 ]; const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ]; @@ -182,7 +182,7 @@ static inline void pred_16x8_motion(H264Context * const h, int n, int list, int * @param mx the x component of the predicted motion vector * @param my the y component of the predicted motion vector */ -static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){ +static av_always_inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){ if(n==0){ const int left_ref= h->ref_cache[list][ scan8[0] - 1 ]; const int16_t * const A= h->mv_cache[list][ scan8[0] - 1 ]; From 5136ba7c690f50ebe12bba6e3320a18e1d4fd936 Mon Sep 17 00:00:00 2001 From: Jason Garrett-Glaser Date: Tue, 5 Jul 2011 17:55:14 -0700 Subject: [PATCH 10/22] H.264: faster P-SKIP decoding Inline the relevant parts of fill_decode_caches into P-SKIP mv prediction to avoid calling the whole thing. --- libavcodec/h264.h | 1 - libavcodec/h264_mvpred.h | 111 +++++++++++++++++++++++++++++++++++---- 2 files changed, 102 insertions(+), 10 deletions(-) diff --git a/libavcodec/h264.h b/libavcodec/h264.h index a2abab9d9b..d34e6db573 100644 --- a/libavcodec/h264.h +++ b/libavcodec/h264.h @@ -1331,7 +1331,6 @@ static void av_unused decode_mb_skip(H264Context *h){ mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP; fill_decode_neighbors(h, mb_type); - fill_decode_caches(h, mb_type); //FIXME check what is needed and what not ... 
pred_pskip_motion(h, &mx, &my); fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1); fill_rectangle( h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4); diff --git a/libavcodec/h264_mvpred.h b/libavcodec/h264_mvpred.h index c4e65b5847..8159f8a0dc 100644 --- a/libavcodec/h264_mvpred.h +++ b/libavcodec/h264_mvpred.h @@ -213,21 +213,114 @@ static av_always_inline void pred_8x16_motion(H264Context * const h, int n, int pred_motion(h, n, 2, list, ref, mx, my); } -static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){ - const int top_ref = h->ref_cache[0][ scan8[0] - 8 ]; - const int left_ref= h->ref_cache[0][ scan8[0] - 1 ]; +#define FIX_MV_MBAFF(type, refn, mvn, idx)\ + if(FRAME_MBAFF){\ + if(MB_FIELD){\ + if(!IS_INTERLACED(type)){\ + refn <<= 1;\ + AV_COPY32(mvbuf[idx], mvn);\ + mvbuf[idx][1] /= 2;\ + mvn = mvbuf[idx];\ + }\ + }else{\ + if(IS_INTERLACED(type)){\ + refn >>= 1;\ + AV_COPY32(mvbuf[idx], mvn);\ + mvbuf[idx][1] <<= 1;\ + mvn = mvbuf[idx];\ + }\ + }\ + } - tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y); - - if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE - || !( top_ref | AV_RN32A(h->mv_cache[0][ scan8[0] - 8 ])) - || !(left_ref | AV_RN32A(h->mv_cache[0][ scan8[0] - 1 ]))){ +static av_always_inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){ + DECLARE_ALIGNED(4, static const int16_t, zeromv)[2] = {0}; + DECLARE_ALIGNED(4, int16_t, mvbuf)[3][2]; + MpegEncContext * const s = &h->s; + int8_t *ref = s->current_picture.ref_index[0]; + int16_t (*mv)[2] = s->current_picture.motion_val[0]; + int top_ref, left_ref, diagonal_ref, match_count; + const int16_t *A, *B, *C; + int b_stride = h->b_stride; + /* To avoid doing an entire fill_decode_caches, we inline the relevant parts here. + * FIXME: this is a partial duplicate of the logic in fill_decode_caches, but it's + * faster this way. 
Is there a way to avoid this duplication? + */ + if(USES_LIST(h->left_type[LTOP], 0)){ + left_ref = ref[4*h->left_mb_xy[LTOP] + 1 + (h->left_block[0]&~1)]; + A = mv[h->mb2b_xy[h->left_mb_xy[LTOP]] + 3 + b_stride*h->left_block[0]]; + FIX_MV_MBAFF(h->left_type[LTOP], left_ref, A, 0); + if(!(left_ref | AV_RN32A(A))){ + *mx = *my = 0; + return; + } + }else if(h->left_type[LTOP]){ + left_ref = LIST_NOT_USED; + A = zeromv; + }else{ *mx = *my = 0; return; } - pred_motion(h, 0, 4, 0, 0, mx, my); + if(USES_LIST(h->top_type, 0)){ + top_ref = ref[4*h->top_mb_xy + 2]; + B = mv[h->mb2b_xy[h->top_mb_xy] + 3*b_stride]; + FIX_MV_MBAFF(h->top_type, top_ref, B, 1); + if(!(top_ref | AV_RN32A(B))){ + *mx = *my = 0; + return; + } + }else if(h->top_type){ + top_ref = LIST_NOT_USED; + B = zeromv; + }else{ + *mx = *my = 0; + return; + } + + tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y); + + if(USES_LIST(h->topright_type, 0)){ + diagonal_ref = ref[4*h->topright_mb_xy + 2]; + C = mv[h->mb2b_xy[h->topright_mb_xy] + 3*b_stride]; + FIX_MV_MBAFF(h->topright_type, diagonal_ref, C, 2); + }else if(h->topright_type){ + diagonal_ref = LIST_NOT_USED; + C = zeromv; + }else{ + if(USES_LIST(h->topleft_type, 0)){ + diagonal_ref = ref[4*h->topleft_mb_xy + 1 + (h->topleft_partition & 2)]; + C = mv[h->mb2b_xy[h->topleft_mb_xy] + 3 + b_stride + (h->topleft_partition & 2*b_stride)]; + FIX_MV_MBAFF(h->topleft_type, diagonal_ref, C, 2); + }else if(h->topleft_type){ + diagonal_ref = LIST_NOT_USED; + C = zeromv; + }else{ + diagonal_ref = PART_NOT_AVAILABLE; + C = zeromv; + } + } + + match_count= !diagonal_ref + !top_ref + !left_ref; + tprintf(h->s.avctx, "pred_pskip_motion match_count=%d\n", match_count); + if(match_count > 1){ + *mx= mid_pred(A[0], B[0], C[0]); + *my= mid_pred(A[1], B[1], C[1]); + }else if(match_count==1){ + if(!left_ref){ + *mx= A[0]; + *my= A[1]; + }else if(!top_ref){ + *mx= B[0]; + *my= B[1]; + }else{ + *mx= C[0]; + *my= C[1]; + } + }else{ 
+ *mx= mid_pred(A[0], B[0], C[0]); + *my= mid_pred(A[1], B[1], C[1]); + } return; } From ef0c5948018216aadae18e5626ed0b1b95668f4f Mon Sep 17 00:00:00 2001 From: Jason Garrett-Glaser Date: Wed, 6 Jul 2011 07:58:50 -0700 Subject: [PATCH 11/22] H.264: merge fill_rectangle into P-SKIP MV prediction, to match B-SKIP --- libavcodec/h264.h | 7 ++----- libavcodec/h264_mvpred.h | 42 +++++++++++++++++++++------------------- 2 files changed, 24 insertions(+), 25 deletions(-) diff --git a/libavcodec/h264.h b/libavcodec/h264.h index d34e6db573..e88005aa3e 100644 --- a/libavcodec/h264.h +++ b/libavcodec/h264.h @@ -770,7 +770,7 @@ static av_always_inline int get_chroma_qp(H264Context *h, int t, int qscale){ return h->pps.chroma_qp_table[t][qscale]; } -static av_always_inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my); +static av_always_inline void pred_pskip_motion(H264Context * const h); static void fill_decode_neighbors(H264Context *h, int mb_type){ MpegEncContext * const s = &h->s; @@ -1327,13 +1327,10 @@ static void av_unused decode_mb_skip(H264Context *h){ } else { - int mx, my; mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP; fill_decode_neighbors(h, mb_type); - pred_pskip_motion(h, &mx, &my); - fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1); - fill_rectangle( h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4); + pred_pskip_motion(h); } write_back_motion(h, mb_type); diff --git a/libavcodec/h264_mvpred.h b/libavcodec/h264_mvpred.h index 8159f8a0dc..6dceee3c90 100644 --- a/libavcodec/h264_mvpred.h +++ b/libavcodec/h264_mvpred.h @@ -232,16 +232,18 @@ static av_always_inline void pred_8x16_motion(H264Context * const h, int n, int }\ } -static av_always_inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){ +static av_always_inline void pred_pskip_motion(H264Context * const h){ DECLARE_ALIGNED(4, static const int16_t, zeromv)[2] = {0}; DECLARE_ALIGNED(4, int16_t, mvbuf)[3][2]; 
MpegEncContext * const s = &h->s; int8_t *ref = s->current_picture.ref_index[0]; int16_t (*mv)[2] = s->current_picture.motion_val[0]; - int top_ref, left_ref, diagonal_ref, match_count; + int top_ref, left_ref, diagonal_ref, match_count, mx, my; const int16_t *A, *B, *C; int b_stride = h->b_stride; + fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1); + /* To avoid doing an entire fill_decode_caches, we inline the relevant parts here. * FIXME: this is a partial duplicate of the logic in fill_decode_caches, but it's * faster this way. Is there a way to avoid this duplication? @@ -251,15 +253,13 @@ static av_always_inline void pred_pskip_motion(H264Context * const h, int * cons A = mv[h->mb2b_xy[h->left_mb_xy[LTOP]] + 3 + b_stride*h->left_block[0]]; FIX_MV_MBAFF(h->left_type[LTOP], left_ref, A, 0); if(!(left_ref | AV_RN32A(A))){ - *mx = *my = 0; - return; + goto zeromv; } }else if(h->left_type[LTOP]){ left_ref = LIST_NOT_USED; A = zeromv; }else{ - *mx = *my = 0; - return; + goto zeromv; } if(USES_LIST(h->top_type, 0)){ @@ -267,15 +267,13 @@ static av_always_inline void pred_pskip_motion(H264Context * const h, int * cons B = mv[h->mb2b_xy[h->top_mb_xy] + 3*b_stride]; FIX_MV_MBAFF(h->top_type, top_ref, B, 1); if(!(top_ref | AV_RN32A(B))){ - *mx = *my = 0; - return; + goto zeromv; } }else if(h->top_type){ top_ref = LIST_NOT_USED; B = zeromv; }else{ - *mx = *my = 0; - return; + goto zeromv; } tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y); @@ -304,24 +302,28 @@ static av_always_inline void pred_pskip_motion(H264Context * const h, int * cons match_count= !diagonal_ref + !top_ref + !left_ref; tprintf(h->s.avctx, "pred_pskip_motion match_count=%d\n", match_count); if(match_count > 1){ - *mx= mid_pred(A[0], B[0], C[0]); - *my= mid_pred(A[1], B[1], C[1]); + mx = mid_pred(A[0], B[0], C[0]); + my = mid_pred(A[1], B[1], C[1]); }else if(match_count==1){ if(!left_ref){ - *mx= A[0]; - *my= A[1]; + mx = A[0]; + my = A[1]; 
}else if(!top_ref){ - *mx= B[0]; - *my= B[1]; + mx = B[0]; + my = B[1]; }else{ - *mx= C[0]; - *my= C[1]; + mx = C[0]; + my = C[1]; } }else{ - *mx= mid_pred(A[0], B[0], C[0]); - *my= mid_pred(A[1], B[1], C[1]); + mx = mid_pred(A[0], B[0], C[0]); + my = mid_pred(A[1], B[1], C[1]); } + fill_rectangle( h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4); + return; +zeromv: + fill_rectangle( h->mv_cache[0][scan8[0]], 4, 4, 8, 0, 4); return; } From 298e52c99caabebba63bb2b7074d8c4b5c2c0f92 Mon Sep 17 00:00:00 2001 From: Jason Garrett-Glaser Date: Wed, 6 Jul 2011 12:26:04 -0700 Subject: [PATCH 12/22] H.264: Remove redundant hl_motion_16/8 code --- libavcodec/h264.c | 52 +++++++++-------------------------------------- 1 file changed, 10 insertions(+), 42 deletions(-) diff --git a/libavcodec/h264.c b/libavcodec/h264.c index f7c0345daf..f79f4d1d35 100644 --- a/libavcodec/h264.c +++ b/libavcodec/h264.c @@ -778,24 +778,6 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t prefetch_motion(h, 1, pixel_shift, chroma444); } -#define hl_motion_fn(sh, bits) \ -static av_always_inline void hl_motion_ ## bits(H264Context *h, \ - uint8_t *dest_y, \ - uint8_t *dest_cb, uint8_t *dest_cr, \ - qpel_mc_func (*qpix_put)[16], \ - h264_chroma_mc_func (*chroma_put), \ - qpel_mc_func (*qpix_avg)[16], \ - h264_chroma_mc_func (*chroma_avg), \ - h264_weight_func *weight_op, \ - h264_biweight_func *weight_avg, \ - int chroma444) \ -{ \ - hl_motion(h, dest_y, dest_cb, dest_cr, qpix_put, chroma_put, \ - qpix_avg, chroma_avg, weight_op, weight_avg, sh, chroma444); \ -} -hl_motion_fn(0, 8); -hl_motion_fn(1, 16); - static void free_tables(H264Context *h, int free_rbsp){ int i; H264Context *hx; @@ -1876,18 +1858,11 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i if(h->deblocking_filter) xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, 0, simple, pixel_shift); }else if(is_h264){ - if (pixel_shift) { - hl_motion_16(h, 
dest_y, dest_cb, dest_cr, - s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab, - s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab, - h->h264dsp.weight_h264_pixels_tab, - h->h264dsp.biweight_h264_pixels_tab, 0); - } else - hl_motion_8(h, dest_y, dest_cb, dest_cr, - s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab, - s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab, - h->h264dsp.weight_h264_pixels_tab, - h->h264dsp.biweight_h264_pixels_tab, 0); + hl_motion(h, dest_y, dest_cb, dest_cr, + s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab, + s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab, + h->h264dsp.weight_h264_pixels_tab, + h->h264dsp.biweight_h264_pixels_tab, pixel_shift, 0); } hl_decode_mb_idct_luma(h, mb_type, is_h264, simple, transform_bypass, pixel_shift, block_offset, linesize, dest_y, 0); @@ -2017,18 +1992,11 @@ static av_always_inline void hl_decode_mb_444_internal(H264Context *h, int simpl if(h->deblocking_filter) xchg_mb_border(h, dest[0], dest[1], dest[2], linesize, linesize, 0, 1, simple, pixel_shift); }else{ - if (pixel_shift) { - hl_motion_16(h, dest[0], dest[1], dest[2], - s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab, - s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab, - h->h264dsp.weight_h264_pixels_tab, - h->h264dsp.biweight_h264_pixels_tab, 1); - } else - hl_motion_8(h, dest[0], dest[1], dest[2], - s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab, - s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab, - h->h264dsp.weight_h264_pixels_tab, - h->h264dsp.biweight_h264_pixels_tab, 1); + hl_motion(h, dest[0], dest[1], dest[2], + s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab, + s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab, + h->h264dsp.weight_h264_pixels_tab, + h->h264dsp.biweight_h264_pixels_tab, pixel_shift, 1); } for (p = 0; p < plane_count; p++) From 99b6d2c065c3823e77e23cadaf9077ca954b36ff Mon Sep 17 00:00:00 2001 From: Jason Garrett-Glaser Date: Wed, 6 Jul 2011 13:25:13 -0700 Subject: [PATCH 13/22] H.264: use fill_rectangle in 
CABAC decoding --- libavcodec/h264_cabac.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/libavcodec/h264_cabac.c b/libavcodec/h264_cabac.c index 6dacf7a336..390a7b6587 100644 --- a/libavcodec/h264_cabac.c +++ b/libavcodec/h264_cabac.c @@ -1818,8 +1818,7 @@ static av_always_inline void decode_cabac_luma_residual( H264Context *h, const u } } } else { - uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8+16*p] ]; - nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0; + fill_rectangle(&h->non_zero_count_cache[scan8[4*i8x8+16*p]], 2, 2, 8, 0, 1); } } } From 6a2176aac05e1edbcdf8fb9c26d572d092a00c3c Mon Sep 17 00:00:00 2001 From: Jason Garrett-Glaser Date: Wed, 6 Jul 2011 14:08:30 -0700 Subject: [PATCH 14/22] H.264: improve qp_thresh check Eliminate redundant check in filter_mb_fast, consider bit depth in calculating qp_thresh. --- libavcodec/h264.c | 4 +++- libavcodec/h264_loopfilter.c | 6 +----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/libavcodec/h264.c b/libavcodec/h264.c index f79f4d1d35..33c9527946 100644 --- a/libavcodec/h264.c +++ b/libavcodec/h264.c @@ -2931,7 +2931,9 @@ static int decode_slice_header(H264Context *h, H264Context *h0){ } } } - h->qp_thresh= 15 + 52 - FFMIN(h->slice_alpha_c0_offset, h->slice_beta_offset) - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]); + h->qp_thresh = 15 + 52 - FFMIN(h->slice_alpha_c0_offset, h->slice_beta_offset) + - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]) + + 6 * (h->sps.bit_depth_luma - 8); #if 0 //FMO if( h->pps.num_slice_groups > 1 && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5) diff --git a/libavcodec/h264_loopfilter.c b/libavcodec/h264_loopfilter.c index c7163472d3..5de9f784d9 100644 --- a/libavcodec/h264_loopfilter.c +++ b/libavcodec/h264_loopfilter.c @@ -216,7 +216,7 @@ void ff_h264_filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, MpegEncContext * const s = &h->s; int 
mb_xy; int mb_type, left_type, top_type; - int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh; + int qp, qp0, qp1, qpc, qpc0, qpc1; int chroma = !(CONFIG_GRAY && (s->flags&CODEC_FLAG_GRAY)); int chroma444 = CHROMA444; @@ -241,10 +241,6 @@ void ff_h264_filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, qp1 = (qp + qp1 + 1) >> 1; qpc0 = (qpc + qpc0 + 1) >> 1; qpc1 = (qpc + qpc1 + 1) >> 1; - qp_thresh = 15+52 - h->slice_alpha_c0_offset; - if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh && - qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh) - return; if( IS_INTRA(mb_type) ) { static const int16_t bS4[4] = {4,4,4,4}; From d8dbe20241558a1b4439ac33120caa4dafebf418 Mon Sep 17 00:00:00 2001 From: Diego Biurrun Date: Thu, 7 Jul 2011 01:31:04 +0200 Subject: [PATCH 15/22] libxvid: Add const qualifier to silence compiler warning. libavcodec/libxvidff.c:752: warning: initialization discards qualifiers from pointer target type --- libavcodec/libxvidff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/libxvidff.c b/libavcodec/libxvidff.c index fd0aea58fa..3ca4d1b2ec 100644 --- a/libavcodec/libxvidff.c +++ b/libavcodec/libxvidff.c @@ -749,7 +749,7 @@ static int xvid_ff_2pass_before(struct xvid_context *ref, static int xvid_ff_2pass_after(struct xvid_context *ref, xvid_plg_data_t *param) { char *log = ref->twopassbuffer; - char *frame_types = " ipbs"; + const char *frame_types = " ipbs"; char frame_type; /* Quick bounds check */ From 99fecc64b064a013559d3d61f7d9790e3c95c80e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomas=20H=C3=A4rdin?= Date: Thu, 23 Jun 2011 15:59:33 +0200 Subject: [PATCH 16/22] gxf: Fix 25 fps DV material in GXF being misdetected as 50 fps Set DV packet durations using fields_per_frame. This requires turning gxf_stream_info into the demuxer's context for access to the value in gxf_packet(). Since MPEG-2 seems to work fine this done only for DV. 
Signed-off-by: Anton Khirnov --- libavformat/gxf.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/libavformat/gxf.c b/libavformat/gxf.c index 54fdadfc32..062a147a0c 100644 --- a/libavformat/gxf.c +++ b/libavformat/gxf.c @@ -264,7 +264,7 @@ static int gxf_header(AVFormatContext *s, AVFormatParameters *ap) { int map_len; int len; AVRational main_timebase = {0, 0}; - struct gxf_stream_info si; + struct gxf_stream_info *si = s->priv_data; int i; if (!parse_packet_header(pb, &pkt_type, &map_len) || pkt_type != PKT_MAP) { av_log(s, AV_LOG_ERROR, "map packet not found\n"); @@ -282,7 +282,7 @@ static int gxf_header(AVFormatContext *s, AVFormatParameters *ap) { return 0; } map_len -= len; - gxf_material_tags(pb, &len, &si); + gxf_material_tags(pb, &len, si); avio_skip(pb, len); map_len -= 2; len = avio_rb16(pb); // length of track description @@ -300,7 +300,7 @@ static int gxf_header(AVFormatContext *s, AVFormatParameters *ap) { track_id = avio_r8(pb); track_len = avio_rb16(pb); len -= track_len; - gxf_track_tags(pb, &track_len, &si); + gxf_track_tags(pb, &track_len, si); avio_skip(pb, track_len); if (!(track_type & 0x80)) { av_log(s, AV_LOG_ERROR, "invalid track type %x\n", track_type); @@ -316,12 +316,12 @@ static int gxf_header(AVFormatContext *s, AVFormatParameters *ap) { if (idx < 0) continue; st = s->streams[idx]; if (!main_timebase.num || !main_timebase.den) { - main_timebase.num = si.frames_per_second.den; - main_timebase.den = si.frames_per_second.num * 2; + main_timebase.num = si->frames_per_second.den; + main_timebase.den = si->frames_per_second.num * 2; } - st->start_time = si.first_field; - if (si.first_field != AV_NOPTS_VALUE && si.last_field != AV_NOPTS_VALUE) - st->duration = si.last_field - si.first_field; + st->start_time = si->first_field; + if (si->first_field != AV_NOPTS_VALUE && si->last_field != AV_NOPTS_VALUE) + st->duration = si->last_field - si->first_field; } if (len < 0) av_log(s, AV_LOG_ERROR, 
"invalid track description length specified\n"); @@ -422,6 +422,8 @@ static int gxf_packet(AVFormatContext *s, AVPacket *pkt) { AVIOContext *pb = s->pb; GXFPktType pkt_type; int pkt_len; + struct gxf_stream_info *si = s->priv_data; + while (!pb->eof_reached) { AVStream *st; int track_type, track_id, ret; @@ -473,6 +475,11 @@ static int gxf_packet(AVFormatContext *s, AVPacket *pkt) { avio_skip(pb, skip); pkt->stream_index = stream_index; pkt->dts = field_nr; + + //set duration manually for DV or else lavf misdetects the frame rate + if (st->codec->codec_id == CODEC_ID_DVVIDEO) + pkt->duration = si->fields_per_frame; + return ret; } return AVERROR(EIO); @@ -518,7 +525,7 @@ static int64_t gxf_read_timestamp(AVFormatContext *s, int stream_index, AVInputFormat ff_gxf_demuxer = { "gxf", NULL_IF_CONFIG_SMALL("GXF format"), - 0, + sizeof(struct gxf_stream_info), gxf_probe, gxf_header, gxf_packet, From 84d098d943bacbc4a3c71c3abeef229d3791b6f7 Mon Sep 17 00:00:00 2001 From: Matthew Hoops Date: Tue, 5 Jul 2011 12:21:22 -0400 Subject: [PATCH 17/22] segafilm: add support for videos with cri adx adpcm Signed-off-by: Ronald S. 
Bultje --- libavformat/segafilm.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/libavformat/segafilm.c b/libavformat/segafilm.c index 0ad5fbbe0b..0886825523 100644 --- a/libavformat/segafilm.c +++ b/libavformat/segafilm.c @@ -111,7 +111,9 @@ static int film_read_header(AVFormatContext *s, film->audio_samplerate = AV_RB16(&scratch[24]); film->audio_channels = scratch[21]; film->audio_bits = scratch[22]; - if (film->audio_bits == 8) + if (scratch[23] == 2) + film->audio_type = CODEC_ID_ADPCM_ADX; + else if (film->audio_bits == 8) film->audio_type = CODEC_ID_PCM_S8; else if (film->audio_bits == 16) film->audio_type = CODEC_ID_PCM_S16BE; @@ -149,12 +151,19 @@ static int film_read_header(AVFormatContext *s, st->codec->codec_id = film->audio_type; st->codec->codec_tag = 1; st->codec->channels = film->audio_channels; - st->codec->bits_per_coded_sample = film->audio_bits; st->codec->sample_rate = film->audio_samplerate; + + if (film->audio_type == CODEC_ID_ADPCM_ADX) { + st->codec->bits_per_coded_sample = 18 * 8 / 32; + st->codec->block_align = st->codec->channels * 18; + } else { + st->codec->bits_per_coded_sample = film->audio_bits; + st->codec->block_align = st->codec->channels * + st->codec->bits_per_coded_sample / 8; + } + st->codec->bit_rate = st->codec->channels * st->codec->sample_rate * st->codec->bits_per_coded_sample; - st->codec->block_align = st->codec->channels * - st->codec->bits_per_coded_sample / 8; } /* load the sample table */ @@ -187,8 +196,12 @@ static int film_read_header(AVFormatContext *s, film->sample_table[i].pts *= film->base_clock; film->sample_table[i].pts /= film->audio_samplerate; - audio_frame_counter += (film->sample_table[i].sample_size / - (film->audio_channels * film->audio_bits / 8)); + if (film->audio_type == CODEC_ID_ADPCM_ADX) + audio_frame_counter += (film->sample_table[i].sample_size * 32 / + (18 * film->audio_channels)); + else + audio_frame_counter += 
(film->sample_table[i].sample_size / + (film->audio_channels * film->audio_bits / 8)); } else { film->sample_table[i].stream = film->video_stream_index; film->sample_table[i].pts = AV_RB32(&scratch[8]) & 0x7FFFFFFF; @@ -227,7 +240,8 @@ static int film_read_packet(AVFormatContext *s, return AVERROR(ENOMEM); avio_read(pb, pkt->data, sample->sample_size); } else if ((sample->stream == film->audio_stream_index) && - (film->audio_channels == 2)) { + (film->audio_channels == 2) && + (film->audio_type != CODEC_ID_ADPCM_ADX)) { /* stereo PCM needs to be interleaved */ if (av_new_packet(pkt, sample->sample_size)) From f98c9fb27de84dc4f6123537b754eb2fe1a80c02 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Wed, 29 Jun 2011 14:13:18 -0700 Subject: [PATCH 18/22] mpeg1video: don't abort if thread_count is too high. Instead, just decrease it to a valid value and use that. Signed-off-by: Ronald S. Bultje --- libavcodec/mpegvideo.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/libavcodec/mpegvideo.c b/libavcodec/mpegvideo.c index ceed41f230..aec75b541e 100644 --- a/libavcodec/mpegvideo.c +++ b/libavcodec/mpegvideo.c @@ -575,7 +575,11 @@ void MPV_decode_defaults(MpegEncContext *s){ */ av_cold int MPV_common_init(MpegEncContext *s) { - int y_size, c_size, yc_size, i, mb_array_size, mv_table_size, x, y, threads; + int y_size, c_size, yc_size, i, mb_array_size, mv_table_size, x, y, + threads = (s->encoding || + (HAVE_THREADS && + s->avctx->active_thread_type & FF_THREAD_SLICE)) ? 
+ s->avctx->thread_count : 1; if(s->codec_id == CODEC_ID_MPEG2VIDEO && !s->progressive_sequence) s->mb_height = (s->height + 31) / 32 * 2; @@ -589,8 +593,10 @@ av_cold int MPV_common_init(MpegEncContext *s) if((s->encoding || (s->avctx->active_thread_type & FF_THREAD_SLICE)) && (s->avctx->thread_count > MAX_THREADS || (s->avctx->thread_count > s->mb_height && s->mb_height))){ - av_log(s->avctx, AV_LOG_ERROR, "too many threads\n"); - return -1; + int max_threads = FFMIN(MAX_THREADS, s->mb_height); + av_log(s->avctx, AV_LOG_WARNING, "too many threads (%d), reducing to %d\n", + s->avctx->thread_count, max_threads); + threads = max_threads; } if((s->width || s->height) && av_image_check_size(s->width, s->height, 0, s->avctx)) @@ -752,8 +758,6 @@ av_cold int MPV_common_init(MpegEncContext *s) if (s->width && s->height) { if (s->encoding || (HAVE_THREADS && s->avctx->active_thread_type&FF_THREAD_SLICE)) { - threads = s->avctx->thread_count; - for(i=1; i<threads; i++){ s->thread_context[i]= av_malloc(sizeof(MpegEncContext)); memcpy(s->thread_context[i], s, sizeof(MpegEncContext)); From 4c98976124f611b1a475bccb1b5177ff536be79c Mon Sep 17 00:00:00 2001 From: Diego Biurrun Date: Fri, 8 Jul 2011 15:34:56 +0200 Subject: [PATCH 19/22] doc: replace @pxref by @ref where appropriate --- doc/developer.texi | 4 ++-- doc/filters.texi | 8 +++----- doc/muxers.texi | 4 ++-- doc/texi2pod.pl | 1 + 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/doc/developer.texi b/doc/developer.texi index 37b9f3e889..cae4f3df5e 100644 --- a/doc/developer.texi +++ b/doc/developer.texi @@ -224,7 +224,7 @@ Note, some rules were borrowed from the MPlayer project. @section Submitting patches -First, read the (@pxref{Coding Rules}) above if you did not yet, in particular +First, read the @ref{Coding Rules} above if you did not yet, in particular the rules regarding patch submission.
As stated already, please do not submit a patch which contains several @@ -238,7 +238,7 @@ for us and greatly increases your chances of getting your patch applied. Use the patcheck tool of Libav to check your patch. The tool is located in the tools directory. -Run the @pxref{Regression Tests} before submitting a patch in order to verify +Run the @ref{Regression Tests} before submitting a patch in order to verify it does not cause unexpected problems. Patches should be posted as base64 encoded attachments (or any other diff --git a/doc/filters.texi b/doc/filters.texi index 9666f582f2..a8076b2bf5 100644 --- a/doc/filters.texi +++ b/doc/filters.texi @@ -796,8 +796,7 @@ Erode an image by using a specific structuring element. This filter corresponds to the libopencv function @code{cvErode}. The filter accepts the parameters: @var{struct_el}:@var{nb_iterations}, -with the same meaning and use of those of the dilate filter -(@pxref{dilate}). +with the same syntax and semantics as the @ref{dilate} filter. @subsection smooth @@ -1093,7 +1092,7 @@ setdar=16:9 setdar=1.77777 @end example -See also the "setsar" filter documentation (@pxref{setsar}). +See also the @ref{setsar} filter documentation. @section setpts @@ -1547,8 +1546,7 @@ form @var{width}x@var{height} or a frame size abbreviation. the form @var{num}/@var{den} or a frame rate abbreviation. @var{src_name} is the name to the frei0r source to load. For more information regarding frei0r and how to set the parameters read the -section "frei0r" (@pxref{frei0r}) in the description of the video -filters. +section @ref{frei0r} in the description of the video filters. 
Some examples follow: @example diff --git a/doc/muxers.texi b/doc/muxers.texi index 74c014bc70..bcba77cb2c 100644 --- a/doc/muxers.texi +++ b/doc/muxers.texi @@ -51,7 +51,7 @@ and the input video converted to MPEG-2 video, use the command: ffmpeg -i INPUT -acodec pcm_u8 -vcodec mpeg2video -f crc - @end example -See also the @code{framecrc} muxer (@pxref{framecrc}). +See also the @ref{framecrc} muxer. @anchor{framecrc} @section framecrc @@ -88,7 +88,7 @@ MPEG-2 video, use the command: ffmpeg -i INPUT -acodec pcm_u8 -vcodec mpeg2video -f framecrc - @end example -See also the @code{crc} muxer (@pxref{crc}). +See also the @ref{crc} muxer. @section image2 diff --git a/doc/texi2pod.pl b/doc/texi2pod.pl index 84c36ff1e1..0eb5e8d9fe 100755 --- a/doc/texi2pod.pl +++ b/doc/texi2pod.pl @@ -352,6 +352,7 @@ sub postprocess s/\(?\@xref\{(?:[^\}]*)\}(?:[^.<]|(?:<[^<>]*>))*\.\)?//g; s/\s+\(\@pxref\{(?:[^\}]*)\}\)//g; s/;\s+\@pxref\{(?:[^\}]*)\}//g; + s/\@ref\{([^\}]*)\}/$1/g; s/\@noindent\s*//g; s/\@refill//g; s/\@gol//g; From 8bfd7f6a475225a0595bf657f8b99a8fffb461e4 Mon Sep 17 00:00:00 2001 From: Justin Ruggles Date: Wed, 22 Jun 2011 15:33:56 -0400 Subject: [PATCH 20/22] alsa: fallback to buffer_size/4 for period_size. buffer_size/4 is the value used by aplay. This fixes output to null devices, e.g. writing ALSA output to a file. 
--- libavdevice/alsa-audio-common.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libavdevice/alsa-audio-common.c b/libavdevice/alsa-audio-common.c index 4cfc6e9864..1e945fc389 100644 --- a/libavdevice/alsa-audio-common.c +++ b/libavdevice/alsa-audio-common.c @@ -269,6 +269,8 @@ av_cold int ff_alsa_open(AVFormatContext *ctx, snd_pcm_stream_t mode, } snd_pcm_hw_params_get_period_size_min(hw_params, &period_size, NULL); + if (!period_size) + period_size = buffer_size / 4; res = snd_pcm_hw_params_set_period_size_near(h, hw_params, &period_size, NULL); if (res < 0) { av_log(ctx, AV_LOG_ERROR, "cannot set ALSA period size (%s)\n", From e35c674d13a7f180412cfe058530a2e7f1d49a90 Mon Sep 17 00:00:00 2001 From: Justin Ruggles Date: Wed, 22 Jun 2011 16:38:20 -0400 Subject: [PATCH 21/22] alsa: limit buffer_size to 32768 frames. In testing, the file output plugin gave a max buffer size of about 20 million frames, which is way more than what is really needed and causes a memory allocation error on my system. 
--- libavdevice/alsa-audio-common.c | 1 + libavdevice/alsa-audio.h | 2 ++ 2 files changed, 3 insertions(+) diff --git a/libavdevice/alsa-audio-common.c b/libavdevice/alsa-audio-common.c index 1e945fc389..126695f754 100644 --- a/libavdevice/alsa-audio-common.c +++ b/libavdevice/alsa-audio-common.c @@ -260,6 +260,7 @@ av_cold int ff_alsa_open(AVFormatContext *ctx, snd_pcm_stream_t mode, } snd_pcm_hw_params_get_buffer_size_max(hw_params, &buffer_size); + buffer_size = FFMIN(buffer_size, ALSA_BUFFER_SIZE_MAX); /* TODO: maybe use ctx->max_picture_buffer somehow */ res = snd_pcm_hw_params_set_buffer_size_near(h, hw_params, &buffer_size); if (res < 0) { diff --git a/libavdevice/alsa-audio.h b/libavdevice/alsa-audio.h index 1e0be1cac7..ee43463696 100644 --- a/libavdevice/alsa-audio.h +++ b/libavdevice/alsa-audio.h @@ -40,6 +40,8 @@ other formats */ #define DEFAULT_CODEC_ID AV_NE(CODEC_ID_PCM_S16BE, CODEC_ID_PCM_S16LE) +#define ALSA_BUFFER_SIZE_MAX 32768 + typedef struct { AVClass *class; snd_pcm_t *h; From 7f7dc4fb55904e7b51b78ebca746c6755fc9770c Mon Sep 17 00:00:00 2001 From: Jason Garrett-Glaser Date: Sat, 9 Jul 2011 11:03:26 -0700 Subject: [PATCH 22/22] H.264: fix filter_mb_fast with 4:4:4 + 8x8dct --- libavcodec/h264_loopfilter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/h264_loopfilter.c b/libavcodec/h264_loopfilter.c index 5de9f784d9..6eba8dcd02 100644 --- a/libavcodec/h264_loopfilter.c +++ b/libavcodec/h264_loopfilter.c @@ -317,7 +317,7 @@ void ff_h264_filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, } else { LOCAL_ALIGNED_8(int16_t, bS, [2], [4][4]); int edges; - if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) { + if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 && !chroma444 ) { edges = 4; AV_WN64A(bS[0][0], 0x0002000200020002ULL); AV_WN64A(bS[0][2], 0x0002000200020002ULL);