cosmetics: Make libavcodec/ppc/dsputil_altivec.c conform to style guidelines.

This includes indentation changes, comment reformatting, consistent brace
placement and some prettyprinting.

Originally committed as revision 14318 to svn://svn.ffmpeg.org/ffmpeg/trunk
Author: Diego Biurrun
Date:   2008-07-20 20:56:40 +00:00
commit 86255db9b9
parent b6934d7f9e

--- a/libavcodec/ppc/dsputil_altivec.c
+++ b/libavcodec/ppc/dsputil_altivec.c
@@ -40,11 +40,9 @@ int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h
     s = 0;
     sad = (vector unsigned int)vec_splat_u32(0);

     for (i = 0; i < h; i++) {
-        /*
-           Read unaligned pixels into our vectors. The vectors are as follows:
+        /* Read unaligned pixels into our vectors. The vectors are as follows:
            pix1v: pix1[0]-pix1[15]
-           pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16]
-        */
+           pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16] */
         tv = (vector unsigned char *) pix1;
         pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
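
An aside for readers new to these kernels: sad16_x2 computes the SAD of pix1 against the horizontal half-pel interpolation of pix2, which is why pix2v and pix2iv above are offset by one byte. A scalar sketch of the semantics follows (sad16_x2_scalar is a hypothetical name, not the actual C fallback; the +1 mirrors vec_avg's round-up):

#include <stdint.h>

static int sad16_x2_scalar(const uint8_t *pix1, const uint8_t *pix2,
                           int line_size, int h)
{
    int i, x, s = 0;
    for (i = 0; i < h; i++) {
        for (x = 0; x < 16; x++) {
            int avg = (pix2[x] + pix2[x + 1] + 1) >> 1; /* vec_avg(pix2v, pix2iv) */
            int d   = pix1[x] - avg;
            s += d < 0 ? -d : d;                        /* vec_max/vec_min/vec_sub */
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}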
@@ -88,24 +86,20 @@ int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h
     s = 0;
     sad = (vector unsigned int)vec_splat_u32(0);

-    /*
-       Due to the fact that pix3 = pix2 + line_size, the pix3 of one
+    /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one
        iteration becomes pix2 in the next iteration. We can use this
        fact to avoid a potentially expensive unaligned read, each
        time around the loop.
        Read unaligned pixels into our vectors. The vectors are as follows:
        pix2v: pix2[0]-pix2[15]
-       Split the pixel vectors into shorts
-    */
+       Split the pixel vectors into shorts */
     tv = (vector unsigned char *) &pix2[0];
     pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

     for (i = 0; i < h; i++) {
-        /*
-           Read unaligned pixels into our vectors. The vectors are as follows:
+        /* Read unaligned pixels into our vectors. The vectors are as follows:
            pix1v: pix1[0]-pix1[15]
-           pix3v: pix3[0]-pix3[15]
-        */
+           pix3v: pix3[0]-pix3[15] */
         tv = (vector unsigned char *) pix1;
         pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
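
The load-reuse trick the comment describes is worth spelling out: because pix3 == pix2 + line_size, row N+1's "pix2" is exactly row N's "pix3", so only one new unaligned row load is needed per iteration. A minimal sketch of the pattern, where load_row() is a hypothetical stand-in for the vec_ld/vec_perm pair:

    pix2v = load_row(pix2);               /* one unaligned read, before the loop */
    for (i = 0; i < h; i++) {
        pix3v = load_row(pix3);           /* the only new read per iteration */
        /* ... accumulate SAD from pix2v and pix3v ... */
        pix2v = pix3v;                    /* pix3 becomes pix2 next time around */
        pix3 += line_size;
    }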
@@ -154,15 +148,13 @@ int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h
     s = 0;

-    /*
-       Due to the fact that pix3 = pix2 + line_size, the pix3 of one
+    /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one
        iteration becomes pix2 in the next iteration. We can use this
        fact to avoid a potentially expensive unaligned read, as well
        as some splitting, and vector addition each time around the loop.
        Read unaligned pixels into our vectors. The vectors are as follows:
        pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16]
-       Split the pixel vectors into shorts
-    */
+       Split the pixel vectors into shorts */
     tv = (vector unsigned char *) &pix2[0];
     pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
@@ -177,11 +169,9 @@ int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h
     t2 = vec_add(pix2lv, pix2ilv);

     for (i = 0; i < h; i++) {
-        /*
-           Read unaligned pixels into our vectors. The vectors are as follows:
+        /* Read unaligned pixels into our vectors. The vectors are as follows:
            pix1v: pix1[0]-pix1[15]
-           pix3v: pix3[0]-pix3[15]      pix3iv: pix3[1]-pix3[16]
-        */
+           pix3v: pix3[0]-pix3[15]      pix3iv: pix3[1]-pix3[16] */
         tv = (vector unsigned char *) pix1;
         pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
@@ -191,13 +181,11 @@ int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h
         tv = (vector unsigned char *) &pix3[1];
         pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));

-        /*
-           Note that AltiVec does have vec_avg, but this works on vector pairs
+        /* Note that AltiVec does have vec_avg, but this works on vector pairs
            and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding
            would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
            Instead, we have to split the pixel vectors into vectors of shorts,
-           and do the averaging by hand.
-        */
+           and do the averaging by hand. */

         /* Split the pixel vectors into shorts */
         pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v);
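
The rounding claim in that comment is easy to verify in plain C: vec_avg computes (a + b + 1) >> 1, so nesting it double-rounds. A standalone check of the comment's own example, assuming nothing beyond stdio:

#include <stdio.h>

static unsigned avg_up(unsigned a, unsigned b) { return (a + b + 1) >> 1; } /* vec_avg */

int main(void)
{
    unsigned nested  = avg_up(avg_up(3, 0), avg_up(0, 1)); /* avg_up(2, 1) = 2 */
    unsigned by_hand = (3 + 0 + 0 + 1 + 2) >> 2;           /* = 1, as the comment says */
    printf("nested=%u by_hand=%u\n", nested, by_hand);     /* prints "nested=2 by_hand=1" */
    return 0;
}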
@@ -384,10 +372,8 @@ int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
         t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
         t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

-        /*
-           Since we want to use unsigned chars, we can take advantage
-           of the fact that abs(a-b)^2 = (a-b)^2.
-        */
+        /* Since we want to use unsigned chars, we can take advantage
+           of the fact that abs(a-b)^2 = (a-b)^2. */

         /* Calculate abs differences vector */
         t3 = vec_max(t1, t2);
@@ -435,10 +421,8 @@ int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
         t1 = vec_perm(pix1v[0], pix1v[1], perm1);
         t2 = vec_perm(pix2v[0], pix2v[1], perm2);

-        /*
-           Since we want to use unsigned chars, we can take advantage
-           of the fact that abs(a-b)^2 = (a-b)^2.
-        */
+        /* Since we want to use unsigned chars, we can take advantage
+           of the fact that abs(a-b)^2 = (a-b)^2. */

         /* Calculate abs differences vector */
         t3 = vec_max(t1, t2);
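
Both sse8 and sse16 lean on the same identity: for unsigned bytes, |a - b| = max(a,b) - min(a,b), and since abs(a-b)^2 == (a-b)^2 the absolute difference can be squared directly. A scalar restatement of the vec_max/vec_min/vec_sub/vec_msum sequence (a sketch with a hypothetical name, not the AltiVec code itself):

#include <stdint.h>

static unsigned sse_scalar(const uint8_t *pix1, const uint8_t *pix2, int n)
{
    unsigned sum = 0;
    int i;
    for (i = 0; i < n; i++) {
        unsigned hi = pix1[i] > pix2[i] ? pix1[i] : pix2[i]; /* vec_max */
        unsigned lo = pix1[i] > pix2[i] ? pix2[i] : pix1[i]; /* vec_min */
        unsigned d  = hi - lo;       /* |a - b|, safe in unsigned arithmetic */
        sum += d * d;                /* vec_msum accumulates the squares */
    }
    return sum;
}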
@@ -500,8 +484,7 @@ void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line
     const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
     vector signed short shorts;

-    for(i=0;i<8;i++)
-    {
+    for (i = 0; i < 8; i++) {
         // Read potentially unaligned pixels.
         // We're reading 16 pixels, and actually only want 8,
         // but we simply ignore the extras.
@@ -527,8 +510,7 @@ void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
     const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
     vector signed short shorts1, shorts2;

-    for(i=0;i<4;i++)
-    {
+    for (i = 0; i < 4; i++) {
         // Read potentially unaligned pixels
         // We're reading 16 pixels, and actually only want 8,
         // but we simply ignore the extras.
@@ -596,16 +578,14 @@ void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
     register vector unsigned char vdst, vsrc;

     /* dst and src are 16 bytes-aligned (guaranteed) */
-    for(i = 0 ; (i + 15) < w ; i+=16)
-    {
+    for (i = 0 ; (i + 15) < w ; i+=16) {
         vdst = vec_ld(i, (unsigned char*)dst);
         vsrc = vec_ld(i, (unsigned char*)src);
         vdst = vec_add(vsrc, vdst);
         vec_st(vdst, i, (unsigned char*)dst);
     }

     /* if w is not a multiple of 16 */
-    for (; (i < w) ; i++)
-    {
+    for (; (i < w) ; i++) {
         dst[i] = src[i];
     }
 }
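
The structure of add_bytes_altivec above is the standard vectorization split: a main loop handling 16 bytes per iteration for as long as (i + 15) < w, then a scalar tail for the remainder. Its scalar meaning is simply:

    /* Scalar equivalent of both loops above. */
    for (i = 0; i < w; i++)
        dst[i] += src[i];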
@@ -700,10 +680,8 @@ POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1);
 POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);

     for (i = 0; i < h; i++) {
-        /*
-           block is 8 bytes-aligned, so we're either in the
-           left block (16 bytes-aligned) or in the right block (not)
-        */
+        /* block is 8 bytes-aligned, so we're either in the
+           left block (16 bytes-aligned) or in the right block (not) */
         int rightside = ((unsigned long)block & 0x0000000F);

         blockv = vec_ld(0, block);
@@ -711,12 +689,9 @@ POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
         pixelsv2 = vec_ld(16, (unsigned char*)pixels);
         pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));

-        if (rightside)
-        {
+        if (rightside) {
             pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
-        }
-        else
-        {
+        } else {
             pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
         }
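
The rightside dispatch is easier to follow once vcprm is decoded. In FFmpeg's AltiVec helpers, vcprm builds a vec_perm selector over 32-bit words: a plain index picks a word of the first operand, sN picks word N of the second (stated from memory of those headers, so treat this as a reading aid):

    /* vcprm(0,1,s0,s1): words 0-1 of blockv, then words 0-1 of pixelsv
       -> the new pixels land in the RIGHT 8 bytes of the 16-byte block.
       vcprm(s0,s1,2,3): words 0-1 of pixelsv, then words 2-3 of blockv
       -> the new pixels land in the LEFT 8 bytes.
       block is 8-byte aligned, so (block & 15) is either 0 or 8. */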
@@ -736,25 +711,18 @@ void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_siz
 {
 POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);
     register int i;
-    register vector unsigned char
-        pixelsv1, pixelsv2,
-        pixelsavg;
-    register vector unsigned char
-        blockv, temp1, temp2;
-    register vector unsigned short
-        pixelssum1, pixelssum2, temp3;
+    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
+    register vector unsigned char blockv, temp1, temp2;
+    register vector unsigned short pixelssum1, pixelssum2, temp3;
     register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
     register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

     temp1 = vec_ld(0, pixels);
     temp2 = vec_ld(16, pixels);
     pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
-    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
-    {
+    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
         pixelsv2 = temp2;
-    }
-    else
-    {
+    } else {
         pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
     }
     pixelsv1 = vec_mergeh(vczero, pixelsv1);
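
The if/else above guards the one corner case of the classic AltiVec unaligned-load idiom. vec_ld ignores the low four address bits, so an unaligned vector is normally assembled from two aligned loads plus a vec_perm driven by vec_lvsl; but vec_lvsl computes its shift modulo 16, so when (pixels & 15) == 15 the shift for pixels + 1 wraps to 0 and the permute would wrongly return temp1, even though pixels + 1 is aligned and its data is exactly temp2. Condensed, as a reading aid rather than a definitive spec:

    /* The idiom, in isolation: */
    temp1    = vec_ld(0,  pixels);                           /* aligned block holding pixels[0] */
    temp2    = vec_ld(16, pixels);                           /* the next aligned block          */
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));  /* unaligned pixels[0..15]         */
    /* pixels[1..16] would use vec_lvsl(1, pixels); at offset 15 that shift
       wraps to 0, so the code substitutes temp2 directly. */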
@@ -771,12 +739,9 @@ POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
         temp1 = vec_ld(line_size, pixels);
         temp2 = vec_ld(line_size + 16, pixels);
         pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
-        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
-        {
+        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
             pixelsv2 = temp2;
-        }
-        else
-        {
+        } else {
             pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
         }
@@ -789,12 +754,9 @@ POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
         pixelssum1 = vec_add(pixelssum2, vctwo);
         pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

-        if (rightside)
-        {
+        if (rightside) {
             blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
-        }
-        else
-        {
+        } else {
             blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
         }
@@ -812,13 +774,9 @@ void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int l
 {
 POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
     register int i;
-    register vector unsigned char
-        pixelsv1, pixelsv2,
-        pixelsavg;
-    register vector unsigned char
-        blockv, temp1, temp2;
-    register vector unsigned short
-        pixelssum1, pixelssum2, temp3;
+    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
+    register vector unsigned char blockv, temp1, temp2;
+    register vector unsigned short pixelssum1, pixelssum2, temp3;
     register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
     register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
     register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
@@ -826,12 +784,9 @@ POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
     temp1 = vec_ld(0, pixels);
     temp2 = vec_ld(16, pixels);
     pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
-    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
-    {
+    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
         pixelsv2 = temp2;
-    }
-    else
-    {
+    } else {
         pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
     }
     pixelsv1 = vec_mergeh(vczero, pixelsv1);
@@ -848,12 +803,9 @@ POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
         temp1 = vec_ld(line_size, pixels);
         temp2 = vec_ld(line_size + 16, pixels);
         pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
-        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
-        {
+        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
             pixelsv2 = temp2;
-        }
-        else
-        {
+        } else {
             pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
         }
@@ -866,12 +818,9 @@ POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
         pixelssum1 = vec_add(pixelssum2, vcone);
         pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

-        if (rightside)
-        {
+        if (rightside) {
             blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
-        }
-        else
-        {
+        } else {
             blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
         }
@@ -889,13 +838,10 @@ void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_
 {
 POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);
     register int i;
-    register vector unsigned char
-        pixelsv1, pixelsv2, pixelsv3, pixelsv4;
-    register vector unsigned char
-        blockv, temp1, temp2;
-    register vector unsigned short
-        pixelssum1, pixelssum2, temp3,
-        pixelssum3, pixelssum4, temp4;
+    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
+    register vector unsigned char blockv, temp1, temp2;
+    register vector unsigned short temp3, temp4,
+        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
     register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
     register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
@@ -904,12 +850,9 @@ POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
     temp1 = vec_ld(0, pixels);
     temp2 = vec_ld(16, pixels);
     pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
-    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
-    {
+    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
         pixelsv2 = temp2;
-    }
-    else
-    {
+    } else {
         pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
     }
     pixelsv3 = vec_mergel(vczero, pixelsv1);
@@ -929,12 +872,9 @@ POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
         temp1 = vec_ld(line_size, pixels);
         temp2 = vec_ld(line_size + 16, pixels);
         pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
-        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
-        {
+        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
             pixelsv2 = temp2;
-        }
-        else
-        {
+        } else {
             pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
         }
@@ -971,13 +911,10 @@ void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, in
 {
 POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
     register int i;
-    register vector unsigned char
-        pixelsv1, pixelsv2, pixelsv3, pixelsv4;
-    register vector unsigned char
-        blockv, temp1, temp2;
-    register vector unsigned short
-        pixelssum1, pixelssum2, temp3,
-        pixelssum3, pixelssum4, temp4;
+    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
+    register vector unsigned char blockv, temp1, temp2;
+    register vector unsigned short temp3, temp4,
+        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
     register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
     register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
     register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
@@ -987,12 +924,9 @@ POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
     temp1 = vec_ld(0, pixels);
     temp2 = vec_ld(16, pixels);
     pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
-    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
-    {
+    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
         pixelsv2 = temp2;
-    }
-    else
-    {
+    } else {
         pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
     }
     pixelsv3 = vec_mergel(vczero, pixelsv1);
@@ -1012,12 +946,9 @@ POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
         temp1 = vec_ld(line_size, pixels);
         temp2 = vec_ld(line_size + 16, pixels);
         pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
-        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
-        {
+        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
             pixelsv2 = temp2;
-        }
-        else
-        {
+        } else {
             pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
         }
@@ -1088,11 +1019,9 @@ POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
         dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst));           \
         /* promote the unsigned chars to signed shorts */                 \
         /* we're in the 8x8 function, we only care for the first 8 */     \
-        srcV =                                                            \
-            (vector signed short)vec_mergeh((vector signed char)vzero,    \
+        srcV = (vector signed short)vec_mergeh((vector signed char)vzero, \
             (vector signed char)srcO);                                    \
-        dstV =                                                            \
-            (vector signed short)vec_mergeh((vector signed char)vzero,    \
+        dstV = (vector signed short)vec_mergeh((vector signed char)vzero, \
             (vector signed char)dstO);                                    \
         /* subtractions inside the first butterfly */                     \
         but0 = vec_sub(srcV, dstV);                                       \
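
One detail of the macro worth noting: merging with a zero vector is how the promotion comment is implemented. Interleaving zero bytes in front of the data bytes turns each unsigned char into a 16-bit lane with a zero high byte, a free u8-to-s16 conversion (safe because 0..255 is non-negative as a signed short, and this code runs big-endian):

    /* vec_mergeh(vzero, srcO) = { 0, src0, 0, src1, ..., 0, src7 }
       -> eight signed shorts 0..255, i.e. the promoted first 8 bytes;
       vec_mergel does the same for the last 8. */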
@@ -1159,25 +1088,22 @@ POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1);
 }

 /*
-16x8 works with 16 elements ; it allows to avoid replicating
-loads, and give the compiler more rooms for scheduling.
-It's only used from inside hadamard8_diff16_altivec.
-Unfortunately, it seems gcc-3.3 is a bit dumb, and
-the compiled code has a LOT of spill code, it seems
-gcc (unlike xlc) cannot keep everything in registers
-by itself. The following code include hand-made
-registers allocation. It's not clean, but on
-a 7450 the resulting code is much faster (best case
-fall from 700+ cycles to 550).
-xlc doesn't add spill code, but it doesn't know how to
-schedule for the 7450, and its code isn't much faster than
-gcc-3.3 on the 7450 (but uses 25% less instructions...)
-On the 970, the hand-made RA is still a win (around 690
-vs. around 780), but xlc goes to around 660 on the
-regular C code...
+16x8 works with 16 elements; it allows to avoid replicating loads, and
+give the compiler more rooms for scheduling. It's only used from
+inside hadamard8_diff16_altivec.
+Unfortunately, it seems gcc-3.3 is a bit dumb, and the compiled code has a LOT
+of spill code, it seems gcc (unlike xlc) cannot keep everything in registers
+by itself. The following code include hand-made registers allocation. It's not
+clean, but on a 7450 the resulting code is much faster (best case fall from
+700+ cycles to 550).
+xlc doesn't add spill code, but it doesn't know how to schedule for the 7450,
+and its code isn't much faster than gcc-3.3 on the 7450 (but uses 25% less
+instructions...)
+On the 970, the hand-made RA is still a win (around 690 vs. around 780), but
+xlc goes to around 660 on the regular C code...
 */
 static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h) {
@@ -1255,17 +1181,13 @@ static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst,
         dst2 = vec_ld((stride * i) + 16, dst);                            \
         dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst));           \
         /* promote the unsigned chars to signed shorts */                 \
-        srcV =                                                            \
-            (vector signed short)vec_mergeh((vector signed char)vzero,    \
+        srcV = (vector signed short)vec_mergeh((vector signed char)vzero, \
             (vector signed char)srcO);                                    \
-        dstV =                                                            \
-            (vector signed short)vec_mergeh((vector signed char)vzero,    \
+        dstV = (vector signed short)vec_mergeh((vector signed char)vzero, \
             (vector signed char)dstO);                                    \
-        srcW =                                                            \
-            (vector signed short)vec_mergel((vector signed char)vzero,    \
+        srcW = (vector signed short)vec_mergel((vector signed char)vzero, \
             (vector signed char)srcO);                                    \
-        dstW =                                                            \
-            (vector signed short)vec_mergel((vector signed char)vzero,    \
+        dstW = (vector signed short)vec_mergel((vector signed char)vzero, \
             (vector signed char)dstO);                                    \
         /* subtractions inside the first butterfly */                     \
         but0 = vec_sub(srcV, dstV);                                       \
@@ -1452,8 +1374,7 @@ POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1);
         temp1 = vec_ld(line_size, pixels);
         temp2 = vec_ld(line_size + 16, pixels);
         pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
-        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
-        {
+        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
             pixelsv2 = temp2;
         } else {
             pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));