Libav
|
00001 /* 00002 * AltiVec-enhanced yuv2yuvX 00003 * 00004 * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org> 00005 * based on the equivalent C code in swscale.c 00006 * 00007 * This file is part of FFmpeg. 00008 * 00009 * FFmpeg is free software; you can redistribute it and/or 00010 * modify it under the terms of the GNU Lesser General Public 00011 * License as published by the Free Software Foundation; either 00012 * version 2.1 of the License, or (at your option) any later version. 00013 * 00014 * FFmpeg is distributed in the hope that it will be useful, 00015 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00016 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00017 * Lesser General Public License for more details. 00018 * 00019 * You should have received a copy of the GNU Lesser General Public 00020 * License along with FFmpeg; if not, write to the Free Software 00021 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 00022 */ 00023 00024 #define vzero vec_splat_s32(0) 00025 00026 static inline void 00027 altivec_packIntArrayToCharArray(int *val, uint8_t* dest, int dstW) 00028 { 00029 register int i; 00030 vector unsigned int altivec_vectorShiftInt19 = 00031 vec_add(vec_splat_u32(10), vec_splat_u32(9)); 00032 if ((unsigned long)dest % 16) { 00033 /* badly aligned store, we force store alignment */ 00034 /* and will handle load misalignment on val w/ vec_perm */ 00035 vector unsigned char perm1; 00036 vector signed int v1; 00037 for (i = 0 ; (i < dstW) && 00038 (((unsigned long)dest + i) % 16) ; i++) { 00039 int t = val[i] >> 19; 00040 dest[i] = (t < 0) ? 0 : ((t > 255) ? 255 : t); 00041 } 00042 perm1 = vec_lvsl(i << 2, val); 00043 v1 = vec_ld(i << 2, val); 00044 for ( ; i < (dstW - 15); i+=16) { 00045 int offset = i << 2; 00046 vector signed int v2 = vec_ld(offset + 16, val); 00047 vector signed int v3 = vec_ld(offset + 32, val); 00048 vector signed int v4 = vec_ld(offset + 48, val); 00049 vector signed int v5 = vec_ld(offset + 64, val); 00050 vector signed int v12 = vec_perm(v1, v2, perm1); 00051 vector signed int v23 = vec_perm(v2, v3, perm1); 00052 vector signed int v34 = vec_perm(v3, v4, perm1); 00053 vector signed int v45 = vec_perm(v4, v5, perm1); 00054 00055 vector signed int vA = vec_sra(v12, altivec_vectorShiftInt19); 00056 vector signed int vB = vec_sra(v23, altivec_vectorShiftInt19); 00057 vector signed int vC = vec_sra(v34, altivec_vectorShiftInt19); 00058 vector signed int vD = vec_sra(v45, altivec_vectorShiftInt19); 00059 vector unsigned short vs1 = vec_packsu(vA, vB); 00060 vector unsigned short vs2 = vec_packsu(vC, vD); 00061 vector unsigned char vf = vec_packsu(vs1, vs2); 00062 vec_st(vf, i, dest); 00063 v1 = v5; 00064 } 00065 } else { // dest is properly aligned, great 00066 for (i = 0; i < (dstW - 15); i+=16) { 00067 int offset = i << 2; 00068 vector signed int v1 = vec_ld(offset, val); 00069 vector signed int v2 = vec_ld(offset + 16, val); 00070 vector signed int v3 = vec_ld(offset + 32, val); 00071 vector signed int v4 = vec_ld(offset + 48, val); 00072 vector signed int v5 = vec_sra(v1, altivec_vectorShiftInt19); 00073 vector signed int v6 = vec_sra(v2, altivec_vectorShiftInt19); 00074 vector signed int v7 = vec_sra(v3, altivec_vectorShiftInt19); 00075 vector signed int v8 = vec_sra(v4, altivec_vectorShiftInt19); 00076 vector unsigned short vs1 = vec_packsu(v5, v6); 00077 vector unsigned short vs2 = vec_packsu(v7, v8); 00078 vector unsigned char vf = vec_packsu(vs1, vs2); 00079 vec_st(vf, i, dest); 00080 } 00081 } 00082 for ( ; i < dstW ; i++) { 00083 int t = val[i] >> 19; 00084 dest[i] = (t < 0) ? 0 : ((t > 255) ? 255 : t); 00085 } 00086 } 00087 00088 static inline void 00089 yuv2yuvX_altivec_real(const int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, 00090 const int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, 00091 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW) 00092 { 00093 const vector signed int vini = {(1 << 18), (1 << 18), (1 << 18), (1 << 18)}; 00094 register int i, j; 00095 { 00096 DECLARE_ALIGNED(16, int, val)[dstW]; 00097 00098 for (i = 0; i < (dstW -7); i+=4) { 00099 vec_st(vini, i << 2, val); 00100 } 00101 for (; i < dstW; i++) { 00102 val[i] = (1 << 18); 00103 } 00104 00105 for (j = 0; j < lumFilterSize; j++) { 00106 vector signed short l1, vLumFilter = vec_ld(j << 1, lumFilter); 00107 vector unsigned char perm, perm0 = vec_lvsl(j << 1, lumFilter); 00108 vLumFilter = vec_perm(vLumFilter, vLumFilter, perm0); 00109 vLumFilter = vec_splat(vLumFilter, 0); // lumFilter[j] is loaded 8 times in vLumFilter 00110 00111 perm = vec_lvsl(0, lumSrc[j]); 00112 l1 = vec_ld(0, lumSrc[j]); 00113 00114 for (i = 0; i < (dstW - 7); i+=8) { 00115 int offset = i << 2; 00116 vector signed short l2 = vec_ld((i << 1) + 16, lumSrc[j]); 00117 00118 vector signed int v1 = vec_ld(offset, val); 00119 vector signed int v2 = vec_ld(offset + 16, val); 00120 00121 vector signed short ls = vec_perm(l1, l2, perm); // lumSrc[j][i] ... lumSrc[j][i+7] 00122 00123 vector signed int i1 = vec_mule(vLumFilter, ls); 00124 vector signed int i2 = vec_mulo(vLumFilter, ls); 00125 00126 vector signed int vf1 = vec_mergeh(i1, i2); 00127 vector signed int vf2 = vec_mergel(i1, i2); // lumSrc[j][i] * lumFilter[j] ... lumSrc[j][i+7] * lumFilter[j] 00128 00129 vector signed int vo1 = vec_add(v1, vf1); 00130 vector signed int vo2 = vec_add(v2, vf2); 00131 00132 vec_st(vo1, offset, val); 00133 vec_st(vo2, offset + 16, val); 00134 00135 l1 = l2; 00136 } 00137 for ( ; i < dstW; i++) { 00138 val[i] += lumSrc[j][i] * lumFilter[j]; 00139 } 00140 } 00141 altivec_packIntArrayToCharArray(val, dest, dstW); 00142 } 00143 if (uDest != 0) { 00144 DECLARE_ALIGNED(16, int, u)[chrDstW]; 00145 DECLARE_ALIGNED(16, int, v)[chrDstW]; 00146 00147 for (i = 0; i < (chrDstW -7); i+=4) { 00148 vec_st(vini, i << 2, u); 00149 vec_st(vini, i << 2, v); 00150 } 00151 for (; i < chrDstW; i++) { 00152 u[i] = (1 << 18); 00153 v[i] = (1 << 18); 00154 } 00155 00156 for (j = 0; j < chrFilterSize; j++) { 00157 vector signed short l1, l1_V, vChrFilter = vec_ld(j << 1, chrFilter); 00158 vector unsigned char perm, perm0 = vec_lvsl(j << 1, chrFilter); 00159 vChrFilter = vec_perm(vChrFilter, vChrFilter, perm0); 00160 vChrFilter = vec_splat(vChrFilter, 0); // chrFilter[j] is loaded 8 times in vChrFilter 00161 00162 perm = vec_lvsl(0, chrSrc[j]); 00163 l1 = vec_ld(0, chrSrc[j]); 00164 l1_V = vec_ld(2048 << 1, chrSrc[j]); 00165 00166 for (i = 0; i < (chrDstW - 7); i+=8) { 00167 int offset = i << 2; 00168 vector signed short l2 = vec_ld((i << 1) + 16, chrSrc[j]); 00169 vector signed short l2_V = vec_ld(((i + 2048) << 1) + 16, chrSrc[j]); 00170 00171 vector signed int v1 = vec_ld(offset, u); 00172 vector signed int v2 = vec_ld(offset + 16, u); 00173 vector signed int v1_V = vec_ld(offset, v); 00174 vector signed int v2_V = vec_ld(offset + 16, v); 00175 00176 vector signed short ls = vec_perm(l1, l2, perm); // chrSrc[j][i] ... chrSrc[j][i+7] 00177 vector signed short ls_V = vec_perm(l1_V, l2_V, perm); // chrSrc[j][i+2048] ... chrSrc[j][i+2055] 00178 00179 vector signed int i1 = vec_mule(vChrFilter, ls); 00180 vector signed int i2 = vec_mulo(vChrFilter, ls); 00181 vector signed int i1_V = vec_mule(vChrFilter, ls_V); 00182 vector signed int i2_V = vec_mulo(vChrFilter, ls_V); 00183 00184 vector signed int vf1 = vec_mergeh(i1, i2); 00185 vector signed int vf2 = vec_mergel(i1, i2); // chrSrc[j][i] * chrFilter[j] ... chrSrc[j][i+7] * chrFilter[j] 00186 vector signed int vf1_V = vec_mergeh(i1_V, i2_V); 00187 vector signed int vf2_V = vec_mergel(i1_V, i2_V); // chrSrc[j][i] * chrFilter[j] ... chrSrc[j][i+7] * chrFilter[j] 00188 00189 vector signed int vo1 = vec_add(v1, vf1); 00190 vector signed int vo2 = vec_add(v2, vf2); 00191 vector signed int vo1_V = vec_add(v1_V, vf1_V); 00192 vector signed int vo2_V = vec_add(v2_V, vf2_V); 00193 00194 vec_st(vo1, offset, u); 00195 vec_st(vo2, offset + 16, u); 00196 vec_st(vo1_V, offset, v); 00197 vec_st(vo2_V, offset + 16, v); 00198 00199 l1 = l2; 00200 l1_V = l2_V; 00201 } 00202 for ( ; i < chrDstW; i++) { 00203 u[i] += chrSrc[j][i] * chrFilter[j]; 00204 v[i] += chrSrc[j][i + 2048] * chrFilter[j]; 00205 } 00206 } 00207 altivec_packIntArrayToCharArray(u, uDest, chrDstW); 00208 altivec_packIntArrayToCharArray(v, vDest, chrDstW); 00209 } 00210 } 00211 00212 static inline void hScale_altivec_real(int16_t *dst, int dstW, 00213 const uint8_t *src, int srcW, 00214 int xInc, const int16_t *filter, 00215 const int16_t *filterPos, int filterSize) 00216 { 00217 register int i; 00218 DECLARE_ALIGNED(16, int, tempo)[4]; 00219 00220 if (filterSize % 4) { 00221 for (i=0; i<dstW; i++) { 00222 register int j; 00223 register int srcPos = filterPos[i]; 00224 register int val = 0; 00225 for (j=0; j<filterSize; j++) { 00226 val += ((int)src[srcPos + j])*filter[filterSize*i + j]; 00227 } 00228 dst[i] = FFMIN(val>>7, (1<<15)-1); 00229 } 00230 } 00231 else 00232 switch (filterSize) { 00233 case 4: 00234 { 00235 for (i=0; i<dstW; i++) { 00236 register int srcPos = filterPos[i]; 00237 00238 vector unsigned char src_v0 = vec_ld(srcPos, src); 00239 vector unsigned char src_v1, src_vF; 00240 vector signed short src_v, filter_v; 00241 vector signed int val_vEven, val_s; 00242 if ((((int)src + srcPos)% 16) > 12) { 00243 src_v1 = vec_ld(srcPos + 16, src); 00244 } 00245 src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src)); 00246 00247 src_v = // vec_unpackh sign-extends... 00248 (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF)); 00249 // now put our elements in the even slots 00250 src_v = vec_mergeh(src_v, (vector signed short)vzero); 00251 00252 filter_v = vec_ld(i << 3, filter); 00253 // The 3 above is 2 (filterSize == 4) + 1 (sizeof(short) == 2). 00254 00255 // The neat trick: We only care for half the elements, 00256 // high or low depending on (i<<3)%16 (it's 0 or 8 here), 00257 // and we're going to use vec_mule, so we choose 00258 // carefully how to "unpack" the elements into the even slots. 00259 if ((i << 3) % 16) 00260 filter_v = vec_mergel(filter_v, (vector signed short)vzero); 00261 else 00262 filter_v = vec_mergeh(filter_v, (vector signed short)vzero); 00263 00264 val_vEven = vec_mule(src_v, filter_v); 00265 val_s = vec_sums(val_vEven, vzero); 00266 vec_st(val_s, 0, tempo); 00267 dst[i] = FFMIN(tempo[3]>>7, (1<<15)-1); 00268 } 00269 } 00270 break; 00271 00272 case 8: 00273 { 00274 for (i=0; i<dstW; i++) { 00275 register int srcPos = filterPos[i]; 00276 00277 vector unsigned char src_v0 = vec_ld(srcPos, src); 00278 vector unsigned char src_v1, src_vF; 00279 vector signed short src_v, filter_v; 00280 vector signed int val_v, val_s; 00281 if ((((int)src + srcPos)% 16) > 8) { 00282 src_v1 = vec_ld(srcPos + 16, src); 00283 } 00284 src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src)); 00285 00286 src_v = // vec_unpackh sign-extends... 00287 (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF)); 00288 filter_v = vec_ld(i << 4, filter); 00289 // the 4 above is 3 (filterSize == 8) + 1 (sizeof(short) == 2) 00290 00291 val_v = vec_msums(src_v, filter_v, (vector signed int)vzero); 00292 val_s = vec_sums(val_v, vzero); 00293 vec_st(val_s, 0, tempo); 00294 dst[i] = FFMIN(tempo[3]>>7, (1<<15)-1); 00295 } 00296 } 00297 break; 00298 00299 case 16: 00300 { 00301 for (i=0; i<dstW; i++) { 00302 register int srcPos = filterPos[i]; 00303 00304 vector unsigned char src_v0 = vec_ld(srcPos, src); 00305 vector unsigned char src_v1 = vec_ld(srcPos + 16, src); 00306 vector unsigned char src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src)); 00307 00308 vector signed short src_vA = // vec_unpackh sign-extends... 00309 (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF)); 00310 vector signed short src_vB = // vec_unpackh sign-extends... 00311 (vector signed short)(vec_mergel((vector unsigned char)vzero, src_vF)); 00312 00313 vector signed short filter_v0 = vec_ld(i << 5, filter); 00314 vector signed short filter_v1 = vec_ld((i << 5) + 16, filter); 00315 // the 5 above are 4 (filterSize == 16) + 1 (sizeof(short) == 2) 00316 00317 vector signed int val_acc = vec_msums(src_vA, filter_v0, (vector signed int)vzero); 00318 vector signed int val_v = vec_msums(src_vB, filter_v1, val_acc); 00319 00320 vector signed int val_s = vec_sums(val_v, vzero); 00321 00322 vec_st(val_s, 0, tempo); 00323 dst[i] = FFMIN(tempo[3]>>7, (1<<15)-1); 00324 } 00325 } 00326 break; 00327 00328 default: 00329 { 00330 for (i=0; i<dstW; i++) { 00331 register int j; 00332 register int srcPos = filterPos[i]; 00333 00334 vector signed int val_s, val_v = (vector signed int)vzero; 00335 vector signed short filter_v0R = vec_ld(i * 2 * filterSize, filter); 00336 vector unsigned char permF = vec_lvsl((i * 2 * filterSize), filter); 00337 00338 vector unsigned char src_v0 = vec_ld(srcPos, src); 00339 vector unsigned char permS = vec_lvsl(srcPos, src); 00340 00341 for (j = 0 ; j < filterSize - 15; j += 16) { 00342 vector unsigned char src_v1 = vec_ld(srcPos + j + 16, src); 00343 vector unsigned char src_vF = vec_perm(src_v0, src_v1, permS); 00344 00345 vector signed short src_vA = // vec_unpackh sign-extends... 00346 (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF)); 00347 vector signed short src_vB = // vec_unpackh sign-extends... 00348 (vector signed short)(vec_mergel((vector unsigned char)vzero, src_vF)); 00349 00350 vector signed short filter_v1R = vec_ld((i * 2 * filterSize) + (j * 2) + 16, filter); 00351 vector signed short filter_v2R = vec_ld((i * 2 * filterSize) + (j * 2) + 32, filter); 00352 vector signed short filter_v0 = vec_perm(filter_v0R, filter_v1R, permF); 00353 vector signed short filter_v1 = vec_perm(filter_v1R, filter_v2R, permF); 00354 00355 vector signed int val_acc = vec_msums(src_vA, filter_v0, val_v); 00356 val_v = vec_msums(src_vB, filter_v1, val_acc); 00357 00358 filter_v0R = filter_v2R; 00359 src_v0 = src_v1; 00360 } 00361 00362 if (j < filterSize-7) { 00363 // loading src_v0 is useless, it's already done above 00364 //vector unsigned char src_v0 = vec_ld(srcPos + j, src); 00365 vector unsigned char src_v1, src_vF; 00366 vector signed short src_v, filter_v1R, filter_v; 00367 if ((((int)src + srcPos)% 16) > 8) { 00368 src_v1 = vec_ld(srcPos + j + 16, src); 00369 } 00370 src_vF = vec_perm(src_v0, src_v1, permS); 00371 00372 src_v = // vec_unpackh sign-extends... 00373 (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF)); 00374 // loading filter_v0R is useless, it's already done above 00375 //vector signed short filter_v0R = vec_ld((i * 2 * filterSize) + j, filter); 00376 filter_v1R = vec_ld((i * 2 * filterSize) + (j * 2) + 16, filter); 00377 filter_v = vec_perm(filter_v0R, filter_v1R, permF); 00378 00379 val_v = vec_msums(src_v, filter_v, val_v); 00380 } 00381 00382 val_s = vec_sums(val_v, vzero); 00383 00384 vec_st(val_s, 0, tempo); 00385 dst[i] = FFMIN(tempo[3]>>7, (1<<15)-1); 00386 } 00387 00388 } 00389 } 00390 } 00391 00392 static inline int yv12toyuy2_unscaled_altivec(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, 00393 int srcSliceH, uint8_t* dstParam[], int dstStride_a[]) 00394 { 00395 uint8_t *dst=dstParam[0] + dstStride_a[0]*srcSliceY; 00396 // yv12toyuy2(src[0], src[1], src[2], dst, c->srcW, srcSliceH, srcStride[0], srcStride[1], dstStride[0]); 00397 uint8_t *ysrc = src[0]; 00398 uint8_t *usrc = src[1]; 00399 uint8_t *vsrc = src[2]; 00400 const int width = c->srcW; 00401 const int height = srcSliceH; 00402 const int lumStride = srcStride[0]; 00403 const int chromStride = srcStride[1]; 00404 const int dstStride = dstStride_a[0]; 00405 const vector unsigned char yperm = vec_lvsl(0, ysrc); 00406 const int vertLumPerChroma = 2; 00407 register unsigned int y; 00408 00409 if (width&15) { 00410 yv12toyuy2(ysrc, usrc, vsrc, dst, c->srcW, srcSliceH, lumStride, chromStride, dstStride); 00411 return srcSliceH; 00412 } 00413 00414 /* This code assumes: 00415 00416 1) dst is 16 bytes-aligned 00417 2) dstStride is a multiple of 16 00418 3) width is a multiple of 16 00419 4) lum & chrom stride are multiples of 8 00420 */ 00421 00422 for (y=0; y<height; y++) { 00423 int i; 00424 for (i = 0; i < width - 31; i+= 32) { 00425 const unsigned int j = i >> 1; 00426 vector unsigned char v_yA = vec_ld(i, ysrc); 00427 vector unsigned char v_yB = vec_ld(i + 16, ysrc); 00428 vector unsigned char v_yC = vec_ld(i + 32, ysrc); 00429 vector unsigned char v_y1 = vec_perm(v_yA, v_yB, yperm); 00430 vector unsigned char v_y2 = vec_perm(v_yB, v_yC, yperm); 00431 vector unsigned char v_uA = vec_ld(j, usrc); 00432 vector unsigned char v_uB = vec_ld(j + 16, usrc); 00433 vector unsigned char v_u = vec_perm(v_uA, v_uB, vec_lvsl(j, usrc)); 00434 vector unsigned char v_vA = vec_ld(j, vsrc); 00435 vector unsigned char v_vB = vec_ld(j + 16, vsrc); 00436 vector unsigned char v_v = vec_perm(v_vA, v_vB, vec_lvsl(j, vsrc)); 00437 vector unsigned char v_uv_a = vec_mergeh(v_u, v_v); 00438 vector unsigned char v_uv_b = vec_mergel(v_u, v_v); 00439 vector unsigned char v_yuy2_0 = vec_mergeh(v_y1, v_uv_a); 00440 vector unsigned char v_yuy2_1 = vec_mergel(v_y1, v_uv_a); 00441 vector unsigned char v_yuy2_2 = vec_mergeh(v_y2, v_uv_b); 00442 vector unsigned char v_yuy2_3 = vec_mergel(v_y2, v_uv_b); 00443 vec_st(v_yuy2_0, (i << 1), dst); 00444 vec_st(v_yuy2_1, (i << 1) + 16, dst); 00445 vec_st(v_yuy2_2, (i << 1) + 32, dst); 00446 vec_st(v_yuy2_3, (i << 1) + 48, dst); 00447 } 00448 if (i < width) { 00449 const unsigned int j = i >> 1; 00450 vector unsigned char v_y1 = vec_ld(i, ysrc); 00451 vector unsigned char v_u = vec_ld(j, usrc); 00452 vector unsigned char v_v = vec_ld(j, vsrc); 00453 vector unsigned char v_uv_a = vec_mergeh(v_u, v_v); 00454 vector unsigned char v_yuy2_0 = vec_mergeh(v_y1, v_uv_a); 00455 vector unsigned char v_yuy2_1 = vec_mergel(v_y1, v_uv_a); 00456 vec_st(v_yuy2_0, (i << 1), dst); 00457 vec_st(v_yuy2_1, (i << 1) + 16, dst); 00458 } 00459 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) { 00460 usrc += chromStride; 00461 vsrc += chromStride; 00462 } 00463 ysrc += lumStride; 00464 dst += dstStride; 00465 } 00466 00467 return srcSliceH; 00468 } 00469 00470 static inline int yv12touyvy_unscaled_altivec(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, 00471 int srcSliceH, uint8_t* dstParam[], int dstStride_a[]) 00472 { 00473 uint8_t *dst=dstParam[0] + dstStride_a[0]*srcSliceY; 00474 // yv12toyuy2(src[0], src[1], src[2], dst, c->srcW, srcSliceH, srcStride[0], srcStride[1], dstStride[0]); 00475 uint8_t *ysrc = src[0]; 00476 uint8_t *usrc = src[1]; 00477 uint8_t *vsrc = src[2]; 00478 const int width = c->srcW; 00479 const int height = srcSliceH; 00480 const int lumStride = srcStride[0]; 00481 const int chromStride = srcStride[1]; 00482 const int dstStride = dstStride_a[0]; 00483 const int vertLumPerChroma = 2; 00484 const vector unsigned char yperm = vec_lvsl(0, ysrc); 00485 register unsigned int y; 00486 00487 if (width&15) { 00488 yv12touyvy(ysrc, usrc, vsrc, dst, c->srcW, srcSliceH, lumStride, chromStride, dstStride); 00489 return srcSliceH; 00490 } 00491 00492 /* This code assumes: 00493 00494 1) dst is 16 bytes-aligned 00495 2) dstStride is a multiple of 16 00496 3) width is a multiple of 16 00497 4) lum & chrom stride are multiples of 8 00498 */ 00499 00500 for (y=0; y<height; y++) { 00501 int i; 00502 for (i = 0; i < width - 31; i+= 32) { 00503 const unsigned int j = i >> 1; 00504 vector unsigned char v_yA = vec_ld(i, ysrc); 00505 vector unsigned char v_yB = vec_ld(i + 16, ysrc); 00506 vector unsigned char v_yC = vec_ld(i + 32, ysrc); 00507 vector unsigned char v_y1 = vec_perm(v_yA, v_yB, yperm); 00508 vector unsigned char v_y2 = vec_perm(v_yB, v_yC, yperm); 00509 vector unsigned char v_uA = vec_ld(j, usrc); 00510 vector unsigned char v_uB = vec_ld(j + 16, usrc); 00511 vector unsigned char v_u = vec_perm(v_uA, v_uB, vec_lvsl(j, usrc)); 00512 vector unsigned char v_vA = vec_ld(j, vsrc); 00513 vector unsigned char v_vB = vec_ld(j + 16, vsrc); 00514 vector unsigned char v_v = vec_perm(v_vA, v_vB, vec_lvsl(j, vsrc)); 00515 vector unsigned char v_uv_a = vec_mergeh(v_u, v_v); 00516 vector unsigned char v_uv_b = vec_mergel(v_u, v_v); 00517 vector unsigned char v_uyvy_0 = vec_mergeh(v_uv_a, v_y1); 00518 vector unsigned char v_uyvy_1 = vec_mergel(v_uv_a, v_y1); 00519 vector unsigned char v_uyvy_2 = vec_mergeh(v_uv_b, v_y2); 00520 vector unsigned char v_uyvy_3 = vec_mergel(v_uv_b, v_y2); 00521 vec_st(v_uyvy_0, (i << 1), dst); 00522 vec_st(v_uyvy_1, (i << 1) + 16, dst); 00523 vec_st(v_uyvy_2, (i << 1) + 32, dst); 00524 vec_st(v_uyvy_3, (i << 1) + 48, dst); 00525 } 00526 if (i < width) { 00527 const unsigned int j = i >> 1; 00528 vector unsigned char v_y1 = vec_ld(i, ysrc); 00529 vector unsigned char v_u = vec_ld(j, usrc); 00530 vector unsigned char v_v = vec_ld(j, vsrc); 00531 vector unsigned char v_uv_a = vec_mergeh(v_u, v_v); 00532 vector unsigned char v_uyvy_0 = vec_mergeh(v_uv_a, v_y1); 00533 vector unsigned char v_uyvy_1 = vec_mergel(v_uv_a, v_y1); 00534 vec_st(v_uyvy_0, (i << 1), dst); 00535 vec_st(v_uyvy_1, (i << 1) + 16, dst); 00536 } 00537 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) { 00538 usrc += chromStride; 00539 vsrc += chromStride; 00540 } 00541 ysrc += lumStride; 00542 dst += dstStride; 00543 } 00544 return srcSliceH; 00545 }