Libav
|
00001 /* 00002 * AltiVec acceleration for colorspace conversion 00003 * 00004 * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com> 00005 * 00006 * This file is part of FFmpeg. 00007 * 00008 * FFmpeg is free software; you can redistribute it and/or 00009 * modify it under the terms of the GNU Lesser General Public 00010 * License as published by the Free Software Foundation; either 00011 * version 2.1 of the License, or (at your option) any later version. 00012 * 00013 * FFmpeg is distributed in the hope that it will be useful, 00014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00016 * Lesser General Public License for more details. 00017 * 00018 * You should have received a copy of the GNU Lesser General Public 00019 * License along with FFmpeg; if not, write to the Free Software 00020 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 00021 */ 00022 00023 /* 00024 Convert I420 YV12 to RGB in various formats, 00025 it rejects images that are not in 420 formats, 00026 it rejects images that don't have widths of multiples of 16, 00027 it rejects images that don't have heights of multiples of 2. 00028 Reject defers to C simulation code. 00029 00030 Lots of optimizations to be done here. 00031 00032 1. Need to fix saturation code. I just couldn't get it to fly with packs 00033 and adds, so we currently use max/min to clip. 00034 00035 2. The inefficient use of chroma loading needs a bit of brushing up. 00036 00037 3. Analysis of pipeline stalls needs to be done. Use shark to identify 00038 pipeline stalls. 00039 00040 00041 MODIFIED to calculate coeffs from currently selected color space. 00042 MODIFIED core to be a macro where you specify the output format. 00043 ADDED UYVY conversion which is never called due to some thing in swscale. 00044 CORRECTED algorithim selection to be strict on input formats. 00045 ADDED runtime detection of AltiVec. 00046 00047 ADDED altivec_yuv2packedX vertical scl + RGB converter 00048 00049 March 27,2004 00050 PERFORMANCE ANALYSIS 00051 00052 The C version uses 25% of the processor or ~250Mips for D1 video rawvideo 00053 used as test. 00054 The AltiVec version uses 10% of the processor or ~100Mips for D1 video 00055 same sequence. 00056 00057 720 * 480 * 30 ~10MPS 00058 00059 so we have roughly 10 clocks per pixel. This is too high, something has 00060 to be wrong. 00061 00062 OPTIMIZED clip codes to utilize vec_max and vec_packs removing the 00063 need for vec_min. 00064 00065 OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to have 00066 the input video frame, it was just decompressed so it probably resides in L1 00067 caches. However, we are creating the output video stream. This needs to use the 00068 DSTST instruction to optimize for the cache. We couple this with the fact that 00069 we are not going to be visiting the input buffer again so we mark it Least 00070 Recently Used. This shaves 25% of the processor cycles off. 00071 00072 Now memcpy is the largest mips consumer in the system, probably due 00073 to the inefficient X11 stuff. 00074 00075 GL libraries seem to be very slow on this machine 1.33Ghz PB running 00076 Jaguar, this is not the case for my 1Ghz PB. I thought it might be 00077 a versioning issue, however I have libGL.1.2.dylib for both 00078 machines. (We need to figure this out now.) 00079 00080 GL2 libraries work now with patch for RGB32. 00081 00082 NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor. 00083 00084 Integrated luma prescaling adjustment for saturation/contrast/brightness 00085 adjustment. 00086 */ 00087 00088 #include <stdio.h> 00089 #include <stdlib.h> 00090 #include <string.h> 00091 #include <inttypes.h> 00092 #include <assert.h> 00093 #include "config.h" 00094 #include "libswscale/rgb2rgb.h" 00095 #include "libswscale/swscale.h" 00096 #include "libswscale/swscale_internal.h" 00097 00098 #undef PROFILE_THE_BEAST 00099 #undef INC_SCALING 00100 00101 typedef unsigned char ubyte; 00102 typedef signed char sbyte; 00103 00104 00105 /* RGB interleaver, 16 planar pels 8-bit samples per channel in 00106 homogeneous vector registers x0,x1,x2 are interleaved with the 00107 following technique: 00108 00109 o0 = vec_mergeh (x0,x1); 00110 o1 = vec_perm (o0, x2, perm_rgb_0); 00111 o2 = vec_perm (o0, x2, perm_rgb_1); 00112 o3 = vec_mergel (x0,x1); 00113 o4 = vec_perm (o3,o2,perm_rgb_2); 00114 o5 = vec_perm (o3,o2,perm_rgb_3); 00115 00116 perm_rgb_0: o0(RG).h v1(B) --> o1* 00117 0 1 2 3 4 00118 rgbr|gbrg|brgb|rgbr 00119 0010 0100 1001 0010 00120 0102 3145 2673 894A 00121 00122 perm_rgb_1: o0(RG).h v1(B) --> o2 00123 0 1 2 3 4 00124 gbrg|brgb|bbbb|bbbb 00125 0100 1001 1111 1111 00126 B5CD 6EF7 89AB CDEF 00127 00128 perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4* 00129 0 1 2 3 4 00130 gbrg|brgb|rgbr|gbrg 00131 1111 1111 0010 0100 00132 89AB CDEF 0182 3945 00133 00134 perm_rgb_2: o3(RG).l o2(rgbB.l) ---> o5* 00135 0 1 2 3 4 00136 brgb|rgbr|gbrg|brgb 00137 1001 0010 0100 1001 00138 a67b 89cA BdCD eEFf 00139 00140 */ 00141 static 00142 const vector unsigned char 00143 perm_rgb_0 = {0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05, 00144 0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a}, 00145 perm_rgb_1 = {0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17, 00146 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f}, 00147 perm_rgb_2 = {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17, 00148 0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05}, 00149 perm_rgb_3 = {0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a, 00150 0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f}; 00151 00152 #define vec_merge3(x2,x1,x0,y0,y1,y2) \ 00153 do { \ 00154 __typeof__(x0) o0,o2,o3; \ 00155 o0 = vec_mergeh (x0,x1); \ 00156 y0 = vec_perm (o0, x2, perm_rgb_0); \ 00157 o2 = vec_perm (o0, x2, perm_rgb_1); \ 00158 o3 = vec_mergel (x0,x1); \ 00159 y1 = vec_perm (o3,o2,perm_rgb_2); \ 00160 y2 = vec_perm (o3,o2,perm_rgb_3); \ 00161 } while(0) 00162 00163 #define vec_mstbgr24(x0,x1,x2,ptr) \ 00164 do { \ 00165 __typeof__(x0) _0,_1,_2; \ 00166 vec_merge3 (x0,x1,x2,_0,_1,_2); \ 00167 vec_st (_0, 0, ptr++); \ 00168 vec_st (_1, 0, ptr++); \ 00169 vec_st (_2, 0, ptr++); \ 00170 } while (0) 00171 00172 #define vec_mstrgb24(x0,x1,x2,ptr) \ 00173 do { \ 00174 __typeof__(x0) _0,_1,_2; \ 00175 vec_merge3 (x2,x1,x0,_0,_1,_2); \ 00176 vec_st (_0, 0, ptr++); \ 00177 vec_st (_1, 0, ptr++); \ 00178 vec_st (_2, 0, ptr++); \ 00179 } while (0) 00180 00181 /* pack the pixels in rgb0 format 00182 msb R 00183 lsb 0 00184 */ 00185 #define vec_mstrgb32(T,x0,x1,x2,x3,ptr) \ 00186 do { \ 00187 T _0,_1,_2,_3; \ 00188 _0 = vec_mergeh (x0,x1); \ 00189 _1 = vec_mergeh (x2,x3); \ 00190 _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \ 00191 _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \ 00192 vec_st (_2, 0*16, (T *)ptr); \ 00193 vec_st (_3, 1*16, (T *)ptr); \ 00194 _0 = vec_mergel (x0,x1); \ 00195 _1 = vec_mergel (x2,x3); \ 00196 _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \ 00197 _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \ 00198 vec_st (_2, 2*16, (T *)ptr); \ 00199 vec_st (_3, 3*16, (T *)ptr); \ 00200 ptr += 4; \ 00201 } while (0) 00202 00203 /* 00204 00205 | 1 0 1.4021 | | Y | 00206 | 1 -0.3441 -0.7142 |x| Cb| 00207 | 1 1.7718 0 | | Cr| 00208 00209 00210 Y: [-128 127] 00211 Cb/Cr : [-128 127] 00212 00213 typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode. 00214 00215 */ 00216 00217 00218 00219 00220 #define vec_unh(x) \ 00221 (vector signed short) \ 00222 vec_perm(x,(__typeof__(x)){0}, \ 00223 ((vector unsigned char){0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\ 00224 0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07})) 00225 #define vec_unl(x) \ 00226 (vector signed short) \ 00227 vec_perm(x,(__typeof__(x)){0}, \ 00228 ((vector unsigned char){0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\ 00229 0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F})) 00230 00231 #define vec_clip_s16(x) \ 00232 vec_max (vec_min (x, ((vector signed short){235,235,235,235,235,235,235,235})), \ 00233 ((vector signed short){ 16, 16, 16, 16, 16, 16, 16, 16})) 00234 00235 #define vec_packclp(x,y) \ 00236 (vector unsigned char)vec_packs \ 00237 ((vector unsigned short)vec_max (x,((vector signed short) {0})), \ 00238 (vector unsigned short)vec_max (y,((vector signed short) {0}))) 00239 00240 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,a,a,ptr) 00241 00242 00243 static inline void cvtyuvtoRGB (SwsContext *c, 00244 vector signed short Y, vector signed short U, vector signed short V, 00245 vector signed short *R, vector signed short *G, vector signed short *B) 00246 { 00247 vector signed short vx,ux,uvx; 00248 00249 Y = vec_mradds (Y, c->CY, c->OY); 00250 U = vec_sub (U,(vector signed short) 00251 vec_splat((vector signed short){128},0)); 00252 V = vec_sub (V,(vector signed short) 00253 vec_splat((vector signed short){128},0)); 00254 00255 // ux = (CBU*(u<<c->CSHIFT)+0x4000)>>15; 00256 ux = vec_sl (U, c->CSHIFT); 00257 *B = vec_mradds (ux, c->CBU, Y); 00258 00259 // vx = (CRV*(v<<c->CSHIFT)+0x4000)>>15; 00260 vx = vec_sl (V, c->CSHIFT); 00261 *R = vec_mradds (vx, c->CRV, Y); 00262 00263 // uvx = ((CGU*u) + (CGV*v))>>15; 00264 uvx = vec_mradds (U, c->CGU, Y); 00265 *G = vec_mradds (V, c->CGV, uvx); 00266 } 00267 00268 00269 /* 00270 ------------------------------------------------------------------------------ 00271 CS converters 00272 ------------------------------------------------------------------------------ 00273 */ 00274 00275 00276 #define DEFCSP420_CVT(name,out_pixels) \ 00277 static int altivec_##name (SwsContext *c, \ 00278 unsigned char **in, int *instrides, \ 00279 int srcSliceY, int srcSliceH, \ 00280 unsigned char **oplanes, int *outstrides) \ 00281 { \ 00282 int w = c->srcW; \ 00283 int h = srcSliceH; \ 00284 int i,j; \ 00285 int instrides_scl[3]; \ 00286 vector unsigned char y0,y1; \ 00287 \ 00288 vector signed char u,v; \ 00289 \ 00290 vector signed short Y0,Y1,Y2,Y3; \ 00291 vector signed short U,V; \ 00292 vector signed short vx,ux,uvx; \ 00293 vector signed short vx0,ux0,uvx0; \ 00294 vector signed short vx1,ux1,uvx1; \ 00295 vector signed short R0,G0,B0; \ 00296 vector signed short R1,G1,B1; \ 00297 vector unsigned char R,G,B; \ 00298 \ 00299 vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP; \ 00300 vector unsigned char align_perm; \ 00301 \ 00302 vector signed short \ 00303 lCY = c->CY, \ 00304 lOY = c->OY, \ 00305 lCRV = c->CRV, \ 00306 lCBU = c->CBU, \ 00307 lCGU = c->CGU, \ 00308 lCGV = c->CGV; \ 00309 \ 00310 vector unsigned short lCSHIFT = c->CSHIFT; \ 00311 \ 00312 ubyte *y1i = in[0]; \ 00313 ubyte *y2i = in[0]+instrides[0]; \ 00314 ubyte *ui = in[1]; \ 00315 ubyte *vi = in[2]; \ 00316 \ 00317 vector unsigned char *oute \ 00318 = (vector unsigned char *) \ 00319 (oplanes[0]+srcSliceY*outstrides[0]); \ 00320 vector unsigned char *outo \ 00321 = (vector unsigned char *) \ 00322 (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]); \ 00323 \ 00324 \ 00325 instrides_scl[0] = instrides[0]*2-w; /* the loop moves y{1,2}i by w */ \ 00326 instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */ \ 00327 instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */ \ 00328 \ 00329 \ 00330 for (i=0;i<h/2;i++) { \ 00331 vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0); \ 00332 vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1); \ 00333 \ 00334 for (j=0;j<w/16;j++) { \ 00335 \ 00336 y1ivP = (vector unsigned char *)y1i; \ 00337 y2ivP = (vector unsigned char *)y2i; \ 00338 uivP = (vector unsigned char *)ui; \ 00339 vivP = (vector unsigned char *)vi; \ 00340 \ 00341 align_perm = vec_lvsl (0, y1i); \ 00342 y0 = (vector unsigned char) \ 00343 vec_perm (y1ivP[0], y1ivP[1], align_perm); \ 00344 \ 00345 align_perm = vec_lvsl (0, y2i); \ 00346 y1 = (vector unsigned char) \ 00347 vec_perm (y2ivP[0], y2ivP[1], align_perm); \ 00348 \ 00349 align_perm = vec_lvsl (0, ui); \ 00350 u = (vector signed char) \ 00351 vec_perm (uivP[0], uivP[1], align_perm); \ 00352 \ 00353 align_perm = vec_lvsl (0, vi); \ 00354 v = (vector signed char) \ 00355 vec_perm (vivP[0], vivP[1], align_perm); \ 00356 \ 00357 u = (vector signed char) \ 00358 vec_sub (u,(vector signed char) \ 00359 vec_splat((vector signed char){128},0)); \ 00360 v = (vector signed char) \ 00361 vec_sub (v,(vector signed char) \ 00362 vec_splat((vector signed char){128},0)); \ 00363 \ 00364 U = vec_unpackh (u); \ 00365 V = vec_unpackh (v); \ 00366 \ 00367 \ 00368 Y0 = vec_unh (y0); \ 00369 Y1 = vec_unl (y0); \ 00370 Y2 = vec_unh (y1); \ 00371 Y3 = vec_unl (y1); \ 00372 \ 00373 Y0 = vec_mradds (Y0, lCY, lOY); \ 00374 Y1 = vec_mradds (Y1, lCY, lOY); \ 00375 Y2 = vec_mradds (Y2, lCY, lOY); \ 00376 Y3 = vec_mradds (Y3, lCY, lOY); \ 00377 \ 00378 /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */ \ 00379 ux = vec_sl (U, lCSHIFT); \ 00380 ux = vec_mradds (ux, lCBU, (vector signed short){0}); \ 00381 ux0 = vec_mergeh (ux,ux); \ 00382 ux1 = vec_mergel (ux,ux); \ 00383 \ 00384 /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */ \ 00385 vx = vec_sl (V, lCSHIFT); \ 00386 vx = vec_mradds (vx, lCRV, (vector signed short){0}); \ 00387 vx0 = vec_mergeh (vx,vx); \ 00388 vx1 = vec_mergel (vx,vx); \ 00389 \ 00390 /* uvx = ((CGU*u) + (CGV*v))>>15 */ \ 00391 uvx = vec_mradds (U, lCGU, (vector signed short){0}); \ 00392 uvx = vec_mradds (V, lCGV, uvx); \ 00393 uvx0 = vec_mergeh (uvx,uvx); \ 00394 uvx1 = vec_mergel (uvx,uvx); \ 00395 \ 00396 R0 = vec_add (Y0,vx0); \ 00397 G0 = vec_add (Y0,uvx0); \ 00398 B0 = vec_add (Y0,ux0); \ 00399 R1 = vec_add (Y1,vx1); \ 00400 G1 = vec_add (Y1,uvx1); \ 00401 B1 = vec_add (Y1,ux1); \ 00402 \ 00403 R = vec_packclp (R0,R1); \ 00404 G = vec_packclp (G0,G1); \ 00405 B = vec_packclp (B0,B1); \ 00406 \ 00407 out_pixels(R,G,B,oute); \ 00408 \ 00409 R0 = vec_add (Y2,vx0); \ 00410 G0 = vec_add (Y2,uvx0); \ 00411 B0 = vec_add (Y2,ux0); \ 00412 R1 = vec_add (Y3,vx1); \ 00413 G1 = vec_add (Y3,uvx1); \ 00414 B1 = vec_add (Y3,ux1); \ 00415 R = vec_packclp (R0,R1); \ 00416 G = vec_packclp (G0,G1); \ 00417 B = vec_packclp (B0,B1); \ 00418 \ 00419 \ 00420 out_pixels(R,G,B,outo); \ 00421 \ 00422 y1i += 16; \ 00423 y2i += 16; \ 00424 ui += 8; \ 00425 vi += 8; \ 00426 \ 00427 } \ 00428 \ 00429 outo += (outstrides[0])>>4; \ 00430 oute += (outstrides[0])>>4; \ 00431 \ 00432 ui += instrides_scl[1]; \ 00433 vi += instrides_scl[2]; \ 00434 y1i += instrides_scl[0]; \ 00435 y2i += instrides_scl[0]; \ 00436 } \ 00437 return srcSliceH; \ 00438 } 00439 00440 00441 #define out_abgr(a,b,c,ptr) vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),c,b,a,ptr) 00442 #define out_bgra(a,b,c,ptr) vec_mstrgb32(__typeof__(a),c,b,a,((__typeof__ (a)){255}),ptr) 00443 #define out_rgba(a,b,c,ptr) vec_mstrgb32(__typeof__(a),a,b,c,((__typeof__ (a)){255}),ptr) 00444 #define out_argb(a,b,c,ptr) vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,b,c,ptr) 00445 #define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr) 00446 #define out_bgr24(a,b,c,ptr) vec_mstbgr24(a,b,c,ptr) 00447 00448 DEFCSP420_CVT (yuv2_abgr, out_abgr) 00449 #if 1 00450 DEFCSP420_CVT (yuv2_bgra, out_bgra) 00451 #else 00452 static int altivec_yuv2_bgra32 (SwsContext *c, 00453 unsigned char **in, int *instrides, 00454 int srcSliceY, int srcSliceH, 00455 unsigned char **oplanes, int *outstrides) 00456 { 00457 int w = c->srcW; 00458 int h = srcSliceH; 00459 int i,j; 00460 int instrides_scl[3]; 00461 vector unsigned char y0,y1; 00462 00463 vector signed char u,v; 00464 00465 vector signed short Y0,Y1,Y2,Y3; 00466 vector signed short U,V; 00467 vector signed short vx,ux,uvx; 00468 vector signed short vx0,ux0,uvx0; 00469 vector signed short vx1,ux1,uvx1; 00470 vector signed short R0,G0,B0; 00471 vector signed short R1,G1,B1; 00472 vector unsigned char R,G,B; 00473 00474 vector unsigned char *uivP, *vivP; 00475 vector unsigned char align_perm; 00476 00477 vector signed short 00478 lCY = c->CY, 00479 lOY = c->OY, 00480 lCRV = c->CRV, 00481 lCBU = c->CBU, 00482 lCGU = c->CGU, 00483 lCGV = c->CGV; 00484 00485 vector unsigned short lCSHIFT = c->CSHIFT; 00486 00487 ubyte *y1i = in[0]; 00488 ubyte *y2i = in[0]+w; 00489 ubyte *ui = in[1]; 00490 ubyte *vi = in[2]; 00491 00492 vector unsigned char *oute 00493 = (vector unsigned char *) 00494 (oplanes[0]+srcSliceY*outstrides[0]); 00495 vector unsigned char *outo 00496 = (vector unsigned char *) 00497 (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]); 00498 00499 00500 instrides_scl[0] = instrides[0]; 00501 instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */ 00502 instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */ 00503 00504 00505 for (i=0;i<h/2;i++) { 00506 vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0); 00507 vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1); 00508 00509 for (j=0;j<w/16;j++) { 00510 00511 y0 = vec_ldl (0,y1i); 00512 y1 = vec_ldl (0,y2i); 00513 uivP = (vector unsigned char *)ui; 00514 vivP = (vector unsigned char *)vi; 00515 00516 align_perm = vec_lvsl (0, ui); 00517 u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm); 00518 00519 align_perm = vec_lvsl (0, vi); 00520 v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm); 00521 u = (vector signed char) 00522 vec_sub (u,(vector signed char) 00523 vec_splat((vector signed char){128},0)); 00524 00525 v = (vector signed char) 00526 vec_sub (v, (vector signed char) 00527 vec_splat((vector signed char){128},0)); 00528 00529 U = vec_unpackh (u); 00530 V = vec_unpackh (v); 00531 00532 00533 Y0 = vec_unh (y0); 00534 Y1 = vec_unl (y0); 00535 Y2 = vec_unh (y1); 00536 Y3 = vec_unl (y1); 00537 00538 Y0 = vec_mradds (Y0, lCY, lOY); 00539 Y1 = vec_mradds (Y1, lCY, lOY); 00540 Y2 = vec_mradds (Y2, lCY, lOY); 00541 Y3 = vec_mradds (Y3, lCY, lOY); 00542 00543 /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */ 00544 ux = vec_sl (U, lCSHIFT); 00545 ux = vec_mradds (ux, lCBU, (vector signed short){0}); 00546 ux0 = vec_mergeh (ux,ux); 00547 ux1 = vec_mergel (ux,ux); 00548 00549 /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */ 00550 vx = vec_sl (V, lCSHIFT); 00551 vx = vec_mradds (vx, lCRV, (vector signed short){0}); 00552 vx0 = vec_mergeh (vx,vx); 00553 vx1 = vec_mergel (vx,vx); 00554 /* uvx = ((CGU*u) + (CGV*v))>>15 */ 00555 uvx = vec_mradds (U, lCGU, (vector signed short){0}); 00556 uvx = vec_mradds (V, lCGV, uvx); 00557 uvx0 = vec_mergeh (uvx,uvx); 00558 uvx1 = vec_mergel (uvx,uvx); 00559 R0 = vec_add (Y0,vx0); 00560 G0 = vec_add (Y0,uvx0); 00561 B0 = vec_add (Y0,ux0); 00562 R1 = vec_add (Y1,vx1); 00563 G1 = vec_add (Y1,uvx1); 00564 B1 = vec_add (Y1,ux1); 00565 R = vec_packclp (R0,R1); 00566 G = vec_packclp (G0,G1); 00567 B = vec_packclp (B0,B1); 00568 00569 out_argb(R,G,B,oute); 00570 R0 = vec_add (Y2,vx0); 00571 G0 = vec_add (Y2,uvx0); 00572 B0 = vec_add (Y2,ux0); 00573 R1 = vec_add (Y3,vx1); 00574 G1 = vec_add (Y3,uvx1); 00575 B1 = vec_add (Y3,ux1); 00576 R = vec_packclp (R0,R1); 00577 G = vec_packclp (G0,G1); 00578 B = vec_packclp (B0,B1); 00579 00580 out_argb(R,G,B,outo); 00581 y1i += 16; 00582 y2i += 16; 00583 ui += 8; 00584 vi += 8; 00585 00586 } 00587 00588 outo += (outstrides[0])>>4; 00589 oute += (outstrides[0])>>4; 00590 00591 ui += instrides_scl[1]; 00592 vi += instrides_scl[2]; 00593 y1i += instrides_scl[0]; 00594 y2i += instrides_scl[0]; 00595 } 00596 return srcSliceH; 00597 } 00598 00599 #endif 00600 00601 00602 DEFCSP420_CVT (yuv2_rgba, out_rgba) 00603 DEFCSP420_CVT (yuv2_argb, out_argb) 00604 DEFCSP420_CVT (yuv2_rgb24, out_rgb24) 00605 DEFCSP420_CVT (yuv2_bgr24, out_bgr24) 00606 00607 00608 // uyvy|uyvy|uyvy|uyvy 00609 // 0123 4567 89ab cdef 00610 static 00611 const vector unsigned char 00612 demux_u = {0x10,0x00,0x10,0x00, 00613 0x10,0x04,0x10,0x04, 00614 0x10,0x08,0x10,0x08, 00615 0x10,0x0c,0x10,0x0c}, 00616 demux_v = {0x10,0x02,0x10,0x02, 00617 0x10,0x06,0x10,0x06, 00618 0x10,0x0A,0x10,0x0A, 00619 0x10,0x0E,0x10,0x0E}, 00620 demux_y = {0x10,0x01,0x10,0x03, 00621 0x10,0x05,0x10,0x07, 00622 0x10,0x09,0x10,0x0B, 00623 0x10,0x0D,0x10,0x0F}; 00624 00625 /* 00626 this is so I can play live CCIR raw video 00627 */ 00628 static int altivec_uyvy_rgb32 (SwsContext *c, 00629 unsigned char **in, int *instrides, 00630 int srcSliceY, int srcSliceH, 00631 unsigned char **oplanes, int *outstrides) 00632 { 00633 int w = c->srcW; 00634 int h = srcSliceH; 00635 int i,j; 00636 vector unsigned char uyvy; 00637 vector signed short Y,U,V; 00638 vector signed short R0,G0,B0,R1,G1,B1; 00639 vector unsigned char R,G,B; 00640 vector unsigned char *out; 00641 ubyte *img; 00642 00643 img = in[0]; 00644 out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]); 00645 00646 for (i=0;i<h;i++) { 00647 for (j=0;j<w/16;j++) { 00648 uyvy = vec_ld (0, img); 00649 U = (vector signed short) 00650 vec_perm (uyvy, (vector unsigned char){0}, demux_u); 00651 00652 V = (vector signed short) 00653 vec_perm (uyvy, (vector unsigned char){0}, demux_v); 00654 00655 Y = (vector signed short) 00656 vec_perm (uyvy, (vector unsigned char){0}, demux_y); 00657 00658 cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0); 00659 00660 uyvy = vec_ld (16, img); 00661 U = (vector signed short) 00662 vec_perm (uyvy, (vector unsigned char){0}, demux_u); 00663 00664 V = (vector signed short) 00665 vec_perm (uyvy, (vector unsigned char){0}, demux_v); 00666 00667 Y = (vector signed short) 00668 vec_perm (uyvy, (vector unsigned char){0}, demux_y); 00669 00670 cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1); 00671 00672 R = vec_packclp (R0,R1); 00673 G = vec_packclp (G0,G1); 00674 B = vec_packclp (B0,B1); 00675 00676 // vec_mstbgr24 (R,G,B, out); 00677 out_rgba (R,G,B,out); 00678 00679 img += 32; 00680 } 00681 } 00682 return srcSliceH; 00683 } 00684 00685 00686 00687 /* Ok currently the acceleration routine only supports 00688 inputs of widths a multiple of 16 00689 and heights a multiple 2 00690 00691 So we just fall back to the C codes for this. 00692 */ 00693 SwsFunc ff_yuv2rgb_init_altivec(SwsContext *c) 00694 { 00695 if (!(c->flags & SWS_CPU_CAPS_ALTIVEC)) 00696 return NULL; 00697 00698 /* 00699 and this seems not to matter too much I tried a bunch of 00700 videos with abnormal widths and MPlayer crashes elsewhere. 00701 mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv 00702 boom with X11 bad match. 00703 00704 */ 00705 if ((c->srcW & 0xf) != 0) return NULL; 00706 00707 switch (c->srcFormat) { 00708 case PIX_FMT_YUV410P: 00709 case PIX_FMT_YUV420P: 00710 /*case IMGFMT_CLPL: ??? */ 00711 case PIX_FMT_GRAY8: 00712 case PIX_FMT_NV12: 00713 case PIX_FMT_NV21: 00714 if ((c->srcH & 0x1) != 0) 00715 return NULL; 00716 00717 switch(c->dstFormat) { 00718 case PIX_FMT_RGB24: 00719 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n"); 00720 return altivec_yuv2_rgb24; 00721 case PIX_FMT_BGR24: 00722 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n"); 00723 return altivec_yuv2_bgr24; 00724 case PIX_FMT_ARGB: 00725 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n"); 00726 return altivec_yuv2_argb; 00727 case PIX_FMT_ABGR: 00728 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n"); 00729 return altivec_yuv2_abgr; 00730 case PIX_FMT_RGBA: 00731 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n"); 00732 return altivec_yuv2_rgba; 00733 case PIX_FMT_BGRA: 00734 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n"); 00735 return altivec_yuv2_bgra; 00736 default: return NULL; 00737 } 00738 break; 00739 00740 case PIX_FMT_UYVY422: 00741 switch(c->dstFormat) { 00742 case PIX_FMT_BGR32: 00743 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n"); 00744 return altivec_uyvy_rgb32; 00745 default: return NULL; 00746 } 00747 break; 00748 00749 } 00750 return NULL; 00751 } 00752 00753 void ff_yuv2rgb_init_tables_altivec(SwsContext *c, const int inv_table[4], int brightness, int contrast, int saturation) 00754 { 00755 union { 00756 DECLARE_ALIGNED(16, signed short, tmp)[8]; 00757 vector signed short vec; 00758 } buf; 00759 00760 buf.tmp[0] = ((0xffffLL) * contrast>>8)>>9; //cy 00761 buf.tmp[1] = -256*brightness; //oy 00762 buf.tmp[2] = (inv_table[0]>>3) *(contrast>>16)*(saturation>>16); //crv 00763 buf.tmp[3] = (inv_table[1]>>3) *(contrast>>16)*(saturation>>16); //cbu 00764 buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16)); //cgu 00765 buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16)); //cgv 00766 00767 00768 c->CSHIFT = (vector unsigned short)vec_splat_u16(2); 00769 c->CY = vec_splat ((vector signed short)buf.vec, 0); 00770 c->OY = vec_splat ((vector signed short)buf.vec, 1); 00771 c->CRV = vec_splat ((vector signed short)buf.vec, 2); 00772 c->CBU = vec_splat ((vector signed short)buf.vec, 3); 00773 c->CGU = vec_splat ((vector signed short)buf.vec, 4); 00774 c->CGV = vec_splat ((vector signed short)buf.vec, 5); 00775 return; 00776 } 00777 00778 00779 void 00780 ff_yuv2packedX_altivec(SwsContext *c, 00781 const int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, 00782 const int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, 00783 uint8_t *dest, int dstW, int dstY) 00784 { 00785 int i,j; 00786 vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V; 00787 vector signed short R0,G0,B0,R1,G1,B1; 00788 00789 vector unsigned char R,G,B; 00790 vector unsigned char *out,*nout; 00791 00792 vector signed short RND = vec_splat_s16(1<<3); 00793 vector unsigned short SCL = vec_splat_u16(4); 00794 DECLARE_ALIGNED(16, unsigned long, scratch)[16]; 00795 00796 vector signed short *YCoeffs, *CCoeffs; 00797 00798 YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize; 00799 CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize; 00800 00801 out = (vector unsigned char *)dest; 00802 00803 for (i=0; i<dstW; i+=16) { 00804 Y0 = RND; 00805 Y1 = RND; 00806 /* extract 16 coeffs from lumSrc */ 00807 for (j=0; j<lumFilterSize; j++) { 00808 X0 = vec_ld (0, &lumSrc[j][i]); 00809 X1 = vec_ld (16, &lumSrc[j][i]); 00810 Y0 = vec_mradds (X0, YCoeffs[j], Y0); 00811 Y1 = vec_mradds (X1, YCoeffs[j], Y1); 00812 } 00813 00814 U = RND; 00815 V = RND; 00816 /* extract 8 coeffs from U,V */ 00817 for (j=0; j<chrFilterSize; j++) { 00818 X = vec_ld (0, &chrSrc[j][i/2]); 00819 U = vec_mradds (X, CCoeffs[j], U); 00820 X = vec_ld (0, &chrSrc[j][i/2+2048]); 00821 V = vec_mradds (X, CCoeffs[j], V); 00822 } 00823 00824 /* scale and clip signals */ 00825 Y0 = vec_sra (Y0, SCL); 00826 Y1 = vec_sra (Y1, SCL); 00827 U = vec_sra (U, SCL); 00828 V = vec_sra (V, SCL); 00829 00830 Y0 = vec_clip_s16 (Y0); 00831 Y1 = vec_clip_s16 (Y1); 00832 U = vec_clip_s16 (U); 00833 V = vec_clip_s16 (V); 00834 00835 /* now we have 00836 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15 00837 U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7 00838 00839 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15 00840 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7 00841 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7 00842 */ 00843 00844 U0 = vec_mergeh (U,U); 00845 V0 = vec_mergeh (V,V); 00846 00847 U1 = vec_mergel (U,U); 00848 V1 = vec_mergel (V,V); 00849 00850 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0); 00851 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1); 00852 00853 R = vec_packclp (R0,R1); 00854 G = vec_packclp (G0,G1); 00855 B = vec_packclp (B0,B1); 00856 00857 switch(c->dstFormat) { 00858 case PIX_FMT_ABGR: out_abgr (R,G,B,out); break; 00859 case PIX_FMT_BGRA: out_bgra (R,G,B,out); break; 00860 case PIX_FMT_RGBA: out_rgba (R,G,B,out); break; 00861 case PIX_FMT_ARGB: out_argb (R,G,B,out); break; 00862 case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break; 00863 case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break; 00864 default: 00865 { 00866 /* If this is reached, the caller should have called yuv2packedXinC 00867 instead. */ 00868 static int printed_error_message; 00869 if (!printed_error_message) { 00870 av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n", 00871 sws_format_name(c->dstFormat)); 00872 printed_error_message=1; 00873 } 00874 return; 00875 } 00876 } 00877 } 00878 00879 if (i < dstW) { 00880 i -= 16; 00881 00882 Y0 = RND; 00883 Y1 = RND; 00884 /* extract 16 coeffs from lumSrc */ 00885 for (j=0; j<lumFilterSize; j++) { 00886 X0 = vec_ld (0, &lumSrc[j][i]); 00887 X1 = vec_ld (16, &lumSrc[j][i]); 00888 Y0 = vec_mradds (X0, YCoeffs[j], Y0); 00889 Y1 = vec_mradds (X1, YCoeffs[j], Y1); 00890 } 00891 00892 U = RND; 00893 V = RND; 00894 /* extract 8 coeffs from U,V */ 00895 for (j=0; j<chrFilterSize; j++) { 00896 X = vec_ld (0, &chrSrc[j][i/2]); 00897 U = vec_mradds (X, CCoeffs[j], U); 00898 X = vec_ld (0, &chrSrc[j][i/2+2048]); 00899 V = vec_mradds (X, CCoeffs[j], V); 00900 } 00901 00902 /* scale and clip signals */ 00903 Y0 = vec_sra (Y0, SCL); 00904 Y1 = vec_sra (Y1, SCL); 00905 U = vec_sra (U, SCL); 00906 V = vec_sra (V, SCL); 00907 00908 Y0 = vec_clip_s16 (Y0); 00909 Y1 = vec_clip_s16 (Y1); 00910 U = vec_clip_s16 (U); 00911 V = vec_clip_s16 (V); 00912 00913 /* now we have 00914 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15 00915 U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7 00916 00917 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15 00918 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7 00919 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7 00920 */ 00921 00922 U0 = vec_mergeh (U,U); 00923 V0 = vec_mergeh (V,V); 00924 00925 U1 = vec_mergel (U,U); 00926 V1 = vec_mergel (V,V); 00927 00928 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0); 00929 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1); 00930 00931 R = vec_packclp (R0,R1); 00932 G = vec_packclp (G0,G1); 00933 B = vec_packclp (B0,B1); 00934 00935 nout = (vector unsigned char *)scratch; 00936 switch(c->dstFormat) { 00937 case PIX_FMT_ABGR: out_abgr (R,G,B,nout); break; 00938 case PIX_FMT_BGRA: out_bgra (R,G,B,nout); break; 00939 case PIX_FMT_RGBA: out_rgba (R,G,B,nout); break; 00940 case PIX_FMT_ARGB: out_argb (R,G,B,nout); break; 00941 case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break; 00942 case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break; 00943 default: 00944 /* Unreachable, I think. */ 00945 av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n", 00946 sws_format_name(c->dstFormat)); 00947 return; 00948 } 00949 00950 memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4); 00951 } 00952 00953 }