Libav
|
00001 /* 00002 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at> 00003 * 00004 * This file is part of FFmpeg. 00005 * 00006 * FFmpeg is free software; you can redistribute it and/or 00007 * modify it under the terms of the GNU Lesser General Public 00008 * License as published by the Free Software Foundation; either 00009 * version 2.1 of the License, or (at your option) any later version. 00010 * 00011 * FFmpeg is distributed in the hope that it will be useful, 00012 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00014 * Lesser General Public License for more details. 00015 * 00016 * You should have received a copy of the GNU Lesser General Public 00017 * License along with FFmpeg; if not, write to the Free Software 00018 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 00019 */ 00020 00021 #undef REAL_MOVNTQ 00022 #undef MOVNTQ 00023 #undef PAVGB 00024 #undef PREFETCH 00025 00026 #if COMPILE_TEMPLATE_AMD3DNOW 00027 #define PREFETCH "prefetch" 00028 #elif COMPILE_TEMPLATE_MMX2 00029 #define PREFETCH "prefetchnta" 00030 #else 00031 #define PREFETCH " # nop" 00032 #endif 00033 00034 #if COMPILE_TEMPLATE_MMX2 00035 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t" 00036 #elif COMPILE_TEMPLATE_AMD3DNOW 00037 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t" 00038 #endif 00039 00040 #if COMPILE_TEMPLATE_MMX2 00041 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t" 00042 #else 00043 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t" 00044 #endif 00045 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b) 00046 00047 #if COMPILE_TEMPLATE_ALTIVEC 00048 #include "ppc/swscale_altivec_template.c" 00049 #endif 00050 00051 #define YSCALEYUV2YV12X(x, offset, dest, width) \ 00052 __asm__ volatile(\ 00053 "xor %%"REG_a", %%"REG_a" \n\t"\ 00054 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\ 00055 "movq %%mm3, %%mm4 \n\t"\ 00056 "lea " offset "(%0), %%"REG_d" \n\t"\ 00057 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 00058 ASMALIGN(4) /* FIXME Unroll? */\ 00059 "1: \n\t"\ 00060 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\ 00061 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\ 00062 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\ 00063 "add $16, %%"REG_d" \n\t"\ 00064 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 00065 "test %%"REG_S", %%"REG_S" \n\t"\ 00066 "pmulhw %%mm0, %%mm2 \n\t"\ 00067 "pmulhw %%mm0, %%mm5 \n\t"\ 00068 "paddw %%mm2, %%mm3 \n\t"\ 00069 "paddw %%mm5, %%mm4 \n\t"\ 00070 " jnz 1b \n\t"\ 00071 "psraw $3, %%mm3 \n\t"\ 00072 "psraw $3, %%mm4 \n\t"\ 00073 "packuswb %%mm4, %%mm3 \n\t"\ 00074 MOVNTQ(%%mm3, (%1, %%REGa))\ 00075 "add $8, %%"REG_a" \n\t"\ 00076 "cmp %2, %%"REG_a" \n\t"\ 00077 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\ 00078 "movq %%mm3, %%mm4 \n\t"\ 00079 "lea " offset "(%0), %%"REG_d" \n\t"\ 00080 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 00081 "jb 1b \n\t"\ 00082 :: "r" (&c->redDither),\ 00083 "r" (dest), "g" (width)\ 00084 : "%"REG_a, "%"REG_d, "%"REG_S\ 00085 ); 00086 00087 #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \ 00088 __asm__ volatile(\ 00089 "lea " offset "(%0), %%"REG_d" \n\t"\ 00090 "xor %%"REG_a", %%"REG_a" \n\t"\ 00091 "pxor %%mm4, %%mm4 \n\t"\ 00092 "pxor %%mm5, %%mm5 \n\t"\ 00093 "pxor %%mm6, %%mm6 \n\t"\ 00094 "pxor %%mm7, %%mm7 \n\t"\ 00095 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 00096 ASMALIGN(4) \ 00097 "1: \n\t"\ 00098 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\ 00099 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\ 00100 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\ 00101 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\ 00102 "movq %%mm0, %%mm3 \n\t"\ 00103 "punpcklwd %%mm1, %%mm0 \n\t"\ 00104 "punpckhwd %%mm1, %%mm3 \n\t"\ 00105 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\ 00106 "pmaddwd %%mm1, %%mm0 \n\t"\ 00107 "pmaddwd %%mm1, %%mm3 \n\t"\ 00108 "paddd %%mm0, %%mm4 \n\t"\ 00109 "paddd %%mm3, %%mm5 \n\t"\ 00110 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\ 00111 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\ 00112 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\ 00113 "test %%"REG_S", %%"REG_S" \n\t"\ 00114 "movq %%mm2, %%mm0 \n\t"\ 00115 "punpcklwd %%mm3, %%mm2 \n\t"\ 00116 "punpckhwd %%mm3, %%mm0 \n\t"\ 00117 "pmaddwd %%mm1, %%mm2 \n\t"\ 00118 "pmaddwd %%mm1, %%mm0 \n\t"\ 00119 "paddd %%mm2, %%mm6 \n\t"\ 00120 "paddd %%mm0, %%mm7 \n\t"\ 00121 " jnz 1b \n\t"\ 00122 "psrad $16, %%mm4 \n\t"\ 00123 "psrad $16, %%mm5 \n\t"\ 00124 "psrad $16, %%mm6 \n\t"\ 00125 "psrad $16, %%mm7 \n\t"\ 00126 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\ 00127 "packssdw %%mm5, %%mm4 \n\t"\ 00128 "packssdw %%mm7, %%mm6 \n\t"\ 00129 "paddw %%mm0, %%mm4 \n\t"\ 00130 "paddw %%mm0, %%mm6 \n\t"\ 00131 "psraw $3, %%mm4 \n\t"\ 00132 "psraw $3, %%mm6 \n\t"\ 00133 "packuswb %%mm6, %%mm4 \n\t"\ 00134 MOVNTQ(%%mm4, (%1, %%REGa))\ 00135 "add $8, %%"REG_a" \n\t"\ 00136 "cmp %2, %%"REG_a" \n\t"\ 00137 "lea " offset "(%0), %%"REG_d" \n\t"\ 00138 "pxor %%mm4, %%mm4 \n\t"\ 00139 "pxor %%mm5, %%mm5 \n\t"\ 00140 "pxor %%mm6, %%mm6 \n\t"\ 00141 "pxor %%mm7, %%mm7 \n\t"\ 00142 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 00143 "jb 1b \n\t"\ 00144 :: "r" (&c->redDither),\ 00145 "r" (dest), "g" (width)\ 00146 : "%"REG_a, "%"REG_d, "%"REG_S\ 00147 ); 00148 00149 #define YSCALEYUV2YV121 \ 00150 "mov %2, %%"REG_a" \n\t"\ 00151 ASMALIGN(4) /* FIXME Unroll? */\ 00152 "1: \n\t"\ 00153 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\ 00154 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\ 00155 "psraw $7, %%mm0 \n\t"\ 00156 "psraw $7, %%mm1 \n\t"\ 00157 "packuswb %%mm1, %%mm0 \n\t"\ 00158 MOVNTQ(%%mm0, (%1, %%REGa))\ 00159 "add $8, %%"REG_a" \n\t"\ 00160 "jnc 1b \n\t" 00161 00162 #define YSCALEYUV2YV121_ACCURATE \ 00163 "mov %2, %%"REG_a" \n\t"\ 00164 "pcmpeqw %%mm7, %%mm7 \n\t"\ 00165 "psrlw $15, %%mm7 \n\t"\ 00166 "psllw $6, %%mm7 \n\t"\ 00167 ASMALIGN(4) /* FIXME Unroll? */\ 00168 "1: \n\t"\ 00169 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\ 00170 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\ 00171 "paddsw %%mm7, %%mm0 \n\t"\ 00172 "paddsw %%mm7, %%mm1 \n\t"\ 00173 "psraw $7, %%mm0 \n\t"\ 00174 "psraw $7, %%mm1 \n\t"\ 00175 "packuswb %%mm1, %%mm0 \n\t"\ 00176 MOVNTQ(%%mm0, (%1, %%REGa))\ 00177 "add $8, %%"REG_a" \n\t"\ 00178 "jnc 1b \n\t" 00179 00180 /* 00181 :: "m" (-lumFilterSize), "m" (-chrFilterSize), 00182 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4), 00183 "r" (dest), "m" (dstW), 00184 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize) 00185 : "%eax", "%ebx", "%ecx", "%edx", "%esi" 00186 */ 00187 #define YSCALEYUV2PACKEDX_UV \ 00188 __asm__ volatile(\ 00189 "xor %%"REG_a", %%"REG_a" \n\t"\ 00190 ASMALIGN(4)\ 00191 "nop \n\t"\ 00192 "1: \n\t"\ 00193 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\ 00194 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 00195 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\ 00196 "movq %%mm3, %%mm4 \n\t"\ 00197 ASMALIGN(4)\ 00198 "2: \n\t"\ 00199 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\ 00200 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\ 00201 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\ 00202 "add $16, %%"REG_d" \n\t"\ 00203 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 00204 "pmulhw %%mm0, %%mm2 \n\t"\ 00205 "pmulhw %%mm0, %%mm5 \n\t"\ 00206 "paddw %%mm2, %%mm3 \n\t"\ 00207 "paddw %%mm5, %%mm4 \n\t"\ 00208 "test %%"REG_S", %%"REG_S" \n\t"\ 00209 " jnz 2b \n\t"\ 00210 00211 #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \ 00212 "lea "offset"(%0), %%"REG_d" \n\t"\ 00213 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 00214 "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\ 00215 "movq "#dst1", "#dst2" \n\t"\ 00216 ASMALIGN(4)\ 00217 "2: \n\t"\ 00218 "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\ 00219 "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\ 00220 "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\ 00221 "add $16, %%"REG_d" \n\t"\ 00222 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 00223 "pmulhw "#coeff", "#src1" \n\t"\ 00224 "pmulhw "#coeff", "#src2" \n\t"\ 00225 "paddw "#src1", "#dst1" \n\t"\ 00226 "paddw "#src2", "#dst2" \n\t"\ 00227 "test %%"REG_S", %%"REG_S" \n\t"\ 00228 " jnz 2b \n\t"\ 00229 00230 #define YSCALEYUV2PACKEDX \ 00231 YSCALEYUV2PACKEDX_UV \ 00232 YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \ 00233 00234 #define YSCALEYUV2PACKEDX_END \ 00235 :: "r" (&c->redDither), \ 00236 "m" (dummy), "m" (dummy), "m" (dummy),\ 00237 "r" (dest), "m" (dstW) \ 00238 : "%"REG_a, "%"REG_d, "%"REG_S \ 00239 ); 00240 00241 #define YSCALEYUV2PACKEDX_ACCURATE_UV \ 00242 __asm__ volatile(\ 00243 "xor %%"REG_a", %%"REG_a" \n\t"\ 00244 ASMALIGN(4)\ 00245 "nop \n\t"\ 00246 "1: \n\t"\ 00247 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\ 00248 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 00249 "pxor %%mm4, %%mm4 \n\t"\ 00250 "pxor %%mm5, %%mm5 \n\t"\ 00251 "pxor %%mm6, %%mm6 \n\t"\ 00252 "pxor %%mm7, %%mm7 \n\t"\ 00253 ASMALIGN(4)\ 00254 "2: \n\t"\ 00255 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\ 00256 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\ 00257 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\ 00258 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\ 00259 "movq %%mm0, %%mm3 \n\t"\ 00260 "punpcklwd %%mm1, %%mm0 \n\t"\ 00261 "punpckhwd %%mm1, %%mm3 \n\t"\ 00262 "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\ 00263 "pmaddwd %%mm1, %%mm0 \n\t"\ 00264 "pmaddwd %%mm1, %%mm3 \n\t"\ 00265 "paddd %%mm0, %%mm4 \n\t"\ 00266 "paddd %%mm3, %%mm5 \n\t"\ 00267 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\ 00268 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\ 00269 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\ 00270 "test %%"REG_S", %%"REG_S" \n\t"\ 00271 "movq %%mm2, %%mm0 \n\t"\ 00272 "punpcklwd %%mm3, %%mm2 \n\t"\ 00273 "punpckhwd %%mm3, %%mm0 \n\t"\ 00274 "pmaddwd %%mm1, %%mm2 \n\t"\ 00275 "pmaddwd %%mm1, %%mm0 \n\t"\ 00276 "paddd %%mm2, %%mm6 \n\t"\ 00277 "paddd %%mm0, %%mm7 \n\t"\ 00278 " jnz 2b \n\t"\ 00279 "psrad $16, %%mm4 \n\t"\ 00280 "psrad $16, %%mm5 \n\t"\ 00281 "psrad $16, %%mm6 \n\t"\ 00282 "psrad $16, %%mm7 \n\t"\ 00283 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\ 00284 "packssdw %%mm5, %%mm4 \n\t"\ 00285 "packssdw %%mm7, %%mm6 \n\t"\ 00286 "paddw %%mm0, %%mm4 \n\t"\ 00287 "paddw %%mm0, %%mm6 \n\t"\ 00288 "movq %%mm4, "U_TEMP"(%0) \n\t"\ 00289 "movq %%mm6, "V_TEMP"(%0) \n\t"\ 00290 00291 #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \ 00292 "lea "offset"(%0), %%"REG_d" \n\t"\ 00293 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 00294 "pxor %%mm1, %%mm1 \n\t"\ 00295 "pxor %%mm5, %%mm5 \n\t"\ 00296 "pxor %%mm7, %%mm7 \n\t"\ 00297 "pxor %%mm6, %%mm6 \n\t"\ 00298 ASMALIGN(4)\ 00299 "2: \n\t"\ 00300 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\ 00301 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\ 00302 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\ 00303 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\ 00304 "movq %%mm0, %%mm3 \n\t"\ 00305 "punpcklwd %%mm4, %%mm0 \n\t"\ 00306 "punpckhwd %%mm4, %%mm3 \n\t"\ 00307 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\ 00308 "pmaddwd %%mm4, %%mm0 \n\t"\ 00309 "pmaddwd %%mm4, %%mm3 \n\t"\ 00310 "paddd %%mm0, %%mm1 \n\t"\ 00311 "paddd %%mm3, %%mm5 \n\t"\ 00312 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\ 00313 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\ 00314 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\ 00315 "test %%"REG_S", %%"REG_S" \n\t"\ 00316 "movq %%mm2, %%mm0 \n\t"\ 00317 "punpcklwd %%mm3, %%mm2 \n\t"\ 00318 "punpckhwd %%mm3, %%mm0 \n\t"\ 00319 "pmaddwd %%mm4, %%mm2 \n\t"\ 00320 "pmaddwd %%mm4, %%mm0 \n\t"\ 00321 "paddd %%mm2, %%mm7 \n\t"\ 00322 "paddd %%mm0, %%mm6 \n\t"\ 00323 " jnz 2b \n\t"\ 00324 "psrad $16, %%mm1 \n\t"\ 00325 "psrad $16, %%mm5 \n\t"\ 00326 "psrad $16, %%mm7 \n\t"\ 00327 "psrad $16, %%mm6 \n\t"\ 00328 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\ 00329 "packssdw %%mm5, %%mm1 \n\t"\ 00330 "packssdw %%mm6, %%mm7 \n\t"\ 00331 "paddw %%mm0, %%mm1 \n\t"\ 00332 "paddw %%mm0, %%mm7 \n\t"\ 00333 "movq "U_TEMP"(%0), %%mm3 \n\t"\ 00334 "movq "V_TEMP"(%0), %%mm4 \n\t"\ 00335 00336 #define YSCALEYUV2PACKEDX_ACCURATE \ 00337 YSCALEYUV2PACKEDX_ACCURATE_UV \ 00338 YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET) 00339 00340 #define YSCALEYUV2RGBX \ 00341 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\ 00342 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\ 00343 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ 00344 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ 00345 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\ 00346 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\ 00347 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ 00348 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\ 00349 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\ 00350 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\ 00351 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\ 00352 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\ 00353 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\ 00354 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ 00355 "paddw %%mm3, %%mm4 \n\t"\ 00356 "movq %%mm2, %%mm0 \n\t"\ 00357 "movq %%mm5, %%mm6 \n\t"\ 00358 "movq %%mm4, %%mm3 \n\t"\ 00359 "punpcklwd %%mm2, %%mm2 \n\t"\ 00360 "punpcklwd %%mm5, %%mm5 \n\t"\ 00361 "punpcklwd %%mm4, %%mm4 \n\t"\ 00362 "paddw %%mm1, %%mm2 \n\t"\ 00363 "paddw %%mm1, %%mm5 \n\t"\ 00364 "paddw %%mm1, %%mm4 \n\t"\ 00365 "punpckhwd %%mm0, %%mm0 \n\t"\ 00366 "punpckhwd %%mm6, %%mm6 \n\t"\ 00367 "punpckhwd %%mm3, %%mm3 \n\t"\ 00368 "paddw %%mm7, %%mm0 \n\t"\ 00369 "paddw %%mm7, %%mm6 \n\t"\ 00370 "paddw %%mm7, %%mm3 \n\t"\ 00371 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ 00372 "packuswb %%mm0, %%mm2 \n\t"\ 00373 "packuswb %%mm6, %%mm5 \n\t"\ 00374 "packuswb %%mm3, %%mm4 \n\t"\ 00375 00376 #define REAL_YSCALEYUV2PACKED(index, c) \ 00377 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ 00378 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\ 00379 "psraw $3, %%mm0 \n\t"\ 00380 "psraw $3, %%mm1 \n\t"\ 00381 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\ 00382 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\ 00383 "xor "#index", "#index" \n\t"\ 00384 ASMALIGN(4)\ 00385 "1: \n\t"\ 00386 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ 00387 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ 00388 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ 00389 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ 00390 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ 00391 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ 00392 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ 00393 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ 00394 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ 00395 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ 00396 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ 00397 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ 00398 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ 00399 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\ 00400 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\ 00401 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\ 00402 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\ 00403 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ 00404 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\ 00405 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ 00406 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ 00407 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 00408 "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 00409 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ 00410 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ 00411 00412 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c) 00413 00414 #define REAL_YSCALEYUV2RGB_UV(index, c) \ 00415 "xor "#index", "#index" \n\t"\ 00416 ASMALIGN(4)\ 00417 "1: \n\t"\ 00418 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ 00419 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ 00420 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ 00421 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ 00422 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ 00423 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ 00424 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ 00425 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ 00426 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ 00427 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ 00428 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ 00429 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ 00430 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ 00431 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ 00432 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ 00433 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ 00434 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ 00435 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ 00436 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ 00437 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ 00438 00439 #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \ 00440 "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\ 00441 "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\ 00442 "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\ 00443 "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\ 00444 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ 00445 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\ 00446 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ 00447 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ 00448 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 00449 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 00450 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ 00451 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ 00452 00453 #define REAL_YSCALEYUV2RGB_COEFF(c) \ 00454 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ 00455 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ 00456 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ 00457 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ 00458 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ 00459 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ 00460 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ 00461 "paddw %%mm3, %%mm4 \n\t"\ 00462 "movq %%mm2, %%mm0 \n\t"\ 00463 "movq %%mm5, %%mm6 \n\t"\ 00464 "movq %%mm4, %%mm3 \n\t"\ 00465 "punpcklwd %%mm2, %%mm2 \n\t"\ 00466 "punpcklwd %%mm5, %%mm5 \n\t"\ 00467 "punpcklwd %%mm4, %%mm4 \n\t"\ 00468 "paddw %%mm1, %%mm2 \n\t"\ 00469 "paddw %%mm1, %%mm5 \n\t"\ 00470 "paddw %%mm1, %%mm4 \n\t"\ 00471 "punpckhwd %%mm0, %%mm0 \n\t"\ 00472 "punpckhwd %%mm6, %%mm6 \n\t"\ 00473 "punpckhwd %%mm3, %%mm3 \n\t"\ 00474 "paddw %%mm7, %%mm0 \n\t"\ 00475 "paddw %%mm7, %%mm6 \n\t"\ 00476 "paddw %%mm7, %%mm3 \n\t"\ 00477 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ 00478 "packuswb %%mm0, %%mm2 \n\t"\ 00479 "packuswb %%mm6, %%mm5 \n\t"\ 00480 "packuswb %%mm3, %%mm4 \n\t"\ 00481 00482 #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) 00483 00484 #define YSCALEYUV2RGB(index, c) \ 00485 REAL_YSCALEYUV2RGB_UV(index, c) \ 00486 REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \ 00487 REAL_YSCALEYUV2RGB_COEFF(c) 00488 00489 #define REAL_YSCALEYUV2PACKED1(index, c) \ 00490 "xor "#index", "#index" \n\t"\ 00491 ASMALIGN(4)\ 00492 "1: \n\t"\ 00493 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\ 00494 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ 00495 "psraw $7, %%mm3 \n\t" \ 00496 "psraw $7, %%mm4 \n\t" \ 00497 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ 00498 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ 00499 "psraw $7, %%mm1 \n\t" \ 00500 "psraw $7, %%mm7 \n\t" \ 00501 00502 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c) 00503 00504 #define REAL_YSCALEYUV2RGB1(index, c) \ 00505 "xor "#index", "#index" \n\t"\ 00506 ASMALIGN(4)\ 00507 "1: \n\t"\ 00508 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\ 00509 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ 00510 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ 00511 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ 00512 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ 00513 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ 00514 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ 00515 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ 00516 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ 00517 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ 00518 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ 00519 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ 00520 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ 00521 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 00522 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 00523 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ 00524 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ 00525 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ 00526 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ 00527 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ 00528 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ 00529 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ 00530 "paddw %%mm3, %%mm4 \n\t"\ 00531 "movq %%mm2, %%mm0 \n\t"\ 00532 "movq %%mm5, %%mm6 \n\t"\ 00533 "movq %%mm4, %%mm3 \n\t"\ 00534 "punpcklwd %%mm2, %%mm2 \n\t"\ 00535 "punpcklwd %%mm5, %%mm5 \n\t"\ 00536 "punpcklwd %%mm4, %%mm4 \n\t"\ 00537 "paddw %%mm1, %%mm2 \n\t"\ 00538 "paddw %%mm1, %%mm5 \n\t"\ 00539 "paddw %%mm1, %%mm4 \n\t"\ 00540 "punpckhwd %%mm0, %%mm0 \n\t"\ 00541 "punpckhwd %%mm6, %%mm6 \n\t"\ 00542 "punpckhwd %%mm3, %%mm3 \n\t"\ 00543 "paddw %%mm7, %%mm0 \n\t"\ 00544 "paddw %%mm7, %%mm6 \n\t"\ 00545 "paddw %%mm7, %%mm3 \n\t"\ 00546 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ 00547 "packuswb %%mm0, %%mm2 \n\t"\ 00548 "packuswb %%mm6, %%mm5 \n\t"\ 00549 "packuswb %%mm3, %%mm4 \n\t"\ 00550 00551 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c) 00552 00553 #define REAL_YSCALEYUV2PACKED1b(index, c) \ 00554 "xor "#index", "#index" \n\t"\ 00555 ASMALIGN(4)\ 00556 "1: \n\t"\ 00557 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ 00558 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ 00559 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ 00560 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ 00561 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\ 00562 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ 00563 "psrlw $8, %%mm3 \n\t" \ 00564 "psrlw $8, %%mm4 \n\t" \ 00565 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ 00566 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ 00567 "psraw $7, %%mm1 \n\t" \ 00568 "psraw $7, %%mm7 \n\t" 00569 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c) 00570 00571 // do vertical chrominance interpolation 00572 #define REAL_YSCALEYUV2RGB1b(index, c) \ 00573 "xor "#index", "#index" \n\t"\ 00574 ASMALIGN(4)\ 00575 "1: \n\t"\ 00576 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ 00577 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ 00578 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ 00579 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ 00580 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\ 00581 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ 00582 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\ 00583 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\ 00584 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ 00585 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ 00586 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ 00587 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ 00588 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ 00589 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ 00590 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ 00591 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ 00592 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ 00593 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 00594 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 00595 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ 00596 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ 00597 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ 00598 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ 00599 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ 00600 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ 00601 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ 00602 "paddw %%mm3, %%mm4 \n\t"\ 00603 "movq %%mm2, %%mm0 \n\t"\ 00604 "movq %%mm5, %%mm6 \n\t"\ 00605 "movq %%mm4, %%mm3 \n\t"\ 00606 "punpcklwd %%mm2, %%mm2 \n\t"\ 00607 "punpcklwd %%mm5, %%mm5 \n\t"\ 00608 "punpcklwd %%mm4, %%mm4 \n\t"\ 00609 "paddw %%mm1, %%mm2 \n\t"\ 00610 "paddw %%mm1, %%mm5 \n\t"\ 00611 "paddw %%mm1, %%mm4 \n\t"\ 00612 "punpckhwd %%mm0, %%mm0 \n\t"\ 00613 "punpckhwd %%mm6, %%mm6 \n\t"\ 00614 "punpckhwd %%mm3, %%mm3 \n\t"\ 00615 "paddw %%mm7, %%mm0 \n\t"\ 00616 "paddw %%mm7, %%mm6 \n\t"\ 00617 "paddw %%mm7, %%mm3 \n\t"\ 00618 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ 00619 "packuswb %%mm0, %%mm2 \n\t"\ 00620 "packuswb %%mm6, %%mm5 \n\t"\ 00621 "packuswb %%mm3, %%mm4 \n\t"\ 00622 00623 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c) 00624 00625 #define REAL_YSCALEYUV2RGB1_ALPHA(index) \ 00626 "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\ 00627 "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\ 00628 "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\ 00629 "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\ 00630 "packuswb %%mm1, %%mm7 \n\t" 00631 #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index) 00632 00633 #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \ 00634 "movq "#b", "#q2" \n\t" /* B */\ 00635 "movq "#r", "#t" \n\t" /* R */\ 00636 "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\ 00637 "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\ 00638 "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\ 00639 "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\ 00640 "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\ 00641 "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\ 00642 "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\ 00643 "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\ 00644 "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\ 00645 "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\ 00646 \ 00647 MOVNTQ( q0, (dst, index, 4))\ 00648 MOVNTQ( b, 8(dst, index, 4))\ 00649 MOVNTQ( q2, 16(dst, index, 4))\ 00650 MOVNTQ( q3, 24(dst, index, 4))\ 00651 \ 00652 "add $8, "#index" \n\t"\ 00653 "cmp "#dstw", "#index" \n\t"\ 00654 " jb 1b \n\t" 00655 #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) 00656 00657 #define REAL_WRITERGB16(dst, dstw, index) \ 00658 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\ 00659 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\ 00660 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\ 00661 "psrlq $3, %%mm2 \n\t"\ 00662 \ 00663 "movq %%mm2, %%mm1 \n\t"\ 00664 "movq %%mm4, %%mm3 \n\t"\ 00665 \ 00666 "punpcklbw %%mm7, %%mm3 \n\t"\ 00667 "punpcklbw %%mm5, %%mm2 \n\t"\ 00668 "punpckhbw %%mm7, %%mm4 \n\t"\ 00669 "punpckhbw %%mm5, %%mm1 \n\t"\ 00670 \ 00671 "psllq $3, %%mm3 \n\t"\ 00672 "psllq $3, %%mm4 \n\t"\ 00673 \ 00674 "por %%mm3, %%mm2 \n\t"\ 00675 "por %%mm4, %%mm1 \n\t"\ 00676 \ 00677 MOVNTQ(%%mm2, (dst, index, 2))\ 00678 MOVNTQ(%%mm1, 8(dst, index, 2))\ 00679 \ 00680 "add $8, "#index" \n\t"\ 00681 "cmp "#dstw", "#index" \n\t"\ 00682 " jb 1b \n\t" 00683 #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index) 00684 00685 #define REAL_WRITERGB15(dst, dstw, index) \ 00686 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\ 00687 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\ 00688 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\ 00689 "psrlq $3, %%mm2 \n\t"\ 00690 "psrlq $1, %%mm5 \n\t"\ 00691 \ 00692 "movq %%mm2, %%mm1 \n\t"\ 00693 "movq %%mm4, %%mm3 \n\t"\ 00694 \ 00695 "punpcklbw %%mm7, %%mm3 \n\t"\ 00696 "punpcklbw %%mm5, %%mm2 \n\t"\ 00697 "punpckhbw %%mm7, %%mm4 \n\t"\ 00698 "punpckhbw %%mm5, %%mm1 \n\t"\ 00699 \ 00700 "psllq $2, %%mm3 \n\t"\ 00701 "psllq $2, %%mm4 \n\t"\ 00702 \ 00703 "por %%mm3, %%mm2 \n\t"\ 00704 "por %%mm4, %%mm1 \n\t"\ 00705 \ 00706 MOVNTQ(%%mm2, (dst, index, 2))\ 00707 MOVNTQ(%%mm1, 8(dst, index, 2))\ 00708 \ 00709 "add $8, "#index" \n\t"\ 00710 "cmp "#dstw", "#index" \n\t"\ 00711 " jb 1b \n\t" 00712 #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index) 00713 00714 #define WRITEBGR24OLD(dst, dstw, index) \ 00715 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ 00716 "movq %%mm2, %%mm1 \n\t" /* B */\ 00717 "movq %%mm5, %%mm6 \n\t" /* R */\ 00718 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\ 00719 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\ 00720 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\ 00721 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\ 00722 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\ 00723 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\ 00724 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\ 00725 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\ 00726 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\ 00727 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\ 00728 \ 00729 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\ 00730 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\ 00731 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\ 00732 "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\ 00733 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\ 00734 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\ 00735 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\ 00736 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\ 00737 \ 00738 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\ 00739 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\ 00740 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\ 00741 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\ 00742 "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\ 00743 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\ 00744 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\ 00745 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\ 00746 "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\ 00747 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\ 00748 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\ 00749 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\ 00750 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\ 00751 \ 00752 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\ 00753 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\ 00754 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\ 00755 "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\ 00756 "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\ 00757 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\ 00758 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\ 00759 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\ 00760 \ 00761 MOVNTQ(%%mm0, (dst))\ 00762 MOVNTQ(%%mm2, 8(dst))\ 00763 MOVNTQ(%%mm3, 16(dst))\ 00764 "add $24, "#dst" \n\t"\ 00765 \ 00766 "add $8, "#index" \n\t"\ 00767 "cmp "#dstw", "#index" \n\t"\ 00768 " jb 1b \n\t" 00769 00770 #define WRITEBGR24MMX(dst, dstw, index) \ 00771 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ 00772 "movq %%mm2, %%mm1 \n\t" /* B */\ 00773 "movq %%mm5, %%mm6 \n\t" /* R */\ 00774 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\ 00775 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\ 00776 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\ 00777 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\ 00778 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\ 00779 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\ 00780 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\ 00781 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\ 00782 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\ 00783 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\ 00784 \ 00785 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\ 00786 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\ 00787 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\ 00788 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\ 00789 \ 00790 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\ 00791 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\ 00792 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\ 00793 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\ 00794 \ 00795 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\ 00796 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\ 00797 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\ 00798 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\ 00799 \ 00800 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\ 00801 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\ 00802 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\ 00803 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\ 00804 MOVNTQ(%%mm0, (dst))\ 00805 \ 00806 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\ 00807 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\ 00808 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\ 00809 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\ 00810 MOVNTQ(%%mm6, 8(dst))\ 00811 \ 00812 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\ 00813 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\ 00814 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\ 00815 MOVNTQ(%%mm5, 16(dst))\ 00816 \ 00817 "add $24, "#dst" \n\t"\ 00818 \ 00819 "add $8, "#index" \n\t"\ 00820 "cmp "#dstw", "#index" \n\t"\ 00821 " jb 1b \n\t" 00822 00823 #define WRITEBGR24MMX2(dst, dstw, index) \ 00824 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ 00825 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\ 00826 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\ 00827 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\ 00828 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\ 00829 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\ 00830 \ 00831 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\ 00832 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\ 00833 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\ 00834 \ 00835 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\ 00836 "por %%mm1, %%mm6 \n\t"\ 00837 "por %%mm3, %%mm6 \n\t"\ 00838 MOVNTQ(%%mm6, (dst))\ 00839 \ 00840 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\ 00841 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\ 00842 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\ 00843 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\ 00844 \ 00845 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\ 00846 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\ 00847 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\ 00848 \ 00849 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\ 00850 "por %%mm3, %%mm6 \n\t"\ 00851 MOVNTQ(%%mm6, 8(dst))\ 00852 \ 00853 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\ 00854 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\ 00855 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\ 00856 \ 00857 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\ 00858 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\ 00859 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\ 00860 \ 00861 "por %%mm1, %%mm3 \n\t"\ 00862 "por %%mm3, %%mm6 \n\t"\ 00863 MOVNTQ(%%mm6, 16(dst))\ 00864 \ 00865 "add $24, "#dst" \n\t"\ 00866 \ 00867 "add $8, "#index" \n\t"\ 00868 "cmp "#dstw", "#index" \n\t"\ 00869 " jb 1b \n\t" 00870 00871 #if COMPILE_TEMPLATE_MMX2 00872 #undef WRITEBGR24 00873 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index) 00874 #else 00875 #undef WRITEBGR24 00876 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index) 00877 #endif 00878 00879 #define REAL_WRITEYUY2(dst, dstw, index) \ 00880 "packuswb %%mm3, %%mm3 \n\t"\ 00881 "packuswb %%mm4, %%mm4 \n\t"\ 00882 "packuswb %%mm7, %%mm1 \n\t"\ 00883 "punpcklbw %%mm4, %%mm3 \n\t"\ 00884 "movq %%mm1, %%mm7 \n\t"\ 00885 "punpcklbw %%mm3, %%mm1 \n\t"\ 00886 "punpckhbw %%mm3, %%mm7 \n\t"\ 00887 \ 00888 MOVNTQ(%%mm1, (dst, index, 2))\ 00889 MOVNTQ(%%mm7, 8(dst, index, 2))\ 00890 \ 00891 "add $8, "#index" \n\t"\ 00892 "cmp "#dstw", "#index" \n\t"\ 00893 " jb 1b \n\t" 00894 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index) 00895 00896 00897 static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, 00898 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc, 00899 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW) 00900 { 00901 #if COMPILE_TEMPLATE_MMX 00902 if(!(c->flags & SWS_BITEXACT)) { 00903 if (c->flags & SWS_ACCURATE_RND) { 00904 if (uDest) { 00905 YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW) 00906 YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW) 00907 } 00908 if (CONFIG_SWSCALE_ALPHA && aDest) { 00909 YSCALEYUV2YV12X_ACCURATE( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW) 00910 } 00911 00912 YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW) 00913 } else { 00914 if (uDest) { 00915 YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW) 00916 YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW) 00917 } 00918 if (CONFIG_SWSCALE_ALPHA && aDest) { 00919 YSCALEYUV2YV12X( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW) 00920 } 00921 00922 YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW) 00923 } 00924 return; 00925 } 00926 #endif 00927 #if COMPILE_TEMPLATE_ALTIVEC 00928 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize, 00929 chrFilter, chrSrc, chrFilterSize, 00930 dest, uDest, vDest, dstW, chrDstW); 00931 #else //COMPILE_TEMPLATE_ALTIVEC 00932 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize, 00933 chrFilter, chrSrc, chrFilterSize, 00934 alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW); 00935 #endif //!COMPILE_TEMPLATE_ALTIVEC 00936 } 00937 00938 static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, 00939 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, 00940 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, enum PixelFormat dstFormat) 00941 { 00942 yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize, 00943 chrFilter, chrSrc, chrFilterSize, 00944 dest, uDest, dstW, chrDstW, dstFormat); 00945 } 00946 00947 static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc, 00948 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW) 00949 { 00950 int i; 00951 #if COMPILE_TEMPLATE_MMX 00952 if(!(c->flags & SWS_BITEXACT)) { 00953 long p= 4; 00954 const uint8_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW}; 00955 uint8_t *dst[4]= {aDest, dest, uDest, vDest}; 00956 x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW}; 00957 00958 if (c->flags & SWS_ACCURATE_RND) { 00959 while(p--) { 00960 if (dst[p]) { 00961 __asm__ volatile( 00962 YSCALEYUV2YV121_ACCURATE 00963 :: "r" (src[p]), "r" (dst[p] + counter[p]), 00964 "g" (-counter[p]) 00965 : "%"REG_a 00966 ); 00967 } 00968 } 00969 } else { 00970 while(p--) { 00971 if (dst[p]) { 00972 __asm__ volatile( 00973 YSCALEYUV2YV121 00974 :: "r" (src[p]), "r" (dst[p] + counter[p]), 00975 "g" (-counter[p]) 00976 : "%"REG_a 00977 ); 00978 } 00979 } 00980 } 00981 return; 00982 } 00983 #endif 00984 for (i=0; i<dstW; i++) { 00985 int val= (lumSrc[i]+64)>>7; 00986 00987 if (val&256) { 00988 if (val<0) val=0; 00989 else val=255; 00990 } 00991 00992 dest[i]= val; 00993 } 00994 00995 if (uDest) 00996 for (i=0; i<chrDstW; i++) { 00997 int u=(chrSrc[i ]+64)>>7; 00998 int v=(chrSrc[i + VOFW]+64)>>7; 00999 01000 if ((u|v)&256) { 01001 if (u<0) u=0; 01002 else if (u>255) u=255; 01003 if (v<0) v=0; 01004 else if (v>255) v=255; 01005 } 01006 01007 uDest[i]= u; 01008 vDest[i]= v; 01009 } 01010 01011 if (CONFIG_SWSCALE_ALPHA && aDest) 01012 for (i=0; i<dstW; i++) { 01013 int val= (alpSrc[i]+64)>>7; 01014 aDest[i]= av_clip_uint8(val); 01015 } 01016 } 01017 01018 01022 static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, 01023 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, 01024 const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY) 01025 { 01026 #if COMPILE_TEMPLATE_MMX 01027 x86_reg dummy=0; 01028 if(!(c->flags & SWS_BITEXACT)) { 01029 if (c->flags & SWS_ACCURATE_RND) { 01030 switch(c->dstFormat) { 01031 case PIX_FMT_RGB32: 01032 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { 01033 YSCALEYUV2PACKEDX_ACCURATE 01034 YSCALEYUV2RGBX 01035 "movq %%mm2, "U_TEMP"(%0) \n\t" 01036 "movq %%mm4, "V_TEMP"(%0) \n\t" 01037 "movq %%mm5, "Y_TEMP"(%0) \n\t" 01038 YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET) 01039 "movq "Y_TEMP"(%0), %%mm5 \n\t" 01040 "psraw $3, %%mm1 \n\t" 01041 "psraw $3, %%mm7 \n\t" 01042 "packuswb %%mm7, %%mm1 \n\t" 01043 WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6) 01044 01045 YSCALEYUV2PACKEDX_END 01046 } else { 01047 YSCALEYUV2PACKEDX_ACCURATE 01048 YSCALEYUV2RGBX 01049 "pcmpeqd %%mm7, %%mm7 \n\t" 01050 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 01051 01052 YSCALEYUV2PACKEDX_END 01053 } 01054 return; 01055 case PIX_FMT_BGR24: 01056 YSCALEYUV2PACKEDX_ACCURATE 01057 YSCALEYUV2RGBX 01058 "pxor %%mm7, %%mm7 \n\t" 01059 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize 01060 "add %4, %%"REG_c" \n\t" 01061 WRITEBGR24(%%REGc, %5, %%REGa) 01062 01063 01064 :: "r" (&c->redDither), 01065 "m" (dummy), "m" (dummy), "m" (dummy), 01066 "r" (dest), "m" (dstW) 01067 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S 01068 ); 01069 return; 01070 case PIX_FMT_RGB555: 01071 YSCALEYUV2PACKEDX_ACCURATE 01072 YSCALEYUV2RGBX 01073 "pxor %%mm7, %%mm7 \n\t" 01074 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 01075 #ifdef DITHER1XBPP 01076 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t" 01077 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t" 01078 "paddusb "RED_DITHER"(%0), %%mm5\n\t" 01079 #endif 01080 01081 WRITERGB15(%4, %5, %%REGa) 01082 YSCALEYUV2PACKEDX_END 01083 return; 01084 case PIX_FMT_RGB565: 01085 YSCALEYUV2PACKEDX_ACCURATE 01086 YSCALEYUV2RGBX 01087 "pxor %%mm7, %%mm7 \n\t" 01088 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 01089 #ifdef DITHER1XBPP 01090 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t" 01091 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t" 01092 "paddusb "RED_DITHER"(%0), %%mm5\n\t" 01093 #endif 01094 01095 WRITERGB16(%4, %5, %%REGa) 01096 YSCALEYUV2PACKEDX_END 01097 return; 01098 case PIX_FMT_YUYV422: 01099 YSCALEYUV2PACKEDX_ACCURATE 01100 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 01101 01102 "psraw $3, %%mm3 \n\t" 01103 "psraw $3, %%mm4 \n\t" 01104 "psraw $3, %%mm1 \n\t" 01105 "psraw $3, %%mm7 \n\t" 01106 WRITEYUY2(%4, %5, %%REGa) 01107 YSCALEYUV2PACKEDX_END 01108 return; 01109 } 01110 } else { 01111 switch(c->dstFormat) { 01112 case PIX_FMT_RGB32: 01113 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { 01114 YSCALEYUV2PACKEDX 01115 YSCALEYUV2RGBX 01116 YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7) 01117 "psraw $3, %%mm1 \n\t" 01118 "psraw $3, %%mm7 \n\t" 01119 "packuswb %%mm7, %%mm1 \n\t" 01120 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) 01121 YSCALEYUV2PACKEDX_END 01122 } else { 01123 YSCALEYUV2PACKEDX 01124 YSCALEYUV2RGBX 01125 "pcmpeqd %%mm7, %%mm7 \n\t" 01126 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 01127 YSCALEYUV2PACKEDX_END 01128 } 01129 return; 01130 case PIX_FMT_BGR24: 01131 YSCALEYUV2PACKEDX 01132 YSCALEYUV2RGBX 01133 "pxor %%mm7, %%mm7 \n\t" 01134 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize 01135 "add %4, %%"REG_c" \n\t" 01136 WRITEBGR24(%%REGc, %5, %%REGa) 01137 01138 :: "r" (&c->redDither), 01139 "m" (dummy), "m" (dummy), "m" (dummy), 01140 "r" (dest), "m" (dstW) 01141 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S 01142 ); 01143 return; 01144 case PIX_FMT_RGB555: 01145 YSCALEYUV2PACKEDX 01146 YSCALEYUV2RGBX 01147 "pxor %%mm7, %%mm7 \n\t" 01148 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 01149 #ifdef DITHER1XBPP 01150 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t" 01151 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t" 01152 "paddusb "RED_DITHER"(%0), %%mm5 \n\t" 01153 #endif 01154 01155 WRITERGB15(%4, %5, %%REGa) 01156 YSCALEYUV2PACKEDX_END 01157 return; 01158 case PIX_FMT_RGB565: 01159 YSCALEYUV2PACKEDX 01160 YSCALEYUV2RGBX 01161 "pxor %%mm7, %%mm7 \n\t" 01162 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 01163 #ifdef DITHER1XBPP 01164 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t" 01165 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t" 01166 "paddusb "RED_DITHER"(%0), %%mm5 \n\t" 01167 #endif 01168 01169 WRITERGB16(%4, %5, %%REGa) 01170 YSCALEYUV2PACKEDX_END 01171 return; 01172 case PIX_FMT_YUYV422: 01173 YSCALEYUV2PACKEDX 01174 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 01175 01176 "psraw $3, %%mm3 \n\t" 01177 "psraw $3, %%mm4 \n\t" 01178 "psraw $3, %%mm1 \n\t" 01179 "psraw $3, %%mm7 \n\t" 01180 WRITEYUY2(%4, %5, %%REGa) 01181 YSCALEYUV2PACKEDX_END 01182 return; 01183 } 01184 } 01185 } 01186 #endif /* COMPILE_TEMPLATE_MMX */ 01187 #if COMPILE_TEMPLATE_ALTIVEC 01188 /* The following list of supported dstFormat values should 01189 match what's found in the body of ff_yuv2packedX_altivec() */ 01190 if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf && 01191 (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA || 01192 c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 || 01193 c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB)) 01194 ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize, 01195 chrFilter, chrSrc, chrFilterSize, 01196 dest, dstW, dstY); 01197 else 01198 #endif 01199 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize, 01200 chrFilter, chrSrc, chrFilterSize, 01201 alpSrc, dest, dstW, dstY); 01202 } 01203 01207 static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1, 01208 const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y) 01209 { 01210 int yalpha1=4095- yalpha; 01211 int uvalpha1=4095-uvalpha; 01212 int i; 01213 01214 #if COMPILE_TEMPLATE_MMX 01215 if(!(c->flags & SWS_BITEXACT)) { 01216 switch(c->dstFormat) { 01217 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( 01218 case PIX_FMT_RGB32: 01219 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { 01220 #if ARCH_X86_64 01221 __asm__ volatile( 01222 YSCALEYUV2RGB(%%r8, %5) 01223 YSCALEYUV2RGB_YA(%%r8, %5, %6, %7) 01224 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ 01225 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ 01226 "packuswb %%mm7, %%mm1 \n\t" 01227 WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) 01228 01229 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest), 01230 "a" (&c->redDither) 01231 ,"r" (abuf0), "r" (abuf1) 01232 : "%r8" 01233 ); 01234 #else 01235 *(const uint16_t **)(&c->u_temp)=abuf0; 01236 *(const uint16_t **)(&c->v_temp)=abuf1; 01237 __asm__ volatile( 01238 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 01239 "mov %4, %%"REG_b" \n\t" 01240 "push %%"REG_BP" \n\t" 01241 YSCALEYUV2RGB(%%REGBP, %5) 01242 "push %0 \n\t" 01243 "push %1 \n\t" 01244 "mov "U_TEMP"(%5), %0 \n\t" 01245 "mov "V_TEMP"(%5), %1 \n\t" 01246 YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1) 01247 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ 01248 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ 01249 "packuswb %%mm7, %%mm1 \n\t" 01250 "pop %1 \n\t" 01251 "pop %0 \n\t" 01252 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) 01253 "pop %%"REG_BP" \n\t" 01254 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 01255 01256 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 01257 "a" (&c->redDither) 01258 ); 01259 #endif 01260 } else { 01261 __asm__ volatile( 01262 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 01263 "mov %4, %%"REG_b" \n\t" 01264 "push %%"REG_BP" \n\t" 01265 YSCALEYUV2RGB(%%REGBP, %5) 01266 "pcmpeqd %%mm7, %%mm7 \n\t" 01267 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 01268 "pop %%"REG_BP" \n\t" 01269 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 01270 01271 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 01272 "a" (&c->redDither) 01273 ); 01274 } 01275 return; 01276 case PIX_FMT_BGR24: 01277 __asm__ volatile( 01278 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 01279 "mov %4, %%"REG_b" \n\t" 01280 "push %%"REG_BP" \n\t" 01281 YSCALEYUV2RGB(%%REGBP, %5) 01282 "pxor %%mm7, %%mm7 \n\t" 01283 WRITEBGR24(%%REGb, 8280(%5), %%REGBP) 01284 "pop %%"REG_BP" \n\t" 01285 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 01286 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 01287 "a" (&c->redDither) 01288 ); 01289 return; 01290 case PIX_FMT_RGB555: 01291 __asm__ volatile( 01292 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 01293 "mov %4, %%"REG_b" \n\t" 01294 "push %%"REG_BP" \n\t" 01295 YSCALEYUV2RGB(%%REGBP, %5) 01296 "pxor %%mm7, %%mm7 \n\t" 01297 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 01298 #ifdef DITHER1XBPP 01299 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" 01300 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" 01301 "paddusb "RED_DITHER"(%5), %%mm5 \n\t" 01302 #endif 01303 01304 WRITERGB15(%%REGb, 8280(%5), %%REGBP) 01305 "pop %%"REG_BP" \n\t" 01306 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 01307 01308 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 01309 "a" (&c->redDither) 01310 ); 01311 return; 01312 case PIX_FMT_RGB565: 01313 __asm__ volatile( 01314 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 01315 "mov %4, %%"REG_b" \n\t" 01316 "push %%"REG_BP" \n\t" 01317 YSCALEYUV2RGB(%%REGBP, %5) 01318 "pxor %%mm7, %%mm7 \n\t" 01319 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 01320 #ifdef DITHER1XBPP 01321 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" 01322 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" 01323 "paddusb "RED_DITHER"(%5), %%mm5 \n\t" 01324 #endif 01325 01326 WRITERGB16(%%REGb, 8280(%5), %%REGBP) 01327 "pop %%"REG_BP" \n\t" 01328 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 01329 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 01330 "a" (&c->redDither) 01331 ); 01332 return; 01333 case PIX_FMT_YUYV422: 01334 __asm__ volatile( 01335 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 01336 "mov %4, %%"REG_b" \n\t" 01337 "push %%"REG_BP" \n\t" 01338 YSCALEYUV2PACKED(%%REGBP, %5) 01339 WRITEYUY2(%%REGb, 8280(%5), %%REGBP) 01340 "pop %%"REG_BP" \n\t" 01341 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 01342 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 01343 "a" (&c->redDither) 01344 ); 01345 return; 01346 default: break; 01347 } 01348 } 01349 #endif //COMPILE_TEMPLATE_MMX 01350 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C) 01351 } 01352 01356 static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1, 01357 const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y) 01358 { 01359 const int yalpha1=0; 01360 int i; 01361 01362 const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 01363 const int yalpha= 4096; //FIXME ... 01364 01365 if (flags&SWS_FULL_CHR_H_INT) { 01366 c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y); 01367 return; 01368 } 01369 01370 #if COMPILE_TEMPLATE_MMX 01371 if(!(flags & SWS_BITEXACT)) { 01372 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster 01373 switch(dstFormat) { 01374 case PIX_FMT_RGB32: 01375 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { 01376 __asm__ volatile( 01377 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 01378 "mov %4, %%"REG_b" \n\t" 01379 "push %%"REG_BP" \n\t" 01380 YSCALEYUV2RGB1(%%REGBP, %5) 01381 YSCALEYUV2RGB1_ALPHA(%%REGBP) 01382 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 01383 "pop %%"REG_BP" \n\t" 01384 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 01385 01386 :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 01387 "a" (&c->redDither) 01388 ); 01389 } else { 01390 __asm__ volatile( 01391 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 01392 "mov %4, %%"REG_b" \n\t" 01393 "push %%"REG_BP" \n\t" 01394 YSCALEYUV2RGB1(%%REGBP, %5) 01395 "pcmpeqd %%mm7, %%mm7 \n\t" 01396 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 01397 "pop %%"REG_BP" \n\t" 01398 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 01399 01400 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 01401 "a" (&c->redDither) 01402 ); 01403 } 01404 return; 01405 case PIX_FMT_BGR24: 01406 __asm__ volatile( 01407 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 01408 "mov %4, %%"REG_b" \n\t" 01409 "push %%"REG_BP" \n\t" 01410 YSCALEYUV2RGB1(%%REGBP, %5) 01411 "pxor %%mm7, %%mm7 \n\t" 01412 WRITEBGR24(%%REGb, 8280(%5), %%REGBP) 01413 "pop %%"REG_BP" \n\t" 01414 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 01415 01416 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 01417 "a" (&c->redDither) 01418 ); 01419 return; 01420 case PIX_FMT_RGB555: 01421 __asm__ volatile( 01422 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 01423 "mov %4, %%"REG_b" \n\t" 01424 "push %%"REG_BP" \n\t" 01425 YSCALEYUV2RGB1(%%REGBP, %5) 01426 "pxor %%mm7, %%mm7 \n\t" 01427 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 01428 #ifdef DITHER1XBPP 01429 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" 01430 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" 01431 "paddusb "RED_DITHER"(%5), %%mm5 \n\t" 01432 #endif 01433 WRITERGB15(%%REGb, 8280(%5), %%REGBP) 01434 "pop %%"REG_BP" \n\t" 01435 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 01436 01437 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 01438 "a" (&c->redDither) 01439 ); 01440 return; 01441 case PIX_FMT_RGB565: 01442 __asm__ volatile( 01443 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 01444 "mov %4, %%"REG_b" \n\t" 01445 "push %%"REG_BP" \n\t" 01446 YSCALEYUV2RGB1(%%REGBP, %5) 01447 "pxor %%mm7, %%mm7 \n\t" 01448 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 01449 #ifdef DITHER1XBPP 01450 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" 01451 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" 01452 "paddusb "RED_DITHER"(%5), %%mm5 \n\t" 01453 #endif 01454 01455 WRITERGB16(%%REGb, 8280(%5), %%REGBP) 01456 "pop %%"REG_BP" \n\t" 01457 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 01458 01459 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 01460 "a" (&c->redDither) 01461 ); 01462 return; 01463 case PIX_FMT_YUYV422: 01464 __asm__ volatile( 01465 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 01466 "mov %4, %%"REG_b" \n\t" 01467 "push %%"REG_BP" \n\t" 01468 YSCALEYUV2PACKED1(%%REGBP, %5) 01469 WRITEYUY2(%%REGb, 8280(%5), %%REGBP) 01470 "pop %%"REG_BP" \n\t" 01471 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 01472 01473 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 01474 "a" (&c->redDither) 01475 ); 01476 return; 01477 } 01478 } else { 01479 switch(dstFormat) { 01480 case PIX_FMT_RGB32: 01481 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { 01482 __asm__ volatile( 01483 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 01484 "mov %4, %%"REG_b" \n\t" 01485 "push %%"REG_BP" \n\t" 01486 YSCALEYUV2RGB1b(%%REGBP, %5) 01487 YSCALEYUV2RGB1_ALPHA(%%REGBP) 01488 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 01489 "pop %%"REG_BP" \n\t" 01490 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 01491 01492 :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 01493 "a" (&c->redDither) 01494 ); 01495 } else { 01496 __asm__ volatile( 01497 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 01498 "mov %4, %%"REG_b" \n\t" 01499 "push %%"REG_BP" \n\t" 01500 YSCALEYUV2RGB1b(%%REGBP, %5) 01501 "pcmpeqd %%mm7, %%mm7 \n\t" 01502 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 01503 "pop %%"REG_BP" \n\t" 01504 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 01505 01506 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 01507 "a" (&c->redDither) 01508 ); 01509 } 01510 return; 01511 case PIX_FMT_BGR24: 01512 __asm__ volatile( 01513 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 01514 "mov %4, %%"REG_b" \n\t" 01515 "push %%"REG_BP" \n\t" 01516 YSCALEYUV2RGB1b(%%REGBP, %5) 01517 "pxor %%mm7, %%mm7 \n\t" 01518 WRITEBGR24(%%REGb, 8280(%5), %%REGBP) 01519 "pop %%"REG_BP" \n\t" 01520 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 01521 01522 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 01523 "a" (&c->redDither) 01524 ); 01525 return; 01526 case PIX_FMT_RGB555: 01527 __asm__ volatile( 01528 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 01529 "mov %4, %%"REG_b" \n\t" 01530 "push %%"REG_BP" \n\t" 01531 YSCALEYUV2RGB1b(%%REGBP, %5) 01532 "pxor %%mm7, %%mm7 \n\t" 01533 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 01534 #ifdef DITHER1XBPP 01535 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" 01536 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" 01537 "paddusb "RED_DITHER"(%5), %%mm5 \n\t" 01538 #endif 01539 WRITERGB15(%%REGb, 8280(%5), %%REGBP) 01540 "pop %%"REG_BP" \n\t" 01541 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 01542 01543 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 01544 "a" (&c->redDither) 01545 ); 01546 return; 01547 case PIX_FMT_RGB565: 01548 __asm__ volatile( 01549 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 01550 "mov %4, %%"REG_b" \n\t" 01551 "push %%"REG_BP" \n\t" 01552 YSCALEYUV2RGB1b(%%REGBP, %5) 01553 "pxor %%mm7, %%mm7 \n\t" 01554 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 01555 #ifdef DITHER1XBPP 01556 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" 01557 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" 01558 "paddusb "RED_DITHER"(%5), %%mm5 \n\t" 01559 #endif 01560 01561 WRITERGB16(%%REGb, 8280(%5), %%REGBP) 01562 "pop %%"REG_BP" \n\t" 01563 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 01564 01565 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 01566 "a" (&c->redDither) 01567 ); 01568 return; 01569 case PIX_FMT_YUYV422: 01570 __asm__ volatile( 01571 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 01572 "mov %4, %%"REG_b" \n\t" 01573 "push %%"REG_BP" \n\t" 01574 YSCALEYUV2PACKED1b(%%REGBP, %5) 01575 WRITEYUY2(%%REGb, 8280(%5), %%REGBP) 01576 "pop %%"REG_BP" \n\t" 01577 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 01578 01579 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 01580 "a" (&c->redDither) 01581 ); 01582 return; 01583 } 01584 } 01585 } 01586 #endif /* COMPILE_TEMPLATE_MMX */ 01587 if (uvalpha < 2048) { 01588 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C) 01589 } else { 01590 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C) 01591 } 01592 } 01593 01594 //FIXME yuy2* can read up to 7 samples too much 01595 01596 static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused) 01597 { 01598 #if COMPILE_TEMPLATE_MMX 01599 __asm__ volatile( 01600 "movq "MANGLE(bm01010101)", %%mm2 \n\t" 01601 "mov %0, %%"REG_a" \n\t" 01602 "1: \n\t" 01603 "movq (%1, %%"REG_a",2), %%mm0 \n\t" 01604 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t" 01605 "pand %%mm2, %%mm0 \n\t" 01606 "pand %%mm2, %%mm1 \n\t" 01607 "packuswb %%mm1, %%mm0 \n\t" 01608 "movq %%mm0, (%2, %%"REG_a") \n\t" 01609 "add $8, %%"REG_a" \n\t" 01610 " js 1b \n\t" 01611 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width) 01612 : "%"REG_a 01613 ); 01614 #else 01615 int i; 01616 for (i=0; i<width; i++) 01617 dst[i]= src[2*i]; 01618 #endif 01619 } 01620 01621 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused) 01622 { 01623 #if COMPILE_TEMPLATE_MMX 01624 __asm__ volatile( 01625 "movq "MANGLE(bm01010101)", %%mm4 \n\t" 01626 "mov %0, %%"REG_a" \n\t" 01627 "1: \n\t" 01628 "movq (%1, %%"REG_a",4), %%mm0 \n\t" 01629 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t" 01630 "psrlw $8, %%mm0 \n\t" 01631 "psrlw $8, %%mm1 \n\t" 01632 "packuswb %%mm1, %%mm0 \n\t" 01633 "movq %%mm0, %%mm1 \n\t" 01634 "psrlw $8, %%mm0 \n\t" 01635 "pand %%mm4, %%mm1 \n\t" 01636 "packuswb %%mm0, %%mm0 \n\t" 01637 "packuswb %%mm1, %%mm1 \n\t" 01638 "movd %%mm0, (%3, %%"REG_a") \n\t" 01639 "movd %%mm1, (%2, %%"REG_a") \n\t" 01640 "add $4, %%"REG_a" \n\t" 01641 " js 1b \n\t" 01642 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width) 01643 : "%"REG_a 01644 ); 01645 #else 01646 int i; 01647 for (i=0; i<width; i++) { 01648 dstU[i]= src1[4*i + 1]; 01649 dstV[i]= src1[4*i + 3]; 01650 } 01651 #endif 01652 assert(src1 == src2); 01653 } 01654 01655 static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused) 01656 { 01657 #if COMPILE_TEMPLATE_MMX 01658 __asm__ volatile( 01659 "mov %0, %%"REG_a" \n\t" 01660 "1: \n\t" 01661 "movq (%1, %%"REG_a",2), %%mm0 \n\t" 01662 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t" 01663 "movq (%2, %%"REG_a",2), %%mm2 \n\t" 01664 "movq 8(%2, %%"REG_a",2), %%mm3 \n\t" 01665 "psrlw $8, %%mm0 \n\t" 01666 "psrlw $8, %%mm1 \n\t" 01667 "psrlw $8, %%mm2 \n\t" 01668 "psrlw $8, %%mm3 \n\t" 01669 "packuswb %%mm1, %%mm0 \n\t" 01670 "packuswb %%mm3, %%mm2 \n\t" 01671 "movq %%mm0, (%3, %%"REG_a") \n\t" 01672 "movq %%mm2, (%4, %%"REG_a") \n\t" 01673 "add $8, %%"REG_a" \n\t" 01674 " js 1b \n\t" 01675 : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width) 01676 : "%"REG_a 01677 ); 01678 #else 01679 int i; 01680 for (i=0; i<width; i++) { 01681 dstU[i]= src1[2*i + 1]; 01682 dstV[i]= src2[2*i + 1]; 01683 } 01684 #endif 01685 } 01686 01687 /* This is almost identical to the previous, end exists only because 01688 * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */ 01689 static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused) 01690 { 01691 #if COMPILE_TEMPLATE_MMX 01692 __asm__ volatile( 01693 "mov %0, %%"REG_a" \n\t" 01694 "1: \n\t" 01695 "movq (%1, %%"REG_a",2), %%mm0 \n\t" 01696 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t" 01697 "psrlw $8, %%mm0 \n\t" 01698 "psrlw $8, %%mm1 \n\t" 01699 "packuswb %%mm1, %%mm0 \n\t" 01700 "movq %%mm0, (%2, %%"REG_a") \n\t" 01701 "add $8, %%"REG_a" \n\t" 01702 " js 1b \n\t" 01703 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width) 01704 : "%"REG_a 01705 ); 01706 #else 01707 int i; 01708 for (i=0; i<width; i++) 01709 dst[i]= src[2*i+1]; 01710 #endif 01711 } 01712 01713 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused) 01714 { 01715 #if COMPILE_TEMPLATE_MMX 01716 __asm__ volatile( 01717 "movq "MANGLE(bm01010101)", %%mm4 \n\t" 01718 "mov %0, %%"REG_a" \n\t" 01719 "1: \n\t" 01720 "movq (%1, %%"REG_a",4), %%mm0 \n\t" 01721 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t" 01722 "pand %%mm4, %%mm0 \n\t" 01723 "pand %%mm4, %%mm1 \n\t" 01724 "packuswb %%mm1, %%mm0 \n\t" 01725 "movq %%mm0, %%mm1 \n\t" 01726 "psrlw $8, %%mm0 \n\t" 01727 "pand %%mm4, %%mm1 \n\t" 01728 "packuswb %%mm0, %%mm0 \n\t" 01729 "packuswb %%mm1, %%mm1 \n\t" 01730 "movd %%mm0, (%3, %%"REG_a") \n\t" 01731 "movd %%mm1, (%2, %%"REG_a") \n\t" 01732 "add $4, %%"REG_a" \n\t" 01733 " js 1b \n\t" 01734 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width) 01735 : "%"REG_a 01736 ); 01737 #else 01738 int i; 01739 for (i=0; i<width; i++) { 01740 dstU[i]= src1[4*i + 0]; 01741 dstV[i]= src1[4*i + 2]; 01742 } 01743 #endif 01744 assert(src1 == src2); 01745 } 01746 01747 static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused) 01748 { 01749 #if COMPILE_TEMPLATE_MMX 01750 __asm__ volatile( 01751 "movq "MANGLE(bm01010101)", %%mm4 \n\t" 01752 "mov %0, %%"REG_a" \n\t" 01753 "1: \n\t" 01754 "movq (%1, %%"REG_a",2), %%mm0 \n\t" 01755 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t" 01756 "movq (%2, %%"REG_a",2), %%mm2 \n\t" 01757 "movq 8(%2, %%"REG_a",2), %%mm3 \n\t" 01758 "pand %%mm4, %%mm0 \n\t" 01759 "pand %%mm4, %%mm1 \n\t" 01760 "pand %%mm4, %%mm2 \n\t" 01761 "pand %%mm4, %%mm3 \n\t" 01762 "packuswb %%mm1, %%mm0 \n\t" 01763 "packuswb %%mm3, %%mm2 \n\t" 01764 "movq %%mm0, (%3, %%"REG_a") \n\t" 01765 "movq %%mm2, (%4, %%"REG_a") \n\t" 01766 "add $8, %%"REG_a" \n\t" 01767 " js 1b \n\t" 01768 : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width) 01769 : "%"REG_a 01770 ); 01771 #else 01772 int i; 01773 for (i=0; i<width; i++) { 01774 dstU[i]= src1[2*i]; 01775 dstV[i]= src2[2*i]; 01776 } 01777 #endif 01778 } 01779 01780 static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2, 01781 const uint8_t *src, long width) 01782 { 01783 #if COMPILE_TEMPLATE_MMX 01784 __asm__ volatile( 01785 "movq "MANGLE(bm01010101)", %%mm4 \n\t" 01786 "mov %0, %%"REG_a" \n\t" 01787 "1: \n\t" 01788 "movq (%1, %%"REG_a",2), %%mm0 \n\t" 01789 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t" 01790 "movq %%mm0, %%mm2 \n\t" 01791 "movq %%mm1, %%mm3 \n\t" 01792 "pand %%mm4, %%mm0 \n\t" 01793 "pand %%mm4, %%mm1 \n\t" 01794 "psrlw $8, %%mm2 \n\t" 01795 "psrlw $8, %%mm3 \n\t" 01796 "packuswb %%mm1, %%mm0 \n\t" 01797 "packuswb %%mm3, %%mm2 \n\t" 01798 "movq %%mm0, (%2, %%"REG_a") \n\t" 01799 "movq %%mm2, (%3, %%"REG_a") \n\t" 01800 "add $8, %%"REG_a" \n\t" 01801 " js 1b \n\t" 01802 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width) 01803 : "%"REG_a 01804 ); 01805 #else 01806 int i; 01807 for (i = 0; i < width; i++) { 01808 dst1[i] = src[2*i+0]; 01809 dst2[i] = src[2*i+1]; 01810 } 01811 #endif 01812 } 01813 01814 static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV, 01815 const uint8_t *src1, const uint8_t *src2, 01816 long width, uint32_t *unused) 01817 { 01818 RENAME(nvXXtoUV)(dstU, dstV, src1, width); 01819 } 01820 01821 static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV, 01822 const uint8_t *src1, const uint8_t *src2, 01823 long width, uint32_t *unused) 01824 { 01825 RENAME(nvXXtoUV)(dstV, dstU, src1, width); 01826 } 01827 01828 #if COMPILE_TEMPLATE_MMX 01829 static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, enum PixelFormat srcFormat) 01830 { 01831 01832 if(srcFormat == PIX_FMT_BGR24) { 01833 __asm__ volatile( 01834 "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t" 01835 "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t" 01836 : 01837 ); 01838 } else { 01839 __asm__ volatile( 01840 "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t" 01841 "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t" 01842 : 01843 ); 01844 } 01845 01846 __asm__ volatile( 01847 "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t" 01848 "mov %2, %%"REG_a" \n\t" 01849 "pxor %%mm7, %%mm7 \n\t" 01850 "1: \n\t" 01851 PREFETCH" 64(%0) \n\t" 01852 "movd (%0), %%mm0 \n\t" 01853 "movd 2(%0), %%mm1 \n\t" 01854 "movd 6(%0), %%mm2 \n\t" 01855 "movd 8(%0), %%mm3 \n\t" 01856 "add $12, %0 \n\t" 01857 "punpcklbw %%mm7, %%mm0 \n\t" 01858 "punpcklbw %%mm7, %%mm1 \n\t" 01859 "punpcklbw %%mm7, %%mm2 \n\t" 01860 "punpcklbw %%mm7, %%mm3 \n\t" 01861 "pmaddwd %%mm5, %%mm0 \n\t" 01862 "pmaddwd %%mm6, %%mm1 \n\t" 01863 "pmaddwd %%mm5, %%mm2 \n\t" 01864 "pmaddwd %%mm6, %%mm3 \n\t" 01865 "paddd %%mm1, %%mm0 \n\t" 01866 "paddd %%mm3, %%mm2 \n\t" 01867 "paddd %%mm4, %%mm0 \n\t" 01868 "paddd %%mm4, %%mm2 \n\t" 01869 "psrad $15, %%mm0 \n\t" 01870 "psrad $15, %%mm2 \n\t" 01871 "packssdw %%mm2, %%mm0 \n\t" 01872 "packuswb %%mm0, %%mm0 \n\t" 01873 "movd %%mm0, (%1, %%"REG_a") \n\t" 01874 "add $4, %%"REG_a" \n\t" 01875 " js 1b \n\t" 01876 : "+r" (src) 01877 : "r" (dst+width), "g" ((x86_reg)-width) 01878 : "%"REG_a 01879 ); 01880 } 01881 01882 static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, enum PixelFormat srcFormat) 01883 { 01884 __asm__ volatile( 01885 "movq 24(%4), %%mm6 \n\t" 01886 "mov %3, %%"REG_a" \n\t" 01887 "pxor %%mm7, %%mm7 \n\t" 01888 "1: \n\t" 01889 PREFETCH" 64(%0) \n\t" 01890 "movd (%0), %%mm0 \n\t" 01891 "movd 2(%0), %%mm1 \n\t" 01892 "punpcklbw %%mm7, %%mm0 \n\t" 01893 "punpcklbw %%mm7, %%mm1 \n\t" 01894 "movq %%mm0, %%mm2 \n\t" 01895 "movq %%mm1, %%mm3 \n\t" 01896 "pmaddwd (%4), %%mm0 \n\t" 01897 "pmaddwd 8(%4), %%mm1 \n\t" 01898 "pmaddwd 16(%4), %%mm2 \n\t" 01899 "pmaddwd %%mm6, %%mm3 \n\t" 01900 "paddd %%mm1, %%mm0 \n\t" 01901 "paddd %%mm3, %%mm2 \n\t" 01902 01903 "movd 6(%0), %%mm1 \n\t" 01904 "movd 8(%0), %%mm3 \n\t" 01905 "add $12, %0 \n\t" 01906 "punpcklbw %%mm7, %%mm1 \n\t" 01907 "punpcklbw %%mm7, %%mm3 \n\t" 01908 "movq %%mm1, %%mm4 \n\t" 01909 "movq %%mm3, %%mm5 \n\t" 01910 "pmaddwd (%4), %%mm1 \n\t" 01911 "pmaddwd 8(%4), %%mm3 \n\t" 01912 "pmaddwd 16(%4), %%mm4 \n\t" 01913 "pmaddwd %%mm6, %%mm5 \n\t" 01914 "paddd %%mm3, %%mm1 \n\t" 01915 "paddd %%mm5, %%mm4 \n\t" 01916 01917 "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t" 01918 "paddd %%mm3, %%mm0 \n\t" 01919 "paddd %%mm3, %%mm2 \n\t" 01920 "paddd %%mm3, %%mm1 \n\t" 01921 "paddd %%mm3, %%mm4 \n\t" 01922 "psrad $15, %%mm0 \n\t" 01923 "psrad $15, %%mm2 \n\t" 01924 "psrad $15, %%mm1 \n\t" 01925 "psrad $15, %%mm4 \n\t" 01926 "packssdw %%mm1, %%mm0 \n\t" 01927 "packssdw %%mm4, %%mm2 \n\t" 01928 "packuswb %%mm0, %%mm0 \n\t" 01929 "packuswb %%mm2, %%mm2 \n\t" 01930 "movd %%mm0, (%1, %%"REG_a") \n\t" 01931 "movd %%mm2, (%2, %%"REG_a") \n\t" 01932 "add $4, %%"REG_a" \n\t" 01933 " js 1b \n\t" 01934 : "+r" (src) 01935 : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "r"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24]) 01936 : "%"REG_a 01937 ); 01938 } 01939 #endif 01940 01941 static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused) 01942 { 01943 #if COMPILE_TEMPLATE_MMX 01944 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24); 01945 #else 01946 int i; 01947 for (i=0; i<width; i++) { 01948 int b= src[i*3+0]; 01949 int g= src[i*3+1]; 01950 int r= src[i*3+2]; 01951 01952 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT); 01953 } 01954 #endif /* COMPILE_TEMPLATE_MMX */ 01955 } 01956 01957 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused) 01958 { 01959 #if COMPILE_TEMPLATE_MMX 01960 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24); 01961 #else 01962 int i; 01963 for (i=0; i<width; i++) { 01964 int b= src1[3*i + 0]; 01965 int g= src1[3*i + 1]; 01966 int r= src1[3*i + 2]; 01967 01968 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT; 01969 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT; 01970 } 01971 #endif /* COMPILE_TEMPLATE_MMX */ 01972 assert(src1 == src2); 01973 } 01974 01975 static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused) 01976 { 01977 int i; 01978 for (i=0; i<width; i++) { 01979 int b= src1[6*i + 0] + src1[6*i + 3]; 01980 int g= src1[6*i + 1] + src1[6*i + 4]; 01981 int r= src1[6*i + 2] + src1[6*i + 5]; 01982 01983 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1); 01984 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1); 01985 } 01986 assert(src1 == src2); 01987 } 01988 01989 static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused) 01990 { 01991 #if COMPILE_TEMPLATE_MMX 01992 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24); 01993 #else 01994 int i; 01995 for (i=0; i<width; i++) { 01996 int r= src[i*3+0]; 01997 int g= src[i*3+1]; 01998 int b= src[i*3+2]; 01999 02000 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT); 02001 } 02002 #endif 02003 } 02004 02005 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused) 02006 { 02007 #if COMPILE_TEMPLATE_MMX 02008 assert(src1==src2); 02009 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24); 02010 #else 02011 int i; 02012 assert(src1==src2); 02013 for (i=0; i<width; i++) { 02014 int r= src1[3*i + 0]; 02015 int g= src1[3*i + 1]; 02016 int b= src1[3*i + 2]; 02017 02018 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT; 02019 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT; 02020 } 02021 #endif 02022 } 02023 02024 static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused) 02025 { 02026 int i; 02027 assert(src1==src2); 02028 for (i=0; i<width; i++) { 02029 int r= src1[6*i + 0] + src1[6*i + 3]; 02030 int g= src1[6*i + 1] + src1[6*i + 4]; 02031 int b= src1[6*i + 2] + src1[6*i + 5]; 02032 02033 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1); 02034 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1); 02035 } 02036 } 02037 02038 02039 // bilinear / bicubic scaling 02040 static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc, 02041 const int16_t *filter, const int16_t *filterPos, long filterSize) 02042 { 02043 #if COMPILE_TEMPLATE_MMX 02044 assert(filterSize % 4 == 0 && filterSize>0); 02045 if (filterSize==4) { // Always true for upscaling, sometimes for down, too. 02046 x86_reg counter= -2*dstW; 02047 filter-= counter*2; 02048 filterPos-= counter/2; 02049 dst-= counter/2; 02050 __asm__ volatile( 02051 #if defined(PIC) 02052 "push %%"REG_b" \n\t" 02053 #endif 02054 "pxor %%mm7, %%mm7 \n\t" 02055 "push %%"REG_BP" \n\t" // we use 7 regs here ... 02056 "mov %%"REG_a", %%"REG_BP" \n\t" 02057 ASMALIGN(4) 02058 "1: \n\t" 02059 "movzwl (%2, %%"REG_BP"), %%eax \n\t" 02060 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t" 02061 "movq (%1, %%"REG_BP", 4), %%mm1 \n\t" 02062 "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t" 02063 "movd (%3, %%"REG_a"), %%mm0 \n\t" 02064 "movd (%3, %%"REG_b"), %%mm2 \n\t" 02065 "punpcklbw %%mm7, %%mm0 \n\t" 02066 "punpcklbw %%mm7, %%mm2 \n\t" 02067 "pmaddwd %%mm1, %%mm0 \n\t" 02068 "pmaddwd %%mm2, %%mm3 \n\t" 02069 "movq %%mm0, %%mm4 \n\t" 02070 "punpckldq %%mm3, %%mm0 \n\t" 02071 "punpckhdq %%mm3, %%mm4 \n\t" 02072 "paddd %%mm4, %%mm0 \n\t" 02073 "psrad $7, %%mm0 \n\t" 02074 "packssdw %%mm0, %%mm0 \n\t" 02075 "movd %%mm0, (%4, %%"REG_BP") \n\t" 02076 "add $4, %%"REG_BP" \n\t" 02077 " jnc 1b \n\t" 02078 02079 "pop %%"REG_BP" \n\t" 02080 #if defined(PIC) 02081 "pop %%"REG_b" \n\t" 02082 #endif 02083 : "+a" (counter) 02084 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) 02085 #if !defined(PIC) 02086 : "%"REG_b 02087 #endif 02088 ); 02089 } else if (filterSize==8) { 02090 x86_reg counter= -2*dstW; 02091 filter-= counter*4; 02092 filterPos-= counter/2; 02093 dst-= counter/2; 02094 __asm__ volatile( 02095 #if defined(PIC) 02096 "push %%"REG_b" \n\t" 02097 #endif 02098 "pxor %%mm7, %%mm7 \n\t" 02099 "push %%"REG_BP" \n\t" // we use 7 regs here ... 02100 "mov %%"REG_a", %%"REG_BP" \n\t" 02101 ASMALIGN(4) 02102 "1: \n\t" 02103 "movzwl (%2, %%"REG_BP"), %%eax \n\t" 02104 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t" 02105 "movq (%1, %%"REG_BP", 8), %%mm1 \n\t" 02106 "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t" 02107 "movd (%3, %%"REG_a"), %%mm0 \n\t" 02108 "movd (%3, %%"REG_b"), %%mm2 \n\t" 02109 "punpcklbw %%mm7, %%mm0 \n\t" 02110 "punpcklbw %%mm7, %%mm2 \n\t" 02111 "pmaddwd %%mm1, %%mm0 \n\t" 02112 "pmaddwd %%mm2, %%mm3 \n\t" 02113 02114 "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t" 02115 "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t" 02116 "movd 4(%3, %%"REG_a"), %%mm4 \n\t" 02117 "movd 4(%3, %%"REG_b"), %%mm2 \n\t" 02118 "punpcklbw %%mm7, %%mm4 \n\t" 02119 "punpcklbw %%mm7, %%mm2 \n\t" 02120 "pmaddwd %%mm1, %%mm4 \n\t" 02121 "pmaddwd %%mm2, %%mm5 \n\t" 02122 "paddd %%mm4, %%mm0 \n\t" 02123 "paddd %%mm5, %%mm3 \n\t" 02124 "movq %%mm0, %%mm4 \n\t" 02125 "punpckldq %%mm3, %%mm0 \n\t" 02126 "punpckhdq %%mm3, %%mm4 \n\t" 02127 "paddd %%mm4, %%mm0 \n\t" 02128 "psrad $7, %%mm0 \n\t" 02129 "packssdw %%mm0, %%mm0 \n\t" 02130 "movd %%mm0, (%4, %%"REG_BP") \n\t" 02131 "add $4, %%"REG_BP" \n\t" 02132 " jnc 1b \n\t" 02133 02134 "pop %%"REG_BP" \n\t" 02135 #if defined(PIC) 02136 "pop %%"REG_b" \n\t" 02137 #endif 02138 : "+a" (counter) 02139 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) 02140 #if !defined(PIC) 02141 : "%"REG_b 02142 #endif 02143 ); 02144 } else { 02145 const uint8_t *offset = src+filterSize; 02146 x86_reg counter= -2*dstW; 02147 //filter-= counter*filterSize/2; 02148 filterPos-= counter/2; 02149 dst-= counter/2; 02150 __asm__ volatile( 02151 "pxor %%mm7, %%mm7 \n\t" 02152 ASMALIGN(4) 02153 "1: \n\t" 02154 "mov %2, %%"REG_c" \n\t" 02155 "movzwl (%%"REG_c", %0), %%eax \n\t" 02156 "movzwl 2(%%"REG_c", %0), %%edx \n\t" 02157 "mov %5, %%"REG_c" \n\t" 02158 "pxor %%mm4, %%mm4 \n\t" 02159 "pxor %%mm5, %%mm5 \n\t" 02160 "2: \n\t" 02161 "movq (%1), %%mm1 \n\t" 02162 "movq (%1, %6), %%mm3 \n\t" 02163 "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t" 02164 "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t" 02165 "punpcklbw %%mm7, %%mm0 \n\t" 02166 "punpcklbw %%mm7, %%mm2 \n\t" 02167 "pmaddwd %%mm1, %%mm0 \n\t" 02168 "pmaddwd %%mm2, %%mm3 \n\t" 02169 "paddd %%mm3, %%mm5 \n\t" 02170 "paddd %%mm0, %%mm4 \n\t" 02171 "add $8, %1 \n\t" 02172 "add $4, %%"REG_c" \n\t" 02173 "cmp %4, %%"REG_c" \n\t" 02174 " jb 2b \n\t" 02175 "add %6, %1 \n\t" 02176 "movq %%mm4, %%mm0 \n\t" 02177 "punpckldq %%mm5, %%mm4 \n\t" 02178 "punpckhdq %%mm5, %%mm0 \n\t" 02179 "paddd %%mm0, %%mm4 \n\t" 02180 "psrad $7, %%mm4 \n\t" 02181 "packssdw %%mm4, %%mm4 \n\t" 02182 "mov %3, %%"REG_a" \n\t" 02183 "movd %%mm4, (%%"REG_a", %0) \n\t" 02184 "add $4, %0 \n\t" 02185 " jnc 1b \n\t" 02186 02187 : "+r" (counter), "+r" (filter) 02188 : "m" (filterPos), "m" (dst), "m"(offset), 02189 "m" (src), "r" ((x86_reg)filterSize*2) 02190 : "%"REG_a, "%"REG_c, "%"REG_d 02191 ); 02192 } 02193 #else 02194 #if COMPILE_TEMPLATE_ALTIVEC 02195 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize); 02196 #else 02197 int i; 02198 for (i=0; i<dstW; i++) { 02199 int j; 02200 int srcPos= filterPos[i]; 02201 int val=0; 02202 //printf("filterPos: %d\n", filterPos[i]); 02203 for (j=0; j<filterSize; j++) { 02204 //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]); 02205 val += ((int)src[srcPos + j])*filter[filterSize*i + j]; 02206 } 02207 //filter += hFilterSize; 02208 dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ... 02209 //dst[i] = val>>7; 02210 } 02211 #endif /* COMPILE_TEMPLATE_ALTIVEC */ 02212 #endif /* COMPILE_MMX */ 02213 } 02214 02215 //FIXME all pal and rgb srcFormats could do this convertion as well 02216 //FIXME all scalers more complex than bilinear could do half of this transform 02217 static void RENAME(chrRangeToJpeg)(uint16_t *dst, int width) 02218 { 02219 int i; 02220 for (i = 0; i < width; i++) { 02221 dst[i ] = (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264 02222 dst[i+VOFW] = (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264 02223 } 02224 } 02225 static void RENAME(chrRangeFromJpeg)(uint16_t *dst, int width) 02226 { 02227 int i; 02228 for (i = 0; i < width; i++) { 02229 dst[i ] = (dst[i ]*1799 + 4081085)>>11; //1469 02230 dst[i+VOFW] = (dst[i+VOFW]*1799 + 4081085)>>11; //1469 02231 } 02232 } 02233 static void RENAME(lumRangeToJpeg)(uint16_t *dst, int width) 02234 { 02235 int i; 02236 for (i = 0; i < width; i++) 02237 dst[i] = (FFMIN(dst[i],30189)*19077 - 39057361)>>14; 02238 } 02239 static void RENAME(lumRangeFromJpeg)(uint16_t *dst, int width) 02240 { 02241 int i; 02242 for (i = 0; i < width; i++) 02243 dst[i] = (dst[i]*14071 + 33561947)>>14; 02244 } 02245 02246 #define FAST_BILINEAR_X86 \ 02247 "subl %%edi, %%esi \n\t" /* src[xx+1] - src[xx] */ \ 02248 "imull %%ecx, %%esi \n\t" /* (src[xx+1] - src[xx])*xalpha */ \ 02249 "shll $16, %%edi \n\t" \ 02250 "addl %%edi, %%esi \n\t" /* src[xx+1]*xalpha + src[xx]*(1-xalpha) */ \ 02251 "mov %1, %%"REG_D"\n\t" \ 02252 "shrl $9, %%esi \n\t" \ 02253 02254 static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst, 02255 long dstWidth, const uint8_t *src, int srcW, 02256 int xInc) 02257 { 02258 #if ARCH_X86 02259 #if COMPILE_TEMPLATE_MMX2 02260 int32_t *filterPos = c->hLumFilterPos; 02261 int16_t *filter = c->hLumFilter; 02262 int canMMX2BeUsed = c->canMMX2BeUsed; 02263 void *mmx2FilterCode= c->lumMmx2FilterCode; 02264 int i; 02265 #if defined(PIC) 02266 DECLARE_ALIGNED(8, uint64_t, ebxsave); 02267 #endif 02268 if (canMMX2BeUsed) { 02269 __asm__ volatile( 02270 #if defined(PIC) 02271 "mov %%"REG_b", %5 \n\t" 02272 #endif 02273 "pxor %%mm7, %%mm7 \n\t" 02274 "mov %0, %%"REG_c" \n\t" 02275 "mov %1, %%"REG_D" \n\t" 02276 "mov %2, %%"REG_d" \n\t" 02277 "mov %3, %%"REG_b" \n\t" 02278 "xor %%"REG_a", %%"REG_a" \n\t" // i 02279 PREFETCH" (%%"REG_c") \n\t" 02280 PREFETCH" 32(%%"REG_c") \n\t" 02281 PREFETCH" 64(%%"REG_c") \n\t" 02282 02283 #if ARCH_X86_64 02284 02285 #define CALL_MMX2_FILTER_CODE \ 02286 "movl (%%"REG_b"), %%esi \n\t"\ 02287 "call *%4 \n\t"\ 02288 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\ 02289 "add %%"REG_S", %%"REG_c" \n\t"\ 02290 "add %%"REG_a", %%"REG_D" \n\t"\ 02291 "xor %%"REG_a", %%"REG_a" \n\t"\ 02292 02293 #else 02294 02295 #define CALL_MMX2_FILTER_CODE \ 02296 "movl (%%"REG_b"), %%esi \n\t"\ 02297 "call *%4 \n\t"\ 02298 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\ 02299 "add %%"REG_a", %%"REG_D" \n\t"\ 02300 "xor %%"REG_a", %%"REG_a" \n\t"\ 02301 02302 #endif /* ARCH_X86_64 */ 02303 02304 CALL_MMX2_FILTER_CODE 02305 CALL_MMX2_FILTER_CODE 02306 CALL_MMX2_FILTER_CODE 02307 CALL_MMX2_FILTER_CODE 02308 CALL_MMX2_FILTER_CODE 02309 CALL_MMX2_FILTER_CODE 02310 CALL_MMX2_FILTER_CODE 02311 CALL_MMX2_FILTER_CODE 02312 02313 #if defined(PIC) 02314 "mov %5, %%"REG_b" \n\t" 02315 #endif 02316 :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos), 02317 "m" (mmx2FilterCode) 02318 #if defined(PIC) 02319 ,"m" (ebxsave) 02320 #endif 02321 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D 02322 #if !defined(PIC) 02323 ,"%"REG_b 02324 #endif 02325 ); 02326 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128; 02327 } else { 02328 #endif /* COMPILE_TEMPLATE_MMX2 */ 02329 x86_reg xInc_shr16 = xInc >> 16; 02330 uint16_t xInc_mask = xInc & 0xffff; 02331 //NO MMX just normal asm ... 02332 __asm__ volatile( 02333 "xor %%"REG_a", %%"REG_a" \n\t" // i 02334 "xor %%"REG_d", %%"REG_d" \n\t" // xx 02335 "xorl %%ecx, %%ecx \n\t" // xalpha 02336 ASMALIGN(4) 02337 "1: \n\t" 02338 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx] 02339 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1] 02340 FAST_BILINEAR_X86 02341 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t" 02342 "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF 02343 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry 02344 02345 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx] 02346 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1] 02347 FAST_BILINEAR_X86 02348 "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t" 02349 "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF 02350 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry 02351 02352 02353 "add $2, %%"REG_a" \n\t" 02354 "cmp %2, %%"REG_a" \n\t" 02355 " jb 1b \n\t" 02356 02357 02358 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask) 02359 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi" 02360 ); 02361 #if COMPILE_TEMPLATE_MMX2 02362 } //if MMX2 can't be used 02363 #endif 02364 #else 02365 int i; 02366 unsigned int xpos=0; 02367 for (i=0;i<dstWidth;i++) { 02368 register unsigned int xx=xpos>>16; 02369 register unsigned int xalpha=(xpos&0xFFFF)>>9; 02370 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha; 02371 xpos+=xInc; 02372 } 02373 #endif /* ARCH_X86 */ 02374 } 02375 02376 // *** horizontal scale Y line to temp buffer 02377 static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc, 02378 const int16_t *hLumFilter, 02379 const int16_t *hLumFilterPos, int hLumFilterSize, 02380 uint8_t *formatConvBuffer, 02381 uint32_t *pal, int isAlpha) 02382 { 02383 void (*toYV12)(uint8_t *, const uint8_t *, long, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12; 02384 void (*convertRange)(uint16_t *, int) = isAlpha ? NULL : c->lumConvertRange; 02385 02386 src += isAlpha ? c->alpSrcOffset : c->lumSrcOffset; 02387 02388 if (toYV12) { 02389 toYV12(formatConvBuffer, src, srcW, pal); 02390 src= formatConvBuffer; 02391 } 02392 02393 if (!c->hyscale_fast) { 02394 c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize); 02395 } else { // fast bilinear upscale / crap downscale 02396 c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc); 02397 } 02398 02399 if (convertRange) 02400 convertRange(dst, dstWidth); 02401 } 02402 02403 static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst, 02404 long dstWidth, const uint8_t *src1, 02405 const uint8_t *src2, int srcW, int xInc) 02406 { 02407 #if ARCH_X86 02408 #if COMPILE_TEMPLATE_MMX2 02409 int32_t *filterPos = c->hChrFilterPos; 02410 int16_t *filter = c->hChrFilter; 02411 int canMMX2BeUsed = c->canMMX2BeUsed; 02412 void *mmx2FilterCode= c->chrMmx2FilterCode; 02413 int i; 02414 #if defined(PIC) 02415 DECLARE_ALIGNED(8, uint64_t, ebxsave); 02416 #endif 02417 if (canMMX2BeUsed) { 02418 __asm__ volatile( 02419 #if defined(PIC) 02420 "mov %%"REG_b", %6 \n\t" 02421 #endif 02422 "pxor %%mm7, %%mm7 \n\t" 02423 "mov %0, %%"REG_c" \n\t" 02424 "mov %1, %%"REG_D" \n\t" 02425 "mov %2, %%"REG_d" \n\t" 02426 "mov %3, %%"REG_b" \n\t" 02427 "xor %%"REG_a", %%"REG_a" \n\t" // i 02428 PREFETCH" (%%"REG_c") \n\t" 02429 PREFETCH" 32(%%"REG_c") \n\t" 02430 PREFETCH" 64(%%"REG_c") \n\t" 02431 02432 CALL_MMX2_FILTER_CODE 02433 CALL_MMX2_FILTER_CODE 02434 CALL_MMX2_FILTER_CODE 02435 CALL_MMX2_FILTER_CODE 02436 "xor %%"REG_a", %%"REG_a" \n\t" // i 02437 "mov %5, %%"REG_c" \n\t" // src 02438 "mov %1, %%"REG_D" \n\t" // buf1 02439 "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t" 02440 PREFETCH" (%%"REG_c") \n\t" 02441 PREFETCH" 32(%%"REG_c") \n\t" 02442 PREFETCH" 64(%%"REG_c") \n\t" 02443 02444 CALL_MMX2_FILTER_CODE 02445 CALL_MMX2_FILTER_CODE 02446 CALL_MMX2_FILTER_CODE 02447 CALL_MMX2_FILTER_CODE 02448 02449 #if defined(PIC) 02450 "mov %6, %%"REG_b" \n\t" 02451 #endif 02452 :: "m" (src1), "m" (dst), "m" (filter), "m" (filterPos), 02453 "m" (mmx2FilterCode), "m" (src2) 02454 #if defined(PIC) 02455 ,"m" (ebxsave) 02456 #endif 02457 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D 02458 #if !defined(PIC) 02459 ,"%"REG_b 02460 #endif 02461 ); 02462 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) { 02463 //printf("%d %d %d\n", dstWidth, i, srcW); 02464 dst[i] = src1[srcW-1]*128; 02465 dst[i+VOFW] = src2[srcW-1]*128; 02466 } 02467 } else { 02468 #endif /* COMPILE_TEMPLATE_MMX2 */ 02469 x86_reg xInc_shr16 = (x86_reg) (xInc >> 16); 02470 uint16_t xInc_mask = xInc & 0xffff; 02471 __asm__ volatile( 02472 "xor %%"REG_a", %%"REG_a" \n\t" // i 02473 "xor %%"REG_d", %%"REG_d" \n\t" // xx 02474 "xorl %%ecx, %%ecx \n\t" // xalpha 02475 ASMALIGN(4) 02476 "1: \n\t" 02477 "mov %0, %%"REG_S" \n\t" 02478 "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx] 02479 "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1] 02480 FAST_BILINEAR_X86 02481 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t" 02482 02483 "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx] 02484 "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1] 02485 FAST_BILINEAR_X86 02486 "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t" 02487 02488 "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF 02489 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry 02490 "add $1, %%"REG_a" \n\t" 02491 "cmp %2, %%"REG_a" \n\t" 02492 " jb 1b \n\t" 02493 02494 /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here, 02495 which is needed to support GCC 4.0. */ 02496 #if ARCH_X86_64 && AV_GCC_VERSION_AT_LEAST(3,4) 02497 :: "m" (src1), "m" (dst), "g" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask), 02498 #else 02499 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask), 02500 #endif 02501 "r" (src2) 02502 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi" 02503 ); 02504 #if COMPILE_TEMPLATE_MMX2 02505 } //if MMX2 can't be used 02506 #endif 02507 #else 02508 int i; 02509 unsigned int xpos=0; 02510 for (i=0;i<dstWidth;i++) { 02511 register unsigned int xx=xpos>>16; 02512 register unsigned int xalpha=(xpos&0xFFFF)>>9; 02513 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha); 02514 dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha); 02515 /* slower 02516 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha; 02517 dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha; 02518 */ 02519 xpos+=xInc; 02520 } 02521 #endif /* ARCH_X86 */ 02522 } 02523 02524 inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2, 02525 int srcW, int xInc, const int16_t *hChrFilter, 02526 const int16_t *hChrFilterPos, int hChrFilterSize, 02527 uint8_t *formatConvBuffer, 02528 uint32_t *pal) 02529 { 02530 02531 src1 += c->chrSrcOffset; 02532 src2 += c->chrSrcOffset; 02533 02534 if (c->chrToYV12) { 02535 c->chrToYV12(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal); 02536 src1= formatConvBuffer; 02537 src2= formatConvBuffer+VOFW; 02538 } 02539 02540 if (!c->hcscale_fast) { 02541 c->hScale(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); 02542 c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); 02543 } else { // fast bilinear upscale / crap downscale 02544 c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc); 02545 } 02546 02547 if (c->chrConvertRange) 02548 c->chrConvertRange(dst, dstWidth); 02549 } 02550 02551 #define DEBUG_SWSCALE_BUFFERS 0 02552 #define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__) 02553 02554 static int RENAME(swScale)(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY, 02555 int srcSliceH, uint8_t* dst[], int dstStride[]) 02556 { 02557 /* load a few things into local vars to make the code more readable? and faster */ 02558 const int srcW= c->srcW; 02559 const int dstW= c->dstW; 02560 const int dstH= c->dstH; 02561 const int chrDstW= c->chrDstW; 02562 const int chrSrcW= c->chrSrcW; 02563 const int lumXInc= c->lumXInc; 02564 const int chrXInc= c->chrXInc; 02565 const enum PixelFormat dstFormat= c->dstFormat; 02566 const int flags= c->flags; 02567 int16_t *vLumFilterPos= c->vLumFilterPos; 02568 int16_t *vChrFilterPos= c->vChrFilterPos; 02569 int16_t *hLumFilterPos= c->hLumFilterPos; 02570 int16_t *hChrFilterPos= c->hChrFilterPos; 02571 int16_t *vLumFilter= c->vLumFilter; 02572 int16_t *vChrFilter= c->vChrFilter; 02573 int16_t *hLumFilter= c->hLumFilter; 02574 int16_t *hChrFilter= c->hChrFilter; 02575 int32_t *lumMmxFilter= c->lumMmxFilter; 02576 int32_t *chrMmxFilter= c->chrMmxFilter; 02577 int32_t av_unused *alpMmxFilter= c->alpMmxFilter; 02578 const int vLumFilterSize= c->vLumFilterSize; 02579 const int vChrFilterSize= c->vChrFilterSize; 02580 const int hLumFilterSize= c->hLumFilterSize; 02581 const int hChrFilterSize= c->hChrFilterSize; 02582 int16_t **lumPixBuf= c->lumPixBuf; 02583 int16_t **chrPixBuf= c->chrPixBuf; 02584 int16_t **alpPixBuf= c->alpPixBuf; 02585 const int vLumBufSize= c->vLumBufSize; 02586 const int vChrBufSize= c->vChrBufSize; 02587 uint8_t *formatConvBuffer= c->formatConvBuffer; 02588 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample; 02589 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample); 02590 int lastDstY; 02591 uint32_t *pal=c->pal_yuv; 02592 02593 /* vars which will change and which we need to store back in the context */ 02594 int dstY= c->dstY; 02595 int lumBufIndex= c->lumBufIndex; 02596 int chrBufIndex= c->chrBufIndex; 02597 int lastInLumBuf= c->lastInLumBuf; 02598 int lastInChrBuf= c->lastInChrBuf; 02599 02600 if (isPacked(c->srcFormat)) { 02601 src[0]= 02602 src[1]= 02603 src[2]= 02604 src[3]= src[0]; 02605 srcStride[0]= 02606 srcStride[1]= 02607 srcStride[2]= 02608 srcStride[3]= srcStride[0]; 02609 } 02610 srcStride[1]<<= c->vChrDrop; 02611 srcStride[2]<<= c->vChrDrop; 02612 02613 DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n", 02614 src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3], 02615 dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]); 02616 DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n", 02617 srcSliceY, srcSliceH, dstY, dstH); 02618 DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n", 02619 vLumFilterSize, vLumBufSize, vChrFilterSize, vChrBufSize); 02620 02621 if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) { 02622 static int warnedAlready=0; //FIXME move this into the context perhaps 02623 if (flags & SWS_PRINT_INFO && !warnedAlready) { 02624 av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n" 02625 " ->cannot do aligned memory accesses anymore\n"); 02626 warnedAlready=1; 02627 } 02628 } 02629 02630 /* Note the user might start scaling the picture in the middle so this 02631 will not get executed. This is not really intended but works 02632 currently, so people might do it. */ 02633 if (srcSliceY ==0) { 02634 lumBufIndex=-1; 02635 chrBufIndex=-1; 02636 dstY=0; 02637 lastInLumBuf= -1; 02638 lastInChrBuf= -1; 02639 } 02640 02641 lastDstY= dstY; 02642 02643 for (;dstY < dstH; dstY++) { 02644 unsigned char *dest =dst[0]+dstStride[0]*dstY; 02645 const int chrDstY= dstY>>c->chrDstVSubSample; 02646 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY; 02647 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY; 02648 unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL; 02649 02650 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input 02651 const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)]; 02652 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input 02653 int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input 02654 int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input 02655 int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input 02656 int enough_lines; 02657 02658 //handle holes (FAST_BILINEAR & weird filters) 02659 if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1; 02660 if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1; 02661 assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1); 02662 assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1); 02663 02664 DEBUG_BUFFERS("dstY: %d\n", dstY); 02665 DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n", 02666 firstLumSrcY, lastLumSrcY, lastInLumBuf); 02667 DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n", 02668 firstChrSrcY, lastChrSrcY, lastInChrBuf); 02669 02670 // Do we have enough lines in this slice to output the dstY line 02671 enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample); 02672 02673 if (!enough_lines) { 02674 lastLumSrcY = srcSliceY + srcSliceH - 1; 02675 lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1; 02676 DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n", 02677 lastLumSrcY, lastChrSrcY); 02678 } 02679 02680 //Do horizontal scaling 02681 while(lastInLumBuf < lastLumSrcY) { 02682 const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0]; 02683 const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3]; 02684 lumBufIndex++; 02685 assert(lumBufIndex < 2*vLumBufSize); 02686 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH); 02687 assert(lastInLumBuf + 1 - srcSliceY >= 0); 02688 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc, 02689 hLumFilter, hLumFilterPos, hLumFilterSize, 02690 formatConvBuffer, 02691 pal, 0); 02692 if (CONFIG_SWSCALE_ALPHA && alpPixBuf) 02693 RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc, 02694 hLumFilter, hLumFilterPos, hLumFilterSize, 02695 formatConvBuffer, 02696 pal, 1); 02697 lastInLumBuf++; 02698 DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n", 02699 lumBufIndex, lastInLumBuf); 02700 } 02701 while(lastInChrBuf < lastChrSrcY) { 02702 const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1]; 02703 const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2]; 02704 chrBufIndex++; 02705 assert(chrBufIndex < 2*vChrBufSize); 02706 assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH)); 02707 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0); 02708 //FIXME replace parameters through context struct (some at least) 02709 02710 if (c->needs_hcscale) 02711 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc, 02712 hChrFilter, hChrFilterPos, hChrFilterSize, 02713 formatConvBuffer, 02714 pal); 02715 lastInChrBuf++; 02716 DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n", 02717 chrBufIndex, lastInChrBuf); 02718 } 02719 //wrap buf index around to stay inside the ring buffer 02720 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize; 02721 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize; 02722 if (!enough_lines) 02723 break; //we can't output a dstY line so let's try with the next slice 02724 02725 #if COMPILE_TEMPLATE_MMX 02726 c->blueDither= ff_dither8[dstY&1]; 02727 if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555) 02728 c->greenDither= ff_dither8[dstY&1]; 02729 else 02730 c->greenDither= ff_dither4[dstY&1]; 02731 c->redDither= ff_dither8[(dstY+1)&1]; 02732 #endif 02733 if (dstY < dstH-2) { 02734 const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; 02735 const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; 02736 const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL; 02737 #if COMPILE_TEMPLATE_MMX 02738 int i; 02739 if (flags & SWS_ACCURATE_RND) { 02740 int s= APCK_SIZE / 8; 02741 for (i=0; i<vLumFilterSize; i+=2) { 02742 *(const void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ]; 02743 *(const void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)]; 02744 lumMmxFilter[s*i+APCK_COEF/4 ]= 02745 lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ] 02746 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0); 02747 if (CONFIG_SWSCALE_ALPHA && alpPixBuf) { 02748 *(const void**)&alpMmxFilter[s*i ]= alpSrcPtr[i ]; 02749 *(const void**)&alpMmxFilter[s*i+APCK_PTR2/4 ]= alpSrcPtr[i+(vLumFilterSize>1)]; 02750 alpMmxFilter[s*i+APCK_COEF/4 ]= 02751 alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4 ]; 02752 } 02753 } 02754 for (i=0; i<vChrFilterSize; i+=2) { 02755 *(const void**)&chrMmxFilter[s*i ]= chrSrcPtr[i ]; 02756 *(const void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrSrcPtr[i+(vChrFilterSize>1)]; 02757 chrMmxFilter[s*i+APCK_COEF/4 ]= 02758 chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ] 02759 + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0); 02760 } 02761 } else { 02762 for (i=0; i<vLumFilterSize; i++) { 02763 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i]; 02764 lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32; 02765 lumMmxFilter[4*i+2]= 02766 lumMmxFilter[4*i+3]= 02767 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001; 02768 if (CONFIG_SWSCALE_ALPHA && alpPixBuf) { 02769 alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i]; 02770 alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32; 02771 alpMmxFilter[4*i+2]= 02772 alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2]; 02773 } 02774 } 02775 for (i=0; i<vChrFilterSize; i++) { 02776 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i]; 02777 chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32; 02778 chrMmxFilter[4*i+2]= 02779 chrMmxFilter[4*i+3]= 02780 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001; 02781 } 02782 } 02783 #endif 02784 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) { 02785 const int chrSkipMask= (1<<c->chrDstVSubSample)-1; 02786 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi 02787 c->yuv2nv12X(c, 02788 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, 02789 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, 02790 dest, uDest, dstW, chrDstW, dstFormat); 02791 } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like 02792 const int chrSkipMask= (1<<c->chrDstVSubSample)-1; 02793 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi 02794 if (is16BPS(dstFormat)) { 02795 yuv2yuvX16inC( 02796 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, 02797 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, 02798 alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW, 02799 dstFormat); 02800 } else if (vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12 02801 const int16_t *lumBuf = lumSrcPtr[0]; 02802 const int16_t *chrBuf= chrSrcPtr[0]; 02803 const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL; 02804 c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW); 02805 } else { //General YV12 02806 c->yuv2yuvX(c, 02807 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, 02808 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, 02809 alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW); 02810 } 02811 } else { 02812 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2); 02813 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2); 02814 if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB 02815 int chrAlpha= vChrFilter[2*dstY+1]; 02816 if(flags & SWS_FULL_CHR_H_INT) { 02817 yuv2rgbXinC_full(c, //FIXME write a packed1_full function 02818 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, 02819 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, 02820 alpSrcPtr, dest, dstW, dstY); 02821 } else { 02822 c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1), 02823 alpPixBuf ? *alpSrcPtr : NULL, 02824 dest, dstW, chrAlpha, dstFormat, flags, dstY); 02825 } 02826 } else if (vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB 02827 int lumAlpha= vLumFilter[2*dstY+1]; 02828 int chrAlpha= vChrFilter[2*dstY+1]; 02829 lumMmxFilter[2]= 02830 lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001; 02831 chrMmxFilter[2]= 02832 chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001; 02833 if(flags & SWS_FULL_CHR_H_INT) { 02834 yuv2rgbXinC_full(c, //FIXME write a packed2_full function 02835 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, 02836 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, 02837 alpSrcPtr, dest, dstW, dstY); 02838 } else { 02839 c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1), 02840 alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL, 02841 dest, dstW, lumAlpha, chrAlpha, dstY); 02842 } 02843 } else { //general RGB 02844 if(flags & SWS_FULL_CHR_H_INT) { 02845 yuv2rgbXinC_full(c, 02846 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, 02847 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, 02848 alpSrcPtr, dest, dstW, dstY); 02849 } else { 02850 c->yuv2packedX(c, 02851 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, 02852 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, 02853 alpSrcPtr, dest, dstW, dstY); 02854 } 02855 } 02856 } 02857 } else { // hmm looks like we can't use MMX here without overwriting this array's tail 02858 const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; 02859 const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; 02860 const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL; 02861 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) { 02862 const int chrSkipMask= (1<<c->chrDstVSubSample)-1; 02863 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi 02864 yuv2nv12XinC( 02865 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, 02866 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, 02867 dest, uDest, dstW, chrDstW, dstFormat); 02868 } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 02869 const int chrSkipMask= (1<<c->chrDstVSubSample)-1; 02870 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi 02871 if (is16BPS(dstFormat)) { 02872 yuv2yuvX16inC( 02873 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, 02874 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, 02875 alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW, 02876 dstFormat); 02877 } else { 02878 yuv2yuvXinC( 02879 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, 02880 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, 02881 alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW); 02882 } 02883 } else { 02884 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2); 02885 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2); 02886 if(flags & SWS_FULL_CHR_H_INT) { 02887 yuv2rgbXinC_full(c, 02888 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, 02889 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, 02890 alpSrcPtr, dest, dstW, dstY); 02891 } else { 02892 yuv2packedXinC(c, 02893 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, 02894 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, 02895 alpSrcPtr, dest, dstW, dstY); 02896 } 02897 } 02898 } 02899 } 02900 02901 if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf) 02902 fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255); 02903 02904 #if COMPILE_TEMPLATE_MMX 02905 if (flags & SWS_CPU_CAPS_MMX2 ) __asm__ volatile("sfence":::"memory"); 02906 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */ 02907 if (flags & SWS_CPU_CAPS_3DNOW) __asm__ volatile("femms" :::"memory"); 02908 else __asm__ volatile("emms" :::"memory"); 02909 #endif 02910 /* store changed local vars back in the context */ 02911 c->dstY= dstY; 02912 c->lumBufIndex= lumBufIndex; 02913 c->chrBufIndex= chrBufIndex; 02914 c->lastInLumBuf= lastInLumBuf; 02915 c->lastInChrBuf= lastInChrBuf; 02916 02917 return dstY - lastDstY; 02918 } 02919 02920 static void RENAME(sws_init_swScale)(SwsContext *c) 02921 { 02922 enum PixelFormat srcFormat = c->srcFormat; 02923 02924 c->yuv2nv12X = RENAME(yuv2nv12X ); 02925 c->yuv2yuv1 = RENAME(yuv2yuv1 ); 02926 c->yuv2yuvX = RENAME(yuv2yuvX ); 02927 c->yuv2packed1 = RENAME(yuv2packed1 ); 02928 c->yuv2packed2 = RENAME(yuv2packed2 ); 02929 c->yuv2packedX = RENAME(yuv2packedX ); 02930 02931 c->hScale = RENAME(hScale ); 02932 02933 #if COMPILE_TEMPLATE_MMX 02934 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one). 02935 if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed) 02936 #else 02937 if (c->flags & SWS_FAST_BILINEAR) 02938 #endif 02939 { 02940 c->hyscale_fast = RENAME(hyscale_fast); 02941 c->hcscale_fast = RENAME(hcscale_fast); 02942 } 02943 02944 c->chrToYV12 = NULL; 02945 switch(srcFormat) { 02946 case PIX_FMT_YUYV422 : c->chrToYV12 = RENAME(yuy2ToUV); break; 02947 case PIX_FMT_UYVY422 : c->chrToYV12 = RENAME(uyvyToUV); break; 02948 case PIX_FMT_NV12 : c->chrToYV12 = RENAME(nv12ToUV); break; 02949 case PIX_FMT_NV21 : c->chrToYV12 = RENAME(nv21ToUV); break; 02950 case PIX_FMT_RGB8 : 02951 case PIX_FMT_BGR8 : 02952 case PIX_FMT_PAL8 : 02953 case PIX_FMT_BGR4_BYTE: 02954 case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV; break; 02955 case PIX_FMT_YUV420P16BE: 02956 case PIX_FMT_YUV422P16BE: 02957 case PIX_FMT_YUV444P16BE: c->chrToYV12 = RENAME(BEToUV); break; 02958 case PIX_FMT_YUV420P16LE: 02959 case PIX_FMT_YUV422P16LE: 02960 case PIX_FMT_YUV444P16LE: c->chrToYV12 = RENAME(LEToUV); break; 02961 } 02962 if (c->chrSrcHSubSample) { 02963 switch(srcFormat) { 02964 case PIX_FMT_RGB48BE: 02965 case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV_half; break; 02966 case PIX_FMT_RGB32 : 02967 case PIX_FMT_RGB32_1: c->chrToYV12 = bgr32ToUV_half; break; 02968 case PIX_FMT_BGR24 : c->chrToYV12 = RENAME(bgr24ToUV_half); break; 02969 case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV_half; break; 02970 case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV_half; break; 02971 case PIX_FMT_BGR32 : 02972 case PIX_FMT_BGR32_1: c->chrToYV12 = rgb32ToUV_half; break; 02973 case PIX_FMT_RGB24 : c->chrToYV12 = RENAME(rgb24ToUV_half); break; 02974 case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV_half; break; 02975 case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV_half; break; 02976 } 02977 } else { 02978 switch(srcFormat) { 02979 case PIX_FMT_RGB48BE: 02980 case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV; break; 02981 case PIX_FMT_RGB32 : 02982 case PIX_FMT_RGB32_1: c->chrToYV12 = bgr32ToUV; break; 02983 case PIX_FMT_BGR24 : c->chrToYV12 = RENAME(bgr24ToUV); break; 02984 case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV; break; 02985 case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV; break; 02986 case PIX_FMT_BGR32 : 02987 case PIX_FMT_BGR32_1: c->chrToYV12 = rgb32ToUV; break; 02988 case PIX_FMT_RGB24 : c->chrToYV12 = RENAME(rgb24ToUV); break; 02989 case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV; break; 02990 case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV; break; 02991 } 02992 } 02993 02994 c->lumToYV12 = NULL; 02995 c->alpToYV12 = NULL; 02996 switch (srcFormat) { 02997 case PIX_FMT_YUYV422 : 02998 case PIX_FMT_YUV420P16BE: 02999 case PIX_FMT_YUV422P16BE: 03000 case PIX_FMT_YUV444P16BE: 03001 case PIX_FMT_GRAY16BE : c->lumToYV12 = RENAME(yuy2ToY); break; 03002 case PIX_FMT_UYVY422 : 03003 case PIX_FMT_YUV420P16LE: 03004 case PIX_FMT_YUV422P16LE: 03005 case PIX_FMT_YUV444P16LE: 03006 case PIX_FMT_GRAY16LE : c->lumToYV12 = RENAME(uyvyToY); break; 03007 case PIX_FMT_BGR24 : c->lumToYV12 = RENAME(bgr24ToY); break; 03008 case PIX_FMT_BGR565 : c->lumToYV12 = bgr16ToY; break; 03009 case PIX_FMT_BGR555 : c->lumToYV12 = bgr15ToY; break; 03010 case PIX_FMT_RGB24 : c->lumToYV12 = RENAME(rgb24ToY); break; 03011 case PIX_FMT_RGB565 : c->lumToYV12 = rgb16ToY; break; 03012 case PIX_FMT_RGB555 : c->lumToYV12 = rgb15ToY; break; 03013 case PIX_FMT_RGB8 : 03014 case PIX_FMT_BGR8 : 03015 case PIX_FMT_PAL8 : 03016 case PIX_FMT_BGR4_BYTE: 03017 case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY; break; 03018 case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y; break; 03019 case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y; break; 03020 case PIX_FMT_RGB32 : 03021 case PIX_FMT_RGB32_1: c->lumToYV12 = bgr32ToY; break; 03022 case PIX_FMT_BGR32 : 03023 case PIX_FMT_BGR32_1: c->lumToYV12 = rgb32ToY; break; 03024 case PIX_FMT_RGB48BE: 03025 case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48ToY; break; 03026 } 03027 if (c->alpPixBuf) { 03028 switch (srcFormat) { 03029 case PIX_FMT_RGB32 : 03030 case PIX_FMT_RGB32_1: 03031 case PIX_FMT_BGR32 : 03032 case PIX_FMT_BGR32_1: c->alpToYV12 = abgrToA; break; 03033 } 03034 } 03035 03036 switch (srcFormat) { 03037 case PIX_FMT_RGB32 : 03038 case PIX_FMT_BGR32 : 03039 c->alpSrcOffset = 3; 03040 break; 03041 case PIX_FMT_RGB32_1: 03042 case PIX_FMT_BGR32_1: 03043 c->lumSrcOffset = ALT32_CORR; 03044 c->chrSrcOffset = ALT32_CORR; 03045 break; 03046 case PIX_FMT_RGB48LE: 03047 c->lumSrcOffset = 1; 03048 c->chrSrcOffset = 1; 03049 c->alpSrcOffset = 1; 03050 break; 03051 } 03052 03053 if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) { 03054 if (c->srcRange) { 03055 c->lumConvertRange = RENAME(lumRangeFromJpeg); 03056 c->chrConvertRange = RENAME(chrRangeFromJpeg); 03057 } else { 03058 c->lumConvertRange = RENAME(lumRangeToJpeg); 03059 c->chrConvertRange = RENAME(chrRangeToJpeg); 03060 } 03061 } 03062 03063 if (!(isGray(srcFormat) || isGray(c->dstFormat) || 03064 srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE)) 03065 c->needs_hcscale = 1; 03066 }